From: Julian Seward
Date: Mon, 14 Feb 2011 13:30:26 +0000 (+0000)
Subject: Merge from trunk, r2076 (Implement SSE4.x EXTRACTPS, BLENDVPD,
X-Git-Tag: svn/VALGRIND_3_6_1^2~10
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f3a6734679332a9b1c671b01924ce798cf166df6;p=thirdparty%2Fvalgrind.git

Merge from trunk, r2076 (Implement SSE4.x EXTRACTPS, BLENDVPD,
BLENDVPS, PBLENDVB.)


git-svn-id: svn://svn.valgrind.org/vex/branches/VEX_3_6_BRANCH@2093
---

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 3d723b416a..da85c4961a 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -14643,6 +14643,55 @@ DisResult disInstr_AMD64_WRK (
       goto decode_success;
    }
 
+
+   /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8    Extract
+      float from xmm reg and store in gen.reg or mem.  This is
+      identical to PEXTRD, except that REX.W appears to be ignored.
+   */
+   if ( have66noF2noF3( pfx )
+        && sz == 2 /* REX.W == 0; perhaps too strict? */
+        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x17 ) {
+
+      Int imm8_10;
+      IRTemp xmm_vec   = newTemp(Ity_V128);
+      IRTemp src_dword = newTemp(Ity_I32);
+
+      modrm = insn[3];
+      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
+      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+
+      if ( epartIsReg( modrm ) ) {
+         imm8_10 = (Int)(insn[3+1] & 3);
+      } else {
+         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+         imm8_10 = (Int)(insn[3+alen] & 3);
+      }
+
+      switch ( imm8_10 ) {
+         case 0:  assign( src_dword, mkexpr(t0) ); break;
+         case 1:  assign( src_dword, mkexpr(t1) ); break;
+         case 2:  assign( src_dword, mkexpr(t2) ); break;
+         case 3:  assign( src_dword, mkexpr(t3) ); break;
+         default: vassert(0);
+      }
+
+      if ( epartIsReg( modrm ) ) {
+         putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
+         delta += 3+1+1;
+         DIP( "extractps $%d, %s,%s\n", imm8_10,
+              nameXMMReg( gregOfRexRM(pfx, modrm) ),
+              nameIReg32( eregOfRexRM(pfx, modrm) ) );
+      } else {
+         storeLE( mkexpr(addr), mkexpr(src_dword) );
+         delta += 3+alen+1;
+         DIP( "extractps $%d, %s,%s\n",
+              imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
+      }
+
+      goto decode_success;
+   }
+
+
    /* 66 0F 38 37 = PCMPGTQ
       64x2 comparison (signed, presumably; the Intel docs don't say :-)
    */
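For reference, the EXTRACTPS case added in the hunk above simply selects one of the four 32-bit lanes of the XMM source, as chosen by imm8[1:0], and writes it to a 32-bit register or to memory. A minimal C model of that lane selection (illustrative only; extractps_model and its parameters are not names from this patch):

#include <stdint.h>
#include <string.h>

/* Pick dword (imm8 & 3) out of a 128-bit value, mirroring the
   t0..t3 selection done by breakup128to32s in the hunk above. */
static uint32_t extractps_model ( const uint8_t xmm[16], int imm8 )
{
   uint32_t lanes[4];
   memcpy(lanes, xmm, sizeof lanes);   /* little-endian lane order */
   return lanes[imm8 & 3];
}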
@@ -15731,6 +15780,74 @@ DisResult disInstr_AMD64_WRK (
       goto decode_success;
    }
 
+   /* 66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
+      66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
+      66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
+      Blend at various granularities, with XMM0 (implicit operand)
+      providing the controlling mask.
+   */
+   if (have66noF2noF3(pfx) && sz == 2
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x15 || insn[2] == 0x14 || insn[2] == 0x10)) {
+      modrm = insn[3];
+
+      HChar* nm    = NULL;
+      UInt   gran  = 0;
+      IROp   opSAR = Iop_INVALID;
+      switch (insn[2]) {
+         case 0x15:
+            nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
+            break;
+         case 0x14:
+            nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
+            break;
+         case 0x10:
+            nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
+            break;
+      }
+      vassert(nm);
+
+      IRTemp vecE = newTemp(Ity_V128);
+      IRTemp vecG = newTemp(Ity_V128);
+      IRTemp vec0 = newTemp(Ity_V128);
+
+      if ( epartIsReg(modrm) ) {
+         assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
+         delta += 3+1;
+         DIP( "%s %s,%s\n", nm,
+              nameXMMReg( eregOfRexRM(pfx, modrm) ),
+              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+      } else {
+         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+         gen_SEGV_if_not_16_aligned( addr );
+         assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
+         delta += 3+alen;
+         DIP( "%s %s,%s\n", nm,
+              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+      }
+
+      assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
+      assign(vec0, getXMMReg(0));
+
+      /* Now the tricky bit is to convert vec0 into a suitable mask,
+         by copying the most significant bit of each lane into all
+         positions in the lane. */
+      IRTemp sh = newTemp(Ity_I8);
+      assign(sh, mkU8(8 * gran - 1));
+
+      IRTemp mask = newTemp(Ity_V128);
+      assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
+
+      IRTemp notmask = newTemp(Ity_V128);
+      assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
+
+      IRExpr* res = binop(Iop_OrV128,
+                          binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
+                          binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask)));
+      putXMMReg(gregOfRexRM(pfx, modrm), res);
+
+      goto decode_success;
+   }
 
    /* ---------------------------------------------------- */
    /* --- end of the SSE4 decoder                      --- */
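The blend added above is expressed in IR as mask = SAR(xmm0, 8*gran-1) followed by res = (E & mask) | (G & ~mask), so each lane of the result comes from the E operand when the top bit of the corresponding XMM0 lane is set, and from the G operand otherwise. A byte-level C sketch of those semantics (blendv_model is an illustrative name, not code from this patch):

#include <stdint.h>
#include <stddef.h>

/* Byte-level model of the blend: for each gran-byte lane, take the
   lane from src (the E operand) when the top bit of the matching
   xmm0 lane is set, otherwise keep the lane already in dst (the G
   operand).  gran is 8 for BLENDVPD, 4 for BLENDVPS, 1 for PBLENDVB. */
static void blendv_model ( uint8_t dst[16], const uint8_t src[16],
                           const uint8_t xmm0[16], size_t gran )
{
   for (size_t lane = 0; lane < 16; lane += gran) {
      /* The MSB of a little-endian lane lives in its last byte. */
      if (xmm0[lane + gran - 1] & 0x80) {
         for (size_t i = 0; i < gran; i++)
            dst[lane + i] = src[lane + i];
      }
   }
}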
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index a54444a860..8f8e4a3179 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3659,6 +3659,54 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
       return dst;
    }
 
+      case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
+                         goto do_SseAssistedVectorAndScalar;
+      case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
+                         goto do_SseAssistedVectorAndScalar;
+      do_SseAssistedVectorAndScalar: {
+         /* RRRufff! RRRufff code is what we're generating here.  Oh
+            well. */
+         vassert(fn != 0);
+         HReg dst = newVRegV(env);
+         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+         HReg argp = newVRegI(env);
+         /* subq $112, %rsp         -- make a space*/
+         sub_from_rsp(env, 112);
+         /* leaq 48(%rsp), %r_argp  -- point into it */
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+                                        argp));
+         /* andq $-16, %r_argp      -- 16-align the pointer */
+         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+                                         AMD64RMI_Imm( ~(UInt)15 ),
+                                         argp));
+         /* Prepare 2 vector arg regs:
+              leaq 0(%r_argp), %rdi
+              leaq 16(%r_argp), %rsi
+         */
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+                                        hregAMD64_RDI()));
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
+                                        hregAMD64_RSI()));
+         /* Store the vector arg, at (%rsi):
+              movupd %argL, 0(%rsi)
+         */
+         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
+                                          AMD64AMode_IR(0, hregAMD64_RSI())));
+         /* And get the scalar value into rdx */
+         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
+
+         /* call the helper */
+         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
+         /* fetch the result from memory, using %r_argp, which the
+            register allocator will keep alive across the call. */
+         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
+                                          AMD64AMode_IR(0, argp)));
+         /* and finally, clear the space */
+         add_to_rsp(env, 112);
+         return dst;
+      }
+
       default:
          break;
    } /* switch (e->Iex.Binop.op) */
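In C terms, the sequence generated above boils down to a SysV AMD64 call: %rdi points at a 16-aligned result slot, %rsi at a staged copy of the vector operand, and %rdx carries the scalar shift amount. A rough equivalent, assuming V128 and the helper declarations from host_generic_simd128.h are visible here (do_SarN8x16 is an illustrative wrapper, not part of the patch):

#include "host_generic_simd128.h"   /* assumed to provide V128, UInt, helper decls */

/* Roughly what the emitted code does: stage the vector operand in a
   16-aligned scratch area, call the helper with (res, argL, nn), and
   reload the 128-bit result from the scratch area afterwards. */
static V128 do_SarN8x16 ( V128 argL, UInt nn )
{
   V128 area[2];               /* stands in for the stack scratch space */
   area[1] = argL;             /* vector arg stored at 16(%r_argp)      */
   h_generic_calc_SarN8x16(&area[0], &area[1], nn);  /* rdi, rsi, rdx   */
   return area[0];             /* result reloaded from 0(%r_argp)       */
}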
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
index 8ed516609d..2430e67bbe 100644
--- a/VEX/priv/host_generic_simd128.c
+++ b/VEX/priv/host_generic_simd128.c
@@ -94,6 +94,16 @@ static inline ULong cmpGT64S ( Long xx, Long yy )
             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
 }
 
+static inline ULong sar64 ( ULong v, UInt n )
+{
+   return ((Long)v) >> n;
+}
+
+static inline UChar sar8 ( UChar v, UInt n )
+{
+   return toUChar(((Char)v) >> n);
+}
+
 void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
 {
@@ -214,6 +224,44 @@ void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
    res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
 }
 
+/* ------------ Shifting ------------ */
+/* Note that because these primops are undefined if the shift amount
+   equals or exceeds the lane width, the shift amount is masked so
+   that the scalar shifts are always in range.  In fact, given the
+   semantics of these primops (Sar64x2, etc), it is an error if we
+   are ever given an out-of-range shift amount.
+*/
+void h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
+                               V128* argL, UInt nn)
+{
+   /* vassert(nn < 64); */
+   nn &= 63;
+   res->w64[0] = sar64(argL->w64[0], nn);
+   res->w64[1] = sar64(argL->w64[1], nn);
+}
+
+void h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
+                               V128* argL, UInt nn)
+{
+   /* vassert(nn < 8); */
+   nn &= 7;
+   res->w8[ 0] = sar8(argL->w8[ 0], nn);
+   res->w8[ 1] = sar8(argL->w8[ 1], nn);
+   res->w8[ 2] = sar8(argL->w8[ 2], nn);
+   res->w8[ 3] = sar8(argL->w8[ 3], nn);
+   res->w8[ 4] = sar8(argL->w8[ 4], nn);
+   res->w8[ 5] = sar8(argL->w8[ 5], nn);
+   res->w8[ 6] = sar8(argL->w8[ 6], nn);
+   res->w8[ 7] = sar8(argL->w8[ 7], nn);
+   res->w8[ 8] = sar8(argL->w8[ 8], nn);
+   res->w8[ 9] = sar8(argL->w8[ 9], nn);
+   res->w8[10] = sar8(argL->w8[10], nn);
+   res->w8[11] = sar8(argL->w8[11], nn);
+   res->w8[12] = sar8(argL->w8[12], nn);
+   res->w8[13] = sar8(argL->w8[13], nn);
+   res->w8[14] = sar8(argL->w8[14], nn);
+   res->w8[15] = sar8(argL->w8[15], nn);
+}
 
 /*---------------------------------------------------------------*/
 /*--- end                              host_generic_simd128.c ---*/
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
index 53850cbdd2..d764439e16 100644
--- a/VEX/priv/host_generic_simd128.h
+++ b/VEX/priv/host_generic_simd128.h
@@ -58,6 +58,8 @@ extern void h_generic_calc_Min16Ux8 ( /*OUT*/V128*, V128*, V128* );
 extern void h_generic_calc_Max8Sx16   ( /*OUT*/V128*, V128*, V128* );
 extern void h_generic_calc_Min8Sx16   ( /*OUT*/V128*, V128*, V128* );
 extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_SarN64x2   ( /*OUT*/V128*, V128*, UInt );
+extern void h_generic_calc_SarN8x16   ( /*OUT*/V128*, V128*, UInt );
 
 #endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
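As a quick illustration of why the guest-side translation can build its blend masks with these helpers: shifting each lane right arithmetically by lane-width minus one smears the lane's sign bit across the whole lane, giving all-zeroes or all-ones. A small standalone check (the test values and main are illustrative, not part of the patch; like the helpers themselves, it assumes the compiler implements >> on signed values as an arithmetic shift):

#include <stdio.h>
#include "host_generic_simd128.h"   /* assumed to provide V128 and the helpers */

int main ( void )
{
   V128 v, m;
   int  i;
   /* Bytes with the top bit set should become 0xFF, others 0x00. */
   for (i = 0; i < 16; i++)
      v.w8[i] = (i & 1) ? 0x80 : 0x7F;
   h_generic_calc_SarN8x16(&m, &v, 7);
   for (i = 0; i < 16; i++)
      printf("%02x ", m.w8[i]);      /* expect: 00 ff 00 ff ... */
   printf("\n");
   return 0;
}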