goto decode_success;
}
+
+ /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8
+    Extract a 32-bit float from the xmm reg and store it in a
+    general register or in memory.  This is identical to PEXTRD,
+    except that REX.W appears to be ignored.
+ */
+ if ( have66noF2noF3( pfx )
+ && sz == 2 /* REX.W == 0; perhaps too strict? */
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x17 ) {
+
+ Int imm8_10;
+ IRTemp xmm_vec = newTemp(Ity_V128);
+ IRTemp src_dword = newTemp(Ity_I32);
+
+ modrm = insn[3];
+ assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
+ breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8_10 = (Int)(insn[3+1] & 3);
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ imm8_10 = (Int)(insn[3+alen] & 3);
+ }
+
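+ /* Only the low two bits of the immediate are significant: they
+    select which 32-bit lane is copied out, t0 being the least
+    significant lane and t3 the most significant. */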
+ switch ( imm8_10 ) {
+ case 0: assign( src_dword, mkexpr(t0) ); break;
+ case 1: assign( src_dword, mkexpr(t1) ); break;
+ case 2: assign( src_dword, mkexpr(t2) ); break;
+ case 3: assign( src_dword, mkexpr(t3) ); break;
+ default: vassert(0);
+ }
+
+ if ( epartIsReg( modrm ) ) {
+ putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
+ delta += 3+1+1;
+ DIP( "extractps $%d, %s,%s\n", imm8_10,
+ nameXMMReg( gregOfRexRM(pfx, modrm) ),
+ nameIReg32( eregOfRexRM(pfx, modrm) ) );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(src_dword) );
+ delta += 3+alen+1;
+ DIP( "extractps $%d, %s,%s\n",
+ imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
+ }
+
+ goto decode_success;
+ }
+
+
/* 66 0F 38 37 = PCMPGTQ
64x2 comparison (signed, presumably; the Intel docs don't say :-)
*/
goto decode_success;
}
+ /* 66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128 (double gran)
+ 66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128 (float gran)
+ 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128 (byte gran)
+ Blend at various granularities, with XMM0 (implicit operand)
+ providing the controlling mask.
+ */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x15 || insn[2] == 0x14 || insn[2] == 0x10)) {
+ modrm = insn[3];
+
+ HChar* nm = NULL;
+ UInt gran = 0;
+ IROp opSAR = Iop_INVALID;
+ switch (insn[2]) {
+ case 0x15:
+ nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
+ break;
+ case 0x14:
+ nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
+ break;
+ case 0x10:
+ nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
+ break;
+ }
+ vassert(nm);
+
+ IRTemp vecE = newTemp(Ity_V128);
+ IRTemp vecG = newTemp(Ity_V128);
+ IRTemp vec0 = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
+ delta += 3+1;
+ DIP( "%s %s,%s\n", nm,
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
+ delta += 3+alen;
+ DIP( "%s %s,%s\n", nm,
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
+ assign(vec0, getXMMReg(0));
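+ /* vec0 is the implicit XMM0 operand, whose per-lane MSBs supply
+    the blend control bits. */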
+
+ /* Now the tricky bit is to convert vec0 into a suitable mask,
+ by copying the most significant bit of each lane into all
+ positions in the lane. */
+ IRTemp sh = newTemp(Ity_I8);
+ assign(sh, mkU8(8 * gran - 1));
+
+ IRTemp mask = newTemp(Ity_V128);
+ assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
+
+ IRTemp notmask = newTemp(Ity_V128);
+ assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
+
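+ /* Each result lane comes from vecE where the mask lane is all
+    ones (XMM0 lane MSB set) and from vecG where it is all zeroes:
+    (E & mask) | (G & ~mask). */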
+ IRExpr* res = binop(Iop_OrV128,
+ binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
+ binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask)));
+ putXMMReg(gregOfRexRM(pfx, modrm), res);
+
+ goto decode_success;
+ }
/* ---------------------------------------------------- */
/* --- end of the SSE4 decoder --- */
return dst;
}
+ case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
+ goto do_SseAssistedVectorAndScalar;
+ case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
+ goto do_SseAssistedVectorAndScalar;
+ do_SseAssistedVectorAndScalar: {
+ /* RRRufff! RRRufff code is what we're generating here. Oh
+ well. */
+ vassert(fn != 0);
+ HReg dst = newVRegV(env);
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg argp = newVRegI(env);
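+ /* Scheme: park the vector arg in a 16-aligned stack slot, call
+    the helper with %rdi = result pointer, %rsi = vector arg
+    pointer and %rdx = shift amount (the SysV AMD64 convention),
+    then reload the 16-byte result. */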
+ /* subq $112, %rsp -- make some space */
+ sub_from_rsp(env, 112);
+ /* leaq 48(%rsp), %r_argp -- point into it */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+ argp));
+ /* andq $-16, %r_argp -- 16-align the pointer */
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm( ~(UInt)15 ),
+ argp));
+ /* Prepare the two pointer args:
+    leaq 0(%r_argp), %rdi   -- result slot
+    leaq 16(%r_argp), %rsi  -- vector arg slot
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
+ hregAMD64_RSI()));
+ /* Store the vector arg, at (%rsi):
+ movupd %argL, 0(%rsi)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
+ AMD64AMode_IR(0, hregAMD64_RSI())));
+ /* And get the scalar value into rdx */
+ addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
+
+ /* call the helper */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
+ AMD64AMode_IR(0, argp)));
+ /* and finally, clear the space */
+ add_to_rsp(env, 112);
+ return dst;
+ }
+
default:
break;
} /* switch (e->Iex.Binop.op) */
? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
}
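+
+/* Scalar helpers for the SarN primops below.  The casts to Long and
+   Char rely on the compiler implementing >> on signed types as an
+   arithmetic (sign-propagating) shift; gcc and clang both do so. */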
+static inline ULong sar64 ( ULong v, UInt n )
+{
+ return ((Long)v) >> n;
+}
+
+static inline UChar sar8 ( UChar v, UInt n )
+{
+ return toUChar(((Char)v) >> n);
+}
+
void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
}
+/* ------------ Shifting ------------ */
+/* Note that because these primops are undefined if the shift amount
+   equals or exceeds the lane width, the shift amount is masked here
+   so that the scalar shifts are always in range.  Indeed, given the
+   semantics of these primops (SarN64x2, etc), it is an error if we
+   are ever handed an out-of-range shift amount.
+*/
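+/* For example, SarN8x16 with a shift amount of 7 maps each byte to
+   0x00 or 0xFF according to its sign bit, which is exactly how the
+   PBLENDVB decode in guest_amd64_toIR.c builds its lane mask. */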
+void h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
+ V128* argL, UInt nn)
+{
+ /* vassert(nn < 64); */
+ nn &= 63;
+ res->w64[0] = sar64(argL->w64[0], nn);
+ res->w64[1] = sar64(argL->w64[1], nn);
+}
+
+void h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
+ V128* argL, UInt nn)
+{
+ /* vassert(nn < 8); */
+ nn &= 7;
+ res->w8[ 0] = sar8(argL->w8[ 0], nn);
+ res->w8[ 1] = sar8(argL->w8[ 1], nn);
+ res->w8[ 2] = sar8(argL->w8[ 2], nn);
+ res->w8[ 3] = sar8(argL->w8[ 3], nn);
+ res->w8[ 4] = sar8(argL->w8[ 4], nn);
+ res->w8[ 5] = sar8(argL->w8[ 5], nn);
+ res->w8[ 6] = sar8(argL->w8[ 6], nn);
+ res->w8[ 7] = sar8(argL->w8[ 7], nn);
+ res->w8[ 8] = sar8(argL->w8[ 8], nn);
+ res->w8[ 9] = sar8(argL->w8[ 9], nn);
+ res->w8[10] = sar8(argL->w8[10], nn);
+ res->w8[11] = sar8(argL->w8[11], nn);
+ res->w8[12] = sar8(argL->w8[12], nn);
+ res->w8[13] = sar8(argL->w8[13], nn);
+ res->w8[14] = sar8(argL->w8[14], nn);
+ res->w8[15] = sar8(argL->w8[15], nn);
+}
/*---------------------------------------------------------------*/
/*--- end host_generic_simd128.c ---*/
extern void h_generic_calc_Max8Sx16 ( /*OUT*/V128*, V128*, V128* );
extern void h_generic_calc_Min8Sx16 ( /*OUT*/V128*, V128*, V128* );
extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_SarN64x2 ( /*OUT*/V128*, V128*, UInt );
+extern void h_generic_calc_SarN8x16 ( /*OUT*/V128*, V128*, UInt );
#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */