.. by giving them their own vector IROps rather than converting each lane individually.
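Roughly (editorial summary, mirroring the hunks below): where the front end previously expanded, e.g., cvtps2dq into four per-lane scalar chains of the form

    Iop_F64toI32S(rmode, Iop_F32toF64(Iop_ReinterpI32asF32(lane)))

it now emits a single vector binop

    Iop_F32toI32Sx4(rmode, argV)

and the AMD64 backend maps that (and Iop_I32StoF32x4, plus the x8 forms for the 256-bit variants) directly onto cvtps2dq / cvtdq2ps.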
IRTemp argV = newTemp(Ity_V128);
IRTemp rmode = newTemp(Ity_I32);
UInt rG = gregOfRexRM(pfx,modrm);
- IRTemp t0, t1, t2, t3;
if (epartIsReg(modrm)) {
UInt rE = eregOfRexRM(pfx,modrm);
assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
: get_sse_roundingmode() );
- t0 = t1 = t2 = t3 = IRTemp_INVALID;
- breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
- /* This is less than ideal. If it turns out to be a performance
- bottleneck it can be improved. */
-# define CVT(_t) \
- binop( Iop_F64toI32S, \
- mkexpr(rmode), \
- unop( Iop_F32toF64, \
- unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
-
- putXMMRegLane32( rG, 3, CVT(t3) );
- putXMMRegLane32( rG, 2, CVT(t2) );
- putXMMRegLane32( rG, 1, CVT(t1) );
- putXMMRegLane32( rG, 0, CVT(t0) );
-# undef CVT
+ putXMMReg( rG, binop(Iop_F32toI32Sx4, mkexpr(rmode), mkexpr(argV)) );
if (isAvx)
putYMMRegLane128( rG, 1, mkV128(0) );
IRTemp argV = newTemp(Ity_V256);
IRTemp rmode = newTemp(Ity_I32);
UInt rG = gregOfRexRM(pfx,modrm);
- IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
if (epartIsReg(modrm)) {
UInt rE = eregOfRexRM(pfx,modrm);
assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
: get_sse_roundingmode() );
- t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
- breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
- /* This is less than ideal. If it turns out to be a performance
- bottleneck it can be improved. */
-# define CVT(_t) \
- binop( Iop_F64toI32S, \
- mkexpr(rmode), \
- unop( Iop_F32toF64, \
- unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
-
- putYMMRegLane32( rG, 7, CVT(t7) );
- putYMMRegLane32( rG, 6, CVT(t6) );
- putYMMRegLane32( rG, 5, CVT(t5) );
- putYMMRegLane32( rG, 4, CVT(t4) );
- putYMMRegLane32( rG, 3, CVT(t3) );
- putYMMRegLane32( rG, 2, CVT(t2) );
- putYMMRegLane32( rG, 1, CVT(t1) );
- putYMMRegLane32( rG, 0, CVT(t0) );
-# undef CVT
-
+ putYMMReg( rG, binop(Iop_F32toI32Sx8, mkexpr(rmode), mkexpr(argV)) );
return delta;
}
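An aside (editorial, not part of the patch): the rounding and truncating forms of the instruction now lower to the same IROp, differing only in the rounding-mode operand. A minimal sketch using the toIR helpers seen above (binop, mkexpr, mkU32, get_sse_roundingmode); the variable names are illustrative only:

    /* cvtps2dq: round according to the current MXCSR-derived mode */
    IRExpr* ps2dq  = binop(Iop_F32toI32Sx4, get_sse_roundingmode(), mkexpr(argV));
    /* cvttps2dq: always truncate towards zero */
    IRExpr* tps2dq = binop(Iop_F32toI32Sx4, mkU32((UInt)Irrm_ZERO), mkexpr(argV));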
IRTemp argV = newTemp(Ity_V128);
IRTemp rmode = newTemp(Ity_I32);
UInt rG = gregOfRexRM(pfx,modrm);
- IRTemp t0, t1, t2, t3;
if (epartIsReg(modrm)) {
UInt rE = eregOfRexRM(pfx,modrm);
}
assign( rmode, get_sse_roundingmode() );
- t0 = IRTemp_INVALID;
- t1 = IRTemp_INVALID;
- t2 = IRTemp_INVALID;
- t3 = IRTemp_INVALID;
- breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
+ putXMMReg(rG, binop(Iop_I32StoF32x4, mkexpr(rmode), mkexpr(argV)));
-# define CVT(_t) binop( Iop_F64toF32, \
- mkexpr(rmode), \
- unop(Iop_I32StoF64,mkexpr(_t)))
-
- putXMMRegLane32F( rG, 3, CVT(t3) );
- putXMMRegLane32F( rG, 2, CVT(t2) );
- putXMMRegLane32F( rG, 1, CVT(t1) );
- putXMMRegLane32F( rG, 0, CVT(t0) );
-# undef CVT
if (isAvx)
putYMMRegLane128( rG, 1, mkV128(0) );
IRTemp argV = newTemp(Ity_V256);
IRTemp rmode = newTemp(Ity_I32);
UInt rG = gregOfRexRM(pfx,modrm);
- IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
if (epartIsReg(modrm)) {
UInt rE = eregOfRexRM(pfx,modrm);
}
assign( rmode, get_sse_roundingmode() );
- t0 = IRTemp_INVALID;
- t1 = IRTemp_INVALID;
- t2 = IRTemp_INVALID;
- t3 = IRTemp_INVALID;
- t4 = IRTemp_INVALID;
- t5 = IRTemp_INVALID;
- t6 = IRTemp_INVALID;
- t7 = IRTemp_INVALID;
- breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
-
-# define CVT(_t) binop( Iop_F64toF32, \
- mkexpr(rmode), \
- unop(Iop_I32StoF64,mkexpr(_t)))
-
- putYMMRegLane32F( rG, 7, CVT(t7) );
- putYMMRegLane32F( rG, 6, CVT(t6) );
- putYMMRegLane32F( rG, 5, CVT(t5) );
- putYMMRegLane32F( rG, 4, CVT(t4) );
- putYMMRegLane32F( rG, 3, CVT(t3) );
- putYMMRegLane32F( rG, 2, CVT(t2) );
- putYMMRegLane32F( rG, 1, CVT(t1) );
- putYMMRegLane32F( rG, 0, CVT(t0) );
-# undef CVT
+ putYMMReg(rG, binop(Iop_I32StoF32x8, mkexpr(rmode), mkexpr(argV)));
return delta;
}
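Also worth noting (editorial): the int-to-float direction genuinely needs the rounding mode as well, since an I32 whose magnitude exceeds 2^24 is not exactly representable as an F32. A minimal sketch under the same assumptions as above:

    /* 0x04000001 (2^26 + 1) has no exact F32 representation, so cvtdq2ps
       rounds it per MXCSR.RC; hence Iop_I32StoF32x4 carries an explicit
       IRRoundingMode operand. */
    IRExpr* dq2ps = binop(Iop_I32StoF32x4, get_sse_roundingmode(), mkexpr(argV));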
case Asse_RCPF: return "rcp";
case Asse_RSQRTF: return "rsqrt";
case Asse_SQRTF: return "sqrt";
+ case Asse_I2F: return "cvtdq2ps.";
+ case Asse_F2I: return "cvtps2dq.";
case Asse_AND: return "and";
case Asse_OR: return "or";
case Asse_XOR: return "xor";
case Asse_SHL16: return "psllw";
case Asse_SHL32: return "pslld";
case Asse_SHL64: return "psllq";
+ case Asse_SHL128: return "pslldq";
case Asse_SHR16: return "psrlw";
case Asse_SHR32: return "psrld";
case Asse_SHR64: return "psrlq";
+ case Asse_SHR128: return "psrldq";
case Asse_SAR16: return "psraw";
case Asse_SAR32: return "psrad";
case Asse_PACKSSD: return "packssdw";
vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
|| i->Ain.Sse32Fx4.op == Asse_RSQRTF
- || i->Ain.Sse32Fx4.op == Asse_SQRTF );
+ || i->Ain.Sse32Fx4.op == Asse_SQRTF
+ || i->Ain.Sse32Fx4.op == Asse_I2F
+ || i->Ain.Sse32Fx4.op == Asse_F2I );
addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
addHRegUse(u, unary ? HRmWrite : HRmModify,
i->Ain.Sse32Fx4.dst);
case Ain_Sse32Fx4:
xtra = 0;
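+ /* cvtps2dq needs the 0x66 operand-size prefix (cvtdq2ps does not);
+    legacy prefixes must precede the REX byte emitted just below. */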
+ switch (i->Ain.Sse32Fx4.op) {
+ case Asse_F2I: *p++ = 0x66; break;
+ default: break;
+ }
*p++ = clearWBit(
rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst),
vregEnc3210(i->Ain.Sse32Fx4.src) ));
case Asse_RCPF: *p++ = 0x53; break;
case Asse_RSQRTF: *p++ = 0x52; break;
case Asse_SQRTF: *p++ = 0x51; break;
+ case Asse_I2F: *p++ = 0x5B; break; // cvtdq2ps; no 0x66 pfx
+ case Asse_F2I: *p++ = 0x5B; break; // cvtps2dq; with 0x66 pfx
case Asse_SUBF: *p++ = 0x5C; break;
case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
Asse_CMPEQF, Asse_CMPLTF, Asse_CMPLEF, Asse_CMPUNF,
/* Floating point unary */
Asse_RCPF, Asse_RSQRTF, Asse_SQRTF,
+ /* Floating point conversion */
+ Asse_I2F, // i32-signed to float conversion, aka cvtdq2ps in vec form
+ Asse_F2I, // float to i32-signed conversion, aka cvtps2dq in vec form
/* Bitwise */
Asse_AND, Asse_OR, Asse_XOR, Asse_ANDN,
Asse_ADD8, Asse_ADD16, Asse_ADD32, Asse_ADD64,
return dst;
}
+ case Iop_I32StoF32x4:
+ case Iop_F32toI32Sx4: {
+ HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ AMD64SseOp mop
+ = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
+ set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
+ addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
+ set_SSE_rounding_default(env);
+ return dst;
+ }
+
default:
break;
} /* switch (e->Iex.Binop.op) */
return;
}
+ case Iop_I32StoF32x8:
+ case Iop_F32toI32Sx8: {
+ HReg argHi, argLo;
+ iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
+ AMD64SseOp mop
+ = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
+ set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
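+ /* A V256 IR value is carried in two V128 vregs (hi/lo halves), so
+    apply the 128-bit conversion to each half under the requested mode. */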
+ addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
+ addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
+ set_SSE_rounding_default(env);
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
default:
break;
} /* switch (e->Iex.Binop.op) */
case Iop_I32UtoFx4: vex_printf("I32UtoFx4"); return;
case Iop_I32StoFx4: vex_printf("I32StoFx4"); return;
+ case Iop_I32StoF32x4: vex_printf("I32StoF32x4"); return;
+ case Iop_F32toI32Sx4: vex_printf("F32toI32Sx4"); return;
+
case Iop_F32toF16x4: vex_printf("F32toF16x4"); return;
case Iop_F16toF32x4: vex_printf("F16toF32x4"); return;
case Iop_F16toF64x2: vex_printf("F16toF64x2"); return;
case Iop_Sub32Fx8: vex_printf("Sub32Fx8"); return;
case Iop_Mul32Fx8: vex_printf("Mul32Fx8"); return;
case Iop_Div32Fx8: vex_printf("Div32Fx8"); return;
+ case Iop_I32StoF32x8: vex_printf("I32StoF32x8"); return;
+ case Iop_F32toI32Sx8: vex_printf("F32toI32Sx8"); return;
case Iop_AndV256: vex_printf("AndV256"); return;
case Iop_OrV256: vex_printf("OrV256"); return;
case Iop_XorV256: vex_printf("XorV256"); return;
case Iop_Sqrt64Fx2:
case Iop_Sqrt32Fx4:
+ case Iop_I32StoF32x4:
+ case Iop_F32toI32Sx4:
BINARY(ity_RMode,Ity_V128, Ity_V128);
case Iop_64HLtoV128:
case Iop_Perm32x8:
BINARY(Ity_V256,Ity_V256, Ity_V256);
+ case Iop_I32StoF32x8:
+ case Iop_F32toI32Sx8:
+ BINARY(ity_RMode,Ity_V256, Ity_V256);
+
case Iop_V256toV128_1: case Iop_V256toV128_0:
UNARY(Ity_V256, Ity_V128);
/* Unlike the standard fp conversions, these irops take no
rounding mode argument. Instead the irop trailers _R{M,P,N,Z}
indicate the mode: {-inf, +inf, nearest, zero} respectively. */
+
+ // FIXME These carry no rounding mode
Iop_I32UtoFx4, Iop_I32StoFx4, /* I32x4 -> F32x4 */
+
+ Iop_I32StoF32x4, /* IRRoundingMode(I32) x V128 -> V128 */
+ Iop_F32toI32Sx4, /* IRRoundingMode(I32) x V128 -> V128 */
+
Iop_FtoI32Ux4_RZ, Iop_FtoI32Sx4_RZ, /* F32x4 -> I32x4 */
Iop_QFtoI32Ux4_RZ, Iop_QFtoI32Sx4_RZ, /* F32x4 -> I32x4 (saturating) */
Iop_RoundF32x4_RM, Iop_RoundF32x4_RP, /* round to fp integer */
/* --- Single to/from half conversion --- */
/* FIXME: what kind of rounding in F32x4 -> F16x4 case? */
+ // FIXME these carry no rounding mode
Iop_F32toF16x4, Iop_F16toF32x4, /* F32x4 <-> F16x4 */
-
-
/* -- Double to/from half conversion -- */
- Iop_F64toF16x2, Iop_F16toF64x2,
+ Iop_F64toF16x2, // FIXME this carries no rounding mode (?)
+ Iop_F16toF64x2,
/* Values from two registers converted in smaller type and put in one
IRRoundingMode(I32) x (F32x4 | F32x4) -> Q16x8 */
Iop_Add64Fx4, Iop_Sub64Fx4, Iop_Mul64Fx4, Iop_Div64Fx4,
Iop_Add32Fx8, Iop_Sub32Fx8, Iop_Mul32Fx8, Iop_Div32Fx8,
+ Iop_I32StoF32x8, /* IRRoundingMode(I32) x V256 -> V256 */
+ Iop_F32toI32Sx8, /* IRRoundingMode(I32) x V256 -> V256 */
+
Iop_Sqrt32Fx8,
Iop_Sqrt64Fx4,
Iop_RSqrtEst32Fx8,
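Editorial illustration (not part of the patch) of the difference an IR producer sees between the older mode-in-the-opcode conversions and the new rounding-mode-carrying ones; v is an assumed V128 temp and unop/binop/mkU32/mkexpr are the usual toIR shorthands:

    IRExpr* old_style = unop (Iop_FtoI32Sx4_RZ, mkexpr(v));   /* mode baked into the opcode */
    IRExpr* new_style = binop(Iop_F32toI32Sx4,                /* mode passed as an operand  */
                              mkU32((UInt)Irrm_ZERO), mkexpr(v));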
static
IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
{
- /* Same scheme as unary32Fx4_w_rm. */
+ /* Same scheme as binary32Fx4_w_rm. */
IRAtom* t1 = unary32Fx4(mce, vatomX);
// PCast the RM, and widen it to 128 bits
IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
return t1;
}
+/* --- ... and ... 32Fx8 versions of the same --- */
+
+static
+IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
+{
+ /* Same scheme as unary32Fx4_w_rm. */
+ IRAtom* t1 = unary32Fx8(mce, vatomX);
+ // PCast the RM, and widen it to 256 bits
+ IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
+ // Roll it into the result
+ t1 = mkUifUV256(mce, t1, t2);
+ return t1;
+}
+
/* --- --- Vector saturated narrowing --- --- */
/* V128-bit SIMD */
+ case Iop_I32StoF32x4:
+ case Iop_F32toI32Sx4:
case Iop_Sqrt32Fx4:
return unary32Fx4_w_rm(mce, vatom1, vatom2);
case Iop_Sqrt64Fx2:
case Iop_CmpGT64Sx4:
return binary64Ix4(mce, vatom1, vatom2);
- /* Perm32x8: rearrange values in left arg using steering values
- from right arg. So rearrange the vbits in the same way but
- pessimise wrt steering values. */
+ case Iop_I32StoF32x8:
+ case Iop_F32toI32Sx8:
+ return unary32Fx8_w_rm(mce, vatom1, vatom2);
+
+ /* Perm32x8: rearrange values in left arg using steering values
+ from right arg. So rearrange the vbits in the same way but
+ pessimise wrt steering values. */
case Iop_Perm32x8:
return mkUifUV256(
mce,
{ DEFOP(Iop_RSqrtStep32Fx4, UNDEF_UNKNOWN), },
{ DEFOP(Iop_I32UtoFx4, UNDEF_UNKNOWN), },
{ DEFOP(Iop_I32StoFx4, UNDEF_UNKNOWN), },
+ { DEFOP(Iop_I32StoF32x4, UNDEF_UNKNOWN), },
+ { DEFOP(Iop_F32toI32Sx4, UNDEF_UNKNOWN), },
{ DEFOP(Iop_FtoI32Ux4_RZ, UNDEF_UNKNOWN), },
{ DEFOP(Iop_FtoI32Sx4_RZ, UNDEF_UNKNOWN), },
{ DEFOP(Iop_QFtoI32Ux4_RZ, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Sub32Fx8, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Mul32Fx8, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Div32Fx8, UNDEF_UNKNOWN), },
+ { DEFOP(Iop_I32StoF32x8, UNDEF_UNKNOWN), },
+ { DEFOP(Iop_F32toI32Sx8, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Sqrt32Fx8, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Sqrt64Fx4, UNDEF_UNKNOWN), },
{ DEFOP(Iop_RSqrtEst32Fx8, UNDEF_UNKNOWN), },