From: Julian Seward
Date: Sat, 22 Dec 2018 15:11:39 +0000 (+0100)
Subject: amd64 pipeline: improve performance of cvtdq2ps and cvtps2dq (128 and 256 bit version...
X-Git-Tag: VALGRIND_3_15_0~118
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dda0d80f3db1632b204b522a1dbb009490265b0e;p=thirdparty%2Fvalgrind.git

amd64 pipeline: improve performance of cvtdq2ps and cvtps2dq (128 and 256
bit versions) ..

.. by giving them their own vector IROps rather than doing each lane
individually.
---
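As an illustration only (not part of the commit), the sketch below is one way
to exercise the four affected instruction forms -- cvtdq2ps/cvtps2dq and their
256-bit AVX counterparts -- from user code via intrinsics; the file name,
input values and build flags are assumptions.  Built with something like
"gcc -O2 -mavx cvt_test.c" and run under Valgrind, it drives the code paths
that the new Iop_I32StoF32x4/x8 and Iop_F32toI32Sx4/x8 IROps translate.

    /* cvt_test.c -- illustrative test, not part of this commit.
       Exercises cvtdq2ps/cvtps2dq (SSE2) and vcvtdq2ps/vcvtps2dq (AVX). */
    #include <stdio.h>
    #include <immintrin.h>

    int main ( void )
    {
       /* 128 bit: I32x4 -> F32x4 (cvtdq2ps) and F32x4 -> I32x4 (cvtps2dq). */
       __m128i i4 = _mm_setr_epi32(-7, 0, 3, 1000000);
       __m128  f4 = _mm_cvtepi32_ps(i4);      /* cvtdq2ps */
       __m128i r4 = _mm_cvtps_epi32(f4);      /* cvtps2dq, SSE rounding mode */

       /* 256 bit: the same conversions, eight lanes at a time. */
       __m256i i8 = _mm256_setr_epi32(-4, -3, -2, -1, 1, 2, 3, 4);
       __m256  f8 = _mm256_cvtepi32_ps(i8);   /* vcvtdq2ps */
       __m256i r8 = _mm256_cvtps_epi32(f8);   /* vcvtps2dq */

       float fout[8];
       int   iout[8];
       _mm_storeu_ps(fout, f4);
       _mm_storeu_si128((__m128i*)iout, r4);
       printf("128: %f %f %f %f | %d %d %d %d\n",
              fout[0], fout[1], fout[2], fout[3],
              iout[0], iout[1], iout[2], iout[3]);
       _mm256_storeu_ps(fout, f8);
       _mm256_storeu_si256((__m256i*)iout, r8);
       printf("256: %f .. %f | %d .. %d\n", fout[0], fout[7], iout[0], iout[7]);
       return 0;
    }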
diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 2451a292eb..fea0ecadf8 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -10671,7 +10671,6 @@ static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
    IRTemp argV  = newTemp(Ity_V128);
    IRTemp rmode = newTemp(Ity_I32);
    UInt   rG    = gregOfRexRM(pfx,modrm);
-   IRTemp t0, t1, t2, t3;
 
    if (epartIsReg(modrm)) {
       UInt rE = eregOfRexRM(pfx,modrm);
@@ -10689,21 +10688,7 @@ static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
    assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                          : get_sse_roundingmode() );
 
-   t0 = t1 = t2 = t3 = IRTemp_INVALID;
-   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
-   /* This is less than ideal.  If it turns out to be a performance
-      bottleneck it can be improved. */
-#  define CVT(_t)                            \
-      binop( Iop_F64toI32S,                  \
-             mkexpr(rmode),                  \
-             unop( Iop_F32toF64,             \
-                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
-
-   putXMMRegLane32( rG, 3, CVT(t3) );
-   putXMMRegLane32( rG, 2, CVT(t2) );
-   putXMMRegLane32( rG, 1, CVT(t1) );
-   putXMMRegLane32( rG, 0, CVT(t0) );
-#  undef CVT
+   putXMMReg( rG, binop(Iop_F32toI32Sx4, mkexpr(rmode), mkexpr(argV)) );
 
    if (isAvx)
       putYMMRegLane128( rG, 1, mkV128(0) );
@@ -10721,7 +10706,6 @@ static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
    IRTemp argV  = newTemp(Ity_V256);
    IRTemp rmode = newTemp(Ity_I32);
    UInt   rG    = gregOfRexRM(pfx,modrm);
-   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
 
    if (epartIsReg(modrm)) {
       UInt rE = eregOfRexRM(pfx,modrm);
@@ -10739,26 +10723,7 @@ static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
    assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                          : get_sse_roundingmode() );
 
-   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
-   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
-   /* This is less than ideal.  If it turns out to be a performance
-      bottleneck it can be improved. */
-#  define CVT(_t)                            \
-      binop( Iop_F64toI32S,                  \
-             mkexpr(rmode),                  \
-             unop( Iop_F32toF64,             \
-                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
-
-   putYMMRegLane32( rG, 7, CVT(t7) );
-   putYMMRegLane32( rG, 6, CVT(t6) );
-   putYMMRegLane32( rG, 5, CVT(t5) );
-   putYMMRegLane32( rG, 4, CVT(t4) );
-   putYMMRegLane32( rG, 3, CVT(t3) );
-   putYMMRegLane32( rG, 2, CVT(t2) );
-   putYMMRegLane32( rG, 1, CVT(t1) );
-   putYMMRegLane32( rG, 0, CVT(t0) );
-#  undef CVT
-
+   putYMMReg( rG, binop(Iop_F32toI32Sx8, mkexpr(rmode), mkexpr(argV)) );
    return delta;
 }
@@ -10882,7 +10847,6 @@ static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
    IRTemp argV  = newTemp(Ity_V128);
    IRTemp rmode = newTemp(Ity_I32);
    UInt   rG    = gregOfRexRM(pfx,modrm);
-   IRTemp t0, t1, t2, t3;
 
    if (epartIsReg(modrm)) {
       UInt rE = eregOfRexRM(pfx,modrm);
@@ -10899,21 +10863,8 @@ static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
    }
 
    assign( rmode, get_sse_roundingmode() );
-   t0 = IRTemp_INVALID;
-   t1 = IRTemp_INVALID;
-   t2 = IRTemp_INVALID;
-   t3 = IRTemp_INVALID;
-   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
+   putXMMReg(rG, binop(Iop_I32StoF32x4, mkexpr(rmode), mkexpr(argV)));
 
-#  define CVT(_t) binop( Iop_F64toF32,                    \
-                         mkexpr(rmode),                   \
-                         unop(Iop_I32StoF64,mkexpr(_t)))
-
-   putXMMRegLane32F( rG, 3, CVT(t3) );
-   putXMMRegLane32F( rG, 2, CVT(t2) );
-   putXMMRegLane32F( rG, 1, CVT(t1) );
-   putXMMRegLane32F( rG, 0, CVT(t0) );
-#  undef CVT
 
    if (isAvx)
       putYMMRegLane128( rG, 1, mkV128(0) );
@@ -10930,7 +10881,6 @@ static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
    IRTemp argV  = newTemp(Ity_V256);
    IRTemp rmode = newTemp(Ity_I32);
    UInt   rG    = gregOfRexRM(pfx,modrm);
-   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
 
    if (epartIsReg(modrm)) {
       UInt rE = eregOfRexRM(pfx,modrm);
@@ -10945,29 +10895,7 @@ static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
    }
 
    assign( rmode, get_sse_roundingmode() );
-   t0 = IRTemp_INVALID;
-   t1 = IRTemp_INVALID;
-   t2 = IRTemp_INVALID;
-   t3 = IRTemp_INVALID;
-   t4 = IRTemp_INVALID;
-   t5 = IRTemp_INVALID;
-   t6 = IRTemp_INVALID;
-   t7 = IRTemp_INVALID;
-   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
-
-#  define CVT(_t) binop( Iop_F64toF32,                    \
-                         mkexpr(rmode),                   \
-                         unop(Iop_I32StoF64,mkexpr(_t)))
-
-   putYMMRegLane32F( rG, 7, CVT(t7) );
-   putYMMRegLane32F( rG, 6, CVT(t6) );
-   putYMMRegLane32F( rG, 5, CVT(t5) );
-   putYMMRegLane32F( rG, 4, CVT(t4) );
-   putYMMRegLane32F( rG, 3, CVT(t3) );
-   putYMMRegLane32F( rG, 2, CVT(t2) );
-   putYMMRegLane32F( rG, 1, CVT(t1) );
-   putYMMRegLane32F( rG, 0, CVT(t0) );
-#  undef CVT
+   putYMMReg(rG, binop(Iop_I32StoF32x8, mkexpr(rmode), mkexpr(argV)));
 
    return delta;
 }
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index 1536d81be9..e3a2c7206a 100644
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -530,6 +530,8 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) {
       case Asse_RCPF:   return "rcp";
       case Asse_RSQRTF: return "rsqrt";
       case Asse_SQRTF:  return "sqrt";
+      case Asse_I2F:    return "cvtdq2ps.";
+      case Asse_F2I:    return "cvtps2dq.";
       case Asse_AND:    return "and";
       case Asse_OR:     return "or";
       case Asse_XOR:    return "xor";
@@ -568,9 +570,11 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) {
       case Asse_SHL16:  return "psllw";
       case Asse_SHL32:  return "pslld";
       case Asse_SHL64:  return "psllq";
+      case Asse_SHL128: return "pslldq";
       case Asse_SHR16:  return "psrlw";
       case Asse_SHR32:  return "psrld";
       case Asse_SHR64:  return "psrlq";
+      case Asse_SHR128: return "psrldq";
       case Asse_SAR16:  return "psraw";
       case Asse_SAR32:  return "psrad";
      case Asse_PACKSSD: return "packssdw";
@@ -1643,7 +1647,9 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
          vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
          unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
                          || i->Ain.Sse32Fx4.op == Asse_RSQRTF
-                         || i->Ain.Sse32Fx4.op == Asse_SQRTF );
+                         || i->Ain.Sse32Fx4.op == Asse_SQRTF
+                         || i->Ain.Sse32Fx4.op == Asse_I2F
+                         || i->Ain.Sse32Fx4.op == Asse_F2I );
          addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
          addHRegUse(u, unary ? HRmWrite : HRmModify, i->Ain.Sse32Fx4.dst);
@@ -3648,6 +3654,10 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
 
    case Ain_Sse32Fx4:
       xtra = 0;
+      switch (i->Ain.Sse32Fx4.op) {
+         case Asse_F2I: *p++ = 0x66; break;
+         default: break;
+      }
       *p++ = clearWBit(
          rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32Fx4.dst),
                              vregEnc3210(i->Ain.Sse32Fx4.src) ));
@@ -3661,6 +3671,8 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
          case Asse_RCPF:   *p++ = 0x53; break;
          case Asse_RSQRTF: *p++ = 0x52; break;
          case Asse_SQRTF:  *p++ = 0x51; break;
+         case Asse_I2F:    *p++ = 0x5B; break; // cvtdq2ps; no 0x66 pfx
+         case Asse_F2I:    *p++ = 0x5B; break; // cvtps2dq; with 0x66 pfx
         case Asse_SUBF:   *p++ = 0x5C; break;
         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index e1715a0b46..c45229feb3 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -316,6 +316,9 @@ typedef
       Asse_CMPEQF, Asse_CMPLTF, Asse_CMPLEF, Asse_CMPUNF,
       /* Floating point unary */
       Asse_RCPF, Asse_RSQRTF, Asse_SQRTF,
+      /* Floating point conversion */
+      Asse_I2F,  // i32-signed to float conversion, aka cvtdq2ps in vec form
+      Asse_F2I,  // float to i32-signed conversion, aka cvtps2dq in vec form
       /* Bitwise */
       Asse_AND, Asse_OR, Asse_XOR, Asse_ANDN,
       Asse_ADD8, Asse_ADD16, Asse_ADD32, Asse_ADD64,
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 59fd75240a..486901cb45 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3688,6 +3688,18 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
         return dst;
      }
 
+      case Iop_I32StoF32x4:
+      case Iop_F32toI32Sx4: {
+         HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
+         HReg dst = newVRegV(env);
+         AMD64SseOp mop
+            = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I;
+         set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
+         addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst));
+         set_SSE_rounding_default(env);
+         return dst;
+      }
+
       default:
          break;
    } /* switch (e->Iex.Binop.op) */
@@ -4224,6 +4236,23 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
          return;
       }
 
+      case Iop_I32StoF32x8:
+      case Iop_F32toI32Sx8: {
+         HReg argHi, argLo;
+         iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2);
+         HReg dstHi = newVRegV(env);
+         HReg dstLo = newVRegV(env);
+         AMD64SseOp mop
+            = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I;
+         set_SSE_rounding_mode(env, e->Iex.Binop.arg1);
+         addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi));
+         addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo));
+         set_SSE_rounding_default(env);
+         *rHi = dstHi;
+         *rLo = dstLo;
+         return;
+      }
+
       default:
          break;
    } /* switch (e->Iex.Binop.op) */
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index ae1c203732..93de80f913 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -440,6 +440,9 @@ void ppIROp ( IROp op )
       case Iop_I32UtoFx4: vex_printf("I32UtoFx4"); return;
       case Iop_I32StoFx4: vex_printf("I32StoFx4"); return;
+      case Iop_I32StoF32x4: vex_printf("I32StoF32x4"); return;
+      case Iop_F32toI32Sx4: vex_printf("F32toI32Sx4"); return;
+
       case Iop_F32toF16x4: vex_printf("F32toF16x4"); return;
       case Iop_F16toF32x4: vex_printf("F16toF32x4"); return;
       case Iop_F16toF64x2: vex_printf("F16toF64x2"); return;
@@ -1237,6 +1240,8 @@ void ppIROp ( IROp op )
       case Iop_Sub32Fx8:  vex_printf("Sub32Fx8"); return;
       case Iop_Mul32Fx8:  vex_printf("Mul32Fx8"); return;
       case Iop_Div32Fx8:  vex_printf("Div32Fx8"); return;
+      case Iop_I32StoF32x8: vex_printf("I32StoF32x8"); return;
+      case Iop_F32toI32Sx8: vex_printf("F32toI32Sx8"); return;
       case Iop_AndV256: vex_printf("AndV256"); return;
       case Iop_OrV256:  vex_printf("OrV256"); return;
       case Iop_XorV256: vex_printf("XorV256"); return;
@@ -2990,6 +2995,8 @@ void typeOfPrimop ( IROp op,
 
       case Iop_Sqrt64Fx2:
       case Iop_Sqrt32Fx4:
+      case Iop_I32StoF32x4:
+      case Iop_F32toI32Sx4:
         BINARY(ity_RMode,Ity_V128, Ity_V128);
 
       case Iop_64HLtoV128:
@@ -3579,6 +3586,10 @@ void typeOfPrimop ( IROp op,
       case Iop_Perm32x8:
          BINARY(Ity_V256,Ity_V256, Ity_V256);
 
+      case Iop_I32StoF32x8:
+      case Iop_F32toI32Sx8:
+         BINARY(ity_RMode,Ity_V256, Ity_V256);
+
       case Iop_V256toV128_1: case Iop_V256toV128_0:
          UNARY(Ity_V256, Ity_V128);
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index 459d14b6c6..f8ba2c7cd5 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -1387,7 +1387,13 @@ typedef
      /* Unlike the standard fp conversions, these irops take no
         rounding mode argument.  Instead the irop trailers _R{M,P,N,Z}
         indicate the mode: {-inf, +inf, nearest, zero} respectively. */
+
+      // FIXME These carry no rounding mode
       Iop_I32UtoFx4, Iop_I32StoFx4,         /* I32x4 -> F32x4 */
+
+      Iop_I32StoF32x4, /* IRRoundingMode(I32) x V128 -> V128 */
+      Iop_F32toI32Sx4, /* IRRoundingMode(I32) x V128 -> V128 */
+
       Iop_FtoI32Ux4_RZ, Iop_FtoI32Sx4_RZ,   /* F32x4 -> I32x4 */
       Iop_QFtoI32Ux4_RZ, Iop_QFtoI32Sx4_RZ, /* F32x4 -> I32x4 (saturating) */
       Iop_RoundF32x4_RM, Iop_RoundF32x4_RP, /* round to fp integer */
@@ -1400,12 +1406,12 @@ typedef
 
       /* --- Single to/from half conversion --- */
       /* FIXME: what kind of rounding in F32x4 -> F16x4 case? */
+      // FIXME these carry no rounding mode
       Iop_F32toF16x4, Iop_F16toF32x4,         /* F32x4 <-> F16x4 */
-
-      /* -- Double to/from half conversion -- */
-      Iop_F64toF16x2, Iop_F16toF64x2,
 
+      /* -- Double to/from half conversion -- */
+      Iop_F64toF16x2, // FIXME this carries no rounding mode (?)
+      Iop_F16toF64x2,
 
       /* Values from two registers converted in smaller type and put in one
          IRRoundingMode(I32) x (F32x4 | F32x4) -> Q16x8 */
@@ -1957,6 +1963,9 @@ typedef
       Iop_Add64Fx4, Iop_Sub64Fx4, Iop_Mul64Fx4, Iop_Div64Fx4,
       Iop_Add32Fx8, Iop_Sub32Fx8, Iop_Mul32Fx8, Iop_Div32Fx8,
 
+      Iop_I32StoF32x8, /* IRRoundingMode(I32) x V256 -> V256 */
+      Iop_F32toI32Sx8, /* IRRoundingMode(I32) x V256 -> V256 */
+
       Iop_Sqrt32Fx8,
       Iop_Sqrt64Fx4,
       Iop_RSqrtEst32Fx8,
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 6e449e2c96..c6ac3a5f1a 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -2810,7 +2810,7 @@ IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
 static
 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
 {
-   /* Same scheme as unary32Fx4_w_rm. */
+   /* Same scheme as binaryFx4_w_rm. */
    IRAtom* t1 = unary32Fx4(mce, vatomX);
    // PCast the RM, and widen it to 128 bits
    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
@@ -2819,6 +2819,20 @@ IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
    return t1;
 }
 
+/* --- ... and ... 32Fx8 versions of the same --- */
+
+static
+IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
+{
+   /* Same scheme as unary32Fx8_w_rm. */
+   IRAtom* t1 = unary32Fx8(mce, vatomX);
+   // PCast the RM, and widen it to 256 bits
+   IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
+   // Roll it into the result
+   t1 = mkUifUV256(mce, t1, t2);
+   return t1;
+}
+
 
 /* --- --- Vector saturated narrowing --- --- */
@@ -3665,6 +3679,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
 
       /* V128-bit SIMD */
 
+      case Iop_I32StoF32x4:
+      case Iop_F32toI32Sx4:
       case Iop_Sqrt32Fx4:
          return unary32Fx4_w_rm(mce, vatom1, vatom2);
       case Iop_Sqrt64Fx2:
@@ -4743,9 +4759,13 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
       case Iop_CmpGT64Sx4:
          return binary64Ix4(mce, vatom1, vatom2);
 
-      /* Perm32x8: rearrange values in left arg using steering values
-         from right arg.  So rearrange the vbits in the same way but
-         pessimise wrt steering values. */
+      case Iop_I32StoF32x8:
+      case Iop_F32toI32Sx8:
+         return unary32Fx8_w_rm(mce, vatom1, vatom2);
+
+      /* Perm32x8: rearrange values in left arg using steering values
+         from right arg.  So rearrange the vbits in the same way but
+         pessimise wrt steering values. */
       case Iop_Perm32x8:
          return mkUifUV256(
                    mce,
diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c
index 66b40ef00f..1047fa354c 100644
--- a/memcheck/tests/vbit-test/irops.c
+++ b/memcheck/tests/vbit-test/irops.c
@@ -640,6 +640,8 @@ static irop_t irops[] = {
   { DEFOP(Iop_RSqrtStep32Fx4, UNDEF_UNKNOWN), },
   { DEFOP(Iop_I32UtoFx4, UNDEF_UNKNOWN), },
   { DEFOP(Iop_I32StoFx4, UNDEF_UNKNOWN), },
+  { DEFOP(Iop_I32StoF32x4, UNDEF_UNKNOWN), },
+  { DEFOP(Iop_F32toI32Sx4, UNDEF_UNKNOWN), },
   { DEFOP(Iop_FtoI32Ux4_RZ, UNDEF_UNKNOWN), },
   { DEFOP(Iop_FtoI32Sx4_RZ, UNDEF_UNKNOWN), },
   { DEFOP(Iop_QFtoI32Ux4_RZ, UNDEF_UNKNOWN), },
@@ -1123,6 +1125,8 @@ static irop_t irops[] = {
   { DEFOP(Iop_Sub32Fx8, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Mul32Fx8, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Div32Fx8, UNDEF_UNKNOWN), },
+  { DEFOP(Iop_I32StoF32x8, UNDEF_UNKNOWN), },
+  { DEFOP(Iop_F32toI32Sx8, UNDEF_UNKNOWN), },
   { DEFOP(Iop_Sqrt32Fx8, UNDEF_UNKNOWN), },
  { DEFOP(Iop_Sqrt64Fx4, UNDEF_UNKNOWN), },
  { DEFOP(Iop_RSqrtEst32Fx8, UNDEF_UNKNOWN), },
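For readers skimming the patch, the essence of the front-end change is visible
in the dis_CVTxPS2DQ_128 hunk above: one rounding-mode-carrying vector IROp
replaces four per-lane scalar conversions, and host_amd64_isel.c then selects
that IROp into a single cvtps2dq (Asse_F2I) or cvtdq2ps (Asse_I2F).  The
snippet below is a condensed restatement of those hunks, not additional code
from the commit.

    /* Before: each 32-bit lane reinterpreted as F32, widened to F64,
       converted with a scalar op and written back -- four conversions per
       guest instruction (eight for the 256-bit form): */
    putXMMRegLane32( rG, 0,
       binop( Iop_F64toI32S, mkexpr(rmode),
              unop( Iop_F32toF64,
                    unop( Iop_ReinterpI32asF32, mkexpr(t0) ) ) ) );
    /* ...repeated for lanes 1, 2 and 3... */

    /* After: one vector IROp covering the whole register, which the amd64
       backend emits as a single instruction: */
    putXMMReg( rG, binop( Iop_F32toI32Sx4, mkexpr(rmode), mkexpr(argV) ) );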