From: Julian Seward Date: Sat, 22 Dec 2018 17:04:42 +0000 (+0100) Subject: amd64 back end: generate better code for 2x64<-->V128 and 4x64<-->V256 transfers .. X-Git-Tag: VALGRIND_3_15_0~117 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b17d5ffdb844cf081c86d7df9489f61b4392ca47;p=thirdparty%2Fvalgrind.git amd64 back end: generate better code for 2x64<-->V128 and 4x64<-->V256 transfers .. .. by adding support for MOVQ xmm/ireg and using that to implement 64HLtoV128, 4x64toV256 and their inverses. This reduces the number of instructions, removes the use of memory as an intermediary, and avoids store-forwarding stalls. --- diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c index e3a2c7206a..8e55197444 100644 --- a/VEX/priv/host_amd64_defs.c +++ b/VEX/priv/host_amd64_defs.c @@ -1020,6 +1020,16 @@ AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op, i->Ain.SseShiftN.dst = dst; return i; } +AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) { + AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr)); + i->tag = Ain_SseMOVQ; + i->Ain.SseMOVQ.gpr = gpr; + i->Ain.SseMOVQ.xmm = xmm; + i->Ain.SseMOVQ.toXMM = toXMM; + vassert(hregClass(gpr) == HRcInt64); + vassert(hregClass(xmm) == HRcVec128); + return i; +} //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, //uu HReg reg, AMD64AMode* addr ) { //uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr)); @@ -1377,6 +1387,18 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 ) i->Ain.SseShiftN.shiftBits); ppHRegAMD64(i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + vex_printf("movq "); + if (i->Ain.SseMOVQ.toXMM) { + ppHRegAMD64(i->Ain.SseMOVQ.gpr); + vex_printf(","); + ppHRegAMD64(i->Ain.SseMOVQ.xmm); + } else { + ppHRegAMD64(i->Ain.SseMOVQ.xmm); + vex_printf(","); + ppHRegAMD64(i->Ain.SseMOVQ.gpr); + }; + return; //uu case Ain_AvxLdSt: //uu vex_printf("vmovups "); //uu if (i->Ain.AvxLdSt.isLoad) { @@ -1714,6 +1736,12 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) case Ain_SseShiftN: addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmRead : HRmWrite, + i->Ain.SseMOVQ.gpr); + addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmWrite : HRmRead, + i->Ain.SseMOVQ.xmm); + return; //uu case Ain_AvxLdSt: //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr); //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead, @@ -1932,6 +1960,10 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 ) case Ain_SseShiftN: mapReg(m, &i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + mapReg(m, &i->Ain.SseMOVQ.gpr); + mapReg(m, &i->Ain.SseMOVQ.xmm); + return; //uu case Ain_AvxLdSt: //uu mapReg(m, &i->Ain.AvxLdSt.reg); //uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr); @@ -2301,6 +2333,11 @@ static inline UChar clearWBit ( UChar rex ) return rex & ~(1<<3); } +static inline UChar setWBit ( UChar rex ) +{ + return rex | (1<<3); +} + /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */ inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am ) @@ -3914,6 +3951,18 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, goto done; } + case Ain_SseMOVQ: { + Bool toXMM = i->Ain.SseMOVQ.toXMM; + HReg gpr = i->Ain.SseMOVQ.gpr; + HReg xmm = i->Ain.SseMOVQ.xmm; + *p++ = 0x66; + *p++ = setWBit( rexAMode_R_enc_enc( vregEnc3210(xmm), iregEnc3210(gpr)) ); + *p++ = 0x0F; + *p++ = toXMM ? 
0x6E : 0x7E; + p = doAMode_R_enc_enc( p, vregEnc3210(xmm), iregEnc3210(gpr) ); + goto done; + } + //uu case Ain_AvxLdSt: { //uu UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg), //uu i->Ain.AvxLdSt.addr ); diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h index c45229feb3..64bd810247 100644 --- a/VEX/priv/host_amd64_defs.h +++ b/VEX/priv/host_amd64_defs.h @@ -404,6 +404,7 @@ typedef Ain_SseCMov, /* SSE conditional move */ Ain_SseShuf, /* SSE2 shuffle (pshufd) */ Ain_SseShiftN, /* SSE2 shift by immediate */ + Ain_SseMOVQ, /* SSE2 moves of xmm[63:0] to/from GPR */ //uu Ain_AvxLdSt, /* AVX load/store 256 bits, //uu no alignment constraints */ //uu Ain_AvxReRg, /* AVX binary general reg-reg, Re, Rg */ @@ -704,6 +705,11 @@ typedef UInt shiftBits; HReg dst; } SseShiftN; + struct { + HReg gpr; + HReg xmm; + Bool toXMM; // when moving to xmm, xmm[127:64] is zeroed out + } SseMOVQ; //uu struct { //uu Bool isLoad; //uu HReg reg; @@ -784,6 +790,7 @@ extern AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode, HReg src, HReg dst ); extern AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ); extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp, UInt shiftBits, HReg dst ); +extern AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ); //uu extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* ); //uu extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg ); extern AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter, diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index 486901cb45..e67edc5bd9 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -91,7 +91,7 @@ static IRExpr* bind ( Int binder ) return IRExpr_Binder(binder); } -static Bool isZeroU8 ( IRExpr* e ) +static Bool isZeroU8 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U8 @@ -291,20 +291,32 @@ static Bool fitsIn32Bits ( ULong x ) /* Is this a 64-bit zero expression? */ -static Bool isZeroU64 ( IRExpr* e ) +static Bool isZeroU64 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U64 && e->Iex.Const.con->Ico.U64 == 0ULL; } -static Bool isZeroU32 ( IRExpr* e ) +static Bool isZeroU32 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U32 && e->Iex.Const.con->Ico.U32 == 0; } +/* Are both args atoms and the same? This is copy of eqIRAtom + that omits the assertions that the args are indeed atoms. */ + +static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 ) +{ + if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp) + return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp); + if (a1->tag == Iex_Const && a2->tag == Iex_Const) + return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con); + return False; +} + /* Make a int reg-reg move. */ static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst ) @@ -1609,44 +1621,47 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) } /* V128{HI}to64 */ - case Iop_V128HIto64: case Iop_V128to64: { HReg dst = newVRegI(env); - Int off = e->Iex.Unop.op==Iop_V128HIto64 ? 
-8 : -16; - HReg rsp = hregAMD64_RSP(); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp); - addInstr(env, AMD64Instr_SseLdSt(False/*store*/, - 16, vec, m16_rsp)); - addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, - AMD64RMI_Mem(off_rsp), dst )); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/)); + return dst; + } + case Iop_V128HIto64: { + HReg dst = newVRegI(env); + HReg vec = iselVecExpr(env, e->Iex.Unop.arg); + HReg vec2 = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(vec, vec2)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2)); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/)); return dst; } + /* V256to64_{3,2,1,0} */ case Iop_V256to64_0: case Iop_V256to64_1: case Iop_V256to64_2: case Iop_V256to64_3: { HReg vHi, vLo, vec; iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg); /* Do the first part of the selection by deciding which of - the 128 bit registers do look at, and second part using + the 128 bit registers to look at, and second part using the same scheme as for V128{HI}to64 above. */ - Int off = 0; + Bool low64of128 = True; switch (e->Iex.Unop.op) { - case Iop_V256to64_0: vec = vLo; off = -16; break; - case Iop_V256to64_1: vec = vLo; off = -8; break; - case Iop_V256to64_2: vec = vHi; off = -16; break; - case Iop_V256to64_3: vec = vHi; off = -8; break; + case Iop_V256to64_0: vec = vLo; low64of128 = True; break; + case Iop_V256to64_1: vec = vLo; low64of128 = False; break; + case Iop_V256to64_2: vec = vHi; low64of128 = True; break; + case Iop_V256to64_3: vec = vHi; low64of128 = False; break; default: vassert(0); } - HReg dst = newVRegI(env); - HReg rsp = hregAMD64_RSP(); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp); - addInstr(env, AMD64Instr_SseLdSt(False/*store*/, - 16, vec, m16_rsp)); - addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, - AMD64RMI_Mem(off_rsp), dst )); + HReg dst = newVRegI(env); + if (low64of128) { + addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/)); + } else { + HReg vec2 = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(vec, vec2)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2)); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/)); + } return dst; } @@ -3355,16 +3370,26 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) } case Iop_64HLtoV128: { - HReg rsp = hregAMD64_RSP(); - AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64RI* qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1); - AMD64RI* qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp)); - HReg dst = newVRegV(env); - /* One store-forwarding stall coming up, oh well :-( */ - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp)); + const IRExpr* arg1 = e->Iex.Binop.arg1; + const IRExpr* arg2 = e->Iex.Binop.arg2; + HReg dst = newVRegV(env); + HReg tmp = newVRegV(env); + HReg qHi = iselIntExpr_R(env, arg1); + // If the args are trivially the same (tmp or const), use the same + // source register for both, and only one movq since those are + // (relatively) expensive. 
+ if (areAtomsAndEqual(arg1, arg2)) { + addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/)); + addInstr(env, mk_vMOVsd_RR(dst, tmp)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + } else { + HReg qLo = iselIntExpr_R(env, arg2); + addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + } return dst; } @@ -4071,6 +4096,9 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, } case Iop_V128HLtoV256: { + // Curiously, there doesn't seem to be any benefit to be had here by + // checking whether arg1 and arg2 are the same, in the style of how + // (eg) 64HLtoV128 is handled elsewhere in this file. *rHi = iselVecExpr(env, e->Iex.Binop.arg1); *rLo = iselVecExpr(env, e->Iex.Binop.arg2); return; @@ -4313,27 +4341,44 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) { - HReg rsp = hregAMD64_RSP(); - HReg vHi = newVRegV(env); - HReg vLo = newVRegV(env); - AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - /* arg1 is the most significant (Q3), arg4 the least (Q0) */ - /* Get all the args into regs, before messing with the stack. */ - AMD64RI* q3 = iselIntExpr_RI(env, e->Iex.Qop.details->arg1); - AMD64RI* q2 = iselIntExpr_RI(env, e->Iex.Qop.details->arg2); - AMD64RI* q1 = iselIntExpr_RI(env, e->Iex.Qop.details->arg3); - AMD64RI* q0 = iselIntExpr_RI(env, e->Iex.Qop.details->arg4); - /* less significant lane (Q2) at the lower address (-16(rsp)) */ - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp)); - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp)); - /* and then the lower half .. */ - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp)); - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp)); - *rHi = vHi; - *rLo = vLo; + const IRExpr* arg1 = e->Iex.Qop.details->arg1; + const IRExpr* arg2 = e->Iex.Qop.details->arg2; + const IRExpr* arg3 = e->Iex.Qop.details->arg3; + const IRExpr* arg4 = e->Iex.Qop.details->arg4; + // If the args are trivially the same (tmp or const), use the same + // source register for all four, and only one movq since those are + // (relatively) expensive. 
+ if (areAtomsAndEqual(arg1, arg2) + && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) { + HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1); + HReg tmp = newVRegV(env); + HReg dst = newVRegV(env); + addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/)); + addInstr(env, mk_vMOVsd_RR(dst, tmp)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + *rHi = dst; + *rLo = dst; + } else { + /* arg1 is the most significant (Q3), arg4 the least (Q0) */ + HReg q3 = iselIntExpr_R(env, arg1); + HReg q2 = iselIntExpr_R(env, arg2); + HReg q1 = iselIntExpr_R(env, arg3); + HReg q0 = iselIntExpr_R(env, arg4); + HReg tmp = newVRegV(env); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi)); + addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi)); + addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo)); + addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo)); + *rHi = dstHi; + *rLo = dstLo; + } return; }
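
[Editor's sketch, not part of the patch.] The Ain_SseMOVQ emit case above boils down to the fixed encoding 66 REX.W 0F 6E/7E with the xmm register in the ModRM reg field and the GPR in the r/m field. The standalone C sketch below reproduces that byte layout for illustration; the helper names (rex_rr, modrm_rr, emit_movq_gpr_xmm) are invented here and are not VEX functions.

   #include <stdio.h>

   typedef unsigned char UChar;

   /* REX prefix for a reg-reg ModRM, with W=1 (64-bit operand size).
      Bit 2 (R) extends the reg-field operand, bit 0 (B) the r/m operand. */
   static UChar rex_rr ( unsigned gregEnc, unsigned eregEnc )
   {
      return 0x48 | (((gregEnc >> 3) & 1) << 2) | ((eregEnc >> 3) & 1);
   }

   /* ModRM byte with mod=11 (register-direct). */
   static UChar modrm_rr ( unsigned gregEnc, unsigned eregEnc )
   {
      return 0xC0 | ((gregEnc & 7) << 3) | (eregEnc & 7);
   }

   /* MOVQ between a 64-bit GPR and xmm[63:0], mirroring Ain_SseMOVQ:
      opcode 0x6E moves gpr -> xmm (zeroing xmm[127:64]),
      opcode 0x7E moves xmm[63:0] -> gpr.  Returns the number of bytes. */
   static int emit_movq_gpr_xmm ( UChar* p, unsigned xmmEnc, unsigned gprEnc,
                                  int toXMM )
   {
      int n = 0;
      p[n++] = 0x66;                     /* operand-size prefix */
      p[n++] = rex_rr(xmmEnc, gprEnc);   /* xmm in reg field, gpr in r/m */
      p[n++] = 0x0F;
      p[n++] = toXMM ? 0x6E : 0x7E;
      p[n++] = modrm_rr(xmmEnc, gprEnc);
      return n;
   }

   int main ( void )
   {
      UChar buf[16];
      /* movq %rdi, %xmm0 : expect 66 48 0f 6e c7 */
      int n = emit_movq_gpr_xmm(buf, 0/*xmm0*/, 7/*rdi*/, 1/*toXMM*/);
      for (int k = 0; k < n; k++) printf("%02x ", buf[k]);
      printf("\n");
      return 0;
   }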
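
[Editor's sketch, not part of the patch.] For Iop_64HLtoV128 with distinct arguments, the new selection emits movq, a whole-XMM left shift by 64 bits, movq, por, instead of two 64-bit stores to the stack followed by a 128-bit reload, which is where the store-forwarding stall came from. Modelling the XMM register as unsigned __int128 (a GCC/Clang extension), the sequence computes the following; mk_64HLtoV128 is a made-up name used only for this illustration.

   #include <stdio.h>

   typedef unsigned long long ULong;
   typedef unsigned __int128  V128;

   static V128 mk_64HLtoV128 ( ULong qHi, ULong qLo )
   {
      V128 dst = qHi;   /* SseMOVQ: dst[63:0] = qHi, dst[127:64] = 0 */
      dst <<= 64;       /* SseShiftN Asse_SHL128 by 64               */
      V128 tmp = qLo;   /* SseMOVQ: tmp[63:0] = qLo, tmp[127:64] = 0 */
      dst |= tmp;       /* SseReRg Asse_OR                           */
      return dst;       /* = qHi:qLo                                 */
   }

   int main ( void )
   {
      V128 v = mk_64HLtoV128(0x1122334455667788ULL, 0x99aabbccddeeff00ULL);
      printf("%016llx%016llx\n", (ULong)(v >> 64), (ULong)v);
      return 0;
   }

When both arguments are the same atom, the patch instead issues a single movq, copies the register, shifts one copy left by 64 and ORs, saving one GPR-to-XMM transfer; the Iop_64x4toV256 case applies the same scheme twice to build the two 128-bit halves.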