From: Julian Seward
Date: Sat, 22 Dec 2018 12:34:11 +0000 (+0100)
Subject: amd64 back end: generate better code for 128/256 bit vector shifts by immediate....
X-Git-Tag: VALGRIND_3_15_0~119
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=901f3d3813c551b18a34ca5a52e3d9393524544c;p=thirdparty%2Fvalgrind.git

amd64 back end: generate better code for 128/256 bit vector shifts by immediate.

n-i-bz.
---

diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index 48ca268ab0..1536d81be9 100644
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -1007,6 +1007,15 @@ AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
    vassert(order >= 0 && order <= 0xFF);
    return i;
 }
+AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op,
+                                   UInt shiftBits, HReg dst ) {
+   AMD64Instr* i              = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag                     = Ain_SseShiftN;
+   i->Ain.SseShiftN.op        = op;
+   i->Ain.SseShiftN.shiftBits = shiftBits;
+   i->Ain.SseShiftN.dst       = dst;
+   return i;
+}
 //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
 //uu                                  HReg reg, AMD64AMode* addr ) {
 //uu    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
@@ -1359,6 +1368,11 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
          vex_printf(",");
          ppHRegAMD64(i->Ain.SseShuf.dst);
          return;
+      case Ain_SseShiftN:
+         vex_printf("%s $%u, ", showAMD64SseOp(i->Ain.SseShiftN.op),
+                    i->Ain.SseShiftN.shiftBits);
+         ppHRegAMD64(i->Ain.SseShiftN.dst);
+         return;
       //uu case Ain_AvxLdSt:
       //uu    vex_printf("vmovups ");
       //uu    if (i->Ain.AvxLdSt.isLoad) {
@@ -1691,6 +1705,9 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
         addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
         return;
+      case Ain_SseShiftN:
+         addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst);
+         return;
       //uu case Ain_AvxLdSt:
       //uu    addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
       //uu    addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
@@ -1906,6 +1923,9 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
          mapReg(m, &i->Ain.SseShuf.src);
          mapReg(m, &i->Ain.SseShuf.dst);
          return;
+      case Ain_SseShiftN:
+         mapReg(m, &i->Ain.SseShiftN.dst);
+         return;
       //uu case Ain_AvxLdSt:
       //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
       //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
@@ -3840,6 +3860,48 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
       *p++ = (UChar)(i->Ain.SseShuf.order);
       goto done;
 
+   case Ain_SseShiftN: {
+      opc = 0; // invalid
+      subopc_imm = 0; // invalid
+      UInt limit = 0;
+      UInt shiftImm = i->Ain.SseShiftN.shiftBits;
+      switch (i->Ain.SseShiftN.op) {
+         case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break;
+         case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break;
+         case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break;
+         case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break;
+         case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break;
+         case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break;
+         case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break;
+         case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break;
+         case Asse_SHL128:
+            if ((shiftImm & 7) != 0) goto bad;
+            shiftImm >>= 3;
+            limit = 15; opc = 0x73; subopc_imm = 7;
+            break;
+         case Asse_SHR128:
+            if ((shiftImm & 7) != 0) goto bad;
+            shiftImm >>= 3;
+            limit = 15; opc = 0x73; subopc_imm = 3;
+            break;
+         default:
+            // This should never happen .. SSE2 only offers the above 10 insns
+            // for the "shift with immediate" case
+            goto bad;
+      }
+      vassert(limit > 0 && opc > 0 && subopc_imm > 0);
+      if (shiftImm > limit) goto bad;
+      *p++ = 0x66;
+      *p++ = clearWBit(
+             rexAMode_R_enc_enc( subopc_imm,
+                                 vregEnc3210(i->Ain.SseShiftN.dst) ));
+      *p++ = 0x0F;
+      *p++ = opc;
+      p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
+      *p++ = shiftImm;
+      goto done;
+   }
+
   //uu case Ain_AvxLdSt: {
   //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
   //uu                           i->Ain.AvxLdSt.addr );
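
For reference, the ten immediate forms handled above share one byte layout: a 0x66
operand-size prefix, the computed REX byte with its W bit cleared, 0x0F, an opcode
byte of 0x71, 0x72 or 0x73 chosen by lane width, a ModRM byte whose reg field
carries the /2, /3, /4, /6 or /7 sub-opcode and whose r/m field names the xmm
register, and finally the 8-bit immediate (for Asse_SHL128/Asse_SHR128 the bit
count is first converted to a byte count). The standalone sketch below is
illustrative only: the helper name is made up, and it emits a REX byte only when
an extended register forces one, whereas the case above always emits the computed
REX.

   #include <assert.h>
   #include <stddef.h>

   typedef unsigned char UChar;
   typedef unsigned int  UInt;

   /* Illustrative: assemble an SSE2 shift-by-immediate on xmm<xmmReg> using
      the same scheme as the Ain_SseShiftN case:
      66 [REX] 0F <opc> ModRM(mod=11, reg=subopc, rm=xmmReg) imm8.
      Returns the number of bytes written. */
   static size_t emit_sse_shift_imm ( UChar* p, UInt opc, UInt subopc,
                                      UInt xmmReg, UInt imm8 )
   {
      size_t n = 0;
      assert(xmmReg <= 15 && imm8 <= 255);
      p[n++] = 0x66;
      if (xmmReg >= 8)
         p[n++] = 0x41;                  /* REX.B, needed for xmm8..xmm15 */
      p[n++] = 0x0F;
      p[n++] = (UChar)opc;
      p[n++] = (UChar)(0xC0 | (subopc << 3) | (xmmReg & 7));  /* mod=11 */
      p[n++] = (UChar)imm8;
      return n;
   }

   int main ( void )
   {
      UChar buf[8];
      /* psrld $13, %xmm2  ->  66 0F 72 D2 0D  (opc 0x72, sub-opcode /2) */
      size_t n = emit_sse_shift_imm(buf, 0x72, 2, 2, 13);
      assert(n == 5 && buf[0] == 0x66 && buf[1] == 0x0F && buf[2] == 0x72
             && buf[3] == 0xD2 && buf[4] == 0x0D);
      return 0;
   }

Compiling and running this is a quick way to sanity-check the opcode and
sub-opcode table above against an assembler's output.
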
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index 6a72943f95..e1715a0b46 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -334,8 +334,8 @@ typedef
       Asse_MIN8U,
       Asse_CMPEQ8, Asse_CMPEQ16, Asse_CMPEQ32,
       Asse_CMPGT8S, Asse_CMPGT16S, Asse_CMPGT32S,
-      Asse_SHL16, Asse_SHL32, Asse_SHL64,
-      Asse_SHR16, Asse_SHR32, Asse_SHR64,
+      Asse_SHL16, Asse_SHL32, Asse_SHL64, Asse_SHL128,
+      Asse_SHR16, Asse_SHR32, Asse_SHR64, Asse_SHR128,
       Asse_SAR16, Asse_SAR32,
       Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
       Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
@@ -400,6 +400,7 @@ typedef
       Ain_SseReRg,     /* SSE binary general reg-reg, Re, Rg */
       Ain_SseCMov,     /* SSE conditional move */
       Ain_SseShuf,     /* SSE2 shuffle (pshufd) */
+      Ain_SseShiftN,   /* SSE2 shift by immediate */
       //uu Ain_AvxLdSt,     /* AVX load/store 256 bits,
       //uu                     no alignment constraints */
       //uu Ain_AvxReRg,     /* AVX binary general reg-reg, Re, Rg */
@@ -695,6 +696,11 @@ typedef
             HReg   src;
             HReg   dst;
          } SseShuf;
+         struct {
+            AMD64SseOp op;
+            UInt       shiftBits;
+            HReg       dst;
+         } SseShiftN;
          //uu struct {
          //uu    Bool    isLoad;
         //uu    HReg    reg;
@@ -773,6 +779,8 @@ extern AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_SseReRg   ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_SseCMov   ( AMD64CondCode, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_SseShuf   ( Int order, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp,
+                                          UInt shiftBits, HReg dst );
 //uu extern AMD64Instr* AMD64Instr_AvxLdSt  ( Bool isLoad, HReg, AMD64AMode* );
 //uu extern AMD64Instr* AMD64Instr_AvxReRg  ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_EvCheck   ( AMD64AMode* amCounter,
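
Since the new instruction's destination is both read and written (HRmModify in
the register-usage change above), the expected pattern is copy-then-shift, which
is exactly what the instruction-selection changes further below do. A
hypothetical fragment, assuming the usual VEX isel context (ISelEnv* env,
addInstr, newVRegV, mk_vMOVsd_RR as used in host_amd64_isel.c) and an existing
V128 vreg named src; src and dst are illustrative names, not part of the patch:

   HReg dst = newVRegV(env);
   addInstr(env, mk_vMOVsd_RR(src, dst));                     /* dst := src */
   addInstr(env, AMD64Instr_SseShiftN(Asse_SHL32, 5, dst));   /* each 32-bit
                                                                 lane left by 5 */
   /* Asse_SHL128/Asse_SHR128 shift the whole register; shiftBits is still
      given in bits and must be a multiple of 8, which the emitter converts
      to a pslldq/psrldq byte count. */
   addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dst)); /* right by 8 bytes */
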
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 7974c80364..59fd75240a 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3135,9 +3135,10 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
    HWord      fn = 0; /* address of helper fn, if required */
    Bool       arg1isEReg = False;
    AMD64SseOp op = Asse_INVALID;
-   IRType     ty = typeOfIRExpr(env->type_env,e);
    vassert(e);
+   IRType     ty = typeOfIRExpr(env->type_env, e);
    vassert(ty == Ity_V128);
+   UInt       laneBits = 0;
 
    if (e->tag == Iex_RdTmp) {
       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
@@ -3521,20 +3522,33 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
          return dst;
       }
 
-      case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
-      case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
-      case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
-      case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
-      case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
-      case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
-      case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
-      case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
+      case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+      case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+      case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+      case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+      case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+      case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+      case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+      case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
       do_SseShift: {
-         HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
+         HReg dst  = newVRegV(env);
+         HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+         /* If it's a shift by an in-range immediate, generate a single
+            instruction. */
+         if (e->Iex.Binop.arg2->tag == Iex_Const) {
+            IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+            vassert(c->tag == Ico_U8);
+            UInt shift = c->Ico.U8;
+            if (shift < laneBits) {
+               addInstr(env, mk_vMOVsd_RR(greg, dst));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
+               return dst;
+            }
+         }
+         /* Otherwise we have to do it the longwinded way. */
          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
          HReg        ereg = newVRegV(env);
-         HReg        dst  = newVRegV(env);
          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
         addInstr(env, AMD64Instr_Push(rmi));
         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
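
In the V256 changes below, a 256-bit value is carried as two 128-bit halves
(the rHi/rLo pair of iselDVecExpr_wrk), so a lane-wise shift by an immediate
becomes one copy plus one SseShiftN per half. A small standalone C model, not
VEX code, of why that per-half treatment is sound for lane shifts:

   #include <assert.h>
   #include <stdint.h>

   /* Model a V256 of 32-bit lanes both as one 8-lane vector and as two
      4-lane halves; a per-lane shift distributes over the halves. */
   static void shl32x4 ( uint32_t lane[4], unsigned n )
   {
      for (int i = 0; i < 4; i++) lane[i] <<= n;
   }

   int main ( void )
   {
      uint32_t v[8]  = {1,2,3,4,5,6,7,8};             /* whole V256         */
      uint32_t lo[4] = {1,2,3,4}, hi[4] = {5,6,7,8};  /* same value, split  */
      for (int i = 0; i < 8; i++) v[i] <<= 5;         /* Iop_ShlN32x8 by 5  */
      shl32x4(lo, 5);                                 /* per-half shifts    */
      shl32x4(hi, 5);
      for (int i = 0; i < 4; i++) {
         assert(v[i]     == lo[i]);
         assert(v[4 + i] == hi[i]);
      }
      return 0;
   }
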
@@ -3762,8 +3776,9 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
 {
    HWord fn = 0; /* address of helper fn, if required */
    vassert(e);
-   IRType ty = typeOfIRExpr(env->type_env,e);
+   IRType ty = typeOfIRExpr(env->type_env, e);
    vassert(ty == Ity_V256);
+   UInt laneBits = 0;
 
    AMD64SseOp op = Asse_INVALID;
@@ -3997,22 +4012,39 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
          return;
       }
 
-      case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
-      case Iop_ShlN32x8:  op = Asse_SHL32; goto do_SseShift;
-      case Iop_ShlN64x4:  op = Asse_SHL64; goto do_SseShift;
-      case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
-      case Iop_SarN32x8:  op = Asse_SAR32; goto do_SseShift;
-      case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
-      case Iop_ShrN32x8:  op = Asse_SHR32; goto do_SseShift;
-      case Iop_ShrN64x4:  op = Asse_SHR64; goto do_SseShift;
+      case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+      case Iop_ShlN32x8:  laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+      case Iop_ShlN64x4:  laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+      case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+      case Iop_SarN32x8:  laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+      case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+      case Iop_ShrN32x8:  laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+      case Iop_ShrN64x4:  laneBits = 64; op = Asse_SHR64; goto do_SseShift;
       do_SseShift: {
+         HReg dstHi = newVRegV(env);
+         HReg dstLo = newVRegV(env);
          HReg gregHi, gregLo;
          iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
+         /* If it's a shift by an in-range immediate, generate two single
+            instructions. */
+         if (e->Iex.Binop.arg2->tag == Iex_Const) {
+            IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+            vassert(c->tag == Ico_U8);
+            UInt shift = c->Ico.U8;
+            if (shift < laneBits) {
+               addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
+               addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
+               *rHi = dstHi;
+               *rLo = dstLo;
+               return;
+            }
+         }
+         /* Otherwise we have to do it the longwinded way. */
          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
          HReg        ereg = newVRegV(env);
-         HReg        dstHi = newVRegV(env);
-         HReg        dstLo = newVRegV(env);
          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
         addInstr(env, AMD64Instr_Push(rmi));
         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
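
Both instruction selectors use the same guard: the single-instruction immediate
form is chosen exactly when the shift amount is a known Ico_U8 constant smaller
than the lane width; everything else falls back to the original sequence that
pushes the amount and reloads it into an xmm register. A standalone restatement
of that rule (plain C; use_immediate_form is an illustrative name, not a VEX
function):

   #include <assert.h>
   #include <stdbool.h>

   typedef unsigned int UInt;

   /* Illustrative model of the new fast-path test in do_SseShift. */
   static bool use_immediate_form ( bool amountIsConstU8, UInt amount,
                                    UInt laneBits )
   {
      return amountIsConstU8 && amount < laneBits;
   }

   int main ( void )
   {
      assert( use_immediate_form(true, 13, 32));   /* e.g. Iop_ShrN32x4 by 13 */
      assert(!use_immediate_form(true, 32, 32));   /* out of range: fall back */
      assert(!use_immediate_form(false, 13, 32));  /* non-constant amount     */
      return 0;
   }

Note that the emitter's own range check (a limit of 15, 31 or 63 for 16-, 32-
and 64-bit lanes) accepts exactly the same amounts, so the fast path can never
reach the emitter's "goto bad" cases.
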