i->Ain.SseShiftN.dst = dst;
return i;
}
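+/* movq between a 64-bit GPR and the low half of an XMM register; toXMM
+   gives the direction (gpr -> xmm when True, xmm -> gpr when False). */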
+AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) {
+ AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+ i->tag = Ain_SseMOVQ;
+ i->Ain.SseMOVQ.gpr = gpr;
+ i->Ain.SseMOVQ.xmm = xmm;
+ i->Ain.SseMOVQ.toXMM = toXMM;
+ vassert(hregClass(gpr) == HRcInt64);
+ vassert(hregClass(xmm) == HRcVec128);
+ return i;
+}
//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
//uu HReg reg, AMD64AMode* addr ) {
//uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
i->Ain.SseShiftN.shiftBits);
ppHRegAMD64(i->Ain.SseShiftN.dst);
return;
+ case Ain_SseMOVQ:
+ vex_printf("movq ");
+ if (i->Ain.SseMOVQ.toXMM) {
+ ppHRegAMD64(i->Ain.SseMOVQ.gpr);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseMOVQ.xmm);
+ } else {
+ ppHRegAMD64(i->Ain.SseMOVQ.xmm);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseMOVQ.gpr);
+         }
+ return;
//uu case Ain_AvxLdSt:
//uu vex_printf("vmovups ");
//uu if (i->Ain.AvxLdSt.isLoad) {
case Ain_SseShiftN:
addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst);
return;
+ case Ain_SseMOVQ:
+ addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmRead : HRmWrite,
+ i->Ain.SseMOVQ.gpr);
+ addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmWrite : HRmRead,
+ i->Ain.SseMOVQ.xmm);
+ return;
//uu case Ain_AvxLdSt:
//uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
//uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
case Ain_SseShiftN:
mapReg(m, &i->Ain.SseShiftN.dst);
return;
+ case Ain_SseMOVQ:
+ mapReg(m, &i->Ain.SseMOVQ.gpr);
+ mapReg(m, &i->Ain.SseMOVQ.xmm);
+ return;
//uu case Ain_AvxLdSt:
//uu mapReg(m, &i->Ain.AvxLdSt.reg);
//uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
return rex & ~(1<<3);
}
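+/* Set the W bit (bit 3) of a REX prefix, so as to force a 64-bit
+   operand size. */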
+static inline UChar setWBit ( UChar rex )
+{
+ return rex | (1<<3);
+}
+
/* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
goto done;
}
+ case Ain_SseMOVQ: {
+ Bool toXMM = i->Ain.SseMOVQ.toXMM;
+ HReg gpr = i->Ain.SseMOVQ.gpr;
+ HReg xmm = i->Ain.SseMOVQ.xmm;
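+         /* Encodings: 66 REX.W 0F 6E /r is movq xmm, r64 (toXMM);
+            66 REX.W 0F 7E /r is movq r64, xmm.  Both put the XMM register
+            in the reg field and the GPR in the r/m field. */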
+ *p++ = 0x66;
+ *p++ = setWBit( rexAMode_R_enc_enc( vregEnc3210(xmm), iregEnc3210(gpr)) );
+ *p++ = 0x0F;
+ *p++ = toXMM ? 0x6E : 0x7E;
+ p = doAMode_R_enc_enc( p, vregEnc3210(xmm), iregEnc3210(gpr) );
+ goto done;
+ }
+
//uu case Ain_AvxLdSt: {
//uu UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
//uu i->Ain.AvxLdSt.addr );
return IRExpr_Binder(binder);
}
-static Bool isZeroU8 ( IRExpr* e )
+static Bool isZeroU8 ( const IRExpr* e )
{
return e->tag == Iex_Const
&& e->Iex.Const.con->tag == Ico_U8
/* Is this a 64-bit zero expression? */
-static Bool isZeroU64 ( IRExpr* e )
+static Bool isZeroU64 ( const IRExpr* e )
{
return e->tag == Iex_Const
&& e->Iex.Const.con->tag == Ico_U64
&& e->Iex.Const.con->Ico.U64 == 0ULL;
}
-static Bool isZeroU32 ( IRExpr* e )
+static Bool isZeroU32 ( const IRExpr* e )
{
return e->tag == Iex_Const
&& e->Iex.Const.con->tag == Ico_U32
&& e->Iex.Const.con->Ico.U32 == 0;
}
+/* Are both args atoms and the same?  This is a copy of eqIRAtom
+   that omits the assertions that the args are indeed atoms. */
+
+static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 )
+{
+ if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
+ return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
+ if (a1->tag == Iex_Const && a2->tag == Iex_Const)
+ return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
+ return False;
+}
+
/* Make an int reg-reg move. */
static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
}
/* V128{HI}to64 */
- case Iop_V128HIto64:
case Iop_V128to64: {
HReg dst = newVRegI(env);
- Int off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16;
- HReg rsp = hregAMD64_RSP();
HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
- AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
- AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
- addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
- 16, vec, m16_rsp));
- addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
- AMD64RMI_Mem(off_rsp), dst ));
+ addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
+ return dst;
+ }
+ case Iop_V128HIto64: {
+ HReg dst = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg vec2 = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(vec, vec2));
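+      // Shift the copy right by 64 bits, so that the original upper half
+      // lands in the lower 64 bits, where movq can extract it.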
+ addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
+ addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
return dst;
}
+ /* V256to64_{3,2,1,0} */
case Iop_V256to64_0: case Iop_V256to64_1:
case Iop_V256to64_2: case Iop_V256to64_3: {
HReg vHi, vLo, vec;
iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
/* Do the first part of the selection by deciding which of
- the 128 bit registers do look at, and second part using
+         the 128 bit registers to look at, and do the second part using
the same scheme as for V128{HI}to64 above. */
- Int off = 0;
+ Bool low64of128 = True;
switch (e->Iex.Unop.op) {
- case Iop_V256to64_0: vec = vLo; off = -16; break;
- case Iop_V256to64_1: vec = vLo; off = -8; break;
- case Iop_V256to64_2: vec = vHi; off = -16; break;
- case Iop_V256to64_3: vec = vHi; off = -8; break;
+ case Iop_V256to64_0: vec = vLo; low64of128 = True; break;
+ case Iop_V256to64_1: vec = vLo; low64of128 = False; break;
+ case Iop_V256to64_2: vec = vHi; low64of128 = True; break;
+ case Iop_V256to64_3: vec = vHi; low64of128 = False; break;
default: vassert(0);
}
- HReg dst = newVRegI(env);
- HReg rsp = hregAMD64_RSP();
- AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
- AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
- addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
- 16, vec, m16_rsp));
- addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
- AMD64RMI_Mem(off_rsp), dst ));
+ HReg dst = newVRegI(env);
+ if (low64of128) {
+ addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/));
+ } else {
+ HReg vec2 = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(vec, vec2));
+ addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2));
+ addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/));
+ }
return dst;
}
}
case Iop_64HLtoV128: {
- HReg rsp = hregAMD64_RSP();
- AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp);
- AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
- AMD64RI* qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1);
- AMD64RI* qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2);
- addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp));
- addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp));
- HReg dst = newVRegV(env);
- /* One store-forwarding stall coming up, oh well :-( */
- addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp));
+ const IRExpr* arg1 = e->Iex.Binop.arg1;
+ const IRExpr* arg2 = e->Iex.Binop.arg2;
+ HReg dst = newVRegV(env);
+ HReg tmp = newVRegV(env);
+ HReg qHi = iselIntExpr_R(env, arg1);
+ // If the args are trivially the same (tmp or const), use the same
+ // source register for both, and only one movq since those are
+ // (relatively) expensive.
+ if (areAtomsAndEqual(arg1, arg2)) {
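+            // The single movq puts the value in the low half of dst; copy
+            // it to tmp, shift dst left by 64 and OR tmp back in, so that
+            // both halves of dst hold the value.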
+ addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
+ addInstr(env, mk_vMOVsd_RR(dst, tmp));
+ addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
+ addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
+ } else {
+ HReg qLo = iselIntExpr_R(env, arg2);
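+            // Build dst = qHi:qLo by movq'ing qHi into the low half,
+            // shifting it up by 64, then ORing in qLo via tmp.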
+ addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/));
+ addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
+ addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/));
+ addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
+ }
return dst;
}
}
case Iop_V128HLtoV256: {
+ // Curiously, there doesn't seem to be any benefit to be had here by
+ // checking whether arg1 and arg2 are the same, in the style of how
+ // (eg) 64HLtoV128 is handled elsewhere in this file.
*rHi = iselVecExpr(env, e->Iex.Binop.arg1);
*rLo = iselVecExpr(env, e->Iex.Binop.arg2);
return;
if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
- HReg rsp = hregAMD64_RSP();
- HReg vHi = newVRegV(env);
- HReg vLo = newVRegV(env);
- AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp);
- AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
- /* arg1 is the most significant (Q3), arg4 the least (Q0) */
- /* Get all the args into regs, before messing with the stack. */
- AMD64RI* q3 = iselIntExpr_RI(env, e->Iex.Qop.details->arg1);
- AMD64RI* q2 = iselIntExpr_RI(env, e->Iex.Qop.details->arg2);
- AMD64RI* q1 = iselIntExpr_RI(env, e->Iex.Qop.details->arg3);
- AMD64RI* q0 = iselIntExpr_RI(env, e->Iex.Qop.details->arg4);
- /* less significant lane (Q2) at the lower address (-16(rsp)) */
- addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp));
- addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp));
- addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp));
- /* and then the lower half .. */
- addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp));
- addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp));
- addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp));
- *rHi = vHi;
- *rLo = vLo;
+ const IRExpr* arg1 = e->Iex.Qop.details->arg1;
+ const IRExpr* arg2 = e->Iex.Qop.details->arg2;
+ const IRExpr* arg3 = e->Iex.Qop.details->arg3;
+ const IRExpr* arg4 = e->Iex.Qop.details->arg4;
+ // If the args are trivially the same (tmp or const), use the same
+ // source register for all four, and only one movq since those are
+ // (relatively) expensive.
+ if (areAtomsAndEqual(arg1, arg2)
+ && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) {
+         HReg q3 = iselIntExpr_R(env, arg1);
+ HReg tmp = newVRegV(env);
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/));
+ addInstr(env, mk_vMOVsd_RR(dst, tmp));
+ addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst));
+ addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
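+            // dst now has the value in both halves, and since all four
+            // 64-bit lanes of the V256 result are equal, it can serve as
+            // both rHi and rLo.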
+ *rHi = dst;
+ *rLo = dst;
+ } else {
+ /* arg1 is the most significant (Q3), arg4 the least (Q0) */
+ HReg q3 = iselIntExpr_R(env, arg1);
+ HReg q2 = iselIntExpr_R(env, arg2);
+ HReg q1 = iselIntExpr_R(env, arg3);
+ HReg q0 = iselIntExpr_R(env, arg4);
+ HReg tmp = newVRegV(env);
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
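+            // Assemble dstHi = q3:q2 and dstLo = q1:q0, each using the
+            // same movq/shift/OR scheme as for 64HLtoV128 above.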
+ addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/));
+ addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi));
+ addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/));
+ addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi));
+ addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/));
+ addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo));
+ addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/));
+ addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo));
+ *rHi = dstHi;
+ *rLo = dstLo;
+ }
return;
}