vassert(order >= 0 && order <= 0xFF);
return i;
}
+AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op,
+ UInt shiftBits, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+ i->tag = Ain_SseShiftN;
+ i->Ain.SseShiftN.op = op;
+ i->Ain.SseShiftN.shiftBits = shiftBits;
+ i->Ain.SseShiftN.dst = dst;
+ return i;
+}
//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
//uu HReg reg, AMD64AMode* addr ) {
//uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
vex_printf(",");
ppHRegAMD64(i->Ain.SseShuf.dst);
return;
+ case Ain_SseShiftN:
+ vex_printf("%s $%u, ", showAMD64SseOp(i->Ain.SseShiftN.op),
+ i->Ain.SseShiftN.shiftBits);
+ ppHRegAMD64(i->Ain.SseShiftN.dst);
+ return;
//uu case Ain_AvxLdSt:
//uu vex_printf("vmovups ");
//uu if (i->Ain.AvxLdSt.isLoad) {
addHRegUse(u, HRmRead, i->Ain.SseShuf.src);
addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
return;
+ case Ain_SseShiftN:
+ addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst); /* read and written: shifted in place */
+ return;
//uu case Ain_AvxLdSt:
//uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
//uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
mapReg(m, &i->Ain.SseShuf.src);
mapReg(m, &i->Ain.SseShuf.dst);
return;
+ case Ain_SseShiftN:
+ mapReg(m, &i->Ain.SseShiftN.dst);
+ return;
//uu case Ain_AvxLdSt:
//uu mapReg(m, &i->Ain.AvxLdSt.reg);
//uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
*p++ = (UChar)(i->Ain.SseShuf.order);
goto done;
+ case Ain_SseShiftN: {
+ opc = 0; // invalid
+ subopc_imm = 0; // invalid
+ UInt limit = 0;
+ UInt shiftImm = i->Ain.SseShiftN.shiftBits;
+ switch (i->Ain.SseShiftN.op) {
+ case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break;
+ case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break;
+ case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break;
+ case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break;
+ case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break;
+ case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break;
+ case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break;
+ case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break;
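+ // Asse_SHL128/Asse_SHR128 become pslldq/psrldq, which shift the
+ // whole register by a *byte* count, so the bit count carried in
+ // the instruction must be a multiple of 8 and is converted here.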
+ case Asse_SHL128:
+ if ((shiftImm & 7) != 0) goto bad;
+ shiftImm >>= 3;
+ limit = 15; opc = 0x73; subopc_imm = 7;
+ break;
+ case Asse_SHR128:
+ if ((shiftImm & 7) != 0) goto bad;
+ shiftImm >>= 3;
+ limit = 15; opc = 0x73; subopc_imm = 3;
+ break;
+ default:
+ // This should never happen .. SSE2 only offers the above 10 insns
+ // for the "shift with immediate" case
+ goto bad;
+ }
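+ // All ten forms share the same encoding: 66 <REX> 0F <opc>, a ModRM
+ // byte with reg = subopc_imm and rm = dst, then the imm8.  For
+ // example, if dst has been allocated %xmm1, Asse_SAR16 by 5
+ // ("psraw $5, %xmm1") comes out as 66 40 0F 71 E1 05 (the 0x40 REX
+ // is redundant but harmless).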
+ vassert(limit > 0 && opc > 0 && subopc_imm > 0);
+ if (shiftImm > limit) goto bad;
+ *p++ = 0x66;
+ *p++ = clearWBit(
+ rexAMode_R_enc_enc( subopc_imm,
+ vregEnc3210(i->Ain.SseShiftN.dst) ));
+ *p++ = 0x0F;
+ *p++ = opc;
+ p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
+ *p++ = shiftImm;
+ goto done;
+ }
+
//uu case Ain_AvxLdSt: {
//uu UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
//uu i->Ain.AvxLdSt.addr );
Asse_MIN8U,
Asse_CMPEQ8, Asse_CMPEQ16, Asse_CMPEQ32,
Asse_CMPGT8S, Asse_CMPGT16S, Asse_CMPGT32S,
- Asse_SHL16, Asse_SHL32, Asse_SHL64,
- Asse_SHR16, Asse_SHR32, Asse_SHR64,
+ Asse_SHL16, Asse_SHL32, Asse_SHL64, Asse_SHL128,
+ Asse_SHR16, Asse_SHR32, Asse_SHR64, Asse_SHR128,
Asse_SAR16, Asse_SAR32,
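+ /* Note: there is no Asse_SAR64, since SSE2 has no 64-bit
+ arithmetic right shift. */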
Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
Ain_SseReRg, /* SSE binary general reg-reg, Re, Rg */
Ain_SseCMov, /* SSE conditional move */
Ain_SseShuf, /* SSE2 shuffle (pshufd) */
+ Ain_SseShiftN, /* SSE2 shift by immediate */
//uu Ain_AvxLdSt, /* AVX load/store 256 bits,
//uu no alignment constraints */
//uu Ain_AvxReRg, /* AVX binary general reg-reg, Re, Rg */
HReg src;
HReg dst;
} SseShuf;
+ struct {
+ AMD64SseOp op;
+ UInt shiftBits; /* shift amount, in bits, even for Asse_SHL128/SHR128 */
+ HReg dst;
+ } SseShiftN;
//uu struct {
//uu Bool isLoad;
//uu HReg reg;
extern AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp, HReg, HReg );
extern AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode, HReg src, HReg dst );
extern AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst );
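+/* Shift dst in place by an immediate bit count; e.g.
+ AMD64Instr_SseShiftN(Asse_SHR16, 3, dst) stands for "psrlw $3, dst". */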
+extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp,
+ UInt shiftBits, HReg dst );
//uu extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* );
//uu extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg );
extern AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
HWord fn = 0; /* address of helper fn, if required */
Bool arg1isEReg = False;
AMD64SseOp op = Asse_INVALID;
- IRType ty = typeOfIRExpr(env->type_env,e);
vassert(e);
+ IRType ty = typeOfIRExpr(env->type_env, e);
vassert(ty == Ity_V128);
+ UInt laneBits = 0;
if (e->tag == Iex_RdTmp) {
return lookupIRTemp(env, e->Iex.RdTmp.tmp);
return dst;
}
- case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
- case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
- case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
- case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
- case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
- case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
- case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
- case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
+ case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+ case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+ case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+ case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+ case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+ case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+ case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+ case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
do_SseShift: {
- HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg dst = newVRegV(env);
+ HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+ /* If it's a shift by an in-range immediate, copy the value and
+ shift it in place with a single shift-by-immediate instruction
+ (e.g. pslld $7 for Iop_ShlN32x4 by 7). */
+ if (e->Iex.Binop.arg2->tag == Iex_Const) {
+ IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+ vassert(c->tag == Ico_U8);
+ UInt shift = c->Ico.U8;
+ if (shift < laneBits) {
+ addInstr(env, mk_vMOVsd_RR(greg, dst));
+ addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
+ return dst;
+ }
+ }
+ /* Otherwise (a non-constant count, or one >= the lane width, which
+ the immediate form doesn't accept) do it the long-winded way. */
AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
HReg ereg = newVRegV(env);
- HReg dst = newVRegV(env);
addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
addInstr(env, AMD64Instr_Push(rmi));
addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
{
HWord fn = 0; /* address of helper fn, if required */
vassert(e);
- IRType ty = typeOfIRExpr(env->type_env,e);
+ IRType ty = typeOfIRExpr(env->type_env, e);
vassert(ty == Ity_V256);
+ UInt laneBits = 0;
AMD64SseOp op = Asse_INVALID;
return;
}
- case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
- case Iop_ShlN32x8: op = Asse_SHL32; goto do_SseShift;
- case Iop_ShlN64x4: op = Asse_SHL64; goto do_SseShift;
- case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
- case Iop_SarN32x8: op = Asse_SAR32; goto do_SseShift;
- case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
- case Iop_ShrN32x8: op = Asse_SHR32; goto do_SseShift;
- case Iop_ShrN64x4: op = Asse_SHR64; goto do_SseShift;
+ case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+ case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+ case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+ case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+ case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+ case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+ case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+ case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
do_SseShift: {
+ HReg dstHi = newVRegV(env);
+ HReg dstLo = newVRegV(env);
HReg gregHi, gregLo;
iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
+ /* If it's a shift by an in-range immediate, generate one
+ shift-by-immediate instruction for each 128-bit half. */
+ if (e->Iex.Binop.arg2->tag == Iex_Const) {
+ IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+ vassert(c->tag == Ico_U8);
+ UInt shift = c->Ico.U8;
+ if (shift < laneBits) {
+ addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
+ addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
+ addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
+ addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+ }
+ /* Otherwise we have to do it the long-winded way. */
AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
HReg ereg = newVRegV(env);
- HReg dstHi = newVRegV(env);
- HReg dstLo = newVRegV(env);
addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
addInstr(env, AMD64Instr_Push(rmi));
addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));