From: Julian Seward Date: Sat, 22 Dec 2018 17:04:42 +0000 (+0100) Subject: amd64 back end: generate better code for 2x64<-->V128 and 4x64<-->V256 transfers .. X-Git-Tag: VALGRIND_3_15_0~117 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b17d5ffdb844cf081c86d7df9489f61b4392ca47;p=thirdparty%2Fvalgrind.git amd64 back end: generate better code for 2x64<-->V128 and 4x64<-->V256 transfers .. .. by adding support for MOVQ xmm/ireg and using that to implement 64HLtoV128, 4x64toV256 and their inverses. This reduces the number of instructions, removes the use of memory as an intermediary, and avoids store-forwarding stalls. --- diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c index e3a2c7206a..8e55197444 100644 --- a/VEX/priv/host_amd64_defs.c +++ b/VEX/priv/host_amd64_defs.c @@ -1020,6 +1020,16 @@ AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op, i->Ain.SseShiftN.dst = dst; return i; } +AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) { + AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr)); + i->tag = Ain_SseMOVQ; + i->Ain.SseMOVQ.gpr = gpr; + i->Ain.SseMOVQ.xmm = xmm; + i->Ain.SseMOVQ.toXMM = toXMM; + vassert(hregClass(gpr) == HRcInt64); + vassert(hregClass(xmm) == HRcVec128); + return i; +} //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, //uu HReg reg, AMD64AMode* addr ) { //uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr)); @@ -1377,6 +1387,18 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 ) i->Ain.SseShiftN.shiftBits); ppHRegAMD64(i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + vex_printf("movq "); + if (i->Ain.SseMOVQ.toXMM) { + ppHRegAMD64(i->Ain.SseMOVQ.gpr); + vex_printf(","); + ppHRegAMD64(i->Ain.SseMOVQ.xmm); + } else { + ppHRegAMD64(i->Ain.SseMOVQ.xmm); + vex_printf(","); + ppHRegAMD64(i->Ain.SseMOVQ.gpr); + }; + return; //uu case Ain_AvxLdSt: //uu vex_printf("vmovups "); //uu if (i->Ain.AvxLdSt.isLoad) { @@ -1714,6 +1736,12 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 ) case Ain_SseShiftN: addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmRead : HRmWrite, + i->Ain.SseMOVQ.gpr); + addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmWrite : HRmRead, + i->Ain.SseMOVQ.xmm); + return; //uu case Ain_AvxLdSt: //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr); //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead, @@ -1932,6 +1960,10 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 ) case Ain_SseShiftN: mapReg(m, &i->Ain.SseShiftN.dst); return; + case Ain_SseMOVQ: + mapReg(m, &i->Ain.SseMOVQ.gpr); + mapReg(m, &i->Ain.SseMOVQ.xmm); + return; //uu case Ain_AvxLdSt: //uu mapReg(m, &i->Ain.AvxLdSt.reg); //uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr); @@ -2301,6 +2333,11 @@ static inline UChar clearWBit ( UChar rex ) return rex & ~(1<<3); } +static inline UChar setWBit ( UChar rex ) +{ + return rex | (1<<3); +} + /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */ inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am ) @@ -3914,6 +3951,18 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, goto done; } + case Ain_SseMOVQ: { + Bool toXMM = i->Ain.SseMOVQ.toXMM; + HReg gpr = i->Ain.SseMOVQ.gpr; + HReg xmm = i->Ain.SseMOVQ.xmm; + *p++ = 0x66; + *p++ = setWBit( rexAMode_R_enc_enc( vregEnc3210(xmm), iregEnc3210(gpr)) ); + *p++ = 0x0F; + *p++ = toXMM ? 
0x6E : 0x7E; + p = doAMode_R_enc_enc( p, vregEnc3210(xmm), iregEnc3210(gpr) ); + goto done; + } + //uu case Ain_AvxLdSt: { //uu UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg), //uu i->Ain.AvxLdSt.addr ); diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h index c45229feb3..64bd810247 100644 --- a/VEX/priv/host_amd64_defs.h +++ b/VEX/priv/host_amd64_defs.h @@ -404,6 +404,7 @@ typedef Ain_SseCMov, /* SSE conditional move */ Ain_SseShuf, /* SSE2 shuffle (pshufd) */ Ain_SseShiftN, /* SSE2 shift by immediate */ + Ain_SseMOVQ, /* SSE2 moves of xmm[63:0] to/from GPR */ //uu Ain_AvxLdSt, /* AVX load/store 256 bits, //uu no alignment constraints */ //uu Ain_AvxReRg, /* AVX binary general reg-reg, Re, Rg */ @@ -704,6 +705,11 @@ typedef UInt shiftBits; HReg dst; } SseShiftN; + struct { + HReg gpr; + HReg xmm; + Bool toXMM; // when moving to xmm, xmm[127:64] is zeroed out + } SseMOVQ; //uu struct { //uu Bool isLoad; //uu HReg reg; @@ -784,6 +790,7 @@ extern AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode, HReg src, HReg dst ); extern AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ); extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp, UInt shiftBits, HReg dst ); +extern AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ); //uu extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* ); //uu extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg ); extern AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter, diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index 486901cb45..e67edc5bd9 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -91,7 +91,7 @@ static IRExpr* bind ( Int binder ) return IRExpr_Binder(binder); } -static Bool isZeroU8 ( IRExpr* e ) +static Bool isZeroU8 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U8 @@ -291,20 +291,32 @@ static Bool fitsIn32Bits ( ULong x ) /* Is this a 64-bit zero expression? */ -static Bool isZeroU64 ( IRExpr* e ) +static Bool isZeroU64 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U64 && e->Iex.Const.con->Ico.U64 == 0ULL; } -static Bool isZeroU32 ( IRExpr* e ) +static Bool isZeroU32 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U32 && e->Iex.Const.con->Ico.U32 == 0; } +/* Are both args atoms and the same? This is copy of eqIRAtom + that omits the assertions that the args are indeed atoms. */ + +static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 ) +{ + if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp) + return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp); + if (a1->tag == Iex_Const && a2->tag == Iex_Const) + return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con); + return False; +} + /* Make a int reg-reg move. */ static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst ) @@ -1609,44 +1621,47 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ) } /* V128{HI}to64 */ - case Iop_V128HIto64: case Iop_V128to64: { HReg dst = newVRegI(env); - Int off = e->Iex.Unop.op==Iop_V128HIto64 ? 
-8 : -16; - HReg rsp = hregAMD64_RSP(); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp); - addInstr(env, AMD64Instr_SseLdSt(False/*store*/, - 16, vec, m16_rsp)); - addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, - AMD64RMI_Mem(off_rsp), dst )); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/)); + return dst; + } + case Iop_V128HIto64: { + HReg dst = newVRegI(env); + HReg vec = iselVecExpr(env, e->Iex.Unop.arg); + HReg vec2 = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(vec, vec2)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2)); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/)); return dst; } + /* V256to64_{3,2,1,0} */ case Iop_V256to64_0: case Iop_V256to64_1: case Iop_V256to64_2: case Iop_V256to64_3: { HReg vHi, vLo, vec; iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg); /* Do the first part of the selection by deciding which of - the 128 bit registers do look at, and second part using + the 128 bit registers to look at, and second part using the same scheme as for V128{HI}to64 above. */ - Int off = 0; + Bool low64of128 = True; switch (e->Iex.Unop.op) { - case Iop_V256to64_0: vec = vLo; off = -16; break; - case Iop_V256to64_1: vec = vLo; off = -8; break; - case Iop_V256to64_2: vec = vHi; off = -16; break; - case Iop_V256to64_3: vec = vHi; off = -8; break; + case Iop_V256to64_0: vec = vLo; low64of128 = True; break; + case Iop_V256to64_1: vec = vLo; low64of128 = False; break; + case Iop_V256to64_2: vec = vHi; low64of128 = True; break; + case Iop_V256to64_3: vec = vHi; low64of128 = False; break; default: vassert(0); } - HReg dst = newVRegI(env); - HReg rsp = hregAMD64_RSP(); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp); - addInstr(env, AMD64Instr_SseLdSt(False/*store*/, - 16, vec, m16_rsp)); - addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, - AMD64RMI_Mem(off_rsp), dst )); + HReg dst = newVRegI(env); + if (low64of128) { + addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/)); + } else { + HReg vec2 = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(vec, vec2)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2)); + addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/)); + } return dst; } @@ -3355,16 +3370,26 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) } case Iop_64HLtoV128: { - HReg rsp = hregAMD64_RSP(); - AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - AMD64RI* qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1); - AMD64RI* qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp)); - HReg dst = newVRegV(env); - /* One store-forwarding stall coming up, oh well :-( */ - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp)); + const IRExpr* arg1 = e->Iex.Binop.arg1; + const IRExpr* arg2 = e->Iex.Binop.arg2; + HReg dst = newVRegV(env); + HReg tmp = newVRegV(env); + HReg qHi = iselIntExpr_R(env, arg1); + // If the args are trivially the same (tmp or const), use the same + // source register for both, and only one movq since those are + // (relatively) expensive. 
+ if (areAtomsAndEqual(arg1, arg2)) { + addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/)); + addInstr(env, mk_vMOVsd_RR(dst, tmp)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + } else { + HReg qLo = iselIntExpr_R(env, arg2); + addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + } return dst; } @@ -4071,6 +4096,9 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, } case Iop_V128HLtoV256: { + // Curiously, there doesn't seem to be any benefit to be had here by + // checking whether arg1 and arg2 are the same, in the style of how + // (eg) 64HLtoV128 is handled elsewhere in this file. *rHi = iselVecExpr(env, e->Iex.Binop.arg1); *rLo = iselVecExpr(env, e->Iex.Binop.arg2); return; @@ -4313,27 +4341,44 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) { - HReg rsp = hregAMD64_RSP(); - HReg vHi = newVRegV(env); - HReg vLo = newVRegV(env); - AMD64AMode* m8_rsp = AMD64AMode_IR(-8, rsp); - AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); - /* arg1 is the most significant (Q3), arg4 the least (Q0) */ - /* Get all the args into regs, before messing with the stack. */ - AMD64RI* q3 = iselIntExpr_RI(env, e->Iex.Qop.details->arg1); - AMD64RI* q2 = iselIntExpr_RI(env, e->Iex.Qop.details->arg2); - AMD64RI* q1 = iselIntExpr_RI(env, e->Iex.Qop.details->arg3); - AMD64RI* q0 = iselIntExpr_RI(env, e->Iex.Qop.details->arg4); - /* less significant lane (Q2) at the lower address (-16(rsp)) */ - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp)); - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp)); - /* and then the lower half .. */ - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp)); - addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp)); - addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp)); - *rHi = vHi; - *rLo = vLo; + const IRExpr* arg1 = e->Iex.Qop.details->arg1; + const IRExpr* arg2 = e->Iex.Qop.details->arg2; + const IRExpr* arg3 = e->Iex.Qop.details->arg3; + const IRExpr* arg4 = e->Iex.Qop.details->arg4; + // If the args are trivially the same (tmp or const), use the same + // source register for all four, and only one movq since those are + // (relatively) expensive. 
+ if (areAtomsAndEqual(arg1, arg2) + && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) { + HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1); + HReg tmp = newVRegV(env); + HReg dst = newVRegV(env); + addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/)); + addInstr(env, mk_vMOVsd_RR(dst, tmp)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); + *rHi = dst; + *rLo = dst; + } else { + /* arg1 is the most significant (Q3), arg4 the least (Q0) */ + HReg q3 = iselIntExpr_R(env, arg1); + HReg q2 = iselIntExpr_R(env, arg2); + HReg q1 = iselIntExpr_R(env, arg3); + HReg q0 = iselIntExpr_R(env, arg4); + HReg tmp = newVRegV(env); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi)); + addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi)); + addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo)); + addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/)); + addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo)); + *rHi = dstHi; + *rLo = dstLo; + } return; }
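
[Editor's sketch, not part of the patch.] The Ain_SseMOVQ emit case above boils down to the fixed encoding 66 REX.W 0F 6E/7E with the xmm register in the ModRM reg field and the GPR in the r/m field. The standalone C sketch below reproduces that byte layout for illustration; the helper names (rex_rr, modrm_rr, emit_movq_gpr_xmm) are invented here and are not VEX functions.

   #include <stdio.h>

   typedef unsigned char UChar;

   /* REX prefix for a reg-reg ModRM, with W=1 (64-bit operand size).
      Bit 2 (R) extends the reg-field operand, bit 0 (B) the r/m operand. */
   static UChar rex_rr ( unsigned gregEnc, unsigned eregEnc )
   {
      return 0x48 | (((gregEnc >> 3) & 1) << 2) | ((eregEnc >> 3) & 1);
   }

   /* ModRM byte with mod=11 (register-direct). */
   static UChar modrm_rr ( unsigned gregEnc, unsigned eregEnc )
   {
      return 0xC0 | ((gregEnc & 7) << 3) | (eregEnc & 7);
   }

   /* MOVQ between a 64-bit GPR and xmm[63:0], mirroring Ain_SseMOVQ:
      opcode 0x6E moves gpr -> xmm (zeroing xmm[127:64]),
      opcode 0x7E moves xmm[63:0] -> gpr.  Returns the number of bytes. */
   static int emit_movq_gpr_xmm ( UChar* p, unsigned xmmEnc, unsigned gprEnc,
                                  int toXMM )
   {
      int n = 0;
      p[n++] = 0x66;                     /* operand-size prefix */
      p[n++] = rex_rr(xmmEnc, gprEnc);   /* xmm in reg field, gpr in r/m */
      p[n++] = 0x0F;
      p[n++] = toXMM ? 0x6E : 0x7E;
      p[n++] = modrm_rr(xmmEnc, gprEnc);
      return n;
   }

   int main ( void )
   {
      UChar buf[16];
      /* movq %rdi, %xmm0 : expect 66 48 0f 6e c7 */
      int n = emit_movq_gpr_xmm(buf, 0/*xmm0*/, 7/*rdi*/, 1/*toXMM*/);
      for (int k = 0; k < n; k++) printf("%02x ", buf[k]);
      printf("\n");
      return 0;
   }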
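
[Editor's sketch, not part of the patch.] For Iop_64HLtoV128 with distinct arguments, the new selection emits movq, a whole-XMM left shift by 64 bits, movq, por, instead of two 64-bit stores to the stack followed by a 128-bit reload, which is where the store-forwarding stall came from. Modelling the XMM register as unsigned __int128 (a GCC/Clang extension), the sequence computes the following; mk_64HLtoV128 is a made-up name used only for this illustration.

   #include <stdio.h>

   typedef unsigned long long ULong;
   typedef unsigned __int128  V128;

   static V128 mk_64HLtoV128 ( ULong qHi, ULong qLo )
   {
      V128 dst = qHi;   /* SseMOVQ: dst[63:0] = qHi, dst[127:64] = 0 */
      dst <<= 64;       /* SseShiftN Asse_SHL128 by 64               */
      V128 tmp = qLo;   /* SseMOVQ: tmp[63:0] = qLo, tmp[127:64] = 0 */
      dst |= tmp;       /* SseReRg Asse_OR                           */
      return dst;       /* = qHi:qLo                                 */
   }

   int main ( void )
   {
      V128 v = mk_64HLtoV128(0x1122334455667788ULL, 0x99aabbccddeeff00ULL);
      printf("%016llx%016llx\n", (ULong)(v >> 64), (ULong)v);
      return 0;
   }

When both arguments are the same atom, the patch instead issues a single movq, copies the register, shifts one copy left by 64 and ORs, saving one GPR-to-XMM transfer; the Iop_64x4toV256 case applies the same scheme twice to build the two 128-bit halves.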