From: Julian Seward
Date: Sat, 22 Dec 2018 12:34:11 +0000 (+0100)
Subject: amd64 back end: generate better code for 128/256 bit vector shifts by immediate....
X-Git-Tag: VALGRIND_3_15_0~119
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=901f3d3813c551b18a34ca5a52e3d9393524544c;p=thirdparty%2Fvalgrind.git

amd64 back end: generate better code for 128/256 bit vector shifts by immediate.

n-i-bz.
---

diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
index 48ca268ab0..1536d81be9 100644
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -1007,6 +1007,15 @@ AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
    vassert(order >= 0 && order <= 0xFF);
    return i;
 }
+AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op,
+                                   UInt shiftBits, HReg dst ) {
+   AMD64Instr* i              = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag                     = Ain_SseShiftN;
+   i->Ain.SseShiftN.op        = op;
+   i->Ain.SseShiftN.shiftBits = shiftBits;
+   i->Ain.SseShiftN.dst       = dst;
+   return i;
+}
 //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
 //uu                                  HReg reg, AMD64AMode* addr ) {
 //uu    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
@@ -1359,6 +1368,11 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
          vex_printf(",");
          ppHRegAMD64(i->Ain.SseShuf.dst);
          return;
+      case Ain_SseShiftN:
+         vex_printf("%s $%u, ", showAMD64SseOp(i->Ain.SseShiftN.op),
+                    i->Ain.SseShiftN.shiftBits);
+         ppHRegAMD64(i->Ain.SseShiftN.dst);
+         return;
       //uu case Ain_AvxLdSt:
       //uu    vex_printf("vmovups ");
       //uu    if (i->Ain.AvxLdSt.isLoad) {
@@ -1691,6 +1705,9 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
          addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
         addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
         return;
+      case Ain_SseShiftN:
+         addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst);
+         return;
       //uu case Ain_AvxLdSt:
       //uu    addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
       //uu    addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
@@ -1906,6 +1923,9 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
          mapReg(m, &i->Ain.SseShuf.src);
          mapReg(m, &i->Ain.SseShuf.dst);
          return;
+      case Ain_SseShiftN:
+         mapReg(m, &i->Ain.SseShiftN.dst);
+         return;
       //uu case Ain_AvxLdSt:
       //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
       //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
@@ -3840,6 +3860,48 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
       *p++ = (UChar)(i->Ain.SseShuf.order);
       goto done;
 
+   case Ain_SseShiftN: {
+      opc = 0; // invalid
+      subopc_imm = 0; // invalid
+      UInt limit = 0;
+      UInt shiftImm = i->Ain.SseShiftN.shiftBits;
+      switch (i->Ain.SseShiftN.op) {
+         case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break;
+         case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break;
+         case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break;
+         case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break;
+         case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break;
+         case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break;
+         case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break;
+         case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break;
+         case Asse_SHL128:
+            if ((shiftImm & 7) != 0) goto bad;
+            shiftImm >>= 3;
+            limit = 15; opc = 0x73; subopc_imm = 7;
+            break;
+         case Asse_SHR128:
+            if ((shiftImm & 7) != 0) goto bad;
+            shiftImm >>= 3;
+            limit = 15; opc = 0x73; subopc_imm = 3;
+            break;
+         default:
+            // This should never happen .. SSE2 only offers the above 10 insns
+            // for the "shift with immediate" case
+            goto bad;
+      }
+      vassert(limit > 0 && opc > 0 && subopc_imm > 0);
+      if (shiftImm > limit) goto bad;
+      *p++ = 0x66;
+      *p++ = clearWBit(
+             rexAMode_R_enc_enc( subopc_imm,
+                                 vregEnc3210(i->Ain.SseShiftN.dst) ));
+      *p++ = 0x0F;
+      *p++ = opc;
+      p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
+      *p++ = shiftImm;
+      goto done;
+   }
+
   //uu case Ain_AvxLdSt: {
   //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
   //uu                           i->Ain.AvxLdSt.addr );
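
For reference, the ten immediate forms handled above share one byte layout: a 0x66
operand-size prefix, the computed REX byte with its W bit cleared, 0x0F, an opcode
byte of 0x71, 0x72 or 0x73 chosen by lane width, a ModRM byte whose reg field
carries the /2, /3, /4, /6 or /7 sub-opcode and whose r/m field names the xmm
register, and finally the 8-bit immediate (for Asse_SHL128/Asse_SHR128 the bit
count is first converted to a byte count). The standalone sketch below is
illustrative only: the helper name is made up, and it emits a REX byte only when
an extended register forces one, whereas the case above always emits the computed
REX.

   #include <assert.h>
   #include <stddef.h>

   typedef unsigned char UChar;
   typedef unsigned int  UInt;

   /* Illustrative: assemble an SSE2 shift-by-immediate on xmm<xmmReg> using
      the same scheme as the Ain_SseShiftN case:
      66 [REX] 0F <opc> ModRM(mod=11, reg=subopc, rm=xmmReg) imm8.
      Returns the number of bytes written. */
   static size_t emit_sse_shift_imm ( UChar* p, UInt opc, UInt subopc,
                                      UInt xmmReg, UInt imm8 )
   {
      size_t n = 0;
      assert(xmmReg <= 15 && imm8 <= 255);
      p[n++] = 0x66;
      if (xmmReg >= 8)
         p[n++] = 0x41;                  /* REX.B, needed for xmm8..xmm15 */
      p[n++] = 0x0F;
      p[n++] = (UChar)opc;
      p[n++] = (UChar)(0xC0 | (subopc << 3) | (xmmReg & 7));  /* mod=11 */
      p[n++] = (UChar)imm8;
      return n;
   }

   int main ( void )
   {
      UChar buf[8];
      /* psrld $13, %xmm2  ->  66 0F 72 D2 0D  (opc 0x72, sub-opcode /2) */
      size_t n = emit_sse_shift_imm(buf, 0x72, 2, 2, 13);
      assert(n == 5 && buf[0] == 0x66 && buf[1] == 0x0F && buf[2] == 0x72
             && buf[3] == 0xD2 && buf[4] == 0x0D);
      return 0;
   }

Compiling and running this is a quick way to sanity-check the opcode and
sub-opcode table above against an assembler's output.
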
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
index 6a72943f95..e1715a0b46 100644
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -334,8 +334,8 @@ typedef
       Asse_MIN8U,
       Asse_CMPEQ8, Asse_CMPEQ16, Asse_CMPEQ32,
       Asse_CMPGT8S, Asse_CMPGT16S, Asse_CMPGT32S,
-      Asse_SHL16, Asse_SHL32, Asse_SHL64,
-      Asse_SHR16, Asse_SHR32, Asse_SHR64,
+      Asse_SHL16, Asse_SHL32, Asse_SHL64, Asse_SHL128,
+      Asse_SHR16, Asse_SHR32, Asse_SHR64, Asse_SHR128,
       Asse_SAR16, Asse_SAR32,
       Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
       Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
@@ -400,6 +400,7 @@ typedef
       Ain_SseReRg,     /* SSE binary general reg-reg, Re, Rg */
       Ain_SseCMov,     /* SSE conditional move */
       Ain_SseShuf,     /* SSE2 shuffle (pshufd) */
+      Ain_SseShiftN,   /* SSE2 shift by immediate */
       //uu Ain_AvxLdSt,     /* AVX load/store 256 bits,
       //uu                     no alignment constraints */
       //uu Ain_AvxReRg,     /* AVX binary general reg-reg, Re, Rg */
@@ -695,6 +696,11 @@ typedef
             HReg   src;
             HReg   dst;
          } SseShuf;
+         struct {
+            AMD64SseOp op;
+            UInt       shiftBits;
+            HReg       dst;
+         } SseShiftN;
          //uu struct {
          //uu    Bool    isLoad;
         //uu    HReg    reg;
@@ -773,6 +779,8 @@ extern AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_SseReRg   ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_SseCMov   ( AMD64CondCode, HReg src, HReg dst );
 extern AMD64Instr* AMD64Instr_SseShuf   ( Int order, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp,
+                                          UInt shiftBits, HReg dst );
 //uu extern AMD64Instr* AMD64Instr_AvxLdSt  ( Bool isLoad, HReg, AMD64AMode* );
 //uu extern AMD64Instr* AMD64Instr_AvxReRg  ( AMD64SseOp, HReg, HReg );
 extern AMD64Instr* AMD64Instr_EvCheck   ( AMD64AMode* amCounter,
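
Since the new instruction's destination is both read and written (HRmModify in
the register-usage change above), the expected pattern is copy-then-shift, which
is exactly what the instruction-selection changes further below do. A
hypothetical fragment, assuming the usual VEX isel context (ISelEnv* env,
addInstr, newVRegV, mk_vMOVsd_RR as used in host_amd64_isel.c) and an existing
V128 vreg named src; src and dst are illustrative names, not part of the patch:

   HReg dst = newVRegV(env);
   addInstr(env, mk_vMOVsd_RR(src, dst));                     /* dst := src */
   addInstr(env, AMD64Instr_SseShiftN(Asse_SHL32, 5, dst));   /* each 32-bit
                                                                 lane left by 5 */
   /* Asse_SHL128/Asse_SHR128 shift the whole register; shiftBits is still
      given in bits and must be a multiple of 8, which the emitter converts
      to a pslldq/psrldq byte count. */
   addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dst)); /* right by 8 bytes */
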
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 7974c80364..59fd75240a 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3135,9 +3135,10 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
    HWord      fn = 0; /* address of helper fn, if required */
    Bool       arg1isEReg = False;
    AMD64SseOp op = Asse_INVALID;
-   IRType     ty = typeOfIRExpr(env->type_env,e);
    vassert(e);
+   IRType     ty = typeOfIRExpr(env->type_env, e);
    vassert(ty == Ity_V128);
+   UInt       laneBits = 0;
 
    if (e->tag == Iex_RdTmp) {
       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
@@ -3521,20 +3522,33 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
          return dst;
       }
 
-      case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
-      case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
-      case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
-      case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
-      case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
-      case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
-      case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
-      case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
+      case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+      case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+      case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+      case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+      case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+      case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+      case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+      case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
       do_SseShift: {
-         HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
+         HReg dst  = newVRegV(env);
+         HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+         /* If it's a shift by an in-range immediate, generate a single
+            instruction. */
+         if (e->Iex.Binop.arg2->tag == Iex_Const) {
+            IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+            vassert(c->tag == Ico_U8);
+            UInt shift = c->Ico.U8;
+            if (shift < laneBits) {
+               addInstr(env, mk_vMOVsd_RR(greg, dst));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
+               return dst;
+            }
+         }
+         /* Otherwise we have to do it the longwinded way. */
          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
          HReg        ereg = newVRegV(env);
-         HReg        dst  = newVRegV(env);
          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
         addInstr(env, AMD64Instr_Push(rmi));
         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
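
In the V256 changes below, a 256-bit value is carried as two 128-bit halves
(the rHi/rLo pair of iselDVecExpr_wrk), so a lane-wise shift by an immediate
becomes one copy plus one SseShiftN per half. A small standalone C model, not
VEX code, of why that per-half treatment is sound for lane shifts:

   #include <assert.h>
   #include <stdint.h>

   /* Model a V256 of 32-bit lanes both as one 8-lane vector and as two
      4-lane halves; a per-lane shift distributes over the halves. */
   static void shl32x4 ( uint32_t lane[4], unsigned n )
   {
      for (int i = 0; i < 4; i++) lane[i] <<= n;
   }

   int main ( void )
   {
      uint32_t v[8]  = {1,2,3,4,5,6,7,8};             /* whole V256         */
      uint32_t lo[4] = {1,2,3,4}, hi[4] = {5,6,7,8};  /* same value, split  */
      for (int i = 0; i < 8; i++) v[i] <<= 5;         /* Iop_ShlN32x8 by 5  */
      shl32x4(lo, 5);                                 /* per-half shifts    */
      shl32x4(hi, 5);
      for (int i = 0; i < 4; i++) {
         assert(v[i]     == lo[i]);
         assert(v[4 + i] == hi[i]);
      }
      return 0;
   }
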
@@ -3762,8 +3776,9 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
 {
    HWord fn = 0; /* address of helper fn, if required */
    vassert(e);
-   IRType ty = typeOfIRExpr(env->type_env,e);
+   IRType ty = typeOfIRExpr(env->type_env, e);
    vassert(ty == Ity_V256);
+   UInt laneBits = 0;
 
    AMD64SseOp op = Asse_INVALID;
@@ -3997,22 +4012,39 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
          return;
       }
 
-      case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
-      case Iop_ShlN32x8:  op = Asse_SHL32; goto do_SseShift;
-      case Iop_ShlN64x4:  op = Asse_SHL64; goto do_SseShift;
-      case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
-      case Iop_SarN32x8:  op = Asse_SAR32; goto do_SseShift;
-      case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
-      case Iop_ShrN32x8:  op = Asse_SHR32; goto do_SseShift;
-      case Iop_ShrN64x4:  op = Asse_SHR64; goto do_SseShift;
+      case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+      case Iop_ShlN32x8:  laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+      case Iop_ShlN64x4:  laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+      case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+      case Iop_SarN32x8:  laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+      case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+      case Iop_ShrN32x8:  laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+      case Iop_ShrN64x4:  laneBits = 64; op = Asse_SHR64; goto do_SseShift;
       do_SseShift: {
+         HReg dstHi = newVRegV(env);
+         HReg dstLo = newVRegV(env);
          HReg gregHi, gregLo;
          iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
+         /* If it's a shift by an in-range immediate, generate two single
+            instructions. */
+         if (e->Iex.Binop.arg2->tag == Iex_Const) {
+            IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+            vassert(c->tag == Ico_U8);
+            UInt shift = c->Ico.U8;
+            if (shift < laneBits) {
+               addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
+               addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
+               *rHi = dstHi;
+               *rLo = dstLo;
+               return;
+            }
+         }
+         /* Otherwise we have to do it the longwinded way. */
          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
          HReg        ereg = newVRegV(env);
-         HReg        dstHi = newVRegV(env);
-         HReg        dstLo = newVRegV(env);
          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
         addInstr(env, AMD64Instr_Push(rmi));
         addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
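
Both instruction selectors use the same guard: the single-instruction immediate
form is chosen exactly when the shift amount is a known Ico_U8 constant smaller
than the lane width; everything else falls back to the original sequence that
pushes the amount and reloads it into an xmm register. A standalone restatement
of that rule (plain C; use_immediate_form is an illustrative name, not a VEX
function):

   #include <assert.h>
   #include <stdbool.h>

   typedef unsigned int UInt;

   /* Illustrative model of the new fast-path test in do_SseShift. */
   static bool use_immediate_form ( bool amountIsConstU8, UInt amount,
                                    UInt laneBits )
   {
      return amountIsConstU8 && amount < laneBits;
   }

   int main ( void )
   {
      assert( use_immediate_form(true, 13, 32));   /* e.g. Iop_ShrN32x4 by 13 */
      assert(!use_immediate_form(true, 32, 32));   /* out of range: fall back */
      assert(!use_immediate_form(false, 13, 32));  /* non-constant amount     */
      return 0;
   }

Note that the emitter's own range check (a limit of 15, 31 or 63 for 16-, 32-
and 64-bit lanes) accepts exactly the same amounts, so the fast path can never
reach the emitter's "goto bad" cases.
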