amd64 back end: generate better code for 128/256 bit vector shifts by immediate....

author Julian Seward <jseward@acm.org>

Sat, 22 Dec 2018 12:34:11 +0000 (13:34 +0100)

committer Julian Seward <jseward@acm.org>

Sat, 22 Dec 2018 12:34:11 +0000 (13:34 +0100)
author Julian Seward <jseward@acm.org>
Sat, 22 Dec 2018 12:34:11 +0000 (13:34 +0100)
committer Julian Seward <jseward@acm.org>
Sat, 22 Dec 2018 12:34:11 +0000 (13:34 +0100)
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c

index 48ca268ab0b8d93b766e6b70beee4d4d0f715929..1536d81be9da51d645a1cd8348c4acd5fce3823d 100644 (file)
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -1007,6 +1007,15 @@ AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
     vassert(order >= 0 && order <= 0xFF);
     return i;
  }
+AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op, /* build an Ain_SseShiftN */
+                                   UInt shiftBits, HReg dst ) {
+   AMD64Instr* i              = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag                     = Ain_SseShiftN;
+   i->Ain.SseShiftN.op        = op;        /* emitter takes SHL/SHR 16,32,64,128 and SAR 16,32 only */
+   i->Ain.SseShiftN.shiftBits = shiftBits; /* in bits, not range-checked here; emitter rejects out-of-range values, and non-multiples of 8 for the 128-bit ops */
+   i->Ain.SseShiftN.dst       = dst;       /* shifted in place: reg usage marks it HRmModify */
+   return i;
+}
  //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
  //uu                                  HReg reg, AMD64AMode* addr ) {
  //uu    AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
@@ -1359,6 +1368,11 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
           vex_printf(",");
           ppHRegAMD64(i->Ain.SseShuf.dst);
           return;
+      case Ain_SseShiftN:
+         vex_printf("%s $%u, ", showAMD64SseOp(i->Ain.SseShiftN.op),
+                                i->Ain.SseShiftN.shiftBits);
+         ppHRegAMD64(i->Ain.SseShiftN.dst);
+         return;
        //uu case Ain_AvxLdSt:
        //uu    vex_printf("vmovups ");
        //uu    if (i->Ain.AvxLdSt.isLoad) {
@@ -1691,6 +1705,9 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
           addHRegUse(u, HRmRead,  i->Ain.SseShuf.src);
           addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
           return;
+      case Ain_SseShiftN:
+         addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst);
+         return;
        //uu case Ain_AvxLdSt:
        //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
        //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
@@ -1906,6 +1923,9 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
           mapReg(m, &i->Ain.SseShuf.src);
           mapReg(m, &i->Ain.SseShuf.dst);
           return;
+      case Ain_SseShiftN:
+         mapReg(m, &i->Ain.SseShiftN.dst);
+         return;
        //uu case Ain_AvxLdSt:
        //uu    mapReg(m, &i->Ain.AvxLdSt.reg);
        //uu    mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
@@ -3840,6 +3860,48 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
        *p++ = (UChar)(i->Ain.SseShuf.order);
        goto done;
  
+   case Ain_SseShiftN: {
+      opc         = 0; // invalid
+      subopc_imm  = 0; // invalid
+      UInt limit  = 0;
+      UInt shiftImm = i->Ain.SseShiftN.shiftBits;
+      switch (i->Ain.SseShiftN.op) {
+         case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break;
+         case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break;
+         case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break;
+         case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break;
+         case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break;
+         case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break;
+         case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break;
+         case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break;
+         case Asse_SHL128:
+            if ((shiftImm & 7) != 0) goto bad;
+            shiftImm >>= 3;
+            limit = 15; opc = 0x73; subopc_imm = 7;
+            break;
+         case Asse_SHR128:
+            if ((shiftImm & 7) != 0) goto bad;
+            shiftImm >>= 3;
+            limit = 15; opc = 0x73; subopc_imm = 3;
+            break;
+         default:
+            // This should never happen .. SSE2 only offers the above 10 insns
+            // for the "shift with immediate" case
+            goto bad;
+      }
+      vassert(limit > 0 && opc > 0 && subopc_imm > 0);
+      if (shiftImm > limit) goto bad;
+      *p++ = 0x66;
+      *p++ = clearWBit(
+             rexAMode_R_enc_enc( subopc_imm,
+                                 vregEnc3210(i->Ain.SseShiftN.dst) ));
+      *p++ = 0x0F;
+      *p++ = opc;
+      p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
+      *p++ = shiftImm;
+      goto done;
+   }
+
     //uu case Ain_AvxLdSt: {
     //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
     //uu                           i->Ain.AvxLdSt.addr );
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h

index 6a72943f95b9034ad0093a44d3dfba7ea7e17a4e..e1715a0b4677f19ceaf3331e9835bc8254139693 100644 (file)
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -334,8 +334,8 @@ typedef
        Asse_MIN8U,
        Asse_CMPEQ8, Asse_CMPEQ16, Asse_CMPEQ32,
        Asse_CMPGT8S, Asse_CMPGT16S, Asse_CMPGT32S,
-      Asse_SHL16, Asse_SHL32, Asse_SHL64,
-      Asse_SHR16, Asse_SHR32, Asse_SHR64,
+      Asse_SHL16, Asse_SHL32, Asse_SHL64, Asse_SHL128,
+      Asse_SHR16, Asse_SHR32, Asse_SHR64, Asse_SHR128,
        Asse_SAR16, Asse_SAR32, 
        Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
        Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
@@ -400,6 +400,7 @@ typedef
        Ain_SseReRg,     /* SSE binary general reg-reg, Re, Rg */
        Ain_SseCMov,     /* SSE conditional move */
        Ain_SseShuf,     /* SSE2 shuffle (pshufd) */
+      Ain_SseShiftN,   /* SSE2 shift by immediate */
        //uu Ain_AvxLdSt,     /* AVX load/store 256 bits,
        //uu                     no alignment constraints */
        //uu Ain_AvxReRg,     /* AVX binary general reg-reg, Re, Rg */
@@ -695,6 +696,11 @@ typedef
              HReg   src;
              HReg   dst;
           } SseShuf;
+         struct {
+            AMD64SseOp op;
+            UInt       shiftBits;
+            HReg       dst;
+         } SseShiftN;
           //uu struct {
           //uu    Bool        isLoad;
           //uu    HReg        reg;
@@ -773,6 +779,8 @@ extern AMD64Instr* AMD64Instr_Sse64FLo   ( AMD64SseOp, HReg, HReg );
  extern AMD64Instr* AMD64Instr_SseReRg    ( AMD64SseOp, HReg, HReg );
  extern AMD64Instr* AMD64Instr_SseCMov    ( AMD64CondCode, HReg src, HReg dst );
  extern AMD64Instr* AMD64Instr_SseShuf    ( Int order, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_SseShiftN  ( AMD64SseOp,
+                                           UInt shiftBits, HReg dst );
  //uu extern AMD64Instr* AMD64Instr_AvxLdSt    ( Bool isLoad, HReg, AMD64AMode* );
  //uu extern AMD64Instr* AMD64Instr_AvxReRg    ( AMD64SseOp, HReg, HReg );
  extern AMD64Instr* AMD64Instr_EvCheck    ( AMD64AMode* amCounter,
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c

index 7974c80364b681e0c2a87d4aa69a112d95f5f3da..59fd75240af2db9458cea25b62ee8c32bcc2fa34 100644 (file)
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3135,9 +3135,10 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
     HWord      fn = 0; /* address of helper fn, if required */
     Bool       arg1isEReg = False;
     AMD64SseOp op = Asse_INVALID;
-   IRType     ty = typeOfIRExpr(env->type_env,e);
     vassert(e);
+   IRType ty = typeOfIRExpr(env->type_env, e);
     vassert(ty == Ity_V128);
+   UInt laneBits = 0;
  
     if (e->tag == Iex_RdTmp) {
        return lookupIRTemp(env, e->Iex.RdTmp.tmp);
@@ -3521,20 +3522,33 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e )
           return dst;
        }
  
-      case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
-      case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
-      case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
-      case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
-      case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
-      case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
-      case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
-      case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
+      case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+      case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+      case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+      case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+      case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+      case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+      case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+      case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift;
        do_SseShift: {
-         HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
+         HReg dst  = newVRegV(env);
+         HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+         /* If it's a shift by an in-range immediate, generate a single
+            instruction. */
+         if (e->Iex.Binop.arg2->tag == Iex_Const) {
+            IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+            vassert(c->tag == Ico_U8);
+            UInt shift = c->Ico.U8;
+            if (shift < laneBits) {
+               addInstr(env, mk_vMOVsd_RR(greg, dst));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dst));
+               return dst;
+            }
+         }
+         /* Otherwise we have to do it the longwinded way. */
           AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
           AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
           HReg        ereg = newVRegV(env);
-         HReg        dst  = newVRegV(env);
           addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
           addInstr(env, AMD64Instr_Push(rmi));
           addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
@@ -3762,8 +3776,9 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
  {
     HWord fn = 0; /* address of helper fn, if required */
     vassert(e);
-   IRType ty = typeOfIRExpr(env->type_env,e);
+   IRType ty = typeOfIRExpr(env->type_env, e);
     vassert(ty == Ity_V256);
+   UInt laneBits = 0;
  
     AMD64SseOp op = Asse_INVALID;
  
@@ -3997,22 +4012,39 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
           return;
        }
  
-      case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
-      case Iop_ShlN32x8:  op = Asse_SHL32; goto do_SseShift;
-      case Iop_ShlN64x4:  op = Asse_SHL64; goto do_SseShift;
-      case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
-      case Iop_SarN32x8:  op = Asse_SAR32; goto do_SseShift;
-      case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
-      case Iop_ShrN32x8:  op = Asse_SHR32; goto do_SseShift;
-      case Iop_ShrN64x4:  op = Asse_SHR64; goto do_SseShift;
+      case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift;
+      case Iop_ShlN32x8:  laneBits = 32; op = Asse_SHL32; goto do_SseShift;
+      case Iop_ShlN64x4:  laneBits = 64; op = Asse_SHL64; goto do_SseShift;
+      case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift;
+      case Iop_SarN32x8:  laneBits = 32; op = Asse_SAR32; goto do_SseShift;
+      case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift;
+      case Iop_ShrN32x8:  laneBits = 32; op = Asse_SHR32; goto do_SseShift;
+      case Iop_ShrN64x4:  laneBits = 64; op = Asse_SHR64; goto do_SseShift;
        do_SseShift: {
+         HReg dstHi = newVRegV(env);
+         HReg dstLo = newVRegV(env);
           HReg gregHi, gregLo;
           iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
+         /* If it's a shift by an in-range immediate, generate two single
+            instructions. */
+         if (e->Iex.Binop.arg2->tag == Iex_Const) {
+            IRConst* c = e->Iex.Binop.arg2->Iex.Const.con;
+            vassert(c->tag == Ico_U8);
+            UInt shift = c->Ico.U8;
+            if (shift < laneBits) {
+               addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi));
+               addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
+               addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo));
+               *rHi = dstHi;
+               *rLo = dstLo;
+               return;
+            }
+         }
+         /* Otherwise we have to do it the longwinded way. */
           AMD64RMI*   rmi   = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
           AMD64AMode* rsp0  = AMD64AMode_IR(0, hregAMD64_RSP());
           HReg        ereg  = newVRegV(env);
-         HReg        dstHi = newVRegV(env);
-         HReg        dstLo = newVRegV(env);
           addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
           addInstr(env, AMD64Instr_Push(rmi));
           addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
author	Julian Seward <jseward@acm.org>
	Sat, 22 Dec 2018 12:34:11 +0000 (13:34 +0100)
committer	Julian Seward <jseward@acm.org>
	Sat, 22 Dec 2018 12:34:11 +0000 (13:34 +0100)
VEX/priv/host_amd64_defs.c		patch \| blob \| blame \| history
VEX/priv/host_amd64_defs.h		patch \| blob \| blame \| history
VEX/priv/host_amd64_isel.c		patch \| blob \| blame \| history