From: Julian Seward Date: Sat, 25 Aug 2007 23:21:08 +0000 (+0000) Subject: Merge from CGTUNE branch, code generation improvements for amd64: X-Git-Tag: svn/VALGRIND_3_3_1^2~32 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=73321f44e43e86bde68e5e4849910a860ea4e61f;p=thirdparty%2Fvalgrind.git Merge from CGTUNE branch, code generation improvements for amd64: r1772: When generating code for helper calls, be more aggressive about computing values directly into argument registers, thereby avoiding some reg-reg shuffling. This reduces the amount of code (on amd64) generated by Cachegrind by about 6% and has zero or marginal benefit for other tools. r1773: Emit 64-bit branch targets using 32-bit short forms when possible. Since (with V's default amd64 load address of 0x38000000) this is usually possible, it saves about 7% in code size for Memcheck and even more for Cachegrind. git-svn-id: svn://svn.valgrind.org/vex/trunk@1781 --- diff --git a/VEX/priv/host-amd64/hdefs.c b/VEX/priv/host-amd64/hdefs.c index a45550debe..401dc46e2f 100644 --- a/VEX/priv/host-amd64/hdefs.c +++ b/VEX/priv/host-amd64/hdefs.c @@ -1991,6 +1991,17 @@ static Bool fits8bits ( UInt w32 ) Int i32 = (Int)w32; return toBool(i32 == ((i32 << 24) >> 24)); } +/* Can the lower 32 bits be signedly widened to produce the whole + 64-bit value? In other words, are the top 33 bits either all 0 or + all 1 ? */ +static Bool fitsIn32Bits ( ULong x ) +{ + Long y0 = (Long)x; + Long y1 = y0; + y1 <<= 32; + y1 >>=/*s*/ 32; + return toBool(x == y1); +} /* Forming mod-reg-rm bytes and scale-index-base bytes. @@ -2601,25 +2612,36 @@ Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i, goto bad; } - case Ain_Call: + case Ain_Call: { /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr above, %r11 is used as an address temporary. */ /* jump over the following two insns if the condition does not hold */ + Bool shortImm = fitsIn32Bits(i->Ain.Call.target); if (i->Ain.Call.cond != Acc_ALWAYS) { *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1))); - *p++ = 13; /* 13 bytes in the next two insns */ + *p++ = shortImm ? 10 : 13; + /* 10 or 13 bytes in the next two insns */ + } + if (shortImm) { + /* 7 bytes: movl sign-extend(imm32), %r11 */ + *p++ = 0x49; + *p++ = 0xC7; + *p++ = 0xC3; + p = emit32(p, (UInt)i->Ain.Call.target); + } else { + /* 10 bytes: movabsq $target, %r11 */ + *p++ = 0x49; + *p++ = 0xBB; + p = emit64(p, i->Ain.Call.target); } - /* movabsq $target, %r11 */ - *p++ = 0x49; - *p++ = 0xBB; - p = emit64(p, i->Ain.Call.target); - /* call *%r11 */ + /* 3 bytes: call *%r11 */ *p++ = 0x41; *p++ = 0xFF; *p++ = 0xD3; goto done; + } case Ain_Goto: /* Use ptmp for backpatching conditional jumps. */ @@ -2701,11 +2723,19 @@ Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i, destined for %rax immediately prior to this Ain_Goto. 
*/ vassert(sizeof(ULong) == sizeof(void*)); vassert(dispatch != NULL); - /* movabsq $imm64, %rdx */ - *p++ = 0x48; - *p++ = 0xBA; - p = emit64(p, Ptr_to_ULong(dispatch)); + if (fitsIn32Bits(Ptr_to_ULong(dispatch))) { + /* movl sign-extend(imm32), %rdx */ + *p++ = 0x48; + *p++ = 0xC7; + *p++ = 0xC2; + p = emit32(p, (UInt)Ptr_to_ULong(dispatch)); + } else { + /* movabsq $imm64, %rdx */ + *p++ = 0x48; + *p++ = 0xBA; + p = emit64(p, Ptr_to_ULong(dispatch)); + } /* jmp *%rdx */ *p++ = 0xFF; *p++ = 0xE2; diff --git a/VEX/priv/host-amd64/isel.c b/VEX/priv/host-amd64/isel.c index 6f25c9fb9c..265e7c1060 100644 --- a/VEX/priv/host-amd64/isel.c +++ b/VEX/priv/host-amd64/isel.c @@ -372,20 +372,54 @@ static void sub_from_rsp ( ISelEnv* env, Int n ) //.. } -/* Used only in doHelperCall. See big comment in doHelperCall re - handling of register-parameter args. This function figures out - whether evaluation of an expression might require use of a fixed - register. If in doubt return True (safe but suboptimal). -*/ -static -Bool mightRequireFixedRegs ( IRExpr* e ) +/* Used only in doHelperCall. If possible, produce a single + instruction which computes 'e' into 'dst'. If not possible, return + NULL. */ + +static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env, + HReg dst, + IRExpr* e ) { - switch (e->tag) { - case Iex_RdTmp: case Iex_Const: case Iex_Get: - return False; - default: - return True; + vassert(typeOfIRExpr(env->type_env, e) == Ity_I64); + + if (e->tag == Iex_Const) { + vassert(e->Iex.Const.con->tag == Ico_U64); + if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) { + return AMD64Instr_Alu64R( + Aalu_MOV, + AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)), + dst + ); + } else { + return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst); + } + } + + if (e->tag == Iex_RdTmp) { + HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp); + return mk_iMOVsd_RR(src, dst); } + + if (e->tag == Iex_Get) { + vassert(e->Iex.Get.ty == Ity_I64); + return AMD64Instr_Alu64R( + Aalu_MOV, + AMD64RMI_Mem( + AMD64AMode_IR(e->Iex.Get.offset, + hregAMD64_RBP())), + dst); + } + + if (e->tag == Iex_Unop + && e->Iex.Unop.op == Iop_32Uto64 + && e->Iex.Unop.arg->tag == Iex_RdTmp) { + HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp); + return AMD64Instr_MovZLQ(src, dst); + } + + if (0) { ppIRExpr(e); vex_printf("\n"); } + + return NULL; } @@ -401,7 +435,7 @@ void doHelperCall ( ISelEnv* env, AMD64CondCode cc; HReg argregs[6]; HReg tmpregs[6]; - Bool go_fast; + AMD64Instr* fastinstrs[6]; Int n_args, i, argreg; /* Marshal args for a call and do the call. @@ -471,12 +505,13 @@ void doHelperCall ( ISelEnv* env, tmpregs[0] = tmpregs[1] = tmpregs[2] = tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG; + fastinstrs[0] = fastinstrs[1] = fastinstrs[2] = + fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL; + /* First decide which scheme (slow or fast) is to be used. First assume the fast scheme, and select slow if any contraindications (wow) appear. */ - go_fast = True; - if (guard) { if (guard->tag == Iex_Const && guard->Iex.Const.con->tag == Ico_U1 @@ -484,91 +519,94 @@ void doHelperCall ( ISelEnv* env, /* unconditional */ } else { /* Not manifestly unconditional -- be conservative. */ - go_fast = False; + goto slowscheme; } } - if (go_fast) { - for (i = 0; i < n_args; i++) { - if (mightRequireFixedRegs(args[i])) { - go_fast = False; - break; - } - } + /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll + use the slow scheme. 
Because this is tentative, we can't call + addInstr (that is, commit to) any instructions until we're + handled all the arguments. So park the resulting instructions + in a buffer and emit that if we're successful. */ + + /* FAST SCHEME */ + argreg = 0; + if (passBBP) { + fastinstrs[argreg] = mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]); + argreg++; } - /* At this point the scheme to use has been established. Generate - code to get the arg values into the argument rregs. */ + for (i = 0; i < n_args; i++) { + vassert(argreg < 6); + vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); + fastinstrs[argreg] + = iselIntExpr_single_instruction( env, argregs[argreg], args[i] ); + if (fastinstrs[argreg] == NULL) + goto slowscheme; + argreg++; + } - if (go_fast) { + /* Looks like we're in luck. Emit the accumulated instructions and + move on to doing the call itself. */ + vassert(argreg <= 6); + for (i = 0; i < argreg; i++) + addInstr(env, fastinstrs[i]); - /* FAST SCHEME */ - argreg = 0; - if (passBBP) { - addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg])); - argreg++; - } + /* Fast scheme only applies for unconditional calls. Hence: */ + cc = Acc_ALWAYS; - for (i = 0; i < n_args; i++) { - vassert(argreg < 6); - vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); - addInstr(env, AMD64Instr_Alu64R( - Aalu_MOV, - iselIntExpr_RMI(env, args[i]), - argregs[argreg] - ) - ); - argreg++; - } - - /* Fast scheme only applies for unconditional calls. Hence: */ - cc = Acc_ALWAYS; - - } else { - - /* SLOW SCHEME; move via temporaries */ - argreg = 0; - - if (passBBP) { - /* This is pretty stupid; better to move directly to rdi - after the rest of the args are done. */ - tmpregs[argreg] = newVRegI(env); - addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg])); - argreg++; - } - - for (i = 0; i < n_args; i++) { - vassert(argreg < 6); - vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); - tmpregs[argreg] = iselIntExpr_R(env, args[i]); - argreg++; - } - - /* Now we can compute the condition. We can't do it earlier - because the argument computations could trash the condition - codes. Be a bit clever to handle the common case where the - guard is 1:Bit. */ - cc = Acc_ALWAYS; - if (guard) { - if (guard->tag == Iex_Const - && guard->Iex.Const.con->tag == Ico_U1 - && guard->Iex.Const.con->Ico.U1 == True) { - /* unconditional -- do nothing */ - } else { - cc = iselCondCode( env, guard ); - } - } + goto handle_call; + + + /* SLOW SCHEME; move via temporaries */ + slowscheme: +#if 0 +if (n_args > 0) {for (i = 0; args[i]; i++) { +ppIRExpr(args[i]); vex_printf(" "); } +vex_printf("\n");} +#endif + argreg = 0; - /* Move the args to their final destinations. */ - for (i = 0; i < argreg; i++) { - /* None of these insns, including any spill code that might - be generated, may alter the condition codes. */ - addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) ); + if (passBBP) { + /* This is pretty stupid; better to move directly to rdi + after the rest of the args are done. */ + tmpregs[argreg] = newVRegI(env); + addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg])); + argreg++; + } + + for (i = 0; i < n_args; i++) { + vassert(argreg < 6); + vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); + tmpregs[argreg] = iselIntExpr_R(env, args[i]); + argreg++; + } + + /* Now we can compute the condition. We can't do it earlier + because the argument computations could trash the condition + codes. Be a bit clever to handle the common case where the + guard is 1:Bit. 
*/ + cc = Acc_ALWAYS; + if (guard) { + if (guard->tag == Iex_Const + && guard->Iex.Const.con->tag == Ico_U1 + && guard->Iex.Const.con->Ico.U1 == True) { + /* unconditional -- do nothing */ + } else { + cc = iselCondCode( env, guard ); } + } + /* Move the args to their final destinations. */ + for (i = 0; i < argreg; i++) { + /* None of these insns, including any spill code that might + be generated, may alter the condition codes. */ + addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) ); } + /* Finally, the call itself. */ + handle_call: addInstr(env, AMD64Instr_Call( cc, Ptr_to_ULong(cee->addr),
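
Illustrative sketch of the r1772 fast/slow scheme (not VEX code; every name here -- Instr, Arg, single_instruction, emit, do_call -- is a made-up stand-in for AMD64Instr, IRExpr, iselIntExpr_single_instruction, addInstr and doHelperCall). It shows only the control-flow pattern the patch introduces: park at most one instruction per argument in fastinstrs[], commit the whole batch only if every argument qualified, and otherwise fall back to the slow via-temporaries scheme without having emitted anything.

#include <stdio.h>

typedef struct { const char* text; } Instr;              /* stand-in for AMD64Instr */
typedef struct { int is_simple; const char* desc; } Arg; /* stand-in for IRExpr     */

/* Stand-in for iselIntExpr_single_instruction: returns NULL when no
   single instruction computes the argument into its register. */
static Instr* single_instruction ( Arg* a, int argreg )
{
   static Instr buf[6];
   if (!a->is_simple)
      return NULL;
   buf[argreg].text = a->desc;
   return &buf[argreg];
}

static void emit ( Instr* i )   /* stand-in for addInstr */
{
   printf("  emit: %s\n", i->text);
}

static void do_call ( Arg* args, int n_args )
{
   Instr* fastinstrs[6] = { NULL, NULL, NULL, NULL, NULL, NULL };
   int i;

   for (i = 0; i < n_args; i++) {
      fastinstrs[i] = single_instruction(&args[i], i);
      if (fastinstrs[i] == NULL)
         goto slowscheme;          /* contraindication: bail out, nothing emitted yet */
   }

   /* FAST SCHEME: every argument lands directly in its argument register. */
   printf("fast scheme:\n");
   for (i = 0; i < n_args; i++)
      emit(fastinstrs[i]);
   return;

  slowscheme:
   /* SLOW SCHEME: evaluate each argument into a temporary, then move the
      temporaries into the argument registers (the pre-r1772 path). */
   printf("slow scheme (argument %d needs more than one instruction)\n", i);
}

int main ( void )
{
   Arg simple[2]  = { {1, "movq $0x1234, %rdi"}, {1, "movq 8(%rbp), %rsi"} };
   Arg complex[2] = { {1, "movq $0x1234, %rdi"}, {0, "(compound expression)"} };
   do_call(simple, 2);
   do_call(complex, 2);
   return 0;
}

Deferring the commit step is the whole point: the fast scheme is tentative, so no instruction may be added until all arguments are known to be single-instruction computable, which is why the patch buffers them instead of emitting as it goes.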
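
And a minimal standalone sketch of the r1773 size test: it restates the patch's fitsIn32Bits() as an equivalent "top 33 bits all equal" check (rather than the shift pair) and prints which Ain_Call encoding would be chosen. Only the 7-byte/10-byte encodings and the 0x38000000 default load address come from the patch and commit message; the sample addresses are invented for illustration.

#include <stdio.h>

typedef unsigned long long ULong;

/* Can the low 32 bits be sign-extended to give the whole 64-bit value?
   Equivalently: are the top 33 bits either all 0 or all 1? */
static int fitsIn32Bits ( ULong x )
{
   ULong top33 = x >> 31;
   return top33 == 0 || top33 == 0x1FFFFFFFFULL;
}

int main ( void )
{
   /* 0x38000000 is V's default amd64 load address, so helper
      addresses normally satisfy the test. */
   ULong targets[3] = { 0x38001234ULL,          /* typical helper     */
                        0xFFFFFFFF80000000ULL,  /* negative simm32    */
                        0x700000000ULL };       /* needs all 64 bits  */
   int i;
   for (i = 0; i < 3; i++)
      printf("0x%llx -> %s\n",
             targets[i],
             fitsIn32Bits(targets[i])
                ? "7 bytes:  movq $simm32, %r11"
                : "10 bytes: movabsq $imm64, %r11");
   return 0;
}

With the default load address, nearly every helper and dispatcher address passes the test, which is where the roughly 7% code-size saving for Memcheck quoted above comes from.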