From: Julian Seward Date: Sat, 25 Aug 2007 23:21:08 +0000 (+0000) Subject: Merge from CGTUNE branch, code generation improvements for amd64: X-Git-Tag: svn/VALGRIND_3_3_1^2~32 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=73321f44e43e86bde68e5e4849910a860ea4e61f;p=thirdparty%2Fvalgrind.git Merge from CGTUNE branch, code generation improvements for amd64: r1772: When generating code for helper calls, be more aggressive about computing values directly into argument registers, thereby avoiding some reg-reg shuffling. This reduces the amount of code (on amd64) generated by Cachegrind by about 6% and has zero or marginal benefit for other tools. r1773: Emit 64-bit branch targets using 32-bit short forms when possible. Since (with V's default amd64 load address of 0x38000000) this is usually possible, it saves about 7% in code size for Memcheck and even more for Cachegrind. git-svn-id: svn://svn.valgrind.org/vex/trunk@1781 --- diff --git a/VEX/priv/host-amd64/hdefs.c b/VEX/priv/host-amd64/hdefs.c index a45550debe..401dc46e2f 100644 --- a/VEX/priv/host-amd64/hdefs.c +++ b/VEX/priv/host-amd64/hdefs.c @@ -1991,6 +1991,17 @@ static Bool fits8bits ( UInt w32 ) Int i32 = (Int)w32; return toBool(i32 == ((i32 << 24) >> 24)); } +/* Can the lower 32 bits be signedly widened to produce the whole + 64-bit value? In other words, are the top 33 bits either all 0 or + all 1 ? */ +static Bool fitsIn32Bits ( ULong x ) +{ + Long y0 = (Long)x; + Long y1 = y0; + y1 <<= 32; + y1 >>=/*s*/ 32; + return toBool(x == y1); +} /* Forming mod-reg-rm bytes and scale-index-base bytes. @@ -2601,25 +2612,36 @@ Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i, goto bad; } - case Ain_Call: + case Ain_Call: { /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr above, %r11 is used as an address temporary. */ /* jump over the following two insns if the condition does not hold */ + Bool shortImm = fitsIn32Bits(i->Ain.Call.target); if (i->Ain.Call.cond != Acc_ALWAYS) { *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1))); - *p++ = 13; /* 13 bytes in the next two insns */ + *p++ = shortImm ? 10 : 13; + /* 10 or 13 bytes in the next two insns */ + } + if (shortImm) { + /* 7 bytes: movl sign-extend(imm32), %r11 */ + *p++ = 0x49; + *p++ = 0xC7; + *p++ = 0xC3; + p = emit32(p, (UInt)i->Ain.Call.target); + } else { + /* 10 bytes: movabsq $target, %r11 */ + *p++ = 0x49; + *p++ = 0xBB; + p = emit64(p, i->Ain.Call.target); } - /* movabsq $target, %r11 */ - *p++ = 0x49; - *p++ = 0xBB; - p = emit64(p, i->Ain.Call.target); - /* call *%r11 */ + /* 3 bytes: call *%r11 */ *p++ = 0x41; *p++ = 0xFF; *p++ = 0xD3; goto done; + } case Ain_Goto: /* Use ptmp for backpatching conditional jumps. */ @@ -2701,11 +2723,19 @@ Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i, destined for %rax immediately prior to this Ain_Goto. 
*/ vassert(sizeof(ULong) == sizeof(void*)); vassert(dispatch != NULL); - /* movabsq $imm64, %rdx */ - *p++ = 0x48; - *p++ = 0xBA; - p = emit64(p, Ptr_to_ULong(dispatch)); + if (fitsIn32Bits(Ptr_to_ULong(dispatch))) { + /* movl sign-extend(imm32), %rdx */ + *p++ = 0x48; + *p++ = 0xC7; + *p++ = 0xC2; + p = emit32(p, (UInt)Ptr_to_ULong(dispatch)); + } else { + /* movabsq $imm64, %rdx */ + *p++ = 0x48; + *p++ = 0xBA; + p = emit64(p, Ptr_to_ULong(dispatch)); + } /* jmp *%rdx */ *p++ = 0xFF; *p++ = 0xE2; diff --git a/VEX/priv/host-amd64/isel.c b/VEX/priv/host-amd64/isel.c index 6f25c9fb9c..265e7c1060 100644 --- a/VEX/priv/host-amd64/isel.c +++ b/VEX/priv/host-amd64/isel.c @@ -372,20 +372,54 @@ static void sub_from_rsp ( ISelEnv* env, Int n ) //.. } -/* Used only in doHelperCall. See big comment in doHelperCall re - handling of register-parameter args. This function figures out - whether evaluation of an expression might require use of a fixed - register. If in doubt return True (safe but suboptimal). -*/ -static -Bool mightRequireFixedRegs ( IRExpr* e ) +/* Used only in doHelperCall. If possible, produce a single + instruction which computes 'e' into 'dst'. If not possible, return + NULL. */ + +static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env, + HReg dst, + IRExpr* e ) { - switch (e->tag) { - case Iex_RdTmp: case Iex_Const: case Iex_Get: - return False; - default: - return True; + vassert(typeOfIRExpr(env->type_env, e) == Ity_I64); + + if (e->tag == Iex_Const) { + vassert(e->Iex.Const.con->tag == Ico_U64); + if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) { + return AMD64Instr_Alu64R( + Aalu_MOV, + AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)), + dst + ); + } else { + return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst); + } + } + + if (e->tag == Iex_RdTmp) { + HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp); + return mk_iMOVsd_RR(src, dst); } + + if (e->tag == Iex_Get) { + vassert(e->Iex.Get.ty == Ity_I64); + return AMD64Instr_Alu64R( + Aalu_MOV, + AMD64RMI_Mem( + AMD64AMode_IR(e->Iex.Get.offset, + hregAMD64_RBP())), + dst); + } + + if (e->tag == Iex_Unop + && e->Iex.Unop.op == Iop_32Uto64 + && e->Iex.Unop.arg->tag == Iex_RdTmp) { + HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp); + return AMD64Instr_MovZLQ(src, dst); + } + + if (0) { ppIRExpr(e); vex_printf("\n"); } + + return NULL; } @@ -401,7 +435,7 @@ void doHelperCall ( ISelEnv* env, AMD64CondCode cc; HReg argregs[6]; HReg tmpregs[6]; - Bool go_fast; + AMD64Instr* fastinstrs[6]; Int n_args, i, argreg; /* Marshal args for a call and do the call. @@ -471,12 +505,13 @@ void doHelperCall ( ISelEnv* env, tmpregs[0] = tmpregs[1] = tmpregs[2] = tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG; + fastinstrs[0] = fastinstrs[1] = fastinstrs[2] = + fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL; + /* First decide which scheme (slow or fast) is to be used. First assume the fast scheme, and select slow if any contraindications (wow) appear. */ - go_fast = True; - if (guard) { if (guard->tag == Iex_Const && guard->Iex.Const.con->tag == Ico_U1 @@ -484,91 +519,94 @@ void doHelperCall ( ISelEnv* env, /* unconditional */ } else { /* Not manifestly unconditional -- be conservative. */ - go_fast = False; + goto slowscheme; } } - if (go_fast) { - for (i = 0; i < n_args; i++) { - if (mightRequireFixedRegs(args[i])) { - go_fast = False; - break; - } - } + /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll + use the slow scheme. 
Because this is tentative, we can't call + addInstr (that is, commit to) any instructions until we're + handled all the arguments. So park the resulting instructions + in a buffer and emit that if we're successful. */ + + /* FAST SCHEME */ + argreg = 0; + if (passBBP) { + fastinstrs[argreg] = mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]); + argreg++; } - /* At this point the scheme to use has been established. Generate - code to get the arg values into the argument rregs. */ + for (i = 0; i < n_args; i++) { + vassert(argreg < 6); + vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); + fastinstrs[argreg] + = iselIntExpr_single_instruction( env, argregs[argreg], args[i] ); + if (fastinstrs[argreg] == NULL) + goto slowscheme; + argreg++; + } - if (go_fast) { + /* Looks like we're in luck. Emit the accumulated instructions and + move on to doing the call itself. */ + vassert(argreg <= 6); + for (i = 0; i < argreg; i++) + addInstr(env, fastinstrs[i]); - /* FAST SCHEME */ - argreg = 0; - if (passBBP) { - addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg])); - argreg++; - } + /* Fast scheme only applies for unconditional calls. Hence: */ + cc = Acc_ALWAYS; - for (i = 0; i < n_args; i++) { - vassert(argreg < 6); - vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); - addInstr(env, AMD64Instr_Alu64R( - Aalu_MOV, - iselIntExpr_RMI(env, args[i]), - argregs[argreg] - ) - ); - argreg++; - } - - /* Fast scheme only applies for unconditional calls. Hence: */ - cc = Acc_ALWAYS; - - } else { - - /* SLOW SCHEME; move via temporaries */ - argreg = 0; - - if (passBBP) { - /* This is pretty stupid; better to move directly to rdi - after the rest of the args are done. */ - tmpregs[argreg] = newVRegI(env); - addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg])); - argreg++; - } - - for (i = 0; i < n_args; i++) { - vassert(argreg < 6); - vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); - tmpregs[argreg] = iselIntExpr_R(env, args[i]); - argreg++; - } - - /* Now we can compute the condition. We can't do it earlier - because the argument computations could trash the condition - codes. Be a bit clever to handle the common case where the - guard is 1:Bit. */ - cc = Acc_ALWAYS; - if (guard) { - if (guard->tag == Iex_Const - && guard->Iex.Const.con->tag == Ico_U1 - && guard->Iex.Const.con->Ico.U1 == True) { - /* unconditional -- do nothing */ - } else { - cc = iselCondCode( env, guard ); - } - } + goto handle_call; + + + /* SLOW SCHEME; move via temporaries */ + slowscheme: +#if 0 +if (n_args > 0) {for (i = 0; args[i]; i++) { +ppIRExpr(args[i]); vex_printf(" "); } +vex_printf("\n");} +#endif + argreg = 0; - /* Move the args to their final destinations. */ - for (i = 0; i < argreg; i++) { - /* None of these insns, including any spill code that might - be generated, may alter the condition codes. */ - addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) ); + if (passBBP) { + /* This is pretty stupid; better to move directly to rdi + after the rest of the args are done. */ + tmpregs[argreg] = newVRegI(env); + addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg])); + argreg++; + } + + for (i = 0; i < n_args; i++) { + vassert(argreg < 6); + vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); + tmpregs[argreg] = iselIntExpr_R(env, args[i]); + argreg++; + } + + /* Now we can compute the condition. We can't do it earlier + because the argument computations could trash the condition + codes. Be a bit clever to handle the common case where the + guard is 1:Bit. 
*/ + cc = Acc_ALWAYS; + if (guard) { + if (guard->tag == Iex_Const + && guard->Iex.Const.con->tag == Ico_U1 + && guard->Iex.Const.con->Ico.U1 == True) { + /* unconditional -- do nothing */ + } else { + cc = iselCondCode( env, guard ); } + } + /* Move the args to their final destinations. */ + for (i = 0; i < argreg; i++) { + /* None of these insns, including any spill code that might + be generated, may alter the condition codes. */ + addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) ); } + /* Finally, the call itself. */ + handle_call: addInstr(env, AMD64Instr_Call( cc, Ptr_to_ULong(cee->addr),
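
Illustrative sketch of the r1772 fast/slow scheme (not VEX code; every name here -- Instr, Arg, single_instruction, emit, do_call -- is a made-up stand-in for AMD64Instr, IRExpr, iselIntExpr_single_instruction, addInstr and doHelperCall). It shows only the control-flow pattern the patch introduces: park at most one instruction per argument in fastinstrs[], commit the whole batch only if every argument qualified, and otherwise fall back to the slow via-temporaries scheme without having emitted anything.

#include <stdio.h>

typedef struct { const char* text; } Instr;              /* stand-in for AMD64Instr */
typedef struct { int is_simple; const char* desc; } Arg; /* stand-in for IRExpr     */

/* Stand-in for iselIntExpr_single_instruction: returns NULL when no
   single instruction computes the argument into its register. */
static Instr* single_instruction ( Arg* a, int argreg )
{
   static Instr buf[6];
   if (!a->is_simple)
      return NULL;
   buf[argreg].text = a->desc;
   return &buf[argreg];
}

static void emit ( Instr* i )   /* stand-in for addInstr */
{
   printf("  emit: %s\n", i->text);
}

static void do_call ( Arg* args, int n_args )
{
   Instr* fastinstrs[6] = { NULL, NULL, NULL, NULL, NULL, NULL };
   int i;

   for (i = 0; i < n_args; i++) {
      fastinstrs[i] = single_instruction(&args[i], i);
      if (fastinstrs[i] == NULL)
         goto slowscheme;          /* contraindication: bail out, nothing emitted yet */
   }

   /* FAST SCHEME: every argument lands directly in its argument register. */
   printf("fast scheme:\n");
   for (i = 0; i < n_args; i++)
      emit(fastinstrs[i]);
   return;

  slowscheme:
   /* SLOW SCHEME: evaluate each argument into a temporary, then move the
      temporaries into the argument registers (the pre-r1772 path). */
   printf("slow scheme (argument %d needs more than one instruction)\n", i);
}

int main ( void )
{
   Arg simple[2]  = { {1, "movq $0x1234, %rdi"}, {1, "movq 8(%rbp), %rsi"} };
   Arg complex[2] = { {1, "movq $0x1234, %rdi"}, {0, "(compound expression)"} };
   do_call(simple, 2);
   do_call(complex, 2);
   return 0;
}

Deferring the commit step is the whole point: the fast scheme is tentative, so no instruction may be added until all arguments are known to be single-instruction computable, which is why the patch buffers them instead of emitting as it goes.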
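
And a minimal standalone sketch of the r1773 size test: it restates the patch's fitsIn32Bits() as an equivalent "top 33 bits all equal" check (rather than the shift pair) and prints which Ain_Call encoding would be chosen. Only the 7-byte/10-byte encodings and the 0x38000000 default load address come from the patch and commit message; the sample addresses are invented for illustration.

#include <stdio.h>

typedef unsigned long long ULong;

/* Can the low 32 bits be sign-extended to give the whole 64-bit value?
   Equivalently: are the top 33 bits either all 0 or all 1? */
static int fitsIn32Bits ( ULong x )
{
   ULong top33 = x >> 31;
   return top33 == 0 || top33 == 0x1FFFFFFFFULL;
}

int main ( void )
{
   /* 0x38000000 is V's default amd64 load address, so helper
      addresses normally satisfy the test. */
   ULong targets[3] = { 0x38001234ULL,          /* typical helper     */
                        0xFFFFFFFF80000000ULL,  /* negative simm32    */
                        0x700000000ULL };       /* needs all 64 bits  */
   int i;
   for (i = 0; i < 3; i++)
      printf("0x%llx -> %s\n",
             targets[i],
             fitsIn32Bits(targets[i])
                ? "7 bytes:  movq $simm32, %r11"
                : "10 bytes: movabsq $imm64, %r11");
   return 0;
}

With the default load address, nearly every helper and dispatcher address passes the test, which is where the roughly 7% code-size saving for Memcheck quoted above comes from.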