From: Josef Weidendorfer
Date: Fri, 2 Jul 2010 19:56:23 +0000 (+0000)
Subject: Callgrind: add branch prediction from Cachegrind
X-Git-Tag: svn/VALGRIND_3_6_0~238
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b4b9af9a8264f894200476098e76449750e5ffe7;p=thirdparty%2Fvalgrind.git

Callgrind: add branch prediction from Cachegrind

Callgrind now uses Cachegrind's command line options to switch on simulation:
"--branch-sim=yes/no" for branch prediction and "--cache-sim=yes/no" for cache
simulation (for more consistency and to avoid confusion). The previously used
"--simulate-cache=yes/no" is still supported but deprecated.
Corresponding documentation and tests are included.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11207
---
diff --git a/NEWS b/NEWS
index ff3e5df6c8..25cceb3700 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,10 @@ Improvements:
   harder than the heap-level output, but this option is useful if you want
   to account for every byte of memory used by a program.
 
+- Callgrind can now do branch prediction simulation, similar to Cachegrind.
+  In addition, it can optionally count the number of executed global bus events.
+  Both can be used for a better approximation of a "Cycle Estimation" as
+  derived event (you need to update the event formula in KCachegrind yourself).
 
 Release 3.5.0 (19 August 2009)
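
The "Cycle Estimation" mentioned in the NEWS entry above is a derived event that
the user defines in KCachegrind. As a hedged example only — the weights are
common rules of thumb, not values this patch prescribes — such a formula
typically looks like

   CEst = Ir + 10*Bm + 10*L1m + 100*L2m

with Bm = Bcm + Bim (mispredicted conditional plus indirect branches), L1m the
sum of the first-level cache misses (I1mr + D1mr + D1mw) and L2m the sum of the
second-level misses (I2mr + D2mr + D2mw). When --collect-bus=yes is also given,
the global bus event count "Ge" can be added to the formula with a weight of
its own.
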
diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c
index 8620c6f7bc..24862a80ca 100644
--- a/callgrind/bbcc.c
+++ b/callgrind/bbcc.c
@@ -580,6 +580,7 @@ void CLG_(setup_bbcc)(BB* bb)
   if (last_bb) {
       passed = CLG_(current_state).jmps_passed;
+      CLG_ASSERT(passed <= last_bb->cjmp_count);
       if (passed == last_bb->cjmp_count) {
           jmpkind = last_bb->jmpkind;
@@ -599,9 +600,9 @@ void CLG_(setup_bbcc)(BB* bb)
       last_bbcc->ecounter_sum++;
       last_bbcc->jmp[passed].ecounter++;
       if (!CLG_(clo).simulate_cache) {
-          /* update Ir cost */
-          int instr_count = last_bb->jmp[passed].instr+1;
-          CLG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count;
+          /* update Ir cost */
+          UInt instr_count = last_bb->jmp[passed].instr+1;
+          CLG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count;
       }
   }
diff --git a/callgrind/clo.c b/callgrind/clo.c
index e4da421a10..fc99b0df62 100644
--- a/callgrind/clo.c
+++ b/callgrind/clo.c
@@ -415,8 +415,6 @@ Bool CLG_(process_cmd_line_option)(Char* arg)
   /* compatibility alias, deprecated option */
   else if VG_BOOL_CLO(arg, "--trace-jump",      CLG_(clo).collect_jumps) {}
 
-  else if VG_BOOL_CLO(arg, "--collect-bus",     CLG_(clo).collect_bus) {}
-
   else if VG_BOOL_CLO(arg, "--combine-dumps",   CLG_(clo).combine_dumps) {}
 
   else if VG_BOOL_CLO(arg, "--collect-atstart", CLG_(clo).collect_atstart) {}
@@ -527,8 +525,13 @@ Bool CLG_(process_cmd_line_option)(Char* arg)
   else if VG_BOOL_CLO(arg, "--collect-alloc",   CLG_(clo).collect_alloc) {}
   else if VG_BOOL_CLO(arg, "--collect-systime", CLG_(clo).collect_systime) {}
 
+  else if VG_BOOL_CLO(arg, "--collect-bus",     CLG_(clo).collect_bus) {}
+  /* for option compatibility with cachegrind */
+  else if VG_BOOL_CLO(arg, "--cache-sim",       CLG_(clo).simulate_cache) {}
+  /* compatibility alias, deprecated option */
   else if VG_BOOL_CLO(arg, "--simulate-cache",  CLG_(clo).simulate_cache) {}
-
+  /* for option compatibility with cachegrind */
+  else if VG_BOOL_CLO(arg, "--branch-sim",      CLG_(clo).simulate_branch) {}
   else {
       Bool isCachesimOption = (*CLG_(cachesim).parse_opt)(arg);
@@ -592,6 +595,9 @@ void CLG_(print_usage)(void)
 #if CLG_EXPERIMENTAL
 "    --fn-group=               Put function into separation group\n"
 #endif
+"\n   simulation options:\n"
+"    --branch-sim=no|yes       Do branch prediction simulation [no]\n"
+"    --cache-sim=no|yes        Do cache simulation [no]\n"
     );
 
    (*CLG_(cachesim).print_opts)();
@@ -642,6 +648,7 @@ void CLG_(set_clo_defaults)(void)
   CLG_(clo).collect_jumps    = False;
   CLG_(clo).collect_alloc    = False;
   CLG_(clo).collect_systime  = False;
+  CLG_(clo).collect_bus      = False;
 
   CLG_(clo).skip_plt         = True;
   CLG_(clo).separate_callers = 0;
@@ -651,6 +658,7 @@ void CLG_(set_clo_defaults)(void)
   /* Instrumentation */
   CLG_(clo).instrument_atstart = True;
   CLG_(clo).simulate_cache     = False;
+  CLG_(clo).simulate_branch    = False;
 
   /* Call graph */
   CLG_(clo).pop_on_jump = False;
diff --git a/callgrind/docs/cl-manual.xml b/callgrind/docs/cl-manual.xml
index a1339841de..e2289ff552 100644
--- a/callgrind/docs/cl-manual.xml
+++ b/callgrind/docs/cl-manual.xml
@@ -4,7 +4,7 @@
   [ %vg-entities; ]>
 
-Callgrind: a call-graph generating cache profiler
+Callgrind: a call-graph generating cache and branch prediction profiler
 
 To use this tool, you must specify
@@ -14,14 +14,14 @@ Valgrind command line.
 
 Overview
 
-Callgrind is a profiling tool that can
-construct a call graph for a program's run.
+Callgrind is a profiling tool that records the call history among
+functions in a program's run as a call-graph.
 By default, the collected data consists of the number of instructions
 executed, their relationship to source lines, the caller/callee
 relationship between functions, and the numbers of such calls.
-Optionally, a cache simulator (similar to Cachegrind) can produce
-further information about the memory access behavior of the application.
+Optionally, cache simulation and/or branch prediction (similar to Cachegrind)
+can produce further information about the runtime behavior of an application.
 
 The profile data is written out to a file at program
@@ -175,10 +175,10 @@ on heuristics to detect calls and returns.
   results in this case.
 
   If you are additionally interested in measuring the
-  cache behavior of your
-  program, use Callgrind with the option
-  --simulate-cache=yes.
-  However, expect a further slow down approximately by a factor of 2.
+  cache behavior of your program, use Callgrind with the option
+  --cache-sim=yes. For
+  branch prediction simulation, use --branch-sim=yes.
+  Expect a further slow down approximately by a factor of 2.
 
   If the program section you want to profile is somewhere in the
   middle of the run, it is beneficial to
@@ -371,7 +371,7 @@ callgrind.out.pid.part-threa
   "global bus events" is used. The short name of the event type used for
   global bus events is "Ge".
-  To count global bus events, use --collect-bus=yes.
+  To count global bus events, use --collect-bus=yes.
@@ -779,7 +779,7 @@ Also see .
 
-
+
 
@@ -917,23 +917,55 @@ Also see .
 
+
-Cache simulation options
+       xreflabel="Simulation options">
+Simulation options
 
-
-
+
+
 
-
+
   Specify if you want to do full cache simulation. By default,
-  only instruction read accesses will be profiled.
+  only instruction read accesses will be counted ("Ir").
+  With cache simulation, further event counters are enabled:
+  Cache misses on instruction reads ("I1mr"/"I2mr"),
+  data read accesses ("Dr") and related cache misses ("D1mr"/"D2mr"),
+  data write accesses ("Dw") and related cache misses ("D1mw"/"D2mw").
+  For more information, see .
+
+
+
+
+
+
+  Specify if you want to do branch prediction simulation.
+  Further event counters are enabled: Number of executed conditional
+  branches and related predictor misses ("Bc"/"Bcm"), executed indirect
+  jumps and related misses of the jump address predictor ("Bi"/"Bim").
+
+
+
+
+
+
+
+
+
+
+Cache simulation options
+
+
+
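
The documentation hunk above names the new event counters; the simwork program
that drives the tests added at the end of this patch is not part of the diff, so
as a purely illustrative stand-in (names and loop counts are made up), code of
the following shape is what Bc/Bcm and Bi/Bim describe — the data-dependent if
is a conditional branch, the call through a function-pointer table is an
indirect jump:

   /* illustrative only, not part of the patch */
   #include <stdio.h>
   #include <stdlib.h>

   static int add_one(int x)   { return x + 1; }
   static int double_it(int x) { return x * 2; }

   int main(void)
   {
      /* calls through this table are indirect jumps: Bi, mispredicted ones Bim */
      int (*ops[2])(int) = { add_one, double_it };
      int i, sum = 0;

      for (i = 0; i < 100000; i++) {
         /* data-dependent conditional branch: Bc, mispredicted ones Bcm */
         if (rand() & 1)
            sum += ops[rand() & 1](i);
         else
            sum -= i;
      }
      printf("sum = %d\n", sum);
      return 0;
   }

Running such a program as "valgrind --tool=callgrind --branch-sim=yes ./prog"
enables exactly the Bc/Bcm/Bi/Bim counters documented above; --cache-sim=yes can
be combined with it, as the simwork-both test below does.
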
diff --git a/callgrind/global.h b/callgrind/global.h
index db694a8cc5..706edec988 100644
--- a/callgrind/global.h
+++ b/callgrind/global.h
@@ -92,6 +92,7 @@ struct _CommandLineOptions {
   /* Instrument options */
   Bool instrument_atstart;  /* Instrument at start? */
   Bool simulate_cache;      /* Call into cache simulator ? */
+  Bool simulate_branch;     /* Call into branch prediction simulator ? */
 
   /* Call graph generation */
   Bool pop_on_jump;         /* Handle a jump between functions as ret+call */
@@ -652,7 +653,7 @@ struct cachesim_if
   void (*post_clo_init)(void);
   void (*clear)(void);
   void (*getdesc)(Char* buf);
-  void (*printstat)(void);
+  void (*printstat)(Int,Int,Int);
   void (*add_icost)(SimCost, BBCC*, InstrInfo*, ULong);
   void (*finish)(void);
@@ -681,9 +682,11 @@ extern ULong* CLG_(cost_base);
 #define EG_IR    1
 #define EG_DR    2
 #define EG_DW    3
-#define EG_BUS   4
-#define EG_ALLOC 5
-#define EG_SYS   6
+#define EG_BC    4
+#define EG_BI    5
+#define EG_BUS   6
+#define EG_ALLOC 7
+#define EG_SYS   8
 
 struct event_sets {
   EventSet *base, *full;
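
The EG_BC and EG_BI groups added above each hold two counters stored
consecutively, which is why the main.c code below increments
cost[ fullOffset(EG_BC) ] for every executed conditional branch and
cost[ fullOffset(EG_BC)+1 ] when it was mispredicted (and likewise for EG_BI).
The mispredict bit itself comes from do_cond_branch_predict and
do_ind_branch_predict in Cachegrind's cg_branchpred.c, which main.c now
#includes but whose body is not part of this diff. As a hedged sketch of what
such predictors do — the table sizes, indexing and update rules here are
assumptions, not Valgrind's actual parameters — a taken/not-taken predictor is
typically a table of 2-bit saturating counters indexed by branch address, and
an indirect-target predictor a small table of last-seen destinations:

   /* illustrative model only, not cg_branchpred.c */
   #include <stdio.h>

   #define COND_TABLE_SIZE 16384             /* assumed size, power of two */
   #define BTB_SIZE        512               /* assumed size, power of two */

   static unsigned char cond_table[COND_TABLE_SIZE]; /* 0..3; >=2 predicts taken */
   static unsigned long btb[BTB_SIZE];                /* last seen target per slot */

   /* returns 1 on a mispredict, mirroring "miss = 1 & do_cond_branch_predict(...)" */
   static int model_cond_branch_predict(unsigned long instr_addr, int taken)
   {
      unsigned idx  = (instr_addr >> 2) & (COND_TABLE_SIZE - 1);
      int      miss = ((cond_table[idx] >= 2) != (taken != 0));
      if (taken) { if (cond_table[idx] < 3) cond_table[idx]++; }  /* saturating */
      else       { if (cond_table[idx] > 0) cond_table[idx]--; }  /* update     */
      return miss;
   }

   static int model_ind_branch_predict(unsigned long instr_addr, unsigned long dst)
   {
      unsigned idx  = (instr_addr >> 2) & (BTB_SIZE - 1);
      int      miss = (btb[idx] != dst);
      btb[idx] = dst;                        /* always remember the latest target */
      return miss;
   }

   int main(void)
   {
      int i, bcm = 0, bim = 0;
      for (i = 0; i < 1000; i++)             /* alternating pattern: hard to predict */
         bcm += model_cond_branch_predict(0x400100, i & 1);
      for (i = 0; i < 1000; i++)             /* stable target: easy to predict */
         bim += model_ind_branch_predict(0x400200, 0x400300);
      printf("modelled Bcm=%d Bim=%d\n", bcm, bim);
      return 0;
   }

The guard-widening and XOR-with-one IR statements added in the Ist_Exit hunk of
main.c below exist only so that a predictor of this kind is fed the real
taken/not-taken decision even when VEX has inverted the branch condition.
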
diff --git a/callgrind/main.c b/callgrind/main.c
index c0290b460d..e36ba8a187 100644
--- a/callgrind/main.c
+++ b/callgrind/main.c
@@ -37,6 +37,8 @@
 
 #include 
 
+#include "cg_branchpred.c"
+
 /*------------------------------------------------------------*/
 /*--- Global variables                                      ---*/
 /*------------------------------------------------------------*/
@@ -103,11 +105,13 @@ static void log_global_event(InstrInfo* ii)
 {
     ULong* cost_Bus;
 
-    CLG_DEBUG(0, "log_global_event:  Ir %#lx/%u\n",
+    CLG_DEBUG(6, "log_global_event:  Ir %#lx/%u\n",
               CLG_(bb_base) + ii->instr_offset, ii->instr_size);
 
     if (!CLG_(current_state).collect) return;
 
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );
+
     CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;
 
     if (CLG_(current_state).nonskipped)
@@ -118,6 +122,71 @@ static void log_global_event(InstrInfo* ii)
 }
 
+/* For branches, we consult two different predictors, one which
+   predicts taken/untaken for conditional branches, and the other
+   which predicts the branch target address for indirect branches
+   (jump-to-register style ones). */
+
+static VG_REGPARM(2)
+void log_cond_branch(InstrInfo* ii, Word taken)
+{
+    Bool miss;
+    Int fullOffset_Bc;
+    ULong* cost_Bc;
+
+    CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %lu\n",
+              CLG_(bb_base) + ii->instr_offset, taken);
+
+    miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);
+
+    if (!CLG_(current_state).collect) return;
+
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );
+
+    if (CLG_(current_state).nonskipped)
+        cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
+    else
+        cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
+
+    fullOffset_Bc = fullOffset(EG_BC);
+    CLG_(current_state).cost[ fullOffset_Bc ]++;
+    cost_Bc[0]++;
+    if (miss) {
+        CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
+        cost_Bc[1]++;
+    }
+}
+
+static VG_REGPARM(2)
+void log_ind_branch(InstrInfo* ii, UWord actual_dst)
+{
+    Bool miss;
+    Int fullOffset_Bi;
+    ULong* cost_Bi;
+
+    CLG_DEBUG(6, "log_ind_branch:  Ir %#lx, dst %#lx\n",
+              CLG_(bb_base) + ii->instr_offset, actual_dst);
+
+    miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);
+
+    if (!CLG_(current_state).collect) return;
+
+    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );
+
+    if (CLG_(current_state).nonskipped)
+        cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
+    else
+        cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];
+
+    fullOffset_Bi = fullOffset(EG_BI);
+    CLG_(current_state).cost[ fullOffset_Bi ]++;
+    cost_Bi[0]++;
+    if (miss) {
+        CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
+        cost_Bi[1]++;
+    }
+}
+
 /*------------------------------------------------------------*/
 /*--- Instrumentation structures and event queue handling  ---*/
 /*------------------------------------------------------------*/
@@ -161,6 +230,8 @@ typedef
       Ev_Dr,  // Data read
       Ev_Dw,  // Data write
       Ev_Dm,  // Data modify (read then write)
+      Ev_Bc,  // branch conditional
+      Ev_Bi,  // branch indirect (to unknown destination)
       Ev_G    // Global bus event
    }
    EventTag;
@@ -184,6 +255,12 @@ typedef
          IRAtom* ea;
          Int     szB;
       } Dm;
+      struct {
+         IRAtom* taken; /* :: Ity_I1 */
+      } Bc;
+      struct {
+         IRAtom* dst;
+      } Bi;
       struct {
       } G;
    } Ev;
@@ -269,6 +346,16 @@ static void showEvent ( Event* ev )
          ppIRExpr(ev->Ev.Dm.ea);
         VG_(printf)("\n");
         break;
+      case Ev_Bc:
+         VG_(printf)("Bc %p GA=", ev->inode);
+         ppIRExpr(ev->Ev.Bc.taken);
+         VG_(printf)("\n");
+         break;
+      case Ev_Bi:
+         VG_(printf)("Bi %p DST=", ev->inode);
+         ppIRExpr(ev->Ev.Bi.dst);
+         VG_(printf)("\n");
+         break;
      case Ev_G:
         VG_(printf)("G %p\n", ev->inode);
         break;
@@ -306,18 +393,28 @@ static void flushEvents ( ClgState* clgs )
            ev->inode->eventset = CLG_(sets).base;
            break;
         case Ev_Dr:
-           // extend event set by Dr counter
+           // extend event set by Dr counters
            ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                        EG_DR);
            break;
         case Ev_Dw:
         case Ev_Dm:
-           // extend event set by Dw counter
+           // extend event set by Dw counters
            ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                        EG_DW);
            break;
+        case Ev_Bc:
+           // extend event set by Bc counters
+           ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+                                                       EG_BC);
+           break;
+        case Ev_Bi:
+           // extend event set by Bi counters
+           ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
+                                                       EG_BI);
+           break;
         case Ev_G:
-           // extend event set by Bus counter
+           // extend event set by Bus counter
            ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                        EG_BUS);
            break;
@@ -436,6 +533,22 @@ static void flushEvents ( ClgState* clgs )
            regparms = 3;
            inew = i+1;
            break;
+        case Ev_Bc:
+           /* Conditional branch */
+           helperName = "log_cond_branch";
+           helperAddr = &log_cond_branch;
+           argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
+           regparms = 2;
+           inew = i+1;
+           break;
+        case Ev_Bi:
+           /* Branch to an unknown destination */
+           helperName = "log_ind_branch";
+           helperAddr = &log_ind_branch;
+           argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
+           regparms = 2;
+           inew = i+1;
+           break;
         case Ev_G:
            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
            helperName = "log_global_event";
@@ -548,11 +661,52 @@ void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
    clgs->events_used++;
 }
 
+static
+void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
+{
+   Event* evt;
+   tl_assert(isIRAtom(guard));
+   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
+             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+   if (!CLG_(clo).simulate_branch) return;
+
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag         = Ev_Bc;
+   evt->inode       = inode;
+   evt->Ev.Bc.taken = guard;
+   clgs->events_used++;
+}
+
+static
+void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
+{
+   Event* evt;
+   tl_assert(isIRAtom(whereTo));
+   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
+             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
+   if (!CLG_(clo).simulate_branch) return;
+
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag       = Ev_Bi;
+   evt->inode     = inode;
+   evt->Ev.Bi.dst = whereTo;
+   clgs->events_used++;
+}
+
 static
 void addEvent_G ( ClgState* clgs, InstrInfo* inode )
 {
    Event* evt;
    if (!CLG_(clo).collect_bus) return;
+
    if (clgs->events_used == N_EVENTS)
       flushEvents(clgs);
    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
@@ -753,6 +907,7 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure,
    Int      i, isize;
    IRStmt*  st;
    Addr     origAddr;
+   Addr64   cia; /* address of current insn */
    InstrInfo*   curr_inode = NULL;
    ClgState clgs;
    UInt     cJumps = 0;
@@ -789,6 +944,8 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure,
    CLG_ASSERT(Ist_IMark == st->tag);
 
    origAddr = (Addr)st->Ist.IMark.addr;
+   cia   = st->Ist.IMark.addr;
+   isize = st->Ist.IMark.len;
    CLG_ASSERT(origAddr == st->Ist.IMark.addr);  // XXX: check no overflow
 
    /* Get BB struct (creating if necessary).
@@ -819,8 +976,9 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure,
            break;
 
         case Ist_IMark: {
-           CLG_ASSERT(clgs.instr_offset == (Addr)st->Ist.IMark.addr - origAddr);
-           isize = st->Ist.IMark.len;
+           cia   = st->Ist.IMark.addr;
+           isize = st->Ist.IMark.len;
+           CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
            // If Vex fails to decode an instruction, the size will be zero.
            // Pretend otherwise.
            if (isize == 0) isize = VG_MIN_INSTR_SZB;
@@ -925,7 +1083,63 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure,
         }
 
         case Ist_Exit: {
-           UInt jmps_passed;
+           Bool guest_exit, inverted;
+
+           /* VEX code generation sometimes inverts conditional branches.
+            * As Callgrind counts (conditional) jumps, it has to correct
+            * inversions. The heuristic is the following:
+            * (1) Callgrind switches off SB chasing and unrolling, and
+            *     therefore it assumes that a candidate for inversion only is
+            *     the last conditional branch in an SB.
+            * (2) inversion is assumed if the branch jumps to the address of
+            *     the next guest instruction in memory.
+            * This heuristic is precalculated in CLG_(collectBlockInfo)().
+            *
+            * Branching behavior is also used for branch prediction. Note that
+            * above heuristic is different from what Cachegrind does.
+            * Cachegrind uses (2) for all branches.
+            */
+           if (cJumps+1 == clgs.bb->cjmp_count)
+              inverted = clgs.bb->cjmp_inverted;
+           else
+              inverted = False;
+
+           // call branch predictor only if this is a branch in guest code
+           guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
+                        (st->Ist.Exit.jk == Ijk_Call) ||
+                        (st->Ist.Exit.jk == Ijk_Ret);
+
+           if (guest_exit) {
+              /* Stuff to widen the guard expression to a host word, so
+                 we can pass it to the branch predictor simulation
+                 functions easily. */
+              IRType   tyW    = hWordTy;
+              IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
+              IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
+              IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
+              IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
+              IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
+              IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
+                                             : IRExpr_Const(IRConst_U64(1));
+
+              /* Widen the guard expression. */
+              addStmtToIRSB( clgs.sbOut,
+                             IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
+              addStmtToIRSB( clgs.sbOut,
+                             IRStmt_WrTmp( guardW,
+                                           IRExpr_Unop(widen,
+                                                       IRExpr_RdTmp(guard1))) );
+              /* If the exit is inverted, invert the sense of the guard. */
+              addStmtToIRSB(
+                 clgs.sbOut,
+                 IRStmt_WrTmp(
+                    guard,
+                    inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
+                             : IRExpr_RdTmp(guardW)
+                    ));
+              /* And post the event. */
+              addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
+           }
 
            /* We may never reach the next statement, so need to flush
               all outstanding transactions now. */
@@ -940,12 +1154,9 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure,
            /* Update global variable jmps_passed before the jump
             * A correction is needed if VEX inverted the last jump condition
             */
-           jmps_passed = cJumps;
-           if ((cJumps+1 == clgs.bb->cjmp_count) && clgs.bb->cjmp_inverted)
-               jmps_passed++;
            addConstMemStoreStmt( clgs.sbOut,
                                  (UWord) &CLG_(current_state).jmps_passed,
-                                 jmps_passed, hWordTy);
+                                 inverted ? cJumps+1 : cJumps, hWordTy);
            cJumps++;
 
            break;
@@ -966,6 +1177,26 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure,
       }
    }
 
+   /* Deal with branches to unknown destinations.  Except ignore ones
+      which are function returns as we assume the return stack
+      predictor never mispredicts. */
+   if (sbIn->jumpkind == Ijk_Boring) {
+      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
+      switch (sbIn->next->tag) {
+         case Iex_Const:
+            break; /* boring - branch to known address */
+         case Iex_RdTmp:
+            /* looks like an indirect branch (branch to unknown) */
+            addEvent_Bi( &clgs, curr_inode, sbIn->next );
+            break;
+         default:
+            /* shouldn't happen - if the incoming IR is properly
+               flattened, should only have tmp and const cases to
+               consider. */
+            tl_assert(0);
+      }
+   }
+
    /* At the end of the bb.  Flush outstandings. */
    flushEvents( &clgs );
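
The hunk above deliberately ignores superblock ends that are function returns:
as its comment says, a return stack predictor is assumed never to mispredict,
so returns are not posted as Bi/Bim events. Purely for intuition — depth and
behavior are made-up assumptions, not something Callgrind implements — a return
address stack works roughly like this:

   /* toy return address stack, illustrative only */
   #include <stdio.h>

   static unsigned long ras[16];              /* assumed depth of 16 entries */
   static unsigned      ras_top;

   static void ras_on_call(unsigned long return_addr)
   {
      ras[ras_top++ % 16] = return_addr;      /* push (wraps when full) */
   }

   static int ras_on_return(unsigned long actual_target)
   {
      unsigned long predicted = ras[--ras_top % 16];  /* pop the prediction */
      return predicted != actual_target;              /* 1 would mean a mispredict */
   }

   int main(void)
   {
      ras_on_call(0x400123);                  /* the call pushes its return address */
      printf("return mispredicted: %d\n", ras_on_return(0x400123));
      return 0;
   }

Because ordinary call/return pairs always match in such a stack, only
Ijk_Boring superblock ends whose destination is a temporary are counted as
indirect jumps in the code above.
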
@@ -1236,10 +1467,61 @@ void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
   }
 }
 
+static UInt ULong_width(ULong n)
+{
+  UInt w = 0;
+  while (n > 0) {
+    n = n / 10;
+    w++;
+  }
+  if (w == 0) w = 1;
+  return w + (w-1)/3;   // add space for commas
+}
+
+static
+void branchsim_printstat(int l1, int l2, int l3)
+{
+  static Char buf1[128], buf2[128], buf3[128], fmt[128];
+  FullCost total;
+  ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
+  ULong B_total_b, B_total_mp;
+
+  total = CLG_(total_cost);
+  Bc_total_b  = total[ fullOffset(EG_BC)   ];
+  Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
+  Bi_total_b  = total[ fullOffset(EG_BI)   ];
+  Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
+
+  /* Make format string, getting width right for numbers */
+  VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)\n",
+               l1, l2, l3);
+
+  if (0 == Bc_total_b) Bc_total_b = 1;
+  if (0 == Bi_total_b) Bi_total_b = 1;
+  B_total_b  = Bc_total_b  + Bi_total_b;
+  B_total_mp = Bc_total_mp + Bi_total_mp;
+
+  VG_(umsg)("\n");
+  VG_(umsg)(fmt, "Branches: ",
+            B_total_b, Bc_total_b, Bi_total_b);
+
+  VG_(umsg)(fmt, "Mispredicts: ",
+            B_total_mp, Bc_total_mp, Bi_total_mp);
+
+  VG_(percentify)(B_total_mp,  B_total_b,  1, l1+1, buf1);
+  VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
+  VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);
+
+  VG_(umsg)("Mispred rate: %s (%s + %s )\n", buf1, buf2,buf3);
+}
+
+
 static
 void finish(void)
 {
-  char buf[RESULTS_BUF_LEN];
+  Char buf[RESULTS_BUF_LEN], fmt[128];
+  Int l1, l2, l3;
+  FullCost total;
 
   CLG_DEBUG(0, "finish()\n");
@@ -1334,8 +1616,33 @@ void finish(void)
   VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
   VG_(message)(Vg_UserMsg, "\n");
 
-  //  if (CLG_(clo).simulate_cache)
-  (*CLG_(cachesim).printstat)();
+  /* determine value widths for statistics */
+  total = CLG_(total_cost);
+  l1 = ULong_width( total[fullOffset(EG_IR)] );
+  l2 = l3 = 0;
+  if (CLG_(clo).simulate_cache) {
+    l2 = ULong_width( total[fullOffset(EG_DR)] );
+    l3 = ULong_width( total[fullOffset(EG_DW)] );
+  }
+  if (CLG_(clo).simulate_branch) {
+    int l2b = ULong_width( total[fullOffset(EG_BC)] );
+    int l3b = ULong_width( total[fullOffset(EG_BI)] );
+    if (l2b > l2) l2 = l2b;
+    if (l3b > l3) l3 = l3b;
+  }
+
+  /* Make format string, getting width right for numbers */
+  VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
+
+  /* Always print this */
+  VG_(umsg)(fmt, "I refs: ", total[fullOffset(EG_IR)] );
+
+  if (CLG_(clo).simulate_cache)
+    (*CLG_(cachesim).printstat)(l1, l2, l3);
+
+  if (CLG_(clo).simulate_branch)
+    branchsim_printstat(l1, l2, l3);
+
 }
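
ULong_width above computes the printed field width of a counter, including the
thousands separators inserted by the "%,llu" conversions built in the
VG_(sprintf) format strings, and finish() hands the resulting widths l1/l2/l3
to both cachesim_printstat and branchsim_printstat so the summary columns line
up. A worked example:

   ULong_width(1234567):
      decimal digits   w        = 7
      separators       (w-1)/3  = 2
      returned width            = 9     ("1,234,567" is 9 characters wide)

This is also why cachesim_printstat in sim.c below now takes (Int,Int,Int)
instead of computing the widths itself.
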
diff --git a/callgrind/sim.c b/callgrind/sim.c
index 01fd5bc279..0841d2c657 100644
--- a/callgrind/sim.c
+++ b/callgrind/sim.c
@@ -1490,8 +1490,7 @@ static
 void cachesim_print_opts(void)
 {
   VG_(printf)(
-"\n   cache simulator options:\n"
-"    --simulate-cache=no|yes   Do cache simulation [no]\n"
+"\n   cache simulator options (does cache simulation if used):\n"
 "    --simulate-wb=no|yes      Count write-back events [no]\n"
 "    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
 #if CLG_EXPERIMENTAL
@@ -1614,7 +1613,7 @@ void percentify(Int n, Int ex, Int field_width, char buf[])
 }
 
 static
-void cachesim_printstat(void)
+void cachesim_printstat(Int l1, Int l2, Int l3)
 {
   FullCost total = CLG_(total_cost), D_total = 0;
   ULong L2_total_m, L2_total_mr, L2_total_mw,
@@ -1622,7 +1621,6 @@ void cachesim_printstat(void)
   char buf1[RESULTS_BUF_LEN],
        buf2[RESULTS_BUF_LEN],
        buf3[RESULTS_BUF_LEN];
-  Int l1, l2, l3;
   Int p;
 
   if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
@@ -1633,13 +1631,6 @@ void cachesim_printstat(void)
     VG_(message)(Vg_DebugMsg, "\n");
   }
 
-  /* I cache results. Use the I_refs value to determine the first column
-   * width. */
-  l1 = commify(total[fullOffset(EG_IR)], 0, buf1);
-  VG_(message)(Vg_UserMsg, "I refs: %s\n", buf1);
-
-  if (!CLG_(clo).simulate_cache) return;
-
   commify(total[fullOffset(EG_IR) +1], l1, buf1);
   VG_(message)(Vg_UserMsg, "I1 misses: %s\n", buf1);
@@ -1671,8 +1662,8 @@ void cachesim_printstat(void)
   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW),
                    D_total, total + fullOffset(EG_DW) );
 
   commify( D_total[0], l1, buf1);
-  l2 = commify(total[fullOffset(EG_DR)], 0, buf2);
-  l3 = commify(total[fullOffset(EG_DW)], 0, buf3);
+  commify(total[fullOffset(EG_DR)], l2, buf2);
+  commify(total[fullOffset(EG_DW)], l3, buf3);
   VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)\n",
                buf1, buf2, buf3);
@@ -1782,6 +1773,11 @@ void CLG_(init_eventsets)()
     CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "D2dmw");
   }
 
+  if (CLG_(clo).simulate_branch) {
+    CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
+    CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
+  }
+
   if (CLG_(clo).collect_bus)
     CLG_(register_event_group)(EG_BUS, "Ge");
 
@@ -1796,6 +1792,7 @@ void CLG_(init_eventsets)()
   // event set comprising all event groups, used for inclusive cost
   CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
+  CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
   CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
   CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
 
@@ -1819,6 +1816,10 @@ void CLG_(init_eventsets)()
   CLG_(append_event)(CLG_(dumpmap), "I2dmr");
   CLG_(append_event)(CLG_(dumpmap), "D2dmr");
   CLG_(append_event)(CLG_(dumpmap), "D2dmw");
+  CLG_(append_event)(CLG_(dumpmap), "Bc");
+  CLG_(append_event)(CLG_(dumpmap), "Bcm");
+  CLG_(append_event)(CLG_(dumpmap), "Bi");
+  CLG_(append_event)(CLG_(dumpmap), "Bim");
   CLG_(append_event)(CLG_(dumpmap), "AcCost1");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
   CLG_(append_event)(CLG_(dumpmap), "AcCost2");
diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr
index 1a58540d45..d2d754456b 100755
--- a/callgrind/tests/filter_stderr
+++ b/callgrind/tests/filter_stderr
@@ -19,6 +19,9 @@ perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
 # Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
 perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
 
+# Remove numbers from "Branches:", "Mispredicts:", and "Mispred rate:" lines
+perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' |
+
 # Remove CPUID warnings lines for P4s and other machines
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
 sed "/Simulating a 16 KB I-cache with 32 B lines/d" |
diff --git a/callgrind/tests/simwork-both.stderr.exp b/callgrind/tests/simwork-both.stderr.exp
new file mode 100644
index 0000000000..b742c213ba
--- /dev/null
+++ b/callgrind/tests/simwork-both.stderr.exp
@@ -0,0 +1,24 @@
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw Bc Bcm Bi Bim
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
+
+Branches:
+Mispredicts:
+Mispred rate:
diff --git a/callgrind/tests/simwork-both.stdout.exp b/callgrind/tests/simwork-both.stdout.exp
new file mode 100644
index 0000000000..d4c867cc82
--- /dev/null
+++ b/callgrind/tests/simwork-both.stdout.exp
@@ -0,0 +1 @@
+Sum: 1000000
diff --git a/callgrind/tests/simwork-both.vgtest b/callgrind/tests/simwork-both.vgtest
new file mode 100644
index 0000000000..19c3ff8539
--- /dev/null
+++ b/callgrind/tests/simwork-both.vgtest
@@ -0,0 +1,3 @@
+prog: simwork
+vgopts: --cache-sim=yes --branch-sim=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/simwork-branch.stderr.exp b/callgrind/tests/simwork-branch.stderr.exp
new file mode 100644
index 0000000000..7cda62ea4a
--- /dev/null
+++ b/callgrind/tests/simwork-branch.stderr.exp
@@ -0,0 +1,10 @@
+
+
+Events : Ir Bc Bcm Bi Bim
+Collected :
+
+I refs:
+
+Branches:
+Mispredicts:
+Mispred rate:
diff --git a/callgrind/tests/simwork-branch.stdout.exp b/callgrind/tests/simwork-branch.stdout.exp
new file mode 100644
index 0000000000..d4c867cc82
--- /dev/null
+++ b/callgrind/tests/simwork-branch.stdout.exp
@@ -0,0 +1 @@
+Sum: 1000000
diff --git a/callgrind/tests/simwork-branch.vgtest b/callgrind/tests/simwork-branch.vgtest
new file mode 100644
index 0000000000..a866e1e0b4
--- /dev/null
+++ b/callgrind/tests/simwork-branch.vgtest
@@ -0,0 +1,3 @@
+prog: simwork
+vgopts: --branch-sim=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/simwork-cache.stderr.exp b/callgrind/tests/simwork-cache.stderr.exp
new file mode 100644
index 0000000000..0705c1c849
--- /dev/null
+++ b/callgrind/tests/simwork-cache.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/simwork-cache.stdout.exp b/callgrind/tests/simwork-cache.stdout.exp
new file mode 100644
index 0000000000..d4c867cc82
--- /dev/null
+++ b/callgrind/tests/simwork-cache.stdout.exp
@@ -0,0 +1 @@
+Sum: 1000000
diff --git a/callgrind/tests/simwork-cache.vgtest b/callgrind/tests/simwork-cache.vgtest
new file mode 100644
index 0000000000..ce222c001f
--- /dev/null
+++ b/callgrind/tests/simwork-cache.vgtest
@@ -0,0 +1,3 @@
+prog: simwork
+vgopts: --cache-sim=yes
+cleanup: rm callgrind.out.*