]> git.ipfire.org Git - thirdparty/valgrind.git/commitdiff
Callgrind: fix instrumentation for arbitrary events per guest instruction
authorJosef Weidendorfer <Josef.Weidendorfer@gmx.de>
Mon, 15 Jun 2009 00:16:36 +0000 (00:16 +0000)
committerJosef Weidendorfer <Josef.Weidendorfer@gmx.de>
Mon, 15 Jun 2009 00:16:36 +0000 (00:16 +0000)
(should fix bug 169505)

This uses the same event queue scheme as cachegrind and lackey, and
same kind of helpers (1/2/3 Ir events, Ir+Dr, Dr, Ir+Dw, Dw).
Note that in contrast to Cachegrind, Callgrind interprets a modify event
as Dw (otherwise the cache model generating write back events would not work).

Callgrind uses per-(guest)instruction event sets for cost counters.
A per-instruction eventset is incrementally extended as events for the
same guest instruction are flushed. Event sets always start with Ir counters,
but depending on Dr/Dw order afterwards, there exist IrDr(Dw) and IrDw(Dr).
Per-instruction event sets are now consistently named according to event ordering.
Event set "sim" is a subset of "full", was never used and was removed.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@10321

callgrind/bbcc.c
callgrind/debug.c
callgrind/global.h
callgrind/main.c
callgrind/sim.c

index dfe737b2f4cc4e4a326087f8305dbe24df3f654d..7917c25261c3e1c6f835e5806918b669e2d99465 100644 (file)
@@ -601,7 +601,7 @@ void CLG_(setup_bbcc)(BB* bb)
          if (!CLG_(clo).simulate_cache) {
              /* update Ir cost */
              int instr_count = last_bb->jmp[passed].instr+1;
-             CLG_(current_state).cost[CLG_(sets).off_sim_Ir] += instr_count;
+             CLG_(current_state).cost[CLG_(sets).off_full_Ir] += instr_count;
          }
       }
 
index f04bab41476880cd3654dc08e8f1a9641caa318f..2ac38a29784071cc89a1a3a0f87f726d38829f1f 100644 (file)
@@ -217,9 +217,9 @@ void CLG_(print_short_jcc)(jCC* jcc)
                    bb_jmpaddr(jcc->from->bb),
                    bb_addr(jcc->to->bb),
                    jcc->call_counter,
-                   jcc->cost ? jcc->cost[CLG_(sets).off_sim_Ir]:0,
-                   jcc->cost ? jcc->cost[CLG_(sets).off_sim_Dr]:0,
-                   jcc->cost ? jcc->cost[CLG_(sets).off_sim_Dw]:0);
+                   jcc->cost ? jcc->cost[CLG_(sets).off_full_Ir]:0,
+                   jcc->cost ? jcc->cost[CLG_(sets).off_full_Dr]:0,
+                   jcc->cost ? jcc->cost[CLG_(sets).off_full_Dw]:0);
     else
        VG_(printf)("[Skipped JCC]");
 }
index 461218a8c4328c601f5b25f4aa1fcc8d0b3cd814..367f2d7d34684cbc529772b38dd6a11c86e810e4 100644 (file)
@@ -270,7 +270,6 @@ typedef struct _InstrInfo InstrInfo;
 struct _InstrInfo {
   UInt instr_offset;
   UInt instr_size;
-  UInt data_size;
   UInt cost_offset;
   EventSet* eventset;
 };
@@ -657,19 +656,19 @@ struct cachesim_if
     void (*finish)(void);
     
     void (*log_1I0D)(InstrInfo*) VG_REGPARM(1);
+    void (*log_2I0D)(InstrInfo*, InstrInfo*) VG_REGPARM(2);
+    void (*log_3I0D)(InstrInfo*, InstrInfo*, InstrInfo*) VG_REGPARM(3);
 
-    void (*log_1I1Dr)(InstrInfo*, Addr) VG_REGPARM(2);
-    void (*log_1I1Dw)(InstrInfo*, Addr) VG_REGPARM(2);
-    void (*log_1I2D)(InstrInfo*, Addr, Addr) VG_REGPARM(3);
+    void (*log_1I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3);
+    void (*log_1I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3);
 
-    void (*log_0I1Dr)(InstrInfo*, Addr) VG_REGPARM(2);
-    void (*log_0I1Dw)(InstrInfo*, Addr) VG_REGPARM(2);
-    void (*log_0I2D)(InstrInfo*, Addr, Addr) VG_REGPARM(3);
+    void (*log_0I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3);
+    void (*log_0I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3);
 
     // function names of helpers (for debugging generated code)
-    Char *log_1I0D_name;
-    Char *log_1I1Dr_name, *log_1I1Dw_name, *log_1I2D_name;
-    Char *log_0I1Dr_name, *log_0I1Dw_name, *log_0I2D_name;
+    Char *log_1I0D_name, *log_2I0D_name, *log_3I0D_name;
+    Char *log_1I1Dr_name, *log_1I1Dw_name;
+    Char *log_0I1Dr_name, *log_0I1Dw_name;
 };
 
 
@@ -687,15 +686,13 @@ void CLG_(print_debug_usage)(void);
 
 /* from sim.c */
 struct event_sets {
-  EventSet *use, *Ir, *Dr, *Dw;
-  EventSet *D0, *D1r, *D1w, *D2;
-  EventSet *sim;
-  EventSet *full; /* sim plus user events */
+  EventSet *Use, *Ir, *Dr, *Dw;
+  EventSet *UIr, *UIrDr, *UIrDrDw, *UIrDw, *UIrDwDr;
+  EventSet *full;
 
   /* offsets into eventsets */  
-  Int off_sim_Ir, off_sim_Dr, off_sim_Dw;
   Int off_full_Ir, off_full_Dr, off_full_Dw;
-  Int off_full_user, off_full_alloc, off_full_systime;
+  Int off_full_alloc, off_full_systime;
 };
 
 extern struct event_sets CLG_(sets);
index 68d13814dc39ccf9745413a05611cb4d7d0da95a..f2d1250371d8c06a1b34a7915d552b54725d5930 100644 (file)
@@ -94,224 +94,490 @@ static void CLG_(init_statistics)(Statistics* s)
 }
 
 
-    
-
 /*------------------------------------------------------------*/
-/*--- Cache simulation instrumentation phase               ---*/
+/*--- Instrumentation structures and event queue handling  ---*/
 /*------------------------------------------------------------*/
 
+/* Maintain an ordered list of memory events which are outstanding, in
+   the sense that no IR has yet been generated to do the relevant
+   helper calls.  The BB is scanned top to bottom and memory events
+   are added to the end of the list, merging with the most recent
+   notified event where possible (Dw immediately following Dr and
+   having the same size and EA can be merged).
+
+   This merging is done so that for architectures which have
+   load-op-store instructions (x86, amd64), the insn is treated as if
+   it makes just one memory reference (a modify), rather than two (a
+   read followed by a write at the same address).
+
+   At various points the list will need to be flushed, that is, IR
+   generated from it.  That must happen before any possible exit from
+   the block (the end, or an IRStmt_Exit).  Flushing also takes place
+   when there is no space to add a new event.
+
+   If we require the simulation statistics to be up to date with
+   respect to possible memory exceptions, then the list would have to
+   be flushed before each memory reference.  That would however lose
+   performance by inhibiting event-merging during flushing.
+
+   Flushing the list consists of walking it start to end and emitting
+   instrumentation IR for each event, in the order in which they
+   appear.  It may be possible to emit a single call for two adjacent
+   events in order to reduce the number of helper function calls made.
+   For example, it could well be profitable to handle two adjacent Ir
+   events with a single helper call.  */
+
+typedef
+   IRExpr
+   IRAtom;
+
+typedef
+   enum {
+      Ev_Ir,  // Instruction read
+      Ev_Dr,  // Data read
+      Ev_Dw,  // Data write
+      Ev_Dm,  // Data modify (read then write)
+   }
+   EventTag;
+
+typedef
+   struct {
+      EventTag   tag;
+      InstrInfo* inode;
+      union {
+        struct {
+        } Ir;
+        struct {
+           IRAtom* ea;
+           Int     szB;
+        } Dr;
+        struct {
+           IRAtom* ea;
+           Int     szB;
+        } Dw;
+        struct {
+           IRAtom* ea;
+           Int     szB;
+        } Dm;
+      } Ev;
+   }
+   Event;
+
+static void init_Event ( Event* ev ) {
+   VG_(memset)(ev, 0, sizeof(Event));
+}
+
+static IRAtom* get_Event_dea ( Event* ev ) {
+   switch (ev->tag) {
+      case Ev_Dr: return ev->Ev.Dr.ea;
+      case Ev_Dw: return ev->Ev.Dw.ea;
+      case Ev_Dm: return ev->Ev.Dm.ea;
+      default:    tl_assert(0);
+   }
+}
+
+static Int get_Event_dszB ( Event* ev ) {
+   switch (ev->tag) {
+      case Ev_Dr: return ev->Ev.Dr.szB;
+      case Ev_Dw: return ev->Ev.Dw.szB;
+      case Ev_Dm: return ev->Ev.Dm.szB;
+      default:    tl_assert(0);
+   }
+}
+
+
+/* Up to this many unnotified events are allowed.  Number is
+   arbitrary.  Larger numbers allow more event merging to occur, but
+   potentially induce more spilling due to extending live ranges of
+   address temporaries. */
+#define N_EVENTS 16
+
+
+/* A struct which holds all the running state during instrumentation.
+   Mostly to avoid passing loads of parameters everywhere. */
+typedef struct {
+    /* The current outstanding-memory-event list. */
+    Event events[N_EVENTS];
+    Int   events_used;
+
+    /* The array of InstrInfo's is part of BB struct. */
+    BB* bb;
 
-static Bool loadStoreAddrsMatch(IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+    /* BB seen before (ie. re-instrumentation) */
+    Bool seen_before;
+
+    /* Number InstrInfo bins 'used' so far. */
+    UInt ii_index;
+
+    // current offset of guest instructions from BB start
+    UInt instr_offset;
+
+    /* The output SB being constructed. */
+    IRSB* sbOut;
+} ClgState;
+
+
+static void showEvent ( Event* ev )
 {
-  // I'm assuming that for 'modify' instructions, that Vex always makes
-  // the loadAddrExpr and storeAddrExpr be of the same type, ie. both Tmp
-  // expressions, or both Const expressions.
-  CLG_ASSERT(isIRAtom(loadAddrExpr));
-  CLG_ASSERT(isIRAtom(storeAddrExpr));
-  return eqIRAtom(loadAddrExpr, storeAddrExpr);
+   switch (ev->tag) {
+      case Ev_Ir:
+        VG_(printf)("Ir (InstrInfo %p) at +%d\n",
+                    ev->inode, ev->inode->instr_offset);
+        break;
+      case Ev_Dr:
+        VG_(printf)("Dr (InstrInfo %p) at +%d %d EA=",
+                    ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB);
+        ppIRExpr(ev->Ev.Dr.ea);
+        VG_(printf)("\n");
+        break;
+      case Ev_Dw:
+        VG_(printf)("Dw (InstrInfo %p) at +%d %d EA=",
+                    ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB);
+        ppIRExpr(ev->Ev.Dw.ea);
+        VG_(printf)("\n");
+        break;
+      case Ev_Dm:
+        VG_(printf)("Dm (InstrInfo %p) at +%d %d EA=",
+                    ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB);
+        ppIRExpr(ev->Ev.Dm.ea);
+        VG_(printf)("\n");
+        break;
+      default:
+        tl_assert(0);
+        break;
+   }
 }
 
-static
-EventSet* insert_simcall(IRSB* bbOut, InstrInfo* ii, UInt dataSize,
-                        Bool instrIssued,
-                        IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+/* Generate code for all outstanding memory events, and mark the queue
+   empty.  Code is generated into cgs->sbOut, and this activity
+   'consumes' slots in cgs->bb. */
+
+static void flushEvents ( ClgState* clgs )
 {
-    HChar*    helperName;
-    void*     helperAddr;
-    Int       argc;
-    EventSet* es;
-    IRExpr   *arg1, *arg2 = 0, *arg3 = 0, **argv;
-    IRDirty* di;
-
-    /* Check type of original instruction regarding memory access,
-     * and collect info to be able to generate fitting helper call
-     */
-    if (!loadAddrExpr && !storeAddrExpr) {
-       // no load/store
-       CLG_ASSERT(0 == dataSize);
-       if (instrIssued) {
-           helperName = 0;
-           helperAddr = 0;
-       }
-       else {
-           helperName = CLG_(cachesim).log_1I0D_name;
-           helperAddr = CLG_(cachesim).log_1I0D;
-       }
-       argc = 1;
-       es = CLG_(sets).D0;
-       
-    } else if (loadAddrExpr && !storeAddrExpr) {
-       // load
-       CLG_ASSERT( isIRAtom(loadAddrExpr) );
-       if (instrIssued) {
-           helperName = CLG_(cachesim).log_0I1Dr_name;
-           helperAddr = CLG_(cachesim).log_0I1Dr;
-       }
-       else {
-           helperName = CLG_(cachesim).log_1I1Dr_name;
-           helperAddr = CLG_(cachesim).log_1I1Dr;
-       }
-       argc = 2;
-       arg2 = loadAddrExpr;
-       es = CLG_(sets).D1r;
-
-    } else if (!loadAddrExpr && storeAddrExpr) {
-       // store
-       CLG_ASSERT( isIRAtom(storeAddrExpr) );
-       if (instrIssued) {
-           helperName = CLG_(cachesim).log_0I1Dw_name;
-           helperAddr = CLG_(cachesim).log_0I1Dw;
-       }
-       else {
-           helperName = CLG_(cachesim).log_1I1Dw_name;
-           helperAddr = CLG_(cachesim).log_1I1Dw;
-       }
-       argc = 2;
-       arg2 = storeAddrExpr;
-       es = CLG_(sets).D1w;
-       
-    } else {
-       CLG_ASSERT( loadAddrExpr && storeAddrExpr );
-       CLG_ASSERT( isIRAtom(loadAddrExpr) );
-       CLG_ASSERT( isIRAtom(storeAddrExpr) );
-       
-       if ( loadStoreAddrsMatch(loadAddrExpr, storeAddrExpr) ) {
-           /* modify: suppose write access, as this is
-            * more resource consuming (as in callgrind for VG2)
-            * Cachegrind does a read here (!)
-            * DISCUSS: Best way depends on simulation model?
-            */
-           if (instrIssued) {
-               helperName = CLG_(cachesim).log_0I1Dw_name;
-               helperAddr = CLG_(cachesim).log_0I1Dw;
+   Int        i, regparms, inew;
+   Char*      helperName;
+   void*      helperAddr;
+   IRExpr**   argv;
+   IRExpr*    i_node_expr;
+   IRDirty*   di;
+   Event*     ev;
+   Event*     ev2;
+   Event*     ev3;
+
+   if (!clgs->seen_before) {
+       // extend event sets as needed
+       // available sets: D0 Dr
+       for(i=0; i<clgs->events_used; i++) {
+          ev  = &clgs->events[i];
+          switch(ev->tag) {
+          case Ev_Ir:
+              // Ir event always is first for a guest instruction
+              CLG_ASSERT(ev->inode->eventset == 0);
+              ev->inode->eventset = CLG_(sets).UIr;
+              break;
+          case Ev_Dr:
+              // extend event set by Dr counter
+              if ((ev->inode->eventset == CLG_(sets).UIrDr)   ||
+                  (ev->inode->eventset == CLG_(sets).UIrDrDw) ||
+                  (ev->inode->eventset == CLG_(sets).UIrDwDr))
+                  break;
+              if (ev->inode->eventset == CLG_(sets).UIrDw) {
+                  ev->inode->eventset = CLG_(sets).UIrDwDr;
+                  break;
+              }
+              CLG_ASSERT(ev->inode->eventset == CLG_(sets).UIr);
+              ev->inode->eventset = CLG_(sets).UIrDr;
+              break;
+          case Ev_Dw:
+          case Ev_Dm:
+              // extend event set by Dw counter
+              if ((ev->inode->eventset == CLG_(sets).UIrDw)   ||
+                  (ev->inode->eventset == CLG_(sets).UIrDwDr) ||
+                  (ev->inode->eventset == CLG_(sets).UIrDrDw))
+                  break;
+              if (ev->inode->eventset == CLG_(sets).UIrDr) {
+                  ev->inode->eventset = CLG_(sets).UIrDrDw;
+                  break;
+              }
+              CLG_ASSERT(ev->inode->eventset == CLG_(sets).UIr);
+              ev->inode->eventset = CLG_(sets).UIrDw;
+              break;
+          default:
+              tl_assert(0);
+          }
+       }
+   }
+
+   for(i = 0; i < clgs->events_used; i = inew) {
+
+      helperName = NULL;
+      helperAddr = NULL;
+      argv       = NULL;
+      regparms   = 0;
+
+      /* generate IR to notify event i and possibly the ones
+        immediately following it. */
+      tl_assert(i >= 0 && i < clgs->events_used);
+
+      ev  = &clgs->events[i];
+      ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
+      ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );
+
+      CLG_DEBUGIF(5) {
+        VG_(printf)("   flush ");
+        showEvent( ev );
+      }
+
+      i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );
+
+      /* Decide on helper fn to call and args to pass it, and advance
+        i appropriately.
+        Dm events have same effect as Dw events */
+      switch (ev->tag) {
+        case Ev_Ir:
+           /* Merge an Ir with a following Dr. */
+           if (ev2 && ev2->tag == Ev_Dr) {
+              /* Why is this true?  It's because we're merging an Ir
+                 with a following Dr.  The Ir derives from the
+                 instruction's IMark and the Dr from data
+                 references which follow it.  In short it holds
+                 because each insn starts with an IMark, hence an
+                 Ev_Ir, and so these Dr must pertain to the
+                 immediately preceding Ir.  Same applies to analogous
+                 assertions in the subsequent cases. */
+              tl_assert(ev2->inode == ev->inode);
+              helperName = CLG_(cachesim).log_1I1Dr_name;
+              helperAddr = CLG_(cachesim).log_1I1Dr;
+              argv = mkIRExprVec_3( i_node_expr,
+                                    get_Event_dea(ev2),
+                                    mkIRExpr_HWord( get_Event_dszB(ev2) ) );
+              regparms = 3;
+              inew = i+2;
            }
-           else {
-               helperName = CLG_(cachesim).log_1I1Dw_name;
-               helperAddr = CLG_(cachesim).log_1I1Dw;
+           /* Merge an Ir with a following Dw/Dm. */
+           else
+           if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) {
+              tl_assert(ev2->inode == ev->inode);
+              helperName = CLG_(cachesim).log_1I1Dw_name;
+              helperAddr = CLG_(cachesim).log_1I1Dw;
+              argv = mkIRExprVec_3( i_node_expr,
+                                    get_Event_dea(ev2),
+                                    mkIRExpr_HWord( get_Event_dszB(ev2) ) );
+              regparms = 3;
+              inew = i+2;
            }
-           argc = 2;
-           arg2 = storeAddrExpr;
-           es = CLG_(sets).D1w;
-           
-       } else {
-           // load/store
-           if (instrIssued) {
-               helperName = CLG_(cachesim).log_0I2D_name;
-               helperAddr = CLG_(cachesim).log_0I2D;
+           /* Merge an Ir with two following Irs. */
+           else
+           if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) {
+              helperName = CLG_(cachesim).log_3I0D_name;
+              helperAddr = CLG_(cachesim).log_3I0D;
+              argv = mkIRExprVec_3( i_node_expr,
+                                    mkIRExpr_HWord( (HWord)ev2->inode ),
+                                    mkIRExpr_HWord( (HWord)ev3->inode ) );
+              regparms = 3;
+              inew = i+3;
            }
+           /* Merge an Ir with one following Ir. */
+           else
+           if (ev2 && ev2->tag == Ev_Ir) {
+              helperName = CLG_(cachesim).log_2I0D_name;
+              helperAddr = CLG_(cachesim).log_2I0D;
+              argv = mkIRExprVec_2( i_node_expr,
+                                    mkIRExpr_HWord( (HWord)ev2->inode ) );
+              regparms = 2;
+              inew = i+2;
+           }
+           /* No merging possible; emit as-is. */
            else {
-               helperName = CLG_(cachesim).log_1I2D_name;
-               helperAddr = CLG_(cachesim).log_1I2D;
+              helperName = CLG_(cachesim).log_1I0D_name;
+              helperAddr = CLG_(cachesim).log_1I0D;
+              argv = mkIRExprVec_1( i_node_expr );
+              regparms = 1;
+              inew = i+1;
            }
-           argc = 3;
-           arg2 = loadAddrExpr;
-           arg3 = storeAddrExpr;
-           es = CLG_(sets).D2;
-       }
-    }
+           break;
+        case Ev_Dr:
+           /* Data read or modify */
+           helperName = CLG_(cachesim).log_0I1Dr_name;
+           helperAddr = CLG_(cachesim).log_0I1Dr;
+           argv = mkIRExprVec_3( i_node_expr,
+                                 get_Event_dea(ev),
+                                 mkIRExpr_HWord( get_Event_dszB(ev) ) );
+           regparms = 3;
+           inew = i+1;
+           break;
+        case Ev_Dw:
+        case Ev_Dm:
+           /* Data write */
+           helperName = CLG_(cachesim).log_0I1Dw_name;
+           helperAddr = CLG_(cachesim).log_0I1Dw;
+           argv = mkIRExprVec_3( i_node_expr,
+                                 get_Event_dea(ev),
+                                 mkIRExpr_HWord( get_Event_dszB(ev) ) );
+           regparms = 3;
+           inew = i+1;
+           break;
+        default:
+           tl_assert(0);
+      }
 
-    /* helper could be unset depending on the simulator used */
-    if (helperAddr == 0) return 0;
-    
-    /* Setup 1st arg: InstrInfo */
-    arg1 = mkIRExpr_HWord( (HWord)ii );
-    
-    // Add call to the instrumentation function
-    if      (argc == 1)
-       argv = mkIRExprVec_1(arg1);
-    else if (argc == 2)
-       argv = mkIRExprVec_2(arg1, arg2);
-    else if (argc == 3)
-       argv = mkIRExprVec_3(arg1, arg2, arg3);
-    else
-       VG_(tool_panic)("argc... not 1 or 2 or 3?");
-    
-    di = unsafeIRDirty_0_N( argc, helperName, 
-                                  VG_(fnptr_to_fnentry)( helperAddr ), argv);
-    addStmtToIRSB( bbOut, IRStmt_Dirty(di) );
+      CLG_DEBUGIF(5) {
+         if (inew > i+1) {
+             VG_(printf)("   merge ");
+             showEvent( ev2 );
+         }
+         if (inew > i+2) {
+             VG_(printf)("   merge ");
+             showEvent( ev3 );
+         }
+         if (helperAddr)
+             VG_(printf)("   call  %s (%p)\n",
+                         helperName, helperAddr);
+      }
+
+      /* helper could be unset depending on the simulator used */
+      if (helperAddr == 0) continue;
+
+      /* Add the helper. */
+      tl_assert(helperName);
+      tl_assert(helperAddr);
+      tl_assert(argv);
+      di = unsafeIRDirty_0_N( regparms,
+                             helperName, VG_(fnptr_to_fnentry)( helperAddr ),
+                             argv );
+      addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
+   }
 
-    return es;
+   clgs->events_used = 0;
 }
 
+static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode )
+{
+   Event* evt;
+   tl_assert(clgs->seen_before || (inode->eventset == 0));
+   if (!CLG_(clo).simulate_cache) return;
+
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag      = Ev_Ir;
+   evt->inode    = inode;
+   clgs->events_used++;
+}
+
+static
+void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
+{
+   Event* evt;
+   tl_assert(isIRAtom(ea));
+   tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE);
+   if (!CLG_(clo).simulate_cache) return;
+
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag       = Ev_Dr;
+   evt->inode     = inode;
+   evt->Ev.Dr.szB = datasize;
+   evt->Ev.Dr.ea  = ea;
+   clgs->events_used++;
+}
 
-/* Instrumentation before a conditional jump or at the end
- * of each original instruction.
- * Fills the InstrInfo struct if not seen before
- */
 static
-void endOfInstr(IRSB* bbOut, InstrInfo* ii, Bool bb_seen_before,
-               UInt instr_offset, UInt instrLen, UInt dataSize, 
-               UInt* cost_offset, Bool instrIssued,
-               IRExpr* loadAddrExpr, IRExpr* storeAddrExpr)
+void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
 {
-   IRType    wordTy;
-   EventSet* es;
-
-   // Stay sane ...
-   CLG_ASSERT(sizeof(HWord) == sizeof(void*));
-   if (sizeof(HWord) == 4) {
-      wordTy = Ity_I32;
-   } else
-   if (sizeof(HWord) == 8) {
-      wordTy = Ity_I64;
-   } else {
-      VG_(tool_panic)("endOfInstr: strange word size");
+   Event* lastEvt;
+   Event* evt;
+   tl_assert(isIRAtom(ea));
+   tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE);
+   if (!CLG_(clo).simulate_cache) return;
+
+   /* Is it possible to merge this write with the preceding read? */
+   lastEvt = &clgs->events[clgs->events_used-1];
+   if (clgs->events_used > 0
+       && lastEvt->tag       == Ev_Dr
+       && lastEvt->Ev.Dr.szB == datasize
+       && lastEvt->inode     == inode
+       && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
+   {
+      lastEvt->tag   = Ev_Dm;
+      return;
    }
 
-   if (loadAddrExpr) 
-      CLG_ASSERT(wordTy == typeOfIRExpr(bbOut->tyenv, loadAddrExpr));
-   if (storeAddrExpr) 
-      CLG_ASSERT(wordTy == typeOfIRExpr(bbOut->tyenv, storeAddrExpr));
-
-   // Large (eg. 28B, 108B, 512B on x86) data-sized instructions will be
-   // done inaccurately, but they're very rare and this avoids errors from
-   // hitting more than two cache lines in the simulation.
-   if (dataSize > MIN_LINE_SIZE) dataSize = MIN_LINE_SIZE;
-
-   /* returns 0 if simulator needs no instrumentation */
-   es = insert_simcall(bbOut, ii, dataSize, instrIssued,
-                      loadAddrExpr, storeAddrExpr);
-
-   CLG_DEBUG(5, "  Instr +%2d (Size %d, DSize %d): ESet %s (Size %d)\n",
-            instr_offset, instrLen, dataSize, 
-            es ? es->name : (Char*)"(no instrumentation)",
-            es ? es->size : 0);
-
-   if (bb_seen_before) {
-       CLG_DEBUG(5, "   before: Instr +%2d (Size %d, DSize %d)\n",
-                ii->instr_offset, ii->instr_size, ii->data_size);
-
-       CLG_ASSERT(ii->instr_offset == instr_offset);
-       CLG_ASSERT(ii->instr_size == instrLen);
-       CLG_ASSERT(ii->cost_offset == *cost_offset);
-       CLG_ASSERT(ii->eventset == es);
-
-       /* Only check size if data size >0.
-       * This is needed: e.g. for rep or cmov x86 instructions, the same InstrInfo
-       * is used both for 2 simulator calls: for the pure instruction fetch and
-        * separately for an memory access (which may not happen depending on flags).
-       * If checked always, this triggers an assertion failure on retranslation.
-       */
-       if (dataSize>0) CLG_ASSERT(ii->data_size == dataSize);
+   /* No.  Add as normal. */
+   if (clgs->events_used == N_EVENTS)
+      flushEvents(clgs);
+   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
+   evt = &clgs->events[clgs->events_used];
+   init_Event(evt);
+   evt->tag       = Ev_Dw;
+   evt->inode     = inode;
+   evt->Ev.Dw.szB = datasize;
+   evt->Ev.Dw.ea  = ea;
+   clgs->events_used++;
+}
 
+/* Initialise or check (if already seen before) an InstrInfo for next insn.
+   We only can set instr_offset/instr_size here. The required event set and
+   resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest
+   instructions. The event set is extended as required on flush of the event
+   queue (when Dm events were determined), cost offsets are determined at
+   end of BB instrumentation. */
+static
+InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size )
+{
+   InstrInfo* ii;
+   tl_assert(clgs->ii_index >= 0);
+   tl_assert(clgs->ii_index < clgs->bb->instr_count);
+   ii = &clgs->bb->instr[ clgs->ii_index ];
+
+   if (clgs->seen_before) {
+       CLG_ASSERT(ii->instr_offset == clgs->instr_offset);
+       CLG_ASSERT(ii->instr_size == instr_size);
    }
    else {
-       ii->instr_offset = instr_offset;
-       ii->instr_size = instrLen;
-       ii->cost_offset = *cost_offset;
-       ii->eventset = es;
-       
-       /* data size only relevant if >0 */
-       if (dataSize > 0) ii->data_size = dataSize;
+       ii->instr_offset = clgs->instr_offset;
+       ii->instr_size = instr_size;
+       ii->cost_offset = 0;
+       ii->eventset = 0;
+   }
 
+   clgs->ii_index++;
+   clgs->instr_offset += instr_size;
+   CLG_(stat).distinct_instrs++;
 
-       CLG_(stat).distinct_instrs++;
-   }
+   return ii;
+}
 
-   *cost_offset += es ? es->size : 0;
+// return total number of cost values needed for this BB
+static
+UInt update_cost_offsets( ClgState* clgs )
+{
+    Int i;
+    InstrInfo* ii;
+    UInt cost_offset = 0;
+
+    CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index);
+    for(i=0; i<clgs->ii_index; i++) {
+       ii = &clgs->bb->instr[i];
+       if (clgs->seen_before) {
+           CLG_ASSERT(ii->cost_offset == cost_offset);
+       } else
+           ii->cost_offset = cost_offset;
+       cost_offset += ii->eventset ? ii->eventset->size : 0;
+    }
 
+    return cost_offset;
 }
 
+/*------------------------------------------------------------*/
+/*--- Instrumentation                                      ---*/
+/*------------------------------------------------------------*/
+
 #if defined(VG_BIGENDIAN)
 # define CLGEndness Iend_BE
 #elif defined(VG_LITTLEENDIAN)
@@ -344,7 +610,7 @@ Addr IRConst2Addr(IRConst* con)
  *
  * Called from CLG_(get_bb)
  */
-void CLG_(collectBlockInfo)(IRSB* bbIn,
+void CLG_(collectBlockInfo)(IRSB* sbIn,
                            /*INOUT*/ UInt* instrs,
                            /*INOUT*/ UInt* cjmps,
                            /*INOUT*/ Bool* cjmp_inverted)
@@ -360,10 +626,10 @@ void CLG_(collectBlockInfo)(IRSB* bbIn,
     // nothing to do with client code
     Bool inPreamble = True;
 
-    if (!bbIn) return;
+    if (!sbIn) return;
 
-    for (i = 0; i < bbIn->stmts_used; i++) {
-         st = bbIn->stmts[i];
+    for (i = 0; i < sbIn->stmts_used; i++) {
+         st = sbIn->stmts[i];
          if (Ist_IMark == st->tag) {
              inPreamble = False;
 
@@ -377,7 +643,7 @@ void CLG_(collectBlockInfo)(IRSB* bbIn,
          if (Ist_Exit == st->tag) {
              jumpDst = IRConst2Addr(st->Ist.Exit.dst);
              toNextInstr =  (jumpDst == instrAddr + instrLen);
-             
+
              (*cjmps)++;
          }
     }
@@ -388,98 +654,6 @@ void CLG_(collectBlockInfo)(IRSB* bbIn,
     *cjmp_inverted = toNextInstr;
 }
 
-static
-void collectStatementInfo(IRTypeEnv* tyenv, IRStmt* st,
-                         Addr* instrAddr, UInt* instrLen,
-                         IRExpr** loadAddrExpr, IRExpr** storeAddrExpr,
-                         UInt* dataSize)
-{
-   CLG_ASSERT(isFlatIRStmt(st));
-
-   switch (st->tag) {
-   case Ist_NoOp:
-      break;
-
-   case Ist_AbiHint:
-      /* ABI hints aren't interesting.  Ignore. */
-      break;
-
-   case Ist_IMark:
-      /* st->Ist.IMark.addr is a 64-bit int.  ULong_to_Ptr casts this
-         to the host's native pointer type; if that is 32 bits then it
-         discards the upper 32 bits.  If we are cachegrinding on a
-         32-bit host then we are also ensured that the guest word size
-         is 32 bits, due to the assertion in cg_instrument that the
-         host and guest word sizes must be the same.  Hence
-         st->Ist.IMark.addr will have been derived from a 32-bit guest
-         code address and truncation of it is safe.  I believe this
-         assignment should be correct for both 32- and 64-bit
-         machines. */
-      *instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
-      *instrLen =        st->Ist.IMark.len;
-      break;
-
-   case Ist_WrTmp: {
-      IRExpr* data = st->Ist.WrTmp.data;
-      if (data->tag == Iex_Load) {
-         IRExpr* aexpr = data->Iex.Load.addr;
-         CLG_ASSERT( isIRAtom(aexpr) );
-         // Note also, endianness info is ignored.  I guess that's not
-         // interesting.
-         // XXX: repe cmpsb does two loads... the first one is ignored here!
-         //tl_assert( NULL == *loadAddrExpr );          // XXX: ???
-         *loadAddrExpr = aexpr;
-         *dataSize = sizeofIRType(data->Iex.Load.ty);
-      }
-      break;
-   }
-      
-   case Ist_Store: {
-      IRExpr* data  = st->Ist.Store.data;
-      IRExpr* aexpr = st->Ist.Store.addr;
-      CLG_ASSERT( isIRAtom(aexpr) );
-      if ( NULL == *storeAddrExpr ) {
-          /* this is a kludge: ignore all except the first store from
-             an instruction. */
-          *storeAddrExpr = aexpr;
-          *dataSize = sizeofIRType(typeOfIRExpr(tyenv, data));
-      }
-      break;
-   }
-   
-   case Ist_Dirty: {
-      IRDirty* d = st->Ist.Dirty.details;
-      if (d->mFx != Ifx_None) {
-         /* This dirty helper accesses memory.  Collect the
-            details. */
-         CLG_ASSERT(d->mAddr != NULL);
-         CLG_ASSERT(d->mSize != 0);
-         *dataSize = d->mSize;
-         if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
-            *loadAddrExpr = d->mAddr;
-         if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
-            *storeAddrExpr = d->mAddr;
-      } else {
-         CLG_ASSERT(d->mAddr == NULL);
-         CLG_ASSERT(d->mSize == 0);
-      }
-      break;
-   }
-
-   case Ist_Put:
-   case Ist_PutI:
-   case Ist_MBE:
-   case Ist_Exit:
-       break;
-
-   default:
-      VG_(printf)("\n");
-      ppIRStmt(st);
-      VG_(printf)("\n");
-      VG_(tool_panic)("Callgrind: unhandled IRStmt");
-   }
-}
-
 static
 void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
 {
@@ -491,29 +665,56 @@ void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
                                IRExpr_Const(IRConst_U32(val)) ));
 }   
 
+
+/* add helper call to setup_bbcc, with pointer to BB struct as argument
+ *
+ * precondition for setup_bbcc:
+ * - jmps_passed has number of cond.jumps passed in last executed BB
+ * - current_bbcc has a pointer to the BBCC of the last executed BB
+ *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
+ *     current_bbcc->bb->jmp_addr
+ *   gives the address of the jump source.
+ *
+ * the setup does 2 things:
+ * - trace call:
+ *   * Unwind own call stack, i.e sync our ESP with real ESP
+ *     This is for ESP manipulation (longjmps, C++ exec handling) and RET
+ *   * For CALLs or JMPs crossing objects, record call arg +
+ *     push are on own call stack
+ *
+ * - prepare for cache log functions:
+ *   set current_bbcc to BBCC that gets the costs for this BB execution
+ *   attached
+ */
+static
+void addBBSetupCall(ClgState* clgs)
+{
+   IRDirty* di;
+   IRExpr  *arg1, **argv;
+
+   arg1 = mkIRExpr_HWord( (HWord)clgs->bb );
+   argv = mkIRExprVec_1(arg1);
+   di = unsafeIRDirty_0_N( 1, "setup_bbcc",
+                             VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ),
+                             argv);
+   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
+}
+
+
 static
 IRSB* CLG_(instrument)( VgCallbackClosure* closure,
-                       IRSB* bbIn,
+                       IRSB* sbIn,
                        VexGuestLayout* layout,
                        VexGuestExtents* vge,
                        IRType gWordTy, IRType hWordTy )
 {
-   Int      i;
-   IRSB*    bbOut;
-   IRStmt*  st, *stnext;
-   Addr     instrAddr, origAddr;
-   UInt     instrLen = 0, dataSize;
-   UInt     instrCount, costOffset;
-   IRExpr  *loadAddrExpr, *storeAddrExpr;
-
-   BB*         bb;
+   Int      i, isize;
+   IRStmt*  st;
+   Addr     origAddr;
+   InstrInfo* curr_inode = NULL;
+   ClgState clgs;
+   UInt     cJumps = 0;
 
-   IRDirty* di;
-   IRExpr  *arg1, **argv;
-
-   Bool        bb_seen_before     = False;
-   UInt        cJumps = 0, cJumpsCorrected;
-   Bool        beforeIBoundary, instrIssued;
 
    if (gWordTy != hWordTy) {
       /* We don't currently support this case. */
@@ -524,173 +725,206 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure,
    if (! CLG_(instrument_state)) {
        CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n",
                 (Addr)closure->readdr);
-       return bbIn;
+       return sbIn;
    }
 
    CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr);
 
    /* Set up SB for instrumented IR */
-   bbOut = deepCopyIRSBExceptStmts(bbIn);
+   clgs.sbOut = deepCopyIRSBExceptStmts(sbIn);
 
    // Copy verbatim any IR preamble preceding the first IMark
    i = 0;
-   while (i < bbIn->stmts_used && bbIn->stmts[i]->tag != Ist_IMark) {
-      addStmtToIRSB( bbOut, bbIn->stmts[i] );
+   while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
+      addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] );
       i++;
    }
 
    // Get the first statement, and origAddr from it
-   CLG_ASSERT(bbIn->stmts_used > 0);
-   st = bbIn->stmts[i];
+   CLG_ASSERT(sbIn->stmts_used >0);
+   CLG_ASSERT(i < sbIn->stmts_used);
+   st = sbIn->stmts[i];
    CLG_ASSERT(Ist_IMark == st->tag);
-   instrAddr = origAddr = (Addr)st->Ist.IMark.addr;
+
+   origAddr = (Addr)st->Ist.IMark.addr;
    CLG_ASSERT(origAddr == st->Ist.IMark.addr);  // XXX: check no overflow
 
-   /* Get BB (creating if necessary).
+   /* Get BB struct (creating if necessary).
     * JS: The hash table is keyed with orig_addr_noredir -- important!
     * JW: Why? If it is because of different chasing of the redirection,
     *     this is not needed, as chasing is switched off in callgrind
     */
-   bb = CLG_(get_bb)(origAddr, bbIn, &bb_seen_before);
-   //bb = CLG_(get_bb)(orig_addr_noredir, bbIn, &bb_seen_before);
-
-   /* 
-    * Precondition:
-    * - jmps_passed has number of cond.jumps passed in last executed BB
-    * - current_bbcc has a pointer to the BBCC of the last executed BB
-    *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
-    *     current_bbcc->bb->jmp_addr
-    *   gives the address of the jump source.
-    *   
-    * The BBCC setup does 2 things:
-    * - trace call:
-    *   * Unwind own call stack, i.e sync our ESP with real ESP
-    *     This is for ESP manipulation (longjmps, C++ exec handling) and RET
-    *   * For CALLs or JMPs crossing objects, record call arg +
-    *     push are on own call stack
-    *
-    * - prepare for cache log functions:
-    *   Set current_bbcc to BBCC that gets the costs for this BB execution
-    *   attached
-    */
+   clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before));
+
+   addBBSetupCall(&clgs);
+
+   // Set up running state
+   clgs.events_used = 0;
+   clgs.ii_index = 0;
+   clgs.instr_offset = 0;
+
+   for (/*use current i*/; i < sbIn->stmts_used; i++) {
+
+      st = sbIn->stmts[i];
+      CLG_ASSERT(isFlatIRStmt(st));
+
+      switch (st->tag) {
+        case Ist_NoOp:
+        case Ist_AbiHint:
+        case Ist_Put:
+        case Ist_PutI:
+        case Ist_MBE:
+           break;
+
+        case Ist_IMark: {
+           CLG_ASSERT(clgs.instr_offset == (Addr)st->Ist.IMark.addr - origAddr);
+           isize = st->Ist.IMark.len;
+           // If Vex fails to decode an instruction, the size will be zero.
+           // Pretend otherwise.
+           if (isize == 0) isize = VG_MIN_INSTR_SZB;
+
+           // Sanity-check size.
+           tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
+                    || VG_CLREQ_SZB == isize );
+
+           // Init the inode, record it as the current one.
+           // Subsequent Dr/Dw/Dm events from the same instruction will
+           // also use it.
+           curr_inode = next_InstrInfo (&clgs, isize);
+
+           addEvent_Ir( &clgs, curr_inode );
+           break;
+        }
+
+        case Ist_WrTmp: {
+           IRExpr* data = st->Ist.WrTmp.data;
+           if (data->tag == Iex_Load) {
+              IRExpr* aexpr = data->Iex.Load.addr;
+              // Note also, endianness info is ignored.  I guess
+              // that's not interesting.
+              addEvent_Dr( &clgs, curr_inode,
+                           sizeofIRType(data->Iex.Load.ty), aexpr );
+           }
+           break;
+        }
+
+        case Ist_Store: {
+           IRExpr* data  = st->Ist.Store.data;
+           IRExpr* aexpr = st->Ist.Store.addr;
+           addEvent_Dw( &clgs, curr_inode,
+                        sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr );
+           break;
+        }
+
+        case Ist_Dirty: {
+           Int      dataSize;
+           IRDirty* d = st->Ist.Dirty.details;
+           if (d->mFx != Ifx_None) {
+              /* This dirty helper accesses memory.  Collect the details. */
+              tl_assert(d->mAddr != NULL);
+              tl_assert(d->mSize != 0);
+              dataSize = d->mSize;
+              // Large (eg. 28B, 108B, 512B on x86) data-sized
+              // instructions will be done inaccurately, but they're
+              // very rare and this avoids errors from hitting more
+              // than two cache lines in the simulation.
+              if (dataSize > MIN_LINE_SIZE)
+                 dataSize = MIN_LINE_SIZE;
+              if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
+                 addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr );
+              if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
+                 addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr );
+           } else {
+              tl_assert(d->mAddr == NULL);
+              tl_assert(d->mSize == 0);
+           }
+           break;
+        }
 
-   // helper call to setup_bbcc, with pointer to basic block info struct as argument
-   arg1 = mkIRExpr_HWord( (HWord)bb );
-   argv = mkIRExprVec_1(arg1);
-   di = unsafeIRDirty_0_N( 1, "setup_bbcc", 
-                              VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ), 
-                              argv);
-   addStmtToIRSB( bbOut, IRStmt_Dirty(di) );
-
-   instrCount = 0;
-   costOffset = 0;
-
-   // loop for each host instruction (starting from 'i')
-   do {
-
-      // We should be at an IMark statement
-      CLG_ASSERT(Ist_IMark == st->tag);
-
-      // Reset stuff for this original instruction
-      loadAddrExpr = storeAddrExpr = NULL;
-      instrIssued = False;
-      dataSize = 0;
-
-      // Process all the statements for this original instruction (ie. until
-      // the next IMark statement, or the end of the block)
-      do {
-         i++;
-         stnext = ( i < bbIn->stmts_used ? bbIn->stmts[i] : NULL );
-         beforeIBoundary = !stnext || (Ist_IMark == stnext->tag);
-         collectStatementInfo(bbIn->tyenv, st, &instrAddr, &instrLen,
-                              &loadAddrExpr, &storeAddrExpr, &dataSize);
-
-         // instrument a simulator call before conditional jumps
-         if (st->tag == Ist_Exit) {
-             // Nb: instrLen will be zero if Vex failed to decode it.
-             // Also Client requests can appear to be very large (eg. 18
-             // bytes on x86) because they are really multiple instructions.
-             CLG_ASSERT( 0 == instrLen ||
-                         bbIn->jumpkind == Ijk_ClientReq ||
-                         (instrLen >= VG_MIN_INSTR_SZB && 
-                          instrLen <= VG_MAX_INSTR_SZB) );
-
-              // Add instrumentation before this statement
-             endOfInstr(bbOut, &(bb->instr[instrCount]), bb_seen_before,
-                        instrAddr - origAddr, instrLen, dataSize, &costOffset,
-                        instrIssued, loadAddrExpr, storeAddrExpr);
-
-             // prepare for a possible further simcall in same host instr
-             loadAddrExpr = storeAddrExpr = NULL;
-             instrIssued = True;
-
-             if (!bb_seen_before) {
-                 bb->jmp[cJumps].instr = instrCount;
-                 bb->jmp[cJumps].skip = False;
-             }
-             
-             /* Update global variable jmps_passed (this is before the jump!)
-              * A correction is needed if VEX inverted the last jump condition
-              */
-             cJumpsCorrected = cJumps;
-             if ((cJumps+1 == bb->cjmp_count) && bb->cjmp_inverted) cJumpsCorrected++;
-             addConstMemStoreStmt( bbOut, (UWord) &CLG_(current_state).jmps_passed,
-                                   cJumpsCorrected, hWordTy);
-
-             cJumps++;
-         }
+        case Ist_Exit: {
+           UInt jmps_passed;
+
+           /* We may never reach the next statement, so need to flush
+              all outstanding transactions now. */
+           flushEvents( &clgs );
 
-         addStmtToIRSB( bbOut, st );
-         st = stnext;
-      } 
-      while (!beforeIBoundary);
+           CLG_ASSERT(clgs.ii_index>0);
+           if (!clgs.seen_before) {
+               clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
+               clgs.bb->jmp[cJumps].skip = False;
+           }
+
+           /* Update global variable jmps_passed before the jump
+            * A correction is needed if VEX inverted the last jump condition
+           */
+           jmps_passed = cJumps;
+           if ((cJumps+1 == clgs.bb->cjmp_count) && clgs.bb->cjmp_inverted)
+               jmps_passed++;
+           addConstMemStoreStmt( clgs.sbOut,
+                                 (UWord) &CLG_(current_state).jmps_passed,
+                                 jmps_passed, hWordTy);
+           cJumps++;
+
+           break;
+        }
+
+        default:
+           tl_assert(0);
+           break;
+      }
 
-      // Add instrumentation for this original instruction.
-      if (!instrIssued || (loadAddrExpr != 0) || (storeAddrExpr !=0))
-         endOfInstr(bbOut, &(bb->instr[instrCount]), bb_seen_before,
-                    instrAddr - origAddr, instrLen, dataSize, &costOffset,
-                    instrIssued, loadAddrExpr, storeAddrExpr);
+      /* Copy the original statement */
+      addStmtToIRSB( clgs.sbOut, st );
 
-      instrCount++;
+      CLG_DEBUGIF(5) {
+        VG_(printf)("   pass  ");
+        ppIRStmt(st);
+        VG_(printf)("\n");
+      }
    }
-   while (st);
 
-   /* Always update global variable jmps_passed (at end of BB)
+   /* At the end of the bb.  Flush outstandings. */
+   flushEvents( &clgs );
+
+   /* Always update global variable jmps_passed at end of bb.
     * A correction is needed if VEX inverted the last jump condition
     */
-   cJumpsCorrected = cJumps;
-   if (bb->cjmp_inverted) cJumpsCorrected--;
-   addConstMemStoreStmt( bbOut, (UWord) &CLG_(current_state).jmps_passed,
-                        cJumpsCorrected, hWordTy);
+   {
+      UInt jmps_passed = cJumps;
+      if (clgs.bb->cjmp_inverted) jmps_passed--;
+      addConstMemStoreStmt( clgs.sbOut,
+                           (UWord) &CLG_(current_state).jmps_passed,
+                           jmps_passed, hWordTy);
+   }
+   CLG_ASSERT(clgs.bb->cjmp_count == cJumps);
+   CLG_ASSERT(clgs.bb->instr_count = clgs.ii_index);
 
    /* This stores the instr of the call/ret at BB end */
-   bb->jmp[cJumps].instr = instrCount-1;
+   clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
 
-   CLG_ASSERT(bb->cjmp_count == cJumps);
-   CLG_ASSERT(bb->instr_count == instrCount);
-
-   instrAddr += instrLen;
-   if (bb_seen_before) {
-       CLG_ASSERT(bb->instr_len == instrAddr - origAddr);
-       CLG_ASSERT(bb->cost_count == costOffset);
-       CLG_ASSERT(bb->jmpkind == bbIn->jumpkind);
+   if (clgs.seen_before) {
+       CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
+       CLG_ASSERT(clgs.bb->instr_len = clgs.instr_offset);
+       CLG_ASSERT(clgs.bb->jmpkind == sbIn->jumpkind);
    }
    else {
-       bb->instr_len = instrAddr - origAddr;
-       bb->cost_count = costOffset;
-       bb->jmpkind = bbIn->jumpkind;
+       clgs.bb->cost_count = update_cost_offsets(&clgs);
+       clgs.bb->instr_len = clgs.instr_offset;
+       clgs.bb->jmpkind = sbIn->jumpkind;
    }
-   
+
    CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
-            origAddr, bb->instr_len, bb->cjmp_count, bb->cost_count);
+            origAddr, clgs.bb->instr_len,
+            clgs.bb->cjmp_count, clgs.bb->cost_count);
    if (cJumps>0) {
        CLG_DEBUG(3, "                     [ ");
        for (i=0;i<cJumps;i++)
-          CLG_DEBUG(3, "%d ", bb->jmp[i].instr);
-       CLG_DEBUG(3, "], last inverted: %s \n", bb->cjmp_inverted ? "yes":"no");
+          CLG_DEBUG(3, "%d ", clgs.bb->jmp[i].instr);
+       CLG_DEBUG(3, "], last inverted: %s \n",
+                clgs.bb->cjmp_inverted ? "yes":"no");
    }
 
-  return bbOut;
+  return clgs.sbOut;
 }
 
 /*--------------------------------------------------------------------*/
index 9edbecc17257a8625a7d965ef7faaefa4d863917..9e53f8916916f14ebc6c78d272f05252050d3b22 100644 (file)
@@ -113,22 +113,21 @@ static Bool clo_collect_cacheuse = False;
  * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
  */
 
-/* Offset to events in event set, used in log_* functions */
-static Int off_D0_Ir;
-static Int off_D1r_Ir;
-static Int off_D1r_Dr;
-static Int off_D1w_Ir;
-static Int off_D1w_Dw;
-static Int off_D2_Ir;
-static Int off_D2_Dr;
-static Int off_D2_Dw;
+/* Offset to events in event set, used in log_* functions
+ * <off_EventSet_BasicEventSet>: offset where basic set is found
+ */
+static Int off_UIr_Ir;
+static Int off_UIrDr_Ir,   off_UIrDr_Dr;
+static Int off_UIrDrDw_Ir, off_UIrDrDw_Dr, off_UIrDrDw_Dw;
+static Int off_UIrDw_Ir,   off_UIrDw_Dw;
+static Int off_UIrDwDr_Ir, off_UIrDwDr_Dr, off_UIrDwDr_Dw;
 
 static Addr   bb_base;
 static ULong* cost_base;
 static InstrInfo* current_ii;
 
 /* Cache use offsets */
-/* FIXME: The offsets are only correct because all eventsets get
+/* The offsets are only correct because all per-instruction event sets get
  * the "Use" set added first !
  */
 static Int off_I1_AcCost  = 0;
@@ -984,13 +983,13 @@ static
 void cacheuse_finish(void)
 {
   int i;
-  InstrInfo ii = { 0,0,0,0,0 };
+  InstrInfo ii = { 0,0,0,0 };
 
   if (!CLG_(current_state).collect) return;
 
   bb_base = 0;
   current_ii = &ii;
-  cost_base = 0;  
+  cost_base = 0;
 
   /* update usage counters */
   if (I1.use)
@@ -1043,6 +1042,19 @@ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
     }
 }
 
+static
+Char* cacheRes(CacheModelResult r)
+{
+    switch(r) {
+    case L1_Hit:    return "L1 Hit ";
+    case L2_Hit:    return "L2 Hit ";
+    case MemAccess: return "L2 Miss";
+    case WriteBackMemAccess: return "L2 Miss (dirty)";
+    default:
+       tl_assert(0);
+    }
+    return "??";
+}
 
 VG_REGPARM(1)
 static void log_1I0D(InstrInfo* ii)
@@ -1052,37 +1064,101 @@ static void log_1I0D(InstrInfo* ii)
     current_ii = ii;
     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
 
-    CLG_DEBUG(6, "log_1I0D:  Ir=%#lx/%u => Ir %d\n",
-             bb_base + ii->instr_offset, ii->instr_size, IrRes);
+    CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
+             bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
 
     if (CLG_(current_state).collect) {
        ULong* cost_Ir;
-       
+
        if (CLG_(current_state).nonskipped)
            cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
        else
-           cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
+           cost_Ir = cost_base + ii->cost_offset + off_UIr_Ir;
 
        inc_costs(IrRes, cost_Ir, 
                  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
     }
 }
 
+VG_REGPARM(2)
+static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
+{
+    CacheModelResult Ir1Res, Ir2Res;
+    ULong *global_cost_Ir;
+
+    current_ii = ii1;
+    Ir1Res = (*simulator.I1_Read)(bb_base + ii1->instr_offset, ii1->instr_size);
+    current_ii = ii2;
+    Ir2Res = (*simulator.I1_Read)(bb_base + ii2->instr_offset, ii2->instr_size);
+
+    CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
+             bb_base + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
+             bb_base + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
+
+    if (!CLG_(current_state).collect) return;
+
+    global_cost_Ir = CLG_(current_state).cost + CLG_(sets).off_full_Ir;
+    if (CLG_(current_state).nonskipped) {
+       ULong* skipped_cost_Ir = CLG_(current_state).nonskipped->skipped +
+                                CLG_(sets).off_full_Ir;
+       inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
+       inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
+       return;
+    }
+
+    inc_costs(Ir1Res, global_cost_Ir, cost_base + ii1->cost_offset + off_UIr_Ir);
+    inc_costs(Ir2Res, global_cost_Ir, cost_base + ii2->cost_offset + off_UIr_Ir);
+}
+
+VG_REGPARM(3)
+static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
+{
+    CacheModelResult Ir1Res, Ir2Res, Ir3Res;
+    ULong *global_cost_Ir;
+
+    current_ii = ii1;
+    Ir1Res = (*simulator.I1_Read)(bb_base + ii1->instr_offset, ii1->instr_size);
+    current_ii = ii2;
+    Ir2Res = (*simulator.I1_Read)(bb_base + ii2->instr_offset, ii2->instr_size);
+    current_ii = ii3;
+    Ir3Res = (*simulator.I1_Read)(bb_base + ii3->instr_offset, ii3->instr_size);
+
+    CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
+             bb_base + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
+             bb_base + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
+             bb_base + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
+
+    if (!CLG_(current_state).collect) return;
+
+    global_cost_Ir = CLG_(current_state).cost + CLG_(sets).off_full_Ir;
+    if (CLG_(current_state).nonskipped) {
+       ULong* skipped_cost_Ir = CLG_(current_state).nonskipped->skipped +
+                                CLG_(sets).off_full_Ir;
+       inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
+       inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
+       inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
+       return;
+    }
+
+    inc_costs(Ir1Res, global_cost_Ir, cost_base + ii1->cost_offset + off_UIr_Ir);
+    inc_costs(Ir2Res, global_cost_Ir, cost_base + ii2->cost_offset + off_UIr_Ir);
+    inc_costs(Ir3Res, global_cost_Ir, cost_base + ii3->cost_offset + off_UIr_Ir);
+}
 
 /* Instruction doing a read access */
 
-VG_REGPARM(2)
-static void log_1I1Dr(InstrInfo* ii, Addr data)
+VG_REGPARM(3)
+static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
 {
     CacheModelResult IrRes, DrRes;
 
     current_ii = ii;
     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
-    DrRes = (*simulator.D1_Read)(data, ii->data_size);
+    DrRes = (*simulator.D1_Read)(data_addr, data_size);
 
-    CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
-             bb_base + ii->instr_offset, ii->instr_size,
-             data, ii->data_size, IrRes, DrRes);
+    CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
+             bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
+             data_addr, data_size, cacheRes(DrRes));
 
     if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dr;
@@ -1092,8 +1168,11 @@ static void log_1I1Dr(InstrInfo* ii, Addr data)
            cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
        }
        else {
-           cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
-           cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
+           // event set must be UIrDr or extension
+           CLG_ASSERT((ii->eventset == CLG_(sets).UIrDr) ||
+                      (ii->eventset == CLG_(sets).UIrDrDw));
+           cost_Ir = cost_base + ii->cost_offset + off_UIrDr_Ir;
+           cost_Dr = cost_base + ii->cost_offset + off_UIrDr_Dr;
        }
        
        inc_costs(IrRes, cost_Ir, 
@@ -1104,16 +1183,16 @@ static void log_1I1Dr(InstrInfo* ii, Addr data)
 }
 
 
-VG_REGPARM(2)
-static void log_0I1Dr(InstrInfo* ii, Addr data)
+VG_REGPARM(3)
+static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
 {
     CacheModelResult DrRes;
 
     current_ii = ii;
-    DrRes = (*simulator.D1_Read)(data, ii->data_size);
+    DrRes = (*simulator.D1_Read)(data_addr, data_size);
 
-    CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
-             data, ii->data_size, DrRes);
+    CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
+             data_addr, data_size, cacheRes(DrRes));
 
     if (CLG_(current_state).collect) {
        ULong *cost_Dr;
@@ -1122,9 +1201,15 @@ static void log_0I1Dr(InstrInfo* ii, Addr data)
            cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
        }
        else {
-           cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
+           Int off_Dr;
+           if      (ii->eventset == CLG_(sets).UIrDr)   off_Dr = off_UIrDr_Dr;
+           else if (ii->eventset == CLG_(sets).UIrDrDw) off_Dr = off_UIrDrDw_Dr;
+           else if (ii->eventset == CLG_(sets).UIrDwDr) off_Dr = off_UIrDwDr_Dr;
+           else CLG_ASSERT(0);
+
+           cost_Dr = cost_base + ii->cost_offset + off_Dr;
        }
-       
+
        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
     }
@@ -1133,29 +1218,33 @@ static void log_0I1Dr(InstrInfo* ii, Addr data)
 
 /* Instruction doing a write access */
 
-VG_REGPARM(2)
-static void log_1I1Dw(InstrInfo* ii, Addr data)
+VG_REGPARM(3)
+static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
 {
     CacheModelResult IrRes, DwRes;
 
     current_ii = ii;
     IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
-    DwRes = (*simulator.D1_Write)(data, ii->data_size);
+    DwRes = (*simulator.D1_Write)(data_addr, data_size);
 
-    CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
-             bb_base + ii->instr_offset, ii->instr_size,
-             data, ii->data_size, IrRes, DwRes);
+    CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
+             bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
+             data_addr, data_size, cacheRes(DwRes));
 
     if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dw;
        
        if (CLG_(current_state).nonskipped) {
-           cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
-           cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
+           cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
+           cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
        }
        else {
-           cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
-           cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
+           // This helper is called when a Dr event follows Ir;
+           // Event set must be UIrDw or extension
+           CLG_ASSERT((ii->eventset == CLG_(sets).UIrDw) ||
+                      (ii->eventset == CLG_(sets).UIrDwDr));
+           cost_Ir = cost_base + ii->cost_offset + off_UIrDw_Ir;
+           cost_Dw = cost_base + ii->cost_offset + off_UIrDw_Dw;
        }
        
        inc_costs(IrRes, cost_Ir,
@@ -1165,16 +1254,16 @@ static void log_1I1Dw(InstrInfo* ii, Addr data)
     }
 }
 
-VG_REGPARM(2)
-static void log_0I1Dw(InstrInfo* ii, Addr data)
+VG_REGPARM(3)
+static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
 {
     CacheModelResult DwRes;
 
     current_ii = ii;
-    DwRes = (*simulator.D1_Write)(data, ii->data_size);
+    DwRes = (*simulator.D1_Write)(data_addr, data_size);
 
-    CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
-             data, ii->data_size, DwRes);
+    CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
+             data_addr, data_size, cacheRes(DwRes));
 
     if (CLG_(current_state).collect) {
        ULong *cost_Dw;
@@ -1183,7 +1272,13 @@ static void log_0I1Dw(InstrInfo* ii, Addr data)
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
        }
        else {
-           cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
+           Int off_Dw;
+           if      (ii->eventset == CLG_(sets).UIrDw)   off_Dw = off_UIrDw_Dw;
+           else if (ii->eventset == CLG_(sets).UIrDwDr) off_Dw = off_UIrDwDr_Dw;
+           else if (ii->eventset == CLG_(sets).UIrDrDw) off_Dw = off_UIrDrDw_Dw;
+           else CLG_ASSERT(0);
+
+           cost_Dw = cost_base + ii->cost_offset + off_Dw;
        }
        
        inc_costs(DwRes, cost_Dw,
@@ -1191,77 +1286,6 @@ static void log_0I1Dw(InstrInfo* ii, Addr data)
     }
 }
 
-/* Instruction doing a read and a write access */
-
-VG_REGPARM(3)
-static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
-{
-    CacheModelResult IrRes, DrRes, DwRes;
-
-    current_ii = ii;
-    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
-    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
-    DwRes = (*simulator.D1_Write)(data2, ii->data_size);
-
-    CLG_DEBUG(6,
-             "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
-             bb_base + ii->instr_offset, ii->instr_size,
-             data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);
-
-    if (CLG_(current_state).collect) {
-       ULong *cost_Ir, *cost_Dr, *cost_Dw;
-
-       if (CLG_(current_state).nonskipped) {
-           cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
-           cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
-           cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
-       }
-       else {
-           cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
-           cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
-           cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
-       }
-       
-       inc_costs(IrRes, cost_Ir, 
-                 CLG_(current_state).cost + CLG_(sets).off_full_Ir );
-       inc_costs(DrRes, cost_Dr, 
-                 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
-       inc_costs(DwRes, cost_Dw, 
-                 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
-    }
-}
-
-VG_REGPARM(3)
-static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
-{
-    CacheModelResult DrRes, DwRes;
-
-    current_ii = ii;
-    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
-    DwRes = (*simulator.D1_Write)(data2, ii->data_size);
-
-    CLG_DEBUG(6,
-             "log_0D2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
-             data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);
-
-    if (CLG_(current_state).collect) {
-       ULong *cost_Dr, *cost_Dw;
-
-       if (CLG_(current_state).nonskipped) {
-           cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
-           cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
-       }
-       else {
-           cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
-           cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
-       }
-       
-       inc_costs(DrRes, cost_Dr, 
-                 CLG_(current_state).cost + CLG_(sets).off_full_Dr );
-       inc_costs(DwRes, cost_Dw, 
-                 CLG_(current_state).cost + CLG_(sets).off_full_Dw );
-    }
-}
 
 
 /*------------------------------------------------------------*/
@@ -1369,20 +1393,20 @@ static void cachesim_post_clo_init(void)
   if (!CLG_(clo).simulate_cache) {
     CLG_(cachesim).log_1I0D  = 0;
     CLG_(cachesim).log_1I0D_name = "(no function)";
+    CLG_(cachesim).log_2I0D  = 0;
+    CLG_(cachesim).log_2I0D_name = "(no function)";
+    CLG_(cachesim).log_3I0D  = 0;
+    CLG_(cachesim).log_3I0D_name = "(no function)";
 
     CLG_(cachesim).log_1I1Dr = 0;
-    CLG_(cachesim).log_1I1Dw = 0;
-    CLG_(cachesim).log_1I2D  = 0;
     CLG_(cachesim).log_1I1Dr_name = "(no function)";
+    CLG_(cachesim).log_1I1Dw = 0;
     CLG_(cachesim).log_1I1Dw_name = "(no function)";
-    CLG_(cachesim).log_1I2D_name = "(no function)";
 
     CLG_(cachesim).log_0I1Dr = 0;
-    CLG_(cachesim).log_0I1Dw = 0;
-    CLG_(cachesim).log_0I2D  = 0;
     CLG_(cachesim).log_0I1Dr_name = "(no function)";
+    CLG_(cachesim).log_0I1Dw = 0;
     CLG_(cachesim).log_0I1Dw_name = "(no function)";
-    CLG_(cachesim).log_0I2D_name = "(no function)";
     return;
   }
 
@@ -1402,20 +1426,20 @@ static void cachesim_post_clo_init(void)
 
   CLG_(cachesim).log_1I0D  = log_1I0D;
   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
+  CLG_(cachesim).log_2I0D  = log_2I0D;
+  CLG_(cachesim).log_2I0D_name  = "log_2I0D";
+  CLG_(cachesim).log_3I0D  = log_3I0D;
+  CLG_(cachesim).log_3I0D_name  = "log_3I0D";
 
   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
-  CLG_(cachesim).log_1I2D  = log_1I2D;
   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
-  CLG_(cachesim).log_1I2D_name  = "log_1I2D";
 
   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
-  CLG_(cachesim).log_0I2D  = log_0I2D;
   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
-  CLG_(cachesim).log_0I2D_name  = "log_0I2D";
 
   if (clo_collect_cacheuse) {
 
@@ -1763,26 +1787,29 @@ struct event_sets CLG_(sets);
 void CLG_(init_eventsets)(Int max_user)
 {
   EventType * e1, *e2, *e3, *e4;
-  EventSet *Ir, *Dr, *Dw;
-  EventSet *D0, *D1r, *D1w, *D2;
-  EventSet *sim, *full;
-  EventSet *use;
+  // Basic event sets from which others are composed
+  EventSet *Use, *Ir, *Dr, *Dw;
+  // Compositions of basic sets used for per-instruction counters
+  EventSet *UIr, *UIrDr, *UIrDrDw, *UIrDw, *UIrDwDr;
+  // Composition used for global counters and aggregation
+  EventSet *full;
   int sizeOfUseIr;
 
-  use = CLG_(get_eventset)("Use", 4);
+  // the "Use" events types only are used with "cacheuse" simulation
+  Use = CLG_(get_eventset)("Use", 4);
   if (clo_collect_cacheuse) {
     /* if TUse is 0, there was never a load, and no loss, too */
     e1 = CLG_(register_eventtype)("AcCost1");
-    CLG_(add_eventtype)(use, e1);
+    CLG_(add_eventtype)(Use, e1);
     e1 = CLG_(register_eventtype)("SpLoss1");
-    CLG_(add_eventtype)(use, e1);
+    CLG_(add_eventtype)(Use, e1);
     e1 = CLG_(register_eventtype)("AcCost2");
-    CLG_(add_eventtype)(use, e1);
+    CLG_(add_eventtype)(Use, e1);
     e1 = CLG_(register_eventtype)("SpLoss2");
-    CLG_(add_eventtype)(use, e1);
+    CLG_(add_eventtype)(Use, e1);
   }
 
-  Ir = CLG_(get_eventset)("Ir", 4);    
+  Ir = CLG_(get_eventset)("Ir", 4);
   Dr = CLG_(get_eventset)("Dr", 4);
   Dw = CLG_(get_eventset)("Dw", 4);
   if (CLG_(clo).simulate_cache) {
@@ -1822,74 +1849,76 @@ void CLG_(init_eventsets)(Int max_user)
     CLG_(add_eventtype)(Ir, e1);
   }
 
-  sizeOfUseIr =  use->size + Ir->size;
-  D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
-  CLG_(add_eventset)(D0, use);
-  off_D0_Ir  = CLG_(add_eventset)(D0, Ir);
-
-  D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
-  CLG_(add_eventset)(D1r, use);
-  off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
-  off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);
-
-  D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
-  CLG_(add_eventset)(D1w, use);
-  off_D1w_Ir   = CLG_(add_eventset)(D1w, Ir);
-  off_D1w_Dw   = CLG_(add_eventset)(D1w, Dw);
-
-  D2  = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
-  CLG_(add_eventset)(D2, use);
-  off_D2_Ir    = CLG_(add_eventset)(D2, Ir);
-  off_D2_Dr    = CLG_(add_eventset)(D2, Dr);
-  off_D2_Dw    = CLG_(add_eventset)(D2, Dw);
-
-  sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
-  CLG_(add_eventset)(sim, use);
-  CLG_(sets).off_sim_Ir   = CLG_(add_eventset)(sim, Ir);
-  CLG_(sets).off_sim_Dr   = CLG_(add_eventset)(sim, Dr);
-  CLG_(sets).off_sim_Dw   = CLG_(add_eventset)(sim, Dw);
+  // Self cost event sets per guest instruction (U used only for cacheUse).
+  // Each basic event set only appears once, as eg. multiple different Dr's
+  // in one guest instruction are counted in the same counter.
 
-  if (CLG_(clo).collect_alloc)   max_user += 2;
-  if (CLG_(clo).collect_systime) max_user += 2;
+  sizeOfUseIr =  Use->size + Ir->size;
+  UIr = CLG_(get_eventset)("UIr", sizeOfUseIr);
+  CLG_(add_eventset)(UIr, Use);
+  off_UIr_Ir  = CLG_(add_eventset)(UIr, Ir);
 
-  full = CLG_(get_eventset)("full", sim->size + max_user);
-  CLG_(add_eventset)(full, sim);
-  CLG_(sets).off_full_Ir   = CLG_(sets).off_sim_Ir;
-  CLG_(sets).off_full_Dr   = CLG_(sets).off_sim_Dr;
-  CLG_(sets).off_full_Dw   = CLG_(sets).off_sim_Dw;
+  UIrDr = CLG_(get_eventset)("UIrDr", sizeOfUseIr + Dr->size);
+  CLG_(add_eventset)(UIrDr, Use);
+  off_UIrDr_Ir = CLG_(add_eventset)(UIrDr, Ir);
+  off_UIrDr_Dr = CLG_(add_eventset)(UIrDr, Dr);
 
-  CLG_(sets).use = use;
-  CLG_(sets).Ir  = Ir;
-  CLG_(sets).Dr  = Dr;
-  CLG_(sets).Dw  = Dw;
+  UIrDrDw  = CLG_(get_eventset)("UIrDrDw", sizeOfUseIr + Dr->size + Dw->size);
+  CLG_(add_eventset)(UIrDrDw, Use);
+  off_UIrDrDw_Ir    = CLG_(add_eventset)(UIrDrDw, Ir);
+  off_UIrDrDw_Dr    = CLG_(add_eventset)(UIrDrDw, Dr);
+  off_UIrDrDw_Dw    = CLG_(add_eventset)(UIrDrDw, Dw);
 
-  CLG_(sets).D0  = D0;
-  CLG_(sets).D1r = D1r;
-  CLG_(sets).D1w = D1w;
-  CLG_(sets).D2  = D2;
+  UIrDw = CLG_(get_eventset)("UIrDw", sizeOfUseIr + Dw->size);
+  CLG_(add_eventset)(UIrDw, Use);
+  off_UIrDw_Ir   = CLG_(add_eventset)(UIrDw, Ir);
+  off_UIrDw_Dw   = CLG_(add_eventset)(UIrDw, Dw);
+
+  UIrDwDr  = CLG_(get_eventset)("UIrDwDr", sizeOfUseIr + Dw->size + Dr->size);
+  CLG_(add_eventset)(UIrDwDr, Use);
+  off_UIrDwDr_Ir    = CLG_(add_eventset)(UIrDwDr, Ir);
+  off_UIrDwDr_Dw    = CLG_(add_eventset)(UIrDwDr, Dw);
+  off_UIrDwDr_Dr    = CLG_(add_eventset)(UIrDwDr, Dr);
 
-  CLG_(sets).sim  = sim;
-  CLG_(sets).full = full;
 
+  // the "full" event set is used as global counter and for aggregation
+  if (CLG_(clo).collect_alloc)   max_user += 2;
+  if (CLG_(clo).collect_systime) max_user += 2;
+  full = CLG_(get_eventset)("full",
+                           sizeOfUseIr + Dr->size + Dw->size + max_user);
+  CLG_(add_eventset)(full, Use);
+  CLG_(sets).off_full_Ir   = CLG_(add_eventset)(full, Ir);
+  CLG_(sets).off_full_Dr   = CLG_(add_eventset)(full, Dr);
+  CLG_(sets).off_full_Dw   = CLG_(add_eventset)(full, Dw);
   if (CLG_(clo).collect_alloc) {
-    e1 = CLG_(register_eventtype)("allocCount");
-    e2 = CLG_(register_eventtype)("allocSize");
-    CLG_(sets).off_full_user =  CLG_(add_dep_event2)(full, e1,e2);
+      e1 = CLG_(register_eventtype)("allocCount");
+      e2 = CLG_(register_eventtype)("allocSize");
+      CLG_(sets).off_full_alloc =  CLG_(add_dep_event2)(full, e1,e2);
   }
-
   if (CLG_(clo).collect_systime) {
-    e1 = CLG_(register_eventtype)("sysCount");
-    e2 = CLG_(register_eventtype)("sysTime");
-    CLG_(sets).off_full_systime =  CLG_(add_dep_event2)(full, e1,e2);
+      e1 = CLG_(register_eventtype)("sysCount");
+      e2 = CLG_(register_eventtype)("sysTime");
+      CLG_(sets).off_full_systime =  CLG_(add_dep_event2)(full, e1,e2);
   }
 
+  CLG_(sets).Use = Use;
+  CLG_(sets).Ir  = Ir;
+  CLG_(sets).Dr  = Dr;
+  CLG_(sets).Dw  = Dw;
+  CLG_(sets).UIr  = UIr;
+  CLG_(sets).UIrDr = UIrDr;
+  CLG_(sets).UIrDrDw  = UIrDrDw;
+  CLG_(sets).UIrDw = UIrDw;
+  CLG_(sets).UIrDwDr  = UIrDwDr;
+  CLG_(sets).full = full;
+
+
   CLG_DEBUGIF(1) {
     CLG_DEBUG(1, "EventSets:\n");
-    CLG_(print_eventset)(-2, use);
+    CLG_(print_eventset)(-2, Use);
     CLG_(print_eventset)(-2, Ir);
     CLG_(print_eventset)(-2, Dr);
     CLG_(print_eventset)(-2, Dw);
-    CLG_(print_eventset)(-2, sim);
     CLG_(print_eventset)(-2, full);
   }
 
@@ -1924,34 +1953,41 @@ static
 void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
 {
   /* if eventset use is defined, it is always first (hardcoded!) */
-  CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);  
+  CLG_(add_and_zero_cost)( CLG_(sets).Use, dst, cost);
 
-  /* FIXME: This is hardcoded... */
-  if (es == CLG_(sets).D0) {
-    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
-                           cost + off_D0_Ir);
+  if (es == CLG_(sets).UIr) {
+    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+                           cost + off_UIr_Ir);
   }
-  else if (es == CLG_(sets).D1r) {
-    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
-                           cost + off_D1r_Ir);
-    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
-                           cost + off_D1r_Dr);
+  else if (es == CLG_(sets).UIrDr) {
+    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+                           cost + off_UIrDr_Ir);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
+                           cost + off_UIrDr_Dr);
   }
-  else if (es == CLG_(sets).D1w) {
-    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
-                           cost + off_D1w_Ir);
-    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
-                           cost + off_D1w_Dw);
+  else if (es == CLG_(sets).UIrDrDw) {
+    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+                           cost + off_UIrDrDw_Ir);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
+                           cost + off_UIrDrDw_Dr);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
+                           cost + off_UIrDrDw_Dw);
   }
-  else {
-    CLG_ASSERT(es == CLG_(sets).D2);
-    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
-                           cost + off_D2_Ir);
-    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
-                           cost + off_D2_Dr);
-    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
-                           cost + off_D2_Dw);
+  else if (es == CLG_(sets).UIrDw) {
+      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+                              cost + off_UIrDw_Ir);
+      CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
+                              cost + off_UIrDw_Dw);
+  }
+  else if (es == CLG_(sets).UIrDwDr) {
+    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir,
+                           cost + off_UIrDwDr_Ir);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw,
+                           cost + off_UIrDwDr_Dw);
+    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr,
+                            cost + off_UIrDwDr_Dr);
   }
+  else CLG_ASSERT(0);
 }
 
 /* this is called at dump time for every instruction executed */
@@ -1959,7 +1995,7 @@ static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
 {
   if (!CLG_(clo).simulate_cache)
-      cost[CLG_(sets).off_sim_Ir] += exe_count;
+      cost[CLG_(sets).off_full_Ir] += exe_count;
   else {
 
 #if 0
@@ -2019,24 +2055,24 @@ struct cachesim_if CLG_(cachesim) = {
 
   /* these will be set by cachesim_post_clo_init */
   .log_1I0D        = 0,
+  .log_2I0D        = 0,
+  .log_3I0D        = 0,
 
   .log_1I1Dr       = 0,
   .log_1I1Dw       = 0,
-  .log_1I2D        = 0,
 
   .log_0I1Dr       = 0,
   .log_0I1Dw       = 0,
-  .log_0I2D        = 0,
 
   .log_1I0D_name = "(no function)",
+  .log_2I0D_name = "(no function)",
+  .log_3I0D_name = "(no function)",
 
   .log_1I1Dr_name = "(no function)",
   .log_1I1Dw_name = "(no function)",
-  .log_1I2D_name = "(no function)",
 
   .log_0I1Dr_name = "(no function)",
   .log_0I1Dw_name = "(no function)",
-  .log_0I2D_name = "(no function)"
 };