From a3b8d67b91ddead7fbe5ef209345033efe429c6e Mon Sep 17 00:00:00 2001 From: Josef Weidendorfer Date: Mon, 20 Mar 2006 10:27:30 +0000 Subject: [PATCH] Callgrind merge: code git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5780 --- Makefile.am | 1 + callgrind/Makefile.am | 59 + callgrind/bb.c | 338 +++++ callgrind/bbcc.c | 883 +++++++++++++ callgrind/callgrind.h | 130 ++ callgrind/callgrind_annotate.in | 1191 +++++++++++++++++ callgrind/callgrind_control.in | 485 +++++++ callgrind/callstack.c | 424 ++++++ callgrind/clo.c | 765 +++++++++++ callgrind/command.c | 517 ++++++++ callgrind/context.c | 328 +++++ callgrind/costs.c | 79 ++ callgrind/costs.h | 35 + callgrind/debug.c | 453 +++++++ callgrind/docs/Makefile.am | 1 + callgrind/dump.c | 1715 ++++++++++++++++++++++++ callgrind/events.c | 575 ++++++++ callgrind/events.h | 113 ++ callgrind/fn.c | 616 +++++++++ callgrind/global.h | 838 ++++++++++++ callgrind/jumps.c | 233 ++++ callgrind/main.c | 1086 ++++++++++++++++ callgrind/sim.c | 2162 +++++++++++++++++++++++++++++++ callgrind/tests/Makefile.am | 14 + callgrind/threads.c | 456 +++++++ configure.in | 5 + 26 files changed, 13502 insertions(+) create mode 100644 callgrind/Makefile.am create mode 100644 callgrind/bb.c create mode 100644 callgrind/bbcc.c create mode 100644 callgrind/callgrind.h create mode 100644 callgrind/callgrind_annotate.in create mode 100644 callgrind/callgrind_control.in create mode 100644 callgrind/callstack.c create mode 100644 callgrind/clo.c create mode 100644 callgrind/command.c create mode 100644 callgrind/context.c create mode 100644 callgrind/costs.c create mode 100644 callgrind/costs.h create mode 100644 callgrind/debug.c create mode 100644 callgrind/docs/Makefile.am create mode 100644 callgrind/dump.c create mode 100644 callgrind/events.c create mode 100644 callgrind/events.h create mode 100644 callgrind/fn.c create mode 100644 callgrind/global.h create mode 100644 callgrind/jumps.c create mode 100644 callgrind/main.c create mode 100644 callgrind/sim.c create mode 100644 callgrind/tests/Makefile.am create mode 100644 callgrind/threads.c diff --git a/Makefile.am b/Makefile.am index 57592746ef..0bae4d0000 100644 --- a/Makefile.am +++ b/Makefile.am @@ -6,6 +6,7 @@ include $(top_srcdir)/Makefile.all.am ## addrcheck must come after memcheck, for mac_*.o TOOLS = memcheck \ cachegrind \ + callgrind \ massif \ lackey \ none diff --git a/callgrind/Makefile.am b/callgrind/Makefile.am new file mode 100644 index 0000000000..0d8cf57510 --- /dev/null +++ b/callgrind/Makefile.am @@ -0,0 +1,59 @@ +include $(top_srcdir)/Makefile.tool.am + +bin_SCRIPTS = callgrind_annotate callgrind_control + +noinst_HEADERS = global.h costs.h events.h + +noinst_PROGRAMS = +if VG_X86_LINUX +noinst_PROGRAMS += callgrind-x86-linux +endif +if VG_AMD64_LINUX +noinst_PROGRAMS += callgrind-amd64-linux +endif +if VG_PPC32_LINUX +noinst_PROGRAMS += callgrind-ppc32-linux +endif +if VG_PPC64_LINUX +noinst_PROGRAMS += callgrind-ppc64-linux +endif + +CALLGRIND_SOURCES_COMMON = main.c events.c bb.c clo.c \ + costs.c bbcc.c command.c debug.c fn.c \ + sim.c callstack.c context.c dump.c jumps.c \ + threads.c + +CALLGRIND_SOURCES_X86 = ../cachegrind/cg-x86.c +CALLGRIND_SOURCES_AMD64 = ../cachegrind/cg-amd64.c +CALLGRIND_SOURCES_PPC32 = ../cachegrind/cg-ppc32.c +CALLGRIND_SOURCES_PPC64 = ../cachegrind/cg-ppc64.c + +CALLGRIND_CFLAGS_COMMON = -I../cachegrind + +callgrind_x86_linux_SOURCES = $(CALLGRIND_SOURCES_COMMON) $(CALLGRIND_SOURCES_X86) +callgrind_x86_linux_CPPFLAGS = $(AM_CPPFLAGS_X86_LINUX) 
+callgrind_x86_linux_CFLAGS = $(CALLGRIND_CFLAGS_COMMON) $(AM_CFLAGS_X86_LINUX) +callgrind_x86_linux_DEPENDENCIES = $(COREGRIND_LIBS_X86_LINUX) +callgrind_x86_linux_LDADD = $(TOOL_LDADD_X86_LINUX) +callgrind_x86_linux_LDFLAGS = $(TOOL_LDFLAGS_X86_LINUX) + +callgrind_amd64_linux_SOURCES = $(CALLGRIND_SOURCES_COMMON) $(CALLGRIND_SOURCES_AMD64) +callgrind_amd64_linux_CPPFLAGS = $(AM_CPPFLAGS_AMD64_LINUX) +callgrind_amd64_linux_CFLAGS = $(CALLGRIND_CFLAGS_COMMON) $(AM_CFLAGS_AMD64_LINUX) +callgrind_amd64_linux_DEPENDENCIES = $(COREGRIND_LIBS_AMD64_LINUX) +callgrind_amd64_linux_LDADD = $(TOOL_LDADD_AMD64_LINUX) +callgrind_amd64_linux_LDFLAGS = $(TOOL_LDFLAGS_AMD64_LINUX) + +callgrind_ppc32_linux_SOURCES = $(CALLGRIND_SOURCES_COMMON) $(CALLGRIND_SOURCES_PPC32) +callgrind_ppc32_linux_CPPFLAGS = $(AM_CPPFLAGS_PPC32_LINUX) +callgrind_ppc32_linux_CFLAGS = $(CALLGRIND_CFLAGS_COMMON) $(AM_CFLAGS_PPC32_LINUX) +callgrind_ppc32_linux_DEPENDENCIES = $(COREGRIND_LIBS_PPC32_LINUX) +callgrind_ppc32_linux_LDADD = $(TOOL_LDADD_PPC32_LINUX) +callgrind_ppc32_linux_LDFLAGS = $(TOOL_LDFLAGS_PPC32_LINUX) + +callgrind_ppc64_linux_SOURCES = $(CALLGRIND_SOURCES_COMMON) $(CALLGRIND_SOURCES_PPC64) +callgrind_ppc64_linux_CPPFLAGS = $(AM_CPPFLAGS_PPC64_LINUX) +callgrind_ppc64_linux_CFLAGS = $(CALLGRIND_CFLAGS_COMMON) $(AM_CFLAGS_PPC64_LINUX) +callgrind_ppc64_linux_DEPENDENCIES = $(COREGRIND_LIBS_PPC64_LINUX) +callgrind_ppc64_linux_LDADD = $(TOOL_LDADD_PPC64_LINUX) +callgrind_ppc64_linux_LDFLAGS = $(TOOL_LDFLAGS_PPC64_LINUX) \ No newline at end of file diff --git a/callgrind/bb.c b/callgrind/bb.c new file mode 100644 index 0000000000..a6c8ebadcf --- /dev/null +++ b/callgrind/bb.c @@ -0,0 +1,338 @@ +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- bb.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Callgrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Basic block (BB) operations ---*/ +/*------------------------------------------------------------*/ + +/* BB hash, resizable */ +bb_hash bbs; + +void CLG_(init_bb_hash)() +{ + Int i; + + bbs.size = 8437; + bbs.entries = 0; + bbs.table = (BB**) CLG_MALLOC(bbs.size * sizeof(BB*)); + + for (i = 0; i < bbs.size; i++) bbs.table[i] = NULL; +} + +bb_hash* CLG_(get_bb_hash)() +{ + return &bbs; +} + +/* The hash stores BBs according to + * - ELF object (is 0 for code in anonymous mapping) + * - BB base as object file offset + */ +static __inline__ +UInt bb_hash_idx(obj_node* obj, OffT offset, UInt size) +{ + return (((Addr)obj) + offset) % size; +} + +/* double size of bb table */ +static +void resize_bb_table(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BB **new_table, *curr, *next; + UInt new_idx; + + new_size = 2* bbs.size +3; + new_table = (BB**) CLG_MALLOC(new_size * sizeof(BB*)); + + if (!new_table) return; + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < bbs.size; i++) { + if (bbs.table[i] == NULL) continue; + + curr = bbs.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = bb_hash_idx(curr->obj, curr->offset, new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(bbs.table); + + + CLG_DEBUG(0, "Resize BB Hash: %d => %d (entries %d, conflicts %d/%d)\n", + bbs.size, new_size, + bbs.entries, conflicts1, conflicts2); + + bbs.size = new_size; + bbs.table = new_table; + CLG_(stat).bb_hash_resizes++; +} + + +/** + * Allocate new BB structure (including space for event type list) + * Not initialized: + * - instr_len, cost_count, instr[] + */ +static BB* new_bb(obj_node* obj, OffT offset, + UInt instr_count, UInt cjmp_count, Bool cjmp_inverted) +{ + BB* new; + UInt new_idx; + + /* check fill degree of bb hash table and resize if needed (>80%) */ + bbs.entries++; + if (10 * bbs.entries / bbs.size > 8) + resize_bb_table(); + + new = (BB*) CLG_MALLOC(sizeof(BB) + + instr_count * sizeof(InstrInfo) + + (cjmp_count+1) * sizeof(CJmpInfo)); + + new->obj = obj; + new->offset = offset; + + new->instr_count = instr_count; + new->cjmp_count = cjmp_count; + new->cjmp_inverted = cjmp_inverted; + new->jmp = (CJmpInfo*) &(new->instr[instr_count]); + new->instr_len = 0; + new->cost_count = 0; + new->sect_kind = VG_(seginfo_sect_kind)(offset + obj->offset); + new->fn = 0; + new->line = 0; + new->is_entry = 0; + new->bbcc_list = 0; + new->last_bbcc = 0; + + /* insert into BB hash table */ + new_idx = bb_hash_idx(obj, offset, bbs.size); + new->next = bbs.table[new_idx]; + bbs.table[new_idx] = new; + + CLG_(stat).distinct_bbs++; + +#if CLG_ENABLE_DEBUG + CLG_DEBUGIF(3) { + VG_(printf)(" new_bb (instr %d, jmps %d, inv %s) [now %d]: ", + instr_count, cjmp_count, + cjmp_inverted ? 
"yes":"no", + CLG_(stat).distinct_bbs); + CLG_(print_bb)(0, new); + VG_(printf)("\n"); + } +#endif + + CLG_(get_fn_node)(new); + + return new; +} + + +/* get the BB structure for a BB start address */ +static __inline__ +BB* lookup_bb(obj_node* obj, OffT offset) +{ + BB* bb; + Int idx; + + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + while(bb) { + if ((bb->obj == obj) && (bb->offset == offset)) break; + bb = bb->next; + } + + CLG_DEBUG(5, " lookup_bb (Obj %s, off %p): %p\n", + obj->name, offset, bb); + return bb; +} + +static __inline__ +obj_node* obj_of_address(Addr addr) +{ + obj_node* obj; + SegInfo* si; + OffT offset; + + si = VG_(find_seginfo)(addr); + obj = CLG_(get_obj_node)( si ); + + /* Update symbol offset in object if remapped */ + offset = si ? VG_(seginfo_sym_offset)(si):0; + if (obj->offset != offset) { + Addr start = si ? VG_(seginfo_start)(si) : 0; + + CLG_DEBUG(0, "Mapping changed for '%s': %p -> %p\n", + obj->name, obj->start, start); + + /* Size should be the same, and offset diff == start diff */ + CLG_ASSERT( obj->size == (si ? VG_(seginfo_size)(si) : 0) ); + CLG_ASSERT( obj->start - start == obj->offset - offset ); + obj->offset = offset; + obj->start = start; + } + + return obj; +} + +/* Get the BB structure for a BB start address. + * If the BB has to be created, the IRBB is needed to + * compute the event type list for costs, and seen_before is + * set to False. Otherwise, seen_before is set to True. + * + * BBs are never discarded. There are 2 cases where this function + * is called from CLG_(instrument)() and a BB already exists: + * - The instrumented version was removed from Valgrinds TT cache + * - The ELF object of the BB was unmapped and mapped again. + * This involves a possibly different address, but is handled by + * looking up a BB keyed by (obj_node, file offset). + * + * bbIn==0 is possible for artifical BB without real code. + * Such a BB is created when returning to an unknown function. + */ +BB* CLG_(get_bb)(Addr addr, IRBB* bbIn, /*OUT*/ Bool *seen_before) +{ + BB* bb; + obj_node* obj; + UInt n_instrs, n_jmps; + Bool cjmp_inverted = False; + + CLG_DEBUG(5, "+ get_bb(BB %p)\n", addr); + + obj = obj_of_address(addr); + bb = lookup_bb(obj, addr - obj->offset); + + n_instrs = 0; + n_jmps = 0; + CLG_(collectBlockInfo)(bbIn, &n_instrs, &n_jmps, &cjmp_inverted); + + *seen_before = bb ? True : False; + if (*seen_before) { + if (bb->instr_count != n_instrs) { + VG_(message)(Vg_DebugMsg, + "ERROR: BB Retranslation Mismatch at BB %p", addr); + VG_(message)(Vg_DebugMsg, + " new: Obj %s, Off %p, BBOff %p, Instrs %u", + obj->name, obj->offset, + addr - obj->offset, n_instrs); + VG_(message)(Vg_DebugMsg, + " old: Obj %s, Off %p, BBOff %p, Instrs %u", + bb->obj->name, bb->obj->offset, + bb->offset, bb->instr_count); + CLG_ASSERT(bb->instr_count == n_instrs ); + } + CLG_ASSERT(bb->cjmp_count == n_jmps ); + CLG_(stat).bb_retranslations++; + + CLG_DEBUG(5, "- get_bb(BB %p): seen before.\n", addr); + return bb; + } + + bb = new_bb(obj, addr - obj->offset, n_instrs, n_jmps, cjmp_inverted); + + CLG_DEBUG(5, "- get_bb(BB %p)\n", addr); + + return bb; +} + +/* Delete the BB info for the bb with unredirected entry-point + address 'addr'. */ +void CLG_(delete_bb)(Addr addr) +{ + BB *bb, *bp; + Int idx, size; + + obj_node* obj = obj_of_address(addr); + OffT offset = addr - obj->offset; + + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + /* bb points at the current bb under consideration, and bp is the + one before. 
 */
+ bp = NULL;
+ while(bb) {
+ if ((bb->obj == obj) && (bb->offset == offset)) break;
+ bp = bb;
+ bb = bb->next;
+ }
+
+ if (bb == NULL) {
+ CLG_DEBUG(3, " delete_bb (Obj %s, off %p): NOT FOUND\n",
+ obj->name, offset);
+
+ /* we didn't find it. That's strange. */
+ return;
+ }
+
+ /* unlink it from hash table */
+
+ if (bp == NULL) {
+ /* we found the first one in the list. */
+ tl_assert(bb == bbs.table[idx]);
+ bbs.table[idx] = bb->next;
+ } else {
+ tl_assert(bb != bbs.table[idx]);
+ bp->next = bb->next;
+ }
+
+ CLG_DEBUG(3, " delete_bb (Obj %s, off %p): %p, BBCC head: %p\n",
+ obj->name, offset, bb, bb->bbcc_list);
+
+ if (bb->bbcc_list == 0) {
+ /* can be safely deleted */
+
+ /* Fill the block up with junk and then free it, so we will
+ hopefully get a segfault if it is used again by mistake. */
+ size = sizeof(BB) +
+ bb->instr_count * sizeof(InstrInfo) +
+ (bb->cjmp_count+1) * sizeof(CJmpInfo);
+ VG_(memset)( bb, 0xAA, size );
+ CLG_FREE(bb);
+ }
+ else CLG_DEBUG(3, " delete_bb: BB in use, cannot free!\n");
+}
diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c
new file mode 100644
index 0000000000..d2eb4b93bb
--- /dev/null
+++ b/callgrind/bbcc.c
@@ -0,0 +1,883 @@
+/*--------------------------------------------------------------------*/
+/*--- Callgrind ---*/
+/*--- bbcc.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Callgrind, a Valgrind tool for call tracing.
+
+ Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "global.h"
+#include "costs.h"
+
+#include
+
+/*------------------------------------------------------------*/
+/*--- BBCC operations ---*/
+/*------------------------------------------------------------*/
+
+#define N_BBCC_INITIAL_ENTRIES 10437
+
+/* BBCC table (key is BB/Context), per thread, resizable */
+bbcc_hash current_bbccs;
+
+void CLG_(init_bbcc_hash)(bbcc_hash* bbccs)
+{
+ Int i;
+
+ CLG_ASSERT(bbccs != 0);
+
+ bbccs->size = N_BBCC_INITIAL_ENTRIES;
+ bbccs->entries = 0;
+ bbccs->table = (BBCC**) CLG_MALLOC(bbccs->size * sizeof(BBCC*));
+
+ for (i = 0; i < bbccs->size; i++) bbccs->table[i] = NULL;
+}
+
+void CLG_(copy_current_bbcc_hash)(bbcc_hash* dst)
+{
+ CLG_ASSERT(dst != 0);
+
+ dst->size = current_bbccs.size;
+ dst->entries = current_bbccs.entries;
+ dst->table = current_bbccs.table;
+}
+
+bbcc_hash* CLG_(get_current_bbcc_hash)()
+{
+ return &current_bbccs;
+}
+
+void CLG_(set_current_bbcc_hash)(bbcc_hash* h)
+{
+ CLG_ASSERT(h != 0);
+
+ current_bbccs.size = h->size;
+ current_bbccs.entries = h->entries;
+ current_bbccs.table = h->table;
+}
+
+/*
+ * Zero all costs of a BBCC
+ */
+void CLG_(zero_bbcc)(BBCC* bbcc)
+{
+ Int i;
+ jCC* jcc;
+
+ CLG_ASSERT(bbcc->cxt != 0);
+ CLG_DEBUG(1, " zero_bbcc: BB %p, Cxt %d "
+ "(fn '%s', rec %d)\n",
+ bb_addr(bbcc->bb),
+ bbcc->cxt->base_number + bbcc->rec_index,
+ bbcc->cxt->fn[0]->name,
+ bbcc->rec_index);
+
+ if ((bbcc->ecounter_sum ==0) &&
+ (bbcc->ret_counter ==0)) return;
+
+ for(i=0;i<bbcc->bb->cost_count;i++)
+ bbcc->cost[i] = 0;
+ for(i=0;i <= bbcc->bb->cjmp_count;i++) {
+ bbcc->jmp[i].ecounter = 0;
+ for(jcc=bbcc->jmp[i].jcc_list; jcc; jcc=jcc->next_from)
+ CLG_(init_cost)( CLG_(sets).full, jcc->cost );
+ }
+ bbcc->ecounter_sum = 0;
+ bbcc->ret_counter = 0;
+}
+
+
+
+void CLG_(forall_bbccs)(void (*func)(BBCC*))
+{
+ BBCC *bbcc, *bbcc2;
+ int i, j;
+
+ for (i = 0; i < current_bbccs.size; i++) {
+ if ((bbcc=current_bbccs.table[i]) == NULL) continue;
+ while (bbcc) {
+ /* every bbcc should have a rec_array */
+ CLG_ASSERT(bbcc->rec_array != 0);
+
+ for(j=0;j<bbcc->cxt->fn[0]->separate_recursions;j++) {
+ if ((bbcc2 = bbcc->rec_array[j]) == 0) continue;
+
+ (*func)(bbcc2);
+ }
+ bbcc = bbcc->next;
+ }
+ }
+}
+
+
+/* All BBCCs for recursion level 0 are inserted into a
+ * thread specific hash table with key
+ * - address of BB structure (unique, as never freed)
+ * - current context (includes caller chain)
+ * BBCCs for other recursion levels are in bbcc->rec_array.
+ *
+ * The hash is used in setup_bb(), i.e. to find the cost
+ * counters to be changed in the execution of a BB.
+ */
+
+static __inline__
+UInt bbcc_hash_idx(BB* bb, Context* cxt, UInt size)
+{
+ CLG_ASSERT(bb != 0);
+ CLG_ASSERT(cxt != 0);
+
+ return ((Addr)bb + (Addr)cxt) % size;
+}
+
+
+/* Look up a BBCC in the hash.
+ */
+static
+BBCC* lookup_bbcc(BB* bb, Context* cxt)
+{
+ BBCC* bbcc = bb->last_bbcc;
+ UInt idx;
+
+ /* check LRU */
+ if (bbcc->cxt == cxt) {
+ if (!CLG_(clo).separate_threads) {
+ /* if we don't dump threads separately, tid doesn't have to match */
+ return bbcc;
+ }
+ if (bbcc->tid == CLG_(current_tid)) return bbcc;
+ }
+
+ CLG_(stat).bbcc_lru_misses++;
+
+ idx = bbcc_hash_idx(bb, cxt, current_bbccs.size);
+ bbcc = current_bbccs.table[idx];
+ while (bbcc &&
+ (bb != bbcc->bb ||
+ cxt != bbcc->cxt)) {
+ bbcc = bbcc->next;
+ }
+
+ CLG_DEBUG(2," lookup_bbcc(BB %p, Cxt %d, fn '%s'): %p (tid %d)\n",
+ bb_addr(bb), cxt->base_number, cxt->fn[0]->name,
+ bbcc, bbcc ?
bbcc->tid : 0);
+
+ CLG_DEBUGIF(2)
+ if (bbcc) CLG_(print_bbcc)(-2,bbcc,False);
+
+ return bbcc;
+}
+
+
+/* double size of hash table 1 (addr->BBCC) */
+static void resize_bbcc_hash(void)
+{
+ Int i, new_size, conflicts1 = 0, conflicts2 = 0;
+ BBCC** new_table;
+ UInt new_idx;
+ BBCC *curr_BBCC, *next_BBCC;
+
+ new_size = 2*current_bbccs.size+3;
+ new_table = (BBCC**) CLG_MALLOC(new_size * sizeof(BBCC*));
+
+ if (!new_table) return;
+
+ for (i = 0; i < new_size; i++)
+ new_table[i] = NULL;
+
+ for (i = 0; i < current_bbccs.size; i++) {
+ if (current_bbccs.table[i] == NULL) continue;
+
+ curr_BBCC = current_bbccs.table[i];
+ while (NULL != curr_BBCC) {
+ next_BBCC = curr_BBCC->next;
+
+ new_idx = bbcc_hash_idx(curr_BBCC->bb,
+ curr_BBCC->cxt,
+ new_size);
+
+ curr_BBCC->next = new_table[new_idx];
+ new_table[new_idx] = curr_BBCC;
+ if (curr_BBCC->next) {
+ conflicts1++;
+ if (curr_BBCC->next->next)
+ conflicts2++;
+ }
+
+ curr_BBCC = next_BBCC;
+ }
+ }
+
+ VG_(free)(current_bbccs.table);
+
+
+ CLG_DEBUG(0,"Resize BBCC Hash: %d => %d (entries %d, conflicts %d/%d)\n",
+ current_bbccs.size, new_size,
+ current_bbccs.entries, conflicts1, conflicts2);
+
+ current_bbccs.size = new_size;
+ current_bbccs.table = new_table;
+ CLG_(stat).bbcc_hash_resizes++;
+}
+
+
+static __inline
+BBCC** new_recursion(int size)
+{
+ BBCC** bbccs;
+ int i;
+
+ bbccs = (BBCC**) CLG_MALLOC(sizeof(BBCC*) * size);
+ for(i=0;i<size;i++)
+ bbccs[i] = 0;
+
+ return bbccs;
+}
+
+static BBCC* new_bbcc(BB* bb)
+{
+ BBCC* new;
+ Int i;
+
+ new = (BBCC*) CLG_MALLOC(sizeof(BBCC) +
+ (bb->cjmp_count+1) * sizeof(JmpData));
+ new->bb = bb;
+ new->tid = CLG_(current_tid);
+
+ new->ret_counter = 0;
+ new->skipped = 0;
+ new->cost = CLG_(get_costarray)(bb->cost_count);
+ for(i=0;i<bb->cost_count;i++)
+ new->cost[i] = 0;
+ for(i=0; i<=bb->cjmp_count; i++) {
+ new->jmp[i].ecounter = 0;
+ new->jmp[i].jcc_list = 0;
+ }
+ new->ecounter_sum = 0;
+
+ /* Init pointer caches (LRU) */
+ new->lru_next_bbcc = 0;
+ new->lru_from_jcc = 0;
+ new->lru_to_jcc = 0;
+
+ CLG_(stat).distinct_bbccs++;
+
+ CLG_DEBUG(3, " new_bbcc(BB %p): %p (now %d)\n",
+ bb_addr(bb), new, CLG_(stat).distinct_bbccs);
+
+ return new;
+}
+
+
+/**
+ * Inserts a new BBCC into hashes.
+ * BBCC specific items must be set as this is used for the hash
+ * keys:
+ * fn : current function
+ * tid : current thread ID
+ * from : position where current function is called from
+ *
+ * Recursion level doesn't need to be set as this is not included
+ * in the hash key: Only BBCCs with rec level 0 are in hashes.
+ */
+static
+void insert_bbcc_into_hash(BBCC* bbcc)
+{
+ UInt idx;
+
+ CLG_ASSERT(bbcc->cxt != 0);
+
+ CLG_DEBUG(3,"+ insert_bbcc_into_hash(BB %p, fn '%s')\n",
+ bb_addr(bbcc->bb), bbcc->cxt->fn[0]->name);
+
+ /* check fill degree of hash and resize if needed (>90%) */
+ current_bbccs.entries++;
+ if (100 * current_bbccs.entries / current_bbccs.size > 90)
+ resize_bbcc_hash();
+
+ idx = bbcc_hash_idx(bbcc->bb, bbcc->cxt, current_bbccs.size);
+ bbcc->next = current_bbccs.table[idx];
+ current_bbccs.table[idx] = bbcc;
+
+ CLG_DEBUG(3,"- insert_bbcc_into_hash: %d entries\n",
+ current_bbccs.entries);
+}
+
+static Char* mangled_cxt(Context* cxt, int rec_index)
+{
+ static Char mangled[FN_NAME_LEN];
+ int i, p;
+
+ if (!cxt) return "(no context)";
+
+ p = VG_(sprintf)(mangled, "%s", cxt->fn[0]->name);
+ if (rec_index >0)
+ p += VG_(sprintf)(mangled+p, "'%d", rec_index +1);
+ for(i=1;i<cxt->size;i++)
+ p += VG_(sprintf)(mangled+p, "'%s", cxt->fn[i]->name);
+
+ return mangled;
+}
+
+
+/* Create a new BBCC as a copy of an existing one,
+ * but with costs set to 0 and jcc chains empty.
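+ * (E.g. when f calls itself and f's separate_recursions is 2, the
+ * level-1 activation needs its own cost counters: it gets a clone of
+ * the level-0 BBCC with rec_index 1, sharing the same rec_array.)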
+ *
+ * This is needed when a BB is executed in another context than
+ * the one at instrumentation time of the BB.
+ *
+ * Use cases:
+ * rec_index == 0: clone from a BBCC with differing tid/cxt
+ * and insert into hashes
+ * rec_index >0 : clone from a BBCC with same tid/cxt and rec_index 0
+ * don't insert into hashes
+ */
+static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index)
+{
+ BBCC* new;
+
+ CLG_DEBUG(3,"+ clone_bbcc(BB %p, rec %d, fn %s)\n",
+ bb_addr(orig->bb), rec_index, cxt->fn[0]->name);
+
+ new = new_bbcc(orig->bb);
+
+ if (rec_index == 0) {
+
+ /* hash insertion is only allowed if tid or cxt is different */
+ CLG_ASSERT((orig->tid != CLG_(current_tid)) ||
+ (orig->cxt != cxt));
+
+ new->rec_index = 0;
+ new->cxt = cxt;
+ new->rec_array = new_recursion(cxt->fn[0]->separate_recursions);
+ new->rec_array[0] = new;
+
+ insert_bbcc_into_hash(new);
+ }
+ else {
+ if (CLG_(clo).separate_threads)
+ CLG_ASSERT(orig->tid == CLG_(current_tid));
+
+ CLG_ASSERT(orig->cxt == cxt);
+ CLG_ASSERT(orig->rec_array);
+ CLG_ASSERT(cxt->fn[0]->separate_recursions > rec_index);
+ CLG_ASSERT(orig->rec_array[rec_index] ==0);
+
+ /* new BBCC will only have differing recursion level */
+ new->rec_index = rec_index;
+ new->cxt = cxt;
+ new->rec_array = orig->rec_array;
+ new->rec_array[rec_index] = new;
+ }
+
+ /* update list of BBCCs for same BB */
+ new->next_bbcc = orig->bb->bbcc_list;
+ orig->bb->bbcc_list = new;
+
+
+ CLG_DEBUGIF(3)
+ CLG_(print_bbcc)(-2, new, False);
+
+ CLG_DEBUG(2,"- clone_BBCC(%p, %d) for BB %p\n"
+ " orig %s\n"
+ " new %s\n",
+ orig, rec_index, bb_addr(orig->bb),
+ mangled_cxt(orig->cxt, orig->rec_index),
+ mangled_cxt(new->cxt, new->rec_index));
+
+ CLG_(stat).bbcc_clones++;
+
+ return new;
+}
+
+
+
+/* Get a pointer to the cost centre structure for a given basic block
+ * address. If created, the BBCC is inserted into the BBCC hash.
+ * Also sets BB_seen_before by reference.
+ *
+ */
+BBCC* CLG_(get_bbcc)(BB* bb)
+{
+ BBCC* bbcc;
+
+ CLG_DEBUG(3, "+ get_bbcc(BB %p)\n", bb_addr(bb));
+
+ bbcc = bb->bbcc_list;
+
+ if (!bbcc) {
+ bbcc = new_bbcc(bb);
+
+ /* initialize BBCC */
+ bbcc->cxt = 0;
+ bbcc->rec_array = 0;
+ bbcc->rec_index = 0;
+
+ bbcc->next_bbcc = bb->bbcc_list;
+ bb->bbcc_list = bbcc;
+ bb->last_bbcc = bbcc;
+
+ CLG_DEBUGIF(3)
+ CLG_(print_bbcc)(-2, bbcc, False);
+ }
+
+ CLG_DEBUG(3, "- get_bbcc(BB %p): BBCC %p\n",
+ bb_addr(bb), bbcc);
+
+ return bbcc;
+}
+
+
+/* Callgrind manages its own call stack for each thread.
+ * When leaving a function, an underflow can happen when
+ * Callgrind's tracing was switched on in the middle of
+ * a run, i.e. when Callgrind was not able to trace the
+ * call instruction.
+ * This function tries to reconstruct the original call.
+ * As we know the return address (the address following
+ * the CALL instruction), we can detect the function
+ * we return back to, but the original call site is unknown.
+ * We suppose a call site at return address - 1.
+ * (TODO: other heuristic: lookup info of instrumented BBs).
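+ * Example: collection is switched on while bar() is running, after
+ * it was called from foo(); the CALL itself was never traced. On
+ * bar()'s RET we know the return address RA (the first address after
+ * the CALL in foo), fabricate a source BB at RA-1 inside foo(), and
+ * push a synthetic foo() -> bar() call, so costs stay attributed to
+ * the right caller although the real call site is unknown.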
+ */
+static void handleUnderflow(BB* bb)
+{
+ /* RET at top of call stack */
+ BBCC* source_bbcc;
+ BB* source_bb;
+ jCC* jcc;
+ Bool seen_before;
+ fn_node* caller;
+ int fn_number, *pactive;
+ call_entry* call_entry_up;
+
+ CLG_DEBUG(1," Callstack underflow !\n");
+
+ /* we emulate an old call from the function we return to
+ * by using (<return address> -1) */
+ source_bb = CLG_(get_bb)(bb_addr(bb)-1, 0, &seen_before);
+ source_bbcc = CLG_(get_bbcc)(source_bb);
+
+ /* seen_before can be true if RET from a signal handler */
+ if (!seen_before) {
+ source_bbcc->ecounter_sum = CLG_(current_state).collect ? 1 : 0;
+ }
+ else if (CLG_(current_state).collect)
+ source_bbcc->ecounter_sum++;
+
+ /* Force a new top context, will be set active by push_cxt() */
+ CLG_(current_fn_stack).top--;
+ CLG_(current_state).cxt = 0;
+ caller = CLG_(get_fn_node)(bb);
+ CLG_(push_cxt)( caller );
+
+ if (!seen_before) {
+ /* set rec array for source BBCC: this is at rec level 1 */
+ source_bbcc->rec_array = new_recursion(caller->separate_recursions);
+ source_bbcc->rec_array[0] = source_bbcc;
+
+ CLG_ASSERT(source_bbcc->cxt == 0);
+ source_bbcc->cxt = CLG_(current_state).cxt;
+ insert_bbcc_into_hash(source_bbcc);
+ }
+ CLG_ASSERT(CLG_(current_state).bbcc);
+
+ /* correct active counts */
+ fn_number = CLG_(current_state).bbcc->cxt->fn[0]->number;
+ pactive = CLG_(get_fn_entry)(fn_number);
+ (*pactive)--;
+
+ /* This assertion is not correct for reentrant
+ * signal handlers */
+ /* CLG_ASSERT(*pactive == 0); */
+
+ CLG_(current_state).nonskipped = 0; /* we didn't skip this function */
+ /* back to current context */
+ CLG_(push_cxt)( CLG_(current_state).bbcc->cxt->fn[0] );
+ CLG_(push_call_stack)(source_bbcc, 0, CLG_(current_state).bbcc,
+ (Addr)-1, False);
+ call_entry_up =
+ &(CLG_(current_call_stack).entry[CLG_(current_call_stack).sp -1]);
+ jcc = call_entry_up->jcc;
+ /* assume this call is lasting since last dump or
+ * for a signal handler since its call */
+ if (CLG_(current_state).sig == 0)
+ CLG_(copy_cost)( CLG_(sets).full, call_entry_up->enter_cost,
+ CLG_(get_current_thread)()->lastdump_cost );
+ else
+ CLG_(zero_cost)( CLG_(sets).full, call_entry_up->enter_cost );
+}
+
+
+/*
+ * Helper function called at start of each instrumented BB to set up
+ * the pointer to costs for current thread/context/recursion level
+ */
+
+VG_REGPARM(1)
+void CLG_(setup_bbcc)(BB* bb)
+{
+ BBCC *bbcc, *last_bbcc;
+ Bool call_emulation = False, delayed_push = False, skip = False;
+ Addr sp;
+ BB* last_bb;
+ ThreadId tid;
+ Int jmpkind, passed = 0, csp;
+ Bool ret_without_call = False;
+ Int popcount_on_return = 1;
+
+ CLG_DEBUG(3,"+ setup_bbcc(BB %p)\n", bb_addr(bb));
+
+ /* This is needed because thread switches cannot reliably be tracked
+ * with callback CLG_(run_thread) only: we have otherwise no way to get
+ * the thread ID after a signal handler returns.
+ * This could be removed again if that bug is fixed in Valgrind.
+ * This is in the hot path but hopefully not too costly.
+ */
+ tid = VG_(get_running_tid)();
+#if 1
+ CLG_(switch_thread)(tid);
+#else
+ CLG_ASSERT(VG_(get_running_tid)() == CLG_(current_tid));
+#endif
+
+ sp = VG_(get_SP)(tid);
+ last_bbcc = CLG_(current_state).bbcc;
+ last_bb = last_bbcc ?
last_bbcc->bb : 0; + + if (last_bb) { + passed = CLG_(current_state).jmps_passed; + if (passed == last_bb->cjmp_count) { + jmpkind = last_bb->jmpkind; + + /* VEX always gives a Boring jump kind also when passed trough */ + if ((jmpkind == Ijk_Boring) && + (last_bb->offset + last_bb->instr_len == bb->offset)) + jmpkind = JmpNone; + } + else + jmpkind = JmpCond; + + /* if we are in a function which is skipped in the call graph, we + * do not increment the exe counter to produce cost (if simulation off), + * which would lead to dumping this BB to be skipped + */ + if (CLG_(current_state).collect && !CLG_(current_state).nonskipped) { + last_bbcc->ecounter_sum++; + last_bbcc->jmp[passed].ecounter++; + if (!CLG_(clo).simulate_cache) { + /* update Ir cost */ + int instr_count = last_bb->jmp[passed].instr+1; + CLG_(current_state).cost[CLG_(sets).off_sim_Ir] += instr_count; + } + } + + CLG_DEBUGIF(4) { + CLG_(print_execstate)(-2, &CLG_(current_state) ); + CLG_(print_bbcc_cost)(-2, last_bbcc); + } + } + else { + jmpkind = JmpNone; + } + + /* Manipulate JmpKind if needed, only using BB specific info */ + + csp = CLG_(current_call_stack).sp; + + /* A return not matching the top call in our callstack is a jump */ + if ( (jmpkind == Ijk_Ret) && (csp >0)) { + Int csp_up = csp-1; + call_entry* top_ce = &(CLG_(current_call_stack).entry[csp_up]); + + /* We have a real return if + * - the stack pointer (SP) left the current stack frame, or + * - SP has the same value as when reaching the current function + * and the address of this BB is the return address of last call + * (we even allow to leave multiple frames if the SP stays the + * same and we find a matching return address) + * The latter condition is needed because on PPC, SP can stay + * the same over CALL=b(c)l / RET=b(c)lr boundaries + */ + if (sp < top_ce->sp) popcount_on_return = 0; + else if (top_ce->sp == sp) { + while(1) { + if (top_ce->ret_addr == bb_addr(bb)) break; + if (csp_up>0) { + csp_up--; + top_ce = &(CLG_(current_call_stack).entry[csp_up]); + if (top_ce->sp == sp) { + popcount_on_return++; + continue; + } + } + popcount_on_return = 0; + break; + } + } + if (popcount_on_return == 0) { + jmpkind = Ijk_Boring; + ret_without_call = True; + } + } + + /* Should this jump be converted to call or pop/call ? */ + if (( jmpkind != Ijk_Ret) && + ( jmpkind != Ijk_Call) && last_bb) { + + /* We simulate a JMP/Cont to be a CALL if + * - jump is in another ELF object or section kind + * - jump is to first instruction of a function (tail recursion) + */ + if (ret_without_call || + /* This is for detection of optimized tail recursion. + * On PPC, this is only detected as call when going to another + * function. The problem is that on PPC it can go wrong + * more easily (no stack frame setup needed) + */ +#if defined(VGA_ppc32) + (bb->is_entry && (last_bb->fn != bb->fn)) || +#else + bb->is_entry || +#endif + (last_bb->sect_kind != bb->sect_kind) || + (last_bb->obj->number != bb->obj->number)) { + + CLG_DEBUG(1," JMP: %s[%s] to %s[%s]%s!\n", + last_bb->fn->name, last_bb->obj->name, + bb->fn->name, bb->obj->name, + ret_without_call?" 
(RET w/o CALL)":""); + + if (CLG_(get_fn_node)(last_bb)->pop_on_jump && (csp>0)) { + + call_entry* top_ce = &(CLG_(current_call_stack).entry[csp-1]); + + if (top_ce->jcc) { + + CLG_DEBUG(1," Pop on Jump!\n"); + + /* change source for delayed push */ + CLG_(current_state).bbcc = top_ce->jcc->from; + sp = top_ce->sp; + CLG_(pop_call_stack)(); + } + else { + CLG_ASSERT(CLG_(current_state).nonskipped != 0); + } + } + + jmpkind = Ijk_Call; + call_emulation = True; + } + } + + if (jmpkind == Ijk_Call) + skip = CLG_(get_fn_node)(bb)->skip; + + CLG_DEBUGIF(1) { + if (jmpkind == JmpCond) + VG_(printf)("Conditional"); + else if (jmpkind == JmpNone) + VG_(printf)("None"); + else + ppIRJumpKind( jmpkind ); + + VG_(printf)(" %08x -> %08x, SP %08x\n", + last_bb ? bb_jmpaddr(last_bb) : 0, + bb_addr(bb), sp); + } + + /* Handle CALL/RET and update context to get correct BBCC */ + + if (jmpkind == Ijk_Ret) { + + if ((csp == 0) || + ((CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom) && + ( *(CLG_(current_fn_stack).top-1)==0)) ) { + + /* On an empty call stack or at a signal separation marker, + * a RETURN generates an call stack underflow. + */ + handleUnderflow(bb); + CLG_(pop_call_stack)(); + } + else { + CLG_ASSERT(popcount_on_return >0); + CLG_(unwind_call_stack)(sp, popcount_on_return); + } + } + else { + CLG_(unwind_call_stack)(sp, 0); + + if (jmpkind == Ijk_Call) { + delayed_push = True; + + csp = CLG_(current_call_stack).sp; + if (call_emulation && csp>0) + sp = CLG_(current_call_stack).entry[csp-1].sp; + + } + } + + /* Change new context if needed, taking delayed_push into account */ + if ((delayed_push && !skip) || (CLG_(current_state).cxt == 0)) { + CLG_(push_cxt)(CLG_(get_fn_node)(bb)); + } + CLG_ASSERT(CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom); + + /* If there is a fresh instrumented BBCC, assign current context */ + bbcc = CLG_(get_bbcc)(bb); + if (bbcc->cxt == 0) { + CLG_ASSERT(bbcc->rec_array == 0); + + bbcc->cxt = CLG_(current_state).cxt; + bbcc->rec_array = + new_recursion((*CLG_(current_fn_stack).top)->separate_recursions); + bbcc->rec_array[0] = bbcc; + + insert_bbcc_into_hash(bbcc); + } + else { + /* get BBCC with current context */ + + /* first check LRU of last bbcc executed */ + + if (last_bbcc) { + bbcc = last_bbcc->lru_next_bbcc; + if (bbcc && + ((bbcc->bb != bb) || + (bbcc->cxt != CLG_(current_state).cxt))) + bbcc = 0; + } + else + bbcc = 0; + + if (!bbcc) + bbcc = lookup_bbcc(bb, CLG_(current_state).cxt); + if (!bbcc) + bbcc = clone_bbcc(bb->bbcc_list, CLG_(current_state).cxt, 0); + + bb->last_bbcc = bbcc; + } + + /* save for fast lookup */ + if (last_bbcc) + last_bbcc->lru_next_bbcc = bbcc; + + if ((*CLG_(current_fn_stack).top)->separate_recursions >1) { + UInt level, idx; + fn_node* top = *(CLG_(current_fn_stack).top); + + level = *CLG_(get_fn_entry)(top->number); + + if (delayed_push && !skip) { + if (CLG_(clo).skip_direct_recursion) { + /* do not increment rec. 
level if called from + * same function */ + if (!CLG_(current_state).bbcc || + (CLG_(current_state).bbcc->cxt->fn[0] != bbcc->cxt->fn[0])) + level++; + } + else level++; + } + if (level> top->separate_recursions) + level = top->separate_recursions; + + if (level == 0) { + /* can only happen if instrumentation just was switched on */ + level = 1; + *CLG_(get_fn_entry)(top->number) = 1; + } + + idx = level -1; + if (bbcc->rec_array[idx]) + bbcc = bbcc->rec_array[idx]; + else + bbcc = clone_bbcc(bbcc, CLG_(current_state).cxt, idx); + + CLG_ASSERT(bbcc->rec_array[bbcc->rec_index] == bbcc); + } + + if (delayed_push) { + if (!skip && CLG_(current_state).nonskipped) { + /* a call from skipped to nonskipped */ + CLG_(current_state).bbcc = CLG_(current_state).nonskipped; + } + CLG_(push_call_stack)(CLG_(current_state).bbcc, passed, + bbcc, sp, skip); + } + + if (CLG_(clo).collect_jumps && + ((jmpkind == JmpCond) || (jmpkind == Ijk_Boring))) { + + /* Handle conditional jumps followed, i.e. trace arcs + * This uses JCC structures, too */ + + jCC* jcc = CLG_(get_jcc)(last_bbcc, passed, bbcc); + CLG_ASSERT(jcc != 0); + // Change from default, and check if already changed + if (jcc->jmpkind == Ijk_Call) + jcc->jmpkind = jmpkind; + else { + // FIXME: Why can this fail? + // CLG_ASSERT(jcc->jmpkind == jmpkind); + } + + jcc->call_counter++; + if (jmpkind == JmpCond) + CLG_(stat).jcnd_counter++; + else + CLG_(stat).jump_counter++; + } + + CLG_(current_state).bbcc = bbcc; + + CLG_DEBUGIF(1) { + VG_(printf)(" "); + CLG_(print_bbcc_fn)(bbcc); + VG_(printf)("\n"); + } + + CLG_DEBUG(3,"- setup_bbcc (BB %p): Cost %p (Len %d), Instrs %d (Len %d)\n", + bb_addr(bb), bbcc->cost, bb->cost_count, + bb->instr_count, bb->instr_len); + CLG_DEBUGIF(3) + CLG_(print_cxt)(-8, CLG_(current_state).cxt, bbcc->rec_index); + CLG_DEBUG(3,"\n"); + + (*CLG_(cachesim).after_bbsetup)(); + + CLG_(stat).bb_executions++; +} diff --git a/callgrind/callgrind.h b/callgrind/callgrind.h new file mode 100644 index 0000000000..c153dbd2d2 --- /dev/null +++ b/callgrind/callgrind.h @@ -0,0 +1,130 @@ + +/* + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (callgrind.h) only. The entire rest of Valgrind is licensed + under the terms of the GNU General Public License, version 2. See + the COPYING file in the source distribution for details. + + ---------------------------------------------------------------- + + This file is part of callgrind, a valgrind skin for cache simulation + and call tree tracing. + + Copyright (C) 2003,2004 Josef Weidendorfer. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (vgprof.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 2. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + +#ifndef __CALLGRIND_H +#define __CALLGRIND_H + +#include "valgrind.h" + +typedef + enum { + VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C','T'), + VG_USERREQ__ZERO_STATS, + VG_USERREQ__TOGGLE_COLLECT, + VG_USERREQ__DUMP_STATS_AT, + VG_USERREQ__START_INSTRUMENTATION, + VG_USERREQ__STOP_INSTRUMENTATION + } Vg_CalltreeClientRequest; + +/* Dump current state of cost centers. + This will also atomically zero the cost centers */ +#define CALLGRIND_DUMP_STATS() \ + do { \ + unsigned int _qzz_res; \ + VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0, VG_USERREQ__DUMP_STATS, \ + 0, 0, 0, 0); \ + (void)0; \ + } while(0) + +/* Dump current state of cost centers. + This will also atomically zero the cost centers */ +#define CALLGRIND_DUMP_STATS_AT(pos_str) \ + do { \ + unsigned int _qzz_res; \ + VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0, VG_USERREQ__DUMP_STATS_AT, \ + pos_str, 0, 0, 0); \ + (void)0; \ + } while(0) + +/* Zero cost centers */ +#define CALLGRIND_ZERO_STATS() \ + do { \ + unsigned int _qzz_res; \ + VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0, VG_USERREQ__ZERO_STATS, \ + 0, 0, 0, 0); \ + (void)0; \ + } while(0) + +/* Toggle collection state, + * i.e. if events happening are collected into cost centers */ +#define CALLGRIND_TOGGLE_COLLECT() \ + do { \ + unsigned int _qzz_res; \ + VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0, VG_USERREQ__TOGGLE_COLLECT, \ + 0, 0, 0, 0); \ + (void)0; \ + } while(0) + +/* Start instrumentation if not already on */ +#define CALLGRIND_START_INSTRUMENTATION() \ + do { \ + unsigned int _qzz_res; \ + VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0, VG_USERREQ__START_INSTRUMENTATION,\ + 0, 0, 0, 0); \ + (void)0; \ + } while(0) + +/* Stop instrumentation if not already off */ +#define CALLGRIND_STOP_INSTRUMENTATION() \ + do { \ + unsigned int _qzz_res; \ + VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0, VG_USERREQ__STOP_INSTRUMENTATION,\ + 0, 0, 0, 0); \ + (void)0; \ + } while(0) + +#endif /* __CALLGRIND_H */ diff --git a/callgrind/callgrind_annotate.in b/callgrind/callgrind_annotate.in new file mode 100644 index 0000000000..6d36f0602b --- /dev/null +++ b/callgrind/callgrind_annotate.in @@ -0,0 +1,1191 @@ +#! /usr/bin/perl -w +##--------------------------------------------------------------------## +##--- The cache simulation framework: instrumentation, recording ---## +##--- and results printing. 
---## +##--- callgrind_annotate ---## +##--------------------------------------------------------------------## + +# This file is part of Callgrind, a cache-simulator and call graph +# tracer built on Valgrind. +# +# Copyright (C) 2003 Josef Weidendorfer +# Josef.Weidendorfer@gmx.de +# +# This file is based heavily on vg_annotate, part of Valgrind. +# Copyright (C) 2002 Nicholas Nethercote +# njn25@cam.ac.uk +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +# 02111-1307, USA. +# +# The GNU General Public License is contained in the file COPYING. + +#---------------------------------------------------------------------------- +# Annotator for cachegrind/callgrind. +# +# File format is described in /docs/techdocs.html. +# +# Performance improvements record, using cachegrind.out for cacheprof, doing no +# source annotation (irrelevant ones removed): +# user time +# 1. turned off warnings in add_hash_a_to_b() 3.81 --> 3.48s +# [now add_array_a_to_b()] +# 6. make line_to_CC() return a ref instead of a hash 3.01 --> 2.77s +# +#10. changed file format to avoid file/fn name repetition 2.40s +# (not sure why higher; maybe due to new '.' entries?) +#11. changed file format to drop unnecessary end-line "."s 2.36s +# (shrunk file by about 37%) +#12. switched from hash CCs to array CCs 1.61s +#13. only adding b[i] to a[i] if b[i] defined (was doing it if +# either a[i] or b[i] was defined, but if b[i] was undefined +# it just added 0) 1.48s +#14. Stopped converting "." entries to undef and then back 1.16s +#15. Using foreach $i (x..y) instead of for ($i = 0...) in +# add_array_a_to_b() 1.11s +# +# Auto-annotating primes: +#16. Finding count lengths by int((length-1)/3), not by +# commifying (halves the number of commify calls) 1.68s --> 1.47s + +use strict; + +#---------------------------------------------------------------------------- +# Overview: the running example in the comments is for: +# - events = A,B,C,D +# - --show=C,A,D +# - --sort=D,C +#---------------------------------------------------------------------------- + +#---------------------------------------------------------------------------- +# Global variables, main data structures +#---------------------------------------------------------------------------- +# CCs are arrays, the counts corresponding to @events, with 'undef' +# representing '.'. This makes things fast (faster than using hashes for CCs) +# but we have to use @sort_order and @show_order below to handle the --sort and +# --show options, which is a bit tricky. +#---------------------------------------------------------------------------- + +# Total counts for summary (an array reference). +my $summary_CC; + +# Totals for each function, for overall summary. +# hash(filename:fn_name => CC array) +my %fn_totals; + +# Individual CCs, organised by filename and line_num for easy annotation. 
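+# (e.g. $all_ind_CCs{"foo.c"}{42} holds the CC array for line 42 of
+# foo.c)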
+# hash(filename => hash(line_num => CC array))
+my %all_ind_CCs;
+
+# Files chosen for annotation on the command line.
+# key = basename (trimmed of any directory), value = full filename
+my %user_ann_files;
+
+# Generic description string.
+my $desc = "";
+
+# Command line of profiled program.
+my $cmd = "";
+
+# Info on the profiled process.
+my $pid = "";
+my $part = "";
+my $thread = "";
+
+# Positions used for cost lines; default: line numbers
+my $has_line = 1;
+my $has_addr = 0;
+
+# Events in input file, eg. (A,B,C,D)
+my @events;
+my $events;
+
+# Events to show, from command line, eg. (C,A,D)
+my @show_events;
+
+# Map from @show_events indices to @events indices, eg. (2,0,3). Gives the
+# order in which we must traverse @events in order to show the @show_events,
+# eg. (@events[$show_order[1]], @events[$show_order[2]]...) = @show_events.
+# (Might help to think of it like a hash (0 => 2, 1 => 0, 2 => 3).)
+my @show_order;
+
+# Print out the function totals sorted by these events, eg. (D,C).
+my @sort_events;
+
+# Map from @sort_events indices to @events indices, eg. (3,2). Same idea as
+# for @show_order.
+my @sort_order;
+
+# Thresholds, one for each sort event (or default to 1 if no sort events
+# specified). We print out functions and do auto-annotations until we've
+# handled this proportion of all the events thresholded.
+my @thresholds;
+
+my $default_threshold = 99;
+
+my $single_threshold = $default_threshold;
+
+# If on, automatically annotates all files that are involved in getting over
+# all the threshold counts.
+my $auto_annotate = 0;
+
+# Number of lines to show around each annotated line.
+my $context = 8;
+
+# Directories in which to look for annotation files.
+my @include_dirs = ("");
+
+# Verbose mode
+my $verbose = "1";
+
+# Inclusive statistics (with subroutine events)
+my $inclusive = 0;
+
+# Inclusive totals for each function, for overall summary.
+# hash(filename:fn_name => CC array)
+my %cfn_totals;
+
+# hash( file:func => [ called file:func ])
+my $called_funcs;
+
+# hash( file:func => [ calling file:func ])
+my $calling_funcs;
+
+# hash( file:func,line => [called file:func ])
+my $called_from_line;
+
+# hash( file:func,line => file:func )
+my %func_of_line;
+
+# hash (file:func => object name)
+my %obj_name;
+
+# Print out the callers of a function
+my $tree_caller = 0;
+
+# Print out the called functions
+my $tree_calling = 0;
+
+# hash( file:func,cfile:cfunc => call CC[])
+my %call_CCs;
+
+# hash( file:func,cfile:cfunc => call counter)
+my %call_counter;
+
+# hash(context, index) => realname for compressed traces
+my %compressed;
+
+# Input file name, will be set in process_cmd_line
+my $input_file = "";
+
+# Version number
+my $version = "@VERSION@";
+
+# Usage message.
+my $usage = <<END
+usage: callgrind_annotate [options] [callgrind-out-file [source-files]]
+
+  options for the user, with defaults in [ ], are:
+    -h --help            show this message
+    -v --version         show version
+    --show=A,B,C         only show figures for events A,B,C [all]
+    --sort=A,B,C         sort columns by events A,B,C [event column order]
+    --threshold=<0--100> percentage of counts (of primary sort event) we
+                         are interested in [$default_threshold%]
+    --auto=yes|no        annotate all source files containing functions
+                         that helped reach the event count threshold [no]
+    --context=N          print N lines of context before and after
+                         annotated lines [8]
+    --inclusive=yes|no   add subroutine costs to function calls [no]
+    --tree=none|caller|  print for each function their callers,
+          calling|both   the called functions or both [none]
+    -I --include=<dir>   add <dir> to list of directories to search for
+                         source files
+
+END
+;
+
+# Used in various places of output.
+my $fancy = '-' x 80 . "\n";
"\n"; + +#----------------------------------------------------------------------------- +# Argument and option handling +#----------------------------------------------------------------------------- +sub process_cmd_line() +{ + for my $arg (@ARGV) { + + # Option handling + if ($arg =~ /^-/) { + + # --version + if ($arg =~ /^-v$|^--version$/) { + die("callgrind_annotate-$version\n"); + + # --show=A,B,C + } elsif ($arg =~ /^--show=(.*)$/) { + @show_events = split(/,/, $1); + + # --sort=A,B,C + } elsif ($arg =~ /^--sort=(.*)$/) { + @sort_events = split(/,/, $1); + foreach my $i (0 .. scalar @sort_events - 1) { + if ($sort_events[$i] =~#/.*:(\d+)$/) { + /.*:([\d\.]+)%?$/) { + my $th = $1; + ($th >= 0 && $th <= 100) or die($usage); + $sort_events[$i] =~ s/:.*//; + $thresholds[$i] = $th; + } else { + $thresholds[$i] = 0; + } + } + + # --threshold=X (tolerates a trailing '%') + } elsif ($arg =~ /^--threshold=([\d\.]+)%?$/) { + $single_threshold = $1; + ($1 >= 0 && $1 <= 100) or die($usage); + + # --auto=yes|no + } elsif ($arg =~ /^--auto=(yes|no)$/) { + $auto_annotate = 1 if ($1 eq "yes"); + $auto_annotate = 0 if ($1 eq "no"); + + # --context=N + } elsif ($arg =~ /^--context=([\d\.]+)$/) { + $context = $1; + if ($context < 0) { + die($usage); + } + + # --inclusive=yes|no + } elsif ($arg =~ /^--inclusive=(yes|no)$/) { + $inclusive = 1 if ($1 eq "yes"); + $inclusive = 0 if ($1 eq "no"); + + # --tree=none|caller|calling|both + } elsif ($arg =~ /^--tree=(none|caller|calling|both)$/) { + $tree_caller = 1 if ($1 eq "caller" || $1 eq "both"); + $tree_calling = 1 if ($1 eq "calling" || $1 eq "both"); + + # --include=A,B,C + } elsif ($arg =~ /^(-I|--include)=(.*)$/) { + my $inc = $2; + $inc =~ s|/$||; # trim trailing '/' + push(@include_dirs, "$inc/"); + + } else { # -h and --help fall under this case + die($usage); + } + + # Argument handling -- annotation file checking and selection. + # Stick filenames into a hash for quick 'n easy lookup throughout + } else { + if ($input_file eq "") { + $input_file = $arg; + } + else { + my $readable = 0; + foreach my $include_dir (@include_dirs) { + if (-r $include_dir . $arg) { + $readable = 1; + } + } + $readable or die("File $arg not found in any of: @include_dirs\n"); + $user_ann_files{$arg} = 1; + } + } + } + + if ($input_file eq "") { + $input_file = ()[0]; + if (!defined $input_file) { + $input_file = "cachegrind.out"; + } + print "Reading data from '$input_file'...\n"; + } +} + +#----------------------------------------------------------------------------- +# Reading of input file +#----------------------------------------------------------------------------- +sub max ($$) +{ + my ($x, $y) = @_; + return ($x > $y ? $x : $y); +} + +# Add the two arrays; any '.' entries are ignored. Two tricky things: +# 1. If $a2->[$i] is undefined, it defaults to 0 which is what we want; we turn +# off warnings to allow this. This makes things about 10% faster than +# checking for definedness ourselves. +# 2. We don't add an undefined count or a ".", even though it's value is 0, +# because we don't want to make an $a2->[$i] that is undef become 0 +# unnecessarily. +sub add_array_a_to_b ($$) +{ + my ($a1, $a2) = @_; + + my $n = max(scalar @$a1, scalar @$a2); + $^W = 0; + foreach my $i (0 .. $n-1) { + $a2->[$i] += $a1->[$i] if (defined $a1->[$i] && "." ne $a1->[$i]); + } + $^W = 1; +} + +# Add each event count to the CC array. '.' counts become undef, as do +# missing entries (implicitly). 
+sub line_to_CC ($)
+{
+ my @CC = (split /\s+/, $_[0]);
+ (@CC <= @events) or die("Line $.: too many event counts\n");
+ return \@CC;
+}
+
+sub uncompressed_name($$)
+{
+ my ($context, $name) = @_;
+
+ if ($name =~ /^\((\d+)\)\s*(.*)$/) {
+ my $index = $1;
+ my $realname = $2;
+
+ if ($realname eq "") {
+ $realname = $compressed{$context,$index};
+ }
+ else {
+ $compressed{$context,$index} = $realname;
+ }
+ return $realname;
+ }
+ return $name;
+}
+
+sub read_input_file()
+{
+ open(INPUTFILE, "< $input_file") || die "File $input_file not opened\n";
+
+ my $line;
+
+ # Read header
+ while(<INPUTFILE>) {
+
+ # remove comments
+ s/#.*$//;
+
+ if (/^$/) { ; }
+
+ elsif (/^version:\s*(\d+)/) {
+ # Can't read format with major version > 1
+ ($1<2) or die("Can't read format with major version $1.\n");
+ }
+
+ elsif (/^pid:\s+(.*)$/) { $pid = $1; }
+ elsif (/^thread:\s+(.*)$/) { $thread = $1; }
+ elsif (/^part:\s+(.*)$/) { $part = $1; }
+ elsif (/^desc:\s+(.*)$/) {
+ my $dline = $1;
+ # suppress profile options in description output
+ if ($dline =~ /^Option:/) {;}
+ else { $desc .= "$dline\n"; }
+ }
+ elsif (/^cmd:\s+(.*)$/) { $cmd = $1; }
+ elsif (/^positions:\s+(.*)$/) {
+ my $positions = $1;
+ $has_line = ($positions =~ /line/);
+ $has_addr = ($positions =~ /(addr|instr)/);
+ }
+ elsif (/^events:\s+(.*)$/) {
+ $events = $1;
+
+ # events line is last in header
+ last;
+ }
+ else {
+ warn("WARNING: header line $. malformed, ignoring\n");
+ if ($verbose) { chomp; warn(" line: '$_'\n"); }
+ }
+ }
+
+ # Check for needed header entries
+ ($cmd ne "") or die("Line $.: missing command line\n");
+
+ # Read "events:" line. We make a temporary hash in which the Nth event's
+ # value is N, which is useful for handling --show/--sort options below.
+ ($events ne "") or die("Line $.: missing events line\n");
+ @events = split(/\s+/, $events);
+ my %events;
+ my $n = 0;
+ foreach my $event (@events) {
+ $events{$event} = $n;
+ $n++
+ }
+
+ # If no --show arg given, default to showing all events in the file.
+ # If --show option is used, check all specified events appeared in the
+ # "events:" line. Then initialise @show_order.
+ if (@show_events) {
+ foreach my $show_event (@show_events) {
+ (defined $events{$show_event}) or
+ die("--show event `$show_event' did not appear in input\n");
+ }
+ } else {
+ @show_events = @events;
+ }
+ foreach my $show_event (@show_events) {
+ push(@show_order, $events{$show_event});
+ }
+
+ # Do as for --show, but if no --sort arg given, default to sorting by
+ # column order (ie. first column event is primary sort key, 2nd column is
+ # 2ndary key, etc).
+ if (@sort_events) {
+ foreach my $sort_event (@sort_events) {
+ (defined $events{$sort_event}) or
+ die("--sort event `$sort_event' did not appear in input\n");
+ }
+ } else {
+ @sort_events = @events;
+ }
+ foreach my $sort_event (@sort_events) {
+ push(@sort_order, $events{$sort_event});
+ }
+
+ # If multiple threshold args weren't given via --sort, stick in the single
+ # threshold (either from --threshold if used, or the default otherwise) for
+ # the primary sort event, and 0% for the rest.
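+ # (For example, "--sort=D,C --threshold=95" gives @thresholds =
+ # (95, 0): function totals are printed until 95% of all D counts
+ # are covered, with no extra cutoff applied to C.)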
+ if (not @thresholds) {
+ foreach my $e (@sort_order) {
+ push(@thresholds, 0);
+ }
+ $thresholds[0] = $single_threshold;
+ }
+
+ my $curr_obj = "";
+ my $curr_file;
+ my $curr_fn;
+ my $curr_name;
+ my $curr_line_num = 0;
+
+ my $curr_cobj = "";
+ my $curr_cfile = "";
+ my $curr_cfunc = "";
+ my $curr_cname;
+ my $curr_call_counter = 0;
+ my $curr_cfn_CC = [];
+
+ my $curr_fn_CC = [];
+ my $curr_file_ind_CCs = {}; # hash(line_num => CC)
+
+ # Read body of input file.
+ while (<INPUTFILE>) {
+ s/#.*$//; # remove comments
+ s/^\+(\d+)/$curr_line_num+$1/e;
+ s/^\-(\d+)/$curr_line_num-$1/e;
+ s/^\*/$curr_line_num/e;
+ if (s/^(\d+|0x\w+)\s+//) {
+ $curr_line_num = $1;
+ if ($has_addr) {
+ if ($has_line) {
+ s/^\+(\d+)/$curr_line_num+$1/e;
+ s/^\-(\d+)/$curr_line_num-$1/e;
+ s/^\*/$curr_line_num/e;
+
+ if (s/^(\d+)\s+//) { $curr_line_num = $1; }
+ }
+ else { $curr_line_num = 0; }
+ }
+ my $CC = line_to_CC($_);
+
+ if ($curr_call_counter>0) {
+# print "Read ($curr_name => $curr_cname) $curr_call_counter\n";
+
+ if (defined $call_CCs{$curr_name,$curr_cname}) {
+ add_array_a_to_b($CC, $call_CCs{$curr_name,$curr_cname});
+ $call_counter{$curr_name,$curr_cname} += $curr_call_counter;
+ }
+ else {
+ $call_CCs{$curr_name,$curr_cname} = $CC;
+ $call_counter{$curr_name,$curr_cname} = $curr_call_counter;
+ }
+
+ my $tmp = $called_from_line->{$curr_file,$curr_line_num};
+ if (!defined $tmp) {
+ $func_of_line{$curr_file,$curr_line_num} = $curr_name;
+ }
+ $tmp = {} unless defined $tmp;
+ $$tmp{$curr_cname} = 1;
+ $called_from_line->{$curr_file,$curr_line_num} = $tmp;
+ $call_CCs{$curr_name,$curr_cname,$curr_line_num} = $CC;
+ $call_counter{$curr_name,$curr_cname,$curr_line_num} = $curr_call_counter;
+
+ $curr_call_counter = 0;
+
+ # inclusive costs
+ $curr_cfn_CC = $cfn_totals{$curr_cname};
+ $curr_cfn_CC = [] unless (defined $curr_cfn_CC);
+ add_array_a_to_b($CC, $curr_cfn_CC);
+ $cfn_totals{$curr_cname} = $curr_cfn_CC;
+
+ if ($inclusive) {
+ add_array_a_to_b($CC, $curr_fn_CC);
+ }
+ next;
+ }
+
+ add_array_a_to_b($CC, $curr_fn_CC);
+
+ # If curr_file is selected, add CC to curr_file list. We look for
+ # full filename matches; or, if auto-annotating, we have to
+ # remember everything -- we won't know until the end what's needed.
+ if ($auto_annotate || defined $user_ann_files{$curr_file}) { + my $tmp = $curr_file_ind_CCs->{$curr_line_num}; + $tmp = [] unless defined $tmp; + add_array_a_to_b($CC, $tmp); + $curr_file_ind_CCs->{$curr_line_num} = $tmp; + } + + } elsif (s/^fn=(.*)$//) { + # Commit result from previous function + $fn_totals{$curr_name} = $curr_fn_CC if (defined $curr_name); + + # Setup new one + $curr_fn = uncompressed_name("fn",$1); + $curr_name = "$curr_file:$curr_fn"; + $obj_name{$curr_name} = $curr_obj; + $curr_fn_CC = $fn_totals{$curr_name}; + $curr_fn_CC = [] unless (defined $curr_fn_CC); + + } elsif (s/^ob=(.*)$//) { + $curr_obj = uncompressed_name("ob",$1); + + } elsif (s/^fl=(.*)$//) { + $all_ind_CCs{$curr_file} = $curr_file_ind_CCs + if (defined $curr_file); + + $curr_file = uncompressed_name("fl",$1); + $curr_file_ind_CCs = $all_ind_CCs{$curr_file}; + $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs); + + } elsif (s/^(fi|fe)=(.*)$//) { + (defined $curr_name) or die("Line $.: Unexpected fi/fe line\n"); + $fn_totals{$curr_name} = $curr_fn_CC; + $all_ind_CCs{$curr_file} = $curr_file_ind_CCs; + + $curr_file = uncompressed_name("fl",$2); + $curr_name = "$curr_file:$curr_fn"; + $curr_file_ind_CCs = $all_ind_CCs{$curr_file}; + $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs); + $curr_fn_CC = $fn_totals{$curr_name}; + $curr_fn_CC = [] unless (defined $curr_fn_CC); + + } elsif (s/^\s*$//) { + # blank, do nothing + + } elsif (s/^cob=(.*)$//) { + $curr_cobj = uncompressed_name("ob",$1); + + } elsif (s/^cfi=(.*)$//) { + $curr_cfile = uncompressed_name("fl",$1); + + } elsif (s/^cfn=(.*)$//) { + $curr_cfunc = uncompressed_name("fn",$1); + if ($curr_cfile eq "") { + $curr_cname = "$curr_file:$curr_cfunc"; + } + else { + $curr_cname = "$curr_cfile:$curr_cfunc"; + $curr_cfile = ""; + } + + my $tmp = $calling_funcs->{$curr_cname}; + $tmp = {} unless defined $tmp; + $$tmp{$curr_name} = 1; + $calling_funcs->{$curr_cname} = $tmp; + + my $tmp2 = $called_funcs->{$curr_name}; + $tmp2 = {} unless defined $tmp2; + $$tmp2{$curr_cname} = 1; + $called_funcs->{$curr_name} = $tmp2; + + } elsif (s/^calls=(\d+)//) { + $curr_call_counter = $1; + + } elsif (s/^(jump|jcnd)=//) { + #ignore jump information + + } elsif (s/^totals:\s+//) { + #ignore + + } elsif (s/^summary:\s+//) { + $summary_CC = line_to_CC($_); + + } else { + warn("WARNING: line $. 
malformed, ignoring\n"); + if ($verbose) { chomp; warn(" line: '$_'\n"); } + } + } + + # Check if summary line was present + if (not defined $summary_CC) { + warn("WARNING: missing final summary line, no summary will be printed\n"); + } + else { + # Finish up handling final filename/fn_name counts + $fn_totals{"$curr_file:$curr_fn"} = $curr_fn_CC + if (defined $curr_file && defined $curr_fn); + $all_ind_CCs{$curr_file} = + $curr_file_ind_CCs if (defined $curr_file); + + (scalar(@$summary_CC) == @events) + or die("Line $.: summary event and total event mismatch\n"); + } + + # Correct inclusive totals + if ($inclusive) { + foreach my $name (keys %cfn_totals) { + $fn_totals{$name} = $cfn_totals{$name}; + } + } + + close(INPUTFILE); +} + +#----------------------------------------------------------------------------- +# Print options used +#----------------------------------------------------------------------------- +sub print_options () +{ + print($fancy); + print($desc); + my $target = $cmd; + if ($pid ne "") { + $target .= " (PID $pid"; + if ($part ne "") { $target .= ", part $part"; } + if ($thread ne "") { $target .= ", thread $thread"; } + $target .= ")"; + } + print("Profiled target: $target\n"); + print("Events recorded: @events\n"); + print("Events shown: @show_events\n"); + print("Event sort order: @sort_events\n"); + print("Thresholds: @thresholds\n"); + + my @include_dirs2 = @include_dirs; # copy @include_dirs + shift(@include_dirs2); # remove "" entry, which is always the first + unshift(@include_dirs2, "") if (0 == @include_dirs2); + my $include_dir = shift(@include_dirs2); + print("Include dirs: $include_dir\n"); + foreach my $include_dir (@include_dirs2) { + print(" $include_dir\n"); + } + + my @user_ann_files = keys %user_ann_files; + unshift(@user_ann_files, "") if (0 == @user_ann_files); + my $user_ann_file = shift(@user_ann_files); + print("User annotated: $user_ann_file\n"); + foreach $user_ann_file (@user_ann_files) { + print(" $user_ann_file\n"); + } + + my $is_on = ($auto_annotate ? "on" : "off"); + print("Auto-annotation: $is_on\n"); + print("\n"); +} + +#----------------------------------------------------------------------------- +# Print summary and sorted function totals +#----------------------------------------------------------------------------- +sub mycmp ($$) +{ + my ($c, $d) = @_; + + # Iterate through sort events (eg. 3,2); return result if two are different + foreach my $i (@sort_order) { + my ($x, $y); + $x = $c->[$i]; + $y = $d->[$i]; + $x = -1 unless defined $x; + $y = -1 unless defined $y; + + my $cmp = $y <=> $x; # reverse sort + if (0 != $cmp) { + return $cmp; + } + } + # Exhausted events, equal + return 0; +} + +sub commify ($) { + my ($val) = @_; + 1 while ($val =~ s/^(\d+)(\d{3})/$1,$2/); + return $val; +} + +# Because the counts can get very big, and we don't want to waste screen space +# and make lines too long, we compute exactly how wide each column needs to be +# by finding the widest entry for each one. +sub compute_CC_col_widths (@) +{ + my @CCs = @_; + my $CC_col_widths = []; + + # Initialise with minimum widths (from event names) + foreach my $event (@events) { + push(@$CC_col_widths, length($event)); + } + + # Find maximum width count for each column. @CC_col_width positions + # correspond to @CC positions. + foreach my $CC (@CCs) { + foreach my $i (0 .. 
scalar(@$CC)-1) { + if (defined $CC->[$i]) { + # Find length, accounting for commas that will be added + my $length = length $CC->[$i]; + my $clength = $length + int(($length - 1) / 3); + $CC_col_widths->[$i] = max($CC_col_widths->[$i], $clength); + } + } + } + return $CC_col_widths; +} + +# Print the CC with each column's size dictated by $CC_col_widths. +sub print_CC ($$) +{ + my ($CC, $CC_col_widths) = @_; + + foreach my $i (@show_order) { + my $count = (defined $CC->[$i] ? commify($CC->[$i]) : "."); + my $space = ' ' x ($CC_col_widths->[$i] - length($count)); + print("$space$count "); + } +} + +sub print_events ($) +{ + my ($CC_col_widths) = @_; + + foreach my $i (@show_order) { + my $event = $events[$i]; + my $event_width = length($event); + my $col_width = $CC_col_widths->[$i]; + my $space = ' ' x ($col_width - $event_width); + print("$space$event "); + } +} + +# Prints summary and function totals (with separate column widths, so that +# function names aren't pushed over unnecessarily by huge summary figures). +# Also returns a hash containing all the files that are involved in getting the +# events count above the thresholds (ie. all the interesting ones). +sub print_summary_and_fn_totals () +{ + my @fn_fullnames = keys %fn_totals; + + # Work out the size of each column for printing (summary and functions + # separately). + my $summary_CC_col_widths = compute_CC_col_widths($summary_CC); + my $fn_CC_col_widths = compute_CC_col_widths(values %fn_totals); + + # Header and counts for summary + print($fancy); + print_events($summary_CC_col_widths); + print("\n"); + print($fancy); + print_CC($summary_CC, $summary_CC_col_widths); + print(" PROGRAM TOTALS\n"); + print("\n"); + + # Header for functions + print($fancy); + print_events($fn_CC_col_widths); + print(" file:function\n"); + print($fancy); + + # Sort function names into order dictated by --sort option. + @fn_fullnames = sort { + mycmp($fn_totals{$a}, $fn_totals{$b}) + } @fn_fullnames; + + + # Assertion + (scalar @sort_order == scalar @thresholds) or + die("sort_order length != thresholds length:\n", + " @sort_order\n @thresholds\n"); + + my $threshold_files = {}; + # @curr_totals has the same shape as @sort_order and @thresholds + my @curr_totals = (); + foreach my $e (@thresholds) { + push(@curr_totals, 0); + } + + # Print functions, stopping when the threshold has been reached. + foreach my $fn_name (@fn_fullnames) { + + # Stop when we've reached all the thresholds + my $reached_all_thresholds = 1; + foreach my $i (0 .. scalar @thresholds - 1) { + my $prop = $curr_totals[$i] * 100; + if ($summary_CC->[$sort_order[$i]] >0) { + $prop = $prop / $summary_CC->[$sort_order[$i]]; + } + $reached_all_thresholds &= ($prop >= $thresholds[$i]); + } + last if $reached_all_thresholds; + + if ($tree_caller || $tree_calling) { print "\n"; } + + if ($tree_caller && ($fn_name ne "???:???")) { + # Print function callers + my $tmp1 = $calling_funcs->{$fn_name}; + if (defined $tmp1) { + foreach my $calling (keys %$tmp1) { + if (defined $call_counter{$calling,$fn_name}) { + print_CC($call_CCs{$calling,$fn_name}, $fn_CC_col_widths); + print" < $calling ("; + print $call_counter{$calling,$fn_name} . 
"x)"; + if (defined $obj_name{$calling}) { + print " [$obj_name{$calling}]"; + } + print "\n"; + } + } + } + } + + # Print function results + my $fn_CC = $fn_totals{$fn_name}; + print_CC($fn_CC, $fn_CC_col_widths); + if ($tree_caller || $tree_calling) { print " * "; } + print(" $fn_name"); + if (defined $obj_name{$fn_name}) { + print " [$obj_name{$fn_name}]"; + } + print "\n"; + + if ($tree_calling && ($fn_name ne "???:???")) { + # Print called functions + my $tmp2 = $called_funcs->{$fn_name}; + if (defined $tmp2) { + foreach my $called (keys %$tmp2) { + if (defined $call_counter{$fn_name,$called}) { + print_CC($call_CCs{$fn_name,$called}, $fn_CC_col_widths); + print" > $called ("; + print $call_counter{$fn_name,$called} . "x)"; + if (defined $obj_name{$called}) { + print " [$obj_name{$called}]"; + } + print "\n"; + } + } + } + } + + # Update the threshold counts + my $filename = $fn_name; + $filename =~ s/:.+$//; # remove function name + $threshold_files->{$filename} = 1; + foreach my $i (0 .. scalar @sort_order - 1) { + if ($inclusive) { + $curr_totals[$i] = $summary_CC->[$sort_order[$i]] - + $fn_CC->[$sort_order[$i]] + if (defined $fn_CC->[$sort_order[$i]]); + } else { + $curr_totals[$i] += $fn_CC->[$sort_order[$i]] + if (defined $fn_CC->[$sort_order[$i]]); + } + } + } + print("\n"); + + return $threshold_files; +} + +#----------------------------------------------------------------------------- +# Annotate selected files +#----------------------------------------------------------------------------- + +# Issue a warning that the source file is more recent than the input file. +sub warning_on_src_more_recent_than_inputfile ($) +{ + my $src_file = $_[0]; + + my $warning = <{"???"}; + %all_ann_files = (%user_ann_files, %$threshold_files) + } else { + %all_ann_files = %user_ann_files; + } + + # Track if we did any annotations. + my $did_annotations = 0; + + LOOP: + foreach my $src_file (keys %all_ann_files) { + + my $opened_file = ""; + my $full_file_name = ""; + foreach my $include_dir (@include_dirs) { + my $try_name = $include_dir . $src_file; + if (open(INPUTFILE, "< $try_name")) { + $opened_file = $try_name; + $full_file_name = ($include_dir eq "" + ? $src_file + : "$include_dir + $src_file"); + last; + } + } + + if (not $opened_file) { + # Failed to open the file. If chosen on the command line, die. + # If arose from auto-annotation, print a little message. + if (defined $user_ann_files{$src_file}) { + die("File $src_file not opened in any of: @include_dirs\n"); + + } else { + push(@unfound_auto_annotate_files, $src_file); + } + + } else { + # File header (distinguish between user- and auto-selected files). + print("$fancy"); + my $ann_type = + (defined $user_ann_files{$src_file} ? "User" : "Auto"); + print("-- $ann_type-annotated source: $full_file_name\n"); + print("$fancy"); + + # Get file's CCs + my $src_file_CCs = $all_ind_CCs{$src_file}; + if (!defined $src_file_CCs) { + print(" No information has been collected for $src_file\n\n"); + next LOOP; + } + + $did_annotations = 1; + + # Numeric, not lexicographic sort! 
+ my @line_nums = sort {$a <=> $b} keys %$src_file_CCs;
+
+ # If $src_file is more recent than the input file, issue warning
+ my $src_more_recent_than_inputfile = 0;
+ if ((stat $opened_file)[9] > (stat $input_file)[9]) {
+ $src_more_recent_than_inputfile = 1;
+ warning_on_src_more_recent_than_inputfile($src_file);
+ }
+
+ # Work out the size of each column for printing
+ my $CC_col_widths = compute_CC_col_widths(values %$src_file_CCs);
+
+ # Events header
+ print_events($CC_col_widths);
+ print("\n\n");
+
+ # Shift out 0 if it's in the line numbers (from unknown entries,
+ # likely due to bugs in Valgrind's stabs debug info reader)
+ shift(@line_nums) if (0 == $line_nums[0]);
+
+ # Finds interesting line ranges -- all lines with a CC, and all
+ # lines within $context lines of a line with a CC.
+ my $n = @line_nums;
+ my @pairs;
+ for (my $i = 0; $i < $n; $i++) {
+ push(@pairs, $line_nums[$i] - $context); # lower marker
+ while ($i < $n-1 &&
+ $line_nums[$i] + 2*$context >= $line_nums[$i+1]) {
+ $i++;
+ }
+ push(@pairs, $line_nums[$i] + $context); # upper marker
+ }
+
+ # Annotate chosen lines, tracking total counts of lines printed
+ $pairs[0] = 1 if ($pairs[0] < 1);
+ while (@pairs) {
+ my $low = shift @pairs;
+ my $high = shift @pairs;
+ while ($. < $low-1) {
+ my $tmp = <INPUTFILE>;
+ last unless (defined $tmp); # hack to detect EOF
+ }
+ my $src_line;
+ # Print line number, unless start of file
+ print("-- line $low " . '-' x 40 . "\n") if ($low != 1);
+ while (($. < $high) && ($src_line = <INPUTFILE>)) {
+ if (defined $line_nums[0] && $. == $line_nums[0]) {
+ print_CC($src_file_CCs->{$.}, $CC_col_widths);
+ add_array_a_to_b($src_file_CCs->{$.},
+ $printed_totals_CC);
+ shift(@line_nums);
+
+ } else {
+ print_CC( [], $CC_col_widths);
+ }
+
+ print(" $src_line");
+
+ my $tmp = $called_from_line->{$src_file,$.};
+ my $func = $func_of_line{$src_file,$.};
+ if (defined $tmp) {
+ foreach my $called (keys %$tmp) {
+ if (defined $call_CCs{$func,$called,$.}) {
+ print_CC($call_CCs{$func,$called,$.}, $CC_col_widths);
+ print " => $called (";
+ print $call_counter{$func,$called,$.} . "x)\n";
+ }
+ }
+ }
+ }
+ # Print line number, unless EOF
+ if ($src_line) {
+ print("-- line $high " . '-' x 40 . "\n");
+ } else {
+ last;
+ }
+ }
+
+ # If there was info on lines past the end of the file...
+ if (@line_nums) {
+ foreach my $line_num (@line_nums) {
+ print_CC($src_file_CCs->{$line_num}, $CC_col_widths);
+ print(" <bogus line $line_num>\n");
+ }
+ print("\n");
+ warning_on_nonexistent_lines($src_more_recent_than_inputfile,
+ $src_file, \@line_nums);
+ }
+ print("\n");
+
+ # Print summary of counts attributed to file but not to any
+ # particular line (due to incomplete debug info).
+ if ($src_file_CCs->{0}) {
+ print_CC($src_file_CCs->{0}, $CC_col_widths);
+ print(" <counts for unidentified lines in $src_file>\n\n");
+ }
+
+ close(INPUTFILE);
+ }
+ }
+
+ # Print list of unfound auto-annotate selected files.
+ if (@unfound_auto_annotate_files) {
+ print("$fancy");
+ print("The following files chosen for auto-annotation could not be found:\n");
+ print($fancy);
+ foreach my $f (@unfound_auto_annotate_files) {
+ print(" $f\n");
+ }
+ print("\n");
+ }
+
+ # If we did any annotating, print what proportion of events were covered by
+ # annotated lines above.
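+ # (Illustrative: if annotated lines account for 90,000 of 100,000
+ # total Ir, a "90" is printed in the Ir column below.)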
+ if ($did_annotations) { + my $percent_printed_CC; + foreach (my $i = 0; $i < @$summary_CC; $i++) { + $percent_printed_CC->[$i] = + sprintf("%.0f", + $printed_totals_CC->[$i] / $summary_CC->[$i] * 100); + } + my $pp_CC_col_widths = compute_CC_col_widths($percent_printed_CC); + print($fancy); + print_events($pp_CC_col_widths); + print("\n"); + print($fancy); + print_CC($percent_printed_CC, $pp_CC_col_widths); + print(" percentage of events annotated\n\n"); + } +} + +#---------------------------------------------------------------------------- +# "main()" +#---------------------------------------------------------------------------- +process_cmd_line(); +read_input_file(); +print_options(); +my $threshold_files = print_summary_and_fn_totals(); +annotate_ann_files($threshold_files); + +##--------------------------------------------------------------------## +##--- end vg_annotate.in ---## +##--------------------------------------------------------------------## + + diff --git a/callgrind/callgrind_control.in b/callgrind/callgrind_control.in new file mode 100644 index 0000000000..869c9b3af3 --- /dev/null +++ b/callgrind/callgrind_control.in @@ -0,0 +1,485 @@ +#! /usr/bin/perl -w +##--------------------------------------------------------------------## +##--- Control supervision of applications run with callgrind ---## +##--- callgrind_control ---## +##--------------------------------------------------------------------## + +# This file is part of Callgrind, a cache-simulator and call graph +# tracer built on Valgrind. +# +# Copyright (C) 2003,2004,2005 Josef Weidendorfer +# Josef.Weidendorfer@gmx.de +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +# 02111-1307, USA. 
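+
+# A running callgrind instance advertises itself through an info file
+# (see the info-file code in command.c), a simple "key: value" list.
+# The file names and values below are purely illustrative:
+#
+#   version: 1
+#   base: /tmp
+#   dumps: /tmp/callgrind.out.1234
+#   control: /tmp/callgrind.cmd.1234
+#   result: /tmp/callgrind.res.1234
+#   cmd: ./myprog arg1 arg2
+#
+# getCallgrindPids() below globs for these files and keeps the PIDs of
+# processes whose memory map actually contains a callgrind binary.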
+
+sub getCallgrindPids {
+
+ @pids = ();
+ foreach $f (</tmp/callgrind.info.*>) {
+ ($pid) = ($f =~ /info\.(\d+)/);
+ if ($pid eq "") { next; }
+ $mapfile = "/proc/$pid/maps";
+ if (!-e $mapfile) { next; }
+
+ open MAP, "<$mapfile";
+ $found = 0;
+ while(<MAP>) {
+ # works both for VG 3.0 and VG 3.1
+ if (/callgrind/) { $found = 1; }
+ }
+ close MAP;
+ if ($found == 0) { next; }
+
+ open INFO, "<$f";
+ while(<INFO>) {
+ if (/version: (\d+)/) { $mversion{$pid} = $1; }
+ if (/cmd: (.+)$/) { $cmd{$pid} = $1; }
+ if (/control: (.+)$/) { $control{$pid} = $1; }
+ if (/base: (.+)$/) { $base{$pid} = $1; }
+ if (/result: (.+)$/) { $result{$pid} = $1; }
+ }
+ close INFO;
+
+ if ($mversion{$pid} > 1) {
+ #print " Unsupported Callgrind Major Version $mversion.\n\n";
+ next;
+ }
+
+ push(@pids, $pid);
+ }
+}
+
+sub printHeader {
+ if ($headerPrinted) { return; }
+ $headerPrinted = 1;
+ if ($beQuiet) { return; }
+
+ print "Observe the status and control currently active callgrind runs.\n";
+ print "(C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer\@gmx.de)\n\n";
+}
+
+sub printVersion {
+ print "callgrind_control-@VERSION@\n";
+ exit;
+}
+
+sub printHelp {
+ printHeader;
+
+ print "Usage: callgrind_control [options] [<pid>|<name> ...]\n\n";
+ print "If no PIDs/Names are given, an action is applied to all currently\n";
+ print "active Callgrind runs. Default action is printing short information.\n\n";
+ print "Options:\n";
+ print " -h Print this help text\n";
+ print " -v Print version\n";
+ print " -q Be quiet\n";
+ print " -l Print more information\n";
+ print " -s Print status information\n";
+ print " -b Print backtrace information\n";
+ print " -e [A,..] Print event counters for A,.. [default: all]\n";
+ print " -d [str] Request a profile dump, include <str> as trigger hint\n";
+ print " -z Zero all cost counters\n";
+ print " -k Kill\n";
+ print " -i on/off Switch instrumentation state on/off\n";
+ print " -w <dir> Manually specify the working directory of a callgrind run\n";
+ print "\n";
+ exit;
+}
+
+
+#
+# Parts more or less copied from ct_annotate (author: Nicholas Nethercote)
+#
+
+sub prepareEvents {
+
+ @events = split(/\s+/, $events);
+ %events = ();
+ $n = 0;
+ foreach $event (@events) {
+ $events{$event} = $n;
+ $n++;
+ }
+ if (@show_events) {
+ foreach my $show_event (@show_events) {
+ (defined $events{$show_event}) or
+ print "Warning: Event `$show_event' is not being collected\n";
+ }
+ } else {
+ @show_events = @events;
+ }
+ @show_order = ();
+ foreach my $show_event (@show_events) {
+ push(@show_order, $events{$show_event});
+ }
+}
+
+sub max ($$)
+{
+ my ($x, $y) = @_;
+ return ($x > $y ? $x : $y);
+}
+
+sub line_to_CC ($)
+{
+ my @CC = (split /\s+/, $_[0]);
+ (@CC <= @events) or die("Line $.: too many event counts\n");
+ return \@CC;
+}
+
+sub commify ($) {
+ my ($val) = @_;
+ 1 while ($val =~ s/^(\d+)(\d{3})/$1,$2/);
+ return $val;
+}
+
+sub compute_CC_col_widths (@)
+{
+ my @CCs = @_;
+ my $CC_col_widths = [];
+
+ # Initialise with minimum widths (from event names)
+ foreach my $event (@events) {
+ push(@$CC_col_widths, length($event));
+ }
+
+ # Find maximum width count for each column. @CC_col_width positions
+ # correspond to @CC positions.
+ foreach my $CC (@CCs) {
+ foreach my $i (0 ..
scalar(@$CC)-1) { + if (defined $CC->[$i]) { + # Find length, accounting for commas that will be added + my $length = length $CC->[$i]; + my $clength = $length + int(($length - 1) / 3); + $CC_col_widths->[$i] = max($CC_col_widths->[$i], $clength); + } + } + } + return $CC_col_widths; +} + +# Print the CC with each column's size dictated by $CC_col_widths. +sub print_CC ($$) +{ + my ($CC, $CC_col_widths) = @_; + + foreach my $i (@show_order) { + my $count = (defined $CC->[$i] ? commify($CC->[$i]) : "."); + my $space = ' ' x ($CC_col_widths->[$i] - length($count)); + print("$space$count "); + } +} + +sub print_events ($) +{ + my ($CC_col_widths) = @_; + + foreach my $i (@show_order) { + my $event = $events[$i]; + my $event_width = length($event); + my $col_width = $CC_col_widths->[$i]; + my $space = ' ' x ($col_width - $event_width); + print("$space$event "); + } +} + + + +# +# Main +# + +getCallgrindPids; + +$requestEvents = 0; +$requestDump = 0; +$switchInstr = 0; +$headerPrinted = 0; +$beQuiet = 0; +$dumpHint = ""; +$gotW = 0; +$workingDir = ""; + +%spids = (); +foreach $arg (@ARGV) { + if ($arg =~ /^-/) { + if ($requestDump == 1) { $requestDump = 2; } + if ($requestEvents == 1) { $requestEvents = 2; } + if ($gotW == 1) { $gotW = 2; } + + if ($arg =~ /^-?-h/) { printHelp; } + if ($arg =~ /^-?-v/) { printVersion; } + if ($arg =~ /^-q/) { $beQuiet = 1; next; } + if ($arg =~ /^-l/) { $printLong = 1; next; } + if ($arg =~ /^-s/) { $printStatus = 1; next; } + if ($arg =~ /^-b/) { $printBacktrace = 1; next; } + if ($arg =~ /^-d/) { $requestDump = 1; next; } + if ($arg =~ /^-z/) { $requestZero = 1; next; } + if ($arg =~ /^-k/) { $requestKill = 1; next; } + if ($arg =~ /^-e/) { $requestEvents = 1; next; } + if ($arg =~ /^-i/) { $switchInstr = 1; next; } + if ($arg =~ /^-w/) { $gotW = 1; next; } + + printHeader; + print "Unknown option '$arg'.\n\n"; + printHelp; + } + + if ($arg =~ /^[A-Za-z_]/) { + # arguments of -d/-e/-i are non-numeric + if ($requestDump == 1) { + $requestDump = 2; + $dumpHint = $arg; + next; + } + + if ($requestEvents == 1) { + $requestEvents = 2; + @show_events = split(/,/, $arg); + next; + } + + if ($switchInstr == 1) { + $switchInstr = 2; + $switchInstrMode = "+"; + if (($arg eq "off") || ($arg eq "no")) { + $switchInstrMode = "-"; + } + next; + } + } + + if ($gotW == 1) { + $gotW = 2; + $workingDir = $arg; + if (!-d $workingDir) { + print "Error: directory '$workingDir' does not exist.\n"; + printHelp; + } + next; + } + + if (defined $cmd{$arg}) { $spids{$arg} = 1; next; } + $nameFound = 0; + foreach $p (@pids) { + if ($cmd{$p} =~ /^$arg/) { + $nameFound = 1; + $spids{$p} = 1; + } + } + if ($nameFound) { next; } + + printHeader; + print "Non-existent Callgrind task with PID/Name '$arg'.\n\n"; + printHelp; +} + +if ($workingDir ne "") { + # Generate dummy information for dummy pid 0 + $pid = "0"; + $mversion{$pid} = "@VERSION@"; + $cmd{$pid} = "???"; + $base{$pid} = $workingDir; + $control{$pid} = "$workingDir/callgrind.cmd"; + # do not wait for any result... 
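+ # (an empty result file name makes the wait-for-answer code below
+ # treat this dummy entry as having no result to read)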
+ $result{$pid} = "";
+
+ # Only handle this faked callgrind run
+ @pids = ($pid);
+}
+
+if (scalar @pids == 0) {
+ print "No active callgrind runs detected.\n";
+ #print "Detection fails when /proc/*/maps is not readable.\n";
+ print "[Detection can fail on some systems; to work around this,\n";
+ print " specify the working directory of a callgrind run with '-w']\n";
+ exit;
+}
+
+@spids = keys %spids;
+if (scalar @spids >0) { @pids = @spids; }
+
+$command = "";
+$waitForAnswer = 0;
+if ($requestDump) {
+ $command = "Dump";
+ if ($dumpHint ne "") { $command .= " ".$dumpHint; }
+}
+if ($requestZero) { $command = "Zero"; }
+if ($requestKill) { $command = "Kill"; }
+if ($switchInstr) { $command = $switchInstrMode."Instrumentation"; }
+if ($printStatus || $printBacktrace || $requestEvents) {
+ $command = "Status";
+ $waitForAnswer = 1;
+}
+
+foreach $pid (@pids) {
+ $pidstr = "PID $pid: ";
+ print $pidstr.$cmd{$pid};
+
+ if ($command eq "") {
+ if ($printLong) {
+ #print " " x length $pidstr;
+ print " (in $base{$pid})\n";
+ }
+ else {
+ print "\n";
+ }
+ next;
+ }
+ else {
+ if (! (open CONTROL, ">$control{$pid}")) {
+ print " [sending '$command' failed: permission denied]\n";
+ next;
+ }
+ print " [requesting '$command'...]\n";
+ print CONTROL $command;
+ close CONTROL;
+
+ while(-e $control{$pid}) {
+ # sleep for 250 ms
+ select(undef, undef, undef, 0.25);
+ }
+ }
+
+ if ($result{$pid} eq "") { $waitForAnswer=0; }
+ if (!$waitForAnswer) { print " OK.\n"; next; }
+
+ if (! (open RESULT, "<$result{$pid}")) {
+ print " Warning: Can't open expected result file $result{$pid}.\n";
+ next;
+ }
+
+ @tids = ();
+ $ctid = 0;
+ %fcount = ();
+ %func = ();
+ %calls = ();
+ %events = ();
+ @events = ();
+ %totals = ();
+
+ $exec_bbs = 0;
+ $dist_bbs = 0;
+ $exec_calls = 0;
+ $dist_calls = 0;
+ $dist_ctxs = 0;
+ $dist_funcs = 0;
+ $threads = 0;
+ $events = "";
+
+ while(<RESULT>) {
+ if (/function-(\d+)-(\d+): (.+)$/) {
+ if ($ctid != $1) {
+ $ctid = $1;
+ push(@tids, $ctid);
+ $fcount{$ctid} = 0;
+ }
+ $fcount{$ctid}++;
+ $func{$ctid,$fcount{$ctid}} = $3;
+ }
+ elsif (/calls-(\d+)-(\d+): (.+)$/) {
+ if ($ctid != $1) { next; }
+ $calls{$ctid,$fcount{$ctid}} = $3;
+ }
+ elsif (/events-(\d+)-(\d+): (.+)$/) {
+ if ($ctid != $1) { next; }
+ $events{$ctid,$fcount{$ctid}} = line_to_CC($3);
+ }
+ elsif (/events-(\d+): (.+)$/) {
+ if (scalar @events == 0) { next; }
+ $totals{$1} = line_to_CC($2);
+ }
+ elsif (/executed-bbs: (\d+)/) { $exec_bbs = $1; }
+ elsif (/distinct-bbs: (\d+)/) { $dist_bbs = $1; }
+ elsif (/executed-calls: (\d+)/) { $exec_calls = $1; }
+ elsif (/distinct-calls: (\d+)/) { $dist_calls = $1; }
+ elsif (/distinct-functions: (\d+)/) { $dist_funcs = $1; }
+ elsif (/distinct-contexts: (\d+)/) { $dist_ctxs = $1; }
+ elsif (/events: (.+)$/) { $events = $1; prepareEvents; }
+ elsif (/threads: (\d+)$/) { $threads = $1; }
+ elsif (/instrumentation: (\w+)$/) { $instrumentation = $1; }
+ }
+
+ unlink $result{$pid};
+
+ if ($instrumentation eq "off") {
+ print " No information available as instrumentation is switched off.\n\n";
+ exit;
+ }
+
+ if ($printStatus) {
+ if ($requestEvents <1) {
+ print " Number of threads: $threads\n";
+ print " Events collected: $events\n";
+ }
+
+ print " Functions: ".commify($dist_funcs);
+ print " (executed ".commify($exec_calls);
+ print ", contexts ".commify($dist_ctxs).")\n";
+
+ print " Basic blocks: ".commify($dist_bbs);
+ print " (executed ".commify($exec_bbs);
+ print ", call sites ".commify($dist_calls).")\n";
+ }
+
+ if ($requestEvents >0) {
+ $totals_width =
compute_CC_col_widths(values %totals); + print "\n Totals:"; + print_events($totals_width); + print("\n"); + foreach $tid (@tids) { + print " Th".substr(" ".$tid,-2)." "; + print_CC($totals{$tid}, $totals_width); + print("\n"); + } + } + + if ($printBacktrace) { + + if ($requestEvents >0) { + $totals_width = compute_CC_col_widths(values %events); + } + + foreach $tid (@tids) { + print "\n Frame: "; + if ($requestEvents >0) { + print_events($totals_width); + } + print "Backtrace for Thread $tid\n"; + + $i = $fcount{$tid}; + $c = 0; + while($i>0 && $c<100) { + $fc = substr(" $c",-2); + print " [$fc] "; + if ($requestEvents >0) { + print_CC($events{$tid,$i-1}, $totals_width); + } + print $func{$tid,$i}; + if ($i > 1) { + print " (".$calls{$tid,$i-1}." x)"; + } + print "\n"; + $i--; + $c++; + } + print "\n"; + } + } + print "\n"; +} + diff --git a/callgrind/callstack.c b/callgrind/callstack.c new file mode 100644 index 0000000000..6e14b2e1d0 --- /dev/null +++ b/callgrind/callstack.c @@ -0,0 +1,424 @@ +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- ct_callstack.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Callgrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Call stack, operations ---*/ +/*------------------------------------------------------------*/ + +/* Stack of current thread. Gets initialized when switching to 1st thread. + * + * The artificial call stack is an array of call_entry's, representing + * stack frames of the executing program. + * Array call_stack and call_stack_esp have same size and grow on demand. + * Array call_stack_esp holds SPs of corresponding stack frames. 
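+ *
+ * (Illustrative example: after main() calls foo() and foo() calls bar(),
+ * entries 0..2 hold the jCCs of the three active calls together with
+ * the stack pointer at call time; unwind_call_stack() below uses these
+ * saved SPs to detect returns that skip frames, e.g. after longjmp().)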
+ *
+ */
+
+#define N_CALL_STACK_INITIAL_ENTRIES 500
+
+call_stack CLG_(current_call_stack);
+
+void CLG_(init_call_stack)(call_stack* s)
+{
+ Int i;
+
+ CLG_ASSERT(s != 0);
+
+ s->size = N_CALL_STACK_INITIAL_ENTRIES;
+ s->entry = (call_entry*) CLG_MALLOC(s->size * sizeof(call_entry));
+ s->sp = 0;
+ s->entry[0].cxt = 0; /* for assertion in push_cxt() */
+
+ for(i=0; i<s->size; i++) s->entry[i].enter_cost = 0;
+}
+
+call_entry* CLG_(get_call_entry)(Int sp)
+{
+ CLG_ASSERT(sp <= CLG_(current_call_stack).sp);
+ return &(CLG_(current_call_stack).entry[sp]);
+}
+
+void CLG_(copy_current_call_stack)(call_stack* dst)
+{
+ CLG_ASSERT(dst != 0);
+
+ dst->size = CLG_(current_call_stack).size;
+ dst->entry = CLG_(current_call_stack).entry;
+ dst->sp = CLG_(current_call_stack).sp;
+}
+
+void CLG_(set_current_call_stack)(call_stack* s)
+{
+ CLG_ASSERT(s != 0);
+
+ CLG_(current_call_stack).size = s->size;
+ CLG_(current_call_stack).entry = s->entry;
+ CLG_(current_call_stack).sp = s->sp;
+}
+
+
+static __inline__
+void ensure_stack_size(Int i)
+{
+ Int oldsize;
+ call_stack *cs = &CLG_(current_call_stack);
+
+ if (i < cs->size) return;
+
+ oldsize = cs->size;
+ cs->size *= 2;
+ while (i > cs->size) cs->size *= 2;
+
+ cs->entry = (call_entry*) VG_(realloc)(cs->entry,
+ cs->size * sizeof(call_entry));
+
+ for(i=oldsize; i<cs->size; i++)
+ cs->entry[i].enter_cost = 0;
+
+ CLG_(stat).call_stack_resizes++;
+
+ CLG_DEBUGIF(2)
+ VG_(printf)(" call stack enlarged to %d entries\n",
+ CLG_(current_call_stack).size);
+}
+
+
+
+/* Called when a function is entered non-recursively */
+static void function_entered(fn_node* fn, BBCC* to)
+{
+ CLG_ASSERT(fn != 0);
+
+#if CLG_ENABLE_DEBUG
+ if (fn->verbosity >=0) {
+ Int old = CLG_(clo).verbose;
+ CLG_(clo).verbose = fn->verbosity;
+ fn->verbosity = old;
+ VG_(message)(Vg_DebugMsg,
+ "Entering %s: Verbosity set to %d",
+ fn->name, CLG_(clo).verbose);
+ }
+#endif
+
+ if (fn->dump_before) {
+ Char trigger[FN_NAME_LEN];
+ VG_(sprintf)(trigger, "--dump-before=%s", fn->name);
+ CLG_(dump_profile)(trigger, True);
+ }
+ else if (fn->zero_before) {
+ CLG_(zero_all_cost)(True);
+ }
+
+ if (fn->toggle_collect) {
+ CLG_(current_state).collect = !CLG_(current_state).collect;
+ CLG_DEBUG(2," entering %s: toggled collection state to %s\n",
+ fn->name,
+ CLG_(current_state).collect ? "ON" : "OFF");
+ }
+}
+
+/* Called when a function is left (no recursion level still active) */
+static void function_left(fn_node* fn, BBCC* from)
+{
+ CLG_ASSERT(fn != 0);
+
+ if (fn->dump_after) {
+ Char trigger[FN_NAME_LEN];
+ VG_(sprintf)(trigger, "--dump-after=%s", fn->name);
+ CLG_(dump_profile)(trigger, True);
+ }
+ if (fn->toggle_collect) {
+ CLG_(current_state).collect = !CLG_(current_state).collect;
+ CLG_DEBUG(2," leaving %s: toggled collection state to %s\n",
+ fn->name,
+ CLG_(current_state).collect ? "ON" : "OFF");
+ }
+
+#if CLG_ENABLE_DEBUG
+ if (fn->verbosity >=0) {
+ Int old = CLG_(clo).verbose;
+ CLG_(clo).verbose = fn->verbosity;
+ fn->verbosity = old;
+ VG_(message)(Vg_DebugMsg,
+ "Leaving %s: Verbosity set back to %d",
+ fn->name, CLG_(clo).verbose);
+ }
+#endif
+}
+
+
+/* Push call on call stack.
+ *
+ * Increment the usage count for the function called.
+ * A jump from <from> to <to>, with <sp> as stack pointer.
+ * If <skip> is true, this is a call to a function to be skipped;
+ * for this, we set jcc = 0.
+ */
+void CLG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip)
+{
+ jCC* jcc;
+ UInt* pdepth;
+ call_entry* current_entry;
+ Addr ret_addr;
+
+ /* Ensure a call stack of size <current_sp>+1.
+ * The +1 is needed as push_cxt will store the
+ * context at [current_sp]
+ */
+ ensure_stack_size(CLG_(current_call_stack).sp +1);
+ current_entry = &(CLG_(current_call_stack).entry[CLG_(current_call_stack).sp]);
+
+ if (skip) {
+ jcc = 0;
+ }
+ else {
+ fn_node* to_fn = to->cxt->fn[0];
+
+ if (CLG_(current_state).nonskipped) {
+ /* this is a jmp from skipped to nonskipped */
+ CLG_ASSERT(CLG_(current_state).nonskipped == from);
+ }
+
+ /* As push_cxt() has to be called before push_call_stack if not
+ * skipping, the old context should already be saved on the stack */
+ CLG_ASSERT(current_entry->cxt != 0);
+ CLG_(copy_cost_lz)( CLG_(sets).full, &(current_entry->enter_cost),
+ CLG_(current_state).cost );
+
+ jcc = CLG_(get_jcc)(from, jmp, to);
+ CLG_ASSERT(jcc != 0);
+
+ pdepth = CLG_(get_fn_entry)(to_fn->number);
+ if (CLG_(clo).skip_direct_recursion) {
+ /* only increment depth if another function is called */
+ if (jcc->from->cxt->fn[0] != to_fn) (*pdepth)++;
+ }
+ else (*pdepth)++;
+
+ if (*pdepth>1)
+ CLG_(stat).rec_call_counter++;
+
+ jcc->call_counter++;
+ CLG_(stat).call_counter++;
+
+ if (*pdepth == 1) function_entered(to_fn, to);
+ }
+
+ /* return address is only useful with a real call;
+ * used to detect RET w/o CALL */
+ ret_addr = (from->bb->jmpkind == Ijk_Call) ?
+ bb_addr(from->bb) + from->bb->instr_len : 0;
+
+ /* put jcc on call stack */
+ current_entry->jcc = jcc;
+ current_entry->sp = sp;
+ current_entry->ret_addr = ret_addr;
+ current_entry->nonskipped = CLG_(current_state).nonskipped;
+
+ CLG_(current_call_stack).sp++;
+
+ /* To allow for above assertion we set context of next frame to 0 */
+ CLG_ASSERT(CLG_(current_call_stack).sp < CLG_(current_call_stack).size);
+ current_entry++;
+ current_entry->cxt = 0;
+
+ if (!skip)
+ CLG_(current_state).nonskipped = 0;
+ else if (!CLG_(current_state).nonskipped) {
+ /* a call from nonskipped to skipped */
+ CLG_(current_state).nonskipped = from;
+ if (!CLG_(current_state).nonskipped->skipped) {
+ CLG_(init_cost_lz)( CLG_(sets).full,
+ &CLG_(current_state).nonskipped->skipped);
+ CLG_(stat).distinct_skips++;
+ }
+ }
+
+#if CLG_ENABLE_DEBUG
+ CLG_DEBUGIF(0) {
+ if (CLG_(clo).verbose<2) {
+ if (jcc && jcc->to && jcc->to->bb) {
+ char spaces[][41] = { "   .   .   .   .   .   .   .   .   .   .",
+ "  .   .   .   .   .   .   .   .   .   . ",
+ " .   .   .   .   .   .   .   .   .   .  ",
+ ".   .   .   .   .   .   .   .   .   .   " };
+
+ int s = CLG_(current_call_stack).sp;
+ Int* pars = (Int*) sp;
+
+ BB* bb = jcc->to->bb;
+ if (s>40) s=40;
+ VG_(printf)("%s> %s(0x%x, 0x%x, ...) [%s / %p]\n", spaces[s%4]+40-s, bb->fn->name,
+ pars ? pars[1]:0,
+ pars ? pars[2]:0,
+ bb->obj->name + bb->obj->last_slash_pos,
+ bb->offset);
+ }
+ }
+ else if (CLG_(clo).verbose<4) {
+ VG_(printf)("+ %2d ", CLG_(current_call_stack).sp);
+ CLG_(print_short_jcc)(jcc);
+ VG_(printf)(", SP %p, RA %p\n", sp, ret_addr);
+ }
+ else {
+ VG_(printf)(" Pushed ");
+ CLG_(print_stackentry)(3, CLG_(current_call_stack).sp-1);
+ }
+ }
+#endif
+
+}
+
+
+/* Pop call stack and update inclusive sums.
+ * Returns modified jcc.
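+ * (Illustrative: push_call_stack() snapshotted the cost counters into
+ * enter_cost at call time; add_diff_cost_lz() below adds the difference
+ * between the current counters and that snapshot -- i.e. the inclusive
+ * cost of the call -- to the jCC.)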
+ *
+ * If the JCC becomes inactive, call entries are freed if possible
+ */
+void CLG_(pop_call_stack)()
+{
+ jCC* jcc;
+ Int depth = 0;
+ call_entry* lower_entry;
+
+ if (CLG_(current_state).sig >0) {
+ /* Check if we leave a signal handler; this can happen when
+ * calling longjmp() in the handler */
+ CLG_(run_post_signal_on_call_stack_bottom)();
+ }
+
+ lower_entry =
+ &(CLG_(current_call_stack).entry[CLG_(current_call_stack).sp-1]);
+
+ CLG_DEBUG(4,"+ pop_call_stack: frame %d, jcc %p\n",
+ CLG_(current_call_stack).sp, lower_entry->jcc);
+
+ /* jCC item is no longer on the real stack: pop it */
+ jcc = lower_entry->jcc;
+ CLG_(current_state).nonskipped = lower_entry->nonskipped;
+
+ if (jcc) {
+ fn_node* to_fn = jcc->to->cxt->fn[0];
+ UInt* pdepth = CLG_(get_fn_entry)(to_fn->number);
+ if (CLG_(clo).skip_direct_recursion) {
+ /* only decrement depth if another function was called */
+ if (jcc->from->cxt->fn[0] != to_fn) (*pdepth)--;
+ }
+ else (*pdepth)--;
+ depth = *pdepth;
+
+ /* add cost difference to sum */
+ if ( CLG_(add_diff_cost_lz)( CLG_(sets).full, &(jcc->cost),
+ lower_entry->enter_cost,
+ CLG_(current_state).cost) ) {
+
+ /* only count this call if it attributed some cost.
+ * the ret_counter is used to check if a BBCC dump is needed.
+ */
+ jcc->from->ret_counter++;
+ }
+ CLG_(stat).ret_counter++;
+
+ /* restore context */
+ CLG_(current_state).cxt = lower_entry->cxt;
+ CLG_(current_fn_stack).top =
+ CLG_(current_fn_stack).bottom + lower_entry->fn_sp;
+ CLG_ASSERT(CLG_(current_state).cxt != 0);
+
+ if (depth == 0) function_left(to_fn, jcc->from);
+ }
+
+ /* To allow for an assertion in push_call_stack() */
+ lower_entry->cxt = 0;
+
+ CLG_(current_call_stack).sp--;
+
+#if CLG_ENABLE_DEBUG
+ CLG_DEBUGIF(1) {
+ if (CLG_(clo).verbose<4) {
+ if (jcc) {
+ /* popped JCC target first */
+ VG_(printf)("- %2d %p => ",
+ CLG_(current_call_stack).sp,
+ bb_addr(jcc->to->bb));
+ CLG_(print_addr)(bb_jmpaddr(jcc->from->bb));
+ VG_(printf)(", SP %p\n",
+ CLG_(current_call_stack).entry[CLG_(current_call_stack).sp].sp);
+ CLG_(print_cost)(10, CLG_(sets).full, jcc->cost);
+ }
+ else
+ VG_(printf)("- %2d [Skipped JCC], SP %p\n",
+ CLG_(current_call_stack).sp,
+ CLG_(current_call_stack).entry[CLG_(current_call_stack).sp].sp);
+ }
+ else {
+ VG_(printf)(" Popped ");
+ CLG_(print_stackentry)(7, CLG_(current_call_stack).sp);
+ if (jcc) {
+ VG_(printf)(" returned to ");
+ CLG_(print_addr_ln)(bb_jmpaddr(jcc->from->bb));
+ }
+ }
+ }
+#endif
+
+}
+
+
+/* remove CallStack items to sync with current SP
+ */
+void CLG_(unwind_call_stack)(Addr sp, Int minpops)
+{
+ Int csp;
+ CLG_DEBUG(4,"+ unwind_call_stack(sp %p, minpops %d): frame %d\n",
+ sp, minpops, CLG_(current_call_stack).sp);
+
+ /* We pop old stack frames.
+ * For a call, let p be the stack address holding the return address.
+ * - call_stack_esp[] has SP after the CALL: p-4
+ * - current sp is after a RET: >= p
+ */
+
+ while( (csp=CLG_(current_call_stack).sp) >0) {
+ call_entry* top_ce = &(CLG_(current_call_stack).entry[csp-1]);
+
+ if ((top_ce->sp < sp) ||
+ ((top_ce->sp == sp) && minpops>0)) {
+
+ minpops--;
+ CLG_(pop_call_stack)();
+ csp=CLG_(current_call_stack).sp;
+ continue;
+ }
+ break;
+ }
+
+ CLG_DEBUG(4,"- unwind_call_stack\n");
+}
diff --git a/callgrind/clo.c b/callgrind/clo.c
new file mode 100644
index 0000000000..184fed1068
--- /dev/null
+++ b/callgrind/clo.c
@@ -0,0 +1,765 @@
+/*
+ This file is part of Callgrind, a Valgrind skin for call graph
+ profiling programs.
+
+ Copyright (C) 2002-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+ This skin is derived from and contains a lot of code from Cachegrind
+ Copyright (C) 2002 Nicholas Nethercote (njn25@cam.ac.uk)
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "config.h" // for VG_PREFIX
+
+#include "global.h"
+
+
+
+/*------------------------------------------------------------*/
+/*--- Function specific configuration options ---*/
+/*------------------------------------------------------------*/
+
+/* Special value for separate_callers: automatic = adaptive */
+#define CONFIG_AUTO -1
+
+#define CONFIG_DEFAULT -1
+#define CONFIG_FALSE 0
+#define CONFIG_TRUE 1
+
+/* Logging configuration for a function */
+struct _fn_config {
+ Int dump_before;
+ Int dump_after;
+ Int zero_before;
+ Int toggle_collect;
+
+ Int skip; /* Handle CALL to this function as JMP (= Skip)? */
+ Int group; /* don't change caller dependency inside group !=0 */
+ Int pop_on_jump;
+
+ Int separate_callers; /* separate logging dependent on caller */
+ Int separate_recursions; /* separate logging of rec. levels */
+
+#if CLG_ENABLE_DEBUG
+ Int verbosity; /* Change debug verbosity level while in function */
+#endif
+};
+
+/* Configurations for function name prefix patterns.
+ * Currently, only very limited patterns are possible:
+ * Exact prefix patterns and "*::" are allowed.
+ * E.g.
+ * - "abc" matches all functions starting with "abc".
+ * - "abc*::def" matches all functions starting with "abc" and
+ * starting with "def" after the first "::" separator.
+ * - "*::print(" matches C++ methods "print" in all classes
+ * without namespace. I.e. "*" doesn't match a "::".
+ *
+ * We build a trie from patterns, and for a given function, we
+ * go down the tree and apply all non-default configurations.
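+ *
+ * (Illustrative: the patterns "abc" and "abd" share the prefix node
+ * "ab" with child nodes "c" and "d"; looking up a function name walks
+ * down from the root and applies every config found along the way.)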
+ */
+
+
+#define NODE_DEGREE 30
+
+/* node of compressed trie search structure */
+typedef struct _config_node config_node;
+struct _config_node {
+ Int length;
+
+ fn_config* config;
+ config_node* sub_node[NODE_DEGREE];
+ config_node* next;
+ config_node* wild_star;
+ config_node* wild_char;
+
+ Char name[1];
+};
+
+/* root of trie */
+static config_node* fn_configs = 0;
+
+static __inline__
+fn_config* new_fnc(void)
+{
+ fn_config* new = (fn_config*) CLG_MALLOC(sizeof(fn_config));
+
+ new->dump_before = CONFIG_DEFAULT;
+ new->dump_after = CONFIG_DEFAULT;
+ new->zero_before = CONFIG_DEFAULT;
+ new->toggle_collect = CONFIG_DEFAULT;
+ new->skip = CONFIG_DEFAULT;
+ new->pop_on_jump = CONFIG_DEFAULT;
+ new->group = CONFIG_DEFAULT;
+ new->separate_callers = CONFIG_DEFAULT;
+ new->separate_recursions = CONFIG_DEFAULT;
+
+#if CLG_ENABLE_DEBUG
+ new->verbosity = CONFIG_DEFAULT;
+#endif
+
+ return new;
+}
+
+
+static config_node* new_config(Char* name, int length)
+{
+ int i;
+ config_node* node = (config_node*) CLG_MALLOC(sizeof(config_node) + length);
+
+ for(i=0;i<length;i++) {
+ node->name[i] = name[i];
+ }
+ node->name[i] = 0;
+
+ node->length = length;
+ node->config = 0;
+ for(i=0;i<NODE_DEGREE;i++) node->sub_node[i] = 0;
+ node->next = 0;
+ node->wild_char = 0;
+ node->wild_star = 0;
+
+ CLG_DEBUG(3, " new_config('%s', len %d)\n", node->name, length);
+
+ return node;
+}
+
+static __inline__
+Bool is_wild(Char n)
+{
+ return (n == '*') || (n == '?');
+}
+
+/* Recursively build up function matching tree (prefix tree).
+ * Returns function config object for pattern <name>
+ * and starting at tree node <*pnode>.
+ *
+ * Tree nodes (config_node) are created as needed,
+ * tree root is stored into <*pnode>, and the created
+ * leaf (fn_config) for the given pattern is returned.
+ */
+static fn_config* get_fnc2(config_node* node, Char* name)
+{
+ config_node *new_sub, *n, *nprev;
+ int offset, len;
+
+ CLG_DEBUG(3, " get_fnc2(%p, '%s')\n", node, name);
+
+ if (name[0] == 0) {
+ if (!node->config) node->config = new_fnc();
+ return node->config;
+ }
+
+ if (is_wild(*name)) {
+ if (*name == '*') {
+ while(name[1] == '*') name++;
+ new_sub = node->wild_star;
+ }
+ else
+ new_sub = node->wild_char;
+
+ if (!new_sub) {
+ new_sub = new_config(name, 1);
+ if (*name == '*')
+ node->wild_star = new_sub;
+ else
+ node->wild_char = new_sub;
+ }
+
+ return get_fnc2( new_sub, name+1);
+ }
+
+ n = node->sub_node[ name[0]%NODE_DEGREE ];
+ nprev = 0;
+ len = 0;
+ while(n) {
+ for(len=0; name[len] == n->name[len]; len++);
+ if (len>0) break;
+ nprev = n;
+ n = n->next;
+ }
+
+ if (!n) {
+ len = 1;
+ while(name[len] && (!is_wild(name[len]))) len++;
+ new_sub = new_config(name, len);
+ new_sub->next = node->sub_node[ name[0]%NODE_DEGREE ];
+ node->sub_node[ name[0]%NODE_DEGREE ] = new_sub;
+
+ if (name[len] == 0) {
+ new_sub->config = new_fnc();
+ return new_sub->config;
+ }
+
+ /* recurse on wildcard */
+ return get_fnc2( new_sub, name+len);
+ }
+
+ if (len < n->length) {
+
+ /* split up the subnode */
+ config_node *new_node;
+ int i;
+
+ new_node = new_config(n->name, len);
+ if (nprev)
+ nprev->next = new_node;
+ else
+ node->sub_node[ n->name[0]%NODE_DEGREE ] = new_node;
+ new_node->next = n->next;
+
+ new_node->sub_node[ n->name[len]%NODE_DEGREE ] = n;
+
+ for(i=0, offset=len; offset < n->length; i++, offset++)
+ n->name[i] = n->name[offset];
+ n->name[i] = 0;
+ n->length = i;
+
+ name += len;
+ offset = 0;
+ while(name[offset] && (!is_wild(name[offset]))) offset++;
+ new_sub = new_config(name, offset);
+ /* this sub_node of new_node could already be set:
chain! */
+ new_sub->next = new_node->sub_node[ name[0]%NODE_DEGREE ];
+ new_node->sub_node[ name[0]%NODE_DEGREE ] = new_sub;
+
+ if (name[offset]==0) {
+ new_sub->config = new_fnc();
+ return new_sub->config;
+ }
+
+ /* recurse on wildcard */
+ return get_fnc2( new_sub, name+offset);
+ }
+
+ name += n->length;
+
+ if (name[0] == 0) {
+ /* name and node name are the same */
+ if (!n->config) n->config = new_fnc();
+ return n->config;
+ }
+
+ offset = 1;
+ while(name[offset] && (!is_wild(name[offset]))) offset++;
+
+ new_sub = new_config(name, offset);
+ new_sub->next = n->sub_node[ name[offset]%NODE_DEGREE ];
+ n->sub_node[ name[offset]%NODE_DEGREE ] = new_sub;
+
+ return get_fnc2(new_sub, name+offset);
+}
+
+static void print_config_node(int s, config_node* node)
+{
+ config_node* n;
+ int i;
+
+ if (node != fn_configs) {
+ char sp[] = "                                        ";
+
+ if (s>40) s=40;
+ VG_(printf)(sp+40-s);
+ VG_(printf)("'%s'/%d\n", node->name, node->length);
+ }
+ for(i=0;i<NODE_DEGREE;i++) {
+ n = node->sub_node[i];
+ while(n) {
+ print_config_node(s+1, n);
+ n = n->next;
+ }
+ }
+ if (node->wild_char) print_config_node(s+1, node->wild_char);
+ if (node->wild_star) print_config_node(s+1, node->wild_star);
+}
+
+/* get a function config for a name pattern (from command line) */
+static fn_config* get_fnc(Char* name)
+{
+ fn_config* fnc;
+
+ CLG_DEBUG(3, " +get_fnc(%s)\n", name);
+ if (fn_configs == 0)
+ fn_configs = new_config(name, 0);
+ fnc = get_fnc2(fn_configs, name);
+
+ CLG_DEBUGIF(3) {
+ CLG_DEBUG(3, " -get_fnc(%s):\n", name);
+ print_config_node(3, fn_configs);
+ }
+ return fnc;
+}
+
+
+
+static void update_fn_config1(fn_node* fn, fn_config* fnc)
+{
+ if (fnc->dump_before != CONFIG_DEFAULT)
+ fn->dump_before = (fnc->dump_before == CONFIG_TRUE);
+
+ if (fnc->dump_after != CONFIG_DEFAULT)
+ fn->dump_after = (fnc->dump_after == CONFIG_TRUE);
+
+ if (fnc->zero_before != CONFIG_DEFAULT)
+ fn->zero_before = (fnc->zero_before == CONFIG_TRUE);
+
+ if (fnc->toggle_collect != CONFIG_DEFAULT)
+ fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE);
+
+ if (fnc->skip != CONFIG_DEFAULT)
+ fn->skip = (fnc->skip == CONFIG_TRUE);
+
+ if (fnc->pop_on_jump != CONFIG_DEFAULT)
+ fn->pop_on_jump = (fnc->pop_on_jump == CONFIG_TRUE);
+
+ if (fnc->group != CONFIG_DEFAULT)
+ fn->group = fnc->group;
+
+ if (fnc->separate_callers != CONFIG_DEFAULT)
+ fn->separate_callers = fnc->separate_callers;
+
+ if (fnc->separate_recursions != CONFIG_DEFAULT)
+ fn->separate_recursions = fnc->separate_recursions;
+
+#if CLG_ENABLE_DEBUG
+ if (fnc->verbosity != CONFIG_DEFAULT)
+ fn->verbosity = fnc->verbosity;
+#endif
+}
+
+/* Recursively go down the function matching tree,
+ * looking for a match to <name>. For every matching leaf,
+ * <fn> is updated with the pattern config.
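+ *
+ * (E.g., given "--fn-skip=abc" and "--fn-skip=a*c", the name "abc"
+ * reaches a matching leaf both via the exact prefix and via the
+ * wildcard, so both configs are applied in turn.)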
+ */ +static void update_fn_config2(fn_node* fn, Char* name, config_node* node) +{ + config_node* n; + + CLG_DEBUG(3, " update_fn_config2('%s', node '%s'): \n", + name, node->name); + if ((*name == 0) && node->config) { + CLG_DEBUG(3, "Found!\n"); + update_fn_config1(fn, node->config); + return; + } + + n = node->sub_node[ name[0]%NODE_DEGREE ]; + while(n) { + if (VG_(strncmp)(name, n->name, n->length)==0) break; + n = n->next; + } + if (n) update_fn_config2(fn, name+n->length, n); + + if (node->wild_char) + update_fn_config2(fn, name+1, node->wild_char); + + if (node->wild_star) { + while(*name) { + update_fn_config2(fn, name, node->wild_star); + name++; + } + update_fn_config2(fn, name, node->wild_star); + } +} + +/* Update function config according to configs of name prefixes */ +void CLG_(update_fn_config)(fn_node* fn) +{ + CLG_DEBUG(3, " update_fn_config('%s')\n", fn->name); + if (fn_configs) + update_fn_config2(fn, fn->name, fn_configs); +} + + +/*--------------------------------------------------------------------*/ +/*--- Command line processing ---*/ +/*--------------------------------------------------------------------*/ + +static Char* getUInt(Char* s, UInt* pn) +{ + UInt n = 0; + while((*s >='0') && (*s <='9')) { + n = 10*n + (*s-'0'); + s++; + } + if (pn) *pn = n; + return s; +} + +__attribute__((unused)) +static UWord getUWord(Char* s) +{ + UWord n = 0; + Bool isHex = False; + + if ((s[0] == '0') && (s[1] == 'x')) { + isHex = True; + s += 2; + } + + if (!isHex) { + while((*s >='0') && (*s <='9')) { + n = 10*n + (*s-'0'); + s++; + } + } + else { + while(1) { + if ((*s >='0') && (*s <='9')) { + n = 16*n + (*s-'0'); + s++; + continue; + } + if ((*s >='a') && (*s <='f')) { + n = 16*n + (*s-'a'+10); + s++; + continue; + } + if ((*s >='A') && (*s <='F')) { + n = 16*n + (*s-'A'+10); + s++; + continue; + } + break; + } + } + + return n; +} + +Bool CLG_(process_cmd_line_option)(Char* arg) +{ + if (0 == VG_(strcmp)(arg, "--skip-plt=yes")) + CLG_(clo).skip_plt = True; + else if (0 == VG_(strcmp)(arg, "--skip-plt=no")) + CLG_(clo).skip_plt = False; + + else if (0 == VG_(strcmp)(arg, "--collect-jumps=yes")) + CLG_(clo).collect_jumps = True; + else if (0 == VG_(strcmp)(arg, "--collect-jumps=no")) + CLG_(clo).collect_jumps = False; + /* compatibility alias, deprecated option */ + else if (0 == VG_(strcmp)(arg, "--trace-jump=yes")) + CLG_(clo).collect_jumps = True; + else if (0 == VG_(strcmp)(arg, "--trace-jump=no")) + CLG_(clo).collect_jumps = False; + + else if (0 == VG_(strcmp)(arg, "--combine-dumps=yes")) + CLG_(clo).combine_dumps = True; + else if (0 == VG_(strcmp)(arg, "--combine-dumps=no")) + CLG_(clo).combine_dumps = False; + + else if (0 == VG_(strcmp)(arg, "--collect-atstart=yes")) + CLG_(clo).collect_atstart = True; + else if (0 == VG_(strcmp)(arg, "--collect-atstart=no")) + CLG_(clo).collect_atstart = False; + + else if (0 == VG_(strcmp)(arg, "--instr-atstart=yes")) + CLG_(clo).instrument_atstart = True; + else if (0 == VG_(strcmp)(arg, "--instr-atstart=no")) + CLG_(clo).instrument_atstart = False; + + else if (0 == VG_(strcmp)(arg, "--separate-threads=yes")) + CLG_(clo).separate_threads = True; + else if (0 == VG_(strcmp)(arg, "--separate-threads=no")) + CLG_(clo).separate_threads = False; + + else if (0 == VG_(strcmp)(arg, "--compress-strings=yes")) + CLG_(clo).compress_strings = True; + else if (0 == VG_(strcmp)(arg, "--compress-strings=no")) + CLG_(clo).compress_strings = False; + + else if (0 == VG_(strcmp)(arg, "--compress-mangled=yes")) + CLG_(clo).compress_mangled = 
True; + else if (0 == VG_(strcmp)(arg, "--compress-mangled=no")) + CLG_(clo).compress_mangled = False; + + else if (0 == VG_(strcmp)(arg, "--compress-pos=yes")) + CLG_(clo).compress_pos = True; + else if (0 == VG_(strcmp)(arg, "--compress-pos=no")) + CLG_(clo).compress_pos = False; + + else if (0 == VG_(strncmp)(arg, "--fn-skip=", 10)) { + fn_config* fnc = get_fnc(arg+10); + fnc->skip = CONFIG_TRUE; + } + + else if (0 == VG_(strncmp)(arg, "--dump-before=", 14)) { + fn_config* fnc = get_fnc(arg+14); + fnc->dump_before = CONFIG_TRUE; + } + + else if (0 == VG_(strncmp)(arg, "--zero-before=", 14)) { + fn_config* fnc = get_fnc(arg+14); + fnc->zero_before = CONFIG_TRUE; + } + + else if (0 == VG_(strncmp)(arg, "--dump-after=", 13)) { + fn_config* fnc = get_fnc(arg+13); + fnc->dump_after = CONFIG_TRUE; + } + + else if (0 == VG_(strncmp)(arg, "--toggle-collect=", 17)) { + fn_config* fnc = get_fnc(arg+17); + fnc->toggle_collect = CONFIG_TRUE; + /* defaults to initial collection off */ + CLG_(clo).collect_atstart = False; + } + + else if (0 == VG_(strncmp)(arg, "--separate-recs=", 16)) + CLG_(clo).separate_recursions = (Int)VG_(atoll)(&arg[16]); + + /* workaround to find runtime_resolve (needs special handling) */ + else if (0 == VG_(strncmp)(arg, "--pop-on-jump=", 14)) { + fn_config* fnc = get_fnc(arg+14); + fnc->pop_on_jump = CONFIG_TRUE; + } + +#if CLG_ENABLE_DEBUG + else if (0 == VG_(strncmp)(arg, "--ct-verbose=", 13)) + CLG_(clo).verbose = (Int)VG_(atoll)(&arg[13]); + + else if (0 == VG_(strncmp)(arg, "--ct-vstart=", 12)) + CLG_(clo).verbose_start = (ULong)VG_(atoll)(&arg[12]); + + else if (0 == VG_(strncmp)(arg, "--ct-verbose", 12)) { + UInt n; + fn_config* fnc; + Char* s = getUInt(arg+12, &n); + if ((n == 0) || *s != '=') return False; + fnc = get_fnc(s+1); + fnc->verbosity = n; + } +#endif + + else if (0 == VG_(strncmp)(arg, "--separate-callers=", 19)) { + if (0 == VG_(strcmp)(arg+19, "auto")) + CLG_(clo).separate_callers = CONFIG_AUTO; + else + CLG_(clo).separate_callers = (Int)VG_(atoll)(&arg[19]); + } + + else if (0 == VG_(strncmp)(arg, "--fn-group", 10)) { + UInt n; + fn_config* fnc; + Char* s = getUInt(arg+10, &n); + if ((n == 0) || *s != '=') return False; + fnc = get_fnc(s+1); + fnc->group = n; + } + + else if (0 == VG_(strncmp)(arg, "--separate-callers", 18)) { + UInt n; + fn_config* fnc; + Char* s = getUInt(arg+18, &n); + if ((n == 0) || *s != '=') return False; + fnc = get_fnc(s+1); + fnc->separate_callers = n; + } + + else if (0 == VG_(strncmp)(arg, "--separate-recs", 15)) { + UInt n; + fn_config* fnc; + Char* s = getUInt(arg+15, &n); + if ((n == 0) || *s != '=') return False; + fnc = get_fnc(s+1); + fnc->separate_recursions = n; + } + + else if (0 == VG_(strncmp)(arg, "--base=", 7)) + CLG_(clo).filename_base = VG_(strdup)(arg+7); + + else if (0 == VG_(strcmp)(arg, "--mangle-names=yes")) + CLG_(clo).mangle_names = True; + else if (0 == VG_(strcmp)(arg, "--mangle-names=no")) + CLG_(clo).mangle_names = False; + + else if (0 == VG_(strcmp)(arg, "--skip-direct-rec=yes")) + CLG_(clo).skip_direct_recursion = True; + else if (0 == VG_(strcmp)(arg, "--skip-direct-rec=no")) + CLG_(clo).skip_direct_recursion = False; + + else if (0 == VG_(strcmp)(arg, "--dump-bbs=yes")) + CLG_(clo).dump_bbs = True; + else if (0 == VG_(strcmp)(arg, "--dump-bbs=no")) + CLG_(clo).dump_bbs = False; + + else if (0 == VG_(strcmp)(arg, "--dump-line=yes")) + CLG_(clo).dump_line = True; + else if (0 == VG_(strcmp)(arg, "--dump-line=no")) + CLG_(clo).dump_line = False; + + else if (0 == VG_(strcmp)(arg, 
"--dump-instr=yes")) + CLG_(clo).dump_instr = True; + else if (0 == VG_(strcmp)(arg, "--dump-instr=no")) + CLG_(clo).dump_instr = False; + + else if (0 == VG_(strcmp)(arg, "--dump-bb=yes")) + CLG_(clo).dump_bb = True; + else if (0 == VG_(strcmp)(arg, "--dump-bb=no")) + CLG_(clo).dump_bb = False; + + else if (0 == VG_(strncmp)(arg, "--dump-every-bb=", 16)) + CLG_(clo).dump_every_bb = (Int)VG_(atoll)(&arg[16]); + + + else if (0 == VG_(strcmp)(arg, "--collect-alloc=yes")) + CLG_(clo).collect_alloc = True; + else if (0 == VG_(strcmp)(arg, "--collect-alloc=no")) + CLG_(clo).collect_alloc = False; + + else if (0 == VG_(strcmp)(arg, "--collect-systime=yes")) + CLG_(clo).collect_systime = True; + else if (0 == VG_(strcmp)(arg, "--collect-systime=no")) + CLG_(clo).collect_systime = False; + + else if (0 == VG_(strcmp)(arg, "--simulate-cache=yes")) + CLG_(clo).simulate_cache = True; + else if (0 == VG_(strcmp)(arg, "--simulate-cache=no")) + CLG_(clo).simulate_cache = False; + + else { + Bool isCachesimOption = (*CLG_(cachesim).parse_opt)(arg); + + /* cache simulator is used if a simulator option is given */ + if (isCachesimOption) + CLG_(clo).simulate_cache = True; + + return isCachesimOption; + } + + return True; +} + +void CLG_(print_usage)(void) +{ + VG_(printf)( +"\n dump creation options:\n" +" --base= Prefix for profile files [" DEFAULT_DUMPNAME "]\n" +" --dump-line=no|yes Dump source lines of costs? [yes]\n" +" --dump-instr=no|yes Dump instruction address of costs? [no]\n" +" --compress-strings=no|yes Compress strings in profile dump? [yes]\n" +" --compress-pos=no|yes Compress positions in profile dump? [yes]\n" +" --combine-dumps=no|yes Concat all dumps into same file [no]\n" +#if CLG_EXPERIMENTAL +" --compress-events=no|yes Compress events in profile dump? [no]\n" +" --dump-bb=no|yes Dump basic block address of costs? [no]\n" +" --dump-bbs=no|yes Dump basic block info? [no]\n" +" --dump-skipped=no|yes Dump info on skipped functions in calls? [no]\n" +" --mangle-names=no|yes Mangle separation into names? [yes]\n" +#endif + +"\n activity options (for interactivity use callgrind_control):\n" +" --dump-every-bb= Dump every basic blocks [0=never]\n" +" --dump-before= Dump when entering function\n" +" --zero-before= Zero all costs when entering function\n" +" --dump-after= Dump when leaving function\n" +#if CLG_EXPERIMENTAL +" --dump-objs=no|yes Dump static object information [no]\n" +#endif + +"\n data collection options:\n" +" --instr-atstart=no|yes Do instrumentation at callgrind start [yes]\n" +" --collect-atstart=no|yes Collect at process/thread start [yes]\n" +" --toggle-collect= Toggle collection on enter/leave function\n" +" --collect-jumps=no|yes Collect jumps? [no]\n" +#if CLG_EXPERIMENTAL +" --collect-alloc=no|yes Collect memory allocation info? [no]\n" +#endif +" --collect-systime=no|yes Collect system call time info? [no]\n" + +"\n cost entity separation options:\n" +" --separate-threads=no|yes Separate data per thread [no]\n" +" --separate-callers= Separate functions by call chain length [0]\n" +" --separate-recs= Separate function recursions upto level [2]\n" +" --skip-plt=no|yes Ignore calls to/from PLT sections? [yes]\n" +" --separate-recs= Separate recursions for function \n" +" --separate-callers= Separate callers for function \n" +" --skip-direct-rec=no|yes Ignore direct recursions? 
[yes]\n" +" --fn-skip= Ignore calls to/from function?\n" +#if CLG_EXPERIMENTAL +" --fn-group= Put function into separation group \n" +#endif + ); + + (*CLG_(cachesim).print_opts)(); + +// VG_(printf)("\n" +// " For full callgrind documentation, see\n" +// " "VG_PREFIX"/share/doc/callgrind/html/callgrind.html\n\n"); +} + +void CLG_(print_debug_usage)(void) +{ + VG_(printf)( + +#if CLG_ENABLE_DEBUG +" --ct-verbose= Verbosity of standard debug output [0]\n" +" --ct-vstart= Only be verbose after basic block [0]\n" +" --ct-verbose= Verbosity while in \n" +#else +" (none)\n" +#endif + + ); +} + + +void CLG_(set_clo_defaults)(void) +{ + /* Default values for command line arguments */ + + /* dump options */ + CLG_(clo).filename_base = 0; + CLG_(clo).combine_dumps = False; + CLG_(clo).compress_strings = True; + CLG_(clo).compress_mangled = False; + CLG_(clo).compress_events = False; + CLG_(clo).compress_pos = True; + CLG_(clo).mangle_names = True; + CLG_(clo).dump_line = True; + CLG_(clo).dump_instr = False; + CLG_(clo).dump_bb = False; + CLG_(clo).dump_bbs = False; + + CLG_(clo).dump_every_bb = 0; + + /* Collection */ + CLG_(clo).separate_threads = False; + CLG_(clo).collect_atstart = True; + CLG_(clo).collect_jumps = False; + CLG_(clo).collect_alloc = False; + CLG_(clo).collect_systime = False; + + CLG_(clo).skip_plt = True; + CLG_(clo).separate_callers = 0; + CLG_(clo).separate_recursions = 2; + CLG_(clo).skip_direct_recursion = False; + + /* Instrumentation */ + CLG_(clo).instrument_atstart = True; + CLG_(clo).simulate_cache = False; + +#if CLG_ENABLE_DEBUG + CLG_(clo).verbose = 0; + CLG_(clo).verbose_start = 0; +#endif +} diff --git a/callgrind/command.c b/callgrind/command.c new file mode 100644 index 0000000000..23c14d9025 --- /dev/null +++ b/callgrind/command.c @@ -0,0 +1,517 @@ +/* + This file is part of Callgrind, a Valgrind skin for call graph + profiling programs. + + Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This skin is derived from and contains lot of code from Cachegrind + Copyright (C) 2002 Nicholas Nethercote (njn25@cam.ac.uk) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. + + The GNU General Public License is contained in the file COPYING. 
+*/ + +/* + * Functions related to interactive commands via "callgrind.cmd" + */ + +#include "config.h" +#include "global.h" + +#include // VG_N_THREADS + +static Char outbuf[FILENAME_LEN + FN_NAME_LEN + OBJ_NAME_LEN]; + +static Char* command_file = 0; +static Char* command_file2 = 0; +static Char* result_file = 0; +static Char* info_file = 0; +static Char* dump_base = 0; + +static Bool command_inited = False; + +void CLG_(init_command)(Char* dir, Char* dumps) +{ + Int fd, size; + SysRes res; + + dump_base = dumps; + + size = VG_(strlen)(dir) + VG_(strlen)(DEFAULT_COMMANDNAME) +10; + command_file = (char*) CLG_MALLOC(size); + CLG_ASSERT(command_file != 0); + VG_(sprintf)(command_file, "%s/%s.%d", + dir, DEFAULT_COMMANDNAME, VG_(getpid)()); + + /* This is for compatibility with the "Force Now" Button of current + * KCachegrind releases, as it doesn't use ".pid" to distinguish + * different callgrind instances from same base directory. + * Should be removed sometimes in the future (29.10.03) + */ + command_file2 = (char*) CLG_MALLOC(size); + CLG_ASSERT(command_file2 != 0); + VG_(sprintf)(command_file2, "%s/%s", + dir, DEFAULT_COMMANDNAME); + + size = VG_(strlen)(dir) + VG_(strlen)(DEFAULT_RESULTNAME) +10; + result_file = (char*) CLG_MALLOC(size); + CLG_ASSERT(result_file != 0); + VG_(sprintf)(result_file, "%s/%s.%d", + dir, DEFAULT_RESULTNAME, VG_(getpid)()); + + info_file = (char*) CLG_MALLOC(VG_(strlen)(DEFAULT_INFONAME) + 10); + CLG_ASSERT(info_file != 0); + VG_(sprintf)(info_file, "%s.%d", DEFAULT_INFONAME, VG_(getpid)()); + + CLG_DEBUG(1, " dump file base: '%s'\n", dump_base); + CLG_DEBUG(1, " command file: '%s'\n", command_file); + CLG_DEBUG(1, " result file: '%s'\n", result_file); + CLG_DEBUG(1, " info file: '%s'\n", info_file); + + /* create info file to indicate that we are running */ + res = VG_(open)(info_file, VKI_O_WRONLY|VKI_O_TRUNC, 0); + if (res.isError) { + res = VG_(open)(info_file, VKI_O_CREAT|VKI_O_WRONLY, + VKI_S_IRUSR|VKI_S_IWUSR); + if (res.isError) { + VG_(message)(Vg_DebugMsg, + "warning: can't write info file '%s'", info_file); + info_file = 0; + fd = -1; + } + } + if (!res.isError) + fd = (Int) res.val; + if (fd>=0) { + Char buf[512]; + Int i; + + VG_(sprintf)(buf, + "# This file is generated by Callgrind-" VERSION ".\n" + "# It is used to enable controlling the supervision of\n" + "# '%s'\n" + "# by external tools.\n\n", +#if VG_CORE_INTERFACE_VERSION < 9 + VG_(client_argv[0]) +#else + VG_(args_the_exename) +#endif + ); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "version: " VERSION "\n"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "base: %s\n", dir); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "dumps: %s\n", dump_base); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "control: %s\n", command_file); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "result: %s\n", result_file); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(strcpy)(buf, "cmd:"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); +#if VG_CORE_INTERFACE_VERSION < 9 + for (i = 0; i < VG_(client_argc); i++) { + if (!VG_(client_argv[i])) continue; + VG_(sprintf)(buf, " %s", VG_(client_argv[i])); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + } +#else + VG_(sprintf)(buf, " %s", VG_(args_the_exename)); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + for (i = 0; i < VG_(args_for_client).used; i++) { + if (!VG_(args_for_client).strs[i]) continue; + VG_(sprintf)(buf, " %s", 
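+       /* (The info file written here is a small key/value protocol read
+        * by callgrind_control. Illustratively -- all paths and the pid
+        * are invented -- it ends up containing:
+        *
+        *   # This file is generated by Callgrind-<version>.
+        *   # It is used to enable controlling the supervision of
+        *   # '/path/to/myprog'
+        *   # by external tools.
+        *
+        *   version: <version>
+        *   base: /tmp/session
+        *   dumps: /tmp/session/callgrind.out.1234
+        *   control: /tmp/session/callgrind.cmd.1234
+        *   result: /tmp/session/callgrind.res.1234
+        *   cmd: /path/to/myprog --arg
+        */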
VG_(args_for_client).strs[i]); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + } +#endif + VG_(write)(fd, "\n", 1); + VG_(close)(fd); + } + + command_inited = True; +} + +void CLG_(finish_command)() +{ + /* unlink info file */ + if (info_file) VG_(unlink)(info_file); +} + + +static Int createRes(Int fd) +{ + SysRes res; + + if (fd > -2) return fd; + + /* fd == -2: No error, but we need to create the file */ + res = VG_(open)(result_file, + VKI_O_CREAT|VKI_O_WRONLY|VKI_O_TRUNC, + VKI_S_IRUSR|VKI_S_IWUSR); + + /* VG_(open) can return any negative number on error. Remap errors to -1, + * to not confuse it with our special value -2 + */ + if (res.isError) fd = -1; + else fd = (Int) res.val; + + return fd; +} + +/* Run Info: Fixed information for a callgrind run */ +static Int dump_info(Int fd) +{ + Char* buf = outbuf; + int i; + + if ( (fd = createRes(fd)) <0) return fd; + + /* version */ + VG_(sprintf)(buf, "version: " VERSION "\n"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + /* "pid:" line */ + VG_(sprintf)(buf, "pid: %d\n", VG_(getpid)()); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + /* "base:" line */ + VG_(sprintf)(buf, "base: %s\n", dump_base); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + /* "cmd:" line */ + VG_(strcpy)(buf, "cmd:"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); +#if VG_CORE_INTERFACE_VERSION < 9 + for (i = 0; i < VG_(client_argc); i++) { + if (!VG_(client_argv[i])) continue; + VG_(sprintf)(buf, " %s", VG_(client_argv[i])); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + } +#else + VG_(sprintf)(buf, " %s", VG_(args_the_exename)); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + for (i = 0; i < VG_(args_for_client).used; i++) { + if (!VG_(args_for_client).strs[i]) continue; + VG_(sprintf)(buf, " %s", VG_(args_for_client).strs[i]); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + } +#endif + + return fd; +} + + +/* Helper for dump_state */ + +Int dump_fd; + +void static dump_state_of_thread(thread_info* ti) +{ + Char* buf = outbuf; + int t = CLG_(current_tid); + Int p, i; + static FullCost sum = 0, tmp = 0; + BBCC *from, *to; + call_entry* ce; + + p = VG_(sprintf)(buf, "events-%d: ", t); + CLG_(init_cost_lz)( CLG_(sets).full, &sum ); + CLG_(copy_cost_lz)( CLG_(sets).full, &tmp, ti->lastdump_cost ); + CLG_(add_diff_cost)( CLG_(sets).full, sum, + ti->lastdump_cost, + ti->states.entry[0]->cost); + CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost, tmp ); + p += CLG_(sprint_mappingcost)(buf + p, CLG_(dumpmap), sum); + p += VG_(sprintf)(buf+p, "\n"); + VG_(write)(dump_fd, (void*)buf, p); + + p = VG_(sprintf)(buf, "frames-%d: %d\n", t, + CLG_(current_call_stack).sp); + VG_(write)(dump_fd, (void*)buf, p); + ce = 0; + for(i = 0; i < CLG_(current_call_stack).sp; i++) { + ce = CLG_(get_call_entry)(i); + /* if this frame is skipped, we don't have counters */ + if (!ce->jcc) continue; + + from = ce->jcc->from; + p = VG_(sprintf)(buf, "function-%d-%d: %s\n",t, i, + from->cxt->fn[0]->name); + VG_(write)(dump_fd, (void*)buf, p); + + p = VG_(sprintf)(buf, "calls-%d-%d: ",t, i); + p+= VG_(sprintf)(buf+p, "%llu\n", ce->jcc->call_counter); + VG_(write)(dump_fd, (void*)buf, p); + + /* FIXME: EventSets! 
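+     (The snapshot/delta pattern used here and for the events line above:
+      given a running cost vector `now` and the snapshot `last` taken at
+      the previous dump, the delta is added to `sum` and the snapshot
+      advanced. A sketch of the assumed semantics of
+      CLG_(add_diff_cost)(es, sum, last, now) -- not necessarily the
+      exact implementation:
+
+        for (i = 0; i < es->size; i++) {
+          sum[i] += now[i] - last[i];   // accumulate delta
+          last[i] = now[i];             // advance snapshot
+        }
+
+      The copies through `tmp` then restore the old snapshot, so this
+      state query does not disturb the next real dump.)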
*/ + CLG_(copy_cost)( CLG_(sets).full, sum, ce->jcc->cost ); + CLG_(copy_cost)( CLG_(sets).full, tmp, ce->enter_cost ); + CLG_(add_diff_cost)( CLG_(sets).full, sum, + ce->enter_cost, CLG_(current_state).cost ); + CLG_(copy_cost)( CLG_(sets).full, ce->enter_cost, tmp ); + + p = VG_(sprintf)(buf, "events-%d-%d: ",t, i); + p += CLG_(sprint_mappingcost)(buf + p, CLG_(dumpmap), sum ); + p += VG_(sprintf)(buf+p, "\n"); + VG_(write)(dump_fd, (void*)buf, p); + } + if (ce && ce->jcc) { + to = ce->jcc->to; + p = VG_(sprintf)(buf, "function-%d-%d: %s\n",t, i, + to->cxt->fn[0]->name ); + VG_(write)(dump_fd, (void*)buf, p); + } +} + +/* Dump info on current callgrind state */ +static Int dump_state(Int fd) +{ + Char* buf = outbuf; + thread_info** th; + int t, p; + Int orig_tid = CLG_(current_tid); + + if ( (fd = createRes(fd)) <0) return fd; + + VG_(sprintf)(buf, "instrumentation: %s\n", + CLG_(instrument_state) ? "on":"off"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + if (!CLG_(instrument_state)) return fd; + + VG_(sprintf)(buf, "executed-bbs: %llu\n", CLG_(stat).bb_executions); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "executed-calls: %llu\n", CLG_(stat).call_counter); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "distinct-bbs: %d\n", CLG_(stat).distinct_bbs); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "distinct-calls: %d\n", CLG_(stat).distinct_jccs); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "distinct-functions: %d\n", CLG_(stat).distinct_fns); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + VG_(sprintf)(buf, "distinct-contexts: %d\n", CLG_(stat).distinct_contexts); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + /* "events:" line. Given here because it will be dynamic in the future */ + p = VG_(sprintf)(buf, "events: "); + CLG_(sprint_eventmapping)(buf+p, CLG_(dumpmap)); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + VG_(write)(fd, "\n", 1); + + /* "part:" line (number of last part. 
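+     (With the key/value lines emitted above, the whole state report
+     reads, illustratively -- all numbers invented:
+
+       instrumentation: on
+       executed-bbs: 7339567
+       executed-calls: 23011
+       distinct-bbs: 4242
+       distinct-calls: 1532
+       distinct-functions: 643
+       distinct-contexts: 701
+       events: Ir Dr Dw
+       part: 2
+       threads: 1
+
+     Regarding "part":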
Is 0 at start */ + VG_(sprintf)(buf, "\npart: %d\n", CLG_(get_dump_counter)()); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + /* threads */ + th = CLG_(get_threads)(); + p = VG_(sprintf)(buf, "threads:"); + for(t=1;t0) { + cmdPos = cmdBuffer; + } + } + + /* force creation of result file if needed */ + fd = -2; + + while((bytesRead>0) && *cmdPos) { + + /* Calculate pointer for next line */ + cmdNextLine = cmdPos+1; + while((bytesRead>0) && *cmdNextLine && (*cmdNextLine != '\n')) { + cmdNextLine++; + bytesRead--; + } + if ((bytesRead>0) && (*cmdNextLine == '\n')) { + *cmdNextLine = 0; + cmdNextLine++; + bytesRead--; + } + + /* Command with integer option */ + if ((*cmdPos >= '0') && (*cmdPos <='9')) { + int value = *cmdPos-'0'; + cmdPos++; + while((*cmdPos >= '0') && (*cmdPos <='9')) { + value = 10*value + (*cmdPos-'0'); + cmdPos++; + } + while((*cmdPos == ' ') || (*cmdPos == '\t')) cmdPos++; + + switch(*cmdPos) { +#if CLG_ENABLE_DEBUG + /* verbosity */ + case 'V': + case 'v': + CLG_(clo).verbose = value; + break; +#endif + default: + break; + } + + cmdPos = cmdNextLine; + continue; + } + + /* Command with boolean/switch option */ + if ((*cmdPos=='+') || + (*cmdPos=='-')) { + int value = (cmdPos[0] == '+'); + cmdPos++; + while((*cmdPos == ' ') || (*cmdPos == '\t')) cmdPos++; + + switch(*cmdPos) { + case 'I': + case 'i': + CLG_(set_instrument_state)("Command", value); + break; + + default: + break; + } + + cmdPos = cmdNextLine; + continue; + } + + /* regular command */ + switch(*cmdPos) { + case 'D': + case 'd': + /* DUMP */ + + /* skip command */ + while(*cmdPos && (*cmdPos != ' ')) cmdPos++; + if (*cmdPos) + VG_(sprintf)(buf, "Dump Command:%s", cmdPos); + else + VG_(sprintf)(buf, "Dump Command"); + CLG_(dump_profile)(buf, False); + break; + + case 'Z': + case 'z': + CLG_(zero_all_cost)(False); + break; + + case 'K': + case 'k': + /* Kill: Delay to be able to remove command file before. */ + do_kill = 1; + break; + + case 'I': + case 'i': + fd = dump_info(fd); + break; + + case 's': + case 'S': + fd = dump_state(fd); + break; + + case 'O': + case 'o': + /* Options Info */ + if ( (fd = createRes(fd)) <0) break; + + VG_(sprintf)(buf, "\ndesc: Option: --skip-plt=%s\n", + CLG_(clo).skip_plt ? "yes" : "no"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + VG_(sprintf)(buf, "desc: Option: --collect-jumps=%s\n", + CLG_(clo).collect_jumps ? "yes" : "no"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + VG_(sprintf)(buf, "desc: Option: --separate-recs=%d\n", + CLG_(clo).separate_recursions); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + VG_(sprintf)(buf, "desc: Option: --separate-callers=%d\n", + CLG_(clo).separate_callers); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + break; + + default: + break; + } + + cmdPos = cmdNextLine; + } + + /* If command executed, delete command file */ + if (cmdPos) VG_(unlink)(cfile); + if (fd>=0) VG_(close)(fd); + + if (do_kill) { + VG_(message)(Vg_UserMsg, + "Killed because of command from %s", cfile); + CLG_(fini)(0); + VG_(exit)(1); + } +} diff --git a/callgrind/context.c b/callgrind/context.c new file mode 100644 index 0000000000..ade251f791 --- /dev/null +++ b/callgrind/context.c @@ -0,0 +1,328 @@ +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- ct_context.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Callgrind, a Valgrind tool for call tracing. 
+ + Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + + +/*------------------------------------------------------------*/ +/*--- Context operations ---*/ +/*------------------------------------------------------------*/ + +#define N_FNSTACK_INITIAL_ENTRIES 500 +#define N_CXT_INITIAL_ENTRIES 2537 + +fn_stack CLG_(current_fn_stack); + +void CLG_(init_fn_stack)(fn_stack* s) +{ + CLG_ASSERT(s != 0); + + s->size = N_FNSTACK_INITIAL_ENTRIES; + s->bottom = (fn_node**) CLG_MALLOC(s->size * sizeof(fn_node*)); + s->top = s->bottom; + s->bottom[0] = 0; +} + +void CLG_(copy_current_fn_stack)(fn_stack* dst) +{ + CLG_ASSERT(dst != 0); + + dst->size = CLG_(current_fn_stack).size; + dst->bottom = CLG_(current_fn_stack).bottom; + dst->top = CLG_(current_fn_stack).top; +} + +void CLG_(set_current_fn_stack)(fn_stack* s) +{ + CLG_ASSERT(s != 0); + + CLG_(current_fn_stack).size = s->size; + CLG_(current_fn_stack).bottom = s->bottom; + CLG_(current_fn_stack).top = s->top; +} + +static cxt_hash cxts; + +void CLG_(init_cxt_table)() +{ + Int i; + + cxts.size = N_CXT_INITIAL_ENTRIES; + cxts.entries = 0; + cxts.table = (Context**) CLG_MALLOC(cxts.size * sizeof(Context*)); + + for (i = 0; i < cxts.size; i++) + cxts.table[i] = 0; +} + +cxt_hash* CLG_(get_cxt_hash)() +{ + return &cxts; +} + +/* double size of cxt table */ +static void resize_cxt_table(void) +{ + UInt i, new_size, conflicts1 = 0, conflicts2 = 0; + Context **new_table, *curr, *next; + UInt new_idx; + + new_size = 2* cxts.size +3; + new_table = (Context**) CLG_MALLOC(new_size * sizeof(Context*)); + + if (!new_table) return; + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < cxts.size; i++) { + if (cxts.table[i] == NULL) continue; + + curr = cxts.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = (UInt) (curr->hash % new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(cxts.table); + + + CLG_DEBUG(0, "Resize Context Hash: %d => %d (entries %d, conflicts %d/%d)\n", + cxts.size, new_size, + cxts.entries, conflicts1, conflicts2); + + cxts.size = new_size; + cxts.table = new_table; + CLG_(stat).cxt_hash_resizes++; +} + +__inline__ +static UWord cxt_hash_val(fn_node** fn, UInt size) +{ + UWord hash = 0; + UInt count = size; + while(*fn != 0) { + hash = (hash<<7) + (hash>>25) + (UWord)(*fn); + fn--; + count--; + if (count==0) break; + } + return hash; +} + +__inline__ +static Bool is_cxt(UWord hash, fn_node** fn, Context* cxt) +{ + int count; + fn_node** cxt_fn; + + if (hash != cxt->hash) return False; + + count = cxt->size; + cxt_fn = &(cxt->fn[0]); + while((*fn != 0) 
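+        /* (Both cxt_hash_val() above and this comparison walk the
+         * fn_node* array downwards (fn--), combining at most `size`
+         * entries. The hash is a simple shift-add mix,
+         * hash = (hash<<7) + (hash>>25) + ptr, roughly a 7-bit
+         * rotation plus the pointer value; collisions are resolved by
+         * chaining through cxt->next, and each function caches its
+         * most recent context in last_cxt, used as a one-entry LRU in
+         * CLG_(get_cxt)() below.) */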
&& (count>0)) {
+    if (*cxt_fn != *fn) return False;
+    fn--;
+    cxt_fn++;
+    count--;
+  }
+  return True;
+}
+
+/**
+ * Allocate new Context structure
+ */
+static Context* new_cxt(fn_node** fn)
+{
+  Context* new;
+  UInt idx, offset;
+  UWord hash;
+  int size, recs;
+  fn_node* top_fn;
+
+  CLG_ASSERT(fn);
+  top_fn = *fn;
+  if (top_fn == 0) return 0;
+
+  size = top_fn->separate_callers +1;
+  recs = top_fn->separate_recursions;
+  if (recs<1) recs=1;
+
+  /* check fill degree of context hash table and resize if needed (>80%) */
+  cxts.entries++;
+  if (10 * cxts.entries / cxts.size > 8)
+    resize_cxt_table();
+
+  new = (Context*) CLG_MALLOC(sizeof(Context)+sizeof(fn_node*)*size);
+
+  // hash value calculation similar to cxt_hash_val(), but additionally
+  // copying function pointers in one run
+  hash = 0;
+  offset = 0;
+  while(*fn != 0) {
+    hash = (hash<<7) + (hash>>25) + (UWord)(*fn);
+    new->fn[offset] = *fn;
+    offset++;
+    fn--;
+    if (offset >= size) break;
+  }
+  if (offset < size) size = offset;
+
+  new->size        = size;
+  new->base_number = CLG_(stat).context_counter;
+  new->hash        = hash;
+
+  CLG_(stat).context_counter += recs;
+  CLG_(stat).distinct_contexts++;
+
+  /* insert into Context hash table */
+  idx = (UInt) (hash % cxts.size);
+  new->next = cxts.table[idx];
+  cxts.table[idx] = new;
+
+#if CLG_ENABLE_DEBUG
+  CLG_DEBUGIF(3) {
+    VG_(printf)("  new_cxt 0x%p: ", new);
+    CLG_(print_cxt)(12, new, 0);
+  }
+#endif
+
+  return new;
+}
+
+/* get the Context structure for current context */
+Context* CLG_(get_cxt)(fn_node** fn)
+{
+  Context* cxt;
+  UInt size, idx;
+  UWord hash;
+
+  CLG_ASSERT(fn != 0);
+  if (*fn == 0) return 0;
+  size = (*fn)->separate_callers+1;
+  if (size<=0) { size = -size+1; }
+
+  CLG_DEBUG(5, "+ get_cxt(fn '%s'): size %d\n",
+            (*fn)->name, size);
+
+  hash = cxt_hash_val(fn, size);
+
+  if ( ((cxt = (*fn)->last_cxt) != 0) && is_cxt(hash, fn, cxt)) {
+    CLG_DEBUG(5, "- get_cxt: %p\n", cxt);
+    return cxt;
+  }
+
+  CLG_(stat).cxt_lru_misses++;
+
+  idx = (UInt) (hash % cxts.size);
+  cxt = cxts.table[idx];
+
+  while(cxt) {
+    if (is_cxt(hash,fn,cxt)) break;
+    cxt = cxt->next;
+  }
+
+  if (!cxt)
+    cxt = new_cxt(fn);
+
+  (*fn)->last_cxt = cxt;
+
+  CLG_DEBUG(5, "- get_cxt: %p\n", cxt);
+
+  return cxt;
+}
+
+
+/**
+ * Change execution context by calling a new function from current context
+ */
+void CLG_(push_cxt)(fn_node* fn)
+{
+  call_stack* cs = &CLG_(current_call_stack);
+  Int fn_entries;
+
+  /* save old context on stack (even if not changed at all!) */
+  CLG_ASSERT(cs->sp < cs->size);
+  CLG_ASSERT(cs->entry[cs->sp].cxt == 0);
+  cs->entry[cs->sp].cxt = CLG_(current_state).cxt;
+  cs->entry[cs->sp].fn_sp = CLG_(current_fn_stack).top - CLG_(current_fn_stack).bottom;
+
+  if (*(CLG_(current_fn_stack).top) == fn) return;
+  if (fn && (fn->group>0) &&
+      ((*(CLG_(current_fn_stack).top))->group == fn->group)) return;
+
+  /* resizing needed ? */
+  fn_entries = CLG_(current_fn_stack).top - CLG_(current_fn_stack).bottom;
+  if (fn_entries == CLG_(current_fn_stack).size-1) {
+    int new_size = CLG_(current_fn_stack).size *2;
+    fn_node** new = (fn_node**) CLG_MALLOC(new_size * sizeof(fn_node*));
+    int i;
+    for(i=0;i<fn_entries;i++)
+      new[i] = CLG_(current_fn_stack).bottom[i];
+    VG_(free)(CLG_(current_fn_stack).bottom);
+    CLG_(current_fn_stack).top = new + fn_entries;
+    CLG_(current_fn_stack).bottom = new;
+
+    CLG_DEBUG(0, "Resize fn stack: %d => %d (pushing '%s')\n",
+              CLG_(current_fn_stack).size, new_size,
+              fn ?
fn->name : (Char*)"0x0"); + + CLG_(current_fn_stack).size = new_size; + } + + if (*(CLG_(current_fn_stack).top) == 0) { + UInt *pactive; + + /* this is first function: increment its active count */ + CLG_ASSERT(fn != 0); + pactive = CLG_(get_fn_entry)(fn->number); + (*pactive)++; + } + + CLG_(current_fn_stack).top++; + *(CLG_(current_fn_stack).top) = fn; + CLG_(current_state).cxt = CLG_(get_cxt)(CLG_(current_fn_stack).top); + + CLG_DEBUG(5, " push_cxt(fn '%s'): %d\n", + fn ? fn->name : (Char*)"0x0", + CLG_(current_fn_stack).top - CLG_(current_fn_stack).bottom); +} + diff --git a/callgrind/costs.c b/callgrind/costs.c new file mode 100644 index 0000000000..1fa1b6108d --- /dev/null +++ b/callgrind/costs.c @@ -0,0 +1,79 @@ +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- ct_costs.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Callgrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#include + +#define COSTCHUNK_SIZE 100000 + +UInt CLG_(costarray_entries) = 0; +UInt CLG_(costarray_chunks) = 0; +static CostChunk* cost_chunk_base = 0; +static CostChunk* cost_chunk_current = 0; + +ULong* CLG_(get_costarray)(Int size) +{ + ULong* ptr; + + if (!cost_chunk_current || + (cost_chunk_current->size - cost_chunk_current->used < size)) { + CostChunk* cc = (CostChunk*) CLG_MALLOC(sizeof(CostChunk) + + COSTCHUNK_SIZE * sizeof(ULong)); + cc->size = COSTCHUNK_SIZE; + cc->used = 0; + cc->next = 0; + + if (cost_chunk_current) + cost_chunk_current->next = cc; + cost_chunk_current = cc; + + if (!cost_chunk_base) cost_chunk_base = cc; + + CLG_(costarray_chunks)++; + } + + ptr = &(cost_chunk_current->data[cost_chunk_current->used]); + cost_chunk_current->used += size; + + CLG_(costarray_entries) += size; + + return ptr; +} + +void CLG_(free_costarrays)() +{ + CostChunk* cc = cost_chunk_base, *cc_next; + while(cc) { + cc_next = cc->next; + VG_(free)(cc); + cc = cc_next; + } + cost_chunk_base = 0; + cost_chunk_current = 0; +} diff --git a/callgrind/costs.h b/callgrind/costs.h new file mode 100644 index 0000000000..5e5ccfdd0c --- /dev/null +++ b/callgrind/costs.h @@ -0,0 +1,35 @@ +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- ct_costs.h ---*/ +/*--- (C) 2004, Josef Weidendorfer ---*/ +/*--------------------------------------------------------------------*/ + +#ifndef CT_COSTS +#define CT_COSTS + +#include "pub_tool_basics.h" + +#define CLG_(str) VGAPPEND(vgCallgrind_,str) + +extern UInt CLG_(costarray_entries); +extern UInt CLG_(costarray_chunks); + +/* Array of 64bit costs. 
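+ * (The matching allocator in costs.c above is a chunked bump allocator:
+ * CLG_(get_costarray)(n) returns n consecutive ULong slots from the
+ * current chunk and opens a fresh COSTCHUNK_SIZE chunk when the rest is
+ * too small, so a returned array never straddles two chunks.
+ * Illustrative use, assuming an event set with three counters:
+ *
+ *   ULong* cost = CLG_(get_costarray)(3);  // one slot per event
+ *   cost[0] = cost[1] = cost[2] = 0;
+ *
+ * Pointers stay valid until CLG_(free_costarrays)() at termination.)
+ *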
This is separated from other structs
+ * to support a dynamic number of costs for a cost item.
+ * Chunks are allocated on demand, and deallocated at program termination.
+ */
+typedef struct _CostChunk CostChunk;
+struct _CostChunk {
+  Int size;
+  Int used;
+  CostChunk *next, *prev;
+  ULong data[0];
+};
+
+/* Allocate a number of 64bit cost values.
+ * Typically used from ct_events.c */
+ULong* CLG_(get_costarray)(Int size);
+void CLG_(free_costarrays)(void);
+
+
+#endif /* CT_COSTS */
diff --git a/callgrind/debug.c b/callgrind/debug.c
new file mode 100644
index 0000000000..2e3ef608cb
--- /dev/null
+++ b/callgrind/debug.c
@@ -0,0 +1,453 @@
+/*
+   This file is part of Callgrind, a Valgrind skin for call-graph
+   profiling of programs.
+
+   Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+   This skin is derived from and contains a lot of code from Cachegrind
+   Copyright (C) 2002 Nicholas Nethercote (njn25@cam.ac.uk)
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "global.h"
+#include "events.h"
+
+/* If debugging mode is off, dummy functions are provided (see below).
+ */
+#if CLG_ENABLE_DEBUG
+
+/*------------------------------------------------------------*/
+/*--- Debug output helpers                                 ---*/
+/*------------------------------------------------------------*/
+
+static void print_indent(int s)
+{
+  /* max of 40 spaces */
+  char sp[] = "                                        ";
+  if (s>40) s=40;
+  VG_(printf)(sp+40-s);
+}
+
+void CLG_(print_bb)(int s, BB* bb)
+{
+  if (s<0) {
+    s = -s;
+    print_indent(s);
+  }
+
+  VG_(printf)("BB %p (Obj '%s')", bb_addr(bb), bb->obj->name);
+}
+
+static
+void print_mangled_cxt(Context* cxt, int rec_index)
+{
+  int i;
+
+  if (!cxt)
+    VG_(printf)("(none)");
+  else {
+    VG_(printf)("%s", cxt->fn[0]->name);
+    if (rec_index >0)
+      VG_(printf)("'%d", rec_index +1);
+    for(i=1;i<cxt->size;i++)
+      VG_(printf)("'%s", cxt->fn[i]->name);
+  }
+}
+
+
+
+void CLG_(print_cxt)(int s, Context* cxt, int rec_index)
+{
+  if (s<0) {
+    s = -s;
+    print_indent(s);
+  }
+
+  if (cxt) {
+    UInt *pactive = CLG_(get_fn_entry)(cxt->fn[0]->number);
+    CLG_ASSERT(rec_index < cxt->fn[0]->separate_recursions);
+
+    VG_(printf)("Cxt %d", cxt->base_number + rec_index);
+    if (*pactive>0)
+      VG_(printf)(" [active=%d]", *pactive);
+    VG_(printf)(": ");
+    print_mangled_cxt(cxt, rec_index);
+    VG_(printf)("\n");
+  }
+  else
+    VG_(printf)("(no context)\n");
+}
+
+void CLG_(print_execstate)(int s, exec_state* es)
+{
+  if (s<0) {
+    s = -s;
+    print_indent(s);
+  }
+
+  if (!es) {
+    VG_(printf)("ExecState 0x0\n");
+    return;
+  }
+
+  VG_(printf)("ExecState [Sig %d, collect %s, nonskipped %p]: jmps_passed %d\n",
+              es->sig, es->collect?"yes":"no",
+              es->nonskipped, es->jmps_passed);
+}
+
+
+void CLG_(print_bbcc)(int s, BBCC* bbcc, Bool jumpaddr)
+{
+  BB* bb;
+
+  if (s<0) {
+    s = -s;
+    print_indent(s);
+  }
+
+  if (!bbcc) {
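+    /* (All printers here indent through print_indent() above, which
+     * prints the *last* s characters of a 40-character space buffer by
+     * passing sp+40-s to VG_(printf); e.g. s==12 starts 28 characters
+     * in, printing exactly 12 spaces, with s clamped to 40.) */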
VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + CLG_ASSERT(bb!=0); + +#if 0 + if (jumpaddr) + VG_(printf)("%s +%p=%p, ", + bb->obj->name + bb->obj->last_slash_pos, + bb->jmp_offset, bb_jmpaddr(bb)); + else +#endif + VG_(printf)("%s +%p=%p, ", + bb->obj->name + bb->obj->last_slash_pos, + bb->offset, bb_addr(bb)); + CLG_(print_cxt)(s+8, bbcc->cxt, bbcc->rec_index); +} + +void CLG_(print_eventset)(int s, EventSet* es) +{ + int i; + + if (s<0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("(EventSet not set)\n"); + return; + } + + VG_(printf)("%5s (Size/Cap %d/%d): ", + es->name, es->size, es->capacity); + + if (es->size == 0) + VG_(printf)("-"); + else { + for(i=0; i< es->size; i++) { + if (i>0) { + VG_(printf)(" "); + if (es->e[i-1].nextTop == i) + VG_(printf)("| "); + } + VG_(printf)(es->e[i].type->name); + } + } + VG_(printf)("\n"); +} + + +void CLG_(print_cost)(int s, EventSet* es, ULong* c) +{ + Int i, pos; + + if (s<0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("Cost (Nothing, EventSet not set)\n"); + return; + } + if (!c) { + VG_(printf)("Cost (Null, EventSet %s)\n", es->name); + return; + } + + if (es->size == 0) { + VG_(printf)("Cost (Nothing, EventSet %s with len 0)\n", es->name); + return; + } + + pos = s; + pos += VG_(printf)("Cost %s [%p]: %s %llu", es->name, c, es->e[0].type->name, c[0]); + + i = 1; + while(isize) { + if (pos > 70) { + VG_(printf)(",\n"); + print_indent(s+5); + pos = s+5; + } + else + pos += VG_(printf)(", "); + pos += VG_(printf)("%s %llu", es->e[i].type->name, c[i]); + i++; + } + VG_(printf)("\n"); +} + + +void CLG_(print_short_jcc)(jCC* jcc) +{ + if (jcc) + VG_(printf)("%p => %p [%llu/%llu,%llu,%llu]", + bb_jmpaddr(jcc->from->bb), + bb_addr(jcc->to->bb), + jcc->call_counter, + jcc->cost ? jcc->cost[CLG_(sets).off_sim_Ir]:0, + jcc->cost ? jcc->cost[CLG_(sets).off_sim_Dr]:0, + jcc->cost ? jcc->cost[CLG_(sets).off_sim_Dw]:0); + else + VG_(printf)("[Skipped JCC]"); +} + +void CLG_(print_jcc)(int s, jCC* jcc) +{ + if (s<0) { + s = -s; + print_indent(s); + } + + if (!jcc) { + VG_(printf)("JCC to skipped function\n"); + return; + } + VG_(printf)("JCC %p from ", jcc); + CLG_(print_bbcc)(s+9, jcc->from, True); + print_indent(s+4); + VG_(printf)("to "); + CLG_(print_bbcc)(s+9, jcc->to, False); + print_indent(s+4); + VG_(printf)("Calls %llu\n", jcc->call_counter); + print_indent(s+4); + CLG_(print_cost)(s+9, CLG_(sets).full, jcc->cost); +} + +/* dump out the current call stack */ +void CLG_(print_stackentry)(int s, int sp) +{ + call_entry* ce; + + if (s<0) { + s = -s; + print_indent(s); + } + + ce = CLG_(get_call_entry)(sp); + VG_(printf)("[%-2d] SP %p, RA %p", sp, ce->sp, ce->ret_addr); + if (ce->nonskipped) + VG_(printf)(" NonSkipped BB %p / %s", + bb_addr(ce->nonskipped->bb), + ce->nonskipped->cxt->fn[0]->name); + VG_(printf)("\n"); + print_indent(s+5); + CLG_(print_jcc)(5,ce->jcc); +} + +/* debug output */ +#if 0 +static void print_call_stack() +{ + int c; + + VG_(printf)("Call Stack:\n"); + for(c=0;cbb), + (bbcc->bb->sect_kind == Vg_SectText) ? 'T' : + (bbcc->bb->sect_kind == Vg_SectData) ? 'D' : + (bbcc->bb->sect_kind == Vg_SectBSS) ? 'B' : + (bbcc->bb->sect_kind == Vg_SectGOT) ? 'G' : + (bbcc->bb->sect_kind == Vg_SectPLT) ? 
'P' : 'U', + bbcc->cxt->base_number+bbcc->rec_index); + print_mangled_cxt(bbcc->cxt, bbcc->rec_index); + + obj = bbcc->cxt->fn[0]->file->obj; + if (obj->name[0]) + VG_(printf)(" %s", obj->name+obj->last_slash_pos); + + if (VG_(strcmp)(bbcc->cxt->fn[0]->file->name, "???") !=0) { + VG_(printf)(" %s", bbcc->cxt->fn[0]->file->name); + if ((bbcc->cxt->fn[0] == bbcc->bb->fn) && (bbcc->bb->line>0)) + VG_(printf)(":%d", bbcc->bb->line); + } +} + +void CLG_(print_bbcc_cost)(int s, BBCC* bbcc) +{ + BB* bb; + Int i, cjmpNo; + ULong ecounter; + + if (s<0) { + s = -s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + CLG_ASSERT(bb!=0); + + CLG_(print_bbcc)(s, bbcc, False); + + ecounter = bbcc->ecounter_sum; + + print_indent(s+2); + VG_(printf)("ECounter: sum %d ", ecounter); + for(i=0; icjmp_count; i++) { + VG_(printf)("[%d]=%d ", + bb->jmp[i].instr, bbcc->jmp[i].ecounter); + } + VG_(printf)("\n"); + + cjmpNo = 0; + for(i=0; iinstr_count; i++) { + InstrInfo* ii = &(bb->instr[i]); + print_indent(s+2); + VG_(printf)("[%2d] IOff %2d ecnt %3d ", + i, ii->instr_offset, ecounter); + CLG_(print_cost)(s+5, ii->eventset, bbcc->cost + ii->cost_offset); + + /* update execution counter */ + if (cjmpNo < bb->cjmp_count) + if (bb->jmp[cjmpNo].instr == i) { + ecounter -= bbcc->jmp[cjmpNo].ecounter; + cjmpNo++; + } + } +} + + +/* dump out an address with source info if available */ +void CLG_(print_addr)(Addr addr) +{ + Char fl_buf[FILENAME_LEN]; + Char fn_buf[FN_NAME_LEN]; + const UChar* obj_name; + SegInfo* si; + int ln, i=0, opos=0; + + if (addr == 0) { + VG_(printf)("%08x", addr); + return; + } + + CLG_(get_debug_info)(addr, fl_buf, fn_buf, &ln, &si); + + if (VG_(strcmp)(fn_buf,"???")==0) + VG_(printf)("%p", addr); + else + VG_(printf)("%p %s", addr, fn_buf); + + if (si) { + obj_name = VG_(seginfo_filename)(si); + if (obj_name) { + while(obj_name[i]) { + if (obj_name[i]=='/') opos = i+1; + i++; + } + if (obj_name[0]) + VG_(printf)(" %s", obj_name+opos); + } + } + + if (ln>0) + VG_(printf)(" (%s:%u)", fl_buf,ln); +} + +void CLG_(print_addr_ln)(Addr addr) +{ + CLG_(print_addr)(addr); + VG_(printf)("\n"); +} + +static ULong bb_written = 0; + +void CLG_(print_bbno)(void) +{ + if (bb_written != CLG_(stat).bb_executions) { + bb_written = CLG_(stat).bb_executions; + VG_(printf)("BB# %llu\n",CLG_(stat).bb_executions); + } +} + +void CLG_(print_context)(void) +{ + BBCC* bbcc; + + CLG_DEBUG(0,"In tid %d [%d] ", + CLG_(current_tid), CLG_(current_call_stack).sp); + bbcc = CLG_(current_state).bbcc; + print_mangled_cxt(CLG_(current_state).cxt, + bbcc ? 
bbcc->rec_index : 0); + VG_(printf)("\n"); +} + +void* CLG_(malloc)(UWord s, char* f) +{ + CLG_DEBUG(3, "Malloc(%d) in %s.\n", s, f); + return VG_(malloc)(s); +} + +#else /* CLG_ENABLE_DEBUG */ + +void CLG_(print_bbno)(void) {} +void CLG_(print_context)(void) {} +void CLG_(print_jcc)(int s, jCC* jcc) {} +void CLG_(print_bbcc)(int s, BBCC* bbcc, Bool b) {} +void CLG_(print_bbcc_fn)(BBCC* bbcc) {} +void CLG_(print_cost)(int s, EventSet* es, ULong* cost) {} +void CLG_(print_bb)(int s, BB* bb) {} +void CLG_(print_cxt)(int s, Context* cxt, int rec_index) {} +void CLG_(print_short_jcc)(jCC* jcc) {} +void CLG_(print_stackentry)(int s, int sp) {} +void CLG_(print_addr)(Addr addr) {} +void CLG_(print_addr_ln)(Addr addr) {} + +#endif diff --git a/callgrind/docs/Makefile.am b/callgrind/docs/Makefile.am new file mode 100644 index 0000000000..d539a6ecd5 --- /dev/null +++ b/callgrind/docs/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = diff --git a/callgrind/dump.c b/callgrind/dump.c new file mode 100644 index 0000000000..3f13aea132 --- /dev/null +++ b/callgrind/dump.c @@ -0,0 +1,1715 @@ +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- dump.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Callgrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "config.h" +#include "global.h" + +#include +#include + +/*------------------------------------------------------------*/ +/*--- Support for signal handlers and multi-threading ---*/ +/*------------------------------------------------------------*/ + +/* Dump Part Counter */ +static Int out_counter = 0; + +static Char* dump_file_base = 0; +static Char* base_directory = 0; + +/* Command */ +static Char cmdbuf[BUF_LEN]; + +/* Total reads/writes/misses sum over all dumps and threads. + * Updated during CC traversal at dump time. 
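+ *
+ * (On output buffering: my_fwrite() below coalesces the many small
+ * VG_(write) requests of the dump into FWRITE_BUFSIZE-sized batches.
+ * It flushes whenever the target fd changes, passes requests larger
+ * than FWRITE_THROUGH straight through after a flush, and otherwise
+ * appends to the buffer, flushing first if the data would not fit.)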
+ */ +FullCost CLG_(total_cost) = 0; +static FullCost dump_total_cost = 0; + +EventMapping* CLG_(dumpmap) = 0; + +/* Temporary output buffer for + * print_fn_pos, fprint_apos, fprint_fcost, fprint_jcc, + * fprint_fcc_ln, dump_run_info, dump_state_info + */ +static Char outbuf[FILENAME_LEN + FN_NAME_LEN + OBJ_NAME_LEN]; + +Int CLG_(get_dump_counter)(void) +{ + return out_counter; +} + +Char* CLG_(get_dump_file_base)() +{ + return dump_file_base; +} + +/*------------------------------------------------------------*/ +/*--- Output file related stuff ---*/ +/*------------------------------------------------------------*/ + +/* Boolean dumping array */ +static Bool* dump_array = 0; +static Int dump_array_size = 0; +static Bool* obj_dumped = 0; +static Bool* file_dumped = 0; +static Bool* fn_dumped = 0; +static Bool* cxt_dumped = 0; + +static +void reset_dump_array(void) +{ + int i; + + CLG_ASSERT(dump_array != 0); + + for(i=0;ifile = 0; + p->fn = 0; + p->obj = 0; + p->cxt = 0; + p->rec_index = 0; +} + + +#if 0 +static __inline__ +static void my_fwrite(Int fd, Char* buf, Int len) +{ + VG_(write)(fd, (void*)buf, len); +} +#else + +#define FWRITE_BUFSIZE 32000 +#define FWRITE_THROUGH 10000 +static Char fwrite_buf[FWRITE_BUFSIZE]; +static Int fwrite_pos; +static Int fwrite_fd = -1; + +static __inline__ +void fwrite_flush(void) +{ + if ((fwrite_fd>=0) && (fwrite_pos>0)) + VG_(write)(fwrite_fd, (void*)fwrite_buf, fwrite_pos); + fwrite_pos = 0; +} + +static void my_fwrite(Int fd, Char* buf, Int len) +{ + if (fwrite_fd != fd) { + fwrite_flush(); + fwrite_fd = fd; + } + if (len > FWRITE_THROUGH) { + fwrite_flush(); + VG_(write)(fd, (void*)buf, len); + return; + } + if (FWRITE_BUFSIZE - fwrite_pos <= len) fwrite_flush(); + VG_(strncpy)(fwrite_buf + fwrite_pos, buf, len); + fwrite_pos += len; +} +#endif + + +static void print_obj(Char* buf, obj_node* obj) +{ + int n; + + if (CLG_(clo).compress_strings) { + CLG_ASSERT(obj_dumped != 0); + if (obj_dumped[obj->number]) + n = VG_(sprintf)(buf, "(%d)\n", obj->number); + else { + n = VG_(sprintf)(buf, "(%d) %s\n", + obj->number, obj->name); + } + } + else + n = VG_(sprintf)(buf, "%s\n", obj->name); + +#if 0 + /* add mapping parameters the first time a object is dumped + * format: mp=0xSTART SIZE 0xOFFSET */ + if (!obj_dumped[obj->number]) { + obj_dumped[obj->number]; + VG_(sprintf)(buf+n, "mp=%p %p %p\n", + pos->obj->start, pos->obj->size, pos->obj->offset); + } +#else + obj_dumped[obj->number] = True; +#endif +} + +static void print_file(Char* buf, file_node* file) +{ + if (CLG_(clo).compress_strings) { + CLG_ASSERT(file_dumped != 0); + if (file_dumped[file->number]) + VG_(sprintf)(buf, "(%d)\n", file->number); + else { + VG_(sprintf)(buf, "(%d) %s\n", + file->number, file->name); + file_dumped[file->number] = True; + } + } + else + VG_(sprintf)(buf, "%s\n", file->name); +} + +/* + * tag can be "fn", "cfn", "jfn" + */ +static void print_fn(Int fd, Char* buf, Char* tag, fn_node* fn) +{ + int p; + p = VG_(sprintf)(buf, "%s=",tag); + if (CLG_(clo).compress_strings) { + CLG_ASSERT(fn_dumped != 0); + if (fn_dumped[fn->number]) + p += VG_(sprintf)(buf+p, "(%d)\n", fn->number); + else { + p += VG_(sprintf)(buf+p, "(%d) %s\n", + fn->number, fn->name); + fn_dumped[fn->number] = True; + } + } + else + p += VG_(sprintf)(buf+p, "%s\n", fn->name); + + my_fwrite(fd, buf, p); +} + +static void print_mangled_fn(Int fd, Char* buf, Char* tag, + Context* cxt, int rec_index) +{ + int p, i; + + if (CLG_(clo).compress_strings && CLG_(clo).compress_mangled) { + + int n; + Context* 
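+    /* (String compression in the dump format: the first occurrence of
+     * a name is written as, illustratively, "fn=(24) main", any later
+     * occurrence as just "fn=(24)". Mangled context names are chained
+     * from already-dumped pure-context numbers in the same way, e.g.
+     * "cfn=(9) (5)'(3)" -- all numbers invented for illustration.) */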
last; + + CLG_ASSERT(cxt_dumped != 0); + if (cxt_dumped[cxt->base_number+rec_index]) { + p = VG_(sprintf)(buf, "%s=(%d)\n", + tag, cxt->base_number + rec_index); + my_fwrite(fd, buf, p); + return; + } + + last = 0; + /* make sure that for all context parts compressed data is written */ + for(i=cxt->size;i>0;i--) { + CLG_ASSERT(cxt->fn[i-1]->pure_cxt != 0); + n = cxt->fn[i-1]->pure_cxt->base_number; + if (cxt_dumped[n]) continue; + p = VG_(sprintf)(buf, "%s=(%d) %s\n", + tag, n, cxt->fn[i-1]->name); + my_fwrite(fd, buf, p); + + cxt_dumped[n] = True; + last = cxt->fn[i-1]->pure_cxt; + } + /* If the last context was the context to print, we are finished */ + if ((last == cxt) && (rec_index == 0)) return; + + p = VG_(sprintf)(buf, "%s=(%d) (%d)", tag, + cxt->base_number + rec_index, + cxt->fn[0]->pure_cxt->base_number); + if (rec_index >0) + p += VG_(sprintf)(buf+p, "'%d", rec_index +1); + for(i=1;isize;i++) + p += VG_(sprintf)(buf+p, "'(%d)", + cxt->fn[i]->pure_cxt->base_number); + p += VG_(sprintf)(buf+p, "\n"); + my_fwrite(fd, buf, p); + + cxt_dumped[cxt->base_number+rec_index] = True; + return; + } + + + p = VG_(sprintf)(buf, "%s=", tag); + if (CLG_(clo).compress_strings) { + CLG_ASSERT(cxt_dumped != 0); + if (cxt_dumped[cxt->base_number+rec_index]) { + p += VG_(sprintf)(buf+p, "(%d)\n", cxt->base_number + rec_index); + my_fwrite(fd, buf, p); + return; + } + else { + p += VG_(sprintf)(buf+p, "(%d) ", cxt->base_number + rec_index); + cxt_dumped[cxt->base_number+rec_index] = True; + } + } + + p += VG_(sprintf)(buf+p, "%s", cxt->fn[0]->name); + if (rec_index >0) + p += VG_(sprintf)(buf+p, "'%d", rec_index +1); + for(i=1;isize;i++) + p += VG_(sprintf)(buf+p, "'%s", cxt->fn[i]->name); + + p += VG_(sprintf)(buf+p, "\n"); + my_fwrite(fd, buf, p); +} + + + +/** + * Print function position of the BBCC, but only print info differing to + * the position, update + * Return True if something changes. + */ +static Bool print_fn_pos(int fd, FnPos* last, BBCC* bbcc) +{ + Bool res = False; + + CLG_DEBUGIF(3) { + CLG_DEBUG(2, "+ print_fn_pos: "); + CLG_(print_cxt)(16, bbcc->cxt, bbcc->rec_index); + } + + if (!CLG_(clo).mangle_names) { + if (last->rec_index != bbcc->rec_index) { + VG_(sprintf)(outbuf, "rec=%d\n\n", bbcc->rec_index); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + last->rec_index = bbcc->rec_index; + last->cxt = 0; /* reprint context */ + res = True; + } + + if (last->cxt != bbcc->cxt) { + fn_node* last_from = (last->cxt && last->cxt->size>1) ? + last->cxt->fn[1] : 0; + fn_node* curr_from = (bbcc->cxt && bbcc->cxt->size>1) ? 
+ bbcc->cxt->fn[1] : 0; + if (curr_from == 0) { + if (last_from != 0) { + /* switch back to no context */ + VG_(sprintf)(outbuf, "frfn=(spontaneous)\n"); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + res = True; + } + } + else if (last_from != curr_from) { + print_fn(fd,outbuf,"frfn", curr_from); + res = True; + } + last->cxt = bbcc->cxt; + } + } + + if (last->obj != bbcc->cxt->fn[0]->file->obj) { + VG_(sprintf)(outbuf, "ob="); + print_obj(outbuf+3, bbcc->cxt->fn[0]->file->obj); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + last->obj = bbcc->cxt->fn[0]->file->obj; + res = True; + } + + if (last->file != bbcc->cxt->fn[0]->file) { + VG_(sprintf)(outbuf, "fl="); + print_file(outbuf+3, bbcc->cxt->fn[0]->file); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + last->file = bbcc->cxt->fn[0]->file; + res = True; + } + + if (!CLG_(clo).mangle_names) { + if (last->fn != bbcc->cxt->fn[0]) { + print_fn(fd,outbuf, "fn", bbcc->cxt->fn[0]); + last->fn = bbcc->cxt->fn[0]; + res = True; + } + } + else { + /* Print mangled name if context or rec_index changes */ + if ((last->rec_index != bbcc->rec_index) || + (last->cxt != bbcc->cxt)) { + + print_mangled_fn(fd, outbuf, "fn", bbcc->cxt, bbcc->rec_index); + last->fn = bbcc->cxt->fn[0]; + last->rec_index = bbcc->rec_index; + res = True; + } + } + + last->cxt = bbcc->cxt; + + CLG_DEBUG(2, "- print_fn_pos: %s\n", res ? "changed" : ""); + + return res; +} + +/* the debug lookup cache is useful if BBCC for same BB are + * dumped directly in a row. This is a direct mapped cache. + */ +#define DEBUG_CACHE_SIZE 1777 + +static Addr debug_cache_addr[DEBUG_CACHE_SIZE]; +static file_node* debug_cache_file[DEBUG_CACHE_SIZE]; +static int debug_cache_line[DEBUG_CACHE_SIZE]; +static Bool debug_cache_info[DEBUG_CACHE_SIZE]; + +static __inline__ +void init_debug_cache(void) +{ + int i; + for(i=0;iline = debug_cache_line[cachepos]; + p->file = debug_cache_file[cachepos]; + res = debug_cache_info[cachepos]; + } + else { + res = VG_(get_filename_linenum)(addr, + file, FILENAME_LEN, + NULL, 0, NULL, //FIXME + &(p->line)); + if (!res) { + VG_(strcpy)(file, "???"); + p->line = 0; + } + p->file = CLG_(get_file_node)(bbcc->bb->obj, file); + + debug_cache_info[cachepos] = res; + debug_cache_addr[cachepos] = addr; + debug_cache_line[cachepos] = p->line; + debug_cache_file[cachepos] = p->file; + } + + /* Address offset from bbcc start address */ + p->addr = addr - bbcc->bb->obj->offset; + p->bb_addr = bbcc->bb->offset; + + CLG_DEBUG(3, " get_debug_pos(%p): BB %p, fn '%s', file '%s', line %u\n", + addr, bb_addr(bbcc->bb), bbcc->cxt->fn[0]->name, + p->file->name, p->line); + + return res; +} + + +/* copy file position and init cost */ +static void init_apos(AddrPos* p, Addr addr, Addr bbaddr, file_node* file) +{ + p->addr = addr; + p->bb_addr = bbaddr; + p->file = file; + p->line = 0; +} + +static void copy_apos(AddrPos* dst, AddrPos* src) +{ + dst->addr = src->addr; + dst->bb_addr = src->bb_addr; + dst->file = src->file; + dst->line = src->line; +} + +/* copy file position and init cost */ +static void init_fcost(AddrCost* c, Addr addr, Addr bbaddr, file_node* file) +{ + init_apos( &(c->p), addr, bbaddr, file); + /* FIXME: This is a memory leak as a AddrCost is inited multiple times */ + c->cost = CLG_(get_eventset_cost)( CLG_(sets).full ); + CLG_(init_cost)( CLG_(sets).full, c->cost ); +} + + +/** + * print position change inside of a BB (last -> curr) + * this doesn't update last to curr! 
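+ * (Positions are written relative when --compress-pos is active; see
+ * fprint_pos() below. E.g. source lines 610, 612, 612, 615 on
+ * consecutive cost lines come out as "610", "+2", "*", "+3", where
+ * "*" means "same as before".)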
+ */ +static void fprint_apos(Int fd, AddrPos* curr, AddrPos* last, file_node* func_file) +{ + CLG_ASSERT(curr->file != 0); + CLG_DEBUG(2, " print_apos(file '%s', line %d, bb %p, addr %p) fnFile '%s'\n", + curr->file->name, curr->line, curr->bb_addr, curr->addr, + func_file->name); + + if (curr->file != last->file) { + + /* if we switch back to orig file, use fe=... */ + if (curr->file == func_file) + VG_(sprintf)(outbuf, "fe="); + else + VG_(sprintf)(outbuf, "fi="); + print_file(outbuf+3, curr->file); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + } + + if (CLG_(clo).dump_bbs) { + if (curr->line != last->line) { + VG_(sprintf)(outbuf, "ln=%d\n", curr->line); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + } + } +} + + + +/** + * Print a position. + * This prints out differences if allowed + * + * This doesn't set last to curr afterwards! + */ +static +void fprint_pos(Int fd, AddrPos* curr, AddrPos* last) +{ + if (0) //CLG_(clo).dump_bbs) + VG_(sprintf)(outbuf, "%u ", curr->addr - curr->bb_addr); + else { + int p = 0; + if (CLG_(clo).dump_instr) { + int diff = curr->addr - last->addr; + if ( CLG_(clo).compress_pos && (last->addr >0) && + (diff > -100) && (diff < 100)) { + if (diff >0) + p = VG_(sprintf)(outbuf, "+%d ", diff); + else if (diff==0) + p = VG_(sprintf)(outbuf, "* "); + else + p = VG_(sprintf)(outbuf, "%d ", diff); + } + else + p = VG_(sprintf)(outbuf, "%p ", curr->addr); + } + + if (CLG_(clo).dump_bb) { + int diff = curr->bb_addr - last->bb_addr; + if ( CLG_(clo).compress_pos && (last->bb_addr >0) && + (diff > -100) && (diff < 100)) { + if (diff >0) + p += VG_(sprintf)(outbuf+p, "+%d ", diff); + else if (diff==0) + p += VG_(sprintf)(outbuf+p, "* "); + else + p += VG_(sprintf)(outbuf+p, "%d ", diff); + } + else + p += VG_(sprintf)(outbuf+p, "%p ", curr->bb_addr); + } + + if (CLG_(clo).dump_line) { + int diff = curr->line - last->line; + if ( CLG_(clo).compress_pos && (last->line >0) && + (diff > -100) && (diff < 100)) { + + if (diff >0) + VG_(sprintf)(outbuf+p, "+%d ", diff); + else if (diff==0) + VG_(sprintf)(outbuf+p, "* "); + else + VG_(sprintf)(outbuf+p, "%d ", diff); + } + else + VG_(sprintf)(outbuf+p, "%u ", curr->line); + } + } + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); +} + + +/** + * Print events. + */ + +static +void fprint_cost(int fd, EventMapping* es, ULong* cost) +{ + int p = CLG_(sprint_mappingcost)(outbuf, es, cost); + VG_(sprintf)(outbuf+p, "\n"); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + return; +} + + + +/* Write the cost of a source line; only that parts of the source + * position are written that changed relative to last written position. + * funcPos is the source position of the first line of actual function. + * Something is written only if cost != 0; returns True in this case. 
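+ * (Illustratively, with events Ir Dr Dw and only --dump-line active,
+ * a run of cost lines looks like
+ *
+ *   fl=(2) foo.c
+ *   fn=(5) compute
+ *   12 100 4 2
+ *   +1 20 1 0
+ *
+ * i.e. per line the -- possibly relative -- source position, then one
+ * count per event in dump order; names and numbers are invented.)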
+ */ +static void fprint_fcost(Int fd, AddrCost* c, AddrPos* last) +{ + CLG_DEBUGIF(3) { + CLG_DEBUG(2, " print_fcost(file '%s', line %d, bb %p, addr %p):\n", + c->p.file->name, c->p.line, c->p.bb_addr, c->p.addr); + CLG_(print_cost)(-5, CLG_(sets).full, c->cost); + } + + fprint_pos(fd, &(c->p), last); + copy_apos( last, &(c->p) ); /* update last to current position */ + + fprint_cost(fd, CLG_(dumpmap), c->cost); + + /* add cost to total */ + CLG_(add_and_zero_cost)( CLG_(sets).full, dump_total_cost, c->cost ); +} + + +/* Write out the calls from jcc (at pos) + */ +static void fprint_jcc(Int fd, jCC* jcc, AddrPos* curr, AddrPos* last, ULong ecounter) +{ + static AddrPos target; + file_node* file; + obj_node* obj; + + CLG_DEBUGIF(2) { + CLG_DEBUG(2, " fprint_jcc (jkind %d)\n", jcc->jmpkind); + CLG_(print_jcc)(-10, jcc); + } + + if (!get_debug_pos(jcc->to, bb_addr(jcc->to->bb), &target)) { + /* if we don't have debug info, don't switch to file "???" */ + target.file = last->file; + } + + if (jcc->from && + (jcc->jmpkind == JmpCond || jcc->jmpkind == Ijk_Boring)) { + + /* this is a JCC for a followed conditional or boring jump. */ + CLG_ASSERT(CLG_(is_zero_cost)( CLG_(sets).full, jcc->cost)); + + /* objects among jumps should be the same. + * Otherwise this jump would have been changed to a call + * (see setup_bbcc) + */ + CLG_ASSERT(jcc->from->bb->obj == jcc->to->bb->obj); + + /* only print if target position info is usefull */ + if (!CLG_(clo).dump_instr && !CLG_(clo).dump_bb && target.line==0) { + jcc->call_counter = 0; + return; + } + + /* Different files/functions are possible e.g. with longjmp's + * which change the stack, and thus context + */ + if (last->file != target.file) { + VG_(sprintf)(outbuf, "jfi="); + print_file(outbuf+4, target.file); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + } + + if (jcc->from->cxt != jcc->to->cxt) { + if (CLG_(clo).mangle_names) + print_mangled_fn(fd, outbuf, "jfn", + jcc->to->cxt, jcc->to->rec_index); + else + print_fn(fd, outbuf, "jfn", jcc->to->cxt->fn[0]); + } + + if (jcc->jmpkind == JmpCond) { + /* format: jcnd=/ */ + VG_(sprintf)(outbuf, "jcnd=%llu/%llu ", + jcc->call_counter, ecounter); + } + else { + /* format: jump= */ + VG_(sprintf)(outbuf, "jump=%llu ", + jcc->call_counter); + } + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + + fprint_pos(fd, &target, last); + my_fwrite(fd, "\n", 1); + fprint_pos(fd, curr, last); + my_fwrite(fd, "\n", 1); + + jcc->call_counter = 0; + return; + } + + CLG_ASSERT(jcc->to !=0); + + file = jcc->to->cxt->fn[0]->file; + obj = jcc->to->bb->obj; + + /* object of called position different to object of this function?*/ + if (jcc->from->cxt->fn[0]->file->obj != obj) { + VG_(sprintf)(outbuf, "cob="); + print_obj(outbuf+4, obj); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + } + + /* file of called position different to current file? 
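+   * (A complete call record as emitted below thus reads, with invented
+   * names and numbers:
+   *
+   *     cob=(3) /usr/lib/libc.so
+   *     cfi=(7) printf.c
+   *     cfn=(9) printf
+   *     calls=12 42
+   *     310 1200 340 180
+   *
+   * i.e. callee object/file/function, "calls=<count> <target pos>",
+   * then the call site's position followed by the inclusive cost.)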
*/ + if (last->file != file) { + VG_(sprintf)(outbuf, "cfi="); + print_file(outbuf+4, file); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + } + + if (CLG_(clo).mangle_names) + print_mangled_fn(fd, outbuf, "cfn", jcc->to->cxt, jcc->to->rec_index); + else + print_fn(fd, outbuf, "cfn", jcc->to->cxt->fn[0]); + + if (!CLG_(is_zero_cost)( CLG_(sets).full, jcc->cost)) { + VG_(sprintf)(outbuf, "calls=%llu ", + jcc->call_counter); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); + + fprint_pos(fd, &target, last); + my_fwrite(fd, "\n", 1); + fprint_pos(fd, curr, last); + fprint_cost(fd, CLG_(dumpmap), jcc->cost); + + CLG_(init_cost)( CLG_(sets).full, jcc->cost ); + + jcc->call_counter = 0; + } +} + + + +/* Cost summation of functions.We use alternately ccSum[0/1], thus + * ssSum[currSum] for recently read lines with same line number. + */ +static AddrCost ccSum[2]; +static int currSum; + +/* + * Print all costs of a BBCC: + * - FCCs of instructions + * - JCCs of the unique jump of this BB + * returns True if something was written + */ +static Bool fprint_bbcc(Int fd, BBCC* bbcc, AddrPos* last) +{ + InstrInfo* instr_info; + ULong ecounter; + Bool something_written = False; + jCC* jcc; + AddrCost *currCost, *newCost; + Int jcc_count = 0, instr, i, jmp; + BB* bb = bbcc->bb; + + CLG_ASSERT(bbcc->cxt != 0); + CLG_DEBUGIF(1) { + VG_(printf)("+ fprint_bbcc (Instr %d): ", bb->instr_count); + CLG_(print_bbcc)(15, bbcc, False); + } + + CLG_ASSERT(currSum == 0 || currSum == 1); + currCost = &(ccSum[currSum]); + newCost = &(ccSum[1-currSum]); + + ecounter = bbcc->ecounter_sum; + jmp = 0; + instr_info = &(bb->instr[0]); + for(instr=0; instrinstr_count; instr++, instr_info++) { + + /* get debug info of current instruction address and dump cost + * if CLG_(clo).dump_bbs or file/line has changed + */ + if (!get_debug_pos(bbcc, bb_addr(bb) + instr_info->instr_offset, + &(newCost->p))) { + /* if we don't have debug info, don't switch to file "???" 
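+      (Line aggregation uses the two ccSum buffers: costs of successive
+       instructions are summed into currCost while they map to the same
+       source line; on a line or file change currCost is flushed and the
+       roles of ccSum[0]/ccSum[1] swap. So, with nonzero costs, three
+       instructions on line 10 followed by one on line 11 produce
+       exactly two cost lines.)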
*/ + newCost->p.file = bbcc->cxt->fn[0]->file; + } + + if (CLG_(clo).dump_bbs || CLG_(clo).dump_instr || + (newCost->p.line != currCost->p.line) || + (newCost->p.file != currCost->p.file)) { + + if (!CLG_(is_zero_cost)( CLG_(sets).full, currCost->cost )) { + something_written = True; + + fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file); + fprint_fcost(fd, currCost, last); + } + + /* switch buffers */ + currSum = 1 - currSum; + currCost = &(ccSum[currSum]); + newCost = &(ccSum[1-currSum]); + } + + /* add line cost to current cost sum */ + (*CLG_(cachesim).add_icost)(currCost->cost, bbcc, instr_info, ecounter); + + /* print jcc's if there are: only jumps */ + if (bb->jmp[jmp].instr == instr) { + jcc_count=0; + for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) + if ((jcc->jmpkind != Ijk_Call) && (jcc->call_counter >0)) + jcc_count++; + + if (jcc_count>0) { + if (!CLG_(is_zero_cost)( CLG_(sets).full, currCost->cost )) { + /* no need to switch buffers, as position is the same */ + fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file); + fprint_fcost(fd, currCost, last); + } + get_debug_pos(bbcc, bb_addr(bb)+instr_info->instr_offset, &(currCost->p)); + fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file); + something_written = True; + for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { + if ((jcc->jmpkind != Ijk_Call) && (jcc->call_counter >0)) + fprint_jcc(fd, jcc, &(currCost->p), last, ecounter); + } + } + } + + /* update execution counter */ + if (jmp < bb->cjmp_count) + if (bb->jmp[jmp].instr == instr) { + ecounter -= bbcc->jmp[jmp].ecounter; + jmp++; + } + } + + /* jCCs at end? If yes, dump cumulated line info first */ + jcc_count = 0; + for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { + /* yes, if JCC only counts jmp arcs or cost >0 */ + if ( ((jcc->jmpkind != Ijk_Call) && (jcc->call_counter >0)) || + (!CLG_(is_zero_cost)( CLG_(sets).full, jcc->cost ))) + jcc_count++; + } + + if ( (bbcc->skipped && + !CLG_(is_zero_cost)(CLG_(sets).full, bbcc->skipped)) || + (jcc_count>0) ) { + + if (!CLG_(is_zero_cost)( CLG_(sets).full, currCost->cost )) { + /* no need to switch buffers, as position is the same */ + fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file); + fprint_fcost(fd, currCost, last); + } + + get_debug_pos(bbcc, bb_jmpaddr(bb), &(currCost->p)); + fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file); + something_written = True; + + /* first, print skipped costs for calls */ + if (bbcc->skipped && !CLG_(is_zero_cost)( CLG_(sets).full, + bbcc->skipped )) { + CLG_(add_and_zero_cost)( CLG_(sets).full, + currCost->cost, bbcc->skipped ); +#if 0 + VG_(sprintf)(outbuf, "# Skipped\n"); + my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf)); +#endif + fprint_fcost(fd, currCost, last); + } + + if (jcc_count > 0) + for(jcc=bbcc->jmp[jmp].jcc_list; jcc; jcc=jcc->next_from) { + CLG_ASSERT(jcc->jmp == jmp); + if ( ((jcc->jmpkind != Ijk_Call) && (jcc->call_counter >0)) || + (!CLG_(is_zero_cost)( CLG_(sets).full, jcc->cost ))) + + fprint_jcc(fd, jcc, &(currCost->p), last, ecounter); + } + } + + if (CLG_(clo).dump_bbs || CLG_(clo).dump_bb) { + if (!CLG_(is_zero_cost)( CLG_(sets).full, currCost->cost )) { + something_written = True; + + fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file); + fprint_fcost(fd, currCost, last); + } + if (CLG_(clo).dump_bbs) my_fwrite(fd, (void*)"\n", 1); + + /* when every cost was immediatly written, we must have done so, + * as this function is only called when there's cost in a BBCC + */ + 
+      CLG_ASSERT(something_written);
+   }
+
+   bbcc->ecounter_sum = 0;
+   for(i=0; i<=bbcc->bb->cjmp_count; i++)
+      bbcc->jmp[i].ecounter = 0;
+   bbcc->ret_counter = 0;
+
+   CLG_DEBUG(1, "- fprint_bbcc: JCCs %d\n", jcc_count);
+
+   return something_written;
+}
+
+/* order by
+ *  recursion,
+ *  from->bb->obj, from->bb->fn
+ *  obj, fn[0]->file, fn
+ *  address
+ */
+static int my_cmp(BBCC** pbbcc1, BBCC** pbbcc2)
+{
+#if 0
+   return (*pbbcc1)->bb->offset - (*pbbcc2)->bb->offset;
+#else
+   BBCC *bbcc1 = *pbbcc1;
+   BBCC *bbcc2 = *pbbcc2;
+   Context* cxt1 = bbcc1->cxt;
+   Context* cxt2 = bbcc2->cxt;
+   int off = 1;
+
+   if (cxt1->fn[0]->file->obj != cxt2->fn[0]->file->obj)
+      return cxt1->fn[0]->file->obj - cxt2->fn[0]->file->obj;
+
+   if (cxt1->fn[0]->file != cxt2->fn[0]->file)
+      return cxt1->fn[0]->file - cxt2->fn[0]->file;
+
+   if (cxt1->fn[0] != cxt2->fn[0])
+      return cxt1->fn[0] - cxt2->fn[0];
+
+   if (bbcc1->rec_index != bbcc2->rec_index)
+      return bbcc1->rec_index - bbcc2->rec_index;
+
+   while((off < cxt1->size) && (off < cxt2->size)) {
+      fn_node* ffn1 = cxt1->fn[off];
+      fn_node* ffn2 = cxt2->fn[off];
+      if (ffn1->file->obj != ffn2->file->obj)
+         return ffn1->file->obj - ffn2->file->obj;
+      if (ffn1 != ffn2)
+         return ffn1 - ffn2;
+      off++;
+   }
+   if (cxt1->size > cxt2->size) return 1;
+   else if (cxt1->size < cxt2->size) return -1;
+
+   return bbcc1->bb->offset - bbcc2->bb->offset;
+#endif
+}
+
+
+
+
+
+/* modified version of:
+ *
+ * qsort -- qsort interface implemented by faster quicksort.
+ * J. L. Bentley and M. D. McIlroy, SPE 23 (1993) 1249-1265.
+ * Copyright 1993, John Wiley.
+ */
+
+static __inline__
+void swapfunc(BBCC** a, BBCC** b, int n)
+{
+   while(n>0) {
+      BBCC* t = *a; *a = *b; *b = t;
+      a++, b++;
+      n--;
+   }
+}
+
+static __inline__
+void swap(BBCC** a, BBCC** b)
+{
+   BBCC* t;
+   t = *a; *a = *b; *b = t;
+}
+
+#define min(x, y) ((x)<=(y) ? (x) : (y))
+
+static
+BBCC** med3(BBCC **a, BBCC **b, BBCC **c, int (*cmp)(BBCC**,BBCC**))
+{  return cmp(a, b) < 0 ?
+      (cmp(b, c) < 0 ? b : cmp(a, c) < 0 ? c : a)
+      : (cmp(b, c) > 0 ? b : cmp(a, c) > 0 ? c : a);
+}
+
+static BBCC** qsort_start = 0;
+
+static void qsort(BBCC **a, int n, int (*cmp)(BBCC**,BBCC**))
+{
+   BBCC **pa, **pb, **pc, **pd, **pl, **pm, **pn, **pv;
+   int s, r;
+   BBCC* v;
+
+   CLG_DEBUG(8, " qsort(%d,%d)\n", a-qsort_start, n);
+
+   if (n < 7) { /* Insertion sort on smallest arrays */
+      for (pm = a+1; pm < a+n; pm++)
+         for (pl = pm; pl > a && cmp(pl-1, pl) > 0; pl --)
+            swap(pl, pl-1);
+
+      CLG_DEBUGIF(8) {
+         for (pm = a; pm < a+n; pm++) {
+            VG_(printf)(" %3d BB %p, ", pm - qsort_start,
+                        bb_addr((*pm)->bb));
+            CLG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index);
+         }
+      }
+      return;
+   }
+   pm = a + n/2; /* Small arrays, middle element */
+   if (n > 7) {
+      pl = a;
+      pn = a + (n-1);
+      if (n > 40) { /* Big arrays, pseudomedian of 9 */
+         s = n/8;
+         pl = med3(pl, pl+s, pl+2*s, cmp);
+         pm = med3(pm-s, pm, pm+s, cmp);
+         pn = med3(pn-2*s, pn-s, pn, cmp);
+      }
+      pm = med3(pl, pm, pn, cmp); /* Mid-size, med of 3 */
+   }
+
+
+   v = *pm;
+   pv = &v;
+   pa = pb = a;
+   pc = pd = a + (n-1);
+   for (;;) {
+      while ((pb <= pc) && ((r=cmp(pb, pv)) <= 0)) {
+         if (r==0) {
+            /* same as pivot, to start */
+            swap(pa,pb); pa++;
+         }
+         pb ++;
+      }
+      while ((pb <= pc) && ((r=cmp(pc, pv)) >= 0)) {
+         if (r==0) {
+            /* same as pivot, to end */
+            swap(pc,pd); pd--;
+         }
+         pc --;
+      }
+      if (pb > pc) { break; }
+      swap(pb, pc);
+      pb ++;
+      pc --;
+   }
+   pb--;
+   pc++;
+
+   /* put pivot copies from start into middle */
+   if ((s = pa-a) > 0) { for(r=0; r<s; r++) swap(a+r, pb+1-s+r); }
+   /* put pivot copies from end into middle */
+   if ((s = pd-pc) > 0) { for(r=0; r<s; r++) swap(pc+r, a+n-s+r); }
+
+   CLG_DEBUGIF(8) {
+      VG_(printf)(" PV BB %p, ", bb_addr((*pv)->bb));
+      CLG_(print_cxt)(9, (*pv)->cxt, (*pv)->rec_index);
+
+      s = pb-pa+1;
+      VG_(printf)(" Lower %d - %d:\n", a-qsort_start, a+s-1-qsort_start);
+      for (r=0; r<s; r++) {
+         pm = a+r;
+         VG_(printf)(" %3d BB %p, ", pm-qsort_start, bb_addr((*pm)->bb));
+         CLG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index);
+      }
+
+      s = pd-pc+1;
+      VG_(printf)(" Upper %d - %d:\n",
+                  a+n-s-qsort_start, a+n-1-qsort_start);
+      for (r=0; r<s; r++) {
+         pm = a+n-s+r;
+         VG_(printf)(" %3d BB %p, ", pm-qsort_start, bb_addr((*pm)->bb));
+         CLG_(print_cxt)(9, (*pm)->cxt, (*pm)->rec_index);
+      }
+   }
+
+   if ((s = pb+1-pa) > 1) qsort(a, s, cmp);
+   if ((s = pd+1-pc) > 1) qsort(a+n-s, s, cmp);
+}
+
+
+/* Helpers for prepare_dump */
+
+static Int prepare_count;
+static BBCC** prepare_ptr;
+
+
+static void hash_addCount(BBCC* bbcc)
+{
+   if ((bbcc->ecounter_sum > 0) || (bbcc->ret_counter>0))
+      prepare_count++;
+}
+
+static void hash_addPtr(BBCC* bbcc)
+{
+   if ((bbcc->ecounter_sum == 0) &&
+       (bbcc->ret_counter == 0)) return;
+
+   *prepare_ptr = bbcc;
+   prepare_ptr++;
+}
+
+
+static void cs_addCount(thread_info* ti)
+{
+   Int i;
+   BBCC* bbcc;
+
+   /* add BBCCs with active call in call stack of current thread.
+    * update cost sums for active calls
+    */
+
+   for(i = 0; i < CLG_(current_call_stack).sp; i++) {
+      call_entry* e = &(CLG_(current_call_stack).entry[i]);
+      if (e->jcc == 0) continue;
+
+      CLG_(add_diff_cost_lz)( CLG_(sets).full, &(e->jcc->cost),
+                              e->enter_cost, CLG_(current_state).cost);
+      bbcc = e->jcc->from;
+
+      CLG_DEBUG(1, " [%2d] (tid %d), added active: %s\n",
+                i, CLG_(current_tid), bbcc->cxt->fn[0]->name);
+
+      if (bbcc->ecounter_sum>0 || bbcc->ret_counter>0) {
+         /* already counted */
+         continue;
+      }
+      prepare_count++;
+   }
+}
+
+static void cs_addPtr(thread_info* ti)
+{
+   Int i;
+   BBCC* bbcc;
+
+   /* add BBCCs with active call in call stack of current thread */
+
+   for(i = 0; i < CLG_(current_call_stack).sp; i++) {
+      call_entry* e = &(CLG_(current_call_stack).entry[i]);
+      if (e->jcc == 0) continue;
+
+      bbcc = e->jcc->from;
+
+      if (bbcc->ecounter_sum>0 || bbcc->ret_counter>0) {
+         /* already counted */
+         continue;
+      }
+
+      *prepare_ptr = bbcc;
+      prepare_ptr++;
+   }
+}
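+
+/* Note (illustrative, not part of the tool): dumping uses a classic
+ * two-pass pattern over the BBCC hash and the call stacks:
+ *
+ *   pass 1: hash_addCount/cs_addCount - count the entries to dump
+ *   alloc : array = CLG_MALLOC((count+1) * sizeof(BBCC*))
+ *   pass 2: hash_addPtr/cs_addPtr     - fill the array
+ *
+ * Both passes must apply the same filter (ecounter_sum, ret_counter,
+ * active calls); otherwise the assertion after filling would fire.
+ */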
+
+/**
+ * Put all BBCCs with costs into a sorted array.
+ * The returned array ends with a null pointer.
+ * Must be freed after dumping.
+ */
+static
+BBCC** prepare_dump(void)
+{
+   BBCC **array;
+
+   prepare_count = 0;
+
+   /* if we do not separate among threads, this gives all */
+   /* count number of BBCCs with >0 executions */
+   CLG_(forall_bbccs)(hash_addCount);
+
+   /* even if we do not separate among threads,
+    * call stacks are separated */
+   if (CLG_(clo).separate_threads)
+      cs_addCount(0);
+   else
+      CLG_(forall_threads)(cs_addCount);
+
+   CLG_DEBUG(0, "prepare_dump: %d BBCCs\n", prepare_count);
+
+   /* allocate bbcc array, insert BBCCs and sort */
+   prepare_ptr = array =
+      (BBCC**) CLG_MALLOC((prepare_count+1) * sizeof(BBCC*));
+
+   CLG_(forall_bbccs)(hash_addPtr);
+
+   if (CLG_(clo).separate_threads)
+      cs_addPtr(0);
+   else
+      CLG_(forall_threads)(cs_addPtr);
+
+   CLG_ASSERT(array + prepare_count == prepare_ptr);
+
+   /* end mark */
+   *prepare_ptr = 0;
+
+   CLG_DEBUG(0," BBCCs inserted\n");
+
+   qsort_start = array;
+   qsort(array, prepare_count, my_cmp);
+
+   CLG_DEBUG(0," BBCCs sorted\n");
+
+   return array;
+}
+
+
+
+
+static void fprint_cost_ln(int fd, Char* prefix,
+                           EventMapping* em, ULong* cost)
+{
+   int p;
+
+   p = VG_(sprintf)(outbuf, "%s", prefix);
+   p += CLG_(sprint_mappingcost)(outbuf + p, em, cost);
+   VG_(sprintf)(outbuf + p, "\n");
+   my_fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
+}
+
+static ULong bbs_done = 0;
+static Char* filename = 0;
+
+static
+void file_err(void)
+{
+   VG_(message)(Vg_UserMsg,
+                "Error: cannot open output file `%s'",
+                filename );
+   VG_(exit)(1);
+}
+
+/**
+ * Create a new dump file and write header.
+ *
+ * Naming: <dump_file_base>.<pid>[.<part>][-<tid>]
+ *   <part> is skipped for the final dump (trigger==0)
+ *   <tid> is skipped for thread 1 with CLG_(clo).separate_threads=no
+ *
+ * Returns the file descriptor, and -1 on error (no write permission)
+ */
+static int new_dumpfile(Char buf[BUF_LEN], int tid, Char* trigger)
+{
+   Bool appending = False;
+   int i, fd;
+   FullCost sum = 0;
+   SysRes res;
+
+   CLG_ASSERT(filename != 0);
+
+   if (!CLG_(clo).combine_dumps) {
+      i = VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());
+
+      if (trigger)
+         i += VG_(sprintf)(filename+i, ".%d", out_counter);
+
+      if (CLG_(clo).separate_threads)
+         i += VG_(sprintf)(filename+i, "-%02d", tid);
+
+      res = VG_(open)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0);
+   }
+   else {
+      VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());
+      res = VG_(open)(filename, VKI_O_WRONLY|VKI_O_APPEND, 0);
+      if (!res.isError && out_counter>1)
+         appending = True;
+   }
+
+   if (res.isError) {
+      res = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY,
+                      VKI_S_IRUSR|VKI_S_IWUSR);
+      if (res.isError) {
+         /* If the file cannot be opened for whatever reason (conflict
+          * between multiple supervised processes?), give up now. */
+         file_err();
+      }
+   }
+   fd = (Int) res.val;
+
+   CLG_DEBUG(2, " new_dumpfile '%s'\n", filename);
+
+   if (!appending)
+      reset_dump_array();
+
+
+   if (!appending) {
+      /* version */
+      VG_(sprintf)(buf, "version: 1\n");
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+
+      /* creator */
+      VG_(sprintf)(buf, "creator: callgrind-" VERSION "\n");
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+
+      /* "pid:" line */
+      VG_(sprintf)(buf, "pid: %d\n", VG_(getpid)());
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+
+      /* "cmd:" line */
+      VG_(strcpy)(buf, "cmd: ");
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+      my_fwrite(fd, (void*)cmdbuf, VG_(strlen)(cmdbuf));
+   }
+
+   VG_(sprintf)(buf, "\npart: %d\n", out_counter);
+   my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+   if (CLG_(clo).separate_threads) {
+      VG_(sprintf)(buf, "thread: %d\n", tid);
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+   }
+
+   /* "desc:" lines */
+   if (!appending) {
+      my_fwrite(fd, "\n", 1);
+
+#if 0
+      /* Global options changing the tracing behaviour */
+      VG_(sprintf)(buf, "\ndesc: Option: --skip-plt=%s\n",
+                   CLG_(clo).skip_plt ? "yes" : "no");
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+      VG_(sprintf)(buf, "desc: Option: --collect-jumps=%s\n",
+                   CLG_(clo).collect_jumps ? "yes" : "no");
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+      VG_(sprintf)(buf, "desc: Option: --separate-recs=%d\n",
+                   CLG_(clo).separate_recursions);
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+      VG_(sprintf)(buf, "desc: Option: --separate-callers=%d\n",
+                   CLG_(clo).separate_callers);
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+
+      VG_(sprintf)(buf, "desc: Option: --dump-bbs=%s\n",
+                   CLG_(clo).dump_bbs ? "yes" : "no");
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+      VG_(sprintf)(buf, "desc: Option: --separate-threads=%s\n",
+                   CLG_(clo).separate_threads ? "yes" : "no");
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+#endif
+
+      (*CLG_(cachesim).getdesc)(buf);
+      my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+   }
+
+   VG_(sprintf)(buf, "\ndesc: Timerange: Basic block %llu - %llu\n",
+                bbs_done, CLG_(stat).bb_executions);
+
+   my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+   VG_(sprintf)(buf, "desc: Trigger: %s\n",
+                trigger ? trigger : (Char*)"Program termination");
+   my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+
+#if 0
+   /* Output function specific config
+    * FIXME */
+   for (i = 0; i < N_FNCONFIG_ENTRIES; i++) {
+      fnc = fnc_table[i];
+      while (fnc) {
+         if (fnc->skip) {
+            VG_(sprintf)(buf, "desc: Option: --fn-skip=%s\n", fnc->name);
+            my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+         }
+         if (fnc->dump_at_enter) {
+            VG_(sprintf)(buf, "desc: Option: --fn-dump-at-enter=%s\n",
+                         fnc->name);
+            my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+         }
+         if (fnc->dump_at_leave) {
+            VG_(sprintf)(buf, "desc: Option: --fn-dump-at-leave=%s\n",
+                         fnc->name);
+            my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+         }
+         if (fnc->separate_callers != CLG_(clo).separate_callers) {
+            VG_(sprintf)(buf, "desc: Option: --separate-callers%d=%s\n",
+                         fnc->separate_callers, fnc->name);
+            my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+         }
+         if (fnc->separate_recursions != CLG_(clo).separate_recursions) {
+            VG_(sprintf)(buf, "desc: Option: --separate-recs%d=%s\n",
+                         fnc->separate_recursions, fnc->name);
+            my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+         }
+         fnc = fnc->next;
+      }
+   }
+#endif
+
+   /* "positions:" line */
+   VG_(sprintf)(buf, "\npositions:%s%s%s\n",
+                CLG_(clo).dump_instr ? " instr" : "",
+                CLG_(clo).dump_bb ? " bb" : "",
+                CLG_(clo).dump_line ? " line" : "");
+   my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+
+   /* "events:" line */
+   i = VG_(sprintf)(buf, "events: ");
+   CLG_(sprint_eventmapping)(buf+i, CLG_(dumpmap));
+   my_fwrite(fd, (void*)buf, VG_(strlen)(buf));
+   my_fwrite(fd, "\n", 1);
+
+   /* summary lines */
+   sum = CLG_(get_eventset_cost)( CLG_(sets).full );
+   CLG_(zero_cost)(CLG_(sets).full, sum);
+   if (CLG_(clo).separate_threads) {
+      thread_info* ti = CLG_(get_current_thread)();
+      CLG_(add_diff_cost)(CLG_(sets).full, sum, ti->lastdump_cost,
+                          ti->states.entry[0]->cost);
+   }
+   else {
+      /* This function is called once for thread 1, where
+       * all costs are summed up when not dumping separate per thread.
+       * But this is not true for summary: we need to add all threads.
+       */
+      int t;
+      thread_info** thr = CLG_(get_threads)();
+      for(t=1; t<VG_N_THREADS; t++) {
+         if (!thr[t]) continue;
+         CLG_(add_diff_cost)(CLG_(sets).full, sum,
+                             thr[t]->lastdump_cost,
+                             thr[t]->states.entry[0]->cost);
+      }
+   }
+   fprint_cost_ln(fd, "summary: ", CLG_(dumpmap), sum);
+
+   /* all dumped costs will be accumulated in dump_total_cost */
+   CLG_(init_cost_lz)( CLG_(sets).full, &dump_total_cost );
+
+   my_fwrite(fd, "\n\n",2);
+
+   if (VG_(clo_verbosity) > 1)
+      VG_(message)(Vg_DebugMsg, "Dump to %s", filename);
+
+   return fd;
+}
+
+
+static void close_dumpfile(Char buf[BUF_LEN], int fd, int tid)
+{
+   if (fd <0) return;
+
+   fprint_cost_ln(fd, "totals: ", CLG_(dumpmap),
+                  dump_total_cost);
+   //fprint_fcc_ln(fd, "summary: ", &dump_total_fcc);
+   CLG_(add_cost_lz)(CLG_(sets).full,
+                     &CLG_(total_cost), dump_total_cost);
+
+   fwrite_flush();
+   VG_(close)(fd);
+
+   if (filename[0] == '.') {
+      if (-1 == VG_(rename) (filename, filename+1)) {
+         /* Cannot rename to the correct file name: give out a warning */
+         VG_(message)(Vg_DebugMsg, "Warning: Cannot rename %s to %s",
+                      filename, filename+1);
+      }
+   }
+}
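+
+/* For orientation (illustrative; values invented): the header written by
+ * new_dumpfile() plus the trailer from close_dumpfile() frame each dump:
+ *
+ *   version: 1
+ *   creator: callgrind-<VERSION>
+ *   pid: 1234
+ *   cmd: ./myprog arg
+ *   part: 1
+ *   desc: Timerange: Basic block 0 - 99999
+ *   desc: Trigger: Program termination
+ *   positions: line
+ *   events: Ir Dr Dw
+ *   summary: ...
+ *   <cost lines emitted by print_bbccs_of_thread below>
+ *   totals: ...
+ */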
+/* Helpers for print_bbccs */
+
+static Int print_fd;
+static Char* print_trigger;
+static Char print_buf[BUF_LEN];
+
+static void print_bbccs_of_thread(thread_info* ti)
+{
+   BBCC **p, **array;
+   FnPos lastFnPos;
+   AddrPos lastAPos;
+
+   CLG_DEBUG(1, "+ print_bbccs(tid %d)\n", CLG_(current_tid));
+
+   print_fd = new_dumpfile(print_buf, CLG_(current_tid), print_trigger);
+   if (print_fd <0) {
+      CLG_DEBUG(1, "- print_bbccs(tid %d): No output...\n", CLG_(current_tid));
+      return;
+   }
+
+   p = array = prepare_dump();
+   init_fpos(&lastFnPos);
+   init_apos(&lastAPos, 0, 0, 0);
+
+   if (p) while(1) {
+
+      /* on context/function change, print old cost buffer before */
+      if (lastFnPos.cxt && ((*p==0) ||
+                            (lastFnPos.cxt != (*p)->cxt) ||
+                            (lastFnPos.rec_index != (*p)->rec_index))) {
+         if (!CLG_(is_zero_cost)( CLG_(sets).full, ccSum[currSum].cost )) {
+            /* no need to switch buffers, as position is the same */
+            fprint_apos(print_fd, &(ccSum[currSum].p), &lastAPos,
+                        lastFnPos.cxt->fn[0]->file);
+            fprint_fcost(print_fd, &ccSum[currSum], &lastAPos);
+         }
+
+         if (ccSum[currSum].p.file != lastFnPos.cxt->fn[0]->file) {
+            /* switch back to file of function */
+            VG_(sprintf)(print_buf, "fe=");
+            print_file(print_buf+3, lastFnPos.cxt->fn[0]->file);
+            my_fwrite(print_fd, (void*)print_buf, VG_(strlen)(print_buf));
+         }
+         my_fwrite(print_fd, "\n", 1);
+      }
+
+      if (*p == 0) break;
+
+      if (print_fn_pos(print_fd, &lastFnPos, *p)) {
+
+         /* new function */
+         init_apos(&lastAPos, 0, 0, (*p)->cxt->fn[0]->file);
+         init_fcost(&ccSum[0], 0, 0, 0);
+         init_fcost(&ccSum[1], 0, 0, 0);
+         currSum = 0;
+      }
+
+      if (CLG_(clo).dump_bbs) {
+         /* FIXME: Specify object of BB if different to object of fn */
+         int i, pos = 0;
+         ULong ecounter = (*p)->ecounter_sum;
+         pos = VG_(sprintf)(print_buf, "bb=%p ", (*p)->bb->offset);
+         for(i = 0; i<(*p)->bb->cjmp_count; i++) {
+            pos += VG_(sprintf)(print_buf+pos, "%d %llu ",
+                                (*p)->bb->jmp[i].instr,
+                                ecounter);
+            ecounter -= (*p)->jmp[i].ecounter;
+         }
+         VG_(sprintf)(print_buf+pos, "%d %llu\n",
+                      (*p)->bb->instr_count,
+                      ecounter);
+         my_fwrite(print_fd, (void*)print_buf, VG_(strlen)(print_buf));
+      }
+
+      fprint_bbcc(print_fd, *p, &lastAPos);
+
+      p++;
+   }
+
+   close_dumpfile(print_buf, print_fd, CLG_(current_tid));
+   if (array) VG_(free)(array);
+
+   /* set counters of last dump */
+   CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost,
+                    CLG_(current_state).cost );
+
+   CLG_DEBUG(1, "- print_bbccs(tid %d)\n", CLG_(current_tid));
+}
+
+
+static void print_bbccs(Char* trigger, Bool only_current_thread)
+{
+   init_dump_array();
+   init_debug_cache();
+
+   print_fd = -1;
+   print_trigger = trigger;
+
+   if (!CLG_(clo).separate_threads) {
+      /* All BBCC/JCC costs are stored with thread 1 */
+      Int orig_tid = CLG_(current_tid);
+
+      CLG_(switch_thread)(1);
+      print_bbccs_of_thread( CLG_(get_current_thread)() );
+      CLG_(switch_thread)(orig_tid);
+   }
+   else if (only_current_thread)
+      print_bbccs_of_thread( CLG_(get_current_thread)() );
+   else
+      CLG_(forall_threads)(print_bbccs_of_thread);
+
+   free_dump_array();
+}
+
+
+void CLG_(dump_profile)(Char* trigger, Bool only_current_thread)
+{
+   CLG_DEBUG(2, "+ dump_profile(Trigger '%s')\n",
+             trigger ? trigger : (Char*)"Prg.Term.");
+
+   if (VG_(clo_verbosity) > 1)
+      VG_(message)(Vg_DebugMsg, "Start dumping at BB %llu (%s)...",
+                   CLG_(stat).bb_executions,
+                   trigger ? trigger : (Char*)"Prg.Term.");
+
+   out_counter++;
+
+   print_bbccs(trigger, only_current_thread);
+
+
+   bbs_done = CLG_(stat).bb_executions++;
+
+   if (VG_(clo_verbosity) > 1)
+      VG_(message)(Vg_DebugMsg, "Dumping done.");
+}
+
+/* copy the command line to the cmd buffer (it could change at runtime) */
+static
+void init_cmdbuf(void)
+{
+   Int i,j,size = 0;
+   HChar* argv;
+
+#if VG_CORE_INTERFACE_VERSION > 8
+   if (VG_(args_the_exename))
+      size = VG_(sprintf)(cmdbuf, " %s", VG_(args_the_exename));
+
+   for(i = 0; i < VG_(args_for_client).used; i++) {
+      argv = VG_(args_for_client).strs[i];
+      if (!argv) continue;
+      if ((size>0) && (size < BUF_LEN)) cmdbuf[size++] = ' ';
+      for(j=0; argv[j]!=0; j++)
+         if (size < BUF_LEN) cmdbuf[size++] = argv[j];
+   }
+#else
+   for(i = 0; i < VG_(client_argc); i++) {
+      argv = VG_(client_argv[i]);
+      if (!argv) continue;
+      if ((size>0) && (size < BUF_LEN)) cmdbuf[size++] = ' ';
+      for(j=0; argv[j]!=0; j++)
+         if (size < BUF_LEN) cmdbuf[size++] = argv[j];
+   }
+#endif
+
+   if (size == BUF_LEN) size--;
+   cmdbuf[size] = 0;
+}
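+
+/* Example (illustrative; pid and part numbers invented): with a base name
+ * of e.g. "callgrind.out", the files produced are
+ *
+ *   callgrind.out.1234        final dump, --separate-threads=no
+ *   callgrind.out.1234.2      dump part 2, triggered at runtime
+ *   callgrind.out.1234.2-03   dump part 2 for thread 3,
+ *                             --separate-threads=yes
+ *
+ * following the <base>.<pid>[.<part>][-<tid>] scheme of new_dumpfile().
+ */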
+void CLG_(init_files)(Char** dir, Char** file)
+{
+   Int size;
+   SysRes res;
+
+   if (!CLG_(clo).filename_base)
+      CLG_(clo).filename_base = DEFAULT_DUMPNAME;
+
+   /* get base directory for dump/command/result files */
+   if (CLG_(clo).filename_base[0] == '/') {
+      int lastSlash = 0, i =1;
+      while(CLG_(clo).filename_base[i]) {
+         for(; CLG_(clo).filename_base[i] &&
+               CLG_(clo).filename_base[i] != '/'; i++);
+         if (CLG_(clo).filename_base[i] != '/') break;
+         lastSlash = i;
+         i++;
+      }
+      /* the directory part is everything up to the last slash */
+      base_directory = (Char*) CLG_MALLOC(lastSlash+1);
+      VG_(strncpy)(base_directory, CLG_(clo).filename_base, lastSlash);
+      base_directory[lastSlash] = 0;
+
+      dump_file_base = CLG_(clo).filename_base;
+   }
+   else {
+      size = 100;
+      base_directory = 0;
+
+      /* getcwd() fails if the buffer isn't big enough -- keep doubling size
+         until it succeeds. */
+      while (NULL == base_directory) {
+         base_directory = CLG_MALLOC(size);
+         if (!VG_(getcwd)(base_directory, size)) {
+            VG_(free)(base_directory);
+            base_directory = 0;
+            size *= 2;
+         }
+      }
+
+      size = VG_(strlen)(base_directory) + VG_(strlen)(CLG_(clo).filename_base) +2;
+      dump_file_base = (Char*) CLG_MALLOC(size);
+      CLG_ASSERT(dump_file_base != 0);
+      VG_(sprintf)(dump_file_base, "%s/%s",
+                   base_directory, CLG_(clo).filename_base);
+   }
+
+   /* allocate space big enough for final filenames */
+   filename = (Char*) CLG_MALLOC(VG_(strlen)(dump_file_base)+32);
+   CLG_ASSERT(filename != 0);
+
+   /* Make sure the output base file can be written.
+    * This is used for the dump at program termination.
+    * We stop with an error here if we cannot create the file:
+    * this is probably due to missing permissions, and the
+    * trace parts could not be written later, either.
+    */
+   VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());
+   res = VG_(open)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0);
+   if (res.isError) {
+      res = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY,
+                      VKI_S_IRUSR|VKI_S_IWUSR);
+      if (res.isError) {
+         file_err();
+      }
+   }
+   if (!res.isError) VG_(close)( (Int)res.val );
+
+   *dir = base_directory;
+   *file = filename;
+
+   init_cmdbuf();
+}
diff --git a/callgrind/events.c b/callgrind/events.c
new file mode 100644
index 0000000000..6ef8d8523e
--- /dev/null
+++ b/callgrind/events.c
@@ -0,0 +1,575 @@
+/*--------------------------------------------------------------------*/
+/*--- Callgrind                                                    ---*/
+/*---                                                     events.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Callgrind, a Valgrind tool for call tracing.
+
+   Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "global.h"
+
+#define MAX_EVENTTYPE 20
+
+static EventType eventtype[MAX_EVENTTYPE];
+static Int eventtype_count = 0;
+
+EventType* CLG_(register_eventtype)(Char* name)
+{
+   EventType* et;
+
+   if (eventtype_count == MAX_EVENTTYPE) {
+      VG_(printf)("\nMore than %d event types used!\n"
+                  "Increase MAX_EVENTTYPE in events.c and recompile this tool!\n",
+                  MAX_EVENTTYPE);
+      VG_(tool_panic)("Too many event types requested.");
+   }
+
+   et = &(eventtype[eventtype_count]);
+   et->id = eventtype_count;
+   et->name = (UChar*) VG_(strdup)(name);
+   et->description = 0;
+
+   eventtype_count++;
+
+   return et;
+}
+
+
+EventType* CLG_(get_eventtype)(Char* name)
+{
+   Int i;
+
+   for(i=0; i<eventtype_count; i++)
+      if (VG_(strcmp)(eventtype[i].name, name) == 0)
+         return eventtype+i;
+   return 0;
+}
+
+EventType* CLG_(get_eventtype_byindex)(Int id)
+{
+   if ((id >= 0) && (id < eventtype_count))
+      return eventtype+id;
+   return 0;
+}
+
+/* Allocate space for an event set */
+EventSet* CLG_(get_eventset)(Char* n, Int capacity)
+{
+   EventSet* es;
+
+   es = (EventSet*) CLG_MALLOC(sizeof(EventSet) +
+                               capacity * sizeof(EventSetEntry));
+   es->capacity = capacity;
+   es->size = 0;
+   es->name = n;
+
+   return es;
+}
+
+/* Incorporate an event type into a set, get start offset */
+Int CLG_(add_eventtype)(EventSet* es, EventType* t)
+{
+   Int offset = es->size;
+   if (es->capacity - offset < 1) return -1;
+
+   es->size++;
+   es->e[offset].type = t;
+   es->e[offset].nextTop = es->size;
+
+   return offset;
+}
+
+/* Incorporate one event set into another, get start offset */
+Int CLG_(add_eventset)(EventSet* dst, EventSet* src)
+{
+   Int offset = dst->size, i;
+   if (!src || (src->size == 0)) return offset;
+
+   if (dst->capacity - offset < src->size) return -1;
+
+   for(i=0; i<src->size; i++) {
+      dst->e[offset+i].type = src->e[i].type;
+      dst->e[offset+i].nextTop = src->e[i].nextTop + offset;
+   }
+   dst->size += src->size;
+
+   return offset;
+}
+
+/* Incorporate two event types into a set, with second < first */
+Int CLG_(add_dep_event2)(EventSet* es, EventType* e1, EventType* e2)
+{
+   Int offset = es->size;
+
+   if (es->capacity - offset < 2) return -1;
+
+   es->size += 2;
+   es->e[offset].type = e1;
+   es->e[offset].nextTop = es->size;
+   es->e[offset+1].type = e2;
+   es->e[offset+1].nextTop = es->size;
+
+   return offset;
+}
+
+/* Incorporate 3 event types into a set, with third < second < first */
+Int CLG_(add_dep_event3)(EventSet* es,
+                         EventType* e1, EventType* e2, EventType* e3)
+{
+   Int offset = es->size;
+
+   if (es->capacity - offset < 3) return -1;
+
+   es->size += 3;
+   es->e[offset].type = e1;
+   es->e[offset].nextTop = es->size;
+   es->e[offset+1].type = e2;
+   es->e[offset+1].nextTop = es->size;
+   es->e[offset+2].type = e3;
+   es->e[offset+2].nextTop = es->size;
+
+   return offset;
+}
+
+Int CLG_(add_dep_event4)(EventSet* es,
+                         EventType* e1, EventType* e2,
+                         EventType* e3, EventType* e4)
+{
+   Int offset = es->size;
+
+   if (es->capacity - offset < 4) return -1;
+
+   es->size += 4;
+   es->e[offset].type = e1;
+   es->e[offset].nextTop = es->size;
+   es->e[offset+1].type = e2;
+   es->e[offset+1].nextTop = es->size;
+   es->e[offset+2].type = e3;
+   es->e[offset+2].nextTop = es->size;
+   es->e[offset+3].type = e4;
+   es->e[offset+3].nextTop = es->size;
+
+   return offset;
+}
+
+/* Returns number of characters written */
+Int CLG_(sprint_eventset)(Char* buf, EventSet* es)
+{
+   Int i, pos = 0;
+
+   for(i=0; i< es->size; i++) {
+      if (pos>0) buf[pos++] = ' ';
+      pos += VG_(sprintf)(buf + pos, es->e[i].type->name);
+   }
+   buf[pos] = 0;
+
+   return pos;
+}
+
+/* Get cost array for an event set */
+ULong* CLG_(get_eventset_cost)(EventSet* es)
+{
+   return CLG_(get_costarray)(es->capacity);
+}
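+
+/* Worked example (illustrative): when a dependent pair is added via
+ * CLG_(add_dep_event2)(es, e1, e2), the caller guarantees that
+ * cost[off+1] <= cost[off] at all times. Both entries get nextTop == 2
+ * (the set size after insertion), so any scan that finds cost[off] == 0
+ * may jump straight to index 2: cost[off+1] must be 0 as well. This is
+ * the trick behind the while(i < es->size) { ... i = es->e[i].nextTop; }
+ * loops in is_zero_cost(), add_cost() and friends below.
+ */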
+/* Set all costs of an event set to zero */
+void CLG_(init_cost)(EventSet* es, ULong* cost)
+{
+   Int i;
+
+   if (!cost) return;
+
+   for(i=0; i<es->capacity; i++)
+      cost[i] = 0;
+}
+
+/* Set all costs of an event set to zero,
+ * allocating the cost array if needed */
+void CLG_(init_cost_lz)(EventSet* es, ULong** cost)
+{
+   Int i;
+
+   CLG_ASSERT(cost != 0);
+   if (!(*cost))
+      *cost = CLG_(get_eventset_cost)(es);
+
+   for(i=0; i<es->capacity; i++)
+      (*cost)[i] = 0;
+}
+
+void CLG_(zero_cost)(EventSet* es, ULong* cost)
+{
+   Int i;
+
+   if (!cost) return;
+
+   for(i=0; i<es->size; i++)
+      cost[i] = 0;
+}
+
+Bool CLG_(is_zero_cost)(EventSet* es, ULong* cost)
+{
+   Int i = 0;
+
+   if (!cost) return True;
+
+   while(i<es->size) {
+      if (cost[i] != 0) return False;
+      i = es->e[i].nextTop;
+   }
+   return True;
+}
+
+Bool CLG_(is_equal_cost)(EventSet* es, ULong* c1, ULong* c2)
+{
+   Int i = 0;
+
+   if (!c1) return CLG_(is_zero_cost)(es,c2);
+   if (!c2) return CLG_(is_zero_cost)(es,c1);
+
+   while(i<es->size) {
+      if (c1[i] != c2[i]) return False;
+      if (c1[i] == 0)
+         i = es->e[i].nextTop;
+      else
+         i++;
+   }
+   return True;
+}
+
+void CLG_(copy_cost)(EventSet* es, ULong* dst, ULong* src)
+{
+   Int i;
+
+   if (!src) {
+      CLG_(zero_cost)(es, dst);
+      return;
+   }
+   CLG_ASSERT(dst != 0);
+
+   for(i=0; i<es->size; i++)
+      dst[i] = src[i];
+}
+
+void CLG_(copy_cost_lz)(EventSet* es, ULong** pdst, ULong* src)
+{
+   Int i;
+   ULong* dst;
+
+   CLG_ASSERT(pdst != 0);
+
+   if (!src) {
+      CLG_(zero_cost)(es, *pdst);
+      return;
+   }
+   dst = *pdst;
+   if (!dst)
+      dst = *pdst = CLG_(get_eventset_cost)(es);
+
+   for(i=0; i<es->size; i++)
+      dst[i] = src[i];
+}
+
+void CLG_(add_cost)(EventSet* es, ULong* dst, ULong* src)
+{
+   Int i = 0;
+
+   if (!src) return;
+   CLG_ASSERT(dst != 0);
+
+   while(i<es->size) {
+      if (src[i] == 0)
+         i = es->e[i].nextTop;
+      else {
+         dst[i] += src[i];
+         i++;
+      }
+   }
+}
+
+void CLG_(add_cost_lz)(EventSet* es, ULong** pdst, ULong* src)
+{
+   Int i;
+   ULong* dst;
+
+   if (!src) return;
+   CLG_ASSERT(pdst != 0);
+
+   dst = *pdst;
+   if (!dst) {
+      dst = *pdst = CLG_(get_eventset_cost)(es);
+      CLG_(copy_cost)(es,dst,src);
+      return;
+   }
+
+   i = 0;
+   while(i<es->size) {
+      if (src[i] == 0)
+         i = es->e[i].nextTop;
+      else {
+         dst[i] += src[i];
+         i++;
+      }
+   }
+}
+
+/* Adds src to dst and zeros src. Returns false if nothing changed */
+Bool CLG_(add_and_zero_cost)(EventSet* es, ULong* dst, ULong* src)
+{
+   Int i = 0, j = 0;
+
+   CLG_DEBUGIF(6) {
+      CLG_DEBUG(6, " add_and_zero_cost(%s, dst %p, src %p)\n", es->name, dst, src);
+      CLG_(print_cost)(-5, es, src);
+   }
+
+   if (!es || !src) return False;
+
+   while(i<es->size) {
+      if (src[i] == 0)
+         i = es->e[i].nextTop;
+      else {
+         dst[i] += src[i];
+         src[i] = 0;
+         i++;
+         j++;
+      }
+   }
+
+   return (j>0);
+}
+
+/* Adds src to dst and zeros src. Returns false if nothing changed */
+Bool CLG_(add_and_zero_cost_lz)(EventSet* es, ULong** pdst, ULong* src)
+{
+   Int i;
+   ULong* dst;
+
+   if (!src) return False;
+
+   i = 0;
+   while(1) {
+      if (i >= es->size) return False;
+      if (src[i] != 0) break;
+      i = es->e[i].nextTop;
+   }
+
+   CLG_ASSERT(pdst != 0);
+   dst = *pdst;
+   if (!dst) {
+      dst = *pdst = CLG_(get_eventset_cost)(es);
+      CLG_(copy_cost)(es,dst,src);
+      CLG_(zero_cost)(es,src);
+      return True;
+   }
+
+   dst[i] += src[i];
+   src[i] = 0;
+   i++;
+
+   while(i<es->size) {
+      if (src[i] == 0)
+         i = es->e[i].nextTop;
+      else {
+         dst[i] += src[i];
+         src[i] = 0;
+         i++;
+      }
+   }
+
+   return True;
+}
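+
+/* Note (illustrative): the _lz variants take ULong** because cost arrays
+ * are allocated lazily. A jCC, for example, starts with cost == 0 (meaning
+ * "all zero"); only when a nonzero amount is actually added does the
+ * helper allocate a real array:
+ *
+ *   FullCost c = 0;                        // "zero cost", no memory yet
+ *   CLG_(add_cost_lz)(es, &c, some_cost);  // allocates + copies on demand
+ *
+ * This keeps the many never-executed cost centers cheap.
+ */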
+/* Adds difference of new and old to dst, and sets old to new.
+ * Returns false if nothing changed */
+Bool CLG_(add_diff_cost)(EventSet* es, ULong* dst, ULong* old, ULong* new)
+{
+   Int i = 0, j = 0;
+
+   while(i<es->size) {
+      if (new[i] == old[i])
+         i = es->e[i].nextTop;
+      else {
+         dst[i] += new[i] - old[i];
+         old[i] = new[i];
+         i++;
+         j++;
+      }
+   }
+
+   return (j>0);
+}
+
+/* Adds difference of new and old to dst, and sets old to new.
+ * Returns false if nothing changed */
+Bool CLG_(add_diff_cost_lz)(EventSet* es, ULong** pdst,
+                            ULong* old, ULong* new)
+{
+   Int i;
+   ULong* dst;
+
+   if (!old && !new) return False;
+   CLG_ASSERT(old && new);
+
+   i = 0;
+   while(1) {
+      if (i >= es->size) return False;
+      if (old[i] != new[i]) break;
+      i = es->e[i].nextTop;
+   }
+
+   CLG_ASSERT(pdst != 0);
+   dst = *pdst;
+   if (!dst) {
+      dst = *pdst = CLG_(get_eventset_cost)(es);
+      CLG_(zero_cost)(es,dst);
+   }
+
+   dst[i] += new[i] - old[i];
+   old[i] = new[i];
+   i++;
+
+   while(i<es->size) {
+      if (new[i] == old[i])
+         i = es->e[i].nextTop;
+      else {
+         dst[i] += new[i] - old[i];
+         old[i] = new[i];
+         i++;
+      }
+   }
+
+   return True;
+}
+
+/* Returns number of characters written */
+Int CLG_(sprint_cost)(Char* buf, EventSet* es, ULong* c)
+{
+   Int i, pos, skipped = 0;
+
+   if (!c || es->size==0) return 0;
+
+   /* At least one entry */
+   pos = VG_(sprintf)(buf, "%llu", c[0]);
+   i = 1;
+
+   while(i<es->size) {
+      if (c[i] == 0) {
+         skipped += es->e[i].nextTop - i;
+         i = es->e[i].nextTop;
+      }
+      else {
+         while(skipped>0) {
+            buf[pos++] = ' ';
+            buf[pos++] = '0';
+            skipped--;
+         }
+         buf[pos++] = ' ';
+         pos += VG_(sprintf)(buf+pos, "%llu", c[i]);
+         i++;
+      }
+   }
+
+   return pos;
+}
+
+
+/* Allocate space for an event mapping */
+EventMapping* CLG_(get_eventmapping)(EventSet* es)
+{
+   EventMapping* em;
+
+   CLG_ASSERT(es != 0);
+
+   em = (EventMapping*) CLG_MALLOC(sizeof(EventMapping) +
+                                   es->capacity * sizeof(Int));
+   em->capacity = es->capacity;
+   em->size = 0;
+   em->set = es;
+
+   return em;
+}
+
+void CLG_(append_event)(EventMapping* em, Char* n)
+{
+   Int i;
+
+   CLG_ASSERT(em != 0);
+
+   for(i=0; i<em->set->size; i++)
+      if (VG_(strcmp)(n, em->set->e[i].type->name)==0)
+         break;
+
+   if (i == em->set->size) return;
+
+   CLG_ASSERT(em->capacity > em->size);
+
+   em->index[em->size] = i;
+   em->size++;
+}
+
+
+/* Returns number of characters written */
+Int CLG_(sprint_eventmapping)(Char* buf, EventMapping* em)
+{
+   Int i, pos = 0;
+
+   CLG_ASSERT(em != 0);
+
+   for(i=0; i< em->size; i++) {
+      if (pos>0) buf[pos++] = ' ';
+      pos += VG_(sprintf)(buf + pos, em->set->e[em->index[i]].type->name);
+   }
+   buf[pos] = 0;
+
+   return pos;
+}
+
+/* Returns number of characters written */
+Int CLG_(sprint_mappingcost)(Char* buf, EventMapping* em, ULong* c)
+{
+   Int i, pos, skipped = 0;
+
+   if (!c || em->size==0) return 0;
+
+   /* At least one entry */
+   pos = VG_(sprintf)(buf, "%llu", c[em->index[0]]);
+   i = 1;
+
+   while(i<em->size) {
+      if (c[em->index[i]] == 0) {
+         skipped++;
+         i++;
+      }
+      else {
+         while(skipped>0) {
+            buf[pos++] = ' ';
+            buf[pos++] = '0';
+            skipped--;
+         }
+         buf[pos++] = ' ';
+         pos += VG_(sprintf)(buf+pos, "%llu", c[em->index[i]]);
+         i++;
+      }
+   }
+
+   return pos;
+}
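+
+/* Note (illustrative): an EventMapping selects and orders a subset of an
+ * EventSet for output. CLG_(dumpmap) is built this way: append_event()
+ * looks up each requested name in the full set and records its index, so
+ * sprint_mappingcost() can print exactly the columns announced by the
+ * "events:" header line, with runs of zeros written out lazily.
+ */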
diff --git a/callgrind/events.h b/callgrind/events.h
new file mode 100644
index 0000000000..d2cad1e2a9
--- /dev/null
+++ b/callgrind/events.h
@@ -0,0 +1,113 @@
+/*--------------------------------------------------------------------*/
+/*--- Callgrind                                                    ---*/
+/*---                                                     events.h ---*/
+/*--- (C) 2004-2005, Josef Weidendorfer                            ---*/
+/*--------------------------------------------------------------------*/
+
+
+/* Abstractions for 64-bit cost lists (events.h) */
+
+#ifndef CG_EVENTS
+#define CG_EVENTS
+
+#include "pub_tool_basics.h"
+
+#define CLG_(str) VGAPPEND(vgCallgrind_,str)
+
+/* An event type */
+typedef struct _EventType EventType;
+struct _EventType {
+   Char* name;
+   Char* description;
+   Int id;
+};
+
+EventType* CLG_(register_eventtype)(Char*);
+EventType* CLG_(get_eventtype)(Char*);
+EventType* CLG_(get_eventtype_byindex)(Int id);
+
+/* An event set is an ordered list of event types, which comes down
+ * to a description for ordered lists of costs.
+ * Often, the costs of two event types are related, e.g. one is always
+ * smaller than the other. This is useful to speed up arithmetic on cost
+ * lists: each event type in the set has a <nextTop> index. All indexes
+ * before <nextTop> are promised to hold smaller values than the current.
+ */
+typedef struct _EventSetEntry EventSetEntry;
+struct _EventSetEntry {
+   EventType* type;
+   Int nextTop;
+};
+typedef struct _EventSet EventSet;
+struct _EventSet {
+   Char* name;
+   Int size;
+   Int capacity;
+   EventSetEntry e[0];
+};
+
+
+/* Some events out of an event set.
+ * Used to print out part of an EventSet, or in another order.
+ */
+typedef struct _EventMapping EventMapping;
+struct _EventMapping {
+   EventSet* set;
+   Int size;
+   Int capacity;
+   Int index[0];
+};
+
+
+/* Allocate space for an event set */
+EventSet* CLG_(get_eventset)(Char* n, Int capacity);
+/* Incorporate an event type into a set, get start offset */
+Int CLG_(add_eventtype)(EventSet* dst, EventType*);
+/* Incorporate event types into a set, with ... < second < first */
+Int CLG_(add_dep_event2)(EventSet* dst, EventType* e1, EventType* e2);
+Int CLG_(add_dep_event3)(EventSet* dst,
+                         EventType* e1, EventType* e2, EventType* e3);
+Int CLG_(add_dep_event4)(EventSet* dst,
+                         EventType* e1, EventType* e2, EventType* e3,
+                         EventType* e4);
+/* Incorporate one event set into another, get start offset */
+Int CLG_(add_eventset)(EventSet* dst, EventSet* src);
+/* Returns number of characters written */
+Int CLG_(sprint_eventset)(Char* buf, EventSet*);
+/* Allocate cost array for an event set */
+ULong* CLG_(get_eventset_cost)(EventSet*);
+
+/* Operations on costs. A cost pointer of 0 means zero cost.
+ * Functions ending in _lz allocate cost arrays lazily if needed.
+ */
+/* Set costs for the full capacity of an event set to 0 */
+void CLG_(init_cost)(EventSet*,ULong*);
+/* Like init_cost, but allocates the cost array if needed */
+void CLG_(init_cost_lz)(EventSet*,ULong**);
+/* Set costs of an event set to zero */
+void CLG_(zero_cost)(EventSet*,ULong*);
+Bool CLG_(is_zero_cost)(EventSet*,ULong*);
+Bool CLG_(is_equal_cost)(EventSet*,ULong*,ULong*);
+void CLG_(copy_cost)(EventSet*,ULong* dst, ULong* src);
+void CLG_(copy_cost_lz)(EventSet*,ULong** pdst, ULong* src);
+void CLG_(add_cost)(EventSet*,ULong* dst, ULong* src);
+void CLG_(add_cost_lz)(EventSet*,ULong** pdst, ULong* src);
+/* Adds src to dst and zeros src. Returns false if nothing changed */
+Bool CLG_(add_and_zero_cost)(EventSet*,ULong* dst, ULong* src);
+Bool CLG_(add_and_zero_cost_lz)(EventSet*,ULong** pdst, ULong* src);
+/* Adds difference of new and old to dst, and sets old to new.
+ * Returns false if nothing changed */
+Bool CLG_(add_diff_cost)(EventSet*,ULong* dst, ULong* old, ULong* new);
+Bool CLG_(add_diff_cost_lz)(EventSet*,ULong** pdst, ULong* old, ULong* new);
+/* Returns number of characters written */
+Int CLG_(sprint_cost)(Char* buf, EventSet*, ULong*);
+
+/* Allocate space for an event mapping */
+EventMapping* CLG_(get_eventmapping)(EventSet*);
+void CLG_(append_event)(EventMapping*, Char*);
+/* Returns number of characters written */
+Int CLG_(sprint_eventmapping)(Char* buf, EventMapping*);
+/* Returns number of characters written */
+Int CLG_(sprint_mappingcost)(Char* buf, EventMapping*, ULong*);
+
+#endif /* CG_EVENTS */
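+
+/* Usage sketch (illustrative only; the real wiring lives in sim.c and
+ * main.c of this patch):
+ *
+ *   EventType* Ir = CLG_(register_eventtype)("Ir");
+ *   EventType* Dr = CLG_(register_eventtype)("Dr");
+ *   EventSet*  es = CLG_(get_eventset)("sim", 2);
+ *   Int ir_off = CLG_(add_eventtype)(es, Ir);
+ *   Int dr_off = CLG_(add_eventtype)(es, Dr);
+ *   ULong* cost = CLG_(get_eventset_cost)(es);
+ *   CLG_(init_cost)(es, cost);
+ *   cost[ir_off]++;                   // count one instruction fetch
+ *   CLG_(add_cost)(es, total, cost);  // accumulate into another array
+ */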
diff --git a/callgrind/fn.c b/callgrind/fn.c
new file mode 100644
index 0000000000..a786c5097a
--- /dev/null
+++ b/callgrind/fn.c
@@ -0,0 +1,616 @@
+/*--------------------------------------------------------------------*/
+/*--- Callgrind                                                    ---*/
+/*---                                                         fn.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Callgrind, a Valgrind tool for call tracing.
+
+   Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "global.h"
+
+#define N_INITIAL_FN_ARRAY_SIZE 10071
+
+static fn_array current_fn_active;
+
+static Addr runtime_resolve_addr = 0;
+static int runtime_resolve_length = 0;
+
+/* _ld_runtime_resolve, located in ld.so, needs special handling:
+ * The jump at its end into the resolved function should not be
+ * represented as a call (as usually done in callgrind with jumps),
+ * but as a return + call. Otherwise, the repeated existence of
+ * _ld_runtime_resolve in call chains will lead to huge cycles,
+ * making the profile almost worthless.
+ *
+ * If ld.so is stripped, the symbol will not appear. But as this
+ * function is handcrafted assembler, we search for it...
+ *
+ * Sets runtime_resolve_addr/length if the code is found.
+ */
+static void search_runtime_resolve(obj_node* obj)
+{
+   /* We do not check the target bytes of the embedded call, therefore
+    * we have >1 ranges to compare.
+    * We use a tuple sequence (offset,length) into the code array for this.
+    */
+
+#if defined(VGA_x86)
+   /* Check ranges [0-11], [16-23] */
+   static int code_offsets[] = { 0, 12, 16, 8, 24, 0 };
+   static unsigned char code[] = {
+      /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b,
+      /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00,
+      /*16*/ 0x5a, 0x59, 0x87, 0x04, 0x24, 0xc2, 0x08, 0x00 };
+#else
+#if defined(VGA_ppc32)
+   static int code_offsets[] = { 0, 65, 68, 64, 132, 0 };
+   static unsigned char code[] = {
+      /* 0*/ 0x94, 0x21, 0xff, 0xc0, 0x90, 0x01, 0x00, 0x0c,
+      /* 8*/ 0x90, 0x61, 0x00, 0x10, 0x90, 0x81, 0x00, 0x14,
+      /*16*/ 0x7d, 0x83, 0x63, 0x78, 0x90, 0xa1, 0x00, 0x18,
+      /*24*/ 0x7d, 0x64, 0x5b, 0x78, 0x90, 0xc1, 0x00, 0x1c,
+      /*32*/ 0x7c, 0x08, 0x02, 0xa6, 0x90, 0xe1, 0x00, 0x20,
+      /*40*/ 0x90, 0x01, 0x00, 0x30, 0x91, 0x01, 0x00, 0x24,
+      /*48*/ 0x7c, 0x00, 0x00, 0x26, 0x91, 0x21, 0x00, 0x28,
+      /*56*/ 0x91, 0x41, 0x00, 0x2c, 0x90, 0x01, 0x00, 0x08,
+      /*64*/ 0x48, 0x00, 0x02, 0x91, 0x7c, 0x69, 0x03, 0xa6, /* at 64: bl aff0 */
+      /*72*/ 0x80, 0x01, 0x00, 0x30, 0x81, 0x41, 0x00, 0x2c,
+      /*80*/ 0x81, 0x21, 0x00, 0x28, 0x7c, 0x08, 0x03, 0xa6,
+      /*88*/ 0x81, 0x01, 0x00, 0x24, 0x80, 0x01, 0x00, 0x08,
+      /*96*/ 0x80, 0xe1, 0x00, 0x20, 0x80, 0xc1, 0x00, 0x1c,
+      /*104*/0x7c, 0x0f, 0xf1, 0x20, 0x80, 0xa1, 0x00, 0x18,
+      /*112*/0x80, 0x81, 0x00, 0x14, 0x80, 0x61, 0x00, 0x10,
+      /*120*/0x80, 0x01, 0x00, 0x0c, 0x38, 0x21, 0x00, 0x40,
+      /*128*/0x4e, 0x80, 0x04, 0x20 };
+#else
+#if defined(VGA_amd64)
+   /* x86_64 */
+   static int code_offsets[] = { 0, 62, 66, 44, 110, 0 };
+   static unsigned char code[] = {
+      /* 0*/ 0x48, 0x83, 0xec, 0x38, 0x48, 0x89, 0x04, 0x24,
+      /* 8*/ 0x48, 0x89, 0x4c, 0x24, 0x08, 0x48, 0x89, 0x54, 0x24, 0x10,
+      /*18*/ 0x48, 0x89, 0x74, 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20,
+      /*28*/ 0x4c, 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, 0x30,
+      /*38*/ 0x48, 0x8b, 0x74, 0x24, 0x40, 0x49, 0x89, 0xf3,
+      /*46*/ 0x4c, 0x01, 0xde, 0x4c, 0x01, 0xde, 0x48, 0xc1, 0xe6, 0x03,
+      /*56*/ 0x48, 0x8b, 0x7c, 0x24, 0x38, 0xe8, 0xee, 0x01, 0x00, 0x00,
+      /*66*/ 0x49, 0x89, 0xc3, 0x4c, 0x8b, 0x4c, 0x24, 0x30,
+      /*74*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20,
+      /*84*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, 0x24, 0x10,
+      /*94*/ 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, 0x8b, 0x04, 0x24,
+      /*103*/0x48, 0x83, 0xc4, 0x48, 0x41, 0xff, 0xe3 };
+#else
+   /* Unknown architecture, no check is done */
+   static int code_offsets[] = { 0, 0 };
+   static unsigned char code[] = { 0 };
+#endif
+#endif
+#endif
+
+   int *range = &(code_offsets[0]), *r = 0;
+   Bool found = False;
+   Addr addr, end;
+
+   /* Only search in libraries with a given name pattern */
+   if ((VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) &&
+       (VG_(strncmp)(obj->name, "/lib64/ld", 9) != 0)) return;
+
+   CLG_DEBUG(1, "search_rs: Checking %d bytes of [%x %x %x...]\n",
+             range[1], code[0], code[1], code[2]);
+
+   end = obj->start + obj->size - range[1];
+   addr = obj->start;
+   while(addr < end) {
+      if (VG_(memcmp)( (void*)addr, code, range[1]) == 0) {
+
+         r = range + 2;
+         found = True;
+         while(r[1]) {
+            CLG_DEBUG(1, " [%p] Found! Checking %d bytes of [%x %x %x...]\n",
+                      addr, r[1], code[r[0]], code[r[0]+1], code[r[0]+2]);
+
+            if (VG_(memcmp)( (void*)(addr+r[0]), code+r[0], r[1]) != 0) {
+               found = False;
+               break;
+            }
+            r += 2;
+         }
+         if (found) break;
+      }
+      addr++;
+   }
+
+   if (!found || (r==0)) return;
+
+   if (VG_(clo_verbosity) > 1)
+      VG_(message)(Vg_DebugMsg, "Code check found runtime_resolve: %s +%p=%p, length %d",
+                   obj->name + obj->last_slash_pos,
+                   addr - obj->start, addr, r[0]);
+
+   runtime_resolve_addr = addr;
+   runtime_resolve_length = r[0];
+}
+
+/*------------------------------------------------------------*/
+/*--- Object/File/Function hash entry operations           ---*/
+/*------------------------------------------------------------*/
+
+/* Object hash table, fixed */
+static obj_node* obj_table[N_OBJ_ENTRIES];
+
+void CLG_(init_obj_table)()
+{
+   Int i;
+   for (i = 0; i < N_OBJ_ENTRIES; i++)
+      obj_table[i] = 0;
+}
+
+#define HASH_CONSTANT 256
+
+static UInt str_hash(const Char *s, UInt table_size)
+{
+   int hash_value = 0;
+   for ( ; *s; s++)
+      hash_value = (HASH_CONSTANT * hash_value + *s) % table_size;
+   return hash_value;
+}
+
+
+static Char* anonymous_obj = "???";
+
+static __inline__
+obj_node* new_obj_node(SegInfo* si, obj_node* next)
+{
+   Int i;
+   obj_node* new;
+
+   new = (obj_node*) CLG_MALLOC(sizeof(obj_node));
+   new->name = si ? VG_(strdup)( VG_(seginfo_filename)(si) )
+                  : anonymous_obj;
+   for (i = 0; i < N_FILE_ENTRIES; i++) {
+      new->files[i] = NULL;
+   }
+   CLG_(stat).distinct_objs ++;
+   new->number = CLG_(stat).distinct_objs;
+   new->start  = si ? VG_(seginfo_start)(si) : 0;
+   new->size   = si ? VG_(seginfo_size)(si) : 0;
+   new->offset = si ? VG_(seginfo_sym_offset)(si) : 0;
+   new->next   = next;
+
+   // not only used for debug output (see static.c)
+   new->last_slash_pos = 0;
+   i = 0;
+   while(new->name[i]) {
+      if (new->name[i]=='/') new->last_slash_pos = i+1;
+      i++;
+   }
+
+   if (runtime_resolve_addr == 0) search_runtime_resolve(new);
+
+   return new;
+}
+
+obj_node* CLG_(get_obj_node)(SegInfo* si)
+{
+   obj_node* curr_obj_node;
+   UInt objname_hash;
+   const UChar* obj_name;
+
+   obj_name = si ? (Char*) VG_(seginfo_filename)(si) : anonymous_obj;
+
+   /* lookup in obj hash */
+   objname_hash = str_hash(obj_name, N_OBJ_ENTRIES);
+   curr_obj_node = obj_table[objname_hash];
+   while (NULL != curr_obj_node &&
+          VG_(strcmp)(obj_name, curr_obj_node->name) != 0) {
+      curr_obj_node = curr_obj_node->next;
+   }
+   if (NULL == curr_obj_node) {
+      obj_table[objname_hash] = curr_obj_node =
+         new_obj_node(si, obj_table[objname_hash]);
+   }
+
+   return curr_obj_node;
+}
+
+
+static __inline__
+file_node* new_file_node(Char filename[FILENAME_LEN],
+                         obj_node* obj, file_node* next)
+{
+   Int i;
+   file_node* new = (file_node*) CLG_MALLOC(sizeof(file_node));
+   new->name = VG_(strdup)(filename);
+   for (i = 0; i < N_FN_ENTRIES; i++) {
+      new->fns[i] = NULL;
+   }
+   CLG_(stat).distinct_files++;
+   new->number = CLG_(stat).distinct_files;
+   new->obj    = obj;
+   new->next   = next;
+   return new;
+}
+
+
+file_node* CLG_(get_file_node)(obj_node* curr_obj_node,
+                               Char filename[FILENAME_LEN])
+{
+   file_node* curr_file_node;
+   UInt filename_hash;
+
+   /* lookup in file hash */
+   filename_hash = str_hash(filename, N_FILE_ENTRIES);
+   curr_file_node = curr_obj_node->files[filename_hash];
+   while (NULL != curr_file_node &&
+          VG_(strcmp)(filename, curr_file_node->name) != 0) {
+      curr_file_node = curr_file_node->next;
+   }
+   if (NULL == curr_file_node) {
+      curr_obj_node->files[filename_hash] = curr_file_node =
+         new_file_node(filename, curr_obj_node,
+                       curr_obj_node->files[filename_hash]);
+   }
+
+   return curr_file_node;
+}
+
+/* forward decl. */
+static void resize_fn_array(void);
+
+static __inline__
+fn_node* new_fn_node(Char fnname[FILENAME_LEN],
+                     file_node* file, fn_node* next)
+{
+   fn_node* new = (fn_node*) CLG_MALLOC(sizeof(fn_node));
+   new->name = VG_(strdup)(fnname);
+
+   CLG_(stat).distinct_fns++;
+   new->number   = CLG_(stat).distinct_fns;
+   new->last_cxt = 0;
+   new->pure_cxt = 0;
+   new->file     = file;
+   new->next     = next;
+
+   new->dump_before = False;
+   new->dump_after = False;
+   new->zero_before = False;
+   new->toggle_collect = False;
+   new->skip = False;
+   new->pop_on_jump = False;
+   new->is_malloc = False;
+   new->is_realloc = False;
+   new->is_free = False;
+
+   new->group = 0;
+   new->separate_callers = CLG_(clo).separate_callers;
+   new->separate_recursions = CLG_(clo).separate_recursions;
+
+#if CLG_ENABLE_DEBUG
+   new->verbosity = -1;
+#endif
+
+   if (CLG_(stat).distinct_fns >= current_fn_active.size)
+      resize_fn_array();
+
+   return new;
+}
+
+
+/* Get a function node in hash2 with known file node.
+ * Hash nodes are created if needed.
+ */
+static
+fn_node* get_fn_node_infile(file_node* curr_file_node,
+                            Char fnname[FN_NAME_LEN])
+{
+   fn_node* curr_fn_node;
+   UInt fnname_hash;
+
+   CLG_ASSERT(curr_file_node != 0);
+
+   /* lookup in function hash */
+   fnname_hash = str_hash(fnname, N_FN_ENTRIES);
+   curr_fn_node = curr_file_node->fns[fnname_hash];
+   while (NULL != curr_fn_node &&
+          VG_(strcmp)(fnname, curr_fn_node->name) != 0) {
+      curr_fn_node = curr_fn_node->next;
+   }
+   if (NULL == curr_fn_node) {
+      curr_file_node->fns[fnname_hash] = curr_fn_node =
+         new_fn_node(fnname, curr_file_node,
+                     curr_file_node->fns[fnname_hash]);
+   }
+
+   return curr_fn_node;
+}
+
+
+/* Get a function node in a Segment.
+ * Hash nodes are created if needed.
+ */
+static __inline__
+fn_node* get_fn_node_inseg(SegInfo* si,
+                           Char filename[FILENAME_LEN],
+                           Char fnname[FN_NAME_LEN])
+{
+   obj_node  *obj  = CLG_(get_obj_node)(si);
+   file_node *file = CLG_(get_file_node)(obj, filename);
+   fn_node   *fn   = get_fn_node_infile(file, fnname);
+
+   return fn;
+}
+
+
+Bool CLG_(get_debug_info)(Addr instr_addr,
+                          Char filename[FILENAME_LEN],
+                          Char fn_name[FN_NAME_LEN], UInt* line_num,
+                          SegInfo** pSegInfo)
+{
+   Bool found1, found2, result = True;
+   UInt line;
+
+   CLG_DEBUG(6, " + get_debug_info(%p)\n", instr_addr);
+
+   if (pSegInfo) {
+      *pSegInfo = VG_(find_seginfo)(instr_addr);
+
+      // for generated code in anonymous space, pSegInfo is 0
+   }
+
+   found1 = VG_(get_filename_linenum)(instr_addr,
+                                      filename, FILENAME_LEN,
+                                      NULL, 0, NULL, // FIXME: add dirnames!
+                                      &line);
+   found2 = VG_(get_fnname)(instr_addr,
+                            fn_name, FN_NAME_LEN);
+
+   if (!found1 && !found2) {
+      CLG_(stat).no_debug_BBs++;
+      VG_(strcpy)(filename, "???");
+      VG_(strcpy)(fn_name, "???");
+      if (line_num) *line_num=0;
+      result = False;
+
+   } else if ( found1 && found2) {
+      CLG_(stat).full_debug_BBs++;
+      if (line_num) *line_num=line;
+
+   } else if ( found1 && !found2) {
+      CLG_(stat).file_line_debug_BBs++;
+      VG_(strcpy)(fn_name, "???");
+      if (line_num) *line_num=line;
+
+   } else /*(!found1 && found2)*/ {
+      CLG_(stat).fn_name_debug_BBs++;
+      VG_(strcpy)(filename, "???");
+      if (line_num) *line_num=0;
+   }
+
+   CLG_DEBUG(6, " - get_debug_info(%p): seg '%s', fn %s\n",
+             instr_addr,
+             !pSegInfo   ? (const UChar*)"-" :
+             (*pSegInfo) ? VG_(seginfo_filename)(*pSegInfo) :
+                           (const UChar*)"(None)",
+             fn_name);
+
+   return result;
+}
+
+/* for _libc_freeres_wrapper => _exit renaming */
+static BB* exit_bb = 0;
+
+
+/*
+ * Attach function struct to a BB from debug info.
+ */
+fn_node* CLG_(get_fn_node)(BB* bb)
+{
+   Char filename[FILENAME_LEN], fnname[FN_NAME_LEN];
+   SegInfo* si;
+   UInt line_num;
+   fn_node* fn;
+
+   /* fn from debug info is idempotent for a BB */
+   if (bb->fn) return bb->fn;
+
+   CLG_DEBUG(3,"+ get_fn_node(BB %p)\n", bb_addr(bb));
+
+   /* get function/file name, line number and object of
+    * the BB according to debug information
+    */
+   CLG_(get_debug_info)(bb_addr(bb),
+                        filename, fnname, &line_num, &si);
+
+   if (0 == VG_(strcmp)(fnname, "???")) {
+      int p;
+
+      /* Use address as found in library */
+      if (sizeof(Addr) == 4)
+         p = VG_(sprintf)(fnname, "%08p", bb->offset);
+      else
+         // 64bit address
+         p = VG_(sprintf)(fnname, "%016p", bb->offset);
+
+      VG_(sprintf)(fnname+p, "%s",
+                   (bb->sect_kind == Vg_SectData) ? " [Data]" :
+                   (bb->sect_kind == Vg_SectBSS)  ? " [BSS]"  :
+                   (bb->sect_kind == Vg_SectGOT)  ? " [GOT]"  :
+                   (bb->sect_kind == Vg_SectPLT)  ? " [PLT]"  : "");
+   }
+   else {
+      if (VG_(get_fnname_if_entry)(bb_addr(bb), fnname, FN_NAME_LEN))
+         bb->is_entry = 1;
+   }
+
+   /* HACK for correct _exit:
+    * _exit is redirected to VG_(__libc_freeres_wrapper) by valgrind,
+    * so we rename it back again :-)
+    */
+   if (0 == VG_(strcmp)(fnname, "vgPlain___libc_freeres_wrapper")
+       && exit_bb) {
+      CLG_(get_debug_info)(bb_addr(exit_bb),
+                           filename, fnname, &line_num, &si);
+
+      CLG_DEBUG(1, "__libc_freeres_wrapper renamed to _exit\n");
+   }
+   if (0 == VG_(strcmp)(fnname, "_exit") && !exit_bb)
+      exit_bb = bb;
+
+   if (runtime_resolve_addr &&
+       (bb_addr(bb) >= runtime_resolve_addr) &&
+       (bb_addr(bb) < runtime_resolve_addr + runtime_resolve_length)) {
+      /* BB in runtime_resolve found by code check; use this name */
+      VG_(sprintf)(fnname, "_dl_runtime_resolve");
+   }
+
+   /* get fn_node struct for this function */
+   fn = get_fn_node_inseg( si, filename, fnname);
+
+   /* if this is the 1st time the function is seen,
+    * some attributes are set */
+   if (fn->pure_cxt == 0) {
+
+      /* Every function gets a "pure" context, i.e. a context with stack
+       * depth 1 containing only this function. This is used for
+       * compression of mangled names.
+       */
+      fn_node* pure[2];
+      pure[0] = 0;
+      pure[1] = fn;
+      fn->pure_cxt = CLG_(get_cxt)(pure+1);
+
+      if (bb->sect_kind == Vg_SectPLT)
+         fn->skip = CLG_(clo).skip_plt;
+
+      if (VG_(strcmp)(fn->name, "_dl_runtime_resolve")==0) {
+         fn->pop_on_jump = True;
+
+         if (VG_(clo_verbosity) > 1)
+            VG_(message)(Vg_DebugMsg, "Symbol match: found runtime_resolve: %s +%p=%p",
+                         bb->obj->name + bb->obj->last_slash_pos,
+                         bb->offset, bb_addr(bb));
+      }
+
+      fn->is_malloc  = (VG_(strcmp)(fn->name, "malloc")==0);
+      fn->is_realloc = (VG_(strcmp)(fn->name, "realloc")==0);
+      fn->is_free    = (VG_(strcmp)(fn->name, "free")==0);
+
+      /* apply config options from function name patterns
+       * given on command line */
+      CLG_(update_fn_config)(fn);
+   }
+
+
+   bb->fn   = fn;
+   bb->line = line_num;
+
+   CLG_DEBUG(3,"- get_fn_node(BB %p): %s (in %s:%u)\n",
+             bb_addr(bb), fnname, filename, line_num);
+
+   return fn;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Active function array operations                      ---*/
+/*------------------------------------------------------------*/
+
+/* The active function array is a thread-specific array
+ * of UInts, mapping function numbers to the active count of
+ * functions.
+ * The active count is the number of times a function appears
+ * in the current call stack, and is used when costs for recursion
+ * levels should be separated.
+ */
+
+UInt* CLG_(get_fn_entry)(Int n)
+{
+   CLG_ASSERT(n < current_fn_active.size);
+   return current_fn_active.array + n;
+}
+
+void CLG_(init_fn_array)(fn_array* a)
+{
+   Int i;
+
+   CLG_ASSERT(a != 0);
+
+   a->size = N_INITIAL_FN_ARRAY_SIZE;
+   if (a->size <= CLG_(stat).distinct_fns)
+      a->size = CLG_(stat).distinct_fns+1;
+
+   a->array = (UInt*) CLG_MALLOC(a->size * sizeof(UInt));
+   for(i=0; i<a->size; i++)
+      a->array[i] = 0;
+}
+
+void CLG_(copy_current_fn_array)(fn_array* dst)
+{
+   CLG_ASSERT(dst != 0);
+
+   dst->size  = current_fn_active.size;
+   dst->array = current_fn_active.array;
+}
+
+fn_array* CLG_(get_current_fn_array)()
+{
+   return &current_fn_active;
+}
+
+void CLG_(set_current_fn_array)(fn_array* a)
+{
+   CLG_ASSERT(a != 0);
+
+   current_fn_active.size  = a->size;
+   current_fn_active.array = a->array;
+   if (current_fn_active.size <= CLG_(stat).distinct_fns)
+      resize_fn_array();
+}
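+
+/* Example (illustrative): for a recursive fact(), each level on the call
+ * stack increments the active count stored at fact's function number:
+ *
+ *   UInt* a = CLG_(get_fn_entry)(fn->number);
+ *   (*a)++;   // on function enter (done by the call stack handling
+ *             // elsewhere in this patch)
+ *
+ * With --separate-recs=N, the cost center chosen for a BB of fact then
+ * depends on min(N, active count) - 1, so each recursion depth up to N
+ * gets its own cost center.
+ */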
+/* Ensure that the active function array is big enough:
+ * <distinct_fns> is the highest used index, so the array size
+ * has to be bigger than that.
+ */
+static void resize_fn_array(void)
+{
+   UInt* new;
+   Int i, newsize;
+
+   newsize = current_fn_active.size;
+   while (newsize <= CLG_(stat).distinct_fns) newsize *=2;
+
+   CLG_DEBUG(0, "Resize fn_active_array: %d => %d\n",
+             current_fn_active.size, newsize);
+
+   new = (UInt*) CLG_MALLOC(newsize * sizeof(UInt));
+   for(i=0; i<current_fn_active.size; i++)
+      new[i] = current_fn_active.array[i];
+   while(i<newsize)
+      new[i++] = 0;
+
+   VG_(free)(current_fn_active.array);
+   current_fn_active.size  = newsize;
+   current_fn_active.array = new;
+}
diff --git a/callgrind/global.h b/callgrind/global.h
new file mode 100644
--- /dev/null
+++ b/callgrind/global.h
+/*
+ * Cost info for a call arc (from->bb->jmp_addr => to->bb->addr)
+ *
+ * Each BB has at most one CALL instruction. The list of JCCs from
+ * this call is a pointer to the list head (stored in BBCC), and
+ * <next_from> chains the JCCs in the JCC struct.
+ *
+ * For fast lookup, JCCs are reachable with a hash table, keyed by
+ * the (from_bbcc,to) pair. <next_hash> is used for the JCC chain
+ * of one hash table entry.
+ *
+ * <cost> holds the event counts for already returned executions.
+ * The event counters at the last entering of the subroutine are kept
+ * with the call stack entry; <cost> is updated on returning from the
+ * subroutine by adding the diff of those saved counters and the
+ * current ones.
+ *
+ * After updating, the saved counters are set to the current ones.
+ * Thus, events are not counted twice for recursive calls (TODO: True?)
+ */
+#define JmpNone (Ijk_Boring+30)
+#define JmpCond (Ijk_Boring+31)
+
+struct _jCC {
+   Int  jmpkind;     /* JmpCall, JmpBoring, JmpCond */
+   jCC* next_hash;   /* for hash entry chain */
+   jCC* next_from;   /* next JCC from a BBCC */
+   BBCC *from, *to;  /* call arc from/to this BBCC */
+   UInt jmp;         /* jump no. in source */
+
+   ULong call_counter; /* no wraparound with 64 bit */
+
+   FullCost cost;    /* simulator + user counters */
+};
+
+
+/*
+ * Info for one instruction of a basic block.
+ */
+typedef struct _InstrInfo InstrInfo;
+struct _InstrInfo {
+   UInt instr_offset;
+   UInt instr_size;
+   UInt data_size;
+   UInt cost_offset;
+   EventSet* eventset;
+};
+
+
+/*
+ * Info for a conditional jump in a basic block
+ */
+typedef struct _CJmpInfo CJmpInfo;
+struct _CJmpInfo {
+   UInt instr; /* instruction index in this basic block */
+   Bool skip;  /* Cond.Jumps to next instruction should be ignored */
+};
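+
+/* Sketch (illustrative): for a program where main() calls foo() from two
+ * different call sites, there is one jCC per (calling BBCC, target BBCC)
+ * pair, found via the JCC hash:
+ *
+ *   BBCC(main, site1) --jCC--> BBCC(foo)    call_counter=..., cost=...
+ *   BBCC(main, site2) --jCC--> BBCC(foo)    call_counter=..., cost=...
+ *
+ * All jCCs starting at one BBCC exit are chained via next_from; all jCCs
+ * in one hash bucket via next_hash. The dump code walks next_from to emit
+ * the "cfn="/"calls=" records seen in dump.c above.
+ */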
+
+/**
+ * An instrumented basic block (BB).
+ *
+ * BBs are put into a resizable hash to allow for fast detection if a
+ * BB is to be retranslated but cost info is already available.
+ * The key for a BB is an (object, offset) tuple making it independent
+ * from possibly multiple mappings of the same ELF object.
+ *
+ * At the beginning of each instrumented BB,
+ * a call to setup_bbcc(), specifying a pointer to the
+ * according BB structure, is added.
+ *
+ * As the cost of a BB has to be distinguished depending on the context,
+ * multiple cost centers for one BB (struct BBCC) exist and the according
+ * BBCC is set by setup_bbcc.
+ */
+struct _BB {
+   obj_node*  obj;        /* ELF object of BB */
+   OffT       offset;     /* offset of BB in ELF object file */
+   BB*        next;       /* chaining for a hash entry */
+
+   VgSectKind sect_kind;  /* section of this BB, e.g. PLT */
+   UInt       instr_count;
+
+   /* filled by CLG_(get_fn_node) if debug info is available */
+   fn_node*   fn;         /* debug info for this BB */
+   UInt       line;
+   Bool       is_entry;   /* True if this BB is a function entry */
+
+   BBCC*      bbcc_list;  /* BBCCs for same BB (see next_bbcc in BBCC) */
+   BBCC*      last_bbcc;  /* Temporary: Cached for faster access (LRU) */
+
+   /* filled by CLG_(instrument) if not seen before */
+   UInt       cjmp_count; /* number of conditional exits */
+   CJmpInfo*  jmp;        /* array of info for condition jumps,
+                           * allocated directly after this struct */
+   Int        jmpkind;    /* remember jump kind of final exit */
+   Bool       cjmp_inverted; /* condition of last cond.jump can be inverted by VEX */
+
+   UInt       instr_len;
+   UInt       cost_count;
+   InstrInfo  instr[0];   /* info on instruction sizes and costs */
+};
+
+
+
+/**
+ * Function context
+ *
+ * Basic blocks are always executed in the scope of a context.
+ * A function context is a list of function nodes representing
+ * the call chain to the current context: i.e. fn[0] is the
+ * function we are currently in, fn[1] has called fn[0], and so on.
+ * Recursion levels are used for fn[0].
+ *
+ * To get a unique number for a full execution context, use
+ *   rec_index = min(<fn->separate_recursions>, <active count of fn[0]>) - 1;
+ *   unique_no = <base_number> + rec_index
+ *
+ * For each Context, recursion index and BB, there can be a BBCC.
+ */
+struct _Context {
+   UInt size;        // number of function dependencies
+   UInt base_number; // for context compression & dump array
+   Context* next;    // entry chaining for hash
+   UWord hash;       // for faster lookup...
+   fn_node* fn[0];
+};
+
+
+/*
+ * Cost info for one exit of a basic block
+ */
+typedef struct _JmpData JmpData;
+struct _JmpData {
+   ULong ecounter; /* number of times the BB was left at this exit */
+   jCC*  jcc_list; /* JCCs for Cond.Jumps from this exit */
+};
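+
+/* Worked example (illustrative): suppose context C = (foo, main) has
+ * base_number 17, foo->separate_recursions is 2, and foo currently
+ * appears 3 times on the call stack. Then
+ *
+ *   rec_index = min(2, 3) - 1 = 1
+ *   unique_no = 17 + 1       = 18
+ *
+ * i.e. all recursion levels >= 2 share the second cost center of C.
+ */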
+/*
+ * Info for a conditional jump in a basic block
+ */
+typedef struct _JmpData JmpData;
+struct _JmpData {
+    ULong ecounter; /* number of times the BB was left at this exit */
+    jCC*  jcc_list; /* JCCs for Cond. Jumps from this exit */
+};
+
+
+/*
+ * Basic Block Cost Center
+ *
+ * On demand, multiple BBCCs will be created for the same BB
+ * depending on command line options and:
+ * - current function (it's possible that a BB is executed in the
+ *   context of different functions, e.g. in manual assembler/PLT)
+ * - current thread ID
+ * - position where current function is called from
+ * - recursion level of current function
+ *
+ * The cost centres for the instructions of a basic block are
+ * stored in a contiguous array.
+ * They are distinguishable by their tag field.
+ */
+struct _BBCC {
+    BB*      bb;          /* BB for this cost center */
+
+    Context* cxt;         /* execution context of this BBCC */
+    ThreadId tid;         /* only for assertion check purpose */
+    UInt     rec_index;   /* Recursion index in rec->bbcc for this bbcc */
+    BBCC**   rec_array;   /* Variable sized array of pointers to
+                           * recursion BBCCs. Shared. */
+    ULong    ret_counter; /* how often returned from jccs of this bbcc;
+                           * used to check if a dump for this BBCC is needed */
+
+    BBCC*    next_bbcc;     /* Chain of BBCCs for same BB */
+    BBCC*    lru_next_bbcc; /* BBCC executed next the last time */
+
+    jCC*     lru_from_jcc; /* Temporary: Cached for faster access (LRU) */
+    jCC*     lru_to_jcc;   /* Temporary: Cached for faster access (LRU) */
+    FullCost skipped;      /* cost for skipped functions called from
+                            * jmp_addr. Allocated lazily */
+
+    BBCC*    next;          /* entry chain in hash */
+    ULong*   cost;          /* start of 64-bit costs for this BBCC */
+    ULong    ecounter_sum;  /* execution counter for first instruction of BB */
+    JmpData  jmp[0];
+};
+
+
+/* the <number>s of fn_node, file_node and obj_node are for compressed dumping
+ * and an index into the dump boolean table and fn_info_table
+ */
+
+struct _fn_node {
+    Char*      name;
+    UInt       number;
+    Context*   last_cxt; /* LRU info */
+    Context*   pure_cxt; /* the context with only the function itself */
+    file_node* file;     /* reverse mapping for 2nd hash */
+    fn_node*   next;
+
+    Bool dump_before :1;
+    Bool dump_after :1;
+    Bool zero_before :1;
+    Bool toggle_collect :1;
+    Bool skip :1;
+    Bool pop_on_jump :1;
+
+    Bool is_malloc :1;
+    Bool is_realloc :1;
+    Bool is_free :1;
+
+    Int  group;
+    Int  separate_callers;
+    Int  separate_recursions;
+#if CLG_ENABLE_DEBUG
+    Int  verbosity; /* Stores old verbosity level while in function */
+#endif
+};
+
+/* Quite arbitrary fixed hash sizes */
+
+#define   N_OBJ_ENTRIES    47
+#define  N_FILE_ENTRIES    53
+#define    N_FN_ENTRIES    87
+#define N_BBCC2_ENTRIES    37
+
+struct _file_node {
+    Char*      name;
+    fn_node*   fns[N_FN_ENTRIES];
+    UInt       number;
+    obj_node*  obj;
+    file_node* next;
+};
+
+/* If an object is dlopened multiple times, we hope that <name> is unique;
+ * <start> and <offset> can change with each dlopen, and <start> is
+ * zero when the object is unmapped (possible at dump time).
+ */
+struct _obj_node {
+    Char* name;
+    UInt  last_slash_pos;
+
+    Addr  start;  /* Start address of text segment mapping */
+    SizeT size;   /* Length of mapping */
+    OffT  offset; /* Offset between symbol address and file offset */
+
+    file_node* files[N_FILE_ENTRIES];
+    UInt       number;
+    obj_node*  next;
+};
+
+/* an entry in the callstack
+ *
+ * <nonskipped> is 0 if the function called is not skipped (usual case).
+ * Otherwise, it is the last non-skipped BBCC. This one gets all
+ * the calls to non-skipped functions and all costs in skipped
+ * instructions.
+ */
+struct _call_entry {
+    jCC*     jcc;        /* jCC for this call */
+    FullCost enter_cost; /* cost event counters at entering frame */
+    Addr     sp;         /* stack pointer directly after call */
+    Addr     ret_addr;   /* address to return to;
+                          * is 0 on a simulated call */
+    BBCC*    nonskipped; /* see above */
+    Context* cxt;        /* context before call */
+    Int      fn_sp;      /* function stack index before call */
+};
+
+
+/*
+ * Execution state of main thread or a running signal handler in
+ * a thread while interrupted by another signal handler.
+ * As there's no scheduling among running signal handlers of one thread,
+ * we only need a subset of a full thread state:
+ * - event counter
+ * - collect state
+ * - last BB, last jump kind, last nonskipped BB
+ * - callstack pointer for sanity checking and correct unwinding
+ *   after exit
+ */
+typedef struct _exec_state exec_state;
+struct _exec_state {
+
+    /* the signum of the handler, 0 for main thread context
+     */
+    Int sig;
+
+    /* the old call stack pointer at entering the signal handler */
+    Int orig_sp;
+
+    FullCost cost;
+    Bool     collect;
+    Context* cxt;
+
+    Int   jmps_passed; /* number of conditional jumps passed in last BB */
+    BBCC* bbcc;        /* last BB executed */
+    BBCC* nonskipped;
+
+    Int call_stack_bottom; /* Index into fn_stack */
+};
+
+/* Global state structures */
+typedef struct _bb_hash bb_hash;
+struct _bb_hash {
+    UInt size, entries;
+    BB** table;
+};
+
+typedef struct _cxt_hash cxt_hash;
+struct _cxt_hash {
+    UInt size, entries;
+    Context** table;
+};
+
+/* Thread specific state structures, i.e. parts of a thread state.
+ * There are variables for the current state of each part, + * on which a thread state is copied at thread switch. + */ +typedef struct _bbcc_hash bbcc_hash; +struct _bbcc_hash { + UInt size, entries; + BBCC** table; +}; + +typedef struct _jcc_hash jcc_hash; +struct _jcc_hash { + UInt size, entries; + jCC** table; + jCC* spontaneous; +}; + +typedef struct _fn_array fn_array; +struct _fn_array { + UInt size; + UInt* array; +}; + +typedef struct _call_stack call_stack; +struct _call_stack { + UInt size; + Int sp; + call_entry* entry; +}; + +typedef struct _fn_stack fn_stack; +struct _fn_stack { + UInt size; + fn_node **bottom, **top; +}; + +/* The maximum number of simultaneous running signal handlers per thread. + * This is the number of execution states storable in a thread. + */ +#define MAX_SIGHANDLERS 10 + +typedef struct _exec_stack exec_stack; +struct _exec_stack { + Int sp; /* > 0 if a handler is running */ + exec_state* entry[MAX_SIGHANDLERS]; +}; + +/* Thread State + * + * This structure stores thread specific info while a thread is *not* + * running. See function switch_thread() for save/restore on thread switch. + * + * If --separate-threads=no, BBCCs and JCCs can be shared by all threads, i.e. + * only structures of thread 1 are used. + * This involves variables fn_info_table, bbcc_table and jcc_table. + */ +struct _thread_info { + + /* state */ + fn_stack fns; /* function stack */ + call_stack calls; /* context call arc stack */ + exec_stack states; /* execution states interrupted by signals */ + + /* dump statistics */ + FullCost lastdump_cost; /* Cost at last dump */ + FullCost sighandler_cost; + + /* thread specific data structure containers */ + fn_array fn_active; + jcc_hash jccs; + bbcc_hash bbccs; +}; + +/* Structs used for dumping */ + +/* Address position inside of a BBCC: + * This includes + * - the address offset from the BB start address + * - file/line from debug info for that address (can change inside a BB) + */ +typedef struct _AddrPos AddrPos; +struct _AddrPos { + Addr addr; + Addr bb_addr; + file_node* file; + UInt line; +}; + +/* a simulator cost entity that can be written out in one line */ +typedef struct _AddrCost AddrCost; +struct _AddrCost { + AddrPos p; + SimCost cost; +}; + +/* A function in an execution context */ +typedef struct _FnPos FnPos; +struct _FnPos { + file_node* file; + fn_node* fn; + obj_node* obj; + Context* cxt; + int rec_index; + UInt line; +}; + +/*------------------------------------------------------------*/ +/*--- Cache simulator interface ---*/ +/*------------------------------------------------------------*/ + +struct cachesim_if +{ + void (*print_opts)(void); + Bool (*parse_opt)(Char* arg); + void (*post_clo_init)(void); + void (*clear)(void); + void (*getdesc)(Char* buf); + void (*printstat)(void); + void (*add_icost)(SimCost, BBCC*, InstrInfo*, ULong); + void (*after_bbsetup)(void); + void (*finish)(void); + + void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); + + void (*log_1I1Dr)(InstrInfo*, Addr) VG_REGPARM(2); + void (*log_1I1Dw)(InstrInfo*, Addr) VG_REGPARM(2); + void (*log_1I2D)(InstrInfo*, Addr, Addr) VG_REGPARM(3); + + void (*log_0I1Dr)(InstrInfo*, Addr) VG_REGPARM(2); + void (*log_0I1Dw)(InstrInfo*, Addr) VG_REGPARM(2); + void (*log_0I2D)(InstrInfo*, Addr, Addr) VG_REGPARM(3); + + // function names of helpers (for debugging generated code) + Char *log_1I0D_name; + Char *log_1I1Dr_name, *log_1I1Dw_name, *log_1I2D_name; + Char *log_0I1Dr_name, *log_0I1Dw_name, *log_0I2D_name; +}; + + 
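The simulator backend is selected by filling this vtable. Purely as an
illustration, a backend that simulates nothing could be wired up as below; all
nil_* names are invented here, and the real setup lives in sim.c:

    static void nil_print_opts(void)     { }
    static Bool nil_parse_opt(Char* arg) { return False; }
    static void nil_post_clo_init(void)  { }
    static void nil_clear(void)          { }
    static void nil_getdesc(Char* buf)   { VG_(sprintf)(buf, "none\n"); }
    static void nil_printstat(void)      { }
    static void nil_finish(void)         { }

    static void init_nil_backend(void)
    {
       CLG_(cachesim).print_opts    = nil_print_opts;
       CLG_(cachesim).parse_opt     = nil_parse_opt;
       CLG_(cachesim).post_clo_init = nil_post_clo_init;
       CLG_(cachesim).clear         = nil_clear;
       CLG_(cachesim).getdesc       = nil_getdesc;
       CLG_(cachesim).printstat     = nil_printstat;
       CLG_(cachesim).finish        = nil_finish;
       /* add_icost and after_bbsetup would need nil handlers too.
        * A log_* hook left at 0 makes insert_simcall() in main.c emit
        * no helper call for the corresponding access type: */
       CLG_(cachesim).log_1I0D  = 0;  CLG_(cachesim).log_1I0D_name  = 0;
       CLG_(cachesim).log_1I1Dr = 0;  CLG_(cachesim).log_1I1Dr_name = 0;
       CLG_(cachesim).log_1I1Dw = 0;  CLG_(cachesim).log_1I1Dw_name = 0;
       CLG_(cachesim).log_1I2D  = 0;  CLG_(cachesim).log_1I2D_name  = 0;
       CLG_(cachesim).log_0I1Dr = 0;  CLG_(cachesim).log_0I1Dr_name = 0;
       CLG_(cachesim).log_0I1Dw = 0;  CLG_(cachesim).log_0I1Dw_name = 0;
       CLG_(cachesim).log_0I2D  = 0;  CLG_(cachesim).log_0I2D_name  = 0;
    }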
+/*------------------------------------------------------------*/ +/*--- Functions ---*/ +/*------------------------------------------------------------*/ + +/* from clo.c */ + +void CLG_(set_clo_defaults)(void); +void CLG_(update_fn_config)(fn_node*); +Bool CLG_(process_cmd_line_option)(Char*); +void CLG_(print_usage)(void); +void CLG_(print_debug_usage)(void); + +/* from sim.c */ +struct event_sets { + EventSet *use, *Ir, *Dr, *Dw; + EventSet *D0, *D1r, *D1w, *D2; + EventSet *sim; + EventSet *full; /* sim plus user events */ + + /* offsets into eventsets */ + Int off_sim_Ir, off_sim_Dr, off_sim_Dw; + Int off_full_Ir, off_full_Dr, off_full_Dw; + Int off_full_user, off_full_alloc, off_full_systime; +}; + +extern struct event_sets CLG_(sets); +extern struct cachesim_if CLG_(cachesim); + +void CLG_(init_eventsets)(Int user); + +/* from main.c */ +Bool CLG_(get_debug_info)(Addr, Char filename[FILENAME_LEN], + Char fn_name[FN_NAME_LEN], UInt*, SegInfo**); +void CLG_(collectBlockInfo)(IRBB* bbIn, UInt*, UInt*, Bool*); +void CLG_(set_instrument_state)(Char*,Bool); +void CLG_(dump_profile)(Char* trigger,Bool only_current_thread); +void CLG_(zero_all_cost)(Bool only_current_thread); +Int CLG_(get_dump_counter)(void); +void CLG_(fini)(Int exitcode); + +/* from command.c */ +void CLG_(init_command)(Char* dir, Char* dumps); +void CLG_(check_command)(void); +void CLG_(finish_command)(void); + +/* from bb.c */ +void CLG_(init_bb_hash)(void); +bb_hash* CLG_(get_bb_hash)(void); +BB* CLG_(get_bb)(Addr addr, IRBB* bb_in, Bool *seen_before); +void CLG_(delete_bb)(Addr addr); + +static __inline__ Addr bb_addr(BB* bb) + { return bb->offset + bb->obj->offset; } +static __inline__ Addr bb_jmpaddr(BB* bb) + { return bb->instr[bb->instr_count-1].instr_offset + bb->offset + bb->obj->offset; } + +/* from fn.c */ +void CLG_(init_fn_array)(fn_array*); +void CLG_(copy_current_fn_array)(fn_array* dst); +fn_array* CLG_(get_current_fn_array)(void); +void CLG_(set_current_fn_array)(fn_array*); +UInt* CLG_(get_fn_entry)(Int n); + +void CLG_(init_obj_table)(void); +obj_node* CLG_(get_obj_node)(SegInfo* si); +file_node* CLG_(get_file_node)(obj_node*, Char* filename); +fn_node* CLG_(get_fn_node)(BB* bb); + +/* from bbcc.c */ +void CLG_(init_bbcc_hash)(bbcc_hash* bbccs); +void CLG_(copy_current_bbcc_hash)(bbcc_hash* dst); +bbcc_hash* CLG_(get_current_bbcc_hash)(void); +void CLG_(set_current_bbcc_hash)(bbcc_hash*); +void CLG_(forall_bbccs)(void (*func)(BBCC*)); +void CLG_(zero_bbcc)(BBCC* bbcc); +BBCC* CLG_(get_bbcc)(BB* bb); +BBCC* CLG_(clone_bbcc)(BBCC* orig, Context* cxt, Int rec_index); +void CLG_(setup_bbcc)(BB* bb) VG_REGPARM(1); + + +/* from jumps.c */ +void CLG_(init_jcc_hash)(jcc_hash*); +void CLG_(copy_current_jcc_hash)(jcc_hash* dst); +jcc_hash* CLG_(get_current_jcc_hash)(void); +void CLG_(set_current_jcc_hash)(jcc_hash*); +jCC* CLG_(get_jcc)(BBCC* from, UInt, BBCC* to); + +/* from callstack.c */ +void CLG_(init_call_stack)(call_stack*); +void CLG_(copy_current_call_stack)(call_stack* dst); +void CLG_(set_current_call_stack)(call_stack*); +call_entry* CLG_(get_call_entry)(Int n); + +void CLG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip); +void CLG_(pop_call_stack)(void); +void CLG_(unwind_call_stack)(Addr sp, Int); + +/* from context.c */ +void CLG_(init_fn_stack)(fn_stack*); +void CLG_(copy_current_fn_stack)(fn_stack*); +fn_stack* CLG_(get_current_fn_stack)(void); +void CLG_(set_current_fn_stack)(fn_stack*); + +void CLG_(init_cxt_table)(void); +cxt_hash* CLG_(get_cxt_hash)(void); +Context* 
CLG_(get_cxt)(fn_node** fn); +void CLG_(push_cxt)(fn_node* fn); + +/* from threads.c */ +void CLG_(init_threads)(void); +thread_info** CLG_(get_threads)(void); +thread_info* CLG_(get_current_thread)(void); +void CLG_(switch_thread)(ThreadId tid); +void CLG_(forall_threads)(void (*func)(thread_info*)); +void CLG_(run_thread)(ThreadId tid); + +void CLG_(init_exec_state)(exec_state* es); +void CLG_(init_exec_stack)(exec_stack*); +void CLG_(copy_current_exec_stack)(exec_stack*); +void CLG_(set_current_exec_stack)(exec_stack*); +void CLG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack); +void CLG_(post_signal)(ThreadId tid, Int sigNum); +void CLG_(run_post_signal_on_call_stack_bottom)(void); + +/* from dump.c */ +extern FullCost CLG_(total_cost); +void CLG_(init_files)(Char** dir, Char** file); +Char* CLG_(get_dump_file_base)(void); + + +/*------------------------------------------------------------*/ +/*--- Exported global variables ---*/ +/*------------------------------------------------------------*/ + +extern CommandLineOptions CLG_(clo); +extern Statistics CLG_(stat); +extern EventMapping* CLG_(dumpmap); + +/* Function active counter array, indexed by function number */ +extern UInt* CLG_(fn_active_array); +extern Bool CLG_(instrument_state); + +extern call_stack CLG_(current_call_stack); +extern fn_stack CLG_(current_fn_stack); +extern exec_state CLG_(current_state); +extern ThreadId CLG_(current_tid); + + +/*------------------------------------------------------------*/ +/*--- Debug output ---*/ +/*------------------------------------------------------------*/ + +#if CLG_ENABLE_DEBUG + +#define CLG_DEBUGIF(x) \ + if ( (CLG_(clo).verbose >x) && \ + (CLG_(stat).bb_executions >= CLG_(clo).verbose_start)) + +#define CLG_DEBUG(x,format,args...) \ + CLG_DEBUGIF(x) { \ + CLG_(print_bbno)(); \ + VG_(printf)(format,##args); \ + } + +#define CLG_ASSERT(cond) \ + if (!(cond)) { \ + CLG_(print_context)(); \ + CLG_(print_bbno)(); \ + tl_assert(cond); \ + } + +#else +#define CLG_DEBUGIF(x) if (0) +#define CLG_DEBUG(x...) {} +#define CLG_ASSERT(cond) tl_assert(cond); +#endif + +/* from debug.c */ +void CLG_(print_bbno)(void); +void CLG_(print_context)(void); +void CLG_(print_jcc)(int s, jCC* jcc); +void CLG_(print_bbcc)(int s, BBCC* bbcc, Bool); +void CLG_(print_bbcc_fn)(BBCC* bbcc); +void CLG_(print_execstate)(int s, exec_state* es); +void CLG_(print_eventset)(int s, EventSet* es); +void CLG_(print_cost)(int s, EventSet*, ULong* cost); +void CLG_(print_bb)(int s, BB* bb); +void CLG_(print_bbcc_cost)(int s, BBCC*); +void CLG_(print_cxt)(int s, Context* cxt, int rec_index); +void CLG_(print_short_jcc)(jCC* jcc); +void CLG_(print_stackentry)(int s, int sp); +void CLG_(print_addr)(Addr addr); +void CLG_(print_addr_ln)(Addr addr); + +void* CLG_(malloc)(UWord s, char* f); +void* CLG_(free)(void* p, char* f); +#if 0 +#define CLG_MALLOC(x) CLG_(malloc)(x,__FUNCTION__) +#define CLG_FREE(p) CLG_(free)(p,__FUNCTION__) +#else +#define CLG_MALLOC(x) VG_(malloc)(x) +#define CLG_FREE(p) VG_(free)(p) +#endif + +#endif /* CLG_GLOBAL */ diff --git a/callgrind/jumps.c b/callgrind/jumps.c new file mode 100644 index 0000000000..2a6a09a3c9 --- /dev/null +++ b/callgrind/jumps.c @@ -0,0 +1,233 @@ +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- ct_jumps.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Callgrind, a Valgrind tool for call tracing. 
+
+   Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "global.h"
+
+#define N_JCC_INITIAL_ENTRIES  4437
+
+/*------------------------------------------------------------*/
+/*--- Jump Cost Center (JCC) operations, including Calls   ---*/
+/*------------------------------------------------------------*/
+
+jcc_hash current_jccs;
+
+void CLG_(init_jcc_hash)(jcc_hash* jccs)
+{
+   Int i;
+
+   CLG_ASSERT(jccs != 0);
+
+   jccs->size    = N_JCC_INITIAL_ENTRIES;
+   jccs->entries = 0;
+   jccs->table   = (jCC**) CLG_MALLOC(jccs->size * sizeof(jCC*));
+   jccs->spontaneous = 0;
+
+   for (i = 0; i < jccs->size; i++)
+     jccs->table[i] = 0;
+}
+
+
+void CLG_(copy_current_jcc_hash)(jcc_hash* dst)
+{
+  CLG_ASSERT(dst != 0);
+
+  dst->size        = current_jccs.size;
+  dst->entries     = current_jccs.entries;
+  dst->table       = current_jccs.table;
+  dst->spontaneous = current_jccs.spontaneous;
+}
+
+void CLG_(set_current_jcc_hash)(jcc_hash* h)
+{
+  CLG_ASSERT(h != 0);
+
+  current_jccs.size        = h->size;
+  current_jccs.entries     = h->entries;
+  current_jccs.table       = h->table;
+  current_jccs.spontaneous = h->spontaneous;
+}
+
+__inline__
+static UInt jcc_hash_idx(BBCC* from, UInt jmp, BBCC* to, UInt size)
+{
+  return (UInt) ( (UWord)from + 7* (UWord)to + 13*jmp) % size;
+}
+
+/* double size of jcc table */
+static void resize_jcc_table(void)
+{
+    Int i, new_size, conflicts1 = 0, conflicts2 = 0;
+    jCC** new_table;
+    UInt new_idx;
+    jCC *curr_jcc, *next_jcc;
+
+    new_size  = 2 * current_jccs.size + 3;
+    new_table = (jCC**) CLG_MALLOC(new_size * sizeof(jCC*));
+
+    if (!new_table) return;
+
+    for (i = 0; i < new_size; i++)
+      new_table[i] = NULL;
+
+    for (i = 0; i < current_jccs.size; i++) {
+	if (current_jccs.table[i] == NULL) continue;
+
+	curr_jcc = current_jccs.table[i];
+	while (NULL != curr_jcc) {
+	    next_jcc = curr_jcc->next_hash;
+
+	    new_idx = jcc_hash_idx(curr_jcc->from, curr_jcc->jmp,
+				   curr_jcc->to, new_size);
+
+	    curr_jcc->next_hash = new_table[new_idx];
+	    new_table[new_idx]  = curr_jcc;
+	    if (curr_jcc->next_hash) {
+		conflicts1++;
+		if (curr_jcc->next_hash->next_hash)
+		    conflicts2++;
+	    }
+
+	    curr_jcc = next_jcc;
+	}
+    }
+
+    VG_(free)(current_jccs.table);
+
+    CLG_DEBUG(0, "Resize JCC Hash: %d => %d (entries %d, conflicts %d/%d)\n",
+	      current_jccs.size, new_size,
+	      current_jccs.entries, conflicts1, conflicts2);
+
+    current_jccs.size  = new_size;
+    current_jccs.table = new_table;
+    CLG_(stat).jcc_hash_resizes++;
+}
+
+
+
+/* new jCC structure: a call was done to a BB of a BBCC
+ * for a spontaneous call, from is 0 (i.e.
caller unknown) + */ +static jCC* new_jcc(BBCC* from, UInt jmp, BBCC* to) +{ + jCC* new; + UInt new_idx; + + /* check fill degree of jcc hash table and resize if needed (>80%) */ + current_jccs.entries++; + if (10 * current_jccs.entries / current_jccs.size > 8) + resize_jcc_table(); + + new = (jCC*) CLG_MALLOC(sizeof(jCC)); + + new->from = from; + new->jmp = jmp; + new->to = to; + new->jmpkind = Ijk_Call; + new->call_counter = 0; + new->cost = 0; + + /* insert into JCC chain of calling BBCC. + * This list is only used at dumping time */ + + if (from) { + new->next_from = from->jmp[jmp].jcc_list; + from->jmp[jmp].jcc_list = new; + } + else { + new->next_from = current_jccs.spontaneous; + current_jccs.spontaneous = new; + } + + /* insert into JCC hash table */ + new_idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + new->next_hash = current_jccs.table[new_idx]; + current_jccs.table[new_idx] = new; + + CLG_(stat).distinct_jccs++; + + CLG_DEBUGIF(3) { + VG_(printf)(" new_jcc (now %d): %p\n", + CLG_(stat).distinct_jccs, new); + } + + return new; +} + + +/* get the jCC for a call arc (BBCC->BBCC) */ +jCC* CLG_(get_jcc)(BBCC* from, UInt jmp, BBCC* to) +{ + jCC* jcc; + UInt idx; + + CLG_DEBUG(5, "+ get_jcc(bbcc %p/%d => bbcc %p)\n", + from, jmp, to); + + /* first check last recently used JCC */ + jcc = to->lru_to_jcc; + if (jcc && (jcc->from == from) && (jcc->jmp == jmp)) { + CLG_ASSERT(to == jcc->to); + CLG_DEBUG(5,"- get_jcc: [LRU to] jcc %p\n", jcc); + return jcc; + } + + jcc = from->lru_from_jcc; + if (jcc && (jcc->to == to) && (jcc->jmp == jmp)) { + CLG_ASSERT(from == jcc->from); + CLG_DEBUG(5, "- get_jcc: [LRU from] jcc %p\n", jcc); + return jcc; + } + + CLG_(stat).jcc_lru_misses++; + + idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc = current_jccs.table[idx]; + + while(jcc) { + if ((jcc->from == from) && + (jcc->jmp == jmp) && + (jcc->to == to)) break; + jcc = jcc->next_hash; + } + + if (!jcc) + jcc = new_jcc(from, jmp, to); + + /* set LRU */ + from->lru_from_jcc = jcc; + to->lru_to_jcc = jcc; + + CLG_DEBUG(5, "- get_jcc(bbcc %p => bbcc %p)\n", + from, to); + + return jcc; +} + diff --git a/callgrind/main.c b/callgrind/main.c new file mode 100644 index 0000000000..dd19b3b338 --- /dev/null +++ b/callgrind/main.c @@ -0,0 +1,1086 @@ + +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- main.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Callgrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2002-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This skin is derived from and contains code from Cachegrind + Copyright (C) 2002-2005 Nicholas Nethercote (njn25@cam.ac.uk) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "config.h" +#include "callgrind.h" +#include "global.h" + +#include + +/*------------------------------------------------------------*/ +/*--- Global variables ---*/ +/*------------------------------------------------------------*/ + +/* for all threads */ +CommandLineOptions CLG_(clo); +Statistics CLG_(stat); +Bool CLG_(instrument_state) = True; /* Instrumentation on ? */ + +/* thread and signal handler specific */ +exec_state CLG_(current_state); + + +/*------------------------------------------------------------*/ +/*--- Statistics ---*/ +/*------------------------------------------------------------*/ + +static void CLG_(init_statistics)(Statistics* s) +{ + s->call_counter = 0; + s->jcnd_counter = 0; + s->jump_counter = 0; + s->rec_call_counter = 0; + s->ret_counter = 0; + s->bb_executions = 0; + + s->context_counter = 0; + s->bb_retranslations = 0; + + s->distinct_objs = 0; + s->distinct_files = 0; + s->distinct_fns = 0; + s->distinct_contexts = 0; + s->distinct_bbs = 0; + s->distinct_bbccs = 0; + s->distinct_instrs = 0; + s->distinct_skips = 0; + + s->bb_hash_resizes = 0; + s->bbcc_hash_resizes = 0; + s->jcc_hash_resizes = 0; + s->cxt_hash_resizes = 0; + s->fn_array_resizes = 0; + s->call_stack_resizes = 0; + s->fn_stack_resizes = 0; + + s->full_debug_BBs = 0; + s->file_line_debug_BBs = 0; + s->fn_name_debug_BBs = 0; + s->no_debug_BBs = 0; + s->bbcc_lru_misses = 0; + s->jcc_lru_misses = 0; + s->cxt_lru_misses = 0; + s->bbcc_clones = 0; +} + + + + +/*------------------------------------------------------------*/ +/*--- Cache simulation instrumentation phase ---*/ +/*------------------------------------------------------------*/ + + +static Bool loadStoreAddrsMatch(IRExpr* loadAddrExpr, IRExpr* storeAddrExpr) +{ + // I'm assuming that for 'modify' instructions, that Vex always makes + // the loadAddrExpr and storeAddrExpr be of the same type, ie. both Tmp + // expressions, or both Const expressions. 
+ CLG_ASSERT(isIRAtom(loadAddrExpr)); + CLG_ASSERT(isIRAtom(storeAddrExpr)); + return eqIRAtom(loadAddrExpr, storeAddrExpr); +} + +static +EventSet* insert_simcall(IRBB* bbOut, InstrInfo* ii, UInt dataSize, + Bool instrIssued, + IRExpr* loadAddrExpr, IRExpr* storeAddrExpr) +{ + HChar* helperName; + void* helperAddr; + Int argc; + EventSet* es; + IRExpr *arg1, *arg2 = 0, *arg3 = 0, **argv; + IRDirty* di; + + /* Check type of original instruction regarding memory access, + * and collect info to be able to generate fitting helper call + */ + if (!loadAddrExpr && !storeAddrExpr) { + // no load/store + CLG_ASSERT(0 == dataSize); + if (instrIssued) { + helperName = 0; + helperAddr = 0; + } + else { + helperName = CLG_(cachesim).log_1I0D_name; + helperAddr = CLG_(cachesim).log_1I0D; + } + argc = 1; + es = CLG_(sets).D0; + + } else if (loadAddrExpr && !storeAddrExpr) { + // load + CLG_ASSERT( isIRAtom(loadAddrExpr) ); + if (instrIssued) { + helperName = CLG_(cachesim).log_0I1Dr_name; + helperAddr = CLG_(cachesim).log_0I1Dr; + } + else { + helperName = CLG_(cachesim).log_1I1Dr_name; + helperAddr = CLG_(cachesim).log_1I1Dr; + } + argc = 2; + arg2 = loadAddrExpr; + es = CLG_(sets).D1r; + + } else if (!loadAddrExpr && storeAddrExpr) { + // store + CLG_ASSERT( isIRAtom(storeAddrExpr) ); + if (instrIssued) { + helperName = CLG_(cachesim).log_0I1Dw_name; + helperAddr = CLG_(cachesim).log_0I1Dw; + } + else { + helperName = CLG_(cachesim).log_1I1Dw_name; + helperAddr = CLG_(cachesim).log_1I1Dw; + } + argc = 2; + arg2 = storeAddrExpr; + es = CLG_(sets).D1w; + + } else { + CLG_ASSERT( loadAddrExpr && storeAddrExpr ); + CLG_ASSERT( isIRAtom(loadAddrExpr) ); + CLG_ASSERT( isIRAtom(storeAddrExpr) ); + + if ( loadStoreAddrsMatch(loadAddrExpr, storeAddrExpr) ) { + /* modify: suppose write access, as this is + * more resource consuming (as in callgrind for VG2) + * Cachegrind does a read here (!) + * DISCUSS: Best way depends on simulation model? + */ + if (instrIssued) { + helperName = CLG_(cachesim).log_0I1Dw_name; + helperAddr = CLG_(cachesim).log_0I1Dw; + } + else { + helperName = CLG_(cachesim).log_1I1Dw_name; + helperAddr = CLG_(cachesim).log_1I1Dw; + } + argc = 2; + arg2 = storeAddrExpr; + es = CLG_(sets).D1w; + + } else { + // load/store + if (instrIssued) { + helperName = CLG_(cachesim).log_0I2D_name; + helperAddr = CLG_(cachesim).log_0I2D; + } + else { + helperName = CLG_(cachesim).log_1I2D_name; + helperAddr = CLG_(cachesim).log_1I2D; + } + argc = 3; + arg2 = loadAddrExpr; + arg3 = storeAddrExpr; + es = CLG_(sets).D2; + } + } + + /* helper could be unset depending on the simulator used */ + if (helperAddr == 0) return 0; + + /* Setup 1st arg: InstrInfo */ + arg1 = mkIRExpr_HWord( (HWord)ii ); + + // Add call to the instrumentation function + if (argc == 1) + argv = mkIRExprVec_1(arg1); + else if (argc == 2) + argv = mkIRExprVec_2(arg1, arg2); + else if (argc == 3) + argv = mkIRExprVec_3(arg1, arg2, arg3); + else + VG_(tool_panic)("argc... not 1 or 2 or 3?"); + + di = unsafeIRDirty_0_N( argc, helperName, helperAddr, argv); + addStmtToIRBB( bbOut, IRStmt_Dirty(di) ); + + return es; +} + + +/* Instrumentation before a conditional jump or at the end + * of each original instruction. + * Fills the InstrInfo struct if not seen before + */ +static +void endOfInstr(IRBB* bbOut, InstrInfo* ii, Bool bb_seen_before, + UInt instr_offset, UInt instrLen, UInt dataSize, + UInt* cost_offset, Bool instrIssued, + IRExpr* loadAddrExpr, IRExpr* storeAddrExpr) +{ + IRType wordTy; + EventSet* es; + + // Stay sane ... 
+   CLG_ASSERT(sizeof(HWord) == sizeof(void*));
+   if (sizeof(HWord) == 4) {
+      wordTy = Ity_I32;
+   } else
+   if (sizeof(HWord) == 8) {
+      wordTy = Ity_I64;
+   } else {
+      VG_(tool_panic)("endOfInstr: strange word size");
+   }
+
+   if (loadAddrExpr)
+      CLG_ASSERT(wordTy == typeOfIRExpr(bbOut->tyenv, loadAddrExpr));
+   if (storeAddrExpr)
+      CLG_ASSERT(wordTy == typeOfIRExpr(bbOut->tyenv, storeAddrExpr));
+
+   // Large (eg. 28B, 108B, 512B on x86) data-sized instructions will be
+   // done inaccurately, but they're very rare and this avoids errors from
+   // hitting more than two cache lines in the simulation.
+   if (dataSize > MIN_LINE_SIZE) dataSize = MIN_LINE_SIZE;
+
+   /* returns 0 if simulator needs no instrumentation */
+   es = insert_simcall(bbOut, ii, dataSize, instrIssued,
+		       loadAddrExpr, storeAddrExpr);
+
+   if (bb_seen_before) {
+      CLG_ASSERT(ii->instr_offset == instr_offset);
+      CLG_ASSERT(ii->instr_size == instrLen);
+      CLG_ASSERT(ii->data_size == dataSize);
+      CLG_ASSERT(ii->cost_offset == *cost_offset);
+      CLG_ASSERT(ii->eventset == es);
+   }
+   else {
+      ii->instr_offset = instr_offset;
+      ii->instr_size = instrLen;
+      ii->data_size = dataSize;
+      ii->cost_offset = *cost_offset;
+      ii->eventset = es;
+
+      CLG_(stat).distinct_instrs++;
+   }
+
+   *cost_offset += es ? es->size : 0;
+
+   CLG_DEBUG(5, "  Instr +%2d (Size %d, DSize %d): ESet %s (Size %d)\n",
+	     instr_offset, instrLen, dataSize,
+	     es ? es->name : (Char*)"(no Instr)",
+	     es ? es->size : 0);
+}
+
+#if defined(VG_BIGENDIAN)
+# define CLGEndness Iend_BE
+#elif defined(VG_LITTLEENDIAN)
+# define CLGEndness Iend_LE
+#else
+# error "Unknown endianness"
+#endif
+
+static
+Addr IRConst2Addr(IRConst* con)
+{
+   Addr addr;
+
+   if (sizeof(Addr) == 4) {
+      CLG_ASSERT( con->tag == Ico_U32 );
+      addr = con->Ico.U32;
+   }
+   else if (sizeof(Addr) == 8) {
+      CLG_ASSERT( con->tag == Ico_U64 );
+      addr = con->Ico.U64;
+   }
+   else
+      VG_(tool_panic)("Callgrind: invalid Addr type");
+
+   return addr;
+}
+
+/* First pass over a BB to instrument, counting instructions and jumps
+ * This is needed for the size of the BB struct to allocate
+ *
+ * Called from CLG_(get_bb)
+ */
+void CLG_(collectBlockInfo)(IRBB* bbIn,
+			    /*INOUT*/ UInt* instrs,
+			    /*INOUT*/ UInt* cjmps,
+			    /*INOUT*/ Bool* cjmp_inverted)
+{
+   Int i;
+   IRStmt* st;
+   Addr instrAddr = 0, jumpDst;
+   UInt instrLen = 0;
+   Bool toNextInstr = False;
+
+   // Ist_Exit has to be ignored in preamble code, before first IMark:
+   // preamble code is added by VEX for self modifying code, and has
+   // nothing to do with client code
+   Bool inPreamble = True;
+
+   if (!bbIn) return;
+
+   for (i = 0; i < bbIn->stmts_used; i++) {
+      st = bbIn->stmts[i];
+      if (Ist_IMark == st->tag) {
+	  inPreamble = False;
+
+	  instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
+	  instrLen  = st->Ist.IMark.len;
+
+	  (*instrs)++;
+	  toNextInstr = False;
+      }
+      if (inPreamble) continue;
+      if (Ist_Exit == st->tag) {
+	  jumpDst = IRConst2Addr(st->Ist.Exit.dst);
+	  toNextInstr = (jumpDst == instrAddr + instrLen);
+
+	  (*cjmps)++;
+      }
+   }
+
+   /* if the last instruction of the BB conditionally jumps to the next
+    * instruction (= first instruction of next BB in memory), this jump
+    * is inverted by VEX.
+    */
+   *cjmp_inverted = toNextInstr;
+}
+
+static
+void collectStatementInfo(IRTypeEnv* tyenv, IRBB* bbOut, IRStmt* st,
+			  Addr* instrAddr, UInt* instrLen,
+			  IRExpr** loadAddrExpr, IRExpr** storeAddrExpr,
+			  UInt* dataSize, IRType hWordTy)
+{
+   CLG_ASSERT(isFlatIRStmt(st));
+
+   switch (st->tag) {
+   case Ist_NoOp:
+      break;
+
+   case Ist_AbiHint:
+      /* ABI hints aren't interesting.  Ignore.
*/ + break; + + case Ist_IMark: + /* st->Ist.IMark.addr is a 64-bit int. ULong_to_Ptr casts this + to the host's native pointer type; if that is 32 bits then it + discards the upper 32 bits. If we are cachegrinding on a + 32-bit host then we are also ensured that the guest word size + is 32 bits, due to the assertion in cg_instrument that the + host and guest word sizes must be the same. Hence + st->Ist.IMark.addr will have been derived from a 32-bit guest + code address and truncation of it is safe. I believe this + assignment should be correct for both 32- and 64-bit + machines. */ + *instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr); + *instrLen = st->Ist.IMark.len; + break; + + case Ist_Tmp: { + IRExpr* data = st->Ist.Tmp.data; + if (data->tag == Iex_Load) { + IRExpr* aexpr = data->Iex.Load.addr; + CLG_ASSERT( isIRAtom(aexpr) ); + // Note also, endianness info is ignored. I guess that's not + // interesting. + // XXX: repe cmpsb does two loads... the first one is ignored here! + //tl_assert( NULL == *loadAddrExpr ); // XXX: ??? + *loadAddrExpr = aexpr; + *dataSize = sizeofIRType(data->Iex.Load.ty); + } + break; + } + + case Ist_Store: { + IRExpr* data = st->Ist.Store.data; + IRExpr* aexpr = st->Ist.Store.addr; + CLG_ASSERT( isIRAtom(aexpr) ); + if ( NULL == *storeAddrExpr ) { + /* this is a kludge: ignore all except the first store from + an instruction. */ + *storeAddrExpr = aexpr; + *dataSize = sizeofIRType(typeOfIRExpr(tyenv, data)); + } + break; + } + + case Ist_Dirty: { + IRDirty* d = st->Ist.Dirty.details; + if (d->mFx != Ifx_None) { + /* This dirty helper accesses memory. Collect the + details. */ + CLG_ASSERT(d->mAddr != NULL); + CLG_ASSERT(d->mSize != 0); + *dataSize = d->mSize; + if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) + *loadAddrExpr = d->mAddr; + if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) + *storeAddrExpr = d->mAddr; + } else { + CLG_ASSERT(d->mAddr == NULL); + CLG_ASSERT(d->mSize == 0); + } + break; + } + + case Ist_Put: + case Ist_PutI: + case Ist_MFence: + case Ist_Exit: + break; + + default: + VG_(printf)("\n"); + ppIRStmt(st); + VG_(printf)("\n"); + VG_(tool_panic)("Callgrind: unhandled IRStmt"); + } +} + +static +void addConstMemStoreStmt( IRBB* bbOut, UWord addr, UInt val, IRType hWordTy) +{ + addStmtToIRBB( bbOut, + IRStmt_Store(CLGEndness, + IRExpr_Const(hWordTy == Ity_I32 ? + IRConst_U32( addr ) : + IRConst_U64( addr )), + IRExpr_Const(IRConst_U32(val)) )); +} + +static +IRBB* CLG_(instrument)( VgCallbackClosure* closure, + IRBB* bbIn, + VexGuestLayout* layout, + VexGuestExtents* vge, + IRType gWordTy, IRType hWordTy ) +{ + Int i; + IRBB* bbOut; + IRStmt* st, *stnext; + Addr instrAddr, origAddr; + UInt instrLen = 0, dataSize; + UInt instrCount, costOffset; + IRExpr *loadAddrExpr, *storeAddrExpr; + + BB* bb; + + IRDirty* di; + IRExpr *arg1, **argv; + + Bool bb_seen_before = False; + UInt cJumps = 0, cJumpsCorrected; + Bool beforeIBoundary, instrIssued; + + if (gWordTy != hWordTy) { + /* We don't currently support this case. */ + VG_(tool_panic)("host/guest word size mismatch"); + } + + // No instrumentation if it is switched off + if (! 
CLG_(instrument_state)) {
+       CLG_DEBUG(5, "instrument(BB %p) [Instrumentation OFF]\n",
+		 (Addr)closure->readdr);
+       return bbIn;
+   }
+
+   CLG_DEBUG(3, "+ instrument(BB %p)\n", (Addr)closure->readdr);
+
+   /* Set up BB for instrumented IR */
+   bbOut = emptyIRBB();
+   bbOut->tyenv = dopyIRTypeEnv(bbIn->tyenv);
+   bbOut->next = dopyIRExpr(bbIn->next);
+   bbOut->jumpkind = bbIn->jumpkind;
+
+   // Copy verbatim any IR preamble preceding the first IMark
+   i = 0;
+   while (i < bbIn->stmts_used && bbIn->stmts[i]->tag != Ist_IMark) {
+      addStmtToIRBB( bbOut, bbIn->stmts[i] );
+      i++;
+   }
+
+   // Get the first statement, and origAddr from it
+   CLG_ASSERT(bbIn->stmts_used > 0);
+   st = bbIn->stmts[i];
+   CLG_ASSERT(Ist_IMark == st->tag);
+   instrAddr = origAddr = (Addr)st->Ist.IMark.addr;
+   CLG_ASSERT(origAddr == st->Ist.IMark.addr);  // XXX: check no overflow
+
+   /* Get BB (creating if necessary).
+    * JS: The hash table is keyed with orig_addr_noredir -- important!
+    * JW: Why? If it is because of different chasing of the redirection,
+    *     this is not needed, as chasing is switched off in callgrind
+    */
+   bb = CLG_(get_bb)(origAddr, bbIn, &bb_seen_before);
+   //bb = CLG_(get_bb)(orig_addr_noredir, bbIn, &bb_seen_before);
+
+   /*
+    * Precondition:
+    * - jmps_passed has number of cond. jumps passed in last executed BB
+    * - current_bbcc has a pointer to the BBCC of the last executed BB
+    *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
+    *     current_bbcc->bb->jmp_addr
+    *   gives the address of the jump source.
+    *
+    * The BBCC setup does 2 things:
+    * - trace call:
+    *   * Unwind own call stack, i.e. sync our ESP with the real ESP
+    *     This is for ESP manipulation (longjmps, C++ exception handling)
+    *     and RET
+    *   * For CALLs or JMPs crossing objects, record the call arg and
+    *     push the call arc on our own call stack
+    *
+    * - prepare for cache log functions:
+    *   Set current_bbcc to the BBCC that gets the costs for this BB
+    *   execution attached
+    */
+
+   // helper call to setup_bbcc, with pointer to basic block info struct as argument
+   arg1 = mkIRExpr_HWord( (HWord)bb );
+   argv = mkIRExprVec_1(arg1);
+   di = unsafeIRDirty_0_N( 1, "setup_bbcc", & CLG_(setup_bbcc), argv);
+   addStmtToIRBB( bbOut, IRStmt_Dirty(di) );
+
+   instrCount = 0;
+   costOffset = 0;
+
+   // loop for each host instruction (starting from 'i')
+   do {
+
+      // We should be at an IMark statement
+      CLG_ASSERT(Ist_IMark == st->tag);
+
+      // Reset stuff for this original instruction
+      loadAddrExpr = storeAddrExpr = NULL;
+      instrIssued = False;
+      dataSize = 0;
+
+      // Process all the statements for this original instruction (ie. until
+      // the next IMark statement, or the end of the block)
+      do {
+	  i++;
+	  stnext = ( i < bbIn->stmts_used ? bbIn->stmts[i] : NULL );
+	  beforeIBoundary = !stnext || (Ist_IMark == stnext->tag);
+	  collectStatementInfo(bbIn->tyenv, bbOut, st, &instrAddr, &instrLen,
+			       &loadAddrExpr, &storeAddrExpr, &dataSize, hWordTy);
+
+	  // instrument a simulator call before conditional jumps
+	  if (st->tag == Ist_Exit) {
+	      // Nb: instrLen will be zero if Vex failed to decode it.
+	      // Also Client requests can appear to be very large (eg. 18
+	      // bytes on x86) because they are really multiple instructions.
+	      CLG_ASSERT( 0 == instrLen ||
+			  bbIn->jumpkind == Ijk_ClientReq ||
+			  (instrLen >= VG_MIN_INSTR_SZB &&
+			   instrLen <= VG_MAX_INSTR_SZB) );
+
+	      // Add instrumentation before this statement
+	      endOfInstr(bbOut, &(bb->instr[instrCount]), bb_seen_before,
+			 instrAddr - origAddr, instrLen, dataSize, &costOffset,
+			 instrIssued, loadAddrExpr, storeAddrExpr);
+
+	      // prepare for a possible further simcall in same host instr
+	      loadAddrExpr = storeAddrExpr = NULL;
+	      instrIssued = True;
+
+	      if (!bb_seen_before) {
+		  bb->jmp[cJumps].instr = instrCount;
+		  bb->jmp[cJumps].skip = False;
+	      }
+
+	      /* Update global variable jmps_passed (this is before the jump!)
+	       * A correction is needed if VEX inverted the last jump condition
+	       */
+	      cJumpsCorrected = cJumps;
+	      if ((cJumps+1 == bb->cjmp_count) && bb->cjmp_inverted) cJumpsCorrected++;
+	      addConstMemStoreStmt( bbOut, (UWord) &CLG_(current_state).jmps_passed,
+				    cJumpsCorrected, hWordTy);
+
+	      cJumps++;
+	  }
+
+	  addStmtToIRBB( bbOut, st );
+	  st = stnext;
+      }
+      while (!beforeIBoundary);
+
+      // Add instrumentation for this original instruction.
+      if (!instrIssued || (loadAddrExpr != 0) || (storeAddrExpr != 0))
+	  endOfInstr(bbOut, &(bb->instr[instrCount]), bb_seen_before,
+		     instrAddr - origAddr, instrLen, dataSize, &costOffset,
+		     instrIssued, loadAddrExpr, storeAddrExpr);
+
+      instrCount++;
+   }
+   while (st);
+
+   /* Always update global variable jmps_passed (at end of BB)
+    * A correction is needed if VEX inverted the last jump condition
+    */
+   cJumpsCorrected = cJumps;
+   if (bb->cjmp_inverted) cJumpsCorrected--;
+   addConstMemStoreStmt( bbOut, (UWord) &CLG_(current_state).jmps_passed,
+			 cJumpsCorrected, hWordTy);
+
+   /* This stores the instr of the call/ret at BB end */
+   bb->jmp[cJumps].instr = instrCount-1;
+
+   CLG_ASSERT(bb->cjmp_count == cJumps);
+   CLG_ASSERT(bb->instr_count == instrCount);
+
+   instrAddr += instrLen;
+   if (bb_seen_before) {
+      CLG_ASSERT(bb->instr_len == instrAddr - origAddr);
+      CLG_ASSERT(bb->cost_count == costOffset);
+      CLG_ASSERT(bb->jmpkind == bbIn->jumpkind);
+   }
+   else {
+      bb->instr_len = instrAddr - origAddr;
+      bb->cost_count = costOffset;
+      bb->jmpkind = bbIn->jumpkind;
+   }
+
+   CLG_DEBUG(3, "- instrument(BB %p): byteLen %u, CJumps %u, CostLen %u\n",
+	     origAddr, bb->instr_len, bb->cjmp_count, bb->cost_count);
+   if (cJumps>0) {
+       CLG_DEBUG(3, "                     [ ");
+       for (i=0; i<cJumps; i++)
+	   CLG_DEBUG(3, " %d", bb->jmp[i].instr);
+       CLG_DEBUG(3, "], last inverted: %s \n",
+		 bb->cjmp_inverted ? "yes":"no");
+   }
+
+  return bbOut;
+}
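A worked example of the inversion correction above, with hypothetical numbers:
take a BB with cjmp_count == 3 whose last conditional jump had its condition
inverted by VEX (bb->cjmp_inverted == True). The stores into jmps_passed then
write:

    /* store before exit 0:  cJumps == 0            ->  0
     * store before exit 1:  cJumps == 1            ->  1
     * store before exit 2:  cJumps == 2, last+inv  ->  corrected to 3
     *   (taking the inverted exit means the original jump fell through,
     *    i.e. all three cond. jumps were passed)
     * store at BB end:      cJumps == 3, inverted  ->  corrected to 2
     *   (the instrumented fall-through means the original jump 2 was
     *    actually taken) */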
+
+/*--------------------------------------------------------------------*/
+/*--- Discarding BB info                                           ---*/
+/*--------------------------------------------------------------------*/
+
+// Called when a translation is removed from the translation cache for
+// any reason at all: to free up space, because the guest code was
+// unmapped or modified, or for any arbitrary reason.
+static
+void clg_discard_basic_block_info ( Addr64 orig_addr64, VexGuestExtents vge )
+{
+    Addr orig_addr = (Addr)orig_addr64;
+
+    tl_assert(vge.n_used > 0);
+
+   if (0)
+      VG_(printf)( "discard_basic_block_info: %p, %p, %llu\n",
+                   (void*)(Addr)orig_addr,
+                   (void*)(Addr)vge.base[0], (ULong)vge.len[0]);
+
+   // Get BB info, remove from table, free BB info.  Simple!  Note that we
+   // use orig_addr, not the first instruction address in vge.
+ CLG_(delete_bb)(orig_addr); +} + + +/*------------------------------------------------------------*/ +/*--- CLG_(fini)() and related function ---*/ +/*------------------------------------------------------------*/ + + + +static void zero_thread_cost(thread_info* t) +{ + Int i; + + for(i = 0; i < CLG_(current_call_stack).sp; i++) { + if (!CLG_(current_call_stack).entry[i].jcc) continue; + + /* reset call counters to current for active calls */ + CLG_(copy_cost)( CLG_(sets).full, + CLG_(current_call_stack).entry[i].enter_cost, + CLG_(current_state).cost ); + } + + CLG_(forall_bbccs)(CLG_(zero_bbcc)); + + /* set counter for last dump */ + CLG_(copy_cost)( CLG_(sets).full, + t->lastdump_cost, CLG_(current_state).cost ); +} + +void CLG_(zero_all_cost)(Bool only_current_thread) +{ + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, " Zeroing costs..."); + + if (only_current_thread) + zero_thread_cost(CLG_(get_current_thread)()); + else + CLG_(forall_threads)(zero_thread_cost); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, " ...done"); +} + +static +void unwind_thread(thread_info* t) +{ + /* unwind signal handlers */ + while(CLG_(current_state).sig !=0) + CLG_(post_signal)(CLG_(current_tid),CLG_(current_state).sig); + + /* unwind regular call stack */ + while(CLG_(current_call_stack).sp>0) + CLG_(pop_call_stack)(); +} + +/* Ups, this can go wrong... */ +extern void VG_(discard_translations) ( Addr64 start, ULong range ); + +void CLG_(set_instrument_state)(Char* reason, Bool state) +{ + if (CLG_(instrument_state) == state) { + CLG_DEBUG(2, "%s: instrumentation already %s\n", + reason, state ? "ON" : "OFF"); + return; + } + CLG_(instrument_state) = state; + CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n", + reason, state ? "ON" : "OFF"); + + VG_(discard_translations)( (Addr64)0x1000, (ULong) ~0xfffl); + + /* reset internal state: call stacks, simulator */ + CLG_(forall_threads)(unwind_thread); + (*CLG_(cachesim).clear)(); + if (0) + CLG_(forall_threads)(zero_thread_cost); + + if (!state) + CLG_(init_exec_state)( &CLG_(current_state) ); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n", + reason, state ? "ON" : "OFF"); +} + + +static +Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret) +{ + if (!VG_IS_TOOL_USERREQ('C','T',args[0])) + return False; + + switch(args[0]) { + case VG_USERREQ__DUMP_STATS: + CLG_(dump_profile)("Client Request", True); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__DUMP_STATS_AT: + { + Char buf[512]; + VG_(sprintf)(buf,"Client Request: %d", args[1]); + CLG_(dump_profile)(buf, True); + *ret = 0; /* meaningless */ + } + break; + + case VG_USERREQ__ZERO_STATS: + CLG_(zero_all_cost)(True); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__TOGGLE_COLLECT: + CLG_(current_state).collect = !CLG_(current_state).collect; + CLG_DEBUG(2, "Client Request: toggled collection state to %s\n", + CLG_(current_state).collect ? "ON" : "OFF"); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__START_INSTRUMENTATION: + CLG_(set_instrument_state)("Client Request", True); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__STOP_INSTRUMENTATION: + CLG_(set_instrument_state)("Client Request", False); + *ret = 0; /* meaningless */ + break; + + default: + return False; + } + + return True; +} + + +/* Syscall Timing */ + +/* struct timeval syscalltime[VG_N_THREADS]; */ +#if CLG_MICROSYSTIME +#include +#include +extern Int VG_(do_syscall) ( UInt, ... 
);
+
+ULong syscalltime[VG_N_THREADS];
+#else
+UInt syscalltime[VG_N_THREADS];
+#endif
+
+static
+void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno)
+{
+  if (CLG_(clo).collect_systime) {
+#if CLG_MICROSYSTIME
+    struct vki_timeval tv_now;
+    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
+    syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
+#else
+    syscalltime[tid] = VG_(read_millisecond_timer)();
+#endif
+  }
+}
+
+static
+void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno, SysRes res)
+{
+  if (CLG_(clo).collect_systime) {
+    Int o = CLG_(sets).off_full_systime;
+#if CLG_MICROSYSTIME
+    struct vki_timeval tv_now;
+    ULong diff;
+
+    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
+    diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
+#else
+    UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
+#endif
+
+    CLG_DEBUG(0,"   Time (Off %d) for Syscall %d: %llu\n",
+	      o, syscallno, (ULong)diff);
+
+    if (o<0) return;
+
+    CLG_(current_state).cost[o] ++;
+    CLG_(current_state).cost[o+1] += diff;
+    if (!CLG_(current_state).bbcc->skipped)
+      CLG_(init_cost_lz)(CLG_(sets).full,
+			 &(CLG_(current_state).bbcc->skipped));
+    CLG_(current_state).bbcc->skipped[o] ++;
+    CLG_(current_state).bbcc->skipped[o+1] += diff;
+  }
+}
+
+static
+void finish(void)
+{
+  char buf[RESULTS_BUF_LEN];
+
+  CLG_DEBUG(0, "finish()\n");
+
+  (*CLG_(cachesim).finish)();
+
+  /* pop all remaining items from CallStack for correct sum
+   */
+  CLG_(forall_threads)(unwind_thread);
+
+  CLG_(dump_profile)(0, False);
+
+  CLG_(finish_command)();
+
+  if (VG_(clo_verbosity) == 0) return;
+
+  /* Hash table stats */
+  if (VG_(clo_verbosity) > 1) {
+    int BB_lookups =
+      CLG_(stat).full_debug_BBs +
+      CLG_(stat).fn_name_debug_BBs +
+      CLG_(stat).file_line_debug_BBs +
+      CLG_(stat).no_debug_BBs;
+
+    VG_(message)(Vg_DebugMsg, "");
+    VG_(message)(Vg_DebugMsg, "Distinct objects: %d",
+		 CLG_(stat).distinct_objs);
+    VG_(message)(Vg_DebugMsg, "Distinct files:   %d",
+		 CLG_(stat).distinct_files);
+    VG_(message)(Vg_DebugMsg, "Distinct fns:     %d",
+		 CLG_(stat).distinct_fns);
+    VG_(message)(Vg_DebugMsg, "Distinct contexts:%d",
+		 CLG_(stat).distinct_contexts);
+    VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d",
+		 CLG_(stat).distinct_bbs);
+    VG_(message)(Vg_DebugMsg, "Cost entries:     %d (Chunks %d)",
+		 CLG_(costarray_entries), CLG_(costarray_chunks));
+    VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d",
+		 CLG_(stat).distinct_bbccs);
+    VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d",
+		 CLG_(stat).distinct_jccs);
+    VG_(message)(Vg_DebugMsg, "Distinct skips:   %d",
+		 CLG_(stat).distinct_skips);
+    VG_(message)(Vg_DebugMsg, "BB lookups:       %d",
+		 BB_lookups);
+    if (BB_lookups>0) {
+      VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)",
+		   CLG_(stat).full_debug_BBs * 100 / BB_lookups,
+		   CLG_(stat).full_debug_BBs);
+      VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)",
+		   CLG_(stat).file_line_debug_BBs * 100 / BB_lookups,
+		   CLG_(stat).file_line_debug_BBs);
+      VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)",
+		   CLG_(stat).fn_name_debug_BBs * 100 / BB_lookups,
+		   CLG_(stat).fn_name_debug_BBs);
+      VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)",
+		   CLG_(stat).no_debug_BBs * 100 / BB_lookups,
+		   CLG_(stat).no_debug_BBs);
+    }
+    VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d",
+		 CLG_(stat).bbcc_clones);
+    VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d",
+		 CLG_(stat).bb_retranslations);
+    VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d",
+		 CLG_(stat).distinct_instrs);
+    VG_(message)(Vg_DebugMsg, "");
+
+ VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d", + CLG_(stat).cxt_lru_misses); + VG_(message)(Vg_DebugMsg, "LRU BBCC Misses: %d", + CLG_(stat).bbcc_lru_misses); + VG_(message)(Vg_DebugMsg, "LRU JCC Misses: %d", + CLG_(stat).jcc_lru_misses); + VG_(message)(Vg_DebugMsg, "BBs Executed: %llu", + CLG_(stat).bb_executions); + VG_(message)(Vg_DebugMsg, "Calls: %llu", + CLG_(stat).call_counter); + VG_(message)(Vg_DebugMsg, "CondJMP followed: %llu", + CLG_(stat).jcnd_counter); + VG_(message)(Vg_DebugMsg, "Boring JMPs: %llu", + CLG_(stat).jump_counter); + VG_(message)(Vg_DebugMsg, "Recursive calls: %llu", + CLG_(stat).rec_call_counter); + VG_(message)(Vg_DebugMsg, "Returns: %llu", + CLG_(stat).ret_counter); + + VG_(message)(Vg_DebugMsg, ""); + } + + CLG_(sprint_eventmapping)(buf, CLG_(dumpmap)); + VG_(message)(Vg_UserMsg, "Events : %s", buf); + CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), CLG_(total_cost)); + VG_(message)(Vg_UserMsg, "Collected : %s", buf); + VG_(message)(Vg_UserMsg, ""); + + // if (CLG_(clo).simulate_cache) + (*CLG_(cachesim).printstat)(); +} + + +void CLG_(fini)(Int exitcode) +{ + finish(); +} + + +/*--------------------------------------------------------------------*/ +/*--- Setup ---*/ +/*--------------------------------------------------------------------*/ + +static +void CLG_(post_clo_init)(void) +{ + Char *dir = 0, *fname = 0; + + VG_(clo_vex_control).iropt_unroll_thresh = 0; + VG_(clo_vex_control).guest_chase_thresh = 0; + + CLG_DEBUG(1, " dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No"); + CLG_DEBUG(1, " call sep. : %d\n", CLG_(clo).separate_callers); + CLG_DEBUG(1, " rec. sep. : %d\n", CLG_(clo).separate_recursions); + + if (!CLG_(clo).dump_line && !CLG_(clo).dump_instr && !CLG_(clo).dump_bb) { + VG_(message)(Vg_UserMsg, "Using source line as position."); + CLG_(clo).dump_line = True; + } + + CLG_(init_files)(&dir,&fname); + CLG_(init_command)(dir,fname); + + (*CLG_(cachesim).post_clo_init)(); + + CLG_(init_eventsets)(0); + CLG_(init_statistics)(& CLG_(stat)); + CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) ); + + /* initialize hash tables */ + CLG_(init_obj_table)(); + CLG_(init_cxt_table)(); + CLG_(init_bb_hash)(); + + CLG_(init_threads)(); + CLG_(run_thread)(1); + + CLG_(instrument_state) = CLG_(clo).instrument_atstart; + + VG_(message)(Vg_UserMsg, ""); + VG_(message)(Vg_UserMsg, "For interactive control, run 'callgrind_control -h'."); +} + +static +void CLG_(pre_clo_init)(void) +{ + VG_(details_name) ("Callgrind"); + VG_(details_version) (VERSION); + VG_(details_description) ("a call-graph generating cache profiler"); + VG_(details_copyright_author)("Copyright (C) 2002-2006, and GNU GPL'd, " + "by J.Weidendorfer et al."); + VG_(details_bug_reports_to) ("Josef.Weidendorfer@gmx.de"); + VG_(details_avg_translation_sizeB) ( 155 ); + + VG_(basic_tool_funcs) (CLG_(post_clo_init), + CLG_(instrument), + CLG_(fini)); + + VG_(needs_basic_block_discards)(clg_discard_basic_block_info); + + + VG_(needs_command_line_options)(CLG_(process_cmd_line_option), + CLG_(print_usage), + CLG_(print_debug_usage)); + + VG_(needs_client_requests)(CLG_(handle_client_request)); + VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime), + CLG_(post_syscalltime)); + + VG_(track_thread_run) ( & CLG_(run_thread) ); + VG_(track_pre_deliver_signal) ( & CLG_(pre_signal) ); + VG_(track_post_deliver_signal) ( & CLG_(post_signal) ); + + CLG_(set_clo_defaults)(); +} + +VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init)) + 
+/*--------------------------------------------------------------------*/
+/*--- end                                                   main.c ---*/
+/*--------------------------------------------------------------------*/
diff --git a/callgrind/sim.c b/callgrind/sim.c
new file mode 100644
index 0000000000..e61eb6971c
--- /dev/null
+++ b/callgrind/sim.c
@@ -0,0 +1,2162 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Cache simulation.                                            ---*/
+/*---                                                        sim.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Callgrind.
+   (c) 2003-2005, Josef Weidendorfer
+
+   Parts are Copyright (C) 2002 Nicholas Nethercote
+      njn25@cam.ac.uk
+
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "global.h"
+
+
+/* Notes:
+  - simulates a write-allocate cache
+  - (block --> set) hash function uses simple bit selection
+  - handling of references straddling two cache blocks:
+      - counts as only one cache access (not two)
+      - both blocks hit                  --> one hit
+      - one block hits, the other misses --> one miss
+      - both blocks miss                 --> one miss (not two)
+*/
+
+/* Cache configuration */
+#include "cg_arch.h"
+
+/* additional structures for cache use info, separated
+ * according to usage frequency:
+ * - line_loaded : pointer to cost center of the instruction
+ *                 which loaded the line into cache.
+ *                 Needed to increment counters when line is evicted.
+ * - line_use    : updated on every access
+ */
+typedef struct {
+  UInt count;
+  UInt mask; /* e.g. for 64 Byte line size, 1 bit/2 Byte */
+} line_use;
+
+typedef struct {
+  Addr memline, iaddr;
+  line_use* dep_use; /* point to higher-level cacheblock for this memline */
+  ULong* use_base;
+} line_loaded;
+
+/* Cache state */
+typedef struct {
+   char*        name;
+   int          size;      /* bytes */
+   int          assoc;
+   int          line_size; /* bytes */
+   Bool         sectored;  /* prefetch nearside cacheline on read */
+   int          sets;
+   int          sets_min_1;
+   int          assoc_bits;
+   int          line_size_bits;
+   int          tag_shift;
+   UWord        tag_mask;
+   char         desc_line[128];
+   UWord*       tags;
+
+  /* for cache use */
+   int          line_size_mask;
+   int*         line_start_mask;
+   int*         line_end_mask;
+   line_loaded* loaded;
+   line_use*    use;
+} cache_t2;
+
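To make the derived fields concrete, a worked example (numbers only, computed
the same way cachesim_initcache() does below) for a hypothetical 65536 B,
2-way cache with 64 B lines:

    /* sets           = (65536 / 64) / 2   = 512
     * line_size_bits = log2(64)           = 6
     * tag_shift      = 6 + log2(512)      = 15
     * tag_mask       = ~((1 << 15) - 1)     (address bits above set+offset)
     * set index of address a:  (a >> 6) & 511
     */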
+/*
+ * States of flat caches in our model.
+ * We use a 2-level hierarchy.
+ */
+static cache_t2 I1, D1, L2;
+
+/* Lower bits of cache tags are used as flags for a cache line */
+#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
+#define CACHELINE_DIRTY    1
+
+
+/* Cache simulator Options */
+static Bool clo_simulate_writeback = False;
+static Bool clo_simulate_hwpref = False;
+static Bool clo_simulate_sectors = False;
+static Bool clo_collect_cacheuse = False;
+
+/* The following global vars are set up beforehand by
+ * setup_bbcc()/cachesim_after_bbsetup():
+ *
+ * - Addr   bb_base     (instruction start address of original BB)
+ * - ULong* cost_base   (start of cost array for BB)
+ * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
+ */
+
+/* Offset to events in event set, used in log_* functions */
+static Int off_D0_Ir;
+static Int off_D1r_Ir;
+static Int off_D1r_Dr;
+static Int off_D1w_Ir;
+static Int off_D1w_Dw;
+static Int off_D2_Ir;
+static Int off_D2_Dr;
+static Int off_D2_Dw;
+
+static Addr   bb_base;
+static ULong* cost_base;
+static InstrInfo* current_ii;
+
+/* Cache use offsets */
+/* FIXME: The offsets are only correct because all eventsets get
+ * the "Use" set added first!
+ */
+static Int off_I1_AcCost = 0;
+static Int off_I1_SpLoss = 1;
+static Int off_D1_AcCost = 0;
+static Int off_D1_SpLoss = 1;
+static Int off_L2_AcCost = 2;
+static Int off_L2_SpLoss = 3;
+
+/* Cache access types */
+typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
+
+/* Result of a reference into a flat cache */
+typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
+
+/* Result of a reference into a hierarchical cache model */
+typedef enum {
+    L1_Hit,
+    L2_Hit,
+    MemAccess,
+    WriteBackMemAccess } CacheModelResult;
+
+typedef CacheModelResult (*simcall_type)(Addr, UChar);
+
+static struct {
+    simcall_type I1_Read;
+    simcall_type D1_Read;
+    simcall_type D1_Write;
+} simulator;
+
+/*------------------------------------------------------------*/
+/*--- Cache Simulator Initialization                        ---*/
+/*------------------------------------------------------------*/
+
+static void cachesim_clearcache(cache_t2* c)
+{
+  Int i;
+
+  for (i = 0; i < c->sets * c->assoc; i++)
+    c->tags[i] = 0;
+  if (c->use) {
+    for (i = 0; i < c->sets * c->assoc; i++) {
+      c->loaded[i].memline  = 0;
+      c->loaded[i].use_base = 0;
+      c->loaded[i].dep_use  = 0;
+      c->loaded[i].iaddr    = 0;
+      c->use[i].mask  = 0;
+      c->use[i].count = 0;
+      c->tags[i] = i % c->assoc; /* init lower bits as pointer */
+    }
+  }
+}
+
+static void cacheuse_initcache(cache_t2* c);
+
+/* By this point, the size/assoc/line_size has been checked. */
+static void cachesim_initcache(cache_t config, cache_t2* c)
+{
+   c->size      = config.size;
+   c->assoc     = config.assoc;
+   c->line_size = config.line_size;
+   c->sectored  = False; // FIXME
+
+   c->sets           = (c->size / c->line_size) / c->assoc;
+   c->sets_min_1     = c->sets - 1;
+   c->assoc_bits     = VG_(log2)(c->assoc);
+   c->line_size_bits = VG_(log2)(c->line_size);
+   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
+   c->tag_mask       = ~((1<<c->tag_shift)-1);
+
+   /* Can bits in tag entries be used for flags?
+    * Should be always true as MIN_LINE_SIZE >= 16 */
+   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
+
+   if (c->assoc == 1) {
+      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
+		   c->size, c->line_size,
+		   c->sectored ? ", sectored":"");
+   } else {
+      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
+		   c->size, c->line_size, c->assoc,
+		   c->sectored ?
", sectored":""); + } + + c->tags = (UWord*) CLG_MALLOC(sizeof(UWord) * c->sets * c->assoc); + if (clo_collect_cacheuse) + cacheuse_initcache(c); + else + c->use = 0; + cachesim_clearcache(c); +} + + +#if 0 +static void print_cache(cache_t2* c) +{ + UInt set, way, i; + + /* Note initialisation and update of 'i'. */ + for (i = 0, set = 0; set < c->sets; set++) { + for (way = 0; way < c->assoc; way++, i++) { + VG_(printf)("%8x ", c->tags[i]); + } + VG_(printf)("\n"); + } +} +#endif + + +/*------------------------------------------------------------*/ +/*--- Write Through Cache Simulation ---*/ +/*------------------------------------------------------------*/ + +/* + * Simple model: L1 & L2 Write Through + * Does not distinguish among read and write references + * + * Simulator functions: + * CacheModelResult cachesim_I1_ref(Addr a, UChar size) + * CacheModelResult cachesim_D1_ref(Addr a, UChar size) + */ + +static __inline__ +CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag) +{ + int i, j; + UWord *set; + + /* Shifting is a bit faster than multiplying */ + set = &(c->tags[set_no << c->assoc_bits]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == set[0]) + return Hit; + + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == set[i]) { + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + + return Miss; +} + +static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size) +{ + UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1); + UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1); + UWord tag = a >> c->tag_shift; + + /* Access entirely within line. */ + if (set1 == set2) + return cachesim_setref(c, set1, tag); + + /* Access straddles two lines. */ + /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ + else if (((set1 + 1) & (c->sets-1)) == set2) { + + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref(c, set1, tag); + CacheResult res2 = cachesim_setref(c, set2, tag); + return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; + + } else { + VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2); + VG_(tool_panic)("item straddles more than two cache sets"); + } + return Hit; +} + +static +CacheModelResult cachesim_I1_ref(Addr a, UChar size) +{ + if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit; + if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit; + return MemAccess; +} + +static +CacheModelResult cachesim_D1_ref(Addr a, UChar size) +{ + if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit; + if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit; + return MemAccess; +} + + +/*------------------------------------------------------------*/ +/*--- Write Back Cache Simulation ---*/ +/*------------------------------------------------------------*/ + +/* + * More complex model: L1 Write-through, L2 Write-back + * This needs to distinguish among read and write references. 
+ *
+ * Simulator functions:
+ *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
+ *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
+ *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
+ */
+
+/*
+ * With write-back, the result can be a miss that evicts a dirty line.
+ * The dirty state of a cache line is stored in bit 0 of the tag for
+ * this cache line (CACHELINE_DIRTY = 1). By OR'ing in the reference
+ * type (Read/Write), the line gets dirty on a write.
+ */
+static __inline__
+CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
+{
+    int i, j;
+    UWord *set, tmp_tag;
+
+    /* Shifting is a bit faster than multiplying */
+    set = &(c->tags[set_no << c->assoc_bits]);
+
+    /* This loop is unrolled for just the first case, which is the most */
+    /* common.  We can't unroll any further because it would screw up   */
+    /* if we have a direct-mapped (1-way) cache.                        */
+    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
+        set[0] |= ref;
+        return Hit;
+    }
+    /* If the tag is one other than the MRU, move it into the MRU spot */
+    /* and shuffle the rest down.                                      */
+    for (i = 1; i < c->assoc; i++) {
+        if (tag == (set[i] & ~CACHELINE_DIRTY)) {
+            tmp_tag = set[i] | ref; // update dirty flag
+            for (j = i; j > 0; j--) {
+                set[j] = set[j - 1];
+            }
+            set[0] = tmp_tag;
+            return Hit;
+        }
+    }
+
+    /* A miss; install this tag as MRU, shuffle rest down. */
+    tmp_tag = set[c->assoc - 1];
+    for (j = c->assoc - 1; j > 0; j--) {
+        set[j] = set[j - 1];
+    }
+    set[0] = tag | ref;
+
+    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
+}
+
+
+static __inline__
+CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
+{
+    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
+    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
+    UWord tag = a & c->tag_mask;
+
+    /* Access entirely within line. */
+    if (set1 == set2)
+        return cachesim_setref_wb(c, ref, set1, tag);
+
+    /* Access straddles two lines. */
+    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
+    else if (((set1 + 1) & (c->sets-1)) == set2) {
+
+        /* the call updates cache structures as a side effect */
+        CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
+        CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag);
+
+        if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
+        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
+
+    } else {
+        VG_(printf)("addr: %x  size: %u  sets: %d %d", a, size, set1, set2);
+        VG_(tool_panic)("item straddles more than two cache sets");
+    }
+    return Hit;
+}
+
+
+static
+CacheModelResult cachesim_I1_Read(Addr a, UChar size)
+{
+    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
+    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
+        case Hit:  return L2_Hit;
+        case Miss: return MemAccess;
+        default: break;
+    }
+    return WriteBackMemAccess;
+}
+
+static
+CacheModelResult cachesim_D1_Read(Addr a, UChar size)
+{
+    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
+    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
+        case Hit:  return L2_Hit;
+        case Miss: return MemAccess;
+        default: break;
+    }
+    return WriteBackMemAccess;
+}
+
+static
+CacheModelResult cachesim_D1_Write(Addr a, UChar size)
+{
+    if ( cachesim_ref( &D1, a, size) == Hit ) {
+        /* Even on an L1 hit, the write-through L1 passes
+         * the write on to the L2 to make the L2 line dirty.
+         * But this causes no latency, so return the hit.
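+         * (The hit/miss result of that L2 update is deliberately
+         *  ignored: only the dirty bit of the L2 line matters here.)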
+         */
+        cachesim_ref_wb( &L2, Write, a, size);
+        return L1_Hit;
+    }
+    switch( cachesim_ref_wb( &L2, Write, a, size) ) {
+        case Hit:  return L2_Hit;
+        case Miss: return MemAccess;
+        default: break;
+    }
+    return WriteBackMemAccess;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Hardware Prefetch Simulation                         ---*/
+/*------------------------------------------------------------*/
+
+static ULong prefetch_up = 0;
+static ULong prefetch_down = 0;
+
+#define PF_STREAMS  8
+#define PF_PAGEBITS 12
+
+static UInt pf_lastblock[PF_STREAMS];
+static Int  pf_seqblocks[PF_STREAMS];
+
+static
+void prefetch_clear(void)
+{
+  int i;
+  for(i=0;i<PF_STREAMS;i++)
+    pf_lastblock[i] = pf_seqblocks[i] = 0;
+}
+
+/*
+ * HW prefetch emulation: a prefetch is issued after detecting
+ * sequential access to 3 memory blocks; one stream is tracked
+ * per 4k page.
+ */
+static __inline__
+void prefetch_L2_doref(Addr a, UChar size)
+{
+  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
+  UInt block  = ( a >> L2.line_size_bits);
+
+  if (block != pf_lastblock[stream]) {
+    if (pf_seqblocks[stream] == 0) {
+      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
+      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
+    }
+    else if (pf_seqblocks[stream] >0) {
+      if (pf_lastblock[stream] +1 == block) {
+        pf_seqblocks[stream]++;
+        if (pf_seqblocks[stream] >= 2) {
+          prefetch_up++;
+          cachesim_ref(&L2, a + 5 * L2.line_size,1);
+        }
+      }
+      else pf_seqblocks[stream] = 0;
+    }
+    else if (pf_seqblocks[stream] <0) {
+      if (pf_lastblock[stream] -1 == block) {
+        pf_seqblocks[stream]--;
+        if (pf_seqblocks[stream] <= -2) {
+          prefetch_down++;
+          cachesim_ref(&L2, a - 5 * L2.line_size,1);
+        }
+      }
+      else pf_seqblocks[stream] = 0;
+    }
+    pf_lastblock[stream] = block;
+  }
+}
+
+/* simple model with hardware prefetch */
+
+static
+CacheModelResult prefetch_I1_ref(Addr a, UChar size)
+{
+    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
+    prefetch_L2_doref(a,size);
+    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+    return MemAccess;
+}
+
+static
+CacheModelResult prefetch_D1_ref(Addr a, UChar size)
+{
+    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
+    prefetch_L2_doref(a,size);
+    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+    return MemAccess;
+}
+
+
+/* complex model with hardware prefetch */
+
+static
+CacheModelResult prefetch_I1_Read(Addr a, UChar size)
+{
+    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
+    prefetch_L2_doref(a,size);
+    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
+        case Hit:  return L2_Hit;
+        case Miss: return MemAccess;
+        default: break;
+    }
+    return WriteBackMemAccess;
+}
+
+static
+CacheModelResult prefetch_D1_Read(Addr a, UChar size)
+{
+    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
+    prefetch_L2_doref(a,size);
+    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
+        case Hit:  return L2_Hit;
+        case Miss: return MemAccess;
+        default: break;
+    }
+    return WriteBackMemAccess;
+}
+
+static
+CacheModelResult prefetch_D1_Write(Addr a, UChar size)
+{
+    prefetch_L2_doref(a,size);
+    if ( cachesim_ref( &D1, a, size) == Hit ) {
+        /* Even on an L1 hit, the write-through L1 passes
+         * the write on to the L2 to make the L2 line dirty.
+         * But this causes no latency, so return the hit.
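+         * (Note that, unlike the read variants above which consult the
+         *  prefetcher only after a D1 miss, this write path feeds
+         *  prefetch_L2_doref() before the D1 lookup.)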
+         */
+        cachesim_ref_wb( &L2, Write, a, size);
+        return L1_Hit;
+    }
+    switch( cachesim_ref_wb( &L2, Write, a, size) ) {
+        case Hit:  return L2_Hit;
+        case Miss: return MemAccess;
+        default: break;
+    }
+    return WriteBackMemAccess;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Cache Simulation with use metric collection          ---*/
+/*------------------------------------------------------------*/
+
+/* cannot be combined with write-back or prefetch */
+
+static
+void cacheuse_initcache(cache_t2* c)
+{
+    int i;
+    unsigned int start_mask, start_val;
+    unsigned int end_mask, end_val;
+
+    c->use    = CLG_MALLOC(sizeof(line_use) * c->sets * c->assoc);
+    c->loaded = CLG_MALLOC(sizeof(line_loaded) * c->sets * c->assoc);
+    c->line_start_mask = CLG_MALLOC(sizeof(int) * c->line_size);
+    c->line_end_mask   = CLG_MALLOC(sizeof(int) * c->line_size);
+
+
+    c->line_size_mask = c->line_size-1;
+
+    /* Meaning of line_start_mask/line_end_mask
+     * Example: for a given cache line, you get an access starting at
+     * byte offset 5, length 4, i.e. bytes 5 to 8 are touched. For a
+     * cache line size of 32, you have 1 bit per byte in the mask:
+     *
+     *   bit31   bit8 bit5  bit 0
+     *       |      |  |       |
+     *       11..111111100000   line_start_mask[5]
+     *       00..000111111111   line_end_mask[(5+4)-1]
+     *
+     *  use_mask |= line_start_mask[5] & line_end_mask[8]
+     *
+     */
+    start_val = end_val = ~0;
+    if (c->line_size < 32) {
+        int bits_per_byte = 32/c->line_size;
+        start_mask = (1<<bits_per_byte)-1;
+        end_mask   = start_mask << (32-bits_per_byte);
+        for(i=0;i<c->line_size;i++) {
+            c->line_start_mask[i] = start_val;
+            start_val  = start_val & ~start_mask;
+            start_mask = start_mask << bits_per_byte;
+
+            c->line_end_mask[c->line_size-i-1] = end_val;
+            end_val  = end_val & ~end_mask;
+            end_mask = end_mask >> bits_per_byte;
+        }
+    }
+    else {
+        int bytes_per_bit = c->line_size/32;
+        start_mask = 1;
+        end_mask   = 1 << 31;
+        for(i=0;i<c->line_size;i++) {
+            c->line_start_mask[i] = start_val;
+            c->line_end_mask[c->line_size-i-1] = end_val;
+            if ( ((i+1)%bytes_per_bit) == 0) {
+                start_val   &= ~start_mask;
+                end_val     &= ~end_mask;
+                start_mask <<= 1;
+                end_mask   >>= 1;
+            }
+        }
+    }
+
+    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
+    for(i=0;i<c->line_size;i++) {
+        CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
+                  i, c->line_start_mask[i], c->line_end_mask[i]);
+    }
+
+    /* We use lower tag bits as offset pointers to cache use info.
+     * I.e. some cache parameters don't work.
+     */
+    if (c->tag_shift < c->assoc_bits) {
+        VG_(message)(Vg_DebugMsg,
+                     "error: Use associativity < %d for cache use statistics!",
+                     (1<<c->tag_shift) );
+        VG_(tool_panic)("Unsupported cache configuration");
+    }
+}
+
+/* FIXME: A little tricky */
+#if 0
+
+static __inline__
+void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
+{
+    int idx = (high_idx << c->assoc_bits) | low_idx;
+
+    c->use[idx].count ++;
+    c->use[idx].mask |= use_mask;
+
+    CLG_DEBUG(6," Hit [idx %d] (line %p from %p): %x => %08x, count %d\n",
+              idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
+              use_mask, c->use[idx].mask, c->use[idx].count);
+}
+
+/* only used for I1, D1 */
+
+static __inline__
+CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag)
+{
+    int i, j, idx;
+    UWord *set, tmp_tag;
+    UInt use_mask;
+
+    /* Shifting is a bit faster than multiplying */
+    set = &(c->tags[set_no << c->assoc_bits]);
+    use_mask =
+        c->line_start_mask[a & c->line_size_mask] &
+        c->line_end_mask[(a+size-1) & c->line_size_mask];
+
+    /* This loop is unrolled for just the first case, which is the most */
+    /* common.
We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == (set[0] & c->tag_mask)) { + cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask); + return L1_Hit; + } + + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == (set[i] & c->tag_mask)) { + tmp_tag = set[i]; + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + + cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask); + return L1_Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + tmp_tag = set[L.assoc - 1] & ~c->tag_mask; + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag | tmp_tag; + + cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag, + use_mask, a & ~c->line_size_mask); + + return Miss; +} + + +static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size) +{ + UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1); + UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1); + UWord tag = a >> c->tag_shift; + + /* Access entirely within line. */ + if (set1 == set2) + return cacheuse_setref(c, set1, tag); + + /* Access straddles two lines. */ + /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ + else if (((set1 + 1) & (c->sets-1)) == set2) { + + /* the call updates cache structures as side effect */ + CacheResult res1 = cacheuse_isMiss(c, set1, tag); + CacheResult res2 = cacheuse_isMiss(c, set2, tag); + return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; + + } else { + VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2); + VG_(tool_panic)("item straddles more than two cache sets"); + } + return Hit; +} +#endif + + +/* for I1/D1 caches */ +#define CACHEUSE(L) \ + \ +static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ +{ \ + register UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \ + register UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \ + register UWord tag = a & L.tag_mask; \ + int i, j, idx; \ + UWord *set, tmp_tag; \ + UInt use_mask; \ + \ + CLG_DEBUG(6,"%s.Acc(Addr %p, size %d): Sets [%d/%d]\n", \ + L.name, a, size, set1, set2); \ + \ + /* First case: word entirely within line. */ \ + if (set1 == set2) { \ + \ + /* Shifting is a bit faster than multiplying */ \ + set = &(L.tags[set1 << L.assoc_bits]); \ + use_mask = L.line_start_mask[a & L.line_size_mask] & \ + L.line_end_mask[(a+size-1) & L.line_size_mask]; \ + \ + /* This loop is unrolled for just the first case, which is the most */\ + /* common. We can't unroll any further because it would screw up */\ + /* if we have a direct-mapped (1-way) cache. */\ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n",\ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return L1_Hit; \ + } \ + /* If the tag is one other than the MRU, move it into the MRU spot */\ + /* and shuffle the rest down. 
*/\ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n",\ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return L1_Hit; \ + } \ + } \ + \ + /* A miss; install this tag as MRU, shuffle rest down. */ \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 << L.assoc_bits) | tmp_tag; \ + return update_##L##_use(&L, idx, \ + use_mask, a &~ L.line_size_mask); \ + \ + /* Second case: word straddles two lines. */ \ + /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ + } else if (((set1 + 1) & (L.sets-1)) == set2) { \ + Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \ + set = &(L.tags[set1 << L.assoc_bits]); \ + use_mask = L.line_start_mask[a & L.line_size_mask]; \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n",\ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n",\ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 << L.assoc_bits) | tmp_tag; \ + miss1 = update_##L##_use(&L, idx, \ + use_mask, a &~ L.line_size_mask); \ +block2: \ + set = &(L.tags[set2 << L.assoc_bits]); \ + use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): %x => %08x, count %d\n",\ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + L.use[idx].count ++; \ + L.use[idx].mask |= use_mask; \ + CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): %x => %08x, count %d\n",\ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set2 << L.assoc_bits) | tmp_tag; \ + miss2 = update_##L##_use(&L, idx, \ + use_mask, (a+size-1) &~ 
L.line_size_mask); \ + return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \ + \ + } else { \ + VG_(printf)("addr: %p size: %u sets: %d %d", a, size, set1, set2); \ + VG_(tool_panic)("item straddles more than two cache sets"); \ + } \ + return 0; \ +} + + +/* logarithmic bitcounting algorithm, see + * http://graphics.stanford.edu/~seander/bithacks.html + */ +static __inline__ unsigned int countBits(unsigned int bits) +{ + unsigned int c; // store the total here + const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers + const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF}; + + c = bits; + c = ((c >> S[0]) & B[0]) + (c & B[0]); + c = ((c >> S[1]) & B[1]) + (c & B[1]); + c = ((c >> S[2]) & B[2]) + (c & B[2]); + c = ((c >> S[3]) & B[3]) + (c & B[3]); + c = ((c >> S[4]) & B[4]) + (c & B[4]); + return c; +} + +static void update_L2_use(int idx, Addr memline) +{ + line_loaded* loaded = &(L2.loaded[idx]); + line_use* use = &(L2.use[idx]); + int i = ((32 - countBits(use->mask)) * L2.line_size)>>5; + + CLG_DEBUG(2, " L2.miss [%d]: at %p accessing memline %p\n", + idx, bb_base + current_ii->instr_offset, memline); + if (use->count>0) { + CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %p from %p]\n", + use->count, i, use->mask, loaded->memline, loaded->iaddr); + CLG_DEBUG(2, " collect: %d, use_base %p\n", + CLG_(current_state).collect, loaded->use_base); + + if (CLG_(current_state).collect && loaded->use_base) { + (loaded->use_base)[off_L2_AcCost] += 1000 / use->count; + (loaded->use_base)[off_L2_SpLoss] += i; + } + } + + use->count = 0; + use->mask = 0; + + loaded->memline = memline; + loaded->iaddr = bb_base + current_ii->instr_offset; + loaded->use_base = (CLG_(current_state).nonskipped) ? + CLG_(current_state).nonskipped->skipped : + cost_base + current_ii->cost_offset; +} + +static +CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) +{ + UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1); + UWord* set = &(L2.tags[setNo << L2.assoc_bits]); + UWord tag = memline & L2.tag_mask; + + int i, j, idx; + UWord tmp_tag; + + CLG_DEBUG(6,"L2.Acc(Memline %p): Set %d\n", memline, setNo); + + if (tag == (set[0] & L2.tag_mask)) { + idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask); + l1_loaded->dep_use = &(L2.use[idx]); + + CLG_DEBUG(6," Hit0 [idx %d] (line %p from %p): => %08x, count %d\n", + idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr, + L2.use[idx].mask, L2.use[idx].count); + return L2_Hit; + } + for (i = 1; i < L2.assoc; i++) { + if (tag == (set[i] & L2.tag_mask)) { + tmp_tag = set[i]; + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask); + l1_loaded->dep_use = &(L2.use[idx]); + + CLG_DEBUG(6," Hit%d [idx %d] (line %p from %p): => %08x, count %d\n", + i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr, + L2.use[idx].mask, L2.use[idx].count); + return L2_Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. 
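+     * The victim's low tag bits are an index into the per-set
+     * use[]/loaded[] bookkeeping (seeded in cachesim_clearcache);
+     * they are OR'ed back into the new tag word so the slot stays
+     * linked to its use info.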
     */
+    tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
+    for (j = L2.assoc - 1; j > 0; j--) {
+        set[j] = set[j - 1];
+    }
+    set[0] = tag | tmp_tag;
+    idx = (setNo << L2.assoc_bits) | tmp_tag;
+    l1_loaded->dep_use = &(L2.use[idx]);
+
+    update_L2_use(idx, memline);
+
+    return MemAccess;
+}
+
+
+
+
+#define UPDATE_USE(L)                                                \
+                                                                     \
+static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
+                                           UInt mask, Addr memline)  \
+{                                                                    \
+    line_loaded* loaded = &(cache->loaded[idx]);                     \
+    line_use* use = &(cache->use[idx]);                              \
+    int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;     \
+                                                                     \
+    CLG_DEBUG(2, " %s.miss [%d]: at %p accessing memline %p (mask %08x)\n", \
+              cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
+    if (use->count>0) {                                              \
+        CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %p from %p]\n",\
+                  use->count, c, use->mask, loaded->memline, loaded->iaddr); \
+        CLG_DEBUG(2, "   collect: %d, use_base %p\n",                \
+                  CLG_(current_state).collect, loaded->use_base);    \
+                                                                     \
+        if (CLG_(current_state).collect && loaded->use_base) {       \
+            (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
+            (loaded->use_base)[off_##L##_SpLoss] += c;               \
+                                                                     \
+            /* FIXME (?): L1/L2 line sizes must be equal! */         \
+            loaded->dep_use->mask  |= use->mask;                     \
+            loaded->dep_use->count += use->count;                    \
+        }                                                            \
+    }                                                                \
+                                                                     \
+    use->count = 1;                                                  \
+    use->mask  = mask;                                               \
+    loaded->memline = memline;                                       \
+    loaded->iaddr   = bb_base + current_ii->instr_offset;            \
+    loaded->use_base = (CLG_(current_state).nonskipped) ?            \
+                       CLG_(current_state).nonskipped->skipped :     \
+                       cost_base + current_ii->cost_offset;          \
+                                                                     \
+    if (memline == 0) return L2_Hit;                                 \
+    return cacheuse_L2_access(memline, loaded);                      \
+}
+
+UPDATE_USE(I1);
+UPDATE_USE(D1);
+
+CACHEUSE(I1);
+CACHEUSE(D1);
+
+
+static
+void cacheuse_finish(void)
+{
+    int i;
+    InstrInfo ii = { 0,0,0,0,0 };
+
+    if (!CLG_(current_state).collect) return;
+
+    bb_base    = 0;
+    current_ii = &ii;
+    cost_base  = 0;
+
+    /* update usage counters */
+    if (I1.use)
+        for (i = 0; i < I1.sets * I1.assoc; i++)
+            if (I1.loaded[i].use_base)
+                update_I1_use( &I1, i, 0,0);
+
+    if (D1.use)
+        for (i = 0; i < D1.sets * D1.assoc; i++)
+            if (D1.loaded[i].use_base)
+                update_D1_use( &D1, i, 0,0);
+
+    if (L2.use)
+        for (i = 0; i < L2.sets * L2.assoc; i++)
+            if (L2.loaded[i].use_base)
+                update_L2_use(i, 0);
+}
+
+
+
+/*------------------------------------------------------------*/
+/*--- Helper functions called by instrumented code         ---*/
+/*------------------------------------------------------------*/
+
+
+static __inline__
+void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
+{
+    switch(r) {
+        case WriteBackMemAccess:
+            if (clo_simulate_writeback) {
+                c1[3]++;
+                c2[3]++;
+            }
+            // fall through
+
+        case MemAccess:
+            c1[2]++;
+            c2[2]++;
+            // fall through
+
+        case L2_Hit:
+            c1[1]++;
+            c2[1]++;
+            // fall through
+
+        default:
+            c1[0]++;
+            c2[0]++;
+    }
+}
+
+
+VG_REGPARM(1)
+static void log_1I0D(InstrInfo* ii)
+{
+    CacheModelResult IrRes;
+
+    current_ii = ii;
+    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
+
+    CLG_DEBUG(6, "log_1I0D:  Ir=%p/%u => Ir %d\n",
+              bb_base + ii->instr_offset, ii->instr_size, IrRes);
+
+    if (CLG_(current_state).collect) {
+        ULong* cost_Ir;
+
+        if (CLG_(current_state).nonskipped)
+            cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
+        else
+            cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;
+
+        inc_costs(IrRes, cost_Ir,
+                  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
+    }
+}
+
+
+/* Instruction doing a read access */
+
+VG_REGPARM(2)
+static void log_1I1Dr(InstrInfo* ii, Addr data) +{ + CacheModelResult IrRes, DrRes; + + current_ii = ii; + IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size); + DrRes = (*simulator.D1_Read)(data, ii->data_size); + + CLG_DEBUG(6, "log_1I1Dr: Ir=%p/%u, Dr=%p/%u => Ir %d, Dr %d\n", + bb_base + ii->instr_offset, ii->instr_size, + data, ii->data_size, IrRes, DrRes); + + if (CLG_(current_state).collect) { + ULong *cost_Ir, *cost_Dr; + + if (CLG_(current_state).nonskipped) { + cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir; + cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr; + } + else { + cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir; + cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr; + } + + inc_costs(IrRes, cost_Ir, + CLG_(current_state).cost + CLG_(sets).off_full_Ir ); + inc_costs(DrRes, cost_Dr, + CLG_(current_state).cost + CLG_(sets).off_full_Dr ); + } +} + + +VG_REGPARM(2) +static void log_0I1Dr(InstrInfo* ii, Addr data) +{ + CacheModelResult DrRes; + + current_ii = ii; + DrRes = (*simulator.D1_Read)(data, ii->data_size); + + CLG_DEBUG(6, "log_0I1Dr: Dr=%p/%u => Dr %d\n", + data, ii->data_size, DrRes); + + if (CLG_(current_state).collect) { + ULong *cost_Dr; + + if (CLG_(current_state).nonskipped) { + cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr; + } + else { + cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr; + } + + inc_costs(DrRes, cost_Dr, + CLG_(current_state).cost + CLG_(sets).off_full_Dr ); + } +} + + +/* Instruction doing a write access */ + +VG_REGPARM(2) +static void log_1I1Dw(InstrInfo* ii, Addr data) +{ + CacheModelResult IrRes, DwRes; + + current_ii = ii; + IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size); + DwRes = (*simulator.D1_Write)(data, ii->data_size); + + CLG_DEBUG(6, "log_1I1Dw: Ir=%p/%u, Dw=%p/%u => Ir %d, Dw %d\n", + bb_base + ii->instr_offset, ii->instr_size, + data, ii->data_size, IrRes, DwRes); + + if (CLG_(current_state).collect) { + ULong *cost_Ir, *cost_Dw; + + if (CLG_(current_state).nonskipped) { + cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir; + cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw; + } + else { + cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir; + cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw; + } + + inc_costs(IrRes, cost_Ir, + CLG_(current_state).cost + CLG_(sets).off_full_Ir ); + inc_costs(DwRes, cost_Dw, + CLG_(current_state).cost + CLG_(sets).off_full_Dw ); + } +} + +VG_REGPARM(2) +static void log_0I1Dw(InstrInfo* ii, Addr data) +{ + CacheModelResult DwRes; + + current_ii = ii; + DwRes = (*simulator.D1_Write)(data, ii->data_size); + + CLG_DEBUG(6, "log_0I1Dw: Dw=%p/%u => Dw %d\n", + data, ii->data_size, DwRes); + + if (CLG_(current_state).collect) { + ULong *cost_Dw; + + if (CLG_(current_state).nonskipped) { + cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw; + } + else { + cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw; + } + + inc_costs(DwRes, cost_Dw, + CLG_(current_state).cost + CLG_(sets).off_full_Dw ); + } +} + +/* Instruction doing a read and a write access */ + +VG_REGPARM(3) +static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2) +{ + CacheModelResult IrRes, DrRes, DwRes; + + current_ii = ii; + IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size); + DrRes = (*simulator.D1_Read)(data1, ii->data_size); + DwRes = (*simulator.D1_Write)(data2, ii->data_size); + + CLG_DEBUG(6, + 
"log_1I2D: Ir=%p/%u, Dr=%p/%u, Dw=%p/%u => Ir %d, Dr %d, Dw %d\n", + bb_base + ii->instr_offset, ii->instr_size, + data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes); + + if (CLG_(current_state).collect) { + ULong *cost_Ir, *cost_Dr, *cost_Dw; + + if (CLG_(current_state).nonskipped) { + cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir; + cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr; + cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw; + } + else { + cost_Ir = cost_base + ii->cost_offset + off_D2_Ir; + cost_Dr = cost_base + ii->cost_offset + off_D2_Dr; + cost_Dw = cost_base + ii->cost_offset + off_D2_Dw; + } + + inc_costs(IrRes, cost_Ir, + CLG_(current_state).cost + CLG_(sets).off_full_Ir ); + inc_costs(DrRes, cost_Dr, + CLG_(current_state).cost + CLG_(sets).off_full_Dr ); + inc_costs(DwRes, cost_Dw, + CLG_(current_state).cost + CLG_(sets).off_full_Dw ); + } +} + +VG_REGPARM(3) +static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2) +{ + CacheModelResult DrRes, DwRes; + + current_ii = ii; + DrRes = (*simulator.D1_Read)(data1, ii->data_size); + DwRes = (*simulator.D1_Write)(data2, ii->data_size); + + CLG_DEBUG(6, + "log_0D2D: Dr=%p/%u, Dw=%p/%u => Dr %d, Dw %d\n", + data1, ii->data_size, data2, ii->data_size, DrRes, DwRes); + + if (CLG_(current_state).collect) { + ULong *cost_Dr, *cost_Dw; + + if (CLG_(current_state).nonskipped) { + cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr; + cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw; + } + else { + cost_Dr = cost_base + ii->cost_offset + off_D2_Dr; + cost_Dw = cost_base + ii->cost_offset + off_D2_Dw; + } + + inc_costs(DrRes, cost_Dr, + CLG_(current_state).cost + CLG_(sets).off_full_Dr ); + inc_costs(DwRes, cost_Dw, + CLG_(current_state).cost + CLG_(sets).off_full_Dw ); + } +} + + +/*------------------------------------------------------------*/ +/*--- Cache configuration ---*/ +/*------------------------------------------------------------*/ + +#define UNDEFINED_CACHE ((cache_t) { -1, -1, -1 }) + +static cache_t clo_I1_cache = UNDEFINED_CACHE; +static cache_t clo_D1_cache = UNDEFINED_CACHE; +static cache_t clo_L2_cache = UNDEFINED_CACHE; + + +/* Checks cache config is ok; makes it so if not. */ +static +void check_cache(cache_t* cache, Char *name) +{ + /* First check they're all powers of two */ + if (-1 == VG_(log2)(cache->size)) { + VG_(message)(Vg_UserMsg, + "error: %s size of %dB not a power of two; aborting.", + name, cache->size); + VG_(exit)(1); + } + + if (-1 == VG_(log2)(cache->assoc)) { + VG_(message)(Vg_UserMsg, + "error: %s associativity of %d not a power of two; aborting.", + name, cache->assoc); + VG_(exit)(1); + } + + if (-1 == VG_(log2)(cache->line_size)) { + VG_(message)(Vg_UserMsg, + "error: %s line size of %dB not a power of two; aborting.", + name, cache->line_size); + VG_(exit)(1); + } + + // Then check line size >= 16 -- any smaller and a single instruction could + // straddle three cache lines, which breaks a simulation assertion and is + // stupid anyway. + if (cache->line_size < MIN_LINE_SIZE) { + VG_(message)(Vg_UserMsg, + "error: %s line size of %dB too small; aborting.", + name, cache->line_size); + VG_(exit)(1); + } + + /* Then check cache size > line size (causes seg faults if not). 
   */
+  if (cache->size <= cache->line_size) {
+    VG_(message)(Vg_UserMsg,
+                 "error: %s cache size of %dB <= line size of %dB; aborting.",
+                 name, cache->size, cache->line_size);
+    VG_(exit)(1);
+  }
+
+  /* Then check assoc <= (size / line size) (seg faults otherwise). */
+  if (cache->assoc > (cache->size / cache->line_size)) {
+    VG_(message)(Vg_UserMsg,
+                 "error: %s associativity > (size / line size); aborting.", name);
+    VG_(exit)(1);
+  }
+}
+
+static
+void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+{
+#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
+
+   Int n_clos = 0;
+
+   // Count how many were defined on the command line.
+   if (DEFINED(clo_I1_cache)) { n_clos++; }
+   if (DEFINED(clo_D1_cache)) { n_clos++; }
+   if (DEFINED(clo_L2_cache)) { n_clos++; }
+
+   // Set the cache config (using auto-detection, if supported by the
+   // architecture)
+   VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );
+
+   // Then replace with any defined on the command line.
+   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
+   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
+   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
+
+   // Then check the values; invalid ones abort.
+   check_cache(I1c, "I1");
+   check_cache(D1c, "D1");
+   check_cache(L2c, "L2");
+
+   if (VG_(clo_verbosity) > 1) {
+      VG_(message)(Vg_UserMsg, "Cache configuration used:");
+      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
+                   I1c->size, I1c->assoc, I1c->line_size);
+      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines",
+                   D1c->size, D1c->assoc, D1c->line_size);
+      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines",
+                   L2c->size, L2c->assoc, L2c->line_size);
+   }
+#undef DEFINED
+}
+
+
+/* Initialize and clear simulator state */
+static void cachesim_post_clo_init(void)
+{
+   /* Cache configurations.
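+    * Geometry comes from VG_(configure_caches) auto-detection unless
+    * given on the command line, e.g. --I1=65536,2,64 (size,assoc,line_size).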
    */
+   cache_t I1c, D1c, L2c;
+
+   /* Initialize access handlers */
+   if (!CLG_(clo).simulate_cache) {
+      CLG_(cachesim).log_1I0D       = 0;
+      CLG_(cachesim).log_1I0D_name  = "(no function)";
+
+      CLG_(cachesim).log_1I1Dr      = 0;
+      CLG_(cachesim).log_1I1Dw      = 0;
+      CLG_(cachesim).log_1I2D       = 0;
+      CLG_(cachesim).log_1I1Dr_name = "(no function)";
+      CLG_(cachesim).log_1I1Dw_name = "(no function)";
+      CLG_(cachesim).log_1I2D_name  = "(no function)";
+
+      CLG_(cachesim).log_0I1Dr      = 0;
+      CLG_(cachesim).log_0I1Dw      = 0;
+      CLG_(cachesim).log_0I2D       = 0;
+      CLG_(cachesim).log_0I1Dr_name = "(no function)";
+      CLG_(cachesim).log_0I1Dw_name = "(no function)";
+      CLG_(cachesim).log_0I2D_name  = "(no function)";
+      return;
+   }
+
+   /* Configuration of caches is only needed with real cache simulation */
+   configure_caches(&I1c, &D1c, &L2c);
+
+   I1.name = "I1";
+   D1.name = "D1";
+   L2.name = "L2";
+
+   cachesim_initcache(I1c, &I1);
+   cachesim_initcache(D1c, &D1);
+   cachesim_initcache(L2c, &L2);
+
+   /* all cache simulator variants use the standard helpers,
+    * dispatching via the simulator struct */
+
+   CLG_(cachesim).log_1I0D       = log_1I0D;
+   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
+
+   CLG_(cachesim).log_1I1Dr      = log_1I1Dr;
+   CLG_(cachesim).log_1I1Dw      = log_1I1Dw;
+   CLG_(cachesim).log_1I2D       = log_1I2D;
+   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
+   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
+   CLG_(cachesim).log_1I2D_name  = "log_1I2D";
+
+   CLG_(cachesim).log_0I1Dr      = log_0I1Dr;
+   CLG_(cachesim).log_0I1Dw      = log_0I1Dw;
+   CLG_(cachesim).log_0I2D       = log_0I2D;
+   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
+   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
+   CLG_(cachesim).log_0I2D_name  = "log_0I2D";
+
+   if (clo_collect_cacheuse) {
+
+      /* Warn about unsupported option combinations */
+      if (clo_simulate_hwpref) {
+         VG_(message)(Vg_DebugMsg,
+                      "warning: prefetch simulation cannot be used with cache usage");
+         clo_simulate_hwpref = False;
+      }
+
+      if (clo_simulate_writeback) {
+         VG_(message)(Vg_DebugMsg,
+                      "warning: write-back simulation cannot be used with cache usage");
+         clo_simulate_writeback = False;
+      }
+
+      simulator.I1_Read  = cacheuse_I1_doRead;
+      simulator.D1_Read  = cacheuse_D1_doRead;
+      /* use tracking ignores the reference type, so writes take the
+       * same path as reads */
+      simulator.D1_Write = cacheuse_D1_doRead;
+      return;
+   }
+
+   if (clo_simulate_hwpref) {
+      prefetch_clear();
+
+      if (clo_simulate_writeback) {
+         simulator.I1_Read  = prefetch_I1_Read;
+         simulator.D1_Read  = prefetch_D1_Read;
+         simulator.D1_Write = prefetch_D1_Write;
+      }
+      else {
+         simulator.I1_Read  = prefetch_I1_ref;
+         simulator.D1_Read  = prefetch_D1_ref;
+         simulator.D1_Write = prefetch_D1_ref;
+      }
+
+      return;
+   }
+
+   if (clo_simulate_writeback) {
+      simulator.I1_Read  = cachesim_I1_Read;
+      simulator.D1_Read  = cachesim_D1_Read;
+      simulator.D1_Write = cachesim_D1_Write;
+   }
+   else {
+      simulator.I1_Read  = cachesim_I1_ref;
+      simulator.D1_Read  = cachesim_D1_ref;
+      simulator.D1_Write = cachesim_D1_ref;
+   }
+}
+
+
+/* Clear simulator state. Has to be initialized before this is called. */
+static
+void cachesim_clear(void)
+{
+  cachesim_clearcache(&I1);
+  cachesim_clearcache(&D1);
+  cachesim_clearcache(&L2);
+
+  prefetch_clear();
+}
+
+
+static void cachesim_getdesc(Char* buf)
+{
+  Int p;
+  p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
+  p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
+  VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
+}
+
+static
+void cachesim_print_opts(void)
+{
+  VG_(printf)(
+"\n   cache simulator options:\n"
+"    --simulate-cache=no|yes   Do cache simulation [no]\n"
+"    --simulate-wb=no|yes      Count write-back events [no]\n"
+"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
+#if CLG_EXPERIMENTAL
+"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
+#endif
+"    --cacheuse=no|yes         Collect cache block use [no]\n"
+"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
+"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
+"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
+              );
+}
+
+static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
+{
+   int   i1, i2, i3;
+   int   i;
+   char *opt = VG_(strdup)(orig_opt);
+
+   i = i1 = opt_len;
+
+   /* Option looks like "--I1=65536,2,64".
+    * Find commas, replace with NULs to make three independent
+    * strings, then extract numbers.  Yuck. */
+   while (VG_(isdigit)(opt[i])) i++;
+   if (',' == opt[i]) {
+      opt[i++] = '\0';
+      i2 = i;
+   } else goto bad;
+   while (VG_(isdigit)(opt[i])) i++;
+   if (',' == opt[i]) {
+      opt[i++] = '\0';
+      i3 = i;
+   } else goto bad;
+   while (VG_(isdigit)(opt[i])) i++;
+   if ('\0' != opt[i]) goto bad;
+
+   cache->size      = (Int)VG_(atoll)(opt + i1);
+   cache->assoc     = (Int)VG_(atoll)(opt + i2);
+   cache->line_size = (Int)VG_(atoll)(opt + i3);
+
+   VG_(free)(opt);
+
+   return;
+
+  bad:
+   VG_(bad_option)(orig_opt);
+}
+
+/* Check for a command line option for cache configuration.
+ * Returns False if the option is unknown / not handled here.
+ *
+ * Called from CLG_(process_cmd_line_option)() in clo.c
+ */
+static Bool cachesim_parse_opt(Char* arg)
+{
+   if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
+      clo_simulate_writeback = True;
+   else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
+      clo_simulate_writeback = False;
+
+   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
+      clo_simulate_hwpref = True;
+   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
+      clo_simulate_hwpref = False;
+
+   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
+      clo_simulate_sectors = True;
+   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
+      clo_simulate_sectors = False;
+
+   else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
+      clo_collect_cacheuse = True;
+      /* Use counters only make sense with fine-grained dumping */
+      CLG_(clo).dump_instr = True;
+   }
+   else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
+      clo_collect_cacheuse = False;
+
+   /* 5 is length of "--I1=" */
+   else if (0 == VG_(strncmp)(arg, "--I1=", 5))
+      parse_opt(&clo_I1_cache, arg, 5);
+   else if (0 == VG_(strncmp)(arg, "--D1=", 5))
+      parse_opt(&clo_D1_cache, arg, 5);
+   else if (0 == VG_(strncmp)(arg, "--L2=", 5))
+      parse_opt(&clo_L2_cache, arg, 5);
+   else
+      return False;
+
+   return True;
+}
+
+/* Adds commas to a ULong, right-justified in a field field_width wide;
+ * returns the string in buf.
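+ * E.g. commify(1234567, 12, buf) fills buf with "   1,234,567"
+ * (3 spaces of padding) and returns 9, the unpadded length.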
*/ +static +Int commify(ULong n, int field_width, char* buf) +{ + int len, n_commas, i, j, new_len, space; + + VG_(sprintf)(buf, "%llu", n); + len = VG_(strlen)(buf); + n_commas = (len - 1) / 3; + new_len = len + n_commas; + space = field_width - new_len; + + /* Allow for printing a number in a field_width smaller than it's size */ + if (space < 0) space = 0; + + /* Make j = -1 because we copy the '\0' before doing the numbers in groups + * of three. */ + for (j = -1, i = len ; i >= 0; i--) { + buf[i + n_commas + space] = buf[i]; + + if ((i>0) && (3 == ++j)) { + j = 0; + n_commas--; + buf[i + n_commas + space] = ','; + } + } + /* Right justify in field. */ + for (i = 0; i < space; i++) buf[i] = ' '; + return new_len; +} + +static +void percentify(Int n, Int ex, Int field_width, char buf[]) +{ + int i, len, space; + + VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex); + len = VG_(strlen)(buf); + space = field_width - len; + if (space < 0) space = 0; /* Allow for v. small field_width */ + i = len; + + /* Right justify in field */ + for ( ; i >= 0; i--) buf[i + space] = buf[i]; + for (i = 0; i < space; i++) buf[i] = ' '; +} + +static +void cachesim_printstat(void) +{ + FullCost total = CLG_(total_cost), D_total = 0; + ULong L2_total_m, L2_total_mr, L2_total_mw, + L2_total, L2_total_r, L2_total_w; + char buf1[RESULTS_BUF_LEN], + buf2[RESULTS_BUF_LEN], + buf3[RESULTS_BUF_LEN]; + Int l1, l2, l3; + Int p; + + if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) { + VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu", + prefetch_up); + VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu", + prefetch_down); + VG_(message)(Vg_DebugMsg, ""); + } + + /* I cache results. Use the I_refs value to determine the first column + * width. */ + l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1); + VG_(message)(Vg_UserMsg, "I refs: %s", buf1); + + if (!CLG_(clo).simulate_cache) return; + + commify(total[CLG_(sets).off_full_Ir +1], l1, buf1); + VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1); + + commify(total[CLG_(sets).off_full_Ir +2], l1, buf1); + VG_(message)(Vg_UserMsg, "L2i misses: %s", buf1); + + p = 100; + + if (0 == total[CLG_(sets).off_full_Ir]) + total[CLG_(sets).off_full_Ir] = 1; + + percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p / + total[CLG_(sets).off_full_Ir], p, l1+1, buf1); + VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1); + + percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p / + total[CLG_(sets).off_full_Ir], p, l1+1, buf1); + VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1); + VG_(message)(Vg_UserMsg, ""); + + /* D cache results. + Use the D_refs.rd and D_refs.wr values to determine the + * width of columns 2 & 3. 
*/ + + D_total = CLG_(get_eventset_cost)( CLG_(sets).full ); + CLG_(init_cost)( CLG_(sets).full, D_total); + CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr ); + CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw ); + + commify( D_total[0], l1, buf1); + l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2); + l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3); + VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + commify( D_total[1], l1, buf1); + commify(total[CLG_(sets).off_full_Dr+1], l2, buf2); + commify(total[CLG_(sets).off_full_Dw+1], l3, buf3); + VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + commify( D_total[2], l1, buf1); + commify(total[CLG_(sets).off_full_Dr+2], l2, buf2); + commify(total[CLG_(sets).off_full_Dw+2], l3, buf3); + VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + p = 10; + + if (0 == D_total[0]) D_total[0] = 1; + if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1; + if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1; + + percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1); + percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p / + total[CLG_(sets).off_full_Dr], p, l2+1, buf2); + percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p / + total[CLG_(sets).off_full_Dw], p, l3+1, buf3); + VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3); + + percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1); + percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p / + total[CLG_(sets).off_full_Dr], p, l2+1, buf2); + percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p / + total[CLG_(sets).off_full_Dw], p, l3+1, buf3); + VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3); + VG_(message)(Vg_UserMsg, ""); + + + + /* L2 overall results */ + + L2_total = + total[CLG_(sets).off_full_Dr +1] + + total[CLG_(sets).off_full_Dw +1] + + total[CLG_(sets).off_full_Ir +1]; + L2_total_r = + total[CLG_(sets).off_full_Dr +1] + + total[CLG_(sets).off_full_Ir +1]; + L2_total_w = total[CLG_(sets).off_full_Dw +1]; + commify(L2_total, l1, buf1); + commify(L2_total_r, l2, buf2); + commify(L2_total_w, l3, buf3); + VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + L2_total_m = + total[CLG_(sets).off_full_Dr +2] + + total[CLG_(sets).off_full_Dw +2] + + total[CLG_(sets).off_full_Ir +2]; + L2_total_mr = + total[CLG_(sets).off_full_Dr +2] + + total[CLG_(sets).off_full_Ir +2]; + L2_total_mw = total[CLG_(sets).off_full_Dw +2]; + commify(L2_total_m, l1, buf1); + commify(L2_total_mr, l2, buf2); + commify(L2_total_mw, l3, buf3); + VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + percentify(L2_total_m * 100 * p / + (total[CLG_(sets).off_full_Ir] + D_total[0]), p, l1+1, buf1); + percentify(L2_total_mr * 100 * p / + (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]), + p, l2+1, buf2); + percentify(L2_total_mw * 100 * p / + total[CLG_(sets).off_full_Dw], p, l3+1, buf3); + VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )", + buf1, buf2,buf3); +} + + +/*------------------------------------------------------------*/ +/*--- Setup for Event set. 
---*/ +/*------------------------------------------------------------*/ + +struct event_sets CLG_(sets); + +void CLG_(init_eventsets)(Int max_user) +{ + EventType * e1, *e2, *e3, *e4; + EventSet *Ir, *Dr, *Dw; + EventSet *D0, *D1r, *D1w, *D2; + EventSet *sim, *full; + EventSet *use; + int sizeOfUseIr; + + use = CLG_(get_eventset)("Use", 4); + if (clo_collect_cacheuse) { + /* if TUse is 0, there was never a load, and no loss, too */ + e1 = CLG_(register_eventtype)("AcCost1"); + CLG_(add_eventtype)(use, e1); + e1 = CLG_(register_eventtype)("SpLoss1"); + CLG_(add_eventtype)(use, e1); + e1 = CLG_(register_eventtype)("AcCost2"); + CLG_(add_eventtype)(use, e1); + e1 = CLG_(register_eventtype)("SpLoss2"); + CLG_(add_eventtype)(use, e1); + } + + Ir = CLG_(get_eventset)("Ir", 4); + Dr = CLG_(get_eventset)("Dr", 4); + Dw = CLG_(get_eventset)("Dw", 4); + if (CLG_(clo).simulate_cache) { + e1 = CLG_(register_eventtype)("Ir"); + e2 = CLG_(register_eventtype)("I1mr"); + e3 = CLG_(register_eventtype)("I2mr"); + if (clo_simulate_writeback) { + e4 = CLG_(register_eventtype)("I2dmr"); + CLG_(add_dep_event4)(Ir, e1,e2,e3,e4); + } + else + CLG_(add_dep_event3)(Ir, e1,e2,e3); + + e1 = CLG_(register_eventtype)("Dr"); + e2 = CLG_(register_eventtype)("D1mr"); + e3 = CLG_(register_eventtype)("D2mr"); + if (clo_simulate_writeback) { + e4 = CLG_(register_eventtype)("D2dmr"); + CLG_(add_dep_event4)(Dr, e1,e2,e3,e4); + } + else + CLG_(add_dep_event3)(Dr, e1,e2,e3); + + e1 = CLG_(register_eventtype)("Dw"); + e2 = CLG_(register_eventtype)("D1mw"); + e3 = CLG_(register_eventtype)("D2mw"); + if (clo_simulate_writeback) { + e4 = CLG_(register_eventtype)("D2dmw"); + CLG_(add_dep_event4)(Dw, e1,e2,e3,e4); + } + else + CLG_(add_dep_event3)(Dw, e1,e2,e3); + + } + else { + e1 = CLG_(register_eventtype)("Ir"); + CLG_(add_eventtype)(Ir, e1); + } + + sizeOfUseIr = use->size + Ir->size; + D0 = CLG_(get_eventset)("D0", sizeOfUseIr); + CLG_(add_eventset)(D0, use); + off_D0_Ir = CLG_(add_eventset)(D0, Ir); + + D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size); + CLG_(add_eventset)(D1r, use); + off_D1r_Ir = CLG_(add_eventset)(D1r, Ir); + off_D1r_Dr = CLG_(add_eventset)(D1r, Dr); + + D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size); + CLG_(add_eventset)(D1w, use); + off_D1w_Ir = CLG_(add_eventset)(D1w, Ir); + off_D1w_Dw = CLG_(add_eventset)(D1w, Dw); + + D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size); + CLG_(add_eventset)(D2, use); + off_D2_Ir = CLG_(add_eventset)(D2, Ir); + off_D2_Dr = CLG_(add_eventset)(D2, Dr); + off_D2_Dw = CLG_(add_eventset)(D2, Dw); + + sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size); + CLG_(add_eventset)(sim, use); + CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir); + CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr); + CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw); + + if (CLG_(clo).collect_alloc) max_user += 2; + if (CLG_(clo).collect_systime) max_user += 2; + + full = CLG_(get_eventset)("full", sim->size + max_user); + CLG_(add_eventset)(full, sim); + CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir; + CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr; + CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw; + + CLG_(sets).use = use; + CLG_(sets).Ir = Ir; + CLG_(sets).Dr = Dr; + CLG_(sets).Dw = Dw; + + CLG_(sets).D0 = D0; + CLG_(sets).D1r = D1r; + CLG_(sets).D1w = D1w; + CLG_(sets).D2 = D2; + + CLG_(sets).sim = sim; + CLG_(sets).full = full; + + if (CLG_(clo).collect_alloc) { + e1 = CLG_(register_eventtype)("allocCount"); + e2 = 
CLG_(register_eventtype)("allocSize"); + CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2); + } + + if (CLG_(clo).collect_systime) { + e1 = CLG_(register_eventtype)("sysCount"); + e2 = CLG_(register_eventtype)("sysTime"); + CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2); + } + + CLG_DEBUGIF(1) { + CLG_DEBUG(1, "EventSets:\n"); + CLG_(print_eventset)(-2, use); + CLG_(print_eventset)(-2, Ir); + CLG_(print_eventset)(-2, Dr); + CLG_(print_eventset)(-2, Dw); + CLG_(print_eventset)(-2, sim); + CLG_(print_eventset)(-2, full); + } + + /* Not-existing events are silently ignored */ + CLG_(dumpmap) = CLG_(get_eventmapping)(full); + CLG_(append_event)(CLG_(dumpmap), "Ir"); + CLG_(append_event)(CLG_(dumpmap), "Dr"); + CLG_(append_event)(CLG_(dumpmap), "Dw"); + CLG_(append_event)(CLG_(dumpmap), "I1mr"); + CLG_(append_event)(CLG_(dumpmap), "D1mr"); + CLG_(append_event)(CLG_(dumpmap), "D1mw"); + CLG_(append_event)(CLG_(dumpmap), "I2mr"); + CLG_(append_event)(CLG_(dumpmap), "D2mr"); + CLG_(append_event)(CLG_(dumpmap), "D2mw"); + CLG_(append_event)(CLG_(dumpmap), "I2dmr"); + CLG_(append_event)(CLG_(dumpmap), "D2dmr"); + CLG_(append_event)(CLG_(dumpmap), "D2dmw"); + CLG_(append_event)(CLG_(dumpmap), "AcCost1"); + CLG_(append_event)(CLG_(dumpmap), "SpLoss1"); + CLG_(append_event)(CLG_(dumpmap), "AcCost2"); + CLG_(append_event)(CLG_(dumpmap), "SpLoss2"); + CLG_(append_event)(CLG_(dumpmap), "allocCount"); + CLG_(append_event)(CLG_(dumpmap), "allocSize"); + CLG_(append_event)(CLG_(dumpmap), "sysCount"); + CLG_(append_event)(CLG_(dumpmap), "sysTime"); + +} + + + +static +void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost) +{ + /* if eventset use is defined, it is always first (hardcoded!) */ + CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost); + + /* FIXME: This is hardcoded... */ + if (es == CLG_(sets).D0) { + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir, + cost + off_D0_Ir); + } + else if (es == CLG_(sets).D1r) { + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir, + cost + off_D1r_Ir); + CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr, + cost + off_D1r_Dr); + } + else if (es == CLG_(sets).D1w) { + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir, + cost + off_D1w_Ir); + CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw, + cost + off_D1w_Dw); + } + else { + CLG_ASSERT(es == CLG_(sets).D2); + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir, + cost + off_D2_Ir); + CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr, + cost + off_D2_Dr); + CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw, + cost + off_D2_Dw); + } +} + +/* this is called at dump time for every instruction executed */ +static void cachesim_add_icost(SimCost cost, BBCC* bbcc, + InstrInfo* ii, ULong exe_count) +{ + if (!CLG_(clo).simulate_cache) + cost[CLG_(sets).off_sim_Ir] += exe_count; + else { + +#if 0 +/* There is always a trivial case where exe_count and Ir can be + * slightly different because ecounter is updated when executing + * the next BB. E.g. 
for last BB executed, or when toggling collection + */ + /* FIXME: Hardcoded that each eventset has Ir as first */ + if ((bbcc->cost + ii->cost_offset)[0] != exe_count) { + VG_(printf)("==> Ir %llu, exe %llu\n", + (bbcc->cost + ii->cost_offset)[0], exe_count); + CLG_(print_bbcc_cost)(-2, bbcc); + //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count); + } +#endif + + add_and_zero_Dx(ii->eventset, cost, + bbcc->cost + ii->cost_offset); + } +} + +static +void cachesim_after_bbsetup(void) +{ + BBCC* bbcc = CLG_(current_state).bbcc; + + if (CLG_(clo).simulate_cache) { + BB* bb = bbcc->bb; + + /* only needed if log_* functions are called */ + bb_base = bb->obj->offset + bb->offset; + cost_base = bbcc->cost; + } +} + +static +void cachesim_finish(void) +{ + if (clo_collect_cacheuse) + cacheuse_finish(); +} + +/*------------------------------------------------------------*/ +/*--- The simulator defined in this file ---*/ +/*------------------------------------------------------------*/ + +struct cachesim_if CLG_(cachesim) = { + .print_opts = cachesim_print_opts, + .parse_opt = cachesim_parse_opt, + .post_clo_init = cachesim_post_clo_init, + .clear = cachesim_clear, + .getdesc = cachesim_getdesc, + .printstat = cachesim_printstat, + .add_icost = cachesim_add_icost, + .after_bbsetup = cachesim_after_bbsetup, + .finish = cachesim_finish, + + /* these will be set by cachesim_post_clo_init */ + .log_1I0D = 0, + + .log_1I1Dr = 0, + .log_1I1Dw = 0, + .log_1I2D = 0, + + .log_0I1Dr = 0, + .log_0I1Dw = 0, + .log_0I2D = 0, + + .log_1I0D_name = "(no function)", + + .log_1I1Dr_name = "(no function)", + .log_1I1Dw_name = "(no function)", + .log_1I2D_name = "(no function)", + + .log_0I1Dr_name = "(no function)", + .log_0I1Dw_name = "(no function)", + .log_0I2D_name = "(no function)" +}; + + +/*--------------------------------------------------------------------*/ +/*--- end ct_sim.c ---*/ +/*--------------------------------------------------------------------*/ + diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am new file mode 100644 index 0000000000..bc7d201f4f --- /dev/null +++ b/callgrind/tests/Makefile.am @@ -0,0 +1,14 @@ +# For AM_FLAG_M3264_PRI +include $(top_srcdir)/Makefile.flags.am + +SUBDIRS = . +DIST_SUBDIRS = . + +noinst_SCRIPTS = + +EXTRA_DIST = + +check_PROGRAMS = + +AM_CPPFLAGS = -I$(top_srcdir)/include +AM_CFLAGS = $(WERROR) -Winline -Wall -Wshadow -g $(AM_FLAG_M3264_PRI) diff --git a/callgrind/threads.c b/callgrind/threads.c new file mode 100644 index 0000000000..eda9d0c46f --- /dev/null +++ b/callgrind/threads.c @@ -0,0 +1,456 @@ +/*--------------------------------------------------------------------*/ +/*--- Callgrind ---*/ +/*--- ct_threads.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Callgrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "global.h"
+
+#include
+
+/* forward decls */
+static exec_state* exec_state_save(void);
+static exec_state* exec_state_restore(void);
+static exec_state* push_exec_state(int);
+static exec_state* top_exec_state(void);
+
+static exec_stack current_states;
+
+
+/*------------------------------------------------------------*/
+/*--- Support for multi-threading                          ---*/
+/*------------------------------------------------------------*/
+
+
+/*
+ * For Valgrind, MT is cooperative (no preempting in our code),
+ * so we don't need locks...
+ *
+ * Per-thread data:
+ *  - BBCCs
+ *  - call stack
+ *  - call hash
+ *  - event counters: last, current
+ *
+ * Even when ignoring MT, we need these functions to set up some
+ * data structures for the process (= Thread 1).
+ */
+
+/* current running thread */
+ThreadId CLG_(current_tid);
+
+static thread_info* thread[VG_N_THREADS];
+
+thread_info** CLG_(get_threads)()
+{
+    return thread;
+}
+
+thread_info* CLG_(get_current_thread)()
+{
+    return thread[CLG_(current_tid)];
+}
+
+void CLG_(init_threads)()
+{
+    Int i;
+    for(i=0;i<VG_N_THREADS;i++) thread[i] = 0;
+    CLG_(current_tid) = VG_INVALID_THREADID;
+}
+
+/* allocate and initialize a thread_info struct */
+static thread_info* new_thread(void)
+{
+    thread_info* t;
+
+    t = (thread_info*) CLG_MALLOC(sizeof(thread_info));
+
+    /* init state */
+    CLG_(init_exec_stack)( &(t->states) );
+    CLG_(init_call_stack)( &(t->calls) );
+    CLG_(init_fn_stack)  ( &(t->fns) );
+    /* t->states.entry[0]->cxt = CLG_(get_cxt)(t->fns.bottom); */
+
+    /* event counters */
+    t->lastdump_cost   = CLG_(get_eventset_cost)( CLG_(sets).full );
+    t->sighandler_cost = CLG_(get_eventset_cost)( CLG_(sets).full );
+    CLG_(init_cost)( CLG_(sets).full, t->lastdump_cost );
+    CLG_(init_cost)( CLG_(sets).full, t->sighandler_cost );
+
+    /* init data containers */
+    CLG_(init_fn_array)( &(t->fn_active) );
+    CLG_(init_bbcc_hash)( &(t->bbccs) );
+    CLG_(init_jcc_hash) ( &(t->jccs) );
+
+    return t;
+}
+
+
+void CLG_(switch_thread)(ThreadId tid)
+{
+    if (tid == CLG_(current_tid)) return;
+
+    CLG_DEBUG(0, ">> thread %d (was %d)\n", tid, CLG_(current_tid));
+
+    if (CLG_(current_tid) != VG_INVALID_THREADID) {
+        /* save thread state */
+        thread_info* t = thread[CLG_(current_tid)];
+
+        CLG_ASSERT(t != 0);
+
+        /* current context (including signal handler contexts) */
+        exec_state_save();
+        CLG_(copy_current_exec_stack)( &(t->states) );
+        CLG_(copy_current_call_stack)( &(t->calls) );
+        CLG_(copy_current_fn_stack)  ( &(t->fns) );
+
+        CLG_(copy_current_fn_array) ( &(t->fn_active) );
+        /* If we accumulate costs of all threads, use TID 1 for all jccs/bbccs */
+        if (!CLG_(clo).separate_threads) t = thread[1];
+        CLG_(copy_current_bbcc_hash)( &(t->bbccs) );
+        CLG_(copy_current_jcc_hash) ( &(t->jccs) );
+    }
+
+    CLG_(current_tid) = tid;
+    CLG_ASSERT(tid < VG_N_THREADS);
+
+    if (tid != VG_INVALID_THREADID) {
+        thread_info* t;
+
+        /* load thread state */
+
+        if (thread[tid] == 0) thread[tid] = new_thread();
+        t = thread[tid];
+
+        /* current context (including signal handler contexts) */
+        CLG_(set_current_exec_stack)( &(t->states) );
+        exec_state_restore();
+        CLG_(set_current_call_stack)( &(t->calls) );
+        CLG_(set_current_fn_stack)  ( &(t->fns) );
+
+        CLG_(set_current_fn_array) ( &(t->fn_active) );
+        /* If we accumulate costs of all threads, use TID 1 for all jccs/bbccs */
+        if (!CLG_(clo).separate_threads) t = thread[1];
+        CLG_(set_current_bbcc_hash) ( &(t->bbccs) );
+        CLG_(set_current_jcc_hash)  ( &(t->jccs) );
+    }
+}
+
+
+void CLG_(run_thread)(ThreadId tid)
+{
+    /* check for dumps needed */
+    static ULong bbs_done = 0;
+
+void CLG_(run_thread)(ThreadId tid)
+{
+  /* check for dumps needed */
+  static ULong bbs_done = 0;
+  static Char buf[512];
+
+  if (CLG_(clo).dump_every_bb > 0) {
+    if (CLG_(stat).bb_executions - bbs_done > CLG_(clo).dump_every_bb) {
+      VG_(sprintf)(buf, "--dump-every-bb=%d", CLG_(clo).dump_every_bb);
+      CLG_(dump_profile)(buf, False);
+      bbs_done = CLG_(stat).bb_executions;
+    }
+  }
+
+  CLG_(check_command)();
+
+  /* now check for thread switch */
+  CLG_(switch_thread)(tid);
+}
+
+void CLG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack)
+{
+  exec_state *es;
+
+  CLG_DEBUG(0, ">> pre_signal(TID %d, sig %d, alt_st %s)\n",
+            tid, sigNum, alt_stack ? "yes":"no");
+
+  /* switch to the thread the handler runs in */
+  CLG_(run_thread)(tid);
+
+  /* save current execution state */
+  exec_state_save();
+
+  /* setup current state for a spontaneous call */
+  CLG_(init_exec_state)( &CLG_(current_state) );
+  CLG_(push_cxt)(0);
+
+  /* setup new cxtinfo struct for this signal handler */
+  es = push_exec_state(sigNum);
+  CLG_(init_cost)( CLG_(sets).full, es->cost );
+  CLG_(current_state).cost = es->cost;
+  es->call_stack_bottom = CLG_(current_call_stack).sp;
+
+  CLG_(current_state).sig = sigNum;
+}
+
+/* Run post_signal() if the call-stack pointer is back at the bottom
+ * recorded for the current exec state (i.e. a signal handler is
+ * returning).
+ *
+ * Called from CLG_(pop_call_stack).
+ */
+void CLG_(run_post_signal_on_call_stack_bottom)()
+{
+  exec_state* es = top_exec_state();
+  CLG_ASSERT(es != 0);
+  CLG_ASSERT(CLG_(current_state).sig > 0);
+
+  if (CLG_(current_call_stack).sp == es->call_stack_bottom)
+    CLG_(post_signal)( CLG_(current_tid), CLG_(current_state).sig );
+}
+
+void CLG_(post_signal)(ThreadId tid, Int sigNum)
+{
+  exec_state* es;
+  UInt fn_number, *pactive;
+
+  CLG_DEBUG(0, ">> post_signal(TID %d, sig %d)\n",
+            tid, sigNum);
+
+  CLG_ASSERT(tid == CLG_(current_tid));
+  CLG_ASSERT(sigNum == CLG_(current_state).sig);
+
+  /* Unwind call stack of this signal handler.
+   * This should only be needed at finalisation time.
+   */
+  es = top_exec_state();
+  CLG_ASSERT(es != 0);
+  while(CLG_(current_call_stack).sp > es->call_stack_bottom)
+    CLG_(pop_call_stack)();
+
+  if (CLG_(current_state).cxt) {
+    /* correct active counts */
+    fn_number = CLG_(current_state).cxt->fn[0]->number;
+    pactive = CLG_(get_fn_entry)(fn_number);
+    (*pactive)--;
+    CLG_DEBUG(0, "  set active count of %s back to %d\n",
+              CLG_(current_state).cxt->fn[0]->name, *pactive);
+  }
+
+  if (CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom) {
+    /* set fn_stack_top back.
+     * top can point to 0 if nothing was executed in the signal handler;
+     * this can happen at the end when unwinding handlers.
+     */
+    if (*(CLG_(current_fn_stack).top) != 0) {
+      CLG_(current_fn_stack).top--;
+      CLG_ASSERT(*(CLG_(current_fn_stack).top) == 0);
+    }
+    if (CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom)
+      CLG_(current_fn_stack).top--;
+  }
+
+  /* sum up costs */
+  CLG_ASSERT(CLG_(current_state).cost == es->cost);
+  CLG_(add_and_zero_cost)( CLG_(sets).full,
+                           thread[CLG_(current_tid)]->sighandler_cost,
+                           CLG_(current_state).cost );
+
+  /* restore previous context */
+  es->sig = -1;
+  current_states.sp--;
+  es = top_exec_state();
+  CLG_(current_state).sig = es->sig;
+  exec_state_restore();
+
+  /* There is no way to reliably get the thread ID we are switching to
+   * after this handler returns. So we sync with the actual TID at the
+   * start of CLG_(setup_bb)(), which should be the next callgrind
+   * function to run.
+   */
+}
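pre_signal() and post_signal() bracket a handler's execution: entry records the call-stack depth in call_stack_bottom, and exit pops frames until the stack is back at that depth, so handler frames never leak into the interrupted context. A small sketch of that bracketing, not from the patch; the frame stack and helper names are hypothetical stand-ins for CLG_(current_call_stack) and the exec-state stack:

    #include <assert.h>
    #include <stdio.h>

    #define MAX_FRAMES   64
    #define MAX_HANDLERS  8

    static int call_sp = 0;             /* call-stack depth              */
    static int bottom[MAX_HANDLERS];    /* saved depth per handler level */
    static int es_sp = 0;               /* handler nesting level         */

    static void call(void) { assert(call_sp < MAX_FRAMES); call_sp++; }
    static void ret(void)  { assert(call_sp > 0);          call_sp--; }

    static void enter_handler(void)
    {
        bottom[es_sp++] = call_sp;      /* remember depth at entry */
    }

    static void leave_handler(void)
    {
        int b = bottom[--es_sp];
        while (call_sp > b)             /* unwind frames the handler left */
            ret();
    }

    int main(void)
    {
        call(); call();                 /* normal program: depth 2 */
        enter_handler();
        call();                         /* handler makes a call    */
        leave_handler();                /* unwinds back to depth 2 */
        printf("depth after handler: %d\n", call_sp);   /* prints 2 */
        return 0;
    }

Because handlers of one thread cannot be scheduled against each other, a simple array of saved bottoms indexed by nesting level suffices, which is exactly why the code below needs no extra stacks per handler.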
+
+
+
+/*------------------------------------------------------------*/
+/*--- Execution states in a thread & signal handlers       ---*/
+/*------------------------------------------------------------*/
+
+/* Each thread can be interrupted by a signal handler, and signal
+ * handlers themselves can be interrupted again. But as there is no
+ * scheduling among handlers of the same thread, we do not need
+ * additional stacks. Storing execution contexts and adding separators
+ * in the call stack (needed to not intermix normal and handler
+ * functions in contexts) is enough.
+ */
+
+/* not initialized: call_stack_bottom, sig */
+void CLG_(init_exec_state)(exec_state* es)
+{
+  es->collect = CLG_(clo).collect_atstart;
+  es->cxt  = 0;
+  es->jmps_passed = 0;
+  es->bbcc = 0;
+  es->nonskipped = 0;
+}
+
+
+static exec_state* new_exec_state(Int sigNum)
+{
+    exec_state* es;
+    es = (exec_state*) CLG_MALLOC(sizeof(exec_state));
+
+    /* allocate real cost space: needed as incremented by
+     * simulation functions */
+    es->cost = CLG_(get_eventset_cost)(CLG_(sets).full);
+    CLG_(init_cost)( CLG_(sets).full, es->cost );
+
+    CLG_(init_exec_state)(es);
+    es->sig = sigNum;
+    es->call_stack_bottom = 0;
+
+    return es;
+}
+
+void CLG_(init_exec_stack)(exec_stack* es)
+{
+  Int i;
+
+  /* The first element is for the main thread */
+  es->entry[0] = new_exec_state(0);
+  for(i=1;i<MAX_SIGHANDLERS;i++)
+    es->entry[i] = 0;
+  es->sp = 0;
+}
+
+void CLG_(copy_current_exec_stack)(exec_stack* dst)
+{
+  Int i;
+
+  dst->sp = current_states.sp;
+  for(i=0;i<MAX_SIGHANDLERS;i++)
+    dst->entry[i] = current_states.entry[i];
+}
+
+void CLG_(set_current_exec_stack)(exec_stack* dst)
+{
+  Int i;
+
+  current_states.sp = dst->sp;
+  for(i=0;i<MAX_SIGHANDLERS;i++)
+    current_states.entry[i] = dst->entry[i];
+}
+
+
+/* Get top context info struct of current thread */
+static
+exec_state* top_exec_state(void)
+{
+  Int sp = current_states.sp;
+  exec_state* es;
+
+  CLG_ASSERT((sp >= 0) && (sp < MAX_SIGHANDLERS));
+  es = current_states.entry[sp];
+  CLG_ASSERT(es != 0);
+  return es;
+}
+
+/* Allocates a free context info structure for a newly entered
+ * signal handler, putting it on the context stack.
+ * Returns a pointer to the structure.
+ */
+static exec_state* push_exec_state(int sigNum)
+{
+  Int sp;
+  exec_state* es;
+
+  current_states.sp++;
+  sp = current_states.sp;
+
+  CLG_ASSERT((sigNum > 0) && (sigNum <= _VKI_NSIG));
+  CLG_ASSERT((sp > 0) && (sp < MAX_SIGHANDLERS));
+  es = current_states.entry[sp];
+  if (!es) {
+    es = new_exec_state(sigNum);
+    current_states.entry[sp] = es;
+  }
+  else
+    es->sig = sigNum;
+
+  return es;
+}
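Note that push_exec_state() never frees a slot: an entry allocated at signal-nesting depth N is kept and reused the next time a handler nests that deep, so the exec state (and its cost space) is allocated at most MAX_SIGHANDLERS times per thread regardless of how many signals fire. A sketch of this allocate-once, reuse-forever stack-slot idiom, not from the patch; slot_t and its fields are illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_DEPTH 10

    typedef struct { int sig; long cost; } slot_t;   /* illustrative payload */

    static slot_t* entry[MAX_DEPTH];   /* zero-initialized: no slots yet */
    static int     sp = 0;
    static int     allocations = 0;

    static slot_t* push_slot(int sig)
    {
        slot_t* s;
        sp++;
        if (!(s = entry[sp])) {        /* first visit to this depth */
            s = calloc(1, sizeof(slot_t));
            entry[sp] = s;
            allocations++;
        }
        s->sig = sig;                  /* reinitialize on reuse */
        return s;
    }

    static void pop_slot(void) { sp--; }   /* slot stays allocated */

    int main(void)
    {
        int i;
        for (i = 0; i < 1000; i++) {   /* 1000 handler entries at depth 1 */
            push_slot(7);
            pop_slot();
        }
        printf("allocations: %d\n", allocations);   /* prints 1 */
        return 0;
    }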
"Yes": "No", es->jmps_passed); + CLG_(print_bbcc)(-9, es->bbcc, False); + CLG_(print_cost)(-9, CLG_(sets).full, es->cost); + } + + /* signal number does not need to be saved */ + CLG_ASSERT(CLG_(current_state).sig == es->sig); + + return es; +} + +static +exec_state* exec_state_restore(void) +{ + exec_state* es = top_exec_state(); + + CLG_(current_state).cxt = es->cxt; + CLG_(current_state).collect = es->collect; + CLG_(current_state).jmps_passed = es->jmps_passed; + CLG_(current_state).bbcc = es->bbcc; + CLG_(current_state).nonskipped = es->nonskipped; + CLG_(current_state).cost = es->cost; + CLG_(current_state).sig = es->sig; + + CLG_DEBUGIF(1) { + CLG_DEBUG(1, " exec_state_restore(sig %d): collect %s, jmps_passed %d\n", + es->sig, es->collect ? "Yes": "No", es->jmps_passed); + CLG_(print_bbcc)(-9, es->bbcc, False); + CLG_(print_cxt)(-9, es->cxt, 0); + CLG_(print_cost)(-9, CLG_(sets).full, es->cost); + } + + return es; +} + diff --git a/configure.in b/configure.in index 2491673370..f912b6bd95 100644 --- a/configure.in +++ b/configure.in @@ -697,6 +697,11 @@ AC_OUTPUT( cachegrind/tests/x86/Makefile cachegrind/docs/Makefile cachegrind/cg_annotate + callgrind/Makefile + callgrind/callgrind_annotate + callgrind/callgrind_control + callgrind/tests/Makefile + callgrind/docs/Makefile helgrind/Makefile helgrind/tests/Makefile helgrind/docs/Makefile -- 2.47.2