From fcc45a9e8580826d539fcc4c60cd26b57954da46 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <njn@valgrind.org>
Date: Mon, 29 Apr 2002 16:03:24 +0000
Subject: [PATCH] New files:   - vg_cachesim.c   - vg_cachesim_{I1,D1,L2}.c   -
 vg_annotate.in   - vg_cachegen.in

Changes to existing files:

  - valgrind/valgrind.in, added option:

        --cachesim=no|yes       [no]

  - Makefile/Makefile.am:
        * added vg_cachesim.c to valgrind_so_SOURCES var
        * added vg_cachesim_I1.c, vg_cachesim_D1.c, vg_cachesim_L2.c to
          noinst_HEADERS var
        * added vg_annotate, vg_cachegen to 'bin_SCRIPTS' var, and added empty
          targets for them

  - vg_main.c:
        * added two offsets for cache sim functions (put in positions 17a,17b)
        * added option handling (detection of --cachesim=yes which turns off of
          --instrument);
        * added calls to cachesim initialisation/finalisation functions

  - vg_mylibc: added some system call wrappers (for chmod, open_write, etc) for
    file writing

  - vg_symtab2.c:
        * allow it to read symbols if either of --instrument or --cachesim is
          used
        * made vg_symtab2.c:vg_what_{line,fn}_is_this extern, renaming it as
          VG_(what_line_is_this) (and added to vg_include.h)
        * completely rewrote the read loop in vg_read_lib_symbols, fixing
          several bugs.  Much better now, although probably not perfect.  It's
          also relatively fragile -- I'm using the "die immediately if anything
          unexpected happens" approach.

  - vg_to_ucode.c:
        * in VG_(disBB), patching in x86 instruction size into extra4b field of
          JMP instructions at the end of basic blocks if --cachesim=yes.
          Shifted things around to do this;  also had to fiddle around with
          single-step stuff to get this to work, by not sticking extra JMPs on
          the end of the single-instruction block if there was already one
          there (to avoid breaking an assertion in vg_cachesim.c).  Did a
          similar thing to avoid an extra JMP on huge basic blocks that are
          split.

  - vg_translate.c:
        * if --cachesim=yes call the cachesim instrumentation phase
        * made some functions extern and renamed:
                allocCodeBlock() --> VG_(allocCodeBlock)()
                freeCodeBlock()  --> VG_(freeCodeBlock)()
                copyUInstr()     --> VG_(copyUInstr)()
          (added to vg_include.h too)

  - vg_include.c: declared
        * cachesim offsets
        * exports of vg_cachesim.c
        * added four new profiling events (increasing VGP_M_CCS to 24 -- I kept
          the spare ones)
        * added comment about UInstr.extra4b field being used for instr size in
          JMPs for cache simulation

  - docs/manual.html:
        * Added --cachesim option to section 2.5.
        * Added cache profiling stuff as section 7.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@168
---
 Makefile.am                 |   11 +-
 addrcheck/Makefile.am       |   11 +-
 cachegrind/Makefile.am      |   11 +-
 cachegrind/cg_annotate.in   |  892 +++++++++++++++++++++++++++++
 cachegrind/cg_main.c        | 1068 +++++++++++++++++++++++++++++++++++
 cachegrind/cg_sim_D1.c      |   93 +++
 cachegrind/cg_sim_I1.c      |   93 +++
 cachegrind/cg_sim_L2.c      |  250 ++++++++
 cachegrind/docs/manual.html |  636 ++++++++++++++++++++-
 corecheck/Makefile.am       |   11 +-
 coregrind/Makefile.am       |   11 +-
 coregrind/docs/manual.html  |  636 ++++++++++++++++++++-
 coregrind/valgrind.in       |    3 +
 coregrind/vg_include.h      |   42 +-
 coregrind/vg_main.c         |   29 +-
 coregrind/vg_mylibc.c       |   42 ++
 coregrind/vg_symtab2.c      |  196 ++++---
 coregrind/vg_to_ucode.c     |   65 ++-
 coregrind/vg_translate.c    |   95 ++--
 docs/manual.html            |  636 ++++++++++++++++++++-
 helgrind/Makefile.am        |   11 +-
 lackey/Makefile.am          |   11 +-
 memcheck/Makefile.am        |   11 +-
 memcheck/docs/manual.html   |  636 ++++++++++++++++++++-
 none/Makefile.am            |   11 +-
 valgrind.in                 |    3 +
 vg_annotate.in              |  892 +++++++++++++++++++++++++++++
 vg_cachegen.in              |  300 ++++++++++
 vg_cachesim.c               | 1068 +++++++++++++++++++++++++++++++++++
 vg_cachesim_D1.c            |   93 +++
 vg_cachesim_I1.c            |   93 +++
 vg_cachesim_L2.c            |  250 ++++++++
 vg_include.h                |   42 +-
 vg_main.c                   |   29 +-
 vg_mylibc.c                 |   42 ++
 vg_symtab2.c                |  196 ++++---
 vg_to_ucode.c               |   65 ++-
 vg_translate.c              |   95 ++--
 38 files changed, 8375 insertions(+), 304 deletions(-)
 create mode 100644 cachegrind/cg_annotate.in
 create mode 100644 cachegrind/cg_main.c
 create mode 100644 cachegrind/cg_sim_D1.c
 create mode 100644 cachegrind/cg_sim_I1.c
 create mode 100644 cachegrind/cg_sim_L2.c
 create mode 100644 vg_annotate.in
 create mode 100755 vg_cachegen.in
 create mode 100644 vg_cachesim.c
 create mode 100644 vg_cachesim_D1.c
 create mode 100644 vg_cachesim_I1.c
 create mode 100644 vg_cachesim_L2.c

diff --git a/Makefile.am b/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/addrcheck/Makefile.am b/addrcheck/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/addrcheck/Makefile.am
+++ b/addrcheck/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/cachegrind/Makefile.am b/cachegrind/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/cachegrind/Makefile.am
+++ b/cachegrind/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/cachegrind/cg_annotate.in b/cachegrind/cg_annotate.in
new file mode 100644
index 0000000000..b3b091cc23
--- /dev/null
+++ b/cachegrind/cg_annotate.in
@@ -0,0 +1,892 @@
+#! /usr/bin/perl -w
+##--------------------------------------------------------------------##
+##--- The cache simulation framework: instrumentation, recording   ---##
+##--- and results printing.                                        ---##
+##---                                                  vg_annotate ---##
+##--------------------------------------------------------------------##
+
+#  This file is part of Valgrind, an x86 protected-mode emulator 
+#  designed for debugging and profiling binaries on x86-Unixes.
+#
+#  Copyright (C) 2000-2002 Julian Seward 
+#     jseward@acm.org
+#     Julian_Seward@muraroa.demon.co.uk
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU General Public License as
+#  published by the Free Software Foundation; either version 2 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+#  02111-1307, USA.
+#
+#  The GNU General Public License is contained in the file LICENSE.
+
+#----------------------------------------------------------------------------
+# Annotator for valgrind --cachesim=yes.
+#
+# Input file has the following format:
+# <file>         ::= <desc_line>* <cmd_line> <events_line> 
+#                    <data_line>+ <summary_line>
+# <desc_line>    ::= "desc:" <ws>? <non_nl_string>
+# <cmd_line>     ::= "cmd:" <ws>? <cmd>
+# <events_line>  ::= "events:" <ws>? (<event> <ws>)+
+# <data_line>    ::= <file_line> | <fn_line> | <count_line>
+# <file_line>    ::= ("fl=" | "fi=" | "fe=" ) <filename>
+# <fn_line>      ::= "fn="<filename>
+# <count_line>   ::= <line_num> <ws>? (<count> <ws>)+
+# <summary_line> ::= "summary:" <ws>? (<count> <ws>)+
+# <count>        ::= <num> | "."
+#
+# where
+#
+# <non_nl_string> is any string not containing a newline
+# <cmd> is a command line invocation
+# <filename> and <fn_name> can be anything 
+# <num> and <line_num> are decimal numbers
+# <ws> is whitespace.
+# <nl> is a newline
+#
+# The contents of the "desc:" lines is printed out at the top of the summary.
+# This is a generic way of providing simulation specific information, eg. for
+# giving the cache configuration for cache simulation.
+#
+# Counts can be ".", to represent "N/A".
+#
+# The number of counts in each <line> and the <summary_line> should not exceed
+# the number of events in the <event_line>.  If the number in each <line> is
+# less, we use "." for the the missing counts (the last however-many).
+#
+# A <file_line> changes the current file name.  A <fn_line> changes the current
+# function name.  A <count_line> contains counts that pertain to the current
+# filename/fn_name.  A "fn=" <file_line> and a <fn_line> must appear before any
+# <count_line>s to give the context of the first <count_line>s.
+#
+# Each <file_line> should be immediately followed by a <fn_line>.  "fi="
+# <file_lines> are used to switch filenames for inlined functions;  "fe="
+# <file_lines> are similar, but are put at the end of a basic block in which
+# the file name hasn't been switched back to the original file name.  (fi and
+# fe lines behave the same, they are only distinguished to help debugging.)
+#
+#----------------------------------------------------------------------------
+# Performance improvements record, using cachegrind.out for cacheprof, doing no
+# source annotation (irrelevant ones removed):
+#                                                               user time
+# 1. turned off warnings in add_hash_a_to_b()                   3.81 --> 3.48s
+#    [now add_array_a_to_b()]
+# 6. make line_to_CC() return a ref instead of a hash           3.01 --> 2.77s
+#
+#10. changed file format to avoid file/fn name repetition       2.40s
+#    (not sure why higher;  maybe due to new '.' entries?)
+#11. changed file format to drop unnecessary end-line "."s      2.36s
+#    (shrunk file by about 37%)
+#12. switched from hash CCs to array CCs                        1.61s
+#13. only adding b[i] to a[i] if b[i] defined (was doing it if
+#    either a[i] or b[i] was defined, but if b[i] was undefined
+#    it just added 0)                                           1.48s
+#14. Stopped converting "." entries to undef and then back      1.16s
+#15. Using foreach $i (x..y) instead of for ($i = 0...) in
+#    add_array_a_to_b()                                         1.11s
+#
+# Auto-annotating primes:
+#16. Finding count lengths by int((length-1)/3), not by
+#    commifying (halves the number of commify calls)            1.68s --> 1.47s
+
+use strict;
+
+#----------------------------------------------------------------------------
+# Overview: the running example in the comments is for:
+#   - events = A,B,C,D
+#   - --show=C,A,D
+#   - --sort=D,C
+#----------------------------------------------------------------------------
+
+#----------------------------------------------------------------------------
+# Global variables, main data structures
+#----------------------------------------------------------------------------
+# CCs are arrays, the counts corresponding to @events, with 'undef'
+# representing '.'.  This makes things fast (faster than using hashes for CCs)
+# but we have to use @sort_order and @show_order below to handle the --sort and
+# --show options, which is a bit tricky.
+#----------------------------------------------------------------------------
+
+# Total counts for summary (an array reference).
+my $summary_CC;
+
+# Totals for each function, for overall summary.
+# hash(filename:fn_name => CC array)
+my %fn_totals;
+
+# Individual CCs, organised by filename and line_num for easy annotation.
+# hash(filename => hash(line_num => CC array))
+my %all_ind_CCs;
+
+# Files chosen for annotation on the command line.  
+# key = basename (trimmed of any directory), value = full filename
+my %user_ann_files;
+
+# Generic description string.
+my $desc = "";
+
+# Command line of profiled program.
+my $cmd;
+
+# Events in input file, eg. (A,B,C,D)
+my @events;
+
+# Events to show, from command line, eg. (C,A,D)
+my @show_events;
+
+# Map from @show_events indices to @events indices, eg. (2,0,3).  Gives the
+# order in which we must traverse @events in order to show the @show_events, 
+# eg. (@events[$show_order[1]], @events[$show_order[2]]...) = @show_events.
+# (Might help to think of it like a hash (0 => 2, 1 => 0, 2 => 3).)
+my @show_order;
+
+# Print out the function totals sorted by these events, eg. (D,C).
+my @sort_events;
+
+# Map from @sort_events indices to @events indices, eg. (3,2).  Same idea as
+# for @show_order
+my @sort_order;
+
+# Threshold;  whatever event is the primary sort, we print out functions
+# representing more than this proportion of 'event' events.
+my $threshold = 99;
+
+# If on, automatically annotates all files that are involved in getting over
+# the threshold count of the primary sort event.
+my $auto_annotate = 0;
+
+# Number of lines to show around each annotated line.
+my $context = 8;
+
+# Directories in which to look for annotation files.
+my @include_dirs = ("");
+
+# Input file name
+my $input_file = "cachegrind.out";
+
+# Version number
+my $version = "@VERSION@";
+
+# Usage message.
+my $usage = <<END
+usage: vg_annotate [options] [source-files]
+
+  options for the user, with defaults in [ ], are:
+    -h --help             show this message
+    -v --version          show version
+    --show=A,B,C          only show figures for events A,B,C [all]
+    --sort=A,B,C          sort columns by events A,B,C [event column order]
+    --threshold=<0--100>  percentage of counts (of primary sort event) we
+                          are interested in [$threshold%]
+    --auto=yes|no         annotate all source files containing functions
+                          that helped reach the event count threshold [no]
+    --context=N           print N lines of context before and after
+                          annotated lines [8]
+    -I --include=<dir>    add <dir> to list of directories to search for 
+                          source files
+
+  Valgrind is Copyright (C) 2000-2002 Julian Seward
+  and licensed under the GNU General Public License, version 2.
+  Bug reports, feedback, admiration, abuse, etc, to: jseward\@acm.org.
+
+END
+;
+
+# Used in various places of output.
+my $fancy = '-' x 80 . "\n";
+
+#-----------------------------------------------------------------------------
+# Argument and option handling
+#-----------------------------------------------------------------------------
+sub process_cmd_line() 
+{
+    for my $arg (@ARGV) { 
+
+        # Option handling
+        if ($arg =~ /^-/) {
+
+            # --version
+            if ($arg =~ /^-v$|^--version$/) {
+                die("vg_annotate$version\n");
+
+            # --show=A,B,C
+            } elsif ($arg =~ /^--show=(.*)$/) {
+                @show_events = split(/,/, $1);
+
+            # --sort=A,B,C
+            } elsif ($arg =~ /^--sort=(.*)$/) {
+                @sort_events = split(/,/, $1);
+
+            # --threshold=X (tolerates a trailing '%')
+            } elsif ($arg =~ /^--threshold=([\d\.]+)%?$/) {
+                $threshold = $1;
+                if ($threshold < 0 || $threshold > 100) {
+                    die($usage);
+                }
+
+            # --auto=yes|no
+            } elsif ($arg =~ /^--auto=(yes|no)$/) {
+                $auto_annotate = 1 if ($1 eq "yes");
+                $auto_annotate = 0 if ($1 eq "no");
+
+            # --context=N
+            } elsif ($arg =~ /^--context=([\d\.]+)$/) {
+                $context = $1;
+                if ($context < 0) {
+                    die($usage);
+                }
+
+            # --include=A,B,C
+            } elsif ($arg =~ /^(-I|--include)=(.*)$/) {
+                my $inc = $2;
+                $inc =~ s|/$||;         # trim trailing '/'
+                push(@include_dirs, "$inc/");
+
+            } else {            # -h and --help fall under this case
+                die($usage);
+            }
+
+        # Argument handling -- annotation file checking and selection.
+        # Stick filenames into a hash for quick 'n easy lookup throughout
+        } else {
+            my $readable = 0;
+            foreach my $include_dir (@include_dirs) {
+                if (-r $include_dir . $arg) {
+                    $readable = 1;
+                }
+            }
+            $readable or die("File $arg not found in any of: @include_dirs\n");
+            $user_ann_files{$arg} = 1;
+        } 
+    }
+}
+
+#-----------------------------------------------------------------------------
+# Reading of input file
+#-----------------------------------------------------------------------------
+sub max ($$) 
+{
+    my ($x, $y) = @_;
+    return ($x > $y ? $x : $y);
+}
+
+# Add the two arrays;  any '.' entries are ignored.  Two tricky things:
+# 1. If $a2->[$i] is undefined, it defaults to 0 which is what we want; we turn
+#    off warnings to allow this.  This makes things about 10% faster than
+#    checking for definedness ourselves.
+# 2. We don't add a ".", even though it's value is 0, because we don't want to
+#    make an $a2->[$i] that is undef become 0 unnecessarily.
+sub add_array_a_to_b ($$) 
+{
+    my ($a1, $a2) = @_;
+
+    my $n = max(scalar @$a1, scalar @$a2);
+    $^W = 0;
+    foreach my $i (0 .. $n-1) {
+        $a2->[$i] += $a1->[$i] if ("." ne $a1->[$i]);
+    }
+    $^W = 1;
+}
+
+# Add each event count to the CC array.  '.' counts become undef, as do
+# missing entries (implicitly).
+sub line_to_CC ($)
+{
+    my @CC = (split /\s+/, $_[0]);
+    (@CC <= @events) or die("Line $.: too many event counts\n");
+    return \@CC;
+}
+
+sub read_input_file() 
+{
+    open(INPUTFILE, "< $input_file") || die "File $input_file not opened\n";
+
+    # Read "desc:" lines.
+    my $line;
+    # This gives a "uninitialized value in substitution (s///)" warning; hmm...
+    #while ($line = <INPUTFILE> && $line =~ s/desc:\s+//) {
+    #    $desc .= "$line\n";
+    #}
+    while (1) {
+        $line = <INPUTFILE>;
+        if ($line =~ s/desc:\s+//) {
+            $desc .= $line;
+        } else {
+            last;
+        }
+    }
+
+    # Read "cmd:" line (Nb: will already be in $line from "desc:" loop above).
+    ($line =~ s/cmd:\s+//) or die("Line $.: missing command line\n");
+    $cmd = $line;
+    chomp($cmd);    # Remove newline
+
+    # Read "events:" line.  We make a temporary hash in which the Nth event's
+    # value is N, which is useful for handling --show/--sort options below.
+    $line = <INPUTFILE>;
+    ($line =~ s/events:\s+//) or die("Line $.: missing events line\n");
+    @events = split(/\s+/, $line);
+    my %events;
+    my $n = 0;
+    foreach my $event (@events) {
+        $events{$event} = $n;
+        $n++
+    }
+
+    # If no --show arg give, default to showing all events in the file.
+    # If --show option is used, check all specified events appeared in the
+    # "events:" line.  Then initialise @show_order.
+    if (@show_events) {
+        foreach my $show_event (@show_events) {
+            (defined $events{$show_event}) or 
+                die("--show event `$show_event' did not appear in input\n");
+        }
+    } else {
+        @show_events = @events;
+    }
+    foreach my $show_event (@show_events) {
+        push(@show_order, $events{$show_event});
+    }
+
+    # Do as for --show, but if no --sort arg given, default to sorting by
+    # column order (ie. first column event is primary sort key, 2nd column is
+    # 2ndary key, etc).
+    if (@sort_events) {
+        foreach my $sort_event (@sort_events) {
+            (defined $events{$sort_event}) or 
+                die("--sort event `$sort_event' did not appear in input\n");
+        }
+    } else {
+        @sort_events = @events;
+    }
+    foreach my $sort_event (@sort_events) {
+        push(@sort_order, $events{$sort_event});
+    }
+
+    my $curr_file;
+    my $curr_fn;
+    my $curr_name;
+
+    my $curr_fn_CC = [];
+    my $curr_file_ind_CCs = {};     # hash(line_num => CC)
+
+    # Read body of input file.
+    while (<INPUTFILE>) {
+        s/#.*$//;   # remove comments
+        if (s/^(\d+)\s+//) {
+            my $line_num = $1;
+            my $CC = line_to_CC($_);
+            add_array_a_to_b($CC, $curr_fn_CC);
+            
+            # If curr_file is selected, add CC to curr_file list.  We look for
+            # full filename matches;  or, if auto-annotating, we have to
+            # remember everything -- we won't know until the end what's needed.
+            if ($auto_annotate || defined $user_ann_files{$curr_file}) {
+                my $tmp = $curr_file_ind_CCs->{$line_num};
+                $tmp = [] unless defined $tmp;
+                add_array_a_to_b($CC, $tmp);
+                $curr_file_ind_CCs->{$line_num} = $tmp;
+            }
+
+        } elsif (s/^fn=(.*)$//) {
+            # Commit result from previous function
+            $fn_totals{$curr_name} = $curr_fn_CC if (defined $curr_name);
+
+            # Setup new one
+            $curr_fn = $1;
+            $curr_name = "$curr_file:$curr_fn";
+            $curr_fn_CC = $fn_totals{$curr_name};
+            $curr_fn_CC = [] unless (defined $curr_fn_CC);
+
+        } elsif (s/^fl=(.*)$//) {
+            $all_ind_CCs{$curr_file} = $curr_file_ind_CCs 
+                if (defined $curr_file);
+
+            $curr_file = $1;
+            $curr_file_ind_CCs = $all_ind_CCs{$curr_file};
+            $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs);
+
+        } elsif (s/^(fi|fe)=(.*)$//) {
+            (defined $curr_name) or die("Line $.: Unexpected fi/fe line\n");
+            $fn_totals{$curr_name} = $curr_fn_CC;
+            $all_ind_CCs{$curr_file} = $curr_file_ind_CCs;
+
+            $curr_file = $2;
+            $curr_name = "$curr_file:$curr_fn";
+            $curr_file_ind_CCs = $all_ind_CCs{$curr_file};
+            $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs);
+            $curr_fn_CC = $fn_totals{$curr_name};
+            $curr_fn_CC = [] unless (defined $curr_fn_CC);
+
+        } elsif (s/^\s*$//) {
+            # blank, do nothing
+        
+        } elsif (s/^summary:\s+//) {
+            # Finish up handling final filename/fn_name counts
+            $fn_totals{"$curr_file:$curr_fn"} = $curr_fn_CC 
+                if (defined $curr_file && defined $curr_fn);
+            $all_ind_CCs{$curr_file} = 
+                $curr_file_ind_CCs if (defined $curr_file);
+
+            $summary_CC = line_to_CC($_);
+            (scalar(@$summary_CC) == @events) 
+                or die("Line $.: summary event and total event mismatch\n");
+
+        } else {
+            warn("WARNING: line $. malformed, ignoring\n");
+        }
+    }
+
+    # Check if summary line was present
+    if (not defined $summary_CC) {
+        warn("WARNING: missing final summary line, no summary will be printed\n");
+    }
+
+    close(INPUTFILE);
+}
+
+#-----------------------------------------------------------------------------
+# Print options used
+#-----------------------------------------------------------------------------
+sub print_options ()
+{
+    print($fancy);
+    print($desc);
+    print("Command:          $cmd\n");
+    print("Events recorded:  @events\n");
+    print("Events shown:     @show_events\n");
+    print("Event sort order: @sort_events\n");
+    print("Threshold:        $threshold%\n");
+
+    my @include_dirs2 = @include_dirs;  # copy @include_dirs
+    shift(@include_dirs2);       # remove "" entry, which is always the first
+    unshift(@include_dirs2, "") if (0 == @include_dirs2); 
+    my $include_dir = shift(@include_dirs2);
+    print("Include dirs:     $include_dir\n");
+    foreach my $include_dir (@include_dirs2) {
+        print("                  $include_dir\n");
+    }
+
+    my @user_ann_files = keys %user_ann_files;
+    unshift(@user_ann_files, "") if (0 == @user_ann_files); 
+    my $user_ann_file = shift(@user_ann_files);
+    print("User annotated:   $user_ann_file\n");
+    foreach $user_ann_file (@user_ann_files) {
+        print("                  $user_ann_file\n");
+    }
+
+    my $is_on = ($auto_annotate ? "on" : "off");
+    print("Auto-annotation:  $is_on\n");
+    print("\n");
+}
+
+#-----------------------------------------------------------------------------
+# Print summary and sorted function totals
+#-----------------------------------------------------------------------------
+sub mycmp ($$) 
+{
+    my ($c, $d) = @_;
+
+    # Iterate through sort events (eg. 3,2); return result if two are different
+    foreach my $i (@sort_order) {
+        my ($x, $y);
+        $x = $c->[$i];
+        $y = $d->[$i];
+        $x = -1 unless defined $x;
+        $y = -1 unless defined $y;
+
+        my $cmp = $y <=> $x;        # reverse sort
+        if (0 != $cmp) {
+            return $cmp;
+        }
+    }
+    # Exhausted events, equal
+    return 0;
+}
+
+sub commify ($) {
+    my ($val) = @_;
+    1 while ($val =~ s/^(\d+)(\d{3})/$1,$2/);
+    return $val;
+}
+
+# Because the counts can get very big, and we don't want to waste screen space
+# and make lines too long, we compute exactly how wide each column needs to be
+# by finding the widest entry for each one.
+sub compute_CC_col_widths (@) 
+{
+    my @CCs = @_;
+    my $CC_col_widths = [];
+
+    # Initialise with minimum widths (from event names)
+    foreach my $event (@events) {
+        push(@$CC_col_widths, length($event));
+    }
+    
+    # Find maximum width count for each column.  @CC_col_width positions
+    # correspond to @CC positions.
+    foreach my $CC (@CCs) {
+        foreach my $i (0 .. scalar(@$CC)-1) {
+            if (defined $CC->[$i]) {
+                # Find length, accounting for commas that will be added
+                my $length = length $CC->[$i];
+                my $clength = $length + int(($length - 1) / 3);
+                $CC_col_widths->[$i] = max($CC_col_widths->[$i], $clength); 
+            }
+        }
+    }
+    return $CC_col_widths;
+}
+
+# Print the CC with each column's size dictated by $CC_col_widths.
+sub print_CC ($$) 
+{
+    my ($CC, $CC_col_widths) = @_;
+
+    foreach my $i (@show_order) {
+        my $count = (defined $CC->[$i] ? commify($CC->[$i]) : ".");
+        my $space = ' ' x ($CC_col_widths->[$i] - length($count));
+        print("$space$count ");
+    }
+}
+
+sub print_events ($)
+{
+    my ($CC_col_widths) = @_;
+
+    foreach my $i (@show_order) { 
+        my $event       = $events[$i];
+        my $event_width = length($event);
+        my $col_width   = $CC_col_widths->[$i];
+        my $space       = ' ' x ($col_width - $event_width);
+        print("$event$space ");
+    }
+}
+
+# Prints summary and function totals (with separate column widths, so that
+# function names aren't pushed over unnecessarily by huge summary figures).
+# Also returns a hash containing all the files that are involved in getting the
+# events count above the threshold (ie. all the interesting ones).
+sub print_summary_and_fn_totals ()
+{
+    my @fn_fullnames = keys   %fn_totals;
+
+    # Work out the size of each column for printing (summary and functions
+    # separately).
+    my $summary_CC_col_widths = compute_CC_col_widths($summary_CC);
+    my      $fn_CC_col_widths = compute_CC_col_widths(values %fn_totals);
+
+    # Header and counts for summary
+    print($fancy);
+    print_events($summary_CC_col_widths);
+    print("\n");
+    print($fancy);
+    print_CC($summary_CC, $summary_CC_col_widths);
+    print(" PROGRAM TOTALS\n");
+    print("\n");
+
+    # Header for functions
+    print($fancy);
+    print_events($fn_CC_col_widths);
+    print(" file:function\n");
+    print($fancy);
+
+    # Sort function names into order dictated by --sort option.
+    @fn_fullnames = sort {
+        mycmp($fn_totals{$a}, $fn_totals{$b})
+    } @fn_fullnames;
+
+    # The thresholded event is the one that is the primary sort event.
+    my $threshold_files       = {};
+    my $threshold_event_index = $sort_order[0];
+    my $threshold_total       = $summary_CC->[$threshold_event_index];
+    my $curr_total            = 0;
+
+    # Print functions, stopping when the threshold has been reached.
+    foreach my $fn_name (@fn_fullnames) {
+
+        # Stop when we've reached the threshold
+        last if ($curr_total * 100 / $threshold_total >= $threshold);
+
+        # Print function results
+        my $fn_CC = $fn_totals{$fn_name};
+        print_CC($fn_CC, $fn_CC_col_widths);
+        print(" $fn_name\n");
+
+        # Update the threshold counting
+        my $filename = $fn_name;
+        $filename =~ s/:[^:]+$//;    # remove function name
+        $threshold_files->{$filename} = 1;
+        $curr_total += $fn_CC->[$threshold_event_index] 
+            if (defined $fn_CC->[$threshold_event_index]);
+    }
+    print("\n");
+
+    return $threshold_files;
+}
+
+#-----------------------------------------------------------------------------
+# Annotate selected files
+#-----------------------------------------------------------------------------
+
+# Issue a warning that the source file is more recent than the input file. 
+sub warning_on_src_more_recent_than_inputfile ($)
+{
+    my $src_file = $_[0];
+
+    my $warning = <<END
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ Source file '$src_file' is more recent than input file '$input_file'.
+@ Annotations may not be correct.
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+END
+;
+    print($warning);
+}
+
+# If there is information about lines not in the file, issue a warning
+# explaining possible causes.
+sub warning_on_nonexistent_lines ($$$)
+{
+    my ($src_more_recent_than_inputfile, $src_file, $excess_line_nums) = @_;
+    my $cause_and_solution;
+
+    if ($src_more_recent_than_inputfile) {
+        $cause_and_solution = <<END
+@@ cause:    '$src_file' has changed since information was gathered.
+@@           If so, a warning will have already been issued about this.
+@@ solution: Recompile program and rerun under "valgrind --cachesim=yes" to 
+@@           gather new information.
+END
+    # We suppress warnings about .h files
+    } elsif ($src_file =~ /\.h$/) {
+        $cause_and_solution = <<END
+@@ cause:    bug in the Valgrind's debug info reader that screws up with .h
+@@           files sometimes
+@@ solution: none, sorry
+END
+    } else {
+        $cause_and_solution = <<END
+@@ cause:    not sure, sorry
+END
+    }
+
+    my $warning = <<END
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@
+@@ Information recorded about lines past the end of '$src_file'.
+@@
+@@ Probable cause and solution:
+$cause_and_solution@@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+END
+;
+    print($warning);
+}
+
+sub annotate_ann_files($)
+{
+    my ($threshold_files) = @_; 
+
+    my %all_ann_files;
+    my @unfound_auto_annotate_files;
+    my $printed_totals_CC = [];
+
+    # If auto-annotating, add interesting files (but not "???")
+    if ($auto_annotate) {
+        delete $threshold_files->{"???"};
+        %all_ann_files = (%user_ann_files, %$threshold_files) 
+    } else {
+        %all_ann_files = %user_ann_files;
+    }
+
+    # Track if we did any annotations.
+    my $did_annotations = 0;
+
+    LOOP:
+    foreach my $src_file (keys %all_ann_files) {
+
+        my $opened_file = "";
+        my $full_file_name = "";
+        foreach my $include_dir (@include_dirs) {
+            my $try_name = $include_dir . $src_file;
+            if (open(INPUTFILE, "< $try_name")) {
+                $opened_file    = $try_name;
+                $full_file_name = ($include_dir eq "" 
+                                  ? $src_file 
+                                  : "$include_dir + $src_file"); 
+                last;
+            }
+        }
+        
+        if (not $opened_file) {
+            # Failed to open the file.  If chosen on the command line, die.
+            # If arose from auto-annotation, print a little message.
+            if (defined $user_ann_files{$src_file}) {
+                die("File $src_file not opened in any of: @include_dirs\n");
+
+            } else {
+                push(@unfound_auto_annotate_files, $src_file);
+            }
+
+        } else {
+            # File header (distinguish between user- and auto-selected files).
+            print("$fancy");
+            my $ann_type = 
+                (defined $user_ann_files{$src_file} ? "User" : "Auto");
+            print("-- $ann_type-annotated source: $full_file_name\n");
+            print("$fancy");
+
+            # Get file's CCs
+            my $src_file_CCs = $all_ind_CCs{$src_file};
+            if (!defined $src_file_CCs) {
+                print("  No information has been collected for $src_file\n\n");
+                next LOOP;
+            }
+        
+            $did_annotations = 1;
+            
+            # Numeric, not lexicographic sort!
+            my @line_nums = sort {$a <=> $b} keys %$src_file_CCs;  
+
+            # If $src_file more recent than cachegrind.out, issue warning
+            my $src_more_recent_than_inputfile = 0;
+            if ((stat $opened_file)[9] > (stat $input_file)[9]) {
+                $src_more_recent_than_inputfile = 1;
+                warning_on_src_more_recent_than_inputfile($src_file);
+            }
+
+            # Work out the size of each column for printing
+            my $CC_col_widths = compute_CC_col_widths(values %$src_file_CCs);
+
+            # Events header
+            print_events($CC_col_widths);
+            print("\n\n");
+
+            # Shift out 0 if it's in the line numbers (from unknown entries,
+            # likely due to bugs in Valgrind's stabs debug info reader)
+            shift(@line_nums) if (0 == $line_nums[0]);
+
+            # Finds interesting line ranges -- all lines with a CC, and all
+            # lines within $context lines of a line with a CC.
+            my $n = @line_nums;
+            my @pairs;
+            for (my $i = 0; $i < $n; $i++) {
+                push(@pairs, $line_nums[$i] - $context);   # lower marker
+                while ($i < $n-1 && 
+                       $line_nums[$i] + 2*$context >= $line_nums[$i+1]) {
+                    $i++;
+                }
+                push(@pairs, $line_nums[$i] + $context);   # upper marker
+            }
+
+            # Annotate chosen lines, tracking total counts of lines printed
+            $pairs[0] = 1 if ($pairs[0] < 1);
+            while (@pairs) {
+                my $low  = shift @pairs;
+                my $high = shift @pairs;
+                while ($. < $low-1) {
+                    my $tmp = <INPUTFILE>;
+                    last unless (defined $tmp);     # hack to detect EOF
+                }
+                my $src_line;
+                # Print line number, unless start of file
+                print("-- line $low " . '-' x 40 . "\n") if ($low != 1);
+                while (($. < $high) && ($src_line = <INPUTFILE>)) {
+                    if (defined $line_nums[0] && $. == $line_nums[0]) {
+                        print_CC($src_file_CCs->{$.}, $CC_col_widths);
+                        add_array_a_to_b($src_file_CCs->{$.}, 
+                                         $printed_totals_CC);
+                        shift(@line_nums);
+
+                    } else {
+                        print_CC( [], $CC_col_widths);
+                    }
+
+                    print(" $src_line");
+                }
+                # Print line number, unless EOF
+                if ($src_line) {
+                    print("-- line $high " . '-' x 40 . "\n");
+                } else {
+                    last;
+                }
+            }
+
+            # If there was info on lines past the end of the file...
+            if (@line_nums) {
+                foreach my $line_num (@line_nums) {
+                    print_CC($src_file_CCs->{$line_num}, $CC_col_widths);
+                    print(" <bogus line $line_num>\n");
+                }
+                print("\n");
+                warning_on_nonexistent_lines($src_more_recent_than_inputfile,
+                                             $src_file, \@line_nums);
+            }
+            print("\n");
+
+            # Print summary of counts attributed to file but not to any
+            # particular line (due to incomplete debug info).
+            if ($src_file_CCs->{0}) {
+                print_CC($src_file_CCs->{0}, $CC_col_widths);
+                print(" <counts for unidentified lines in $src_file>\n\n");
+            }
+            
+            close(INPUTFILE);
+        }
+    }
+
+    # Print list of unfound auto-annotate selected files.
+    if (@unfound_auto_annotate_files) {
+        print("$fancy");
+        print("The following files chosen for auto-annotation could not be found:\n");
+        print($fancy);
+        foreach my $f (@unfound_auto_annotate_files) {
+            print("  $f\n");
+        }
+        print("\n");
+    }
+
+    # If we did any annotating, print what proportion of events were covered by
+    # annotated lines above.
+    if ($did_annotations) {
+        my $percent_printed_CC;
+        foreach (my $i = 0; $i < @$summary_CC; $i++) {
+            $percent_printed_CC->[$i] = 
+                sprintf("%.0f", 
+                        $printed_totals_CC->[$i] / $summary_CC->[$i] * 100);
+        }
+        my $pp_CC_col_widths = compute_CC_col_widths($percent_printed_CC);
+        print($fancy);
+        print_events($pp_CC_col_widths);
+        print("\n");
+        print($fancy);
+        print_CC($percent_printed_CC, $pp_CC_col_widths);
+        print(" percentage of events annotated\n\n");
+    }
+}
+
+#----------------------------------------------------------------------------
+# "main()"
+#----------------------------------------------------------------------------
+process_cmd_line();
+read_input_file();
+print_options();
+my $threshold_files = print_summary_and_fn_totals();
+annotate_ann_files($threshold_files);
+
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
new file mode 100644
index 0000000000..ea0cb410a2
--- /dev/null
+++ b/cachegrind/cg_main.c
@@ -0,0 +1,1068 @@
+/*--------------------------------------------------------------------*/
+/*--- The cache simulation framework: instrumentation, recording   ---*/
+/*--- and results printing.                                        ---*/
+/*---                                                vg_cachesim.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, an x86 protected-mode emulator 
+   designed for debugging and profiling binaries on x86-Unixes.
+
+   Copyright (C) 2000-2002 Julian Seward 
+      jseward@acm.org
+      Julian_Seward@muraroa.demon.co.uk
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file LICENSE.
+*/
+
+#include <string.h>
+
+#include "vg_include.h"
+
+#include "vg_cachesim_L2.c"
+#include "vg_cachesim_I1.c"
+#include "vg_cachesim_D1.c"
+
+
+/* According to IA-32 Intel Architecture Software Developer's Manual: Vol 2 */
+#define MAX_x86_INSTR_SIZE  16
+
+/* Size of various buffers used for storing strings */
+#define FILENAME_LEN                      256
+#define FN_NAME_LEN                       256
+#define BUF_LEN                           512
+#define COMMIFY_BUF_LEN                   128
+#define RESULTS_BUF                       128
+
+/*------------------------------------------------------------*/
+/*--- Output file related stuff                            ---*/
+/*------------------------------------------------------------*/
+
+#define OUT_FILE        "cachegrind.out"
+
+static void file_err()
+{
+   VG_(message)(Vg_UserMsg,
+                "FATAL: can't open cache simulation output file `%s'",
+                OUT_FILE );
+   VG_(exit)(1);
+}
+
+/*------------------------------------------------------------*/
+/*--- Cost center types, operations                        ---*/
+/*------------------------------------------------------------*/
+
+typedef struct _CC CC;
+struct _CC {
+   ULong a;
+   ULong m1;
+   ULong m2;
+};
+
+static __inline__ void initCC(CC* cc) {
+    cc->a  = 0;
+    cc->m1 = 0;
+    cc->m2 = 0;
+}
+
+
+typedef enum { INSTR_CC, READ_CC, WRITE_CC, MOD_CC } CC_type;
+
+/* Instruction-level cost-centres.  The typedefs for these structs are in
+ * vg_include.c 
+ *
+ * WARNING:  the 'tag' field *must* be the first byte of both CC types.
+ *           the 'instr_addr' *must* be the second word of both CC types.
+ *
+ * This is because we use them when we don't know what type of CC we're dealing
+ * with.
+ */ 
+struct _iCC {
+   /* word 1 */
+   UChar tag;
+   UChar instr_size;
+
+   /* words 2+ */
+   Addr instr_addr;
+   CC I;
+};
+
+struct _idCC {
+   /* word 1 */
+   UChar tag;
+   UChar instr_size;
+   UChar data_size;
+
+   /* words 2+ */
+   Addr instr_addr;
+   CC I;
+   CC D;
+};
+
+static void init_iCC(iCC* cc, Addr instr_addr, UInt instr_size)
+{
+   cc->tag        = INSTR_CC;
+   cc->instr_size = instr_size;
+   cc->instr_addr = instr_addr;
+   initCC(&cc->I);
+}
+
+static void init_idCC(CC_type X_CC, idCC* cc, Addr instr_addr,
+                      UInt instr_size, UInt data_size)
+{
+   cc->tag        = X_CC;
+   cc->instr_size = instr_size;
+   cc->data_size  = data_size;
+   cc->instr_addr = instr_addr;
+   initCC(&cc->I);
+   initCC(&cc->D);
+}
+
+static __inline__ void sprint_iCC(Char buf[BUF_LEN], UInt ln, iCC* cc)
+{
+   VG_(sprintf)(buf, "%u %llu %llu %llu\n",
+                      ln, cc->I.a, cc->I.m1, cc->I.m2/*, cc->instr_addr*/);
+}
+
+static __inline__ void sprint_read_or_mod_CC(Char buf[BUF_LEN], UInt ln, 
+                                             idCC* cc)
+{
+   VG_(sprintf)(buf, "%u %llu %llu %llu %llu %llu %llu\n",
+                      ln, cc->I.a, cc->I.m1, cc->I.m2, 
+                          cc->D.a, cc->D.m1, cc->D.m2/*, cc->instr_addr*/);
+}
+
+static __inline__ void sprint_write_CC(Char buf[BUF_LEN], UInt ln, idCC* cc)
+{
+   VG_(sprintf)(buf, "%u %llu %llu %llu . . . %llu %llu %llu\n",
+                      ln, cc->I.a, cc->I.m1, cc->I.m2, 
+                          cc->D.a, cc->D.m1, cc->D.m2/*, cc->instr_addr*/);
+}
+
+/*------------------------------------------------------------*/
+/*--- BBCC hash table stuff                                ---*/
+/*------------------------------------------------------------*/
+
+/* The table of BBCCs is of the form hash(filename, hash(fn_name,
+ * hash(BBCCs))).  Each hash table is separately chained.  The sizes below work
+ * fairly well for Konqueror. */
+
+#define N_FILE_ENTRIES        251
+#define   N_FN_ENTRIES         53
+#define N_BBCC_ENTRIES         37
+
+/* The cost centres for a basic block are stored in a contiguous array.
+ * They are distinguishable by their tag field. */
+typedef struct _BBCC BBCC;
+struct _BBCC {
+   Addr  orig_addr;
+   UInt  array_size;    /* byte-size of variable length array */
+   BBCC* next;
+   Addr  array[0];      /* variable length array */
+};
+
+typedef struct _fn_node fn_node;
+struct _fn_node {
+   Char*    fn_name;
+   BBCC*    BBCCs[N_BBCC_ENTRIES];
+   fn_node* next;
+};
+
+typedef struct _file_node file_node;
+struct _file_node {
+   Char*      filename;
+   fn_node*   fns[N_FN_ENTRIES];
+   file_node* next;
+};
+
+/* BBCC_table structure:  list(filename, list(fn_name, list(BBCC))) */
+file_node *BBCC_table[N_FILE_ENTRIES];
+
+Int  distinct_files      = 0;
+Int  distinct_fns        = 0;
+
+Int  distinct_instrs     = 0;
+Int  full_debug_BBs      = 0;
+Int  file_line_debug_BBs = 0;
+Int  fn_name_debug_BBs   = 0;
+Int  no_debug_BBs        = 0;
+
+Int  BB_retranslations   = 0;
+
+static void init_BBCC_table()
+{
+   Int i;
+   for (i = 0; i < N_FILE_ENTRIES; i++)
+      BBCC_table[i] = NULL;
+}
+
+static void get_file_fn_names(Addr instr_addr, Char filename[FILENAME_LEN],
+                       Char fn_name[FN_NAME_LEN])
+{
+   UInt dummy_line_num;
+   Bool found1, found2, no_demangle = False;
+
+   found1 = VG_(what_line_is_this)(instr_addr, filename,
+                                   FILENAME_LEN, &dummy_line_num);
+   found2 = VG_(what_fn_is_this)(no_demangle, instr_addr, fn_name, FN_NAME_LEN);
+
+   if (!found1 && !found2) {
+      no_debug_BBs++;
+      VG_(strcpy)(filename, "???");
+      VG_(strcpy)(fn_name,  "???");
+
+   } else if ( found1 &&  found2) {
+      full_debug_BBs++;
+
+   } else if ( found1 && !found2) {
+      file_line_debug_BBs++;
+      VG_(strcpy)(fn_name,  "???");
+
+   } else  /*(!found1 &&  found2)*/ {
+      fn_name_debug_BBs++;
+      VG_(strcpy)(filename, "???");
+   }
+}
+
+/* Forward declaration. */
+static Int compute_BBCC_array_size(UCodeBlock* cb);
+
+static __inline__ 
+file_node* new_file_node(Char filename[FILENAME_LEN], file_node* next)
+{
+   Int i;
+   file_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(file_node));
+   new->filename  = VG_(strdup)(VG_AR_PRIVATE, filename);
+   for (i = 0; i < N_FN_ENTRIES; i++) {
+      new->fns[i] = NULL;
+   }
+   new->next      = next;
+   return new;
+}
+
+static __inline__ 
+fn_node* new_fn_node(Char fn_name[FILENAME_LEN], fn_node* next)
+{
+   Int i;
+   fn_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(fn_node));
+   new->fn_name = VG_(strdup)(VG_AR_PRIVATE, fn_name);
+   for (i = 0; i < N_BBCC_ENTRIES; i++) {
+      new->BBCCs[i] = NULL;
+   }
+   new->next    = next;
+   return new;
+}
+
+static __inline__ 
+BBCC* new_BBCC(Addr bb_orig_addr, UCodeBlock* cb, BBCC* next)
+{
+   Int BBCC_array_size = compute_BBCC_array_size(cb);
+   BBCC* new;
+
+   new = (BBCC*)VG_(malloc)(VG_AR_PRIVATE, sizeof(BBCC) + BBCC_array_size);
+   new->orig_addr  = bb_orig_addr;
+   new->array_size = BBCC_array_size;
+   new->next = next;
+
+   return new;
+}
+
+#define HASH_CONSTANT   256
+
+static UInt hash(Char *s, UInt table_size)
+{
+    int hash_value = 0;
+    for ( ; *s; s++)
+        hash_value = (HASH_CONSTANT * hash_value + *s) % table_size;
+    return hash_value;
+}
+
+/* Do a three step traversal: by filename, then fn_name, then instr_addr.
+ * In all cases prepends new nodes to their chain.  Returns a pointer to the
+ * cost centre.  Also sets BB_seen_before by reference. 
+ */ 
+static __inline__ BBCC* get_BBCC(Addr bb_orig_addr, UCodeBlock* cb, 
+                                 Bool *BB_seen_before)
+{
+   file_node *curr_file_node;
+   fn_node   *curr_fn_node;
+   BBCC      *curr_BBCC;
+   Char       filename[FILENAME_LEN], fn_name[FN_NAME_LEN];
+   UInt       filename_hash, fnname_hash, BBCC_hash;
+
+   get_file_fn_names(bb_orig_addr, filename, fn_name);
+
+   VGP_PUSHCC(VgpCacheGetBBCC);
+   filename_hash = hash(filename, N_FILE_ENTRIES);
+   curr_file_node = BBCC_table[filename_hash];
+   while (NULL != curr_file_node && 
+          strcmp(filename, curr_file_node->filename) != 0) {
+      curr_file_node = curr_file_node->next;
+   }
+   if (NULL == curr_file_node) {
+      BBCC_table[filename_hash] = curr_file_node = 
+         new_file_node(filename, BBCC_table[filename_hash]);
+      distinct_files++;
+   }
+
+   fnname_hash = hash(fn_name, N_FN_ENTRIES);
+   curr_fn_node = curr_file_node->fns[fnname_hash];
+   while (NULL != curr_fn_node && 
+          strcmp(fn_name, curr_fn_node->fn_name) != 0) {
+      curr_fn_node = curr_fn_node->next;
+   }
+   if (NULL == curr_fn_node) {
+      curr_file_node->fns[fnname_hash] = curr_fn_node = 
+         new_fn_node(fn_name, curr_file_node->fns[fnname_hash]);
+      distinct_fns++;
+   }
+
+   BBCC_hash = bb_orig_addr % N_BBCC_ENTRIES;
+   curr_BBCC = curr_fn_node->BBCCs[BBCC_hash];
+   while (NULL != curr_BBCC && bb_orig_addr != curr_BBCC->orig_addr) {
+      curr_BBCC = curr_BBCC->next;
+   }
+   if (curr_BBCC == NULL) {
+      curr_fn_node->BBCCs[BBCC_hash] = curr_BBCC = 
+         new_BBCC(bb_orig_addr, cb, curr_fn_node->BBCCs[BBCC_hash]);
+      *BB_seen_before = False;
+
+   } else {
+      vg_assert(bb_orig_addr == curr_BBCC->orig_addr);
+      vg_assert(curr_BBCC->array_size > 0 && curr_BBCC->array_size < 1000000);
+      if (VG_(clo_verbosity) > 1) {
+          VG_(message)(Vg_DebugMsg, "BB retranslation, retrieving from BBCC table");
+      }
+      *BB_seen_before = True;
+      BB_retranslations++;
+   }
+   VGP_POPCC;
+   return curr_BBCC;
+}
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation instrumentation phase               ---*/
+/*------------------------------------------------------------*/
+
+#define uInstr1   VG_(newUInstr1)
+#define uInstr2   VG_(newUInstr2)
+#define uInstr3   VG_(newUInstr3)
+#define dis       VG_(disassemble)
+#define uLiteral  VG_(setLiteralField)
+#define newTemp   VG_(getNewTemp)
+
+static Int compute_BBCC_array_size(UCodeBlock* cb)
+{
+   UInstr* u_in;
+   Int     i, CC_size, BBCC_size = 0;
+   Bool    is_LOAD, is_STORE, is_FPU_R, is_FPU_W;
+    
+   is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
+
+   for (i = 0; i < cb->used; i++) {
+      //VG_(ppUInstr)(0, &cb->instrs[i]);
+
+      u_in = &cb->instrs[i];
+      switch(u_in->opcode) {
+
+         case INCEIP: 
+            goto case_for_end_of_instr;
+         
+         case JMP:
+            if (u_in->cond != CondAlways) break;
+
+            goto case_for_end_of_instr;
+
+            case_for_end_of_instr:
+
+            CC_size = (is_LOAD || is_STORE || is_FPU_R || is_FPU_W 
+                      ? sizeof(idCC) : sizeof(iCC));
+
+            BBCC_size += CC_size;
+            is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
+            break;
+
+         case LOAD:
+            /* Two LDBs are possible for a single instruction */
+            vg_assert(/*!is_LOAD &&*/ !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_LOAD = True;
+            break;
+
+         case STORE:
+            /* Multiple STOREs are possible for 'pushal' */
+            vg_assert(            /*!is_STORE &&*/ !is_FPU_R && !is_FPU_W);
+            is_STORE = True;
+            break;
+
+         case FPU_R:
+            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_FPU_R = True;
+            break;
+
+         case FPU_W:
+            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_FPU_W = True;
+            break;
+
+         default:
+            break;
+      }
+   }
+
+   return BBCC_size;
+}
+
+/* Use this rather than eg. -1 because it's stored as a UInt. */
+#define INVALID_DATA_SIZE   999999
+
+UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr)
+{
+   UCodeBlock* cb;
+   Int         i;
+   UInstr*     u_in;
+   BBCC*       BBCC_node;
+   Int         t_CC_addr, t_read_addr, t_write_addr, t_data_addr;
+   Int         CC_size = -1;    /* Shut gcc warnings up */
+   Addr        instr_addr = orig_addr;
+   UInt        instr_size, data_size = INVALID_DATA_SIZE;
+   Int         helper = -1;     /* Shut gcc warnings up */
+   UInt        stack_used;
+   Bool        BB_seen_before       = False;
+   Bool        prev_instr_was_Jcond = False;
+   Addr        BBCC_ptr0, BBCC_ptr; 
+
+   /* Get BBCC (creating if necessary -- requires a counting pass over the BB
+    * if it's the first time it's been seen), and point to start of the 
+    * BBCC array.  */
+   BBCC_node = get_BBCC(orig_addr, cb_in, &BB_seen_before);
+   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
+
+   cb = VG_(allocCodeBlock)();
+   cb->nextTemp = cb_in->nextTemp;
+
+   t_CC_addr = t_read_addr = t_write_addr = t_data_addr = INVALID_TEMPREG;
+
+   for (i = 0; i < cb_in->used; i++) {
+      u_in = &cb_in->instrs[i];
+
+      //VG_(ppUInstr)(0, u_in);
+
+      /* What this is all about:  we want to instrument each x86 instruction 
+       * translation.  The end of these are marked in three ways.  The three
+       * ways, and the way we instrument them, are as follows:
+       *
+       * 1. UCode, INCEIP         --> UCode, Instrumentation, INCEIP
+       * 2. UCode, Juncond        --> UCode, Instrumentation, Juncond
+       * 3. UCode, Jcond, Juncond --> UCode, Instrumentation, Jcond, Juncond
+       *
+       * We must put the instrumentation before the jumps so that it is always
+       * executed.  We don't have to put the instrumentation before the INCEIP
+       * (it could go after) but we do so for consistency.
+       *
+       * Junconds are always the last instruction in a basic block.  Jconds are
+       * always the 2nd last, and must be followed by a Jcond.  We check this
+       * with various assertions.
+       *
+       * Note that in VG_(disBB) we patched the `extra4b' field of the first
+       * occurring JMP in a block with the size of its x86 instruction.  This
+       * is used now.
+       *
+       * Note that we don't have to treat JIFZ specially;  unlike JMPs, JIFZ
+       * occurs in the middle of a BB and gets an INCEIP after it.
+       *
+       * The instrumentation is just a call to the appropriate helper function,
+       * passing it the address of the instruction's CC.
+       */
+      if (prev_instr_was_Jcond) vg_assert(u_in->opcode == JMP);
+
+      switch (u_in->opcode) {
+
+         case INCEIP:
+            instr_size = u_in->val1;
+            goto case_for_end_of_x86_instr;
+
+         case JMP:
+            if (u_in->cond == CondAlways) {
+               vg_assert(i+1 == cb_in->used); 
+
+               /* Don't instrument if previous instr was a Jcond. */
+               if (prev_instr_was_Jcond) {
+                  vg_assert(0 == u_in->extra4b);
+                  VG_(copyUInstr)(cb, u_in);
+                  break;
+               }
+               prev_instr_was_Jcond = False;
+
+            } else {
+               vg_assert(i+2 == cb_in->used);  /* 2nd last instr in block */
+               prev_instr_was_Jcond = True;
+            }
+
+            /* Ah, the first JMP... instrument, please. */
+            instr_size = u_in->extra4b;
+            goto case_for_end_of_x86_instr;
+
+            /* Shared code that is executed at the end of an x86 translation
+             * block, marked by either an INCEIP or an unconditional JMP. */
+            case_for_end_of_x86_instr:
+
+#define IS_(X)      (INVALID_TEMPREG != t_##X##_addr)
+             
+            /* Initialise the CC in the BBCC array appropriately if it hasn't
+             * been initialised before.
+             * Then call appropriate sim function, passing it the CC address.
+             * Note that CALLM_S/CALL_E aren't required here;  by this point,
+             * the checking related to them has already happened. */
+            stack_used = 0;
+
+            vg_assert(instr_size >= 1 && instr_size <= MAX_x86_INSTR_SIZE);
+            vg_assert(0 != instr_addr);
+
+            /* Save the caller-save registers before we push our args */
+            uInstr1(cb, PUSH, 4, RealReg, R_EAX);
+            uInstr1(cb, PUSH, 4, RealReg, R_ECX);
+            uInstr1(cb, PUSH, 4, RealReg, R_EDX);
+
+            if (!IS_(read) && !IS_(write)) {
+               iCC* CC_ptr = (iCC*)(BBCC_ptr);
+               vg_assert(INVALID_DATA_SIZE == data_size);
+               vg_assert(INVALID_TEMPREG == t_read_addr && 
+                         INVALID_TEMPREG == t_write_addr);
+               CC_size = sizeof(iCC);
+               if (!BB_seen_before)
+                   init_iCC(CC_ptr, instr_addr, instr_size);
+
+               helper = VGOFF_(cachesim_log_non_mem_instr);
+
+            } else { 
+               CC_type X_CC;
+               idCC* CC_ptr = (idCC*)(BBCC_ptr);
+                
+               vg_assert(4 == data_size || 2  == data_size || 1 == data_size || 
+                         8 == data_size || 10 == data_size);
+               
+               CC_size = sizeof(idCC);
+               helper = VGOFF_(cachesim_log_mem_instr);
+
+               if (IS_(read) && !IS_(write)) {
+                  X_CC = READ_CC;
+                  vg_assert(INVALID_TEMPREG != t_read_addr && 
+                            INVALID_TEMPREG == t_write_addr);
+                  t_data_addr = t_read_addr;
+
+               } else if (!IS_(read) && IS_(write)) {
+                  X_CC = WRITE_CC;
+                  vg_assert(INVALID_TEMPREG == t_read_addr && 
+                            INVALID_TEMPREG != t_write_addr);
+                  t_data_addr = t_write_addr;
+
+               } else {
+                  vg_assert(IS_(read) && IS_(write));
+                  X_CC = MOD_CC;
+                  vg_assert(INVALID_TEMPREG != t_read_addr && 
+                            INVALID_TEMPREG != t_write_addr);
+                  t_data_addr = t_read_addr;
+               }
+
+               if (!BB_seen_before)
+                  init_idCC(X_CC, CC_ptr, instr_addr, instr_size, data_size);
+
+               /* 2nd arg: data addr */
+               uInstr1(cb, PUSH,  4, TempReg, t_data_addr);
+               stack_used += 4;
+            }
+#undef IS_
+
+            /* 1st arg: CC addr */
+            t_CC_addr = newTemp(cb);
+            uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
+            uLiteral(cb, BBCC_ptr);
+            uInstr1(cb, PUSH,  4, TempReg, t_CC_addr);
+            stack_used += 4;
+
+            /* Call function and return. */
+            uInstr1(cb, CALLM, 0, Lit16,   helper);
+            uInstr1(cb, CLEAR, 0, Lit16,   stack_used);
+
+            /* Restore the caller-save registers now the call is done */
+            uInstr1(cb, POP, 4, RealReg, R_EDX);
+            uInstr1(cb, POP, 4, RealReg, R_ECX);
+            uInstr1(cb, POP, 4, RealReg, R_EAX);
+
+            VG_(copyUInstr)(cb, u_in);
+
+            /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */
+            BBCC_ptr   += CC_size; 
+            instr_addr += instr_size;
+            t_CC_addr = t_read_addr = t_write_addr = 
+                                      t_data_addr  = INVALID_TEMPREG;
+            data_size = INVALID_DATA_SIZE;
+            break;
+
+
+         /* For memory-ref instrs, copy the data_addr into a temporary to be
+          * passed to the cachesim_log_function at the end of the instruction.
+          */
+         case LOAD: 
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val1,  TempReg, t_read_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         case FPU_R:
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val2,  TempReg, t_read_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         /* Note that we must set t_write_addr even for mod instructions;
+          * that's how the code above determines whether it does a write;
+          * without it, it would think a mod instruction is a read.
+          * As for the MOV, if it's a mod instruction it's redundant, but it's
+          * not expensive and mod instructions are rare anyway. */
+         case STORE:
+         case FPU_W:
+            t_write_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_write_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         case NOP:  case CALLM_E:  case CALLM_S:
+            break;
+
+         default:
+            VG_(copyUInstr)(cb, u_in);
+            break;
+      }
+   }
+
+   /* Just check everything looks ok */
+   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
+
+   VG_(freeCodeBlock)(cb_in);
+   return cb;
+}
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation stuff                               ---*/
+/*------------------------------------------------------------*/
+
+/* Total reads/writes/misses.  Calculated during CC traversal at the end. */
+static CC Ir_total;
+static CC Dr_total;
+static CC Dw_total;
+
+void VG_(init_cachesim)(void)
+{
+   /* Make sure the output file can be written. */
+   Int fd = VG_(open_write)(OUT_FILE);
+   if (-1 == fd) { 
+      fd = VG_(create_and_write)(OUT_FILE);
+      if (-1 == fd) {
+         file_err(); 
+      }
+   }
+   VG_(close)(fd);
+    
+   initCC(&Ir_total);
+   initCC(&Dr_total);
+   initCC(&Dw_total);
+   
+   cachesim_I1_initcache();
+   cachesim_D1_initcache();
+   cachesim_L2_initcache();
+
+   init_BBCC_table();
+}
+
+void VG_(cachesim_log_non_mem_instr)(iCC* cc)
+{
+   //VG_(printf)("sim  I: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
+   cc->I.a++;
+   VGP_POPCC;
+}
+
+void VG_(cachesim_log_mem_instr)(idCC* cc, Addr data_addr)
+{
+   //VG_(printf)("sim  D: CCaddr=0x%x, iaddr=0x%x, isize=%u, daddr=0x%x, dsize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
+   cc->I.a++;
+
+   cachesim_D1_doref(data_addr,      cc->data_size,  &cc->D.m1, &cc->D.m2);
+   cc->D.a++;
+   VGP_POPCC;
+}
+
+/*------------------------------------------------------------*/
+/*--- Printing of output file and summary stats            ---*/
+/*------------------------------------------------------------*/
+
+int get_line_num(Addr instr_addr) 
+{
+   Char filename[FILENAME_LEN] = "???";
+   UInt line_num;
+   Bool found;
+
+   found = VG_(what_line_is_this)(instr_addr, filename,
+                                  FILENAME_LEN, &line_num);
+   if (!found) {
+      line_num = 0; 
+   }
+   return line_num;
+}
+
+static void fprint_BBCC(Int fd, BBCC* BBCC_node, Char *first_instr_fl, 
+                                                 Char *first_instr_fn)
+{
+   Addr BBCC_ptr0, BBCC_ptr;
+   Char buf[BUF_LEN], curr_file[BUF_LEN], fbuf[BUF_LEN+4];
+   UInt line_num;
+
+   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
+
+   VG_(write)(fd, (void*)"\n", 1);
+
+   VG_(strcpy)(curr_file, first_instr_fl);
+   
+   while (BBCC_ptr - BBCC_ptr0 < BBCC_node->array_size) {
+
+      /* We pretend the CC is an iCC for getting the tag.  This is ok
+       * because both CC types have tag as their first byte.  Once we know
+       * the type, we can cast and act appropriately. */
+
+      Char fl_buf[FILENAME_LEN];
+      Char fn_buf[FN_NAME_LEN];
+
+      /* Assumes instr_addr position is same for both CCs. */
+      Addr instr_addr = ((iCC*)BBCC_ptr)->instr_addr;
+      get_file_fn_names(instr_addr, fl_buf, fn_buf);
+
+      /* Allow for filename switching in the middle of a BB;  if this happens,
+       * must print the new filename with the function name. */
+      if (0 != strcmp(fl_buf, curr_file)) {
+         VG_(strcpy)(curr_file, fl_buf);
+         VG_(sprintf)(fbuf, "fi=%s\n", curr_file);
+         VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
+      }
+
+      switch ( ((iCC*)BBCC_ptr)->tag ) {
+
+#define ADD_CC_TO(CC_type, cc, total)           \
+   total.a  += ((CC_type*)BBCC_ptr)->cc.a;      \
+   total.m1 += ((CC_type*)BBCC_ptr)->cc.m1;     \
+   total.m2 += ((CC_type*)BBCC_ptr)->cc.m2;
+          
+         case INSTR_CC:
+            line_num = get_line_num(((iCC*)BBCC_ptr)->instr_addr);
+            sprint_iCC(buf, line_num, (iCC*)BBCC_ptr);
+            ADD_CC_TO(iCC, I, Ir_total);
+            BBCC_ptr += sizeof(iCC);
+            break;
+
+         case READ_CC:
+         case  MOD_CC:
+            line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr);
+            sprint_read_or_mod_CC(buf, line_num, (idCC*)BBCC_ptr);
+            ADD_CC_TO(idCC, I, Ir_total);
+            ADD_CC_TO(idCC, D, Dr_total);
+            BBCC_ptr += sizeof(idCC);
+            break;
+
+         case WRITE_CC:
+            line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr);
+            sprint_write_CC(buf, line_num, (idCC*)BBCC_ptr);
+            ADD_CC_TO(idCC, I, Ir_total);
+            ADD_CC_TO(idCC, D, Dw_total);
+            BBCC_ptr += sizeof(idCC);
+            break;
+
+#undef ADD_CC_TO
+
+         default:
+            VG_(panic)("Unknown CC type in fprint_BBCC()\n");
+            break;
+      }
+      distinct_instrs++;
+      
+      /* If the function name for this instruction doesn't match that of the
+       * first instruction in the BB, print out a warning. */
+      if (VG_(clo_trace_symtab) && 0 != strcmp(fn_buf, first_instr_fn)) {
+         VG_(printf)("Mismatched function names\n");
+         VG_(printf)("  filenames: BB:%s, instr:%s;  "
+                     "fn_names:  BB:%s, instr:%s;  "
+                     "line: %d\n", 
+                     first_instr_fl, fl_buf, 
+                     first_instr_fn, fn_buf, 
+                     line_num);
+      }
+
+      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   }
+   /* If we switched filenames in the middle of the BB without switching back,
+    * switch back now because the subsequent BB may be relying on falling under
+    * the original file name. */
+   if (0 != VG_(strcmp)(first_instr_fl, curr_file)) {
+      VG_(sprintf)(fbuf, "fe=%s\n", first_instr_fl);
+      VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
+   }
+   //VG_(write)(fd, (void*)"#}\n", 3);
+
+   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
+}
+
+static void fprint_BBCC_table_and_calc_totals(Int client_argc, 
+                                              Char** client_argv)
+{
+   Int        fd;
+   Char       buf[BUF_LEN];
+   file_node *curr_file_node;
+   fn_node   *curr_fn_node;
+   BBCC      *curr_BBCC;
+   Int        i,j,k;
+
+   VGP_PUSHCC(VgpCacheDump);
+   fd = VG_(open_write)(OUT_FILE);
+   if (-1 == fd) { file_err(); }
+
+   /* "desc:" lines (giving I1/D1/L2 cache configuration) */
+   VG_(write)(fd, (void*)I1_desc_line, VG_(strlen)(I1_desc_line));
+   VG_(write)(fd, (void*)D1_desc_line, VG_(strlen)(D1_desc_line));
+   VG_(write)(fd, (void*)L2_desc_line, VG_(strlen)(L2_desc_line));
+
+   /* "cmd:" line */
+   VG_(strcpy)(buf, "cmd:");
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   for (i = 0; i < client_argc; i++) {
+       VG_(sprintf)(buf, " %s", client_argv[i]);
+       VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   }
+   /* "events:" line */
+   VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw\n");
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+   /* Six loops here:  three for the hash table arrays, and three for the
+    * chains hanging off the hash table arrays. */
+   for (i = 0; i < N_FILE_ENTRIES; i++) {
+      curr_file_node = BBCC_table[i];
+      while (curr_file_node != NULL) {
+         VG_(sprintf)(buf, "fl=%s\n", curr_file_node->filename);
+         VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+         for (j = 0; j < N_FN_ENTRIES; j++) {
+            curr_fn_node = curr_file_node->fns[j];
+            while (curr_fn_node != NULL) {
+               VG_(sprintf)(buf, "fn=%s\n", curr_fn_node->fn_name);
+               VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+               for (k = 0; k < N_BBCC_ENTRIES; k++) {
+                  curr_BBCC = curr_fn_node->BBCCs[k];
+                  while (curr_BBCC != NULL) {
+                     fprint_BBCC(fd, curr_BBCC, 
+                             
+                             curr_file_node->filename,
+                             curr_fn_node->fn_name);
+
+                     curr_BBCC = curr_BBCC->next;
+                  }
+               }
+               curr_fn_node = curr_fn_node->next;
+            }
+         }
+         curr_file_node = curr_file_node->next;
+      }
+   }
+
+   /* Summary stats must come after rest of table, since we calculate them
+    * during traversal.  */ 
+   VG_(sprintf)(buf, "summary: "
+                     "%llu %llu %llu "
+                     "%llu %llu %llu "
+                     "%llu %llu %llu\n", 
+                     Ir_total.a, Ir_total.m1, Ir_total.m2,
+                     Dr_total.a, Dr_total.m1, Dr_total.m2,
+                     Dw_total.a, Dw_total.m1, Dw_total.m2);
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   VG_(close)(fd);
+}
+
+/* Adds commas to ULong, right justifying in a field field_width wide, returns
+ * the string in buf. */
+Int commify(ULong n, int field_width, char buf[COMMIFY_BUF_LEN])
+{
+   int len, n_commas, i, j, new_len, space;
+
+   VG_(sprintf)(buf, "%lu", n);
+   len = VG_(strlen)(buf);
+   n_commas = (len - 1) / 3;
+   new_len = len + n_commas;
+   space = field_width - new_len;
+
+   /* Allow for printing a number in a field_width smaller than it's size */
+   if (space < 0) space = 0;    
+
+   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
+    * of three. */
+   for (j = -1, i = len ; i >= 0; i--) {
+      buf[i + n_commas + space] = buf[i];
+
+      if (3 == ++j) {
+         j = 0;
+         n_commas--;
+         buf[i + n_commas + space] = ',';
+      }
+   }
+   /* Right justify in field. */
+   for (i = 0; i < space; i++)  buf[i] = ' ';
+   return new_len;
+}
+
+void percentify(Int n, Int pow, Int field_width, char buf[]) 
+{
+   int i, len, space;
+    
+   VG_(sprintf)(buf, "%d.%d%%", n / pow, n % pow);
+   len = VG_(strlen)(buf);
+   space = field_width - len;
+   i = len;
+
+   /* Right justify in field */
+   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
+   for (i = 0; i < space; i++)  buf[i] = ' ';
+}
+
+void VG_(show_cachesim_results)(Int client_argc, Char** client_argv)
+{
+   CC D_total;
+   ULong L2_total_m, L2_total_mr, L2_total_mw; 
+   char buf1[RESULTS_BUF], 
+        buf2[RESULTS_BUF], 
+        buf3[RESULTS_BUF];
+   Int l1, l2, l3;
+   Int p;
+
+   fprint_BBCC_table_and_calc_totals(client_argc, client_argv);
+
+   /* I cache results.  Use the I_refs value to determine the first column
+    * width. */
+   l1 = commify(Ir_total.a, 0, buf1);
+   VG_(message)(Vg_UserMsg, "I   refs:      %s", buf1);
+
+   commify(Ir_total.m1, l1, buf1);
+   VG_(message)(Vg_UserMsg, "I1  misses:    %s", buf1);
+
+   commify(Ir_total.m2, l1, buf1);
+   VG_(message)(Vg_UserMsg, "L2  misses:    %s", buf1);
+
+   p = 100;
+
+   percentify(Ir_total.m1 * 100 * p / Ir_total.a, p, l1+1, buf1);
+   VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);
+                
+   percentify(Ir_total.m2 * 100 * p / Ir_total.a, p, l1+1, buf1);
+   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
+   VG_(message)(Vg_UserMsg, "");
+
+   /* D cache results.  Use the D_refs.rd and D_refs.wr values to determine the
+    * width of columns 2 & 3. */
+   D_total.a  = Dr_total.a  + Dw_total.a;
+   D_total.m1 = Dr_total.m1 + Dw_total.m1;
+   D_total.m2 = Dr_total.m2 + Dw_total.m2;
+       
+        commify( D_total.a, 0, buf1);
+   l2 = commify(Dr_total.a, 0, buf2);
+   l3 = commify(Dw_total.a, 0, buf3);
+   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)",
+                buf1,  buf2,  buf3);
+
+   commify( D_total.m1, l1, buf1);
+   commify(Dr_total.m1, l2, buf2);
+   commify(Dw_total.m1, l3, buf3);
+   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   commify( D_total.m2, l1, buf1);
+   commify(Dr_total.m2, l2, buf2);
+   commify(Dw_total.m2, l3, buf3);
+   VG_(message)(Vg_UserMsg, "L2  misses:    %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   p = 10;
+   
+   percentify( D_total.m1 * 100 * p / D_total.a,  p, l1+1, buf1);
+   percentify(Dr_total.m1 * 100 * p / Dr_total.a, p, l2+1, buf2);
+   percentify(Dw_total.m1 * 100 * p / Dw_total.a, p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
+
+   percentify( D_total.m2 * 100 * p / D_total.a,  p, l1+1, buf1);
+   percentify(Dr_total.m2 * 100 * p / Dr_total.a, p, l2+1, buf2);
+   percentify(Dw_total.m2 * 100 * p / Dw_total.a, p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
+   VG_(message)(Vg_UserMsg, "");
+
+   /* L2 overall results */
+   L2_total_m  = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
+   L2_total_mr = Dr_total.m2 + Ir_total.m2;
+   L2_total_mw = Dw_total.m2;
+
+   commify(L2_total_m,  l1, buf1);
+   commify(L2_total_mr, l2, buf2);
+   commify(L2_total_mw, l3, buf3);
+   VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   percentify(L2_total_m  * 100 * p / (Ir_total.a + D_total.a),  p, l1+1, buf1);
+   percentify(L2_total_mr * 100 * p / (Ir_total.a + Dr_total.a), p, l2+1, buf2);
+   percentify(L2_total_mw * 100 * p / Dw_total.a, p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )", buf1, buf2,buf3);
+            
+
+   /* Hash table stats */
+   if (VG_(clo_verbosity) > 1) {
+       int BB_lookups = full_debug_BBs      + fn_name_debug_BBs +
+                        file_line_debug_BBs + no_debug_BBs;
+      
+       VG_(message)(Vg_DebugMsg, "");
+       VG_(message)(Vg_DebugMsg, "Distinct files:   %d", distinct_files);
+       VG_(message)(Vg_DebugMsg, "Distinct fns:     %d", distinct_fns);
+       VG_(message)(Vg_DebugMsg, "BB lookups:       %d", BB_lookups);
+       VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)", 
+                    full_debug_BBs    * 100 / BB_lookups,
+                    full_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)", 
+                    file_line_debug_BBs * 100 / BB_lookups,
+                    file_line_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)", 
+                    fn_name_debug_BBs * 100 / BB_lookups,
+                    fn_name_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)", 
+                    no_debug_BBs      * 100 / BB_lookups,
+                    no_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations);
+       VG_(message)(Vg_DebugMsg, "Distinct instrs:  %d", distinct_instrs);
+   }
+   VGP_POPCC;
+}
+
diff --git a/cachegrind/cg_sim_D1.c b/cachegrind/cg_sim_D1.c
new file mode 100644
index 0000000000..c2b3d4cc3e
--- /dev/null
+++ b/cachegrind/cg_sim_D1.c
@@ -0,0 +1,93 @@
+/*  D1 cache simulator, generated by vg_cachegen.
+ *     total size    = 65536 bytes
+ *     line size     = 64 bytes
+ *     associativity = 2-way associative
+ *
+ *  This file should be #include-d into vg_cachesim.c
+ */
+
+static char D1_desc_line[] = 
+    "desc: D1 cache:         65536 B, 64 B, 2-way associative\n";
+
+static UInt D1_tags[512][2];
+
+static void cachesim_D1_initcache(void)
+{
+   UInt set, way;
+   for (set = 0; set < 512; set++)
+      for (way = 0; way < 2; way++)
+         D1_tags[set][way] = 0;
+}
+
+static __inline__ 
+void cachesim_D1_doref(Addr a, UChar size, ULong* m1, ULong *m2)
+{
+   register UInt set1 = ( a         >> 6) & (512-1);
+   register UInt set2 = ((a + size) >> 6) & (512-1);
+   register UInt tag  = a >> (6 + 9);
+
+   if (set1 == set2) {
+
+      if (tag == D1_tags[set1][0]) {
+         return;
+      }
+      else if (tag == D1_tags[set1][1]) {
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+         return;
+      }
+      else {
+         /* A miss */
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else if ((set1 + 1) % 512 == set2) {
+
+      Bool is_D1_miss = False;
+
+      /* Block one */
+      if (tag == D1_tags[set1][0]) {
+      }
+      else if (tag == D1_tags[set1][1]) {
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+      }
+      else {
+         /* A miss */
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+
+         is_D1_miss = True;
+      }
+
+      /* Block two */
+      if (tag == D1_tags[set2][0]) {
+      }
+      else if (tag == D1_tags[set2][1]) {
+         D1_tags[set2][1] = D1_tags[set2][0];
+         D1_tags[set2][0] = tag;
+      }
+      else {
+         /* A miss */
+         D1_tags[set2][1] = D1_tags[set2][0];
+         D1_tags[set2][0] = tag;
+
+         is_D1_miss = True;
+      }
+
+      /* Miss treatment */
+      if (is_D1_miss) {
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else {
+      VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size);
+      VG_(printf)("sets %d and %d.\n", set1, set2);
+      VG_(panic)("D1 cache set mismatch");
+   }
+}
diff --git a/cachegrind/cg_sim_I1.c b/cachegrind/cg_sim_I1.c
new file mode 100644
index 0000000000..32b89b51b0
--- /dev/null
+++ b/cachegrind/cg_sim_I1.c
@@ -0,0 +1,93 @@
+/*  I1 cache simulator, generated by vg_cachegen.
+ *     total size    = 65536 bytes
+ *     line size     = 64 bytes
+ *     associativity = 2-way associative
+ *
+ *  This file should be #include-d into vg_cachesim.c
+ */
+
+static char I1_desc_line[] = 
+    "desc: I1 cache:         65536 B, 64 B, 2-way associative\n";
+
+static UInt I1_tags[512][2];
+
+static void cachesim_I1_initcache(void)
+{
+   UInt set, way;
+   for (set = 0; set < 512; set++)
+      for (way = 0; way < 2; way++)
+         I1_tags[set][way] = 0;
+}
+
+static __inline__ 
+void cachesim_I1_doref(Addr a, UChar size, ULong* m1, ULong *m2)
+{
+   register UInt set1 = ( a         >> 6) & (512-1);
+   register UInt set2 = ((a + size) >> 6) & (512-1);
+   register UInt tag  = a >> (6 + 9);
+
+   if (set1 == set2) {
+
+      if (tag == I1_tags[set1][0]) {
+         return;
+      }
+      else if (tag == I1_tags[set1][1]) {
+         I1_tags[set1][1] = I1_tags[set1][0];
+         I1_tags[set1][0] = tag;
+         return;
+      }
+      else {
+         /* A miss */
+         I1_tags[set1][1] = I1_tags[set1][0];
+         I1_tags[set1][0] = tag;
+
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else if ((set1 + 1) % 512 == set2) {
+
+      Bool is_I1_miss = False;
+
+      /* Block one */
+      if (tag == I1_tags[set1][0]) {
+      }
+      else if (tag == I1_tags[set1][1]) {
+         I1_tags[set1][1] = I1_tags[set1][0];
+         I1_tags[set1][0] = tag;
+      }
+      else {
+         /* A miss */
+         I1_tags[set1][1] = I1_tags[set1][0];
+         I1_tags[set1][0] = tag;
+
+         is_I1_miss = True;
+      }
+
+      /* Block two */
+      if (tag == I1_tags[set2][0]) {
+      }
+      else if (tag == I1_tags[set2][1]) {
+         I1_tags[set2][1] = I1_tags[set2][0];
+         I1_tags[set2][0] = tag;
+      }
+      else {
+         /* A miss */
+         I1_tags[set2][1] = I1_tags[set2][0];
+         I1_tags[set2][0] = tag;
+
+         is_I1_miss = True;
+      }
+
+      /* Miss treatment */
+      if (is_I1_miss) {
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else {
+      VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size);
+      VG_(printf)("sets %d and %d.\n", set1, set2);
+      VG_(panic)("I1 cache set mismatch");
+   }
+}
diff --git a/cachegrind/cg_sim_L2.c b/cachegrind/cg_sim_L2.c
new file mode 100644
index 0000000000..bb685326ab
--- /dev/null
+++ b/cachegrind/cg_sim_L2.c
@@ -0,0 +1,250 @@
+/*  L2 cache simulator, generated by vg_cachegen.
+ *     total size    = 262144 bytes
+ *     line size     = 64 bytes
+ *     associativity = 8-way associative
+ *
+ *  This file should be #include-d into vg_cachesim.c
+ */
+
+static char L2_desc_line[] = 
+    "desc: L2 cache:         262144 B, 64 B, 8-way associative\n";
+
+static UInt L2_tags[512][8];
+
+static void cachesim_L2_initcache(void)
+{
+   UInt set, way;
+   for (set = 0; set < 512; set++)
+      for (way = 0; way < 8; way++)
+         L2_tags[set][way] = 0;
+}
+
+static __inline__ 
+void cachesim_L2_doref(Addr a, UChar size, ULong *m2)
+{
+   register UInt set1 = ( a         >> 6) & (512-1);
+   register UInt set2 = ((a + size) >> 6) & (512-1);
+   register UInt tag  = a >> (6 + 9);
+
+   if (set1 == set2) {
+
+      if (tag == L2_tags[set1][0]) {
+         return;
+      }
+      else if (tag == L2_tags[set1][1]) {
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][2]) {
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][3]) {
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][4]) {
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][5]) {
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][6]) {
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][7]) {
+         L2_tags[set1][7] = L2_tags[set1][6];
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else {
+         /* A miss */
+         L2_tags[set1][7] = L2_tags[set1][6];
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+
+         (*m2)++;
+      }
+
+   } else if ((set1 + 1) % 512 == set2) {
+
+      Bool is_L2_miss = False;
+
+      /* Block one */
+      if (tag == L2_tags[set1][0]) {
+      }
+      else if (tag == L2_tags[set1][1]) {
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][2]) {
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][3]) {
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][4]) {
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][5]) {
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][6]) {
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][7]) {
+         L2_tags[set1][7] = L2_tags[set1][6];
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else {
+         /* A miss */
+         L2_tags[set1][7] = L2_tags[set1][6];
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+
+         is_L2_miss = True;
+      }
+
+      /* Block two */
+      if (tag == L2_tags[set2][0]) {
+      }
+      else if (tag == L2_tags[set2][1]) {
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][2]) {
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][3]) {
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][4]) {
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][5]) {
+         L2_tags[set2][5] = L2_tags[set2][4];
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][6]) {
+         L2_tags[set2][6] = L2_tags[set2][5];
+         L2_tags[set2][5] = L2_tags[set2][4];
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][7]) {
+         L2_tags[set2][7] = L2_tags[set2][6];
+         L2_tags[set2][6] = L2_tags[set2][5];
+         L2_tags[set2][5] = L2_tags[set2][4];
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else {
+         /* A miss */
+         L2_tags[set2][7] = L2_tags[set2][6];
+         L2_tags[set2][6] = L2_tags[set2][5];
+         L2_tags[set2][5] = L2_tags[set2][4];
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+
+         is_L2_miss = True;
+      }
+
+      /* Miss treatment */
+      if (is_L2_miss) {
+         (*m2)++;
+      }
+
+   } else {
+      VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size);
+      VG_(printf)("sets %d and %d.\n", set1, set2);
+      VG_(panic)("L2 cache set mismatch");
+   }
+}
diff --git a/cachegrind/docs/manual.html b/cachegrind/docs/manual.html
index c441db1620..a97c2f9fec 100644
--- a/cachegrind/docs/manual.html
+++ b/cachegrind/docs/manual.html
@@ -78,7 +78,9 @@ Linux-x86 executables.
 
 <h4>6&nbsp; <a href="#example">An example</a></h4>
 
-<h4>7&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
+<h4>7&nbsp; <a href="#cache">Cache profiling</a></h4>
+
+<h4>8&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
 
 <hr width="100%">
 
@@ -515,6 +517,11 @@ follows:
       buggy, so you may need to issue this flag if you use 3.0.4.
       </li><br><p>
 
+  <li><code>--cachesim=no</code> [default]<br>
+      <code>--cachesim=yes</code>
+      <p>When enabled, turns off memory checking, and turns on cache profiling.
+      Cache profiling is described in detail in <a href="#cache">Section 7</a>.
+      </li><p>
 </ul>
 
 There are also some options for debugging Valgrind itself.  You
@@ -1763,5 +1770,632 @@ sewardj@phoenix:~/newmat10$
 <p>The GCC folks fixed this about a week before gcc-3.0 shipped.
 <hr width="100%">
 <p>
+
+
+
+<a name="cache"></a>
+<h2>7&nbsp; Cache profiling</h2>
+As well as memory debugging, Valgrind also allows you to do cache simulations
+and annotate your source line-by-line with the number of cache misses.  In
+particular, it records:
+<ul>
+  <li>L1 instruction cache reads and misses;
+  <li>L1 data cache reads and read misses, writes and write misses;
+  <li>L2 unified cache reads and read misses, writes and writes misses.
+</ul>
+On a modern x86 machine, an L1 miss will typically cost around 10 cycles,
+and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be
+very useful for improving the performance of your program.
+
+Please note that this is an experimental feature.  Any feedback, bug-fixes,
+suggestions, etc, welcome.
+
+
+<h3>7.1&nbsp; Overview</h3>
+First off, as for normal Valgrind use, you probably want to turn on debugging
+info (the <code>-g</code> flag).  But by contrast with normal Valgrind use, you
+probably <b>do</b> want to turn optimisation on, since you should profile your
+program as it will be normally run.
+
+The three steps are:
+<ol>
+  <li>Generate a cache simulator for your machine's cache configuration with
+      `vg_cachegen' and recompile Valgrind with <code>make install</code>.
+      Valgrind comes with a default simulator, but it is unlikely to be correct
+      for your system, so you should generate a simulator yourself.</li>
+  <li>Run your program with <code>valgrind --cachesim=yes</code> in front of 
+      the normal command line invocation.  When the program finishes, Valgrind
+      will print summary cache statistics. It also collects line-by-line
+      information in a file <code>cachegrind.out</code>.</li>
+  <li>Generate a function-by-function summary, and possibly annotate source
+      files with 'vg_annotate'. Source files to annotate can be specified
+      manually, or manually on the command line, or "interesting" source files
+      can be annotated automatically with the <code>--auto=yes</code> option.
+      You can annotate C/C++ files or assembly language files equally
+      easily.</li>
+</ol>
+
+<a href="#generate">Step 1</a> only needs to be done once, unless you are
+interested in simulating different cache configurations (eg. first
+concentrating on instruction cache misses, then on data cache misses).<p>
+
+<a href="#profile">Step 2</a> should be done every time you want to collect
+information about a new program, a changed program, or about the same program
+with different input.<p>
+
+<a href="#annotate">Step 3</a> can be performed as many times as you like for
+each Step 2; you may want to do multiple annotations showing different
+information each time.<p>
+
+The steps are described in detail in the following sections.<p>
+
+
+<a name="generate"></a>
+<h3>7.3&nbsp; Generating a cache simulator</h3>
+Although Valgrind comes with a pre-generated cache simulator, it most likely
+won't match the cache configuration of your machine, so you should generate
+a new simulator.<p>
+
+You need to generate three files, one for each of the I1, D1 and L2 caches.
+For each cache, you need to know the:
+<ul>
+  <li>Cache size (bytes);
+  <li>Line size (bytes);
+  <li>Associativity.
+</ul>
+
+vg_cachegen takes three options:
+<ul>
+  <li><code>--I1=size,line_size,associativity</code>
+  <li><code>--D1=size,line_size,associativity</code>
+  <li><code>--L2=size,line_size,associativity</code>
+</ul>
+
+You can specify one, two or all three caches per invocation of vg_cachegen.  It
+checks that the configuration is sensible before generating the simulators;  to
+see the allowed values, run <code>vg_cachegen -h</code>.<p>
+
+An example invocation would be:
+
+<blockquote><code>
+  vg_cachegen --I1=65536,64,2 --D1=65536,64,2 --L2=262144,64,8
+</code></blockquote>
+
+This simulates a machine with a 128KB split L1 2-way associative cache, and a
+256KB unified 8-way associative L2 cache.  Both caches have 64B lines.<p>
+
+If you don't know your cache configuration, you'll have to find it out.
+(Ideally vg_cachegen could auto-identify your cache configuration using the
+CPUID instruction, which could be done automatically during installation, and
+this whole step could be skipped...)<p>
+
+
+<h3>7.4&nbsp; Cache simulation specifics</h3>
+vg_cachegen only generates simulations for a machine with a split L1 cache and
+a unified L2 cache.  This configuration is used for all x86-based machines we
+are aware of.<p>
+
+The more specific characteristics of the simulation are as follows.
+
+<ul>
+  <li>Write-allocate: when a write miss occurs, the block written to is brought
+      into the D1 cache.  Most modern caches have this property.</li><p>
+
+  <li>Bit-selection hash function:  the line(s) in the cache to which a memory
+      block maps is chosen by the middle bits M--(M+N-1) of the byte address,
+      where:
+      <ul>
+        <li>&nbsp;line size = 2^M bytes&nbsp;</li>
+        <li>(cache size / line size) = 2^N bytes</li>
+      </ul> </li><p>
+
+  <li>Inclusive L2 cache:  the L2 cache replicates all the entries of the L1
+      cache.  This is standard on Pentium chips, but AMD Athlons use an
+      exclusive L2 cache that only holds blocks evicted from L1.</li><p>
+</ul>
+
+Other noteworthy behaviour:
+
+<ul>
+  <li>References that straddle two cache lines are treated as follows:</li>
+  <ul>
+    <li>If both blocks hit --&gt; counted as one hit</li>
+    <li>If one block hits, the other misses --&gt; counted as one miss</li>
+    <li>If both blocks miss --&gt; counted as one miss (not two)</li>
+  </ul><p>
+
+  <li>Instructions that modify a memory location (eg. <code>inc</code> and
+      <code>dec</code>) are counted as doing just a read, ie. a single data
+      reference.  This may seem strange, but since the write can never cause a
+      miss (the read guarantees the block is in the cache) it's not very
+      interesting.<p>
+
+      Thus it measures not the number of times the data cache is accessed, but
+      the number of times a data cache miss could occur.<p>
+      </li>
+</ul>
+
+If you are interested in simulating a cache with different properties, it is
+not particularly hard to write your own cache simulator, or to modify existing
+ones in <code>vg_cachesim_I1.c</code>, <code>vg_cachesim_I1.c</code> and
+<code>vg_cachesim_I1.c</code>.  We'd be interested to hear from anyone who
+does.
+
+
+<a name="profile"></a>
+<h3>7.5&nbsp; Profiling programs</h3>
+Cache profiling is enabled by using the <code>--cachesim=yes</code> option to
+Valgrind.  This automatically turns off Valgrind's memory checking functions,
+since the cache simulation is slow enough already, and you probably don't want
+to do both at once.<p>
+
+To gather cache profiling information about the program <code>ls -l<code, type:
+
+<blockquote><code>valgrind --cachesim=yes ls -l</code></blockquote>
+
+The program will execute (slowly).  Upon completion, summary statistics
+that look like this will be printed:
+
+<pre>
+==31751== I   refs:      27,742,716
+==31751== I1  misses:           276
+==31751== L2  misses:           275
+==31751== I1  miss rate:        0.0%
+==31751== L2i miss rate:        0.0%
+==31751== 
+==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
+==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
+==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
+==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
+==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
+==31751== 
+==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
+==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
+</pre>
+
+Cache accesses for instruction fetches are summarised first, giving the
+number of fetches made (this is the number of instructions executed, which
+can be useful to know in its own right), the number of I1 misses, and the
+number of L2 instruction (<code>L2i</code>) misses.<p>
+
+Cache accesses for data follow. The information is similar to that of the
+instruction fetches, except that the values are also shown split between reads
+and writes (note each row's <code>rd</code> and <code>wr</code> values add up
+to the row's total).<p>
+
+Combined instruction and data figures for the L2 cache follow that.<p>
+
+
+<h3>7.6&nbsp; Output file</h3>
+As well as printing summary information, Valgrind also writes line-by-line
+cache profiling information to a file named <code>cachegrind.out</code> .  This
+file is human-readable, but is best interpreted by the accompanying program
+vg_annotate, described in the next section.<p>
+
+Things to note about the <code>cachegrind.out</code> file:
+<ul>
+  <li>It is written every time <code>valgrind --cachesim=yes</code> is run; it
+      will automatically overwrite any existing <code>cachegrind.out<code/> in
+      the current directory.</li>
+  <li>It can be quite large: <code>ls -l</code> generates a file of about
+      350KB; browsing a few files and web pages with Konqueror generates a file
+      of around 10MB.</li>
+</ul>
+
+
+<a name="annotate"></a>
+<h3>7.7&nbsp; Annotating C/C++ programs</h3>
+Before using vg_annotate, it is worth widening your window to be at least
+120-characters wide if possible, as the output lines can be quite long.<p>
+
+To get a function-by-function summary, run <code>vg_annotate</code> in
+directory containing a <code>cachegrind.out</code> file.  The output looks like
+this:
+
+<pre>
+--------------------------------------------------------------------------------
+I1 cache:              65536 B, 64 B, 2-way associative
+D1 cache:              65536 B, 64 B, 2-way associative
+L2 cache:              262144 B, 64 B, 8-way associative
+Command:               concord vg_to_ucode.c
+Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Threshold:             99%
+Chosen for annotation:
+Auto-annotation:       on
+
+--------------------------------------------------------------------------------
+Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
+--------------------------------------------------------------------------------
+27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
+
+--------------------------------------------------------------------------------
+Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
+--------------------------------------------------------------------------------
+8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
+5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
+2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
+2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
+2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
+1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
+  897,991   51   51   897,831    95    30        62      1      1  ???:???
+  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
+  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
+  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
+  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
+  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
+  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
+  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
+  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
+  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
+   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
+   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
+</pre>
+
+First up is a summary of the annotation options:
+                    
+<ul>
+  <li>I1 cache, D1 cache, L2 cache: cache configuration.  So you know the
+      configuration with which these results were obtained.</li><p>
+
+  <li>Command: the command line invocation of the program under
+      examination.</li><p>
+
+  <li>Events recorded: event abbreviations are:<p>
+  <ul>
+    <li><code>Ir  </code>:  I cache reads (ie. instructions executed)</li>
+    <li><code>I1mr</code>: I1 cache read misses</li>
+    <li><code>I2mr</code>: L2 cache instruction read misses</li>
+    <li><code>Dr  </code>:  D cache reads (ie. memory reads)</li>
+    <li><code>D1mr</code>: D1 cache read misses</li>
+    <li><code>D2mr</code>: L2 cache data read misses</li>
+    <li><code>Dw  </code>:  D cache writes (ie. memory writes)</li>
+    <li><code>D1mw</code>: D1 cache write misses</li>
+    <li><code>D2mw</code>: L2 cache data write misses</li>
+  </ul><p>
+      Note that D1 total accesses is given by <code>D1mr</code> +
+      <code>D1mw</code>, and that L2 total accesses is given by
+      <code>I2mr</code> + <code>D2mr</code> + <code>D2mw</code>.</li><p>
+
+  <li>Events shown: the events shown (a subset of events gathered).  This can
+      be adjusted with the <code>--show</code> option.</li><p>
+
+  <li>Event sort order: the sort order in which functions are shown.  For
+      example, in this case the functions are sorted from highest
+      <code>Ir</code> counts to lowest.  If two functions have identical
+      <code>Ir</code> counts, they will then be sorted by <code>I1mr</code>
+      counts, and so on.  This order can be adjusted with the
+      <code>--sort</code> option.<p>
+
+      Note that this dictates the order the functions appear.  It is <b>not</b>
+      the order in which the columns appear;  that is dictated by the "events
+      shown" line (and can be changed with the <code>--sort</code> option).
+      </li><p>
+
+  <li>Threshold: vg_annotate by default omits functions that cause very low
+      numbers of misses to avoid drowing you in information.  In this case,
+      vg_annotate shows summaries the functions that account for 99%   of the
+      <code>Ir</code> counts; <code>Ir</code> is chosen as the treshold event
+      since it is  the primary sort event.  The threshold can be adjusted with
+      the <code>--threshold</code> option.</li><p>
+
+  <li>Chosen for annotation: names of files specified manually for annotation; 
+      in this case none.</li><p>
+
+  <li>Auto-annotation: whether auto-annotation was requested via the 
+      <code>--auto=yes</code> option. In this case no.</li><p>
+</ul>
+
+Then follows summary statistics for the whole program. These are similar
+to the summary provided when running <code>valgrind --cachesim=yes</code>.<p>
+  
+Then follows function-by-function statistics. Each function is identified by a
+<code>file_name:function_name</code> pair. If a column contains only a
+`.' it means  the function never performs that event (eg. the third row shows
+that <code>strcmp()</code> contains no instructions that write to memory). The
+name <code>???</code> is used if the the file name and/or function name could
+not be determined from debugging information. (If most of the entries have the
+form <code>???:???</code> the program probably wasn't compiled with
+<code>-g</code>.)<p> 
+
+It is worth noting that functions will come from three types of source files:
+<ol>
+  <li> From the profiled program (<code>concord.c</code> in this example).</li>
+  <li>From libraries (eg. <code>getc.c</code>)</li>
+  <li>From Valgrind's implementation of some libc functions (eg.
+      <code>vg_clientmalloc.c:malloc</code>).  These are recognisable because
+      the filename begins with <code>vg_</code>, and is probably one of
+      <code>vg_main.c</code>, <code>vg_clientmalloc.c</code> or
+      <code>vg_mylibc.c</code>.
+  </li>
+</ol>
+
+There are two ways to annotate source files -- by choosing them manually, or
+with the <code>--auto=yes</code> option. To do it manually, just
+specify the filenames as arguments to vg_annotate. For example, the output from
+running <code>vg_annotate concord.c</code> for our example produces the same
+output as above followed by an annotated version of <code>concord.c</code>, a
+section of which looks like:
+
+<pre>
+--------------------------------------------------------------------------------
+-- User-annotated source: concord.c
+--------------------------------------------------------------------------------
+Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
+
+[snip]
+
+        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
+        3    1    1       .     .     .       1      0      0  {
+        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
+        .    .    .       .     .     .       .      .      .      Word_Info *data;
+        1    0    0       .     .     .       1      1      1      int line = 1, i;
+        .    .    .       .     .     .       .      .      .
+        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
+        .    .    .       .     .     .       .      .      .
+    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
+    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
+        .    .    .       .     .     .       .      .      .
+        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
+        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
+        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
+        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
+        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
+        .    .    .       .     .     .       .      .      .      }
+        .    .    .       .     .     .       .      .      .
+  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
+  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->;word, data->line, table);
+        .    .    .       .     .     .       .      .      .
+        4    0    0       1     0     0       2      0      0      free(data);
+        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
+        3    0    0       2     0     0       .      .      .  }
+</pre>
+
+(Although column widths are automatically minimised, a wide terminal is clearly
+useful.)<p>
+  
+Each source file is clearly marked (<code>User-annotated source</code>) as
+having been chosen manually for annotation.  If the file was found in one of
+the directories specified with the <code>-I</code>/<code>--include</code>
+option, the directory and file are both given.<p>
+
+Each line is annotated with its event counts.  Events not applicable for a line
+are represented by a `.';  this is useful for distinguishing between an event
+which cannot happen, and one which can but did not.<p> 
+
+Sometimes only a small section of a source file is executed.  To minimise
+uninteresting output, Valgrind only shows annotated lines and lines within a
+small distance of annotated lines.  Gaps are marked with the line numbers so
+you know which part of a file the shown code comes from, eg:
+
+<pre>
+(figures and code for line 704)
+-- line 704 ----------------------------------------
+-- line 878 ----------------------------------------
+(figures and code for line 878)
+</pre>
+
+The amount of context to show around annotated lines is controlled by the
+<code>--context</code> option.<p>
+
+To get automatic annotation, run <code>vg_annotate --auto=yes</code>.
+vg_annotate will automatically annotate every source file it can find that is
+mentioned in the function-by-function summary.  Therefore, the files chosen for
+auto-annotation  are affected by the <code>--sort</code> and
+<code>--threshold</code> options.  Each source file is clearly marked
+(<code>Auto-annotated source</code>) as being chosen automatically.  Any files
+that could not be found are mentioned at the end of the output, eg:    
+
+<pre>
+--------------------------------------------------------------------------------
+The following files chosen for auto-annotation could not be found:
+--------------------------------------------------------------------------------
+  getc.c
+  ctype.c
+  ../sysdeps/generic/lockfile.c
+</pre>
+
+This is quite common for library files, since libraries are usually compiled
+with debugging information, but the source files are often not present on a
+system.  If a file is chosen for annotation <b>both</b> manually and
+automatically, it is marked as <code>User-annotated source</code>.
+
+Use the <code>-I/--include</code> option to tell Valgrind where to look for
+source files if the filenames found from the debugging information aren't
+specific enough.
+
+Beware that vg_annotate can take some time to digest large
+<code>cachegrind.out</code> files, eg. 30 seconds or more.  Also beware that
+auto-annotation can produce a lot of output if your program is large!
+
+
+<h3>7.8&nbsp; Annotating assembler programs</h3>
+Valgrind can annotate assembler programs too, or annotate the assembler
+generated for your C program.  Sometimes this is useful for understanding what
+is really happening when an interesting line of C code is translated into
+multiple instructions.<p>
+
+To do this, you just need to assemble your <code>.s</code> files with
+assembler-level debug information.  gcc doesn't do this, but you can use GNU as
+with the <code>--gstabs</code> option to generate object files with this
+information, eg:
+
+<blockquote><code>as --gstabs foo.s</code></blockquote>
+
+You can then profile and annotate source files in the same way as for C/C++
+programs.
+
+
+<h3>7.9&nbsp; vg_annotate options</h3>
+<ul>
+  <li><code>-h, --help</code></li><p>
+  <li><code>-v, --version</code><p>
+
+      Help and version, as usual.</li>
+
+  <li><code>--sort=A,B,C</code> [default: order in 
+      <code>cachegrind.out</code>]<p>
+      Specifies the events upon which the sorting of the function-by-function
+      entries will be based.  Useful if you want to concentrate on eg. I cache
+      misses (<code>--sort=I1mr,I2mr</code>), or D cache misses
+      (<code>--sort=D1mr,D2mr</code>), or L2 misses
+      (<code>--sort=D2mr,I2mr</code>).</li><p>
+
+  <li><code>--show=A,B,C</code> [default: all, using order in
+      <code>cachegrind.out</code>]<p>
+      Specifies which events to show (and the column order). Default is to use
+      all present in the <code>cachegrind.out</code> file (and use the order in
+      the file).</li><p>
+
+  <li><code>--threshold=X</code> [default: 99%] <p>
+      Sets the threshold for the function-by-function summary.  Functions are
+      shown that account for more than X% of all the primary sort events.  If
+      auto-annotating, also affects which files are annotated.</li><p>
+
+  <li><code>--auto=no</code> [default]<br>
+      <code>--auto=yes</code> <p>
+      When enabled, automatically annotates every file that is mentioned in the
+      function-by-function summary that can be found.  Also gives a list of
+      those that couldn't be found.
+
+  <li><code>--context=N</code> [default: 8]<p>
+      Print N lines of context before and after each annotated line.  Avoids
+      printing large sections of source files that were not executed.  Use a 
+      large number (eg. 10,000) to show all source lines.
+      </li><p>
+
+  <li><code>-I=&lt;dir&gt;, --include=&lt;dir&gt;</code> 
+      [default: empty string]<p>
+      Adds a directory to the list in which to search for files.  Multiple
+      -I/--include options can be given to add multiple directories.
+</ul>
+  
+
+<h3>7.10&nbsp; Warnings</h3>
+There are a couple of situations in which vg_annotate issues warnings.
+
+<ul>
+  <li>If a source file is more recent than the <code>cachegrind.out</code>
+      file.  This is because the information in <code>cachegrind.out</code> is
+      only recorded with line numbers, so if the line numbers change at all in
+      the source (eg. lines added, deleted, swapped), any annotations will be 
+      incorrect.<p>
+
+  <li>If information is recorded about line numbers past the end of a file.
+      This can be caused by the above problem, ie. shortening the source file
+      while using an old <code>cachegrind.out</code> file.  If this happens,
+      the figures for the bogus lines are printed anyway (clearly marked as
+      bogus) in case they are important.</li><p>
+</ul>
+
+
+<h3>7.10&nbsp; Things to watch out for</h3>
+Some odd things that can occur during annotation:
+
+<ul>
+  <li>If annotating at the assembler level, you might see something like this:
+
+      <pre>
+      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
+      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
+      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
+      .    .    .  .    .    .  .    .    .          .align 4,0x90
+      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
+      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
+      </pre>
+
+      How can the third instruction be executed twice when the others are
+      executed only once?  As it turns out, it isn't.  Here's a dump of the
+      executable, from objdump:
+
+      <pre>
+      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
+      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
+      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
+      8048f32:       89 f6                   mov    %esi,%esi
+      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
+      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
+      </pre>
+
+      Notice the extra <code>mov %esi,%esi</code> instruction.  Where did this
+      come from?  The GNU assembler inserted it to serve as the two bytes of
+      padding needed to align the <code>movl $.LnrB,%eax</code> instruction on
+      a four-byte boundary, but pretended it didn't exist when adding debug
+      information.  Thus when Valgrind reads the debug info it thinks that the
+      <code>movl $0x1,0xffffffec(%ebp)</code> instruction covers the address
+      range 0x8048f2b--0x804833 by itself, and attributes the counts for the
+      <code>mov %esi,%esi</code> to it.<p>
+  </li>
+
+  <li>
+      Inlined functions can cause strange results in the function-by-function
+      summary.  If a function <code>inline_me()</code> is defined in
+      <code>foo.h</code> and inlined in the functions <code>f1()</code>,
+      <code>f2()</code> and <code>f3()</code> in <code>bar.c</code>, there will
+      not be a <code>foo.h:inline_me()</code> function entry.  Instead, there
+      will be separate function entries for each inlining site, ie.
+      <code>foo.h:f1()</code>, <code>foo.h:f2()</code> and
+      <code>foo.h:f3()</code>.  To find the total counts for
+      <code>foo.h:inline_me()</code>, add up the counts from each entry.<p>
+
+      The reason for this is that although the debug info output by gcc
+      indicates the switch from <code>bar.c</code> to <code>foo.h</code>, it
+      doesn't indicate the name of the function in <code>foo.h</code>, so
+      Valgrind keeps using the old one.<p>
+
+  <li>
+      Sometimes, the same filename might be represented with a relative name
+      and with an absolute name in different parts of the debug info, eg:
+      <code>/home/user/proj/proj.h</code> and <code>../proj.h</code>.  In this
+      case, if you use auto-annotation, the file will be annotated twice with
+      the counts split between the two.<p>
+  </li>
+</ul>
+
+Note: stabs is not an easy format to read.  If you come across bizarre
+annotations that look like might be caused by a bug in the stabs reader,
+please let us know.
+
+
+<h3>7.11&nbsp; Accuracy</h3>
+Valgrind's cache profiling has a number of shortcomings:
+
+<ul>
+  <li>It doesn't account for kernel activity -- the effect of system calls on
+      the cache contents is ignored.</li><p>
+
+  <li>It doesn't account for other process activity (although this is probably
+      desirable when considering a single program).</li><p>
+
+  <li>It doesn't account for virtual-to-physical address mappings;  hence the
+      entire simulation is not a true representation of what's happening in the
+      cache.</li><p>
+
+  <li>It doesn't account for cache misses not visible at the instruction level,
+      eg. those arising from TLB misses, or speculative execution.</li><p>
+</ul>
+
+Another thing worth nothing is that results are very sensitive.  Changing the
+size of the <code>valgrind.so</code> file, the size of the program being
+profiled, or even the length of its name can perturb the results.  Variations
+will be small, but don't expect perfectly repeatable results if your program
+changes at all.<p>
+
+While these factors mean you shouldn't trust the results to be super-accurate,
+hopefully they should be close enough to be useful.<p>
+
+
+<h3>7.12&nbsp; Todo</h3>
+<ul>
+  <li>Use CPUID instruction to auto-identify cache configuration during 
+      installation.  This would save the user from having to know their cache
+      configuration and using vg_cachegen.</li><p>
+  <li>Program start-up/shut-down calls a lot of functions that aren't
+      interesting and just complicate the output.  Would be nice to exclude
+      these somehow.</li><p>
+</ul> 
+<hr width="100%">
 </body>
 </html>
+
diff --git a/corecheck/Makefile.am b/corecheck/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/corecheck/Makefile.am
+++ b/corecheck/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/coregrind/Makefile.am
+++ b/coregrind/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/coregrind/docs/manual.html b/coregrind/docs/manual.html
index c441db1620..a97c2f9fec 100644
--- a/coregrind/docs/manual.html
+++ b/coregrind/docs/manual.html
@@ -78,7 +78,9 @@ Linux-x86 executables.
 
 <h4>6&nbsp; <a href="#example">An example</a></h4>
 
-<h4>7&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
+<h4>7&nbsp; <a href="#cache">Cache profiling</a></h4>
+
+<h4>8&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
 
 <hr width="100%">
 
@@ -515,6 +517,11 @@ follows:
       buggy, so you may need to issue this flag if you use 3.0.4.
       </li><br><p>
 
+  <li><code>--cachesim=no</code> [default]<br>
+      <code>--cachesim=yes</code>
+      <p>When enabled, turns off memory checking, and turns on cache profiling.
+      Cache profiling is described in detail in <a href="#cache">Section 7</a>.
+      </li><p>
 </ul>
 
 There are also some options for debugging Valgrind itself.  You
@@ -1763,5 +1770,632 @@ sewardj@phoenix:~/newmat10$
 <p>The GCC folks fixed this about a week before gcc-3.0 shipped.
 <hr width="100%">
 <p>
+
+
+
+<a name="cache"></a>
+<h2>7&nbsp; Cache profiling</h2>
+As well as memory debugging, Valgrind also allows you to do cache simulations
+and annotate your source line-by-line with the number of cache misses.  In
+particular, it records:
+<ul>
+  <li>L1 instruction cache reads and misses;
+  <li>L1 data cache reads and read misses, writes and write misses;
+  <li>L2 unified cache reads and read misses, writes and writes misses.
+</ul>
+On a modern x86 machine, an L1 miss will typically cost around 10 cycles,
+and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be
+very useful for improving the performance of your program.
+
+Please note that this is an experimental feature.  Any feedback, bug-fixes,
+suggestions, etc, welcome.
+
+
+<h3>7.1&nbsp; Overview</h3>
+First off, as for normal Valgrind use, you probably want to turn on debugging
+info (the <code>-g</code> flag).  But by contrast with normal Valgrind use, you
+probably <b>do</b> want to turn optimisation on, since you should profile your
+program as it will be normally run.
+
+The three steps are:
+<ol>
+  <li>Generate a cache simulator for your machine's cache configuration with
+      `vg_cachegen' and recompile Valgrind with <code>make install</code>.
+      Valgrind comes with a default simulator, but it is unlikely to be correct
+      for your system, so you should generate a simulator yourself.</li>
+  <li>Run your program with <code>valgrind --cachesim=yes</code> in front of 
+      the normal command line invocation.  When the program finishes, Valgrind
+      will print summary cache statistics. It also collects line-by-line
+      information in a file <code>cachegrind.out</code>.</li>
+  <li>Generate a function-by-function summary, and possibly annotate source
+      files with 'vg_annotate'. Source files to annotate can be specified
+      manually, or manually on the command line, or "interesting" source files
+      can be annotated automatically with the <code>--auto=yes</code> option.
+      You can annotate C/C++ files or assembly language files equally
+      easily.</li>
+</ol>
+
+<a href="#generate">Step 1</a> only needs to be done once, unless you are
+interested in simulating different cache configurations (eg. first
+concentrating on instruction cache misses, then on data cache misses).<p>
+
+<a href="#profile">Step 2</a> should be done every time you want to collect
+information about a new program, a changed program, or about the same program
+with different input.<p>
+
+<a href="#annotate">Step 3</a> can be performed as many times as you like for
+each Step 2; you may want to do multiple annotations showing different
+information each time.<p>
+
+The steps are described in detail in the following sections.<p>
+
+
+<a name="generate"></a>
+<h3>7.3&nbsp; Generating a cache simulator</h3>
+Although Valgrind comes with a pre-generated cache simulator, it most likely
+won't match the cache configuration of your machine, so you should generate
+a new simulator.<p>
+
+You need to generate three files, one for each of the I1, D1 and L2 caches.
+For each cache, you need to know the:
+<ul>
+  <li>Cache size (bytes);
+  <li>Line size (bytes);
+  <li>Associativity.
+</ul>
+
+vg_cachegen takes three options:
+<ul>
+  <li><code>--I1=size,line_size,associativity</code>
+  <li><code>--D1=size,line_size,associativity</code>
+  <li><code>--L2=size,line_size,associativity</code>
+</ul>
+
+You can specify one, two or all three caches per invocation of vg_cachegen.  It
+checks that the configuration is sensible before generating the simulators;  to
+see the allowed values, run <code>vg_cachegen -h</code>.<p>
+
+An example invocation would be:
+
+<blockquote><code>
+  vg_cachegen --I1=65536,64,2 --D1=65536,64,2 --L2=262144,64,8
+</code></blockquote>
+
+This simulates a machine with a 128KB split L1 2-way associative cache, and a
+256KB unified 8-way associative L2 cache.  Both caches have 64B lines.<p>
+
+If you don't know your cache configuration, you'll have to find it out.
+(Ideally vg_cachegen could auto-identify your cache configuration using the
+CPUID instruction, which could be done automatically during installation, and
+this whole step could be skipped...)<p>
+
+
+<h3>7.4&nbsp; Cache simulation specifics</h3>
+vg_cachegen only generates simulations for a machine with a split L1 cache and
+a unified L2 cache.  This configuration is used for all x86-based machines we
+are aware of.<p>
+
+The more specific characteristics of the simulation are as follows.
+
+<ul>
+  <li>Write-allocate: when a write miss occurs, the block written to is brought
+      into the D1 cache.  Most modern caches have this property.</li><p>
+
+  <li>Bit-selection hash function:  the line(s) in the cache to which a memory
+      block maps is chosen by the middle bits M--(M+N-1) of the byte address,
+      where:
+      <ul>
+        <li>&nbsp;line size = 2^M bytes&nbsp;</li>
+        <li>(cache size / line size) = 2^N bytes</li>
+      </ul> </li><p>
+
+  <li>Inclusive L2 cache:  the L2 cache replicates all the entries of the L1
+      cache.  This is standard on Pentium chips, but AMD Athlons use an
+      exclusive L2 cache that only holds blocks evicted from L1.</li><p>
+</ul>
+
+Other noteworthy behaviour:
+
+<ul>
+  <li>References that straddle two cache lines are treated as follows:</li>
+  <ul>
+    <li>If both blocks hit --&gt; counted as one hit</li>
+    <li>If one block hits, the other misses --&gt; counted as one miss</li>
+    <li>If both blocks miss --&gt; counted as one miss (not two)</li>
+  </ul><p>
+
+  <li>Instructions that modify a memory location (eg. <code>inc</code> and
+      <code>dec</code>) are counted as doing just a read, ie. a single data
+      reference.  This may seem strange, but since the write can never cause a
+      miss (the read guarantees the block is in the cache) it's not very
+      interesting.<p>
+
+      Thus it measures not the number of times the data cache is accessed, but
+      the number of times a data cache miss could occur.<p>
+      </li>
+</ul>
+
+If you are interested in simulating a cache with different properties, it is
+not particularly hard to write your own cache simulator, or to modify existing
+ones in <code>vg_cachesim_I1.c</code>, <code>vg_cachesim_I1.c</code> and
+<code>vg_cachesim_I1.c</code>.  We'd be interested to hear from anyone who
+does.
+
+
+<a name="profile"></a>
+<h3>7.5&nbsp; Profiling programs</h3>
+Cache profiling is enabled by using the <code>--cachesim=yes</code> option to
+Valgrind.  This automatically turns off Valgrind's memory checking functions,
+since the cache simulation is slow enough already, and you probably don't want
+to do both at once.<p>
+
+To gather cache profiling information about the program <code>ls -l<code, type:
+
+<blockquote><code>valgrind --cachesim=yes ls -l</code></blockquote>
+
+The program will execute (slowly).  Upon completion, summary statistics
+that look like this will be printed:
+
+<pre>
+==31751== I   refs:      27,742,716
+==31751== I1  misses:           276
+==31751== L2  misses:           275
+==31751== I1  miss rate:        0.0%
+==31751== L2i miss rate:        0.0%
+==31751== 
+==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
+==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
+==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
+==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
+==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
+==31751== 
+==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
+==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
+</pre>
+
+Cache accesses for instruction fetches are summarised first, giving the
+number of fetches made (this is the number of instructions executed, which
+can be useful to know in its own right), the number of I1 misses, and the
+number of L2 instruction (<code>L2i</code>) misses.<p>
+
+Cache accesses for data follow. The information is similar to that of the
+instruction fetches, except that the values are also shown split between reads
+and writes (note each row's <code>rd</code> and <code>wr</code> values add up
+to the row's total).<p>
+
+Combined instruction and data figures for the L2 cache follow that.<p>
+
+
+<h3>7.6&nbsp; Output file</h3>
+As well as printing summary information, Valgrind also writes line-by-line
+cache profiling information to a file named <code>cachegrind.out</code> .  This
+file is human-readable, but is best interpreted by the accompanying program
+vg_annotate, described in the next section.<p>
+
+Things to note about the <code>cachegrind.out</code> file:
+<ul>
+  <li>It is written every time <code>valgrind --cachesim=yes</code> is run; it
+      will automatically overwrite any existing <code>cachegrind.out<code/> in
+      the current directory.</li>
+  <li>It can be quite large: <code>ls -l</code> generates a file of about
+      350KB; browsing a few files and web pages with Konqueror generates a file
+      of around 10MB.</li>
+</ul>
+
+
+<a name="annotate"></a>
+<h3>7.7&nbsp; Annotating C/C++ programs</h3>
+Before using vg_annotate, it is worth widening your window to be at least
+120-characters wide if possible, as the output lines can be quite long.<p>
+
+To get a function-by-function summary, run <code>vg_annotate</code> in
+directory containing a <code>cachegrind.out</code> file.  The output looks like
+this:
+
+<pre>
+--------------------------------------------------------------------------------
+I1 cache:              65536 B, 64 B, 2-way associative
+D1 cache:              65536 B, 64 B, 2-way associative
+L2 cache:              262144 B, 64 B, 8-way associative
+Command:               concord vg_to_ucode.c
+Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Threshold:             99%
+Chosen for annotation:
+Auto-annotation:       on
+
+--------------------------------------------------------------------------------
+Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
+--------------------------------------------------------------------------------
+27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
+
+--------------------------------------------------------------------------------
+Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
+--------------------------------------------------------------------------------
+8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
+5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
+2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
+2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
+2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
+1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
+  897,991   51   51   897,831    95    30        62      1      1  ???:???
+  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
+  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
+  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
+  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
+  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
+  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
+  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
+  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
+  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
+   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
+   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
+</pre>
+
+First up is a summary of the annotation options:
+                    
+<ul>
+  <li>I1 cache, D1 cache, L2 cache: cache configuration.  So you know the
+      configuration with which these results were obtained.</li><p>
+
+  <li>Command: the command line invocation of the program under
+      examination.</li><p>
+
+  <li>Events recorded: event abbreviations are:<p>
+  <ul>
+    <li><code>Ir  </code>:  I cache reads (ie. instructions executed)</li>
+    <li><code>I1mr</code>: I1 cache read misses</li>
+    <li><code>I2mr</code>: L2 cache instruction read misses</li>
+    <li><code>Dr  </code>:  D cache reads (ie. memory reads)</li>
+    <li><code>D1mr</code>: D1 cache read misses</li>
+    <li><code>D2mr</code>: L2 cache data read misses</li>
+    <li><code>Dw  </code>:  D cache writes (ie. memory writes)</li>
+    <li><code>D1mw</code>: D1 cache write misses</li>
+    <li><code>D2mw</code>: L2 cache data write misses</li>
+  </ul><p>
+      Note that D1 total accesses is given by <code>D1mr</code> +
+      <code>D1mw</code>, and that L2 total accesses is given by
+      <code>I2mr</code> + <code>D2mr</code> + <code>D2mw</code>.</li><p>
+
+  <li>Events shown: the events shown (a subset of events gathered).  This can
+      be adjusted with the <code>--show</code> option.</li><p>
+
+  <li>Event sort order: the sort order in which functions are shown.  For
+      example, in this case the functions are sorted from highest
+      <code>Ir</code> counts to lowest.  If two functions have identical
+      <code>Ir</code> counts, they will then be sorted by <code>I1mr</code>
+      counts, and so on.  This order can be adjusted with the
+      <code>--sort</code> option.<p>
+
+      Note that this dictates the order the functions appear.  It is <b>not</b>
+      the order in which the columns appear;  that is dictated by the "events
+      shown" line (and can be changed with the <code>--sort</code> option).
+      </li><p>
+
+  <li>Threshold: vg_annotate by default omits functions that cause very low
+      numbers of misses to avoid drowing you in information.  In this case,
+      vg_annotate shows summaries the functions that account for 99%   of the
+      <code>Ir</code> counts; <code>Ir</code> is chosen as the treshold event
+      since it is  the primary sort event.  The threshold can be adjusted with
+      the <code>--threshold</code> option.</li><p>
+
+  <li>Chosen for annotation: names of files specified manually for annotation; 
+      in this case none.</li><p>
+
+  <li>Auto-annotation: whether auto-annotation was requested via the 
+      <code>--auto=yes</code> option. In this case no.</li><p>
+</ul>
+
+Then follows summary statistics for the whole program. These are similar
+to the summary provided when running <code>valgrind --cachesim=yes</code>.<p>
+  
+Then follows function-by-function statistics. Each function is identified by a
+<code>file_name:function_name</code> pair. If a column contains only a
+`.' it means  the function never performs that event (eg. the third row shows
+that <code>strcmp()</code> contains no instructions that write to memory). The
+name <code>???</code> is used if the the file name and/or function name could
+not be determined from debugging information. (If most of the entries have the
+form <code>???:???</code> the program probably wasn't compiled with
+<code>-g</code>.)<p> 
+
+It is worth noting that functions will come from three types of source files:
+<ol>
+  <li> From the profiled program (<code>concord.c</code> in this example).</li>
+  <li>From libraries (eg. <code>getc.c</code>)</li>
+  <li>From Valgrind's implementation of some libc functions (eg.
+      <code>vg_clientmalloc.c:malloc</code>).  These are recognisable because
+      the filename begins with <code>vg_</code>, and is probably one of
+      <code>vg_main.c</code>, <code>vg_clientmalloc.c</code> or
+      <code>vg_mylibc.c</code>.
+  </li>
+</ol>
+
+There are two ways to annotate source files -- by choosing them manually, or
+with the <code>--auto=yes</code> option. To do it manually, just
+specify the filenames as arguments to vg_annotate. For example, the output from
+running <code>vg_annotate concord.c</code> for our example produces the same
+output as above followed by an annotated version of <code>concord.c</code>, a
+section of which looks like:
+
+<pre>
+--------------------------------------------------------------------------------
+-- User-annotated source: concord.c
+--------------------------------------------------------------------------------
+Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
+
+[snip]
+
+        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
+        3    1    1       .     .     .       1      0      0  {
+        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
+        .    .    .       .     .     .       .      .      .      Word_Info *data;
+        1    0    0       .     .     .       1      1      1      int line = 1, i;
+        .    .    .       .     .     .       .      .      .
+        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
+        .    .    .       .     .     .       .      .      .
+    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
+    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
+        .    .    .       .     .     .       .      .      .
+        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
+        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
+        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
+        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
+        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
+        .    .    .       .     .     .       .      .      .      }
+        .    .    .       .     .     .       .      .      .
+  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
+  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->;word, data->line, table);
+        .    .    .       .     .     .       .      .      .
+        4    0    0       1     0     0       2      0      0      free(data);
+        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
+        3    0    0       2     0     0       .      .      .  }
+</pre>
+
+(Although column widths are automatically minimised, a wide terminal is clearly
+useful.)<p>
+  
+Each source file is clearly marked (<code>User-annotated source</code>) as
+having been chosen manually for annotation.  If the file was found in one of
+the directories specified with the <code>-I</code>/<code>--include</code>
+option, the directory and file are both given.<p>
+
+Each line is annotated with its event counts.  Events not applicable for a line
+are represented by a `.';  this is useful for distinguishing between an event
+which cannot happen, and one which can but did not.<p> 
+
+Sometimes only a small section of a source file is executed.  To minimise
+uninteresting output, Valgrind only shows annotated lines and lines within a
+small distance of annotated lines.  Gaps are marked with the line numbers so
+you know which part of a file the shown code comes from, eg:
+
+<pre>
+(figures and code for line 704)
+-- line 704 ----------------------------------------
+-- line 878 ----------------------------------------
+(figures and code for line 878)
+</pre>
+
+The amount of context to show around annotated lines is controlled by the
+<code>--context</code> option.<p>
+
+To get automatic annotation, run <code>vg_annotate --auto=yes</code>.
+vg_annotate will automatically annotate every source file it can find that is
+mentioned in the function-by-function summary.  Therefore, the files chosen for
+auto-annotation  are affected by the <code>--sort</code> and
+<code>--threshold</code> options.  Each source file is clearly marked
+(<code>Auto-annotated source</code>) as being chosen automatically.  Any files
+that could not be found are mentioned at the end of the output, eg:    
+
+<pre>
+--------------------------------------------------------------------------------
+The following files chosen for auto-annotation could not be found:
+--------------------------------------------------------------------------------
+  getc.c
+  ctype.c
+  ../sysdeps/generic/lockfile.c
+</pre>
+
+This is quite common for library files, since libraries are usually compiled
+with debugging information, but the source files are often not present on a
+system.  If a file is chosen for annotation <b>both</b> manually and
+automatically, it is marked as <code>User-annotated source</code>.
+
+Use the <code>-I/--include</code> option to tell Valgrind where to look for
+source files if the filenames found from the debugging information aren't
+specific enough.
+
+Beware that vg_annotate can take some time to digest large
+<code>cachegrind.out</code> files, eg. 30 seconds or more.  Also beware that
+auto-annotation can produce a lot of output if your program is large!
+
+
+<h3>7.8&nbsp; Annotating assembler programs</h3>
+Valgrind can annotate assembler programs too, or annotate the assembler
+generated for your C program.  Sometimes this is useful for understanding what
+is really happening when an interesting line of C code is translated into
+multiple instructions.<p>
+
+To do this, you just need to assemble your <code>.s</code> files with
+assembler-level debug information.  gcc doesn't do this, but you can use GNU as
+with the <code>--gstabs</code> option to generate object files with this
+information, eg:
+
+<blockquote><code>as --gstabs foo.s</code></blockquote>
+
+You can then profile and annotate source files in the same way as for C/C++
+programs.
+
+
+<h3>7.9&nbsp; vg_annotate options</h3>
+<ul>
+  <li><code>-h, --help</code></li><p>
+  <li><code>-v, --version</code><p>
+
+      Help and version, as usual.</li>
+
+  <li><code>--sort=A,B,C</code> [default: order in 
+      <code>cachegrind.out</code>]<p>
+      Specifies the events upon which the sorting of the function-by-function
+      entries will be based.  Useful if you want to concentrate on eg. I cache
+      misses (<code>--sort=I1mr,I2mr</code>), or D cache misses
+      (<code>--sort=D1mr,D2mr</code>), or L2 misses
+      (<code>--sort=D2mr,I2mr</code>).</li><p>
+
+  <li><code>--show=A,B,C</code> [default: all, using order in
+      <code>cachegrind.out</code>]<p>
+      Specifies which events to show (and the column order). Default is to use
+      all present in the <code>cachegrind.out</code> file (and use the order in
+      the file).</li><p>
+
+  <li><code>--threshold=X</code> [default: 99%] <p>
+      Sets the threshold for the function-by-function summary.  Functions are
+      shown that account for more than X% of all the primary sort events.  If
+      auto-annotating, also affects which files are annotated.</li><p>
+
+  <li><code>--auto=no</code> [default]<br>
+      <code>--auto=yes</code> <p>
+      When enabled, automatically annotates every file that is mentioned in the
+      function-by-function summary that can be found.  Also gives a list of
+      those that couldn't be found.
+
+  <li><code>--context=N</code> [default: 8]<p>
+      Print N lines of context before and after each annotated line.  Avoids
+      printing large sections of source files that were not executed.  Use a 
+      large number (eg. 10,000) to show all source lines.
+      </li><p>
+
+  <li><code>-I=&lt;dir&gt;, --include=&lt;dir&gt;</code> 
+      [default: empty string]<p>
+      Adds a directory to the list in which to search for files.  Multiple
+      -I/--include options can be given to add multiple directories.
+</ul>
+  
+
+<h3>7.10&nbsp; Warnings</h3>
+There are a couple of situations in which vg_annotate issues warnings.
+
+<ul>
+  <li>If a source file is more recent than the <code>cachegrind.out</code>
+      file.  This is because the information in <code>cachegrind.out</code> is
+      only recorded with line numbers, so if the line numbers change at all in
+      the source (eg. lines added, deleted, swapped), any annotations will be 
+      incorrect.<p>
+
+  <li>If information is recorded about line numbers past the end of a file.
+      This can be caused by the above problem, ie. shortening the source file
+      while using an old <code>cachegrind.out</code> file.  If this happens,
+      the figures for the bogus lines are printed anyway (clearly marked as
+      bogus) in case they are important.</li><p>
+</ul>
+
+
+<h3>7.10&nbsp; Things to watch out for</h3>
+Some odd things that can occur during annotation:
+
+<ul>
+  <li>If annotating at the assembler level, you might see something like this:
+
+      <pre>
+      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
+      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
+      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
+      .    .    .  .    .    .  .    .    .          .align 4,0x90
+      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
+      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
+      </pre>
+
+      How can the third instruction be executed twice when the others are
+      executed only once?  As it turns out, it isn't.  Here's a dump of the
+      executable, from objdump:
+
+      <pre>
+      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
+      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
+      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
+      8048f32:       89 f6                   mov    %esi,%esi
+      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
+      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
+      </pre>
+
+      Notice the extra <code>mov %esi,%esi</code> instruction.  Where did this
+      come from?  The GNU assembler inserted it to serve as the two bytes of
+      padding needed to align the <code>movl $.LnrB,%eax</code> instruction on
+      a four-byte boundary, but pretended it didn't exist when adding debug
+      information.  Thus when Valgrind reads the debug info it thinks that the
+      <code>movl $0x1,0xffffffec(%ebp)</code> instruction covers the address
+      range 0x8048f2b--0x804833 by itself, and attributes the counts for the
+      <code>mov %esi,%esi</code> to it.<p>
+  </li>
+
+  <li>
+      Inlined functions can cause strange results in the function-by-function
+      summary.  If a function <code>inline_me()</code> is defined in
+      <code>foo.h</code> and inlined in the functions <code>f1()</code>,
+      <code>f2()</code> and <code>f3()</code> in <code>bar.c</code>, there will
+      not be a <code>foo.h:inline_me()</code> function entry.  Instead, there
+      will be separate function entries for each inlining site, ie.
+      <code>foo.h:f1()</code>, <code>foo.h:f2()</code> and
+      <code>foo.h:f3()</code>.  To find the total counts for
+      <code>foo.h:inline_me()</code>, add up the counts from each entry.<p>
+
+      The reason for this is that although the debug info output by gcc
+      indicates the switch from <code>bar.c</code> to <code>foo.h</code>, it
+      doesn't indicate the name of the function in <code>foo.h</code>, so
+      Valgrind keeps using the old one.<p>
+
+  <li>
+      Sometimes, the same filename might be represented with a relative name
+      and with an absolute name in different parts of the debug info, eg:
+      <code>/home/user/proj/proj.h</code> and <code>../proj.h</code>.  In this
+      case, if you use auto-annotation, the file will be annotated twice with
+      the counts split between the two.<p>
+  </li>
+</ul>
+
+Note: stabs is not an easy format to read.  If you come across bizarre
+annotations that look like might be caused by a bug in the stabs reader,
+please let us know.
+
+
+<h3>7.11&nbsp; Accuracy</h3>
+Valgrind's cache profiling has a number of shortcomings:
+
+<ul>
+  <li>It doesn't account for kernel activity -- the effect of system calls on
+      the cache contents is ignored.</li><p>
+
+  <li>It doesn't account for other process activity (although this is probably
+      desirable when considering a single program).</li><p>
+
+  <li>It doesn't account for virtual-to-physical address mappings;  hence the
+      entire simulation is not a true representation of what's happening in the
+      cache.</li><p>
+
+  <li>It doesn't account for cache misses not visible at the instruction level,
+      eg. those arising from TLB misses, or speculative execution.</li><p>
+</ul>
+
+Another thing worth nothing is that results are very sensitive.  Changing the
+size of the <code>valgrind.so</code> file, the size of the program being
+profiled, or even the length of its name can perturb the results.  Variations
+will be small, but don't expect perfectly repeatable results if your program
+changes at all.<p>
+
+While these factors mean you shouldn't trust the results to be super-accurate,
+hopefully they should be close enough to be useful.<p>
+
+
+<h3>7.12&nbsp; Todo</h3>
+<ul>
+  <li>Use CPUID instruction to auto-identify cache configuration during 
+      installation.  This would save the user from having to know their cache
+      configuration and using vg_cachegen.</li><p>
+  <li>Program start-up/shut-down calls a lot of functions that aren't
+      interesting and just complicate the output.  Would be nice to exclude
+      these somehow.</li><p>
+</ul> 
+<hr width="100%">
 </body>
 </html>
+
diff --git a/coregrind/valgrind.in b/coregrind/valgrind.in
index 7f18890796..b10f48795b 100755
--- a/coregrind/valgrind.in
+++ b/coregrind/valgrind.in
@@ -65,6 +65,8 @@ do
     --workaround-gcc296-bugs=yes)   vgopts="$vgopts $arg"; shift;;
     --freelist-vol=*)       vgopts="$vgopts $arg"; shift;;
     --suppressions=*)       vgopts="$vgopts $arg"; shift;;
+    --cachesim=yes)         vgopts="$vgopts $arg"; shift;;
+    --cachesim=no)          vgopts="$vgopts $arg"; shift;;
 #   options for debugging Valgrind
     --sanity-level=*)       vgopts="$vgopts $arg"; shift;;
     --single-step=yes)      vgopts="$vgopts $arg"; shift;;
@@ -129,6 +131,7 @@ if [ $# = 0 ] || [ z"$dousage" = z1 ]; then
    echo "                              suppressions file <filename>"
    echo "    --check-addrVs=no|yes     experimental lighterweight checking? [yes]"
    echo "                              yes == Valgrind's original behaviour"
+   echo "    --cachesim=no|yes         do cache profiling? [no]"
    echo
    echo "  options for debugging Valgrind itself are:"
    echo "    --sanity-level=<number>   level of sanity checking to do [1]"
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
index b51ef059a8..08a1ab20de 100644
--- a/coregrind/vg_include.h
+++ b/coregrind/vg_include.h
@@ -234,6 +234,8 @@ extern Bool  VG_(clo_optimise);
 extern Bool  VG_(clo_instrument);
 /* DEBUG: clean up instrumented code?  default: YES */
 extern Bool  VG_(clo_cleanup);
+/* Cache simulation instrumentation?  default: NO */
+extern Bool  VG_(clo_cachesim);
 /* SMC write checks?  default: SOME (1,2,4 byte movs to mem) */
 extern Int   VG_(clo_smc_check);
 /* DEBUG: print system calls?  default: NO */
@@ -287,7 +289,7 @@ extern void VG_(shutdown_logging) ( void );
 
 #define VGP_M_STACK 10
 
-#define VGP_M_CCS 20  /* == the # of elems in VGP_LIST */
+#define VGP_M_CCS 24  /* == the # of elems in VGP_LIST */
 #define VGP_LIST \
    VGP_PAIR(VgpRun=0,      "running"),                \
    VGP_PAIR(VgpMalloc,     "low-lev malloc/free"),    \
@@ -307,6 +309,10 @@ extern void VG_(shutdown_logging) ( void );
    VGP_PAIR(VgpAddToT,     "add-to-transtab"),        \
    VGP_PAIR(VgpSARP,       "set-addr-range-perms"),   \
    VGP_PAIR(VgpSyscall,    "syscall wrapper"),        \
+   VGP_PAIR(VgpCacheInstrument, "cache instrument"),  \
+   VGP_PAIR(VgpCacheGetBBCC,"cache get BBCC"),        \
+   VGP_PAIR(VgpCacheSimulate, "cache simulate"),      \
+   VGP_PAIR(VgpCacheDump,  "cache stats dump"),       \
    VGP_PAIR(VgpSpare1,     "spare 1"),                \
    VGP_PAIR(VgpSpare2,     "spare 2")
 
@@ -718,8 +724,10 @@ extern void VG_(assert_fail) ( Char* expr, Char* file,
                                Int line, Char* fn )
             __attribute__ ((__noreturn__));
 
-/* Reading files. */
+/* Reading and writing files. */
 extern Int  VG_(open_read) ( Char* pathname );
+extern Int  VG_(open_write)       ( Char* pathname );
+extern Int  VG_(create_and_write) ( Char* pathname );
 extern void VG_(close)     ( Int fd );
 extern Int  VG_(read)      ( Int fd, void* buf, Int count);
 extern Int  VG_(write)     ( Int fd, void* buf, Int count);
@@ -955,7 +963,9 @@ typedef
       UChar   tag3:4;     /* third  operand tag */
       UChar   extra4b:4;  /* Spare field, used by WIDEN for src
                              -size, and by LEA2 for scale 
-                             (1,2,4 or 8) */
+                             (1,2,4 or 8), and by unconditional JMPs for
+                             orig x86 instr size if --cachesim=yes */
+
 
       /* word 5 */
       UChar   cond;            /* condition, for jumps */
@@ -1044,6 +1054,10 @@ extern Bool VG_(anyFlagUse) ( UInstr* u );
 extern void  VG_(ppUInstr)        ( Int instrNo, UInstr* u );
 extern void  VG_(ppUCodeBlock)    ( UCodeBlock* cb, Char* title );
 
+extern UCodeBlock* VG_(allocCodeBlock) ( void );
+extern void  VG_(freeCodeBlock)        ( UCodeBlock* cb );
+extern void  VG_(copyUInstr)                ( UCodeBlock* cb, UInstr* instr );
+
 extern Char* VG_(nameCondcode)    ( Condcode cond );
 extern Bool  VG_(saneUInstr)      ( Bool beforeRA, UInstr* u );
 extern Bool  VG_(saneUCodeBlock)  ( UCodeBlock* cb );
@@ -1184,6 +1198,11 @@ extern void VG_(what_obj_and_fun_is_this)
                                      ( Addr a,
                                        Char* obj_buf, Int n_obj_buf,
                                        Char* fun_buf, Int n_fun_buf );
+extern Bool VG_(what_line_is_this) ( Addr a,
+                                     UChar* filename, Int n_filename,
+                                     UInt* lineno );
+extern Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a,
+                                     Char* fn_name, Int n_fn_name);
 
 extern void VG_(symtab_notify_munmap) ( Addr start, UInt length );
 
@@ -1590,6 +1609,20 @@ extern void VG_(helper_value_check0_fail);
 extern void VG_(signalreturn_bogusRA)( void );
 extern void VG_(pthreadreturn_bogusRA)( void );
 
+/* ---------------------------------------------------------------------
+   Exports of vg_cachesim.c
+   ------------------------------------------------------------------ */
+
+extern UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr);
+
+typedef struct  _iCC  iCC;
+typedef struct _idCC idCC;
+
+extern void VG_(init_cachesim)        ( void );
+extern void VG_(show_cachesim_results)( Int client_argc, Char** client_argv );
+
+extern void VG_(cachesim_log_non_mem_instr)(  iCC* cc );
+extern void VG_(cachesim_log_mem_instr)    ( idCC* cc, Addr data_addr );
 
 /* ---------------------------------------------------------------------
    The state of the simulated CPU.
@@ -1724,7 +1757,8 @@ extern Int VGOFF_(handle_esp_assignment); /* :: Addr -> void */
 extern Int VGOFF_(fpu_write_check);       /* :: Addr -> Int -> void */
 extern Int VGOFF_(fpu_read_check);        /* :: Addr -> Int -> void */
 
-
+extern Int VGOFF_(cachesim_log_non_mem_instr);
+extern Int VGOFF_(cachesim_log_mem_instr);
 
 #endif /* ndef __VG_INCLUDE_H */
 
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index 5f7fe59c34..ee875e226b 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -103,7 +103,8 @@ Int VGOFF_(helperc_STOREV1) = INVALID_OFFSET;
 Int VGOFF_(handle_esp_assignment) = INVALID_OFFSET;
 Int VGOFF_(fpu_write_check) = INVALID_OFFSET;
 Int VGOFF_(fpu_read_check) = INVALID_OFFSET;
-
+Int VGOFF_(cachesim_log_non_mem_instr) = INVALID_OFFSET;
+Int VGOFF_(cachesim_log_mem_instr)     = INVALID_OFFSET;
 
 /* This is the actual defn of baseblock. */
 UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS];
@@ -165,6 +166,13 @@ static void vg_init_baseBlock ( void )
    /* 16  */ VGOFF_(sh_edi)    = alloc_BaB(1);
    /* 17  */ VGOFF_(sh_eflags) = alloc_BaB(1);
 
+   /* 17a */ 
+   VGOFF_(cachesim_log_non_mem_instr)  
+      = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_non_mem_instr) );
+   /* 17b */ 
+   VGOFF_(cachesim_log_mem_instr)  
+      = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_mem_instr) );
+
    /* 18  */ 
    VGOFF_(helper_value_check4_fail) 
       = alloc_BaB_1_set( (Addr) & VG_(helper_value_check4_fail) );
@@ -419,6 +427,7 @@ Bool   VG_(clo_single_step);
 Bool   VG_(clo_optimise);
 Bool   VG_(clo_instrument);
 Bool   VG_(clo_cleanup);
+Bool   VG_(clo_cachesim);
 Int    VG_(clo_smc_check);
 Bool   VG_(clo_trace_syscalls);
 Bool   VG_(clo_trace_signals);
@@ -745,6 +754,11 @@ static void process_cmd_line_options ( void )
       else if (STREQ(argv[i], "--cleanup=no"))
          VG_(clo_cleanup) = False;
 
+      else if (STREQ(argv[i], "--cachesim=yes"))
+         VG_(clo_cachesim) = True;     
+      else if (STREQ(argv[i], "--cachesim=no"))
+         VG_(clo_cachesim) = False;
+
       else if (STREQ(argv[i], "--smc-check=none"))
          VG_(clo_smc_check) = VG_CLO_SMC_NONE;
       else if (STREQ(argv[i], "--smc-check=some"))
@@ -821,6 +835,11 @@ static void process_cmd_line_options ( void )
 
    VG_(clo_logfile_fd) = eventually_logfile_fd;
 
+   /* Don't do memory checking if simulating the cache. */
+   if (VG_(clo_cachesim)) {
+       VG_(clo_instrument) = False;
+   }
+
    if (VG_(clo_verbosity > 0))
       VG_(message)(Vg_UserMsg, 
                    "valgrind-%s, a memory error detector for x86 GNU/Linux.",
@@ -978,7 +997,7 @@ void VG_(main) ( void )
       attach GDB in another shell. */
    /* {extern unsigned int sleep(unsigned int seconds); sleep(10);} */
 
-   if (VG_(clo_instrument)) {
+   if (VG_(clo_instrument) || VG_(clo_cachesim)) {
       VGP_PUSHCC(VgpInitAudit);
       VGM_(init_memory_audit)();
       VGP_POPCC;
@@ -1012,6 +1031,9 @@ void VG_(main) ( void )
       VGM_(make_readable) ( (Addr)&VG_(clo_sloppy_malloc), 1 );
    }
 
+   if (VG_(clo_cachesim)) 
+      VG_(init_cachesim)();
+
    if (VG_(clo_verbosity) > 0)
       VG_(message)(Vg_UserMsg, "");
 
@@ -1039,6 +1061,9 @@ void VG_(main) ( void )
    }
    VG_(running_on_simd_CPU) = False;
 
+   if (VG_(clo_cachesim))
+      VG_(show_cachesim_results)(VG_(client_argc), VG_(client_argv));
+
    VG_(do_sanity_checks)( 1 /* root thread */, 
                           True /*include expensive checks*/ );
 
diff --git a/coregrind/vg_mylibc.c b/coregrind/vg_mylibc.c
index 740b21e3d1..77dd153a2e 100644
--- a/coregrind/vg_mylibc.c
+++ b/coregrind/vg_mylibc.c
@@ -885,7 +885,49 @@ Int VG_(open_read) ( Char* pathname )
    if (VG_(is_kerror)(fd)) fd = -1;
    return fd;
 }
+
+/* Returns -1 on failure. */
+static Int VG_(chmod_u_rw) ( Int fd )
+{
+   Int res;
+   const int O_IRUSR_IWUSR = 000600; /* See /usr/include/cpio.h */
+   res = vg_do_syscall2(__NR_fchmod, fd, O_IRUSR_IWUSR);
+   if (VG_(is_kerror)(res)) res = -1;
+   return res;
+}
+ 
+/* Returns -1 on failure. */
+Int VG_(create_and_write) ( Char* pathname )
+{
+   Int fd;
+
+   const int O_CR_AND_WR_ONLY = 0101; /* See /usr/include/bits/fcntl.h */
+   fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_CR_AND_WR_ONLY, 0);
+   /* VG_(printf)("result = %d\n", fd); */
+   if (VG_(is_kerror)(fd)) {
+      fd = -1;
+   } else {
+      VG_(chmod_u_rw)(fd);
+      if (VG_(is_kerror)(fd)) {
+         fd = -1;
+      }
+   }
+   return fd;
+}
  
+/* Returns -1 on failure. */
+Int VG_(open_write) ( Char* pathname )
+{  
+   Int fd;
+
+   const int O_WRONLY_AND_TRUNC = 01001; /* See /usr/include/bits/fcntl.h */
+   fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_WRONLY_AND_TRUNC, 0);
+   /* VG_(printf)("result = %d\n", fd); */
+   if (VG_(is_kerror)(fd)) {
+      fd = -1;
+   } 
+   return fd;
+}
 
 void VG_(close) ( Int fd )
 {
diff --git a/coregrind/vg_symtab2.c b/coregrind/vg_symtab2.c
index 74cada4e82..d9da8fff45 100644
--- a/coregrind/vg_symtab2.c
+++ b/coregrind/vg_symtab2.c
@@ -1,4 +1,3 @@
-
 /*--------------------------------------------------------------------*/
 /*--- Management of symbols and debugging information.             ---*/
 /*---                                                 vg_symtab2.c ---*/
@@ -50,6 +49,23 @@
 /*--- Structs n stuff                                      ---*/
 /*------------------------------------------------------------*/
 
+/* Stabs entry types, from:
+ *   The "stabs" debug format
+ *   Menapace, Kingdon and MacKenzie
+ *   Cygnus Support
+ */
+typedef enum { N_FUN   = 36,    /* Function start or end            */
+               N_STSYM = 38,    /* Data segment file-scope variable */
+               N_LCSYM = 40,    /* BSS segment file-scope variable  */
+               N_RSYM  = 64,    /* Register variable                */
+               N_SLINE = 68,    /* Source line number               */
+               N_SO    = 100,   /* Source file path and name        */
+               N_LSYM  = 128,   /* Stack variable or type           */
+               N_SOL   = 132,   /* Include file name                */
+               N_LBRAC = 192,   /* Start of lexical block           */
+               N_RBRAC = 224    /* End   of lexical block           */
+             } stab_types;
+      
 /* A structure to hold an ELF symbol (very crudely). */
 typedef 
    struct { 
@@ -535,12 +551,7 @@ void addLineInfo ( SegInfo* si,
 {
    RiLoc loc;
    UInt size = end - start + 1;
-#  if 0
-   if (size > 10000)
-   VG_(printf)( "line %4d: %p .. %p, in %s\n",
-                lineno, start, end, 
-                &si->strtab[fnmoff] );
-#  endif
+
    /* Sanity ... */
    if (size > 10000) return;
 
@@ -577,8 +588,8 @@ void vg_read_lib_symbols ( SegInfo* si )
    /* for the .stabs reader */
    Int    curr_filenmoff;
    Addr   curr_fnbaseaddr;
-   Addr   range_startAddr;
-   Int    range_lineno;
+   Char*  curr_file_name;
+   Int    n_stab_entries;
 
    oimage = (Addr)NULL;
    if (VG_(clo_verbosity) > 1)
@@ -880,25 +891,23 @@ void vg_read_lib_symbols ( SegInfo* si )
    /* Ok.  It all looks plausible.  Go on and read debug data. 
          stab kinds: 100   N_SO     a source file name
                       68   N_SLINE  a source line number
-                      36   N_FUN ?  start of a function
-
-      In this loop, we maintain a current file name, updated
-      as N_SOs appear, and a current function base address,
-      updated as N_FUNs appear.  Based on that, address ranges
-      for N_SLINEs are calculated, and stuffed into the 
-      line info table.
-
-      N_SLINE indicates the start of a source line.  Functions are
-      delimited by N_FUNS, at the start with a non-empty string and at
-      the end with an empty string.  The latter facilitates detecting
-      where to close the last N_SLINE for a function. 
+                      36   N_FUN    start of a function
+
+      In this loop, we maintain a current file name, updated as 
+      N_SO/N_SOLs appear, and a current function base address, 
+      updated as N_FUNs appear.  Based on that, address ranges for 
+      N_SLINEs are calculated, and stuffed into the line info table.
+
+      Finding the instruction address range covered by an N_SLINE is
+      complicated;  see the N_SLINE case below.
    */
    curr_filenmoff  = addStr(si,"???");
    curr_fnbaseaddr = (Addr)NULL;
-   range_startAddr = 0;
-   range_lineno    = 0;
+   curr_file_name  = (Char*)NULL;
 
-   for (i = 0; i < stab_sz/(int)sizeof(struct nlist); i++) {
+   n_stab_entries = stab_sz/(int)sizeof(struct nlist);
+
+   for (i = 0; i < n_stab_entries; i++) {
 #     if 0
       VG_(printf) ( "   %2d  ", i );
       VG_(printf) ( "type=0x%x   othr=%d   desc=%d   value=0x%x   strx=%d  %s",
@@ -910,61 +919,95 @@ void vg_read_lib_symbols ( SegInfo* si )
 #     endif
 
       switch (stab[i].n_type) {
-
-         case 68: { /* N_SLINE */
-            /* flush the current line, if any, and start a new one */
-            Addr range_endAddr 
-               = curr_fnbaseaddr 
-                    + (UInt)stab[i].n_value - 1;
-            if (range_startAddr != 0) {
-               addLineInfo ( si,
-                             curr_filenmoff,
-                             range_startAddr,
-                             range_endAddr,
-                             range_lineno );
+         UInt next_addr;
+
+         /* To compute the instr address range covered by a single line, find
+          * the address of the next thing and compute the difference.  The
+          * approach used depends on what kind of entry/entries follow... */
+         case N_SLINE: {
+            Int lineno = stab[i].n_desc;              
+            Int this_addr = (UInt)stab[i].n_value;
+
+            LOOP:
+            vg_assert(i+1 < n_stab_entries);    /* Haven't reached end */
+            switch (stab[i+1].n_type) {
+               /* Easy, common case: use address of next entry */
+               case N_SLINE: case N_SO:
+                  next_addr = (UInt)stab[i+1].n_value;
+                  break;
+
+               /* Boring one: skip, look for something more useful. */
+               case N_RSYM: case N_LSYM: case N_LBRAC: case N_RBRAC: 
+               case N_STSYM: case N_LCSYM:
+                  i++;
+                  goto LOOP;
+                  
+               /* Should be an end of fun entry, use its address */
+               case N_FUN: 
+                  if ('\0' == * (stabstr + stab[i+1].n_un.n_strx) ) {
+                     next_addr = (UInt)stab[i+1].n_value;
+                  } else {
+                     VG_(printf)("unhandled stabs case: N_FUN start %d %s\n",
+                                i, (stabstr + stab[i+1].n_un.n_strx) );
+                     VG_(panic)("argh");
+                  }
+                  break;
+
+               /* N_SOL should be followed by an N_SLINE which can be used */
+               case N_SOL:
+                  if (i+2 < n_stab_entries && N_SLINE == stab[i+2].n_type) {
+                     next_addr = (UInt)stab[i+2].n_value;
+                     break;
+                  } else {
+                     VG_(printf)("unhandled N_SOL stabs case: %d %d %d", 
+                                 stab[i+1].n_type, i, n_stab_entries);
+                     VG_(panic)("argh");
+                  }
+
+               default:
+                  VG_(printf)("unhandled stabs case: %d %d", 
+                              stab[i+1].n_type,i);
+                  VG_(panic)("argh");
             }
-            range_startAddr = range_endAddr + 1;
-            range_lineno = stab[i].n_desc;              
+            
+            //Int offset2 = (i+1 < n_stab_entries && 68 == stab[i+1].n_type
+            //              ? (UInt)stab[i+1].n_value - 1
+            //              : offset + 1);
+            //if (i+1 < n_stab_entries) {
+            //    int x;
+            //    if (68 != (x = stab[i+1].n_type)) {
+            //        VG_(printf)("%d  ", x);
+            //    }
+            //}
+
+            addLineInfo ( si, curr_filenmoff, curr_fnbaseaddr + this_addr, 
+                          curr_fnbaseaddr + next_addr - 1, lineno );
             break;
          }
 
-         case 36: { /* N_FUN */
-            if ('\0' == * (stabstr + stab[i].n_un.n_strx) ) {
-               /* N_FUN with no name -- indicates the end of a fn.
-                  Flush the current line, if any, but don't start a
-                  new one. */
-               Addr range_endAddr 
-                  = curr_fnbaseaddr 
-                       + (UInt)stab[i].n_value - 1;
-               if (range_startAddr != 0) {
-                  addLineInfo ( si,
-                                curr_filenmoff,
-                                range_startAddr,
-                                range_endAddr,
-                                range_lineno );
-               }
-               range_startAddr = 0;
-            } else {
+         case N_FUN: {
+            if ('\0' != (stabstr + stab[i].n_un.n_strx)[0] ) {
                /* N_FUN with a name -- indicates the start of a fn.  */
-               curr_fnbaseaddr = si->offset
-                                 + (Addr)stab[i].n_value;
-               range_startAddr = curr_fnbaseaddr;
+               curr_fnbaseaddr = si->offset + (Addr)stab[i].n_value;
             }
             break;
          }
 
-         case 100: /* N_SO */
-         case 132: /* N_SOL */
+         case N_SO: case N_SOL:
          /* seems to give lots of locations in header files */
          /* case 130: */ /* BINCL */
          { 
             UChar* nm = stabstr + stab[i].n_un.n_strx;
             UInt len = VG_(strlen)(nm);
-            if (len > 0 && nm[len-1] != '/')
+            
+            if (len > 0 && nm[len-1] != '/') {
                curr_filenmoff = addStr ( si, nm );
+               curr_file_name = stabstr + stab[i].n_un.n_strx;
+            }
             else
                if (len == 0)
                   curr_filenmoff = addStr ( si, "?1\0" );
+
             break;
          }
 
@@ -1071,8 +1114,8 @@ void read_symtab_callback (
    which happen to correspond to the munmap()d area.  */
 void VG_(read_symbols) ( void )
 {
-   if (! VG_(clo_instrument)) 
-     return;
+   if (! VG_(clo_instrument) && ! VG_(clo_cachesim)) 
+      return;
 
    VG_(read_procselfmaps) ( read_symtab_callback );
 
@@ -1251,9 +1294,8 @@ static void search_all_loctabs ( Addr ptr, SegInfo** psi, Int* locno )
    Caller supplies buf and nbuf.  If no_demangle is True, don't do
    demangling, regardless of vg_clo_demangle -- probably because the
    call has come from vg_what_fn_or_object_is_this. */
-static
-Bool vg_what_fn_is_this ( Bool no_demangle, Addr a, 
-                          Char* buf, Int nbuf )
+Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a, 
+                            Char* buf, Int nbuf )
 {
    SegInfo* si;
    Int      sno;
@@ -1297,17 +1339,16 @@ void VG_(what_obj_and_fun_is_this) ( Addr a,
                                      Char* fun_buf, Int n_fun_buf )
 {
    (void)vg_what_object_is_this ( a, obj_buf, n_obj_buf );
-   (void)vg_what_fn_is_this ( True, a, fun_buf, n_fun_buf );
+   (void)VG_(what_fn_is_this) ( True, a, fun_buf, n_fun_buf );
 }
 
 
 /* Map a code address to a (filename, line number) pair.  
    Returns True if successful.
 */
-static
-Bool vg_what_line_is_this ( Addr a, 
-                            UChar* filename, Int n_filename, 
-                            UInt* lineno )
+Bool VG_(what_line_is_this)( Addr a, 
+                             UChar* filename, Int n_filename, 
+                             UInt* lineno )
 {
    SegInfo* si;
    Int      locno;
@@ -1317,6 +1358,7 @@ Bool vg_what_line_is_this ( Addr a,
    VG_(strncpy_safely)(filename, & si->strtab[si->loctab[locno].fnmoff], 
                        n_filename);
    *lineno = si->loctab[locno].lineno;
+
    return True;
 }
 
@@ -1348,11 +1390,11 @@ void VG_(mini_stack_dump) ( ExeContext* ec )
 
    n = 0;
 
-   know_fnname  = vg_what_fn_is_this(False,ec->eips[0], buf_fn, M_VG_ERRTXT);
+   know_fnname  = VG_(what_fn_is_this)(False,ec->eips[0], buf_fn, M_VG_ERRTXT);
    know_objname = vg_what_object_is_this(ec->eips[0], buf_obj, M_VG_ERRTXT);
-   know_srcloc  = vg_what_line_is_this(ec->eips[0], 
-                                       buf_srcloc, M_VG_ERRTXT, 
-                                       &lineno);
+   know_srcloc  = VG_(what_line_is_this)(ec->eips[0], 
+                                         buf_srcloc, M_VG_ERRTXT, 
+                                         &lineno);
 
    APPEND("   at ");
    VG_(sprintf)(ibuf,"0x%x: ", ec->eips[0]);
@@ -1383,9 +1425,9 @@ void VG_(mini_stack_dump) ( ExeContext* ec )
 
    clueless = 0;
    for (i = 1; i < stop_at; i++) {
-      know_fnname  = vg_what_fn_is_this(False,ec->eips[i], buf_fn, M_VG_ERRTXT);
+      know_fnname  = VG_(what_fn_is_this)(False,ec->eips[i], buf_fn, M_VG_ERRTXT);
       know_objname = vg_what_object_is_this(ec->eips[i],buf_obj, M_VG_ERRTXT);
-      know_srcloc  = vg_what_line_is_this(ec->eips[i], 
+      know_srcloc  = VG_(what_line_is_this)(ec->eips[i], 
                                           buf_srcloc, M_VG_ERRTXT, 
                                           &lineno);
       n = 0;
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index f06689d956..691ef26d67 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -4437,43 +4437,76 @@ Int VG_(disBB) ( UCodeBlock* cb, Addr eip0 )
    Addr eip   = eip0;
    Bool isEnd = False;
    Bool block_sane;
+   Int INCEIP_allowed_lag = 4;
+   Int delta = 0;
+
    if (dis) VG_(printf)("\n");
 
+   /* When cache simulating, to ensure cache misses are attributed to the
+    * correct line we ensure EIP is always correct.   This is done by:
+    *
+    * a) Using eager INCEIP updating to cope with all instructions except those
+    *    at the end of a basic block.
+    *
+    * b) Patching in the size of the original x86 instr in the `extra4b' field
+    *    of JMPs at the end of a basic block.  Two cases:
+    *       - Jcond followed by Juncond:  patch the Jcond
+    *       - Juncond alone:              patch the Juncond
+    *
+    * See vg_cachesim_instrument() for how this is used. 
+    */
+   if (VG_(clo_cachesim)) {
+       INCEIP_allowed_lag = 0;
+   }
+
    if (VG_(clo_single_step)) {
       eip = disInstr ( cb, eip, &isEnd );
-      uInstr1(cb, JMP, 0, Literal, 0);
-      uLiteral(cb, eip);
-      uCond(cb, CondAlways);
-      if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
+
+      /* Add a JMP to the next (single x86 instruction) BB if it doesn't
+       * already end with a JMP instr. We also need to check for no UCode,
+       * which occurs if the x86 instr was a nop */
+      if (cb->used == 0 || LAST_UINSTR(cb).opcode != JMP) {
+         uInstr1(cb, JMP, 0, Literal, 0);
+         uLiteral(cb, eip);
+         uCond(cb, CondAlways);
+         if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
+      }
+      delta = eip - eip0;
+
    } else {
-      Int delta = 0;
       Addr eip2;
-      while (True) {
-         if (isEnd) break;
+      while (!isEnd) {
          eip2 = disInstr ( cb, eip, &isEnd );
          delta += (eip2 - eip);
          eip = eip2;
-         if (delta > 4 && !isEnd) {
-            uInstr1(cb, INCEIP, 0, Lit16, delta);
-            if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
-            delta = 0;
-         }
          /* Split up giant basic blocks into pieces, so the
             translations fall within 64k. */
-         if (eip - eip0 > 2000) {
+         if (eip - eip0 > 2000 && !isEnd) {
             if (VG_(clo_verbosity) > 0)
-               VG_(message)(Vg_DebugMsg, 
+               VG_(message)(Vg_DebugMsg,
                   "Warning: splitting giant basic block into pieces");
             uInstr1(cb, JMP, 0, Literal, 0);
             uLiteral(cb, eip);
             uCond(cb, CondAlways);
             if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
-            if (dis) VG_(printf)("\n");
-            break;
+            isEnd = True;
+
+         } else if (delta > INCEIP_allowed_lag && !isEnd) {
+            uInstr1(cb, INCEIP, 0, Lit16, delta);
+            if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
+            delta = 0;
          }
          if (dis) VG_(printf)("\n");
       }
    }
+   if (VG_(clo_cachesim)) {
+      /* Patch instruction size into earliest JMP. */
+      if (cb->used >= 2 && JMP == cb->instrs[cb->used - 2].opcode) {
+         cb->instrs[cb->used - 2].extra4b = delta;
+      } else {
+         LAST_UINSTR(cb).extra4b = delta;
+      }
+   }
 
    block_sane = VG_(saneUCodeBlock)(cb);
    if (!block_sane) {
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 73cad5e9bc..76c6ef8a9b 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -104,7 +104,7 @@ void VG_(jitfree) ( void* ptr )
 /*--- Basics                                               ---*/
 /*------------------------------------------------------------*/
 
-static UCodeBlock* allocCodeBlock ( void )
+UCodeBlock* VG_(allocCodeBlock) ( void )
 {
    UCodeBlock* cb = VG_(malloc)(VG_AR_PRIVATE, sizeof(UCodeBlock));
    cb->used = cb->size = cb->nextTemp = 0;
@@ -113,7 +113,7 @@ static UCodeBlock* allocCodeBlock ( void )
 }
 
 
-static void freeCodeBlock ( UCodeBlock* cb )
+void VG_(freeCodeBlock) ( UCodeBlock* cb )
 {
    if (cb->instrs) VG_(free)(VG_AR_PRIVATE, cb->instrs);
    VG_(free)(VG_AR_PRIVATE, cb);
@@ -239,17 +239,15 @@ void VG_(newUInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz )
    ui->size   = sz;
 }
 
-
 /* Copy an instruction into the given codeblock. */
-static __inline__ 
-void copyUInstr ( UCodeBlock* cb, UInstr* instr )
+__inline__ 
+void VG_(copyUInstr) ( UCodeBlock* cb, UInstr* instr )
 {
    ensureUInstr(cb);
    cb->instrs[cb->used] = *instr;
    cb->used++;
 }
 
-
 /* Copy auxiliary info from one uinstr to another. */
 static __inline__ 
 void copyAuxInfoFromTo ( UInstr* src, UInstr* dst )
@@ -1654,7 +1652,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
 
    /* Resulting code goes here.  We generate it all in a forwards
       pass. */
-   c2 = allocCodeBlock();
+   c2 = VG_(allocCodeBlock)();
 
    /* At the start, no TempRegs are assigned to any real register.
       Correspondingly, all temps claim to be currently resident in
@@ -1841,7 +1839,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
       for (j = 0; j < k; j++)
          tempUse[j].realNo 
             = VG_(rankToRealRegNo)(temp_info[tempUse[j].tempNo].real_no);
-      copyUInstr(c2, &c1->instrs[i]);
+      VG_(copyUInstr)(c2, &c1->instrs[i]);
       patchUInstr(&LAST_UINSTR(c2), &tempUse[0], k);
 
       if (VG_(disassemble)) {
@@ -1853,7 +1851,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
    if (temp_info != NULL)
       VG_(jitfree)(temp_info);
 
-   freeCodeBlock(c1);
+   VG_(freeCodeBlock)(c1);
 
    if (spill_reqd) 
       VG_(translations_needing_spill)++;
@@ -2099,7 +2097,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
    Int         i, j;
    UInstr*     u_in;
    Int         qs, qd, qt, qtt;
-   cb = allocCodeBlock();
+   cb = VG_(allocCodeBlock)();
    cb->nextTemp = cb_in->nextTemp;
 
    for (i = 0; i < cb_in->used; i++) {
@@ -2115,7 +2113,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             break;
 
          case INCEIP:
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Loads and stores.  Test the V bits for the address.  24
@@ -2136,7 +2134,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             uInstr2(cb, LOADV, u_in->size, 
                         TempReg, u_in->val1,
                         TempReg, SHADOW(u_in->val2));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          case STORE:
             if (VG_(clo_check_addrVs)) {
@@ -2146,7 +2144,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             uInstr2(cb, STOREV, u_in->size,
                         TempReg, SHADOW(u_in->val1), 
                         TempReg, u_in->val2);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Moving stuff around.  Make the V bits follow accordingly,
@@ -2156,24 +2154,24 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             uInstr2(cb, GETV, u_in->size,
                         ArchReg, u_in->val1,
                         TempReg, SHADOW(u_in->val2));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          case PUT:
             uInstr2(cb, PUTV, u_in->size, 
                         TempReg, SHADOW(u_in->val1),
                         ArchReg, u_in->val2);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          case GETF:
             /* This is not the smartest way to do it, but should work. */
             qd = create_GETVF(cb, u_in->size);
             uInstr2(cb, MOV, 4, TempReg, qd, TempReg, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          case PUTF:
             create_PUTVF(cb, u_in->size, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          case MOV:
@@ -2190,7 +2188,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
                default: 
                   VG_(panic)("vg_instrument: MOV");
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Special case of add, where one of the operands is a literal.
@@ -2203,7 +2201,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             qd = SHADOW(u_in->val2);
             uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qd);
             create_Left(cb, u_in->size, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Another form of add.  
@@ -2234,7 +2232,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             }
             create_UifU(cb, 4, qs, qd);
             create_Left(cb, u_in->size, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          }
 
@@ -2244,7 +2242,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             create_Left(cb, u_in->size, qd);
             if (u_in->flags_w != FlagsEmpty)
                create_PUTVF(cb, u_in->size, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* This is a HACK (approximation :-) */
@@ -2279,7 +2277,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             create_UifU(cb, 0, qs, qd);
             create_PUTVF(cb, 0, qd);
             create_PCast(cb, 0, u_in->size, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* for OP in shl shr sar rol ror
@@ -2329,7 +2327,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qt);
             create_PCast(cb, 1, u_in->size, qt);
             create_UifU(cb, u_in->size, qt, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          }
 
@@ -2338,20 +2336,20 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             vg_assert(u_in->tag1 == TempReg);
             create_Widen(cb, u_in->signed_widen, u_in->extra4b, u_in->size, 
                              SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* not#(x) = x (since bitwise independent) */
          case NOT:
             vg_assert(u_in->tag1 == TempReg);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* neg#(x) = left(x) (derivable from case for SUB) */
          case NEG:
             vg_assert(u_in->tag1 == TempReg);
             create_Left(cb, u_in->size, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* bswap#(x) = bswap(x) */
@@ -2360,7 +2358,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             vg_assert(u_in->size == 4);
             qd = SHADOW(u_in->val1);
             uInstr1(cb, BSWAP, 4, TempReg, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* cc2val#(qd) = pcast-0-to-size(eflags#) */
@@ -2369,7 +2367,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             vg_assert(u_in->flags_r != FlagsEmpty);
             qt = create_GETVF(cb, u_in->size);
             uInstr2(cb, MOV, 4, TempReg, qt, TempReg, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* cmov#(qs,qd) = cmov(qs,qd)
@@ -2395,7 +2393,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             LAST_UINSTR(cb).cond    = u_in->cond;
             LAST_UINSTR(cb).flags_r = u_in->flags_r;
 
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* add#/sub#(qs,qd) 
@@ -2421,7 +2419,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             if (u_in->flags_w != FlagsEmpty) {
                create_PUTVF(cb, u_in->size, qd);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* xor#(qs,qd) = qs `UifU` qd */
@@ -2432,7 +2430,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             if (u_in->flags_w != FlagsEmpty) {
                create_PUTVF(cb, u_in->size, qd);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* and#/or#(qs,qd) 
@@ -2476,7 +2474,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             if (u_in->flags_w != FlagsEmpty) {
                create_PUTVF(cb, u_in->size, qd);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Machinery to do with supporting CALLM.  Copy the start and
@@ -2484,7 +2482,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             (debug); they generate no code and have no effect. 
          */
          case CALLM_S: case CALLM_E:
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Copy PUSH and POP verbatim.  Arg/result absval
@@ -2493,7 +2491,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             needs to be copied.  
          */
          case PUSH: case POP: case CLEAR:
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* In short:
@@ -2562,7 +2560,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
                   ... */
                uInstr1(cb, SETV, 0, TempReg, qt);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          }
          /* Whew ... */
@@ -2582,13 +2580,13 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
                   ... */
                uInstr1(cb, SETV, 0, TempReg, qt);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          case JIFZ:
             uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1));
             uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Emit a check on the address used.  For FPU_R, the value
@@ -2598,12 +2596,12 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             vg_assert(u_in->tag2 == TempReg);
             uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val2));
             uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val2));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* For FPU insns not referencing memory, just copy thru. */
          case FPU: 
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          default:
@@ -2614,7 +2612,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
 
    } /* end of for loop */
 
-   freeCodeBlock(cb_in);
+   VG_(freeCodeBlock)(cb_in);
    return cb;
 }
 
@@ -3083,7 +3081,7 @@ void VG_(translate) ( ThreadState* tst,
               VG_(overall_in_osize), VG_(overall_in_tsize),
               orig_addr );
    }
-   cb = allocCodeBlock();
+   cb = VG_(allocCodeBlock)();
 
    /* Disassemble this basic block into cb. */
    VGP_PUSHCC(VgpToUCode);
@@ -3117,6 +3115,19 @@ void VG_(translate) ( ThreadState* tst,
       }
    }
 
+   //VG_(disassemble) = True;
+
+   /* Add cache simulation code. */
+   if (VG_(clo_cachesim)) {
+      VGP_PUSHCC(VgpCacheInstrument);
+      cb = VG_(cachesim_instrument)(cb, orig_addr);
+      VGP_POPCC;
+      if (VG_(disassemble)) 
+         VG_(ppUCodeBlock) ( cb, "Cachesim instrumented code:" );
+   }
+   
+   //VG_(disassemble) = False;
+   
    /* Allocate registers. */
    VGP_PUSHCC(VgpRegAlloc);
    cb = vg_do_register_allocation ( cb );
@@ -3132,7 +3143,7 @@ void VG_(translate) ( ThreadState* tst,
       and so must be VG_(jitfree)'d. */
    final_code = VG_(emit_code)(cb, &final_code_size );
    VGP_POPCC;
-   freeCodeBlock(cb);
+   VG_(freeCodeBlock)(cb);
 
    if (debugging_translation) {
       /* Only done for debugging -- throw away final result. */
diff --git a/docs/manual.html b/docs/manual.html
index c441db1620..a97c2f9fec 100644
--- a/docs/manual.html
+++ b/docs/manual.html
@@ -78,7 +78,9 @@ Linux-x86 executables.
 
 <h4>6&nbsp; <a href="#example">An example</a></h4>
 
-<h4>7&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
+<h4>7&nbsp; <a href="#cache">Cache profiling</a></h4>
+
+<h4>8&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
 
 <hr width="100%">
 
@@ -515,6 +517,11 @@ follows:
       buggy, so you may need to issue this flag if you use 3.0.4.
       </li><br><p>
 
+  <li><code>--cachesim=no</code> [default]<br>
+      <code>--cachesim=yes</code>
+      <p>When enabled, turns off memory checking, and turns on cache profiling.
+      Cache profiling is described in detail in <a href="#cache">Section 7</a>.
+      </li><p>
 </ul>
 
 There are also some options for debugging Valgrind itself.  You
@@ -1763,5 +1770,632 @@ sewardj@phoenix:~/newmat10$
 <p>The GCC folks fixed this about a week before gcc-3.0 shipped.
 <hr width="100%">
 <p>
+
+
+
+<a name="cache"></a>
+<h2>7&nbsp; Cache profiling</h2>
+As well as memory debugging, Valgrind also allows you to do cache simulations
+and annotate your source line-by-line with the number of cache misses.  In
+particular, it records:
+<ul>
+  <li>L1 instruction cache reads and misses;
+  <li>L1 data cache reads and read misses, writes and write misses;
+  <li>L2 unified cache reads and read misses, writes and writes misses.
+</ul>
+On a modern x86 machine, an L1 miss will typically cost around 10 cycles,
+and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be
+very useful for improving the performance of your program.
+
+Please note that this is an experimental feature.  Any feedback, bug-fixes,
+suggestions, etc, welcome.
+
+
+<h3>7.1&nbsp; Overview</h3>
+First off, as for normal Valgrind use, you probably want to turn on debugging
+info (the <code>-g</code> flag).  But by contrast with normal Valgrind use, you
+probably <b>do</b> want to turn optimisation on, since you should profile your
+program as it will be normally run.
+
+The three steps are:
+<ol>
+  <li>Generate a cache simulator for your machine's cache configuration with
+      `vg_cachegen' and recompile Valgrind with <code>make install</code>.
+      Valgrind comes with a default simulator, but it is unlikely to be correct
+      for your system, so you should generate a simulator yourself.</li>
+  <li>Run your program with <code>valgrind --cachesim=yes</code> in front of 
+      the normal command line invocation.  When the program finishes, Valgrind
+      will print summary cache statistics. It also collects line-by-line
+      information in a file <code>cachegrind.out</code>.</li>
+  <li>Generate a function-by-function summary, and possibly annotate source
+      files with 'vg_annotate'. Source files to annotate can be specified
+      manually, or manually on the command line, or "interesting" source files
+      can be annotated automatically with the <code>--auto=yes</code> option.
+      You can annotate C/C++ files or assembly language files equally
+      easily.</li>
+</ol>
+
+<a href="#generate">Step 1</a> only needs to be done once, unless you are
+interested in simulating different cache configurations (eg. first
+concentrating on instruction cache misses, then on data cache misses).<p>
+
+<a href="#profile">Step 2</a> should be done every time you want to collect
+information about a new program, a changed program, or about the same program
+with different input.<p>
+
+<a href="#annotate">Step 3</a> can be performed as many times as you like for
+each Step 2; you may want to do multiple annotations showing different
+information each time.<p>
+
+The steps are described in detail in the following sections.<p>
+
+
+<a name="generate"></a>
+<h3>7.3&nbsp; Generating a cache simulator</h3>
+Although Valgrind comes with a pre-generated cache simulator, it most likely
+won't match the cache configuration of your machine, so you should generate
+a new simulator.<p>
+
+You need to generate three files, one for each of the I1, D1 and L2 caches.
+For each cache, you need to know the:
+<ul>
+  <li>Cache size (bytes);
+  <li>Line size (bytes);
+  <li>Associativity.
+</ul>
+
+vg_cachegen takes three options:
+<ul>
+  <li><code>--I1=size,line_size,associativity</code>
+  <li><code>--D1=size,line_size,associativity</code>
+  <li><code>--L2=size,line_size,associativity</code>
+</ul>
+
+You can specify one, two or all three caches per invocation of vg_cachegen.  It
+checks that the configuration is sensible before generating the simulators;  to
+see the allowed values, run <code>vg_cachegen -h</code>.<p>
+
+An example invocation would be:
+
+<blockquote><code>
+  vg_cachegen --I1=65536,64,2 --D1=65536,64,2 --L2=262144,64,8
+</code></blockquote>
+
+This simulates a machine with a 128KB split L1 2-way associative cache, and a
+256KB unified 8-way associative L2 cache.  Both caches have 64B lines.<p>
+
+If you don't know your cache configuration, you'll have to find it out.
+(Ideally vg_cachegen could auto-identify your cache configuration using the
+CPUID instruction, which could be done automatically during installation, and
+this whole step could be skipped...)<p>
+
+
+<h3>7.4&nbsp; Cache simulation specifics</h3>
+vg_cachegen only generates simulations for a machine with a split L1 cache and
+a unified L2 cache.  This configuration is used for all x86-based machines we
+are aware of.<p>
+
+The more specific characteristics of the simulation are as follows.
+
+<ul>
+  <li>Write-allocate: when a write miss occurs, the block written to is brought
+      into the D1 cache.  Most modern caches have this property.</li><p>
+
+  <li>Bit-selection hash function:  the line(s) in the cache to which a memory
+      block maps is chosen by the middle bits M--(M+N-1) of the byte address,
+      where:
+      <ul>
+        <li>&nbsp;line size = 2^M bytes&nbsp;</li>
+        <li>(cache size / line size) = 2^N bytes</li>
+      </ul> </li><p>
+
+  <li>Inclusive L2 cache:  the L2 cache replicates all the entries of the L1
+      cache.  This is standard on Pentium chips, but AMD Athlons use an
+      exclusive L2 cache that only holds blocks evicted from L1.</li><p>
+</ul>
+
+Other noteworthy behaviour:
+
+<ul>
+  <li>References that straddle two cache lines are treated as follows:</li>
+  <ul>
+    <li>If both blocks hit --&gt; counted as one hit</li>
+    <li>If one block hits, the other misses --&gt; counted as one miss</li>
+    <li>If both blocks miss --&gt; counted as one miss (not two)</li>
+  </ul><p>
+
+  <li>Instructions that modify a memory location (eg. <code>inc</code> and
+      <code>dec</code>) are counted as doing just a read, ie. a single data
+      reference.  This may seem strange, but since the write can never cause a
+      miss (the read guarantees the block is in the cache) it's not very
+      interesting.<p>
+
+      Thus it measures not the number of times the data cache is accessed, but
+      the number of times a data cache miss could occur.<p>
+      </li>
+</ul>
+
+If you are interested in simulating a cache with different properties, it is
+not particularly hard to write your own cache simulator, or to modify existing
+ones in <code>vg_cachesim_I1.c</code>, <code>vg_cachesim_I1.c</code> and
+<code>vg_cachesim_I1.c</code>.  We'd be interested to hear from anyone who
+does.
+
+
+<a name="profile"></a>
+<h3>7.5&nbsp; Profiling programs</h3>
+Cache profiling is enabled by using the <code>--cachesim=yes</code> option to
+Valgrind.  This automatically turns off Valgrind's memory checking functions,
+since the cache simulation is slow enough already, and you probably don't want
+to do both at once.<p>
+
+To gather cache profiling information about the program <code>ls -l<code, type:
+
+<blockquote><code>valgrind --cachesim=yes ls -l</code></blockquote>
+
+The program will execute (slowly).  Upon completion, summary statistics
+that look like this will be printed:
+
+<pre>
+==31751== I   refs:      27,742,716
+==31751== I1  misses:           276
+==31751== L2  misses:           275
+==31751== I1  miss rate:        0.0%
+==31751== L2i miss rate:        0.0%
+==31751== 
+==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
+==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
+==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
+==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
+==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
+==31751== 
+==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
+==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
+</pre>
+
+Cache accesses for instruction fetches are summarised first, giving the
+number of fetches made (this is the number of instructions executed, which
+can be useful to know in its own right), the number of I1 misses, and the
+number of L2 instruction (<code>L2i</code>) misses.<p>
+
+Cache accesses for data follow. The information is similar to that of the
+instruction fetches, except that the values are also shown split between reads
+and writes (note each row's <code>rd</code> and <code>wr</code> values add up
+to the row's total).<p>
+
+Combined instruction and data figures for the L2 cache follow that.<p>
+
+
+<h3>7.6&nbsp; Output file</h3>
+As well as printing summary information, Valgrind also writes line-by-line
+cache profiling information to a file named <code>cachegrind.out</code> .  This
+file is human-readable, but is best interpreted by the accompanying program
+vg_annotate, described in the next section.<p>
+
+Things to note about the <code>cachegrind.out</code> file:
+<ul>
+  <li>It is written every time <code>valgrind --cachesim=yes</code> is run; it
+      will automatically overwrite any existing <code>cachegrind.out<code/> in
+      the current directory.</li>
+  <li>It can be quite large: <code>ls -l</code> generates a file of about
+      350KB; browsing a few files and web pages with Konqueror generates a file
+      of around 10MB.</li>
+</ul>
+
+
+<a name="annotate"></a>
+<h3>7.7&nbsp; Annotating C/C++ programs</h3>
+Before using vg_annotate, it is worth widening your window to be at least
+120-characters wide if possible, as the output lines can be quite long.<p>
+
+To get a function-by-function summary, run <code>vg_annotate</code> in
+directory containing a <code>cachegrind.out</code> file.  The output looks like
+this:
+
+<pre>
+--------------------------------------------------------------------------------
+I1 cache:              65536 B, 64 B, 2-way associative
+D1 cache:              65536 B, 64 B, 2-way associative
+L2 cache:              262144 B, 64 B, 8-way associative
+Command:               concord vg_to_ucode.c
+Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Threshold:             99%
+Chosen for annotation:
+Auto-annotation:       on
+
+--------------------------------------------------------------------------------
+Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
+--------------------------------------------------------------------------------
+27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
+
+--------------------------------------------------------------------------------
+Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
+--------------------------------------------------------------------------------
+8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
+5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
+2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
+2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
+2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
+1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
+  897,991   51   51   897,831    95    30        62      1      1  ???:???
+  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
+  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
+  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
+  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
+  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
+  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
+  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
+  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
+  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
+   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
+   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
+</pre>
+
+First up is a summary of the annotation options:
+                    
+<ul>
+  <li>I1 cache, D1 cache, L2 cache: cache configuration.  So you know the
+      configuration with which these results were obtained.</li><p>
+
+  <li>Command: the command line invocation of the program under
+      examination.</li><p>
+
+  <li>Events recorded: event abbreviations are:<p>
+  <ul>
+    <li><code>Ir  </code>:  I cache reads (ie. instructions executed)</li>
+    <li><code>I1mr</code>: I1 cache read misses</li>
+    <li><code>I2mr</code>: L2 cache instruction read misses</li>
+    <li><code>Dr  </code>:  D cache reads (ie. memory reads)</li>
+    <li><code>D1mr</code>: D1 cache read misses</li>
+    <li><code>D2mr</code>: L2 cache data read misses</li>
+    <li><code>Dw  </code>:  D cache writes (ie. memory writes)</li>
+    <li><code>D1mw</code>: D1 cache write misses</li>
+    <li><code>D2mw</code>: L2 cache data write misses</li>
+  </ul><p>
+      Note that D1 total accesses is given by <code>D1mr</code> +
+      <code>D1mw</code>, and that L2 total accesses is given by
+      <code>I2mr</code> + <code>D2mr</code> + <code>D2mw</code>.</li><p>
+
+  <li>Events shown: the events shown (a subset of events gathered).  This can
+      be adjusted with the <code>--show</code> option.</li><p>
+
+  <li>Event sort order: the sort order in which functions are shown.  For
+      example, in this case the functions are sorted from highest
+      <code>Ir</code> counts to lowest.  If two functions have identical
+      <code>Ir</code> counts, they will then be sorted by <code>I1mr</code>
+      counts, and so on.  This order can be adjusted with the
+      <code>--sort</code> option.<p>
+
+      Note that this dictates the order the functions appear.  It is <b>not</b>
+      the order in which the columns appear;  that is dictated by the "events
+      shown" line (and can be changed with the <code>--sort</code> option).
+      </li><p>
+
+  <li>Threshold: vg_annotate by default omits functions that cause very low
+      numbers of misses to avoid drowing you in information.  In this case,
+      vg_annotate shows summaries the functions that account for 99%   of the
+      <code>Ir</code> counts; <code>Ir</code> is chosen as the treshold event
+      since it is  the primary sort event.  The threshold can be adjusted with
+      the <code>--threshold</code> option.</li><p>
+
+  <li>Chosen for annotation: names of files specified manually for annotation; 
+      in this case none.</li><p>
+
+  <li>Auto-annotation: whether auto-annotation was requested via the 
+      <code>--auto=yes</code> option. In this case no.</li><p>
+</ul>
+
+Then follows summary statistics for the whole program. These are similar
+to the summary provided when running <code>valgrind --cachesim=yes</code>.<p>
+  
+Then follows function-by-function statistics. Each function is identified by a
+<code>file_name:function_name</code> pair. If a column contains only a
+`.' it means  the function never performs that event (eg. the third row shows
+that <code>strcmp()</code> contains no instructions that write to memory). The
+name <code>???</code> is used if the the file name and/or function name could
+not be determined from debugging information. (If most of the entries have the
+form <code>???:???</code> the program probably wasn't compiled with
+<code>-g</code>.)<p> 
+
+It is worth noting that functions will come from three types of source files:
+<ol>
+  <li> From the profiled program (<code>concord.c</code> in this example).</li>
+  <li>From libraries (eg. <code>getc.c</code>)</li>
+  <li>From Valgrind's implementation of some libc functions (eg.
+      <code>vg_clientmalloc.c:malloc</code>).  These are recognisable because
+      the filename begins with <code>vg_</code>, and is probably one of
+      <code>vg_main.c</code>, <code>vg_clientmalloc.c</code> or
+      <code>vg_mylibc.c</code>.
+  </li>
+</ol>
+
+There are two ways to annotate source files -- by choosing them manually, or
+with the <code>--auto=yes</code> option. To do it manually, just
+specify the filenames as arguments to vg_annotate. For example, the output from
+running <code>vg_annotate concord.c</code> for our example produces the same
+output as above followed by an annotated version of <code>concord.c</code>, a
+section of which looks like:
+
+<pre>
+--------------------------------------------------------------------------------
+-- User-annotated source: concord.c
+--------------------------------------------------------------------------------
+Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
+
+[snip]
+
+        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
+        3    1    1       .     .     .       1      0      0  {
+        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
+        .    .    .       .     .     .       .      .      .      Word_Info *data;
+        1    0    0       .     .     .       1      1      1      int line = 1, i;
+        .    .    .       .     .     .       .      .      .
+        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
+        .    .    .       .     .     .       .      .      .
+    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
+    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
+        .    .    .       .     .     .       .      .      .
+        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
+        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
+        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
+        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
+        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
+        .    .    .       .     .     .       .      .      .      }
+        .    .    .       .     .     .       .      .      .
+  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
+  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->;word, data->line, table);
+        .    .    .       .     .     .       .      .      .
+        4    0    0       1     0     0       2      0      0      free(data);
+        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
+        3    0    0       2     0     0       .      .      .  }
+</pre>
+
+(Although column widths are automatically minimised, a wide terminal is clearly
+useful.)<p>
+  
+Each source file is clearly marked (<code>User-annotated source</code>) as
+having been chosen manually for annotation.  If the file was found in one of
+the directories specified with the <code>-I</code>/<code>--include</code>
+option, the directory and file are both given.<p>
+
+Each line is annotated with its event counts.  Events not applicable for a line
+are represented by a `.';  this is useful for distinguishing between an event
+which cannot happen, and one which can but did not.<p> 
+
+Sometimes only a small section of a source file is executed.  To minimise
+uninteresting output, Valgrind only shows annotated lines and lines within a
+small distance of annotated lines.  Gaps are marked with the line numbers so
+you know which part of a file the shown code comes from, eg:
+
+<pre>
+(figures and code for line 704)
+-- line 704 ----------------------------------------
+-- line 878 ----------------------------------------
+(figures and code for line 878)
+</pre>
+
+The amount of context to show around annotated lines is controlled by the
+<code>--context</code> option.<p>
+
+To get automatic annotation, run <code>vg_annotate --auto=yes</code>.
+vg_annotate will automatically annotate every source file it can find that is
+mentioned in the function-by-function summary.  Therefore, the files chosen for
+auto-annotation  are affected by the <code>--sort</code> and
+<code>--threshold</code> options.  Each source file is clearly marked
+(<code>Auto-annotated source</code>) as being chosen automatically.  Any files
+that could not be found are mentioned at the end of the output, eg:    
+
+<pre>
+--------------------------------------------------------------------------------
+The following files chosen for auto-annotation could not be found:
+--------------------------------------------------------------------------------
+  getc.c
+  ctype.c
+  ../sysdeps/generic/lockfile.c
+</pre>
+
+This is quite common for library files, since libraries are usually compiled
+with debugging information, but the source files are often not present on a
+system.  If a file is chosen for annotation <b>both</b> manually and
+automatically, it is marked as <code>User-annotated source</code>.
+
+Use the <code>-I/--include</code> option to tell Valgrind where to look for
+source files if the filenames found from the debugging information aren't
+specific enough.
+
+Beware that vg_annotate can take some time to digest large
+<code>cachegrind.out</code> files, eg. 30 seconds or more.  Also beware that
+auto-annotation can produce a lot of output if your program is large!
+
+
+<h3>7.8&nbsp; Annotating assembler programs</h3>
+Valgrind can annotate assembler programs too, or annotate the assembler
+generated for your C program.  Sometimes this is useful for understanding what
+is really happening when an interesting line of C code is translated into
+multiple instructions.<p>
+
+To do this, you just need to assemble your <code>.s</code> files with
+assembler-level debug information.  gcc doesn't do this, but you can use GNU as
+with the <code>--gstabs</code> option to generate object files with this
+information, eg:
+
+<blockquote><code>as --gstabs foo.s</code></blockquote>
+
+You can then profile and annotate source files in the same way as for C/C++
+programs.
+
+
+<h3>7.9&nbsp; vg_annotate options</h3>
+<ul>
+  <li><code>-h, --help</code></li><p>
+  <li><code>-v, --version</code><p>
+
+      Help and version, as usual.</li>
+
+  <li><code>--sort=A,B,C</code> [default: order in 
+      <code>cachegrind.out</code>]<p>
+      Specifies the events upon which the sorting of the function-by-function
+      entries will be based.  Useful if you want to concentrate on eg. I cache
+      misses (<code>--sort=I1mr,I2mr</code>), or D cache misses
+      (<code>--sort=D1mr,D2mr</code>), or L2 misses
+      (<code>--sort=D2mr,I2mr</code>).</li><p>
+
+  <li><code>--show=A,B,C</code> [default: all, using order in
+      <code>cachegrind.out</code>]<p>
+      Specifies which events to show (and the column order). Default is to use
+      all present in the <code>cachegrind.out</code> file (and use the order in
+      the file).</li><p>
+
+  <li><code>--threshold=X</code> [default: 99%] <p>
+      Sets the threshold for the function-by-function summary.  Functions are
+      shown that account for more than X% of all the primary sort events.  If
+      auto-annotating, also affects which files are annotated.</li><p>
+
+  <li><code>--auto=no</code> [default]<br>
+      <code>--auto=yes</code> <p>
+      When enabled, automatically annotates every file that is mentioned in the
+      function-by-function summary that can be found.  Also gives a list of
+      those that couldn't be found.
+
+  <li><code>--context=N</code> [default: 8]<p>
+      Print N lines of context before and after each annotated line.  Avoids
+      printing large sections of source files that were not executed.  Use a 
+      large number (eg. 10,000) to show all source lines.
+      </li><p>
+
+  <li><code>-I=&lt;dir&gt;, --include=&lt;dir&gt;</code> 
+      [default: empty string]<p>
+      Adds a directory to the list in which to search for files.  Multiple
+      -I/--include options can be given to add multiple directories.
+</ul>
+  
+
+<h3>7.10&nbsp; Warnings</h3>
+There are a couple of situations in which vg_annotate issues warnings.
+
+<ul>
+  <li>If a source file is more recent than the <code>cachegrind.out</code>
+      file.  This is because the information in <code>cachegrind.out</code> is
+      only recorded with line numbers, so if the line numbers change at all in
+      the source (eg. lines added, deleted, swapped), any annotations will be 
+      incorrect.<p>
+
+  <li>If information is recorded about line numbers past the end of a file.
+      This can be caused by the above problem, ie. shortening the source file
+      while using an old <code>cachegrind.out</code> file.  If this happens,
+      the figures for the bogus lines are printed anyway (clearly marked as
+      bogus) in case they are important.</li><p>
+</ul>
+
+
+<h3>7.10&nbsp; Things to watch out for</h3>
+Some odd things that can occur during annotation:
+
+<ul>
+  <li>If annotating at the assembler level, you might see something like this:
+
+      <pre>
+      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
+      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
+      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
+      .    .    .  .    .    .  .    .    .          .align 4,0x90
+      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
+      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
+      </pre>
+
+      How can the third instruction be executed twice when the others are
+      executed only once?  As it turns out, it isn't.  Here's a dump of the
+      executable, from objdump:
+
+      <pre>
+      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
+      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
+      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
+      8048f32:       89 f6                   mov    %esi,%esi
+      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
+      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
+      </pre>
+
+      Notice the extra <code>mov %esi,%esi</code> instruction.  Where did this
+      come from?  The GNU assembler inserted it to serve as the two bytes of
+      padding needed to align the <code>movl $.LnrB,%eax</code> instruction on
+      a four-byte boundary, but pretended it didn't exist when adding debug
+      information.  Thus when Valgrind reads the debug info it thinks that the
+      <code>movl $0x1,0xffffffec(%ebp)</code> instruction covers the address
+      range 0x8048f2b--0x804833 by itself, and attributes the counts for the
+      <code>mov %esi,%esi</code> to it.<p>
+  </li>
+
+  <li>
+      Inlined functions can cause strange results in the function-by-function
+      summary.  If a function <code>inline_me()</code> is defined in
+      <code>foo.h</code> and inlined in the functions <code>f1()</code>,
+      <code>f2()</code> and <code>f3()</code> in <code>bar.c</code>, there will
+      not be a <code>foo.h:inline_me()</code> function entry.  Instead, there
+      will be separate function entries for each inlining site, ie.
+      <code>foo.h:f1()</code>, <code>foo.h:f2()</code> and
+      <code>foo.h:f3()</code>.  To find the total counts for
+      <code>foo.h:inline_me()</code>, add up the counts from each entry.<p>
+
+      The reason for this is that although the debug info output by gcc
+      indicates the switch from <code>bar.c</code> to <code>foo.h</code>, it
+      doesn't indicate the name of the function in <code>foo.h</code>, so
+      Valgrind keeps using the old one.<p>
+
+  <li>
+      Sometimes, the same filename might be represented with a relative name
+      and with an absolute name in different parts of the debug info, eg:
+      <code>/home/user/proj/proj.h</code> and <code>../proj.h</code>.  In this
+      case, if you use auto-annotation, the file will be annotated twice with
+      the counts split between the two.<p>
+  </li>
+</ul>
+
+Note: stabs is not an easy format to read.  If you come across bizarre
+annotations that look like might be caused by a bug in the stabs reader,
+please let us know.
+
+
+<h3>7.11&nbsp; Accuracy</h3>
+Valgrind's cache profiling has a number of shortcomings:
+
+<ul>
+  <li>It doesn't account for kernel activity -- the effect of system calls on
+      the cache contents is ignored.</li><p>
+
+  <li>It doesn't account for other process activity (although this is probably
+      desirable when considering a single program).</li><p>
+
+  <li>It doesn't account for virtual-to-physical address mappings;  hence the
+      entire simulation is not a true representation of what's happening in the
+      cache.</li><p>
+
+  <li>It doesn't account for cache misses not visible at the instruction level,
+      eg. those arising from TLB misses, or speculative execution.</li><p>
+</ul>
+
+Another thing worth nothing is that results are very sensitive.  Changing the
+size of the <code>valgrind.so</code> file, the size of the program being
+profiled, or even the length of its name can perturb the results.  Variations
+will be small, but don't expect perfectly repeatable results if your program
+changes at all.<p>
+
+While these factors mean you shouldn't trust the results to be super-accurate,
+hopefully they should be close enough to be useful.<p>
+
+
+<h3>7.12&nbsp; Todo</h3>
+<ul>
+  <li>Use CPUID instruction to auto-identify cache configuration during 
+      installation.  This would save the user from having to know their cache
+      configuration and using vg_cachegen.</li><p>
+  <li>Program start-up/shut-down calls a lot of functions that aren't
+      interesting and just complicate the output.  Would be nice to exclude
+      these somehow.</li><p>
+</ul> 
+<hr width="100%">
 </body>
 </html>
+
diff --git a/helgrind/Makefile.am b/helgrind/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/helgrind/Makefile.am
+++ b/helgrind/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/lackey/Makefile.am b/lackey/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/lackey/Makefile.am
+++ b/lackey/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/memcheck/Makefile.am b/memcheck/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/memcheck/Makefile.am
+++ b/memcheck/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/memcheck/docs/manual.html b/memcheck/docs/manual.html
index c441db1620..a97c2f9fec 100644
--- a/memcheck/docs/manual.html
+++ b/memcheck/docs/manual.html
@@ -78,7 +78,9 @@ Linux-x86 executables.
 
 <h4>6&nbsp; <a href="#example">An example</a></h4>
 
-<h4>7&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
+<h4>7&nbsp; <a href="#cache">Cache profiling</a></h4>
+
+<h4>8&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
 
 <hr width="100%">
 
@@ -515,6 +517,11 @@ follows:
       buggy, so you may need to issue this flag if you use 3.0.4.
       </li><br><p>
 
+  <li><code>--cachesim=no</code> [default]<br>
+      <code>--cachesim=yes</code>
+      <p>When enabled, turns off memory checking, and turns on cache profiling.
+      Cache profiling is described in detail in <a href="#cache">Section 7</a>.
+      </li><p>
 </ul>
 
 There are also some options for debugging Valgrind itself.  You
@@ -1763,5 +1770,632 @@ sewardj@phoenix:~/newmat10$
 <p>The GCC folks fixed this about a week before gcc-3.0 shipped.
 <hr width="100%">
 <p>
+
+
+
+<a name="cache"></a>
+<h2>7&nbsp; Cache profiling</h2>
+As well as memory debugging, Valgrind also allows you to do cache simulations
+and annotate your source line-by-line with the number of cache misses.  In
+particular, it records:
+<ul>
+  <li>L1 instruction cache reads and misses;
+  <li>L1 data cache reads and read misses, writes and write misses;
+  <li>L2 unified cache reads and read misses, writes and writes misses.
+</ul>
+On a modern x86 machine, an L1 miss will typically cost around 10 cycles,
+and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be
+very useful for improving the performance of your program.
+
+Please note that this is an experimental feature.  Any feedback, bug-fixes,
+suggestions, etc, welcome.
+
+
+<h3>7.1&nbsp; Overview</h3>
+First off, as for normal Valgrind use, you probably want to turn on debugging
+info (the <code>-g</code> flag).  But by contrast with normal Valgrind use, you
+probably <b>do</b> want to turn optimisation on, since you should profile your
+program as it will be normally run.
+
+The three steps are:
+<ol>
+  <li>Generate a cache simulator for your machine's cache configuration with
+      `vg_cachegen' and recompile Valgrind with <code>make install</code>.
+      Valgrind comes with a default simulator, but it is unlikely to be correct
+      for your system, so you should generate a simulator yourself.</li>
+  <li>Run your program with <code>valgrind --cachesim=yes</code> in front of 
+      the normal command line invocation.  When the program finishes, Valgrind
+      will print summary cache statistics. It also collects line-by-line
+      information in a file <code>cachegrind.out</code>.</li>
+  <li>Generate a function-by-function summary, and possibly annotate source
+      files with 'vg_annotate'. Source files to annotate can be specified
+      manually, or manually on the command line, or "interesting" source files
+      can be annotated automatically with the <code>--auto=yes</code> option.
+      You can annotate C/C++ files or assembly language files equally
+      easily.</li>
+</ol>
+
+<a href="#generate">Step 1</a> only needs to be done once, unless you are
+interested in simulating different cache configurations (eg. first
+concentrating on instruction cache misses, then on data cache misses).<p>
+
+<a href="#profile">Step 2</a> should be done every time you want to collect
+information about a new program, a changed program, or about the same program
+with different input.<p>
+
+<a href="#annotate">Step 3</a> can be performed as many times as you like for
+each Step 2; you may want to do multiple annotations showing different
+information each time.<p>
+
+The steps are described in detail in the following sections.<p>
+
+
+<a name="generate"></a>
+<h3>7.3&nbsp; Generating a cache simulator</h3>
+Although Valgrind comes with a pre-generated cache simulator, it most likely
+won't match the cache configuration of your machine, so you should generate
+a new simulator.<p>
+
+You need to generate three files, one for each of the I1, D1 and L2 caches.
+For each cache, you need to know the:
+<ul>
+  <li>Cache size (bytes);
+  <li>Line size (bytes);
+  <li>Associativity.
+</ul>
+
+vg_cachegen takes three options:
+<ul>
+  <li><code>--I1=size,line_size,associativity</code>
+  <li><code>--D1=size,line_size,associativity</code>
+  <li><code>--L2=size,line_size,associativity</code>
+</ul>
+
+You can specify one, two or all three caches per invocation of vg_cachegen.  It
+checks that the configuration is sensible before generating the simulators;  to
+see the allowed values, run <code>vg_cachegen -h</code>.<p>
+
+An example invocation would be:
+
+<blockquote><code>
+  vg_cachegen --I1=65536,64,2 --D1=65536,64,2 --L2=262144,64,8
+</code></blockquote>
+
+This simulates a machine with a 128KB split L1 2-way associative cache, and a
+256KB unified 8-way associative L2 cache.  Both caches have 64B lines.<p>
+
+If you don't know your cache configuration, you'll have to find it out.
+(Ideally vg_cachegen could auto-identify your cache configuration using the
+CPUID instruction, which could be done automatically during installation, and
+this whole step could be skipped...)<p>
+
+
+<h3>7.4&nbsp; Cache simulation specifics</h3>
+vg_cachegen only generates simulations for a machine with a split L1 cache and
+a unified L2 cache.  This configuration is used for all x86-based machines we
+are aware of.<p>
+
+The more specific characteristics of the simulation are as follows.
+
+<ul>
+  <li>Write-allocate: when a write miss occurs, the block written to is brought
+      into the D1 cache.  Most modern caches have this property.</li><p>
+
+  <li>Bit-selection hash function:  the line(s) in the cache to which a memory
+      block maps is chosen by the middle bits M--(M+N-1) of the byte address,
+      where:
+      <ul>
+        <li>&nbsp;line size = 2^M bytes&nbsp;</li>
+        <li>(cache size / line size) = 2^N bytes</li>
+      </ul> </li><p>
+
+  <li>Inclusive L2 cache:  the L2 cache replicates all the entries of the L1
+      cache.  This is standard on Pentium chips, but AMD Athlons use an
+      exclusive L2 cache that only holds blocks evicted from L1.</li><p>
+</ul>
+
+Other noteworthy behaviour:
+
+<ul>
+  <li>References that straddle two cache lines are treated as follows:</li>
+  <ul>
+    <li>If both blocks hit --&gt; counted as one hit</li>
+    <li>If one block hits, the other misses --&gt; counted as one miss</li>
+    <li>If both blocks miss --&gt; counted as one miss (not two)</li>
+  </ul><p>
+
+  <li>Instructions that modify a memory location (eg. <code>inc</code> and
+      <code>dec</code>) are counted as doing just a read, ie. a single data
+      reference.  This may seem strange, but since the write can never cause a
+      miss (the read guarantees the block is in the cache) it's not very
+      interesting.<p>
+
+      Thus it measures not the number of times the data cache is accessed, but
+      the number of times a data cache miss could occur.<p>
+      </li>
+</ul>
+
+If you are interested in simulating a cache with different properties, it is
+not particularly hard to write your own cache simulator, or to modify existing
+ones in <code>vg_cachesim_I1.c</code>, <code>vg_cachesim_I1.c</code> and
+<code>vg_cachesim_I1.c</code>.  We'd be interested to hear from anyone who
+does.
+
+
+<a name="profile"></a>
+<h3>7.5&nbsp; Profiling programs</h3>
+Cache profiling is enabled by using the <code>--cachesim=yes</code> option to
+Valgrind.  This automatically turns off Valgrind's memory checking functions,
+since the cache simulation is slow enough already, and you probably don't want
+to do both at once.<p>
+
+To gather cache profiling information about the program <code>ls -l<code, type:
+
+<blockquote><code>valgrind --cachesim=yes ls -l</code></blockquote>
+
+The program will execute (slowly).  Upon completion, summary statistics
+that look like this will be printed:
+
+<pre>
+==31751== I   refs:      27,742,716
+==31751== I1  misses:           276
+==31751== L2  misses:           275
+==31751== I1  miss rate:        0.0%
+==31751== L2i miss rate:        0.0%
+==31751== 
+==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
+==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
+==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
+==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
+==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
+==31751== 
+==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
+==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
+</pre>
+
+Cache accesses for instruction fetches are summarised first, giving the
+number of fetches made (this is the number of instructions executed, which
+can be useful to know in its own right), the number of I1 misses, and the
+number of L2 instruction (<code>L2i</code>) misses.<p>
+
+Cache accesses for data follow. The information is similar to that of the
+instruction fetches, except that the values are also shown split between reads
+and writes (note each row's <code>rd</code> and <code>wr</code> values add up
+to the row's total).<p>
+
+Combined instruction and data figures for the L2 cache follow that.<p>
+
+
+<h3>7.6&nbsp; Output file</h3>
+As well as printing summary information, Valgrind also writes line-by-line
+cache profiling information to a file named <code>cachegrind.out</code> .  This
+file is human-readable, but is best interpreted by the accompanying program
+vg_annotate, described in the next section.<p>
+
+Things to note about the <code>cachegrind.out</code> file:
+<ul>
+  <li>It is written every time <code>valgrind --cachesim=yes</code> is run; it
+      will automatically overwrite any existing <code>cachegrind.out<code/> in
+      the current directory.</li>
+  <li>It can be quite large: <code>ls -l</code> generates a file of about
+      350KB; browsing a few files and web pages with Konqueror generates a file
+      of around 10MB.</li>
+</ul>
+
+
+<a name="annotate"></a>
+<h3>7.7&nbsp; Annotating C/C++ programs</h3>
+Before using vg_annotate, it is worth widening your window to be at least
+120-characters wide if possible, as the output lines can be quite long.<p>
+
+To get a function-by-function summary, run <code>vg_annotate</code> in
+directory containing a <code>cachegrind.out</code> file.  The output looks like
+this:
+
+<pre>
+--------------------------------------------------------------------------------
+I1 cache:              65536 B, 64 B, 2-way associative
+D1 cache:              65536 B, 64 B, 2-way associative
+L2 cache:              262144 B, 64 B, 8-way associative
+Command:               concord vg_to_ucode.c
+Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Threshold:             99%
+Chosen for annotation:
+Auto-annotation:       on
+
+--------------------------------------------------------------------------------
+Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
+--------------------------------------------------------------------------------
+27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
+
+--------------------------------------------------------------------------------
+Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
+--------------------------------------------------------------------------------
+8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
+5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
+2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
+2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
+2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
+1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
+  897,991   51   51   897,831    95    30        62      1      1  ???:???
+  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
+  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
+  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
+  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
+  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
+  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
+  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
+  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
+  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
+   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
+   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
+</pre>
+
+First up is a summary of the annotation options:
+                    
+<ul>
+  <li>I1 cache, D1 cache, L2 cache: cache configuration.  So you know the
+      configuration with which these results were obtained.</li><p>
+
+  <li>Command: the command line invocation of the program under
+      examination.</li><p>
+
+  <li>Events recorded: event abbreviations are:<p>
+  <ul>
+    <li><code>Ir  </code>:  I cache reads (ie. instructions executed)</li>
+    <li><code>I1mr</code>: I1 cache read misses</li>
+    <li><code>I2mr</code>: L2 cache instruction read misses</li>
+    <li><code>Dr  </code>:  D cache reads (ie. memory reads)</li>
+    <li><code>D1mr</code>: D1 cache read misses</li>
+    <li><code>D2mr</code>: L2 cache data read misses</li>
+    <li><code>Dw  </code>:  D cache writes (ie. memory writes)</li>
+    <li><code>D1mw</code>: D1 cache write misses</li>
+    <li><code>D2mw</code>: L2 cache data write misses</li>
+  </ul><p>
+      Note that D1 total accesses is given by <code>D1mr</code> +
+      <code>D1mw</code>, and that L2 total accesses is given by
+      <code>I2mr</code> + <code>D2mr</code> + <code>D2mw</code>.</li><p>
+
+  <li>Events shown: the events shown (a subset of events gathered).  This can
+      be adjusted with the <code>--show</code> option.</li><p>
+
+  <li>Event sort order: the sort order in which functions are shown.  For
+      example, in this case the functions are sorted from highest
+      <code>Ir</code> counts to lowest.  If two functions have identical
+      <code>Ir</code> counts, they will then be sorted by <code>I1mr</code>
+      counts, and so on.  This order can be adjusted with the
+      <code>--sort</code> option.<p>
+
+      Note that this dictates the order the functions appear.  It is <b>not</b>
+      the order in which the columns appear;  that is dictated by the "events
+      shown" line (and can be changed with the <code>--sort</code> option).
+      </li><p>
+
+  <li>Threshold: vg_annotate by default omits functions that cause very low
+      numbers of misses to avoid drowing you in information.  In this case,
+      vg_annotate shows summaries the functions that account for 99%   of the
+      <code>Ir</code> counts; <code>Ir</code> is chosen as the treshold event
+      since it is  the primary sort event.  The threshold can be adjusted with
+      the <code>--threshold</code> option.</li><p>
+
+  <li>Chosen for annotation: names of files specified manually for annotation; 
+      in this case none.</li><p>
+
+  <li>Auto-annotation: whether auto-annotation was requested via the 
+      <code>--auto=yes</code> option. In this case no.</li><p>
+</ul>
+
+Then follows summary statistics for the whole program. These are similar
+to the summary provided when running <code>valgrind --cachesim=yes</code>.<p>
+  
+Then follows function-by-function statistics. Each function is identified by a
+<code>file_name:function_name</code> pair. If a column contains only a
+`.' it means  the function never performs that event (eg. the third row shows
+that <code>strcmp()</code> contains no instructions that write to memory). The
+name <code>???</code> is used if the the file name and/or function name could
+not be determined from debugging information. (If most of the entries have the
+form <code>???:???</code> the program probably wasn't compiled with
+<code>-g</code>.)<p> 
+
+It is worth noting that functions will come from three types of source files:
+<ol>
+  <li> From the profiled program (<code>concord.c</code> in this example).</li>
+  <li>From libraries (eg. <code>getc.c</code>)</li>
+  <li>From Valgrind's implementation of some libc functions (eg.
+      <code>vg_clientmalloc.c:malloc</code>).  These are recognisable because
+      the filename begins with <code>vg_</code>, and is probably one of
+      <code>vg_main.c</code>, <code>vg_clientmalloc.c</code> or
+      <code>vg_mylibc.c</code>.
+  </li>
+</ol>
+
+There are two ways to annotate source files -- by choosing them manually, or
+with the <code>--auto=yes</code> option. To do it manually, just
+specify the filenames as arguments to vg_annotate. For example, the output from
+running <code>vg_annotate concord.c</code> for our example produces the same
+output as above followed by an annotated version of <code>concord.c</code>, a
+section of which looks like:
+
+<pre>
+--------------------------------------------------------------------------------
+-- User-annotated source: concord.c
+--------------------------------------------------------------------------------
+Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
+
+[snip]
+
+        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
+        3    1    1       .     .     .       1      0      0  {
+        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
+        .    .    .       .     .     .       .      .      .      Word_Info *data;
+        1    0    0       .     .     .       1      1      1      int line = 1, i;
+        .    .    .       .     .     .       .      .      .
+        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
+        .    .    .       .     .     .       .      .      .
+    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
+    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
+        .    .    .       .     .     .       .      .      .
+        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
+        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
+        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
+        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
+        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
+        .    .    .       .     .     .       .      .      .      }
+        .    .    .       .     .     .       .      .      .
+  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
+  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->;word, data->line, table);
+        .    .    .       .     .     .       .      .      .
+        4    0    0       1     0     0       2      0      0      free(data);
+        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
+        3    0    0       2     0     0       .      .      .  }
+</pre>
+
+(Although column widths are automatically minimised, a wide terminal is clearly
+useful.)<p>
+  
+Each source file is clearly marked (<code>User-annotated source</code>) as
+having been chosen manually for annotation.  If the file was found in one of
+the directories specified with the <code>-I</code>/<code>--include</code>
+option, the directory and file are both given.<p>
+
+Each line is annotated with its event counts.  Events not applicable for a line
+are represented by a `.';  this is useful for distinguishing between an event
+which cannot happen, and one which can but did not.<p> 
+
+Sometimes only a small section of a source file is executed.  To minimise
+uninteresting output, Valgrind only shows annotated lines and lines within a
+small distance of annotated lines.  Gaps are marked with the line numbers so
+you know which part of a file the shown code comes from, eg:
+
+<pre>
+(figures and code for line 704)
+-- line 704 ----------------------------------------
+-- line 878 ----------------------------------------
+(figures and code for line 878)
+</pre>
+
+The amount of context to show around annotated lines is controlled by the
+<code>--context</code> option.<p>
+
+To get automatic annotation, run <code>vg_annotate --auto=yes</code>.
+vg_annotate will automatically annotate every source file it can find that is
+mentioned in the function-by-function summary.  Therefore, the files chosen for
+auto-annotation  are affected by the <code>--sort</code> and
+<code>--threshold</code> options.  Each source file is clearly marked
+(<code>Auto-annotated source</code>) as being chosen automatically.  Any files
+that could not be found are mentioned at the end of the output, eg:    
+
+<pre>
+--------------------------------------------------------------------------------
+The following files chosen for auto-annotation could not be found:
+--------------------------------------------------------------------------------
+  getc.c
+  ctype.c
+  ../sysdeps/generic/lockfile.c
+</pre>
+
+This is quite common for library files, since libraries are usually compiled
+with debugging information, but the source files are often not present on a
+system.  If a file is chosen for annotation <b>both</b> manually and
+automatically, it is marked as <code>User-annotated source</code>.
+
+Use the <code>-I/--include</code> option to tell Valgrind where to look for
+source files if the filenames found from the debugging information aren't
+specific enough.
+
+Beware that vg_annotate can take some time to digest large
+<code>cachegrind.out</code> files, eg. 30 seconds or more.  Also beware that
+auto-annotation can produce a lot of output if your program is large!
+
+
+<h3>7.8&nbsp; Annotating assembler programs</h3>
+Valgrind can annotate assembler programs too, or annotate the assembler
+generated for your C program.  Sometimes this is useful for understanding what
+is really happening when an interesting line of C code is translated into
+multiple instructions.<p>
+
+To do this, you just need to assemble your <code>.s</code> files with
+assembler-level debug information.  gcc doesn't do this, but you can use GNU as
+with the <code>--gstabs</code> option to generate object files with this
+information, eg:
+
+<blockquote><code>as --gstabs foo.s</code></blockquote>
+
+You can then profile and annotate source files in the same way as for C/C++
+programs.
+
+
+<h3>7.9&nbsp; vg_annotate options</h3>
+<ul>
+  <li><code>-h, --help</code></li><p>
+  <li><code>-v, --version</code><p>
+
+      Help and version, as usual.</li>
+
+  <li><code>--sort=A,B,C</code> [default: order in 
+      <code>cachegrind.out</code>]<p>
+      Specifies the events upon which the sorting of the function-by-function
+      entries will be based.  Useful if you want to concentrate on eg. I cache
+      misses (<code>--sort=I1mr,I2mr</code>), or D cache misses
+      (<code>--sort=D1mr,D2mr</code>), or L2 misses
+      (<code>--sort=D2mr,I2mr</code>).</li><p>
+
+  <li><code>--show=A,B,C</code> [default: all, using order in
+      <code>cachegrind.out</code>]<p>
+      Specifies which events to show (and the column order). Default is to use
+      all present in the <code>cachegrind.out</code> file (and use the order in
+      the file).</li><p>
+
+  <li><code>--threshold=X</code> [default: 99%] <p>
+      Sets the threshold for the function-by-function summary.  Functions are
+      shown that account for more than X% of all the primary sort events.  If
+      auto-annotating, also affects which files are annotated.</li><p>
+
+  <li><code>--auto=no</code> [default]<br>
+      <code>--auto=yes</code> <p>
+      When enabled, automatically annotates every file that is mentioned in the
+      function-by-function summary that can be found.  Also gives a list of
+      those that couldn't be found.
+
+  <li><code>--context=N</code> [default: 8]<p>
+      Print N lines of context before and after each annotated line.  Avoids
+      printing large sections of source files that were not executed.  Use a 
+      large number (eg. 10,000) to show all source lines.
+      </li><p>
+
+  <li><code>-I=&lt;dir&gt;, --include=&lt;dir&gt;</code> 
+      [default: empty string]<p>
+      Adds a directory to the list in which to search for files.  Multiple
+      -I/--include options can be given to add multiple directories.
+</ul>
+  
+
+<h3>7.10&nbsp; Warnings</h3>
+There are a couple of situations in which vg_annotate issues warnings.
+
+<ul>
+  <li>If a source file is more recent than the <code>cachegrind.out</code>
+      file.  This is because the information in <code>cachegrind.out</code> is
+      only recorded with line numbers, so if the line numbers change at all in
+      the source (eg. lines added, deleted, swapped), any annotations will be 
+      incorrect.<p>
+
+  <li>If information is recorded about line numbers past the end of a file.
+      This can be caused by the above problem, ie. shortening the source file
+      while using an old <code>cachegrind.out</code> file.  If this happens,
+      the figures for the bogus lines are printed anyway (clearly marked as
+      bogus) in case they are important.</li><p>
+</ul>
+
+
+<h3>7.10&nbsp; Things to watch out for</h3>
+Some odd things that can occur during annotation:
+
+<ul>
+  <li>If annotating at the assembler level, you might see something like this:
+
+      <pre>
+      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
+      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
+      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
+      .    .    .  .    .    .  .    .    .          .align 4,0x90
+      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
+      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
+      </pre>
+
+      How can the third instruction be executed twice when the others are
+      executed only once?  As it turns out, it isn't.  Here's a dump of the
+      executable, from objdump:
+
+      <pre>
+      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
+      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
+      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
+      8048f32:       89 f6                   mov    %esi,%esi
+      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
+      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
+      </pre>
+
+      Notice the extra <code>mov %esi,%esi</code> instruction.  Where did this
+      come from?  The GNU assembler inserted it to serve as the two bytes of
+      padding needed to align the <code>movl $.LnrB,%eax</code> instruction on
+      a four-byte boundary, but pretended it didn't exist when adding debug
+      information.  Thus when Valgrind reads the debug info it thinks that the
+      <code>movl $0x1,0xffffffec(%ebp)</code> instruction covers the address
+      range 0x8048f2b--0x804833 by itself, and attributes the counts for the
+      <code>mov %esi,%esi</code> to it.<p>
+  </li>
+
+  <li>
+      Inlined functions can cause strange results in the function-by-function
+      summary.  If a function <code>inline_me()</code> is defined in
+      <code>foo.h</code> and inlined in the functions <code>f1()</code>,
+      <code>f2()</code> and <code>f3()</code> in <code>bar.c</code>, there will
+      not be a <code>foo.h:inline_me()</code> function entry.  Instead, there
+      will be separate function entries for each inlining site, ie.
+      <code>foo.h:f1()</code>, <code>foo.h:f2()</code> and
+      <code>foo.h:f3()</code>.  To find the total counts for
+      <code>foo.h:inline_me()</code>, add up the counts from each entry.<p>
+
+      The reason for this is that although the debug info output by gcc
+      indicates the switch from <code>bar.c</code> to <code>foo.h</code>, it
+      doesn't indicate the name of the function in <code>foo.h</code>, so
+      Valgrind keeps using the old one.<p>
+
+  <li>
+      Sometimes, the same filename might be represented with a relative name
+      and with an absolute name in different parts of the debug info, eg:
+      <code>/home/user/proj/proj.h</code> and <code>../proj.h</code>.  In this
+      case, if you use auto-annotation, the file will be annotated twice with
+      the counts split between the two.<p>
+  </li>
+</ul>
+
+Note: stabs is not an easy format to read.  If you come across bizarre
+annotations that look like might be caused by a bug in the stabs reader,
+please let us know.
+
+
+<h3>7.11&nbsp; Accuracy</h3>
+Valgrind's cache profiling has a number of shortcomings:
+
+<ul>
+  <li>It doesn't account for kernel activity -- the effect of system calls on
+      the cache contents is ignored.</li><p>
+
+  <li>It doesn't account for other process activity (although this is probably
+      desirable when considering a single program).</li><p>
+
+  <li>It doesn't account for virtual-to-physical address mappings;  hence the
+      entire simulation is not a true representation of what's happening in the
+      cache.</li><p>
+
+  <li>It doesn't account for cache misses not visible at the instruction level,
+      eg. those arising from TLB misses, or speculative execution.</li><p>
+</ul>
+
+Another thing worth nothing is that results are very sensitive.  Changing the
+size of the <code>valgrind.so</code> file, the size of the program being
+profiled, or even the length of its name can perturb the results.  Variations
+will be small, but don't expect perfectly repeatable results if your program
+changes at all.<p>
+
+While these factors mean you shouldn't trust the results to be super-accurate,
+hopefully they should be close enough to be useful.<p>
+
+
+<h3>7.12&nbsp; Todo</h3>
+<ul>
+  <li>Use CPUID instruction to auto-identify cache configuration during 
+      installation.  This would save the user from having to know their cache
+      configuration and using vg_cachegen.</li><p>
+  <li>Program start-up/shut-down calls a lot of functions that aren't
+      interesting and just complicate the output.  Would be nice to exclude
+      these somehow.</li><p>
+</ul> 
+<hr width="100%">
 </body>
 </html>
+
diff --git a/none/Makefile.am b/none/Makefile.am
index b13d2439e7..fcbfae9b57 100644
--- a/none/Makefile.am
+++ b/none/Makefile.am
@@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst
 
 INCLUDES = -I$(srcdir)/demangle
 
-bin_SCRIPTS = valgrind
+bin_SCRIPTS = valgrind vg_annotate vg_cachegen
 
 SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
 
@@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c
 valgrind_so_SOURCES = \
 	vg_clientfuncs.c \
 	vg_scheduler.c \
+        vg_cachesim.c \
 	vg_clientmalloc.c \
 	vg_clientperms.c \
 	vg_demangle.c \
@@ -69,14 +70,20 @@ valgrind_so_LDADD = \
 include_HEADERS = valgrind.h
 
 noinst_HEADERS = \
+        vg_cachesim_I1.c        \
+        vg_cachesim_D1.c        \
+        vg_cachesim_L2.c        \
         vg_kerneliface.h        \
         vg_include.h            \
         vg_constants.h          \
         vg_unsafe.h
 
-
 MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS)
 
+vg_annotate:
+
+vg_cachegen:
+
 vg_memory.o: vg_memory.c $(MANUAL_DEPS)
 	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
 
diff --git a/valgrind.in b/valgrind.in
index 7f18890796..b10f48795b 100755
--- a/valgrind.in
+++ b/valgrind.in
@@ -65,6 +65,8 @@ do
     --workaround-gcc296-bugs=yes)   vgopts="$vgopts $arg"; shift;;
     --freelist-vol=*)       vgopts="$vgopts $arg"; shift;;
     --suppressions=*)       vgopts="$vgopts $arg"; shift;;
+    --cachesim=yes)         vgopts="$vgopts $arg"; shift;;
+    --cachesim=no)          vgopts="$vgopts $arg"; shift;;
 #   options for debugging Valgrind
     --sanity-level=*)       vgopts="$vgopts $arg"; shift;;
     --single-step=yes)      vgopts="$vgopts $arg"; shift;;
@@ -129,6 +131,7 @@ if [ $# = 0 ] || [ z"$dousage" = z1 ]; then
    echo "                              suppressions file <filename>"
    echo "    --check-addrVs=no|yes     experimental lighterweight checking? [yes]"
    echo "                              yes == Valgrind's original behaviour"
+   echo "    --cachesim=no|yes         do cache profiling? [no]"
    echo
    echo "  options for debugging Valgrind itself are:"
    echo "    --sanity-level=<number>   level of sanity checking to do [1]"
diff --git a/vg_annotate.in b/vg_annotate.in
new file mode 100644
index 0000000000..b3b091cc23
--- /dev/null
+++ b/vg_annotate.in
@@ -0,0 +1,892 @@
+#! /usr/bin/perl -w
+##--------------------------------------------------------------------##
+##--- The cache simulation framework: instrumentation, recording   ---##
+##--- and results printing.                                        ---##
+##---                                                  vg_annotate ---##
+##--------------------------------------------------------------------##
+
+#  This file is part of Valgrind, an x86 protected-mode emulator 
+#  designed for debugging and profiling binaries on x86-Unixes.
+#
+#  Copyright (C) 2000-2002 Julian Seward 
+#     jseward@acm.org
+#     Julian_Seward@muraroa.demon.co.uk
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU General Public License as
+#  published by the Free Software Foundation; either version 2 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+#  02111-1307, USA.
+#
+#  The GNU General Public License is contained in the file LICENSE.
+
+#----------------------------------------------------------------------------
+# Annotator for valgrind --cachesim=yes.
+#
+# Input file has the following format:
+# <file>         ::= <desc_line>* <cmd_line> <events_line> 
+#                    <data_line>+ <summary_line>
+# <desc_line>    ::= "desc:" <ws>? <non_nl_string>
+# <cmd_line>     ::= "cmd:" <ws>? <cmd>
+# <events_line>  ::= "events:" <ws>? (<event> <ws>)+
+# <data_line>    ::= <file_line> | <fn_line> | <count_line>
+# <file_line>    ::= ("fl=" | "fi=" | "fe=" ) <filename>
+# <fn_line>      ::= "fn="<filename>
+# <count_line>   ::= <line_num> <ws>? (<count> <ws>)+
+# <summary_line> ::= "summary:" <ws>? (<count> <ws>)+
+# <count>        ::= <num> | "."
+#
+# where
+#
+# <non_nl_string> is any string not containing a newline
+# <cmd> is a command line invocation
+# <filename> and <fn_name> can be anything 
+# <num> and <line_num> are decimal numbers
+# <ws> is whitespace.
+# <nl> is a newline
+#
+# The contents of the "desc:" lines is printed out at the top of the summary.
+# This is a generic way of providing simulation specific information, eg. for
+# giving the cache configuration for cache simulation.
+#
+# Counts can be ".", to represent "N/A".
+#
+# The number of counts in each <line> and the <summary_line> should not exceed
+# the number of events in the <event_line>.  If the number in each <line> is
+# less, we use "." for the the missing counts (the last however-many).
+#
+# A <file_line> changes the current file name.  A <fn_line> changes the current
+# function name.  A <count_line> contains counts that pertain to the current
+# filename/fn_name.  A "fn=" <file_line> and a <fn_line> must appear before any
+# <count_line>s to give the context of the first <count_line>s.
+#
+# Each <file_line> should be immediately followed by a <fn_line>.  "fi="
+# <file_lines> are used to switch filenames for inlined functions;  "fe="
+# <file_lines> are similar, but are put at the end of a basic block in which
+# the file name hasn't been switched back to the original file name.  (fi and
+# fe lines behave the same, they are only distinguished to help debugging.)
+#
+#----------------------------------------------------------------------------
+# Performance improvements record, using cachegrind.out for cacheprof, doing no
+# source annotation (irrelevant ones removed):
+#                                                               user time
+# 1. turned off warnings in add_hash_a_to_b()                   3.81 --> 3.48s
+#    [now add_array_a_to_b()]
+# 6. make line_to_CC() return a ref instead of a hash           3.01 --> 2.77s
+#
+#10. changed file format to avoid file/fn name repetition       2.40s
+#    (not sure why higher;  maybe due to new '.' entries?)
+#11. changed file format to drop unnecessary end-line "."s      2.36s
+#    (shrunk file by about 37%)
+#12. switched from hash CCs to array CCs                        1.61s
+#13. only adding b[i] to a[i] if b[i] defined (was doing it if
+#    either a[i] or b[i] was defined, but if b[i] was undefined
+#    it just added 0)                                           1.48s
+#14. Stopped converting "." entries to undef and then back      1.16s
+#15. Using foreach $i (x..y) instead of for ($i = 0...) in
+#    add_array_a_to_b()                                         1.11s
+#
+# Auto-annotating primes:
+#16. Finding count lengths by int((length-1)/3), not by
+#    commifying (halves the number of commify calls)            1.68s --> 1.47s
+
+use strict;
+
+#----------------------------------------------------------------------------
+# Overview: the running example in the comments is for:
+#   - events = A,B,C,D
+#   - --show=C,A,D
+#   - --sort=D,C
+#----------------------------------------------------------------------------
+
+#----------------------------------------------------------------------------
+# Global variables, main data structures
+#----------------------------------------------------------------------------
+# CCs are arrays, the counts corresponding to @events, with 'undef'
+# representing '.'.  This makes things fast (faster than using hashes for CCs)
+# but we have to use @sort_order and @show_order below to handle the --sort and
+# --show options, which is a bit tricky.
+#----------------------------------------------------------------------------
+
+# Total counts for summary (an array reference).
+my $summary_CC;
+
+# Totals for each function, for overall summary.
+# hash(filename:fn_name => CC array)
+my %fn_totals;
+
+# Individual CCs, organised by filename and line_num for easy annotation.
+# hash(filename => hash(line_num => CC array))
+my %all_ind_CCs;
+
+# Files chosen for annotation on the command line.  
+# key = basename (trimmed of any directory), value = full filename
+my %user_ann_files;
+
+# Generic description string.
+my $desc = "";
+
+# Command line of profiled program.
+my $cmd;
+
+# Events in input file, eg. (A,B,C,D)
+my @events;
+
+# Events to show, from command line, eg. (C,A,D)
+my @show_events;
+
+# Map from @show_events indices to @events indices, eg. (2,0,3).  Gives the
+# order in which we must traverse @events in order to show the @show_events, 
+# eg. (@events[$show_order[1]], @events[$show_order[2]]...) = @show_events.
+# (Might help to think of it like a hash (0 => 2, 1 => 0, 2 => 3).)
+my @show_order;
+
+# Print out the function totals sorted by these events, eg. (D,C).
+my @sort_events;
+
+# Map from @sort_events indices to @events indices, eg. (3,2).  Same idea as
+# for @show_order
+my @sort_order;
+
+# Threshold;  whatever event is the primary sort, we print out functions
+# representing more than this proportion of 'event' events.
+my $threshold = 99;
+
+# If on, automatically annotates all files that are involved in getting over
+# the threshold count of the primary sort event.
+my $auto_annotate = 0;
+
+# Number of lines to show around each annotated line.
+my $context = 8;
+
+# Directories in which to look for annotation files.
+my @include_dirs = ("");
+
+# Input file name
+my $input_file = "cachegrind.out";
+
+# Version number
+my $version = "@VERSION@";
+
+# Usage message.
+my $usage = <<END
+usage: vg_annotate [options] [source-files]
+
+  options for the user, with defaults in [ ], are:
+    -h --help             show this message
+    -v --version          show version
+    --show=A,B,C          only show figures for events A,B,C [all]
+    --sort=A,B,C          sort columns by events A,B,C [event column order]
+    --threshold=<0--100>  percentage of counts (of primary sort event) we
+                          are interested in [$threshold%]
+    --auto=yes|no         annotate all source files containing functions
+                          that helped reach the event count threshold [no]
+    --context=N           print N lines of context before and after
+                          annotated lines [8]
+    -I --include=<dir>    add <dir> to list of directories to search for 
+                          source files
+
+  Valgrind is Copyright (C) 2000-2002 Julian Seward
+  and licensed under the GNU General Public License, version 2.
+  Bug reports, feedback, admiration, abuse, etc, to: jseward\@acm.org.
+
+END
+;
+
+# Used in various places of output.
+my $fancy = '-' x 80 . "\n";
+
+#-----------------------------------------------------------------------------
+# Argument and option handling
+#-----------------------------------------------------------------------------
+sub process_cmd_line() 
+{
+    for my $arg (@ARGV) { 
+
+        # Option handling
+        if ($arg =~ /^-/) {
+
+            # --version
+            if ($arg =~ /^-v$|^--version$/) {
+                die("vg_annotate$version\n");
+
+            # --show=A,B,C
+            } elsif ($arg =~ /^--show=(.*)$/) {
+                @show_events = split(/,/, $1);
+
+            # --sort=A,B,C
+            } elsif ($arg =~ /^--sort=(.*)$/) {
+                @sort_events = split(/,/, $1);
+
+            # --threshold=X (tolerates a trailing '%')
+            } elsif ($arg =~ /^--threshold=([\d\.]+)%?$/) {
+                $threshold = $1;
+                if ($threshold < 0 || $threshold > 100) {
+                    die($usage);
+                }
+
+            # --auto=yes|no
+            } elsif ($arg =~ /^--auto=(yes|no)$/) {
+                $auto_annotate = 1 if ($1 eq "yes");
+                $auto_annotate = 0 if ($1 eq "no");
+
+            # --context=N
+            } elsif ($arg =~ /^--context=([\d\.]+)$/) {
+                $context = $1;
+                if ($context < 0) {
+                    die($usage);
+                }
+
+            # --include=A,B,C
+            } elsif ($arg =~ /^(-I|--include)=(.*)$/) {
+                my $inc = $2;
+                $inc =~ s|/$||;         # trim trailing '/'
+                push(@include_dirs, "$inc/");
+
+            } else {            # -h and --help fall under this case
+                die($usage);
+            }
+
+        # Argument handling -- annotation file checking and selection.
+        # Stick filenames into a hash for quick 'n easy lookup throughout
+        } else {
+            my $readable = 0;
+            foreach my $include_dir (@include_dirs) {
+                if (-r $include_dir . $arg) {
+                    $readable = 1;
+                }
+            }
+            $readable or die("File $arg not found in any of: @include_dirs\n");
+            $user_ann_files{$arg} = 1;
+        } 
+    }
+}
+
+#-----------------------------------------------------------------------------
+# Reading of input file
+#-----------------------------------------------------------------------------
+sub max ($$) 
+{
+    my ($x, $y) = @_;
+    return ($x > $y ? $x : $y);
+}
+
+# Add the two arrays;  any '.' entries are ignored.  Two tricky things:
+# 1. If $a2->[$i] is undefined, it defaults to 0 which is what we want; we turn
+#    off warnings to allow this.  This makes things about 10% faster than
+#    checking for definedness ourselves.
+# 2. We don't add a ".", even though it's value is 0, because we don't want to
+#    make an $a2->[$i] that is undef become 0 unnecessarily.
+sub add_array_a_to_b ($$) 
+{
+    my ($a1, $a2) = @_;
+
+    my $n = max(scalar @$a1, scalar @$a2);
+    $^W = 0;
+    foreach my $i (0 .. $n-1) {
+        $a2->[$i] += $a1->[$i] if ("." ne $a1->[$i]);
+    }
+    $^W = 1;
+}
+
+# Add each event count to the CC array.  '.' counts become undef, as do
+# missing entries (implicitly).
+sub line_to_CC ($)
+{
+    my @CC = (split /\s+/, $_[0]);
+    (@CC <= @events) or die("Line $.: too many event counts\n");
+    return \@CC;
+}
+
+sub read_input_file() 
+{
+    open(INPUTFILE, "< $input_file") || die "File $input_file not opened\n";
+
+    # Read "desc:" lines.
+    my $line;
+    # This gives a "uninitialized value in substitution (s///)" warning; hmm...
+    #while ($line = <INPUTFILE> && $line =~ s/desc:\s+//) {
+    #    $desc .= "$line\n";
+    #}
+    while (1) {
+        $line = <INPUTFILE>;
+        if ($line =~ s/desc:\s+//) {
+            $desc .= $line;
+        } else {
+            last;
+        }
+    }
+
+    # Read "cmd:" line (Nb: will already be in $line from "desc:" loop above).
+    ($line =~ s/cmd:\s+//) or die("Line $.: missing command line\n");
+    $cmd = $line;
+    chomp($cmd);    # Remove newline
+
+    # Read "events:" line.  We make a temporary hash in which the Nth event's
+    # value is N, which is useful for handling --show/--sort options below.
+    $line = <INPUTFILE>;
+    ($line =~ s/events:\s+//) or die("Line $.: missing events line\n");
+    @events = split(/\s+/, $line);
+    my %events;
+    my $n = 0;
+    foreach my $event (@events) {
+        $events{$event} = $n;
+        $n++
+    }
+
+    # If no --show arg give, default to showing all events in the file.
+    # If --show option is used, check all specified events appeared in the
+    # "events:" line.  Then initialise @show_order.
+    if (@show_events) {
+        foreach my $show_event (@show_events) {
+            (defined $events{$show_event}) or 
+                die("--show event `$show_event' did not appear in input\n");
+        }
+    } else {
+        @show_events = @events;
+    }
+    foreach my $show_event (@show_events) {
+        push(@show_order, $events{$show_event});
+    }
+
+    # Do as for --show, but if no --sort arg given, default to sorting by
+    # column order (ie. first column event is primary sort key, 2nd column is
+    # 2ndary key, etc).
+    if (@sort_events) {
+        foreach my $sort_event (@sort_events) {
+            (defined $events{$sort_event}) or 
+                die("--sort event `$sort_event' did not appear in input\n");
+        }
+    } else {
+        @sort_events = @events;
+    }
+    foreach my $sort_event (@sort_events) {
+        push(@sort_order, $events{$sort_event});
+    }
+
+    my $curr_file;
+    my $curr_fn;
+    my $curr_name;
+
+    my $curr_fn_CC = [];
+    my $curr_file_ind_CCs = {};     # hash(line_num => CC)
+
+    # Read body of input file.
+    while (<INPUTFILE>) {
+        s/#.*$//;   # remove comments
+        if (s/^(\d+)\s+//) {
+            my $line_num = $1;
+            my $CC = line_to_CC($_);
+            add_array_a_to_b($CC, $curr_fn_CC);
+            
+            # If curr_file is selected, add CC to curr_file list.  We look for
+            # full filename matches;  or, if auto-annotating, we have to
+            # remember everything -- we won't know until the end what's needed.
+            if ($auto_annotate || defined $user_ann_files{$curr_file}) {
+                my $tmp = $curr_file_ind_CCs->{$line_num};
+                $tmp = [] unless defined $tmp;
+                add_array_a_to_b($CC, $tmp);
+                $curr_file_ind_CCs->{$line_num} = $tmp;
+            }
+
+        } elsif (s/^fn=(.*)$//) {
+            # Commit result from previous function
+            $fn_totals{$curr_name} = $curr_fn_CC if (defined $curr_name);
+
+            # Setup new one
+            $curr_fn = $1;
+            $curr_name = "$curr_file:$curr_fn";
+            $curr_fn_CC = $fn_totals{$curr_name};
+            $curr_fn_CC = [] unless (defined $curr_fn_CC);
+
+        } elsif (s/^fl=(.*)$//) {
+            $all_ind_CCs{$curr_file} = $curr_file_ind_CCs 
+                if (defined $curr_file);
+
+            $curr_file = $1;
+            $curr_file_ind_CCs = $all_ind_CCs{$curr_file};
+            $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs);
+
+        } elsif (s/^(fi|fe)=(.*)$//) {
+            (defined $curr_name) or die("Line $.: Unexpected fi/fe line\n");
+            $fn_totals{$curr_name} = $curr_fn_CC;
+            $all_ind_CCs{$curr_file} = $curr_file_ind_CCs;
+
+            $curr_file = $2;
+            $curr_name = "$curr_file:$curr_fn";
+            $curr_file_ind_CCs = $all_ind_CCs{$curr_file};
+            $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs);
+            $curr_fn_CC = $fn_totals{$curr_name};
+            $curr_fn_CC = [] unless (defined $curr_fn_CC);
+
+        } elsif (s/^\s*$//) {
+            # blank, do nothing
+        
+        } elsif (s/^summary:\s+//) {
+            # Finish up handling final filename/fn_name counts
+            $fn_totals{"$curr_file:$curr_fn"} = $curr_fn_CC 
+                if (defined $curr_file && defined $curr_fn);
+            $all_ind_CCs{$curr_file} = 
+                $curr_file_ind_CCs if (defined $curr_file);
+
+            $summary_CC = line_to_CC($_);
+            (scalar(@$summary_CC) == @events) 
+                or die("Line $.: summary event and total event mismatch\n");
+
+        } else {
+            warn("WARNING: line $. malformed, ignoring\n");
+        }
+    }
+
+    # Check if summary line was present
+    if (not defined $summary_CC) {
+        warn("WARNING: missing final summary line, no summary will be printed\n");
+    }
+
+    close(INPUTFILE);
+}
+
+#-----------------------------------------------------------------------------
+# Print options used
+#-----------------------------------------------------------------------------
+sub print_options ()
+{
+    print($fancy);
+    print($desc);
+    print("Command:          $cmd\n");
+    print("Events recorded:  @events\n");
+    print("Events shown:     @show_events\n");
+    print("Event sort order: @sort_events\n");
+    print("Threshold:        $threshold%\n");
+
+    my @include_dirs2 = @include_dirs;  # copy @include_dirs
+    shift(@include_dirs2);       # remove "" entry, which is always the first
+    unshift(@include_dirs2, "") if (0 == @include_dirs2); 
+    my $include_dir = shift(@include_dirs2);
+    print("Include dirs:     $include_dir\n");
+    foreach my $include_dir (@include_dirs2) {
+        print("                  $include_dir\n");
+    }
+
+    my @user_ann_files = keys %user_ann_files;
+    unshift(@user_ann_files, "") if (0 == @user_ann_files); 
+    my $user_ann_file = shift(@user_ann_files);
+    print("User annotated:   $user_ann_file\n");
+    foreach $user_ann_file (@user_ann_files) {
+        print("                  $user_ann_file\n");
+    }
+
+    my $is_on = ($auto_annotate ? "on" : "off");
+    print("Auto-annotation:  $is_on\n");
+    print("\n");
+}
+
+#-----------------------------------------------------------------------------
+# Print summary and sorted function totals
+#-----------------------------------------------------------------------------
+sub mycmp ($$) 
+{
+    my ($c, $d) = @_;
+
+    # Iterate through sort events (eg. 3,2); return result if two are different
+    foreach my $i (@sort_order) {
+        my ($x, $y);
+        $x = $c->[$i];
+        $y = $d->[$i];
+        $x = -1 unless defined $x;
+        $y = -1 unless defined $y;
+
+        my $cmp = $y <=> $x;        # reverse sort
+        if (0 != $cmp) {
+            return $cmp;
+        }
+    }
+    # Exhausted events, equal
+    return 0;
+}
+
+sub commify ($) {
+    my ($val) = @_;
+    1 while ($val =~ s/^(\d+)(\d{3})/$1,$2/);
+    return $val;
+}
+
+# Because the counts can get very big, and we don't want to waste screen space
+# and make lines too long, we compute exactly how wide each column needs to be
+# by finding the widest entry for each one.
+sub compute_CC_col_widths (@) 
+{
+    my @CCs = @_;
+    my $CC_col_widths = [];
+
+    # Initialise with minimum widths (from event names)
+    foreach my $event (@events) {
+        push(@$CC_col_widths, length($event));
+    }
+    
+    # Find maximum width count for each column.  @CC_col_width positions
+    # correspond to @CC positions.
+    foreach my $CC (@CCs) {
+        foreach my $i (0 .. scalar(@$CC)-1) {
+            if (defined $CC->[$i]) {
+                # Find length, accounting for commas that will be added
+                my $length = length $CC->[$i];
+                my $clength = $length + int(($length - 1) / 3);
+                $CC_col_widths->[$i] = max($CC_col_widths->[$i], $clength); 
+            }
+        }
+    }
+    return $CC_col_widths;
+}
+
+# Print the CC with each column's size dictated by $CC_col_widths.
+sub print_CC ($$) 
+{
+    my ($CC, $CC_col_widths) = @_;
+
+    foreach my $i (@show_order) {
+        my $count = (defined $CC->[$i] ? commify($CC->[$i]) : ".");
+        my $space = ' ' x ($CC_col_widths->[$i] - length($count));
+        print("$space$count ");
+    }
+}
+
+sub print_events ($)
+{
+    my ($CC_col_widths) = @_;
+
+    foreach my $i (@show_order) { 
+        my $event       = $events[$i];
+        my $event_width = length($event);
+        my $col_width   = $CC_col_widths->[$i];
+        my $space       = ' ' x ($col_width - $event_width);
+        print("$event$space ");
+    }
+}
+
+# Prints summary and function totals (with separate column widths, so that
+# function names aren't pushed over unnecessarily by huge summary figures).
+# Also returns a hash containing all the files that are involved in getting the
+# events count above the threshold (ie. all the interesting ones).
+sub print_summary_and_fn_totals ()
+{
+    my @fn_fullnames = keys   %fn_totals;
+
+    # Work out the size of each column for printing (summary and functions
+    # separately).
+    my $summary_CC_col_widths = compute_CC_col_widths($summary_CC);
+    my      $fn_CC_col_widths = compute_CC_col_widths(values %fn_totals);
+
+    # Header and counts for summary
+    print($fancy);
+    print_events($summary_CC_col_widths);
+    print("\n");
+    print($fancy);
+    print_CC($summary_CC, $summary_CC_col_widths);
+    print(" PROGRAM TOTALS\n");
+    print("\n");
+
+    # Header for functions
+    print($fancy);
+    print_events($fn_CC_col_widths);
+    print(" file:function\n");
+    print($fancy);
+
+    # Sort function names into order dictated by --sort option.
+    @fn_fullnames = sort {
+        mycmp($fn_totals{$a}, $fn_totals{$b})
+    } @fn_fullnames;
+
+    # The thresholded event is the one that is the primary sort event.
+    my $threshold_files       = {};
+    my $threshold_event_index = $sort_order[0];
+    my $threshold_total       = $summary_CC->[$threshold_event_index];
+    my $curr_total            = 0;
+
+    # Print functions, stopping when the threshold has been reached.
+    foreach my $fn_name (@fn_fullnames) {
+
+        # Stop when we've reached the threshold
+        last if ($curr_total * 100 / $threshold_total >= $threshold);
+
+        # Print function results
+        my $fn_CC = $fn_totals{$fn_name};
+        print_CC($fn_CC, $fn_CC_col_widths);
+        print(" $fn_name\n");
+
+        # Update the threshold counting
+        my $filename = $fn_name;
+        $filename =~ s/:[^:]+$//;    # remove function name
+        $threshold_files->{$filename} = 1;
+        $curr_total += $fn_CC->[$threshold_event_index] 
+            if (defined $fn_CC->[$threshold_event_index]);
+    }
+    print("\n");
+
+    return $threshold_files;
+}
+
+#-----------------------------------------------------------------------------
+# Annotate selected files
+#-----------------------------------------------------------------------------
+
+# Issue a warning that the source file is more recent than the input file. 
+sub warning_on_src_more_recent_than_inputfile ($)
+{
+    my $src_file = $_[0];
+
+    my $warning = <<END
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ Source file '$src_file' is more recent than input file '$input_file'.
+@ Annotations may not be correct.
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+END
+;
+    print($warning);
+}
+
+# If there is information about lines not in the file, issue a warning
+# explaining possible causes.
+sub warning_on_nonexistent_lines ($$$)
+{
+    my ($src_more_recent_than_inputfile, $src_file, $excess_line_nums) = @_;
+    my $cause_and_solution;
+
+    if ($src_more_recent_than_inputfile) {
+        $cause_and_solution = <<END
+@@ cause:    '$src_file' has changed since information was gathered.
+@@           If so, a warning will have already been issued about this.
+@@ solution: Recompile program and rerun under "valgrind --cachesim=yes" to 
+@@           gather new information.
+END
+    # We suppress warnings about .h files
+    } elsif ($src_file =~ /\.h$/) {
+        $cause_and_solution = <<END
+@@ cause:    bug in the Valgrind's debug info reader that screws up with .h
+@@           files sometimes
+@@ solution: none, sorry
+END
+    } else {
+        $cause_and_solution = <<END
+@@ cause:    not sure, sorry
+END
+    }
+
+    my $warning = <<END
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@
+@@ Information recorded about lines past the end of '$src_file'.
+@@
+@@ Probable cause and solution:
+$cause_and_solution@@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+END
+;
+    print($warning);
+}
+
+sub annotate_ann_files($)
+{
+    my ($threshold_files) = @_; 
+
+    my %all_ann_files;
+    my @unfound_auto_annotate_files;
+    my $printed_totals_CC = [];
+
+    # If auto-annotating, add interesting files (but not "???")
+    if ($auto_annotate) {
+        delete $threshold_files->{"???"};
+        %all_ann_files = (%user_ann_files, %$threshold_files) 
+    } else {
+        %all_ann_files = %user_ann_files;
+    }
+
+    # Track if we did any annotations.
+    my $did_annotations = 0;
+
+    LOOP:
+    foreach my $src_file (keys %all_ann_files) {
+
+        my $opened_file = "";
+        my $full_file_name = "";
+        foreach my $include_dir (@include_dirs) {
+            my $try_name = $include_dir . $src_file;
+            if (open(INPUTFILE, "< $try_name")) {
+                $opened_file    = $try_name;
+                $full_file_name = ($include_dir eq "" 
+                                  ? $src_file 
+                                  : "$include_dir + $src_file"); 
+                last;
+            }
+        }
+        
+        if (not $opened_file) {
+            # Failed to open the file.  If chosen on the command line, die.
+            # If arose from auto-annotation, print a little message.
+            if (defined $user_ann_files{$src_file}) {
+                die("File $src_file not opened in any of: @include_dirs\n");
+
+            } else {
+                push(@unfound_auto_annotate_files, $src_file);
+            }
+
+        } else {
+            # File header (distinguish between user- and auto-selected files).
+            print("$fancy");
+            my $ann_type = 
+                (defined $user_ann_files{$src_file} ? "User" : "Auto");
+            print("-- $ann_type-annotated source: $full_file_name\n");
+            print("$fancy");
+
+            # Get file's CCs
+            my $src_file_CCs = $all_ind_CCs{$src_file};
+            if (!defined $src_file_CCs) {
+                print("  No information has been collected for $src_file\n\n");
+                next LOOP;
+            }
+        
+            $did_annotations = 1;
+            
+            # Numeric, not lexicographic sort!
+            my @line_nums = sort {$a <=> $b} keys %$src_file_CCs;  
+
+            # If $src_file more recent than cachegrind.out, issue warning
+            my $src_more_recent_than_inputfile = 0;
+            if ((stat $opened_file)[9] > (stat $input_file)[9]) {
+                $src_more_recent_than_inputfile = 1;
+                warning_on_src_more_recent_than_inputfile($src_file);
+            }
+
+            # Work out the size of each column for printing
+            my $CC_col_widths = compute_CC_col_widths(values %$src_file_CCs);
+
+            # Events header
+            print_events($CC_col_widths);
+            print("\n\n");
+
+            # Shift out 0 if it's in the line numbers (from unknown entries,
+            # likely due to bugs in Valgrind's stabs debug info reader)
+            shift(@line_nums) if (0 == $line_nums[0]);
+
+            # Finds interesting line ranges -- all lines with a CC, and all
+            # lines within $context lines of a line with a CC.
+            my $n = @line_nums;
+            my @pairs;
+            for (my $i = 0; $i < $n; $i++) {
+                push(@pairs, $line_nums[$i] - $context);   # lower marker
+                while ($i < $n-1 && 
+                       $line_nums[$i] + 2*$context >= $line_nums[$i+1]) {
+                    $i++;
+                }
+                push(@pairs, $line_nums[$i] + $context);   # upper marker
+            }
+
+            # Annotate chosen lines, tracking total counts of lines printed
+            $pairs[0] = 1 if ($pairs[0] < 1);
+            while (@pairs) {
+                my $low  = shift @pairs;
+                my $high = shift @pairs;
+                while ($. < $low-1) {
+                    my $tmp = <INPUTFILE>;
+                    last unless (defined $tmp);     # hack to detect EOF
+                }
+                my $src_line;
+                # Print line number, unless start of file
+                print("-- line $low " . '-' x 40 . "\n") if ($low != 1);
+                while (($. < $high) && ($src_line = <INPUTFILE>)) {
+                    if (defined $line_nums[0] && $. == $line_nums[0]) {
+                        print_CC($src_file_CCs->{$.}, $CC_col_widths);
+                        add_array_a_to_b($src_file_CCs->{$.}, 
+                                         $printed_totals_CC);
+                        shift(@line_nums);
+
+                    } else {
+                        print_CC( [], $CC_col_widths);
+                    }
+
+                    print(" $src_line");
+                }
+                # Print line number, unless EOF
+                if ($src_line) {
+                    print("-- line $high " . '-' x 40 . "\n");
+                } else {
+                    last;
+                }
+            }
+
+            # If there was info on lines past the end of the file...
+            if (@line_nums) {
+                foreach my $line_num (@line_nums) {
+                    print_CC($src_file_CCs->{$line_num}, $CC_col_widths);
+                    print(" <bogus line $line_num>\n");
+                }
+                print("\n");
+                warning_on_nonexistent_lines($src_more_recent_than_inputfile,
+                                             $src_file, \@line_nums);
+            }
+            print("\n");
+
+            # Print summary of counts attributed to file but not to any
+            # particular line (due to incomplete debug info).
+            if ($src_file_CCs->{0}) {
+                print_CC($src_file_CCs->{0}, $CC_col_widths);
+                print(" <counts for unidentified lines in $src_file>\n\n");
+            }
+            
+            close(INPUTFILE);
+        }
+    }
+
+    # Print list of unfound auto-annotate selected files.
+    if (@unfound_auto_annotate_files) {
+        print("$fancy");
+        print("The following files chosen for auto-annotation could not be found:\n");
+        print($fancy);
+        foreach my $f (@unfound_auto_annotate_files) {
+            print("  $f\n");
+        }
+        print("\n");
+    }
+
+    # If we did any annotating, print what proportion of events were covered by
+    # annotated lines above.
+    if ($did_annotations) {
+        my $percent_printed_CC;
+        foreach (my $i = 0; $i < @$summary_CC; $i++) {
+            $percent_printed_CC->[$i] = 
+                sprintf("%.0f", 
+                        $printed_totals_CC->[$i] / $summary_CC->[$i] * 100);
+        }
+        my $pp_CC_col_widths = compute_CC_col_widths($percent_printed_CC);
+        print($fancy);
+        print_events($pp_CC_col_widths);
+        print("\n");
+        print($fancy);
+        print_CC($percent_printed_CC, $pp_CC_col_widths);
+        print(" percentage of events annotated\n\n");
+    }
+}
+
+#----------------------------------------------------------------------------
+# "main()"
+#----------------------------------------------------------------------------
+process_cmd_line();
+read_input_file();
+print_options();
+my $threshold_files = print_summary_and_fn_totals();
+annotate_ann_files($threshold_files);
+
diff --git a/vg_cachegen.in b/vg_cachegen.in
new file mode 100755
index 0000000000..e887f1bab3
--- /dev/null
+++ b/vg_cachegen.in
@@ -0,0 +1,300 @@
+#! /usr/bin/perl -w
+##--------------------------------------------------------------------##
+##--- The cache simulator generator                                ---##
+##---                                                  vg_cachegen ---##
+##--------------------------------------------------------------------##
+
+#  This file is part of Valgrind, an x86 protected-mode emulator 
+#  designed for debugging and profiling binaries on x86-Unixes.
+#
+#  Copyright (C) 2000-2002 Julian Seward 
+#     jseward@acm.org
+#     Julian_Seward@muraroa.demon.co.uk
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU General Public License as
+#  published by the Free Software Foundation; either version 2 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+#  02111-1307, USA.
+#
+#  The GNU General Public License is contained in the file LICENSE.
+
+#----------------------------------------------------------------------------
+# Cache simulator generator, creates files vg_cachesim_{I1,D1,L2}.c to be
+# #included in vg_cachesim.c.
+#
+# Notes:
+# - simulates a write-allocate cache
+# - (block --> set) hash function uses simple bit selection
+# - handling references straddling two cache blocks:
+#     - counts as only one cache access (not two)
+#     - both blocks hit                  --> one hit
+#     - one block hits, the other misses --> one miss
+#     - both blocks miss                 --> one miss (not two)
+
+use strict;
+
+#----------------------------------------------------------------------------
+# Global variables
+#----------------------------------------------------------------------------
+my %log2 = 
+    (         1 => 0,
+              2 => 1,
+              4 => 2,
+              8 => 3,
+             16 => 4,
+             32 => 5,
+             64 => 6,
+            128 => 7,
+            256 => 8,
+            512 => 9,
+           1024 => 10,
+           2048 => 11,
+           4096 => 12,
+           8192 => 13,
+          16384 => 14,
+          32768 => 15,
+          65536 => 16,
+         131072 => 17,
+         262144 => 18,
+         524288 => 19,
+        1048576 => 20,
+        2097152 => 21,
+        4194304 => 22,
+        8388608 => 23,
+       16777216 => 24,
+       33554432 => 25,
+       67108864 => 26,
+      134217728 => 27,
+      268435456 => 28,
+      536870912 => 29,
+     1073741824 => 30);
+
+my @valid_types = 
+    ("I1", "D1", "L2");
+
+my @valid_sizes = 
+    (1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288,
+     1048576, 2097152, 4194304, 8388608, 16777216, 67108864, 134217728);
+
+my @valid_linesizes =
+    (8, 16, 32, 64, 128);
+
+my @valid_ways = 
+    (1, 2, 4, 8, 256);
+
+my %valid_types;     for my $i (@valid_types)     { $valid_types{$i}     = 1; }
+my %valid_sizes;     for my $i (@valid_sizes)     { $valid_sizes{$i}     = 1; }
+my %valid_linesizes; for my $i (@valid_linesizes) { $valid_linesizes{$i} = 1; }
+my %valid_ways;      for my $i (@valid_ways)      { $valid_ways{$i}      = 1; }
+     
+my ($L, $size, $line_size, $n_ways);
+
+my @caches;
+
+#----------------------------------------------------------------------------
+# Argument and option handling
+#----------------------------------------------------------------------------
+sub process_cmd_line ()
+{
+#----
+    my $usage = <<END
+usage: cachegen [options]
+
+  options are:
+    --I1=size,line_size,number_of_ways
+    --D1=size,line_size,number_of_ways
+    --L2=size,line_size,number_of_ways
+  
+  where
+  size           is one of @valid_sizes
+  line_size      is one of @valid_linesizes
+  number_of_ways is one of @valid_ways
+
+  Valgrind is Copyright (C) 2000-2002 Julian Seward
+  and licensed under the GNU General Public License, version 2.
+  Bug reports, feedback, admiration, abuse, etc, to: jseward\@acm.org.
+            
+END
+;
+#----
+    (@ARGV > 0) or die($usage);
+    for my $arg (@ARGV) { 
+
+        if ($arg =~ /--(I1|D1|L2)=(\d+),(\d+),(\d+)/) {
+            my ($L, $size, $line_size, $n_ways) = ($1, $2, $3, $4);
+
+            (defined $valid_sizes{$size} && 
+             defined $valid_linesizes{$line_size} && 
+             defined $valid_ways{$n_ways}) or die($usage);
+
+            # Remember cache type too, and save
+            push(@caches, [ $L, $size, $line_size, $n_ways ]); 
+
+        }
+
+        else {            # -h and --help fall under this case
+            die($usage);
+        }
+    }
+}
+
+#----------------------------------------------------------------------------
+# Printing machinery (I)
+#----------------------------------------------------------------------------
+sub shiftSequence ($$$)
+{
+    my $s = "";
+    my ($i, $n, $L) = @_;
+
+    while ($i > 0) {
+        $s .= "         ${L}_tags[set$n][$i] = ";
+        $i--;
+        $s .= "${L}_tags[set$n][$i];\n"
+    }
+    return $s;
+}
+
+sub trySet ($$$$$) 
+{
+    my $s = "";
+    my ($k, $n, $L, $hit_ending, $miss_ending) = @_;
+
+    for (my $i = 0; $i < $k; $i++) {
+        $s .= "      "
+           .  (0 == $i ? "if" : "else if")
+           .  " (tag == ${L}_tags[set$n][$i]) {\n"
+           .  shiftSequence($i, $n, $L)
+           .  (0 == $i ? "" : "         ${L}_tags[set$n][0] = tag;\n")
+           .  ("" eq $hit_ending ? "" : "         $hit_ending\n")
+           .  "      }\n";
+    }
+
+    $s .= "      else {\n"
+       .  "         /* A miss */\n"
+       .  shiftSequence($k - 1, $n, $L)             # not if exclusive
+       .  "         ${L}_tags[set$n][0] = tag;\n"   # not if exclusive
+       .  "\n"
+       .  "         $miss_ending\n"
+       .  "      }\n";
+    return $s;
+}
+
+sub print_cache_simulator (@)
+{
+    my ($L, $size, $line_size, $n_ways) = @_;
+
+    my $n_lines     = $size / $line_size;
+    my $n_sets      = $n_lines / $n_ways;
+    my $n_line_bits = $log2{$line_size};
+    my $n_set_bits  = $log2{$n_sets};
+
+    my $assoc = (1 == $n_ways ? "direct-mapped" : "$n_ways-way associative");
+    my $L1_args = "Addr a, UChar size, ULong* m1, ULong *m2";
+    my $L2_args = "Addr a, UChar size, ULong *m2";
+    my $L_args  = ($L ne "L2" ? $L1_args : $L2_args);
+
+#----
+    my $comments_cache_init_desc_and_doref_start = <<END
+/*  $L cache simulator, generated by vg_cachegen.
+ *     total size    = $size bytes
+ *     line size     = $line_size bytes
+ *     associativity = $assoc
+ *
+ *  This file should be #include-d into vg_cachesim.c
+ */
+
+static char ${L}_desc_line[] = 
+    "desc: ${L} cache:         $size B, $line_size B, $assoc\\n";
+
+static UInt ${L}_tags[$n_sets][$n_ways];
+
+static void cachesim_${L}_initcache(void)
+{
+   UInt set, way;
+   for (set = 0; set < $n_sets; set++)
+      for (way = 0; way < $n_ways; way++)
+         ${L}_tags[set][way] = 0;
+}
+
+static __inline__ 
+void cachesim_${L}_doref($L_args)
+{
+   register UInt set1 = ( a         >> $n_line_bits) & ($n_sets-1);
+   register UInt set2 = ((a + size) >> $n_line_bits) & ($n_sets-1);
+   register UInt tag  = a >> ($n_line_bits + $n_set_bits);
+
+   if (set1 == set2) {
+
+END
+;
+#----
+    my $doref_middle = <<END
+
+   } else if ((set1 + 1) % $n_sets == set2) {
+
+      Bool is_${L}_miss = False;
+
+END
+;
+#----
+    my $L1_miss_treatment = "(*m1)++;\n" .
+                   "         cachesim_L2_doref(a, size, m2);";
+    my $L2_miss_treatment = "(*m2)++;";
+    my $L_miss_treatment = 
+            ( $L ne "L2" ? $L1_miss_treatment : $L2_miss_treatment );
+
+    my $straddle_case_miss_treatment = <<END
+      /* Miss treatment */
+      if (is_${L}_miss) {
+         $L_miss_treatment
+      }
+END
+;
+#----
+    my $doref_end = <<END
+
+   } else {
+      VG_(printf)("\\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\\n", a, size);
+      VG_(printf)("sets %d and %d.\\n", set1, set2);
+      VG_(panic)("$L cache set mismatch");
+   }
+}
+END
+;
+#----
+    my $outfile = "vg_cachesim_${L}.c";
+    open(OUTFILE, "> $outfile") or die("Couldn't open $outfile for writing\n");
+
+    print(OUTFILE $comments_cache_init_desc_and_doref_start);
+    print(OUTFILE trySet($n_ways, 1, $L, "return;", $L_miss_treatment));
+    print(OUTFILE $doref_middle);
+    print(OUTFILE "      /* Block one */\n");
+    print(OUTFILE trySet($n_ways, 1, $L, "", "is_${L}_miss = True;"));
+    print(OUTFILE "\n");
+    print(OUTFILE "      /* Block two */\n");
+    print(OUTFILE trySet($n_ways, 2, $L, "", "is_${L}_miss = True;"));
+    print(OUTFILE "\n");
+    print(OUTFILE $straddle_case_miss_treatment);
+    print(OUTFILE $doref_end);
+
+    close(OUTFILE);
+}
+
+#----------------------------------------------------------------------------
+# main()
+#----------------------------------------------------------------------------
+process_cmd_line();
+foreach my $cache (@caches) {
+    print_cache_simulator(@$cache);
+}
+
diff --git a/vg_cachesim.c b/vg_cachesim.c
new file mode 100644
index 0000000000..ea0cb410a2
--- /dev/null
+++ b/vg_cachesim.c
@@ -0,0 +1,1068 @@
+/*--------------------------------------------------------------------*/
+/*--- The cache simulation framework: instrumentation, recording   ---*/
+/*--- and results printing.                                        ---*/
+/*---                                                vg_cachesim.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, an x86 protected-mode emulator 
+   designed for debugging and profiling binaries on x86-Unixes.
+
+   Copyright (C) 2000-2002 Julian Seward 
+      jseward@acm.org
+      Julian_Seward@muraroa.demon.co.uk
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file LICENSE.
+*/
+
+#include <string.h>
+
+#include "vg_include.h"
+
+#include "vg_cachesim_L2.c"
+#include "vg_cachesim_I1.c"
+#include "vg_cachesim_D1.c"
+
+
+/* According to IA-32 Intel Architecture Software Developer's Manual: Vol 2 */
+#define MAX_x86_INSTR_SIZE  16
+
+/* Size of various buffers used for storing strings */
+#define FILENAME_LEN                      256
+#define FN_NAME_LEN                       256
+#define BUF_LEN                           512
+#define COMMIFY_BUF_LEN                   128
+#define RESULTS_BUF                       128
+
+/*------------------------------------------------------------*/
+/*--- Output file related stuff                            ---*/
+/*------------------------------------------------------------*/
+
+#define OUT_FILE        "cachegrind.out"
+
+static void file_err()
+{
+   VG_(message)(Vg_UserMsg,
+                "FATAL: can't open cache simulation output file `%s'",
+                OUT_FILE );
+   VG_(exit)(1);
+}
+
+/*------------------------------------------------------------*/
+/*--- Cost center types, operations                        ---*/
+/*------------------------------------------------------------*/
+
+typedef struct _CC CC;
+struct _CC {
+   ULong a;
+   ULong m1;
+   ULong m2;
+};
+
+static __inline__ void initCC(CC* cc) {
+    cc->a  = 0;
+    cc->m1 = 0;
+    cc->m2 = 0;
+}
+
+
+typedef enum { INSTR_CC, READ_CC, WRITE_CC, MOD_CC } CC_type;
+
+/* Instruction-level cost-centres.  The typedefs for these structs are in
+ * vg_include.c 
+ *
+ * WARNING:  the 'tag' field *must* be the first byte of both CC types.
+ *           the 'instr_addr' *must* be the second word of both CC types.
+ *
+ * This is because we use them when we don't know what type of CC we're dealing
+ * with.
+ */ 
+struct _iCC {
+   /* word 1 */
+   UChar tag;
+   UChar instr_size;
+
+   /* words 2+ */
+   Addr instr_addr;
+   CC I;
+};
+
+struct _idCC {
+   /* word 1 */
+   UChar tag;
+   UChar instr_size;
+   UChar data_size;
+
+   /* words 2+ */
+   Addr instr_addr;
+   CC I;
+   CC D;
+};
+
+static void init_iCC(iCC* cc, Addr instr_addr, UInt instr_size)
+{
+   cc->tag        = INSTR_CC;
+   cc->instr_size = instr_size;
+   cc->instr_addr = instr_addr;
+   initCC(&cc->I);
+}
+
+static void init_idCC(CC_type X_CC, idCC* cc, Addr instr_addr,
+                      UInt instr_size, UInt data_size)
+{
+   cc->tag        = X_CC;
+   cc->instr_size = instr_size;
+   cc->data_size  = data_size;
+   cc->instr_addr = instr_addr;
+   initCC(&cc->I);
+   initCC(&cc->D);
+}
+
+static __inline__ void sprint_iCC(Char buf[BUF_LEN], UInt ln, iCC* cc)
+{
+   VG_(sprintf)(buf, "%u %llu %llu %llu\n",
+                      ln, cc->I.a, cc->I.m1, cc->I.m2/*, cc->instr_addr*/);
+}
+
+static __inline__ void sprint_read_or_mod_CC(Char buf[BUF_LEN], UInt ln, 
+                                             idCC* cc)
+{
+   VG_(sprintf)(buf, "%u %llu %llu %llu %llu %llu %llu\n",
+                      ln, cc->I.a, cc->I.m1, cc->I.m2, 
+                          cc->D.a, cc->D.m1, cc->D.m2/*, cc->instr_addr*/);
+}
+
+static __inline__ void sprint_write_CC(Char buf[BUF_LEN], UInt ln, idCC* cc)
+{
+   VG_(sprintf)(buf, "%u %llu %llu %llu . . . %llu %llu %llu\n",
+                      ln, cc->I.a, cc->I.m1, cc->I.m2, 
+                          cc->D.a, cc->D.m1, cc->D.m2/*, cc->instr_addr*/);
+}
+
+/*------------------------------------------------------------*/
+/*--- BBCC hash table stuff                                ---*/
+/*------------------------------------------------------------*/
+
+/* The table of BBCCs is of the form hash(filename, hash(fn_name,
+ * hash(BBCCs))).  Each hash table is separately chained.  The sizes below work
+ * fairly well for Konqueror. */
+
+#define N_FILE_ENTRIES        251
+#define   N_FN_ENTRIES         53
+#define N_BBCC_ENTRIES         37
+
+/* The cost centres for a basic block are stored in a contiguous array.
+ * They are distinguishable by their tag field. */
+typedef struct _BBCC BBCC;
+struct _BBCC {
+   Addr  orig_addr;
+   UInt  array_size;    /* byte-size of variable length array */
+   BBCC* next;
+   Addr  array[0];      /* variable length array */
+};
+
+typedef struct _fn_node fn_node;
+struct _fn_node {
+   Char*    fn_name;
+   BBCC*    BBCCs[N_BBCC_ENTRIES];
+   fn_node* next;
+};
+
+typedef struct _file_node file_node;
+struct _file_node {
+   Char*      filename;
+   fn_node*   fns[N_FN_ENTRIES];
+   file_node* next;
+};
+
+/* BBCC_table structure:  list(filename, list(fn_name, list(BBCC))) */
+file_node *BBCC_table[N_FILE_ENTRIES];
+
+Int  distinct_files      = 0;
+Int  distinct_fns        = 0;
+
+Int  distinct_instrs     = 0;
+Int  full_debug_BBs      = 0;
+Int  file_line_debug_BBs = 0;
+Int  fn_name_debug_BBs   = 0;
+Int  no_debug_BBs        = 0;
+
+Int  BB_retranslations   = 0;
+
+static void init_BBCC_table()
+{
+   Int i;
+   for (i = 0; i < N_FILE_ENTRIES; i++)
+      BBCC_table[i] = NULL;
+}
+
+static void get_file_fn_names(Addr instr_addr, Char filename[FILENAME_LEN],
+                       Char fn_name[FN_NAME_LEN])
+{
+   UInt dummy_line_num;
+   Bool found1, found2, no_demangle = False;
+
+   found1 = VG_(what_line_is_this)(instr_addr, filename,
+                                   FILENAME_LEN, &dummy_line_num);
+   found2 = VG_(what_fn_is_this)(no_demangle, instr_addr, fn_name, FN_NAME_LEN);
+
+   if (!found1 && !found2) {
+      no_debug_BBs++;
+      VG_(strcpy)(filename, "???");
+      VG_(strcpy)(fn_name,  "???");
+
+   } else if ( found1 &&  found2) {
+      full_debug_BBs++;
+
+   } else if ( found1 && !found2) {
+      file_line_debug_BBs++;
+      VG_(strcpy)(fn_name,  "???");
+
+   } else  /*(!found1 &&  found2)*/ {
+      fn_name_debug_BBs++;
+      VG_(strcpy)(filename, "???");
+   }
+}
+
+/* Forward declaration. */
+static Int compute_BBCC_array_size(UCodeBlock* cb);
+
+static __inline__ 
+file_node* new_file_node(Char filename[FILENAME_LEN], file_node* next)
+{
+   Int i;
+   file_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(file_node));
+   new->filename  = VG_(strdup)(VG_AR_PRIVATE, filename);
+   for (i = 0; i < N_FN_ENTRIES; i++) {
+      new->fns[i] = NULL;
+   }
+   new->next      = next;
+   return new;
+}
+
+static __inline__ 
+fn_node* new_fn_node(Char fn_name[FILENAME_LEN], fn_node* next)
+{
+   Int i;
+   fn_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(fn_node));
+   new->fn_name = VG_(strdup)(VG_AR_PRIVATE, fn_name);
+   for (i = 0; i < N_BBCC_ENTRIES; i++) {
+      new->BBCCs[i] = NULL;
+   }
+   new->next    = next;
+   return new;
+}
+
+static __inline__ 
+BBCC* new_BBCC(Addr bb_orig_addr, UCodeBlock* cb, BBCC* next)
+{
+   Int BBCC_array_size = compute_BBCC_array_size(cb);
+   BBCC* new;
+
+   new = (BBCC*)VG_(malloc)(VG_AR_PRIVATE, sizeof(BBCC) + BBCC_array_size);
+   new->orig_addr  = bb_orig_addr;
+   new->array_size = BBCC_array_size;
+   new->next = next;
+
+   return new;
+}
+
+#define HASH_CONSTANT   256
+
+static UInt hash(Char *s, UInt table_size)
+{
+    int hash_value = 0;
+    for ( ; *s; s++)
+        hash_value = (HASH_CONSTANT * hash_value + *s) % table_size;
+    return hash_value;
+}
+
+/* Do a three step traversal: by filename, then fn_name, then instr_addr.
+ * In all cases prepends new nodes to their chain.  Returns a pointer to the
+ * cost centre.  Also sets BB_seen_before by reference. 
+ */ 
+static __inline__ BBCC* get_BBCC(Addr bb_orig_addr, UCodeBlock* cb, 
+                                 Bool *BB_seen_before)
+{
+   file_node *curr_file_node;
+   fn_node   *curr_fn_node;
+   BBCC      *curr_BBCC;
+   Char       filename[FILENAME_LEN], fn_name[FN_NAME_LEN];
+   UInt       filename_hash, fnname_hash, BBCC_hash;
+
+   get_file_fn_names(bb_orig_addr, filename, fn_name);
+
+   VGP_PUSHCC(VgpCacheGetBBCC);
+   filename_hash = hash(filename, N_FILE_ENTRIES);
+   curr_file_node = BBCC_table[filename_hash];
+   while (NULL != curr_file_node && 
+          strcmp(filename, curr_file_node->filename) != 0) {
+      curr_file_node = curr_file_node->next;
+   }
+   if (NULL == curr_file_node) {
+      BBCC_table[filename_hash] = curr_file_node = 
+         new_file_node(filename, BBCC_table[filename_hash]);
+      distinct_files++;
+   }
+
+   fnname_hash = hash(fn_name, N_FN_ENTRIES);
+   curr_fn_node = curr_file_node->fns[fnname_hash];
+   while (NULL != curr_fn_node && 
+          strcmp(fn_name, curr_fn_node->fn_name) != 0) {
+      curr_fn_node = curr_fn_node->next;
+   }
+   if (NULL == curr_fn_node) {
+      curr_file_node->fns[fnname_hash] = curr_fn_node = 
+         new_fn_node(fn_name, curr_file_node->fns[fnname_hash]);
+      distinct_fns++;
+   }
+
+   BBCC_hash = bb_orig_addr % N_BBCC_ENTRIES;
+   curr_BBCC = curr_fn_node->BBCCs[BBCC_hash];
+   while (NULL != curr_BBCC && bb_orig_addr != curr_BBCC->orig_addr) {
+      curr_BBCC = curr_BBCC->next;
+   }
+   if (curr_BBCC == NULL) {
+      curr_fn_node->BBCCs[BBCC_hash] = curr_BBCC = 
+         new_BBCC(bb_orig_addr, cb, curr_fn_node->BBCCs[BBCC_hash]);
+      *BB_seen_before = False;
+
+   } else {
+      vg_assert(bb_orig_addr == curr_BBCC->orig_addr);
+      vg_assert(curr_BBCC->array_size > 0 && curr_BBCC->array_size < 1000000);
+      if (VG_(clo_verbosity) > 1) {
+          VG_(message)(Vg_DebugMsg, "BB retranslation, retrieving from BBCC table");
+      }
+      *BB_seen_before = True;
+      BB_retranslations++;
+   }
+   VGP_POPCC;
+   return curr_BBCC;
+}
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation instrumentation phase               ---*/
+/*------------------------------------------------------------*/
+
+#define uInstr1   VG_(newUInstr1)
+#define uInstr2   VG_(newUInstr2)
+#define uInstr3   VG_(newUInstr3)
+#define dis       VG_(disassemble)
+#define uLiteral  VG_(setLiteralField)
+#define newTemp   VG_(getNewTemp)
+
+static Int compute_BBCC_array_size(UCodeBlock* cb)
+{
+   UInstr* u_in;
+   Int     i, CC_size, BBCC_size = 0;
+   Bool    is_LOAD, is_STORE, is_FPU_R, is_FPU_W;
+    
+   is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
+
+   for (i = 0; i < cb->used; i++) {
+      //VG_(ppUInstr)(0, &cb->instrs[i]);
+
+      u_in = &cb->instrs[i];
+      switch(u_in->opcode) {
+
+         case INCEIP: 
+            goto case_for_end_of_instr;
+         
+         case JMP:
+            if (u_in->cond != CondAlways) break;
+
+            goto case_for_end_of_instr;
+
+            case_for_end_of_instr:
+
+            CC_size = (is_LOAD || is_STORE || is_FPU_R || is_FPU_W 
+                      ? sizeof(idCC) : sizeof(iCC));
+
+            BBCC_size += CC_size;
+            is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
+            break;
+
+         case LOAD:
+            /* Two LDBs are possible for a single instruction */
+            vg_assert(/*!is_LOAD &&*/ !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_LOAD = True;
+            break;
+
+         case STORE:
+            /* Multiple STOREs are possible for 'pushal' */
+            vg_assert(            /*!is_STORE &&*/ !is_FPU_R && !is_FPU_W);
+            is_STORE = True;
+            break;
+
+         case FPU_R:
+            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_FPU_R = True;
+            break;
+
+         case FPU_W:
+            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_FPU_W = True;
+            break;
+
+         default:
+            break;
+      }
+   }
+
+   return BBCC_size;
+}
+
+/* Use this rather than eg. -1 because it's stored as a UInt. */
+#define INVALID_DATA_SIZE   999999
+
+UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr)
+{
+   UCodeBlock* cb;
+   Int         i;
+   UInstr*     u_in;
+   BBCC*       BBCC_node;
+   Int         t_CC_addr, t_read_addr, t_write_addr, t_data_addr;
+   Int         CC_size = -1;    /* Shut gcc warnings up */
+   Addr        instr_addr = orig_addr;
+   UInt        instr_size, data_size = INVALID_DATA_SIZE;
+   Int         helper = -1;     /* Shut gcc warnings up */
+   UInt        stack_used;
+   Bool        BB_seen_before       = False;
+   Bool        prev_instr_was_Jcond = False;
+   Addr        BBCC_ptr0, BBCC_ptr; 
+
+   /* Get BBCC (creating if necessary -- requires a counting pass over the BB
+    * if it's the first time it's been seen), and point to start of the 
+    * BBCC array.  */
+   BBCC_node = get_BBCC(orig_addr, cb_in, &BB_seen_before);
+   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
+
+   cb = VG_(allocCodeBlock)();
+   cb->nextTemp = cb_in->nextTemp;
+
+   t_CC_addr = t_read_addr = t_write_addr = t_data_addr = INVALID_TEMPREG;
+
+   for (i = 0; i < cb_in->used; i++) {
+      u_in = &cb_in->instrs[i];
+
+      //VG_(ppUInstr)(0, u_in);
+
+      /* What this is all about:  we want to instrument each x86 instruction 
+       * translation.  The end of these are marked in three ways.  The three
+       * ways, and the way we instrument them, are as follows:
+       *
+       * 1. UCode, INCEIP         --> UCode, Instrumentation, INCEIP
+       * 2. UCode, Juncond        --> UCode, Instrumentation, Juncond
+       * 3. UCode, Jcond, Juncond --> UCode, Instrumentation, Jcond, Juncond
+       *
+       * We must put the instrumentation before the jumps so that it is always
+       * executed.  We don't have to put the instrumentation before the INCEIP
+       * (it could go after) but we do so for consistency.
+       *
+       * Junconds are always the last instruction in a basic block.  Jconds are
+       * always the 2nd last, and must be followed by a Jcond.  We check this
+       * with various assertions.
+       *
+       * Note that in VG_(disBB) we patched the `extra4b' field of the first
+       * occurring JMP in a block with the size of its x86 instruction.  This
+       * is used now.
+       *
+       * Note that we don't have to treat JIFZ specially;  unlike JMPs, JIFZ
+       * occurs in the middle of a BB and gets an INCEIP after it.
+       *
+       * The instrumentation is just a call to the appropriate helper function,
+       * passing it the address of the instruction's CC.
+       */
+      if (prev_instr_was_Jcond) vg_assert(u_in->opcode == JMP);
+
+      switch (u_in->opcode) {
+
+         case INCEIP:
+            instr_size = u_in->val1;
+            goto case_for_end_of_x86_instr;
+
+         case JMP:
+            if (u_in->cond == CondAlways) {
+               vg_assert(i+1 == cb_in->used); 
+
+               /* Don't instrument if previous instr was a Jcond. */
+               if (prev_instr_was_Jcond) {
+                  vg_assert(0 == u_in->extra4b);
+                  VG_(copyUInstr)(cb, u_in);
+                  break;
+               }
+               prev_instr_was_Jcond = False;
+
+            } else {
+               vg_assert(i+2 == cb_in->used);  /* 2nd last instr in block */
+               prev_instr_was_Jcond = True;
+            }
+
+            /* Ah, the first JMP... instrument, please. */
+            instr_size = u_in->extra4b;
+            goto case_for_end_of_x86_instr;
+
+            /* Shared code that is executed at the end of an x86 translation
+             * block, marked by either an INCEIP or an unconditional JMP. */
+            case_for_end_of_x86_instr:
+
+#define IS_(X)      (INVALID_TEMPREG != t_##X##_addr)
+             
+            /* Initialise the CC in the BBCC array appropriately if it hasn't
+             * been initialised before.
+             * Then call appropriate sim function, passing it the CC address.
+             * Note that CALLM_S/CALL_E aren't required here;  by this point,
+             * the checking related to them has already happened. */
+            stack_used = 0;
+
+            vg_assert(instr_size >= 1 && instr_size <= MAX_x86_INSTR_SIZE);
+            vg_assert(0 != instr_addr);
+
+            /* Save the caller-save registers before we push our args */
+            uInstr1(cb, PUSH, 4, RealReg, R_EAX);
+            uInstr1(cb, PUSH, 4, RealReg, R_ECX);
+            uInstr1(cb, PUSH, 4, RealReg, R_EDX);
+
+            if (!IS_(read) && !IS_(write)) {
+               iCC* CC_ptr = (iCC*)(BBCC_ptr);
+               vg_assert(INVALID_DATA_SIZE == data_size);
+               vg_assert(INVALID_TEMPREG == t_read_addr && 
+                         INVALID_TEMPREG == t_write_addr);
+               CC_size = sizeof(iCC);
+               if (!BB_seen_before)
+                   init_iCC(CC_ptr, instr_addr, instr_size);
+
+               helper = VGOFF_(cachesim_log_non_mem_instr);
+
+            } else { 
+               CC_type X_CC;
+               idCC* CC_ptr = (idCC*)(BBCC_ptr);
+                
+               vg_assert(4 == data_size || 2  == data_size || 1 == data_size || 
+                         8 == data_size || 10 == data_size);
+               
+               CC_size = sizeof(idCC);
+               helper = VGOFF_(cachesim_log_mem_instr);
+
+               if (IS_(read) && !IS_(write)) {
+                  X_CC = READ_CC;
+                  vg_assert(INVALID_TEMPREG != t_read_addr && 
+                            INVALID_TEMPREG == t_write_addr);
+                  t_data_addr = t_read_addr;
+
+               } else if (!IS_(read) && IS_(write)) {
+                  X_CC = WRITE_CC;
+                  vg_assert(INVALID_TEMPREG == t_read_addr && 
+                            INVALID_TEMPREG != t_write_addr);
+                  t_data_addr = t_write_addr;
+
+               } else {
+                  vg_assert(IS_(read) && IS_(write));
+                  X_CC = MOD_CC;
+                  vg_assert(INVALID_TEMPREG != t_read_addr && 
+                            INVALID_TEMPREG != t_write_addr);
+                  t_data_addr = t_read_addr;
+               }
+
+               if (!BB_seen_before)
+                  init_idCC(X_CC, CC_ptr, instr_addr, instr_size, data_size);
+
+               /* 2nd arg: data addr */
+               uInstr1(cb, PUSH,  4, TempReg, t_data_addr);
+               stack_used += 4;
+            }
+#undef IS_
+
+            /* 1st arg: CC addr */
+            t_CC_addr = newTemp(cb);
+            uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
+            uLiteral(cb, BBCC_ptr);
+            uInstr1(cb, PUSH,  4, TempReg, t_CC_addr);
+            stack_used += 4;
+
+            /* Call function and return. */
+            uInstr1(cb, CALLM, 0, Lit16,   helper);
+            uInstr1(cb, CLEAR, 0, Lit16,   stack_used);
+
+            /* Restore the caller-save registers now the call is done */
+            uInstr1(cb, POP, 4, RealReg, R_EDX);
+            uInstr1(cb, POP, 4, RealReg, R_ECX);
+            uInstr1(cb, POP, 4, RealReg, R_EAX);
+
+            VG_(copyUInstr)(cb, u_in);
+
+            /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */
+            BBCC_ptr   += CC_size; 
+            instr_addr += instr_size;
+            t_CC_addr = t_read_addr = t_write_addr = 
+                                      t_data_addr  = INVALID_TEMPREG;
+            data_size = INVALID_DATA_SIZE;
+            break;
+
+
+         /* For memory-ref instrs, copy the data_addr into a temporary to be
+          * passed to the cachesim_log_function at the end of the instruction.
+          */
+         case LOAD: 
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val1,  TempReg, t_read_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         case FPU_R:
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val2,  TempReg, t_read_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         /* Note that we must set t_write_addr even for mod instructions;
+          * that's how the code above determines whether it does a write;
+          * without it, it would think a mod instruction is a read.
+          * As for the MOV, if it's a mod instruction it's redundant, but it's
+          * not expensive and mod instructions are rare anyway. */
+         case STORE:
+         case FPU_W:
+            t_write_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_write_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         case NOP:  case CALLM_E:  case CALLM_S:
+            break;
+
+         default:
+            VG_(copyUInstr)(cb, u_in);
+            break;
+      }
+   }
+
+   /* Just check everything looks ok */
+   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
+
+   VG_(freeCodeBlock)(cb_in);
+   return cb;
+}
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation stuff                               ---*/
+/*------------------------------------------------------------*/
+
+/* Total reads/writes/misses.  Calculated during CC traversal at the end. */
+static CC Ir_total;
+static CC Dr_total;
+static CC Dw_total;
+
+void VG_(init_cachesim)(void)
+{
+   /* Make sure the output file can be written. */
+   Int fd = VG_(open_write)(OUT_FILE);
+   if (-1 == fd) { 
+      fd = VG_(create_and_write)(OUT_FILE);
+      if (-1 == fd) {
+         file_err(); 
+      }
+   }
+   VG_(close)(fd);
+    
+   initCC(&Ir_total);
+   initCC(&Dr_total);
+   initCC(&Dw_total);
+   
+   cachesim_I1_initcache();
+   cachesim_D1_initcache();
+   cachesim_L2_initcache();
+
+   init_BBCC_table();
+}
+
+void VG_(cachesim_log_non_mem_instr)(iCC* cc)
+{
+   //VG_(printf)("sim  I: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
+   cc->I.a++;
+   VGP_POPCC;
+}
+
+void VG_(cachesim_log_mem_instr)(idCC* cc, Addr data_addr)
+{
+   //VG_(printf)("sim  D: CCaddr=0x%x, iaddr=0x%x, isize=%u, daddr=0x%x, dsize=%u\n",
+   //            cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
+   cc->I.a++;
+
+   cachesim_D1_doref(data_addr,      cc->data_size,  &cc->D.m1, &cc->D.m2);
+   cc->D.a++;
+   VGP_POPCC;
+}
+
+/*------------------------------------------------------------*/
+/*--- Printing of output file and summary stats            ---*/
+/*------------------------------------------------------------*/
+
+int get_line_num(Addr instr_addr) 
+{
+   Char filename[FILENAME_LEN] = "???";
+   UInt line_num;
+   Bool found;
+
+   found = VG_(what_line_is_this)(instr_addr, filename,
+                                  FILENAME_LEN, &line_num);
+   if (!found) {
+      line_num = 0; 
+   }
+   return line_num;
+}
+
+static void fprint_BBCC(Int fd, BBCC* BBCC_node, Char *first_instr_fl, 
+                                                 Char *first_instr_fn)
+{
+   Addr BBCC_ptr0, BBCC_ptr;
+   Char buf[BUF_LEN], curr_file[BUF_LEN], fbuf[BUF_LEN+4];
+   UInt line_num;
+
+   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
+
+   VG_(write)(fd, (void*)"\n", 1);
+
+   VG_(strcpy)(curr_file, first_instr_fl);
+   
+   while (BBCC_ptr - BBCC_ptr0 < BBCC_node->array_size) {
+
+      /* We pretend the CC is an iCC for getting the tag.  This is ok
+       * because both CC types have tag as their first byte.  Once we know
+       * the type, we can cast and act appropriately. */
+
+      Char fl_buf[FILENAME_LEN];
+      Char fn_buf[FN_NAME_LEN];
+
+      /* Assumes instr_addr position is same for both CCs. */
+      Addr instr_addr = ((iCC*)BBCC_ptr)->instr_addr;
+      get_file_fn_names(instr_addr, fl_buf, fn_buf);
+
+      /* Allow for filename switching in the middle of a BB;  if this happens,
+       * must print the new filename with the function name. */
+      if (0 != strcmp(fl_buf, curr_file)) {
+         VG_(strcpy)(curr_file, fl_buf);
+         VG_(sprintf)(fbuf, "fi=%s\n", curr_file);
+         VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
+      }
+
+      switch ( ((iCC*)BBCC_ptr)->tag ) {
+
+#define ADD_CC_TO(CC_type, cc, total)           \
+   total.a  += ((CC_type*)BBCC_ptr)->cc.a;      \
+   total.m1 += ((CC_type*)BBCC_ptr)->cc.m1;     \
+   total.m2 += ((CC_type*)BBCC_ptr)->cc.m2;
+          
+         case INSTR_CC:
+            line_num = get_line_num(((iCC*)BBCC_ptr)->instr_addr);
+            sprint_iCC(buf, line_num, (iCC*)BBCC_ptr);
+            ADD_CC_TO(iCC, I, Ir_total);
+            BBCC_ptr += sizeof(iCC);
+            break;
+
+         case READ_CC:
+         case  MOD_CC:
+            line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr);
+            sprint_read_or_mod_CC(buf, line_num, (idCC*)BBCC_ptr);
+            ADD_CC_TO(idCC, I, Ir_total);
+            ADD_CC_TO(idCC, D, Dr_total);
+            BBCC_ptr += sizeof(idCC);
+            break;
+
+         case WRITE_CC:
+            line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr);
+            sprint_write_CC(buf, line_num, (idCC*)BBCC_ptr);
+            ADD_CC_TO(idCC, I, Ir_total);
+            ADD_CC_TO(idCC, D, Dw_total);
+            BBCC_ptr += sizeof(idCC);
+            break;
+
+#undef ADD_CC_TO
+
+         default:
+            VG_(panic)("Unknown CC type in fprint_BBCC()\n");
+            break;
+      }
+      distinct_instrs++;
+      
+      /* If the function name for this instruction doesn't match that of the
+       * first instruction in the BB, print out a warning. */
+      if (VG_(clo_trace_symtab) && 0 != strcmp(fn_buf, first_instr_fn)) {
+         VG_(printf)("Mismatched function names\n");
+         VG_(printf)("  filenames: BB:%s, instr:%s;  "
+                     "fn_names:  BB:%s, instr:%s;  "
+                     "line: %d\n", 
+                     first_instr_fl, fl_buf, 
+                     first_instr_fn, fn_buf, 
+                     line_num);
+      }
+
+      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   }
+   /* If we switched filenames in the middle of the BB without switching back,
+    * switch back now because the subsequent BB may be relying on falling under
+    * the original file name. */
+   if (0 != VG_(strcmp)(first_instr_fl, curr_file)) {
+      VG_(sprintf)(fbuf, "fe=%s\n", first_instr_fl);
+      VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
+   }
+   //VG_(write)(fd, (void*)"#}\n", 3);
+
+   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
+}
+
+static void fprint_BBCC_table_and_calc_totals(Int client_argc, 
+                                              Char** client_argv)
+{
+   Int        fd;
+   Char       buf[BUF_LEN];
+   file_node *curr_file_node;
+   fn_node   *curr_fn_node;
+   BBCC      *curr_BBCC;
+   Int        i,j,k;
+
+   VGP_PUSHCC(VgpCacheDump);
+   fd = VG_(open_write)(OUT_FILE);
+   if (-1 == fd) { file_err(); }
+
+   /* "desc:" lines (giving I1/D1/L2 cache configuration) */
+   VG_(write)(fd, (void*)I1_desc_line, VG_(strlen)(I1_desc_line));
+   VG_(write)(fd, (void*)D1_desc_line, VG_(strlen)(D1_desc_line));
+   VG_(write)(fd, (void*)L2_desc_line, VG_(strlen)(L2_desc_line));
+
+   /* "cmd:" line */
+   VG_(strcpy)(buf, "cmd:");
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   for (i = 0; i < client_argc; i++) {
+       VG_(sprintf)(buf, " %s", client_argv[i]);
+       VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   }
+   /* "events:" line */
+   VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw\n");
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+   /* Six loops here:  three for the hash table arrays, and three for the
+    * chains hanging off the hash table arrays. */
+   for (i = 0; i < N_FILE_ENTRIES; i++) {
+      curr_file_node = BBCC_table[i];
+      while (curr_file_node != NULL) {
+         VG_(sprintf)(buf, "fl=%s\n", curr_file_node->filename);
+         VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+         for (j = 0; j < N_FN_ENTRIES; j++) {
+            curr_fn_node = curr_file_node->fns[j];
+            while (curr_fn_node != NULL) {
+               VG_(sprintf)(buf, "fn=%s\n", curr_fn_node->fn_name);
+               VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+               for (k = 0; k < N_BBCC_ENTRIES; k++) {
+                  curr_BBCC = curr_fn_node->BBCCs[k];
+                  while (curr_BBCC != NULL) {
+                     fprint_BBCC(fd, curr_BBCC, 
+                             
+                             curr_file_node->filename,
+                             curr_fn_node->fn_name);
+
+                     curr_BBCC = curr_BBCC->next;
+                  }
+               }
+               curr_fn_node = curr_fn_node->next;
+            }
+         }
+         curr_file_node = curr_file_node->next;
+      }
+   }
+
+   /* Summary stats must come after rest of table, since we calculate them
+    * during traversal.  */ 
+   VG_(sprintf)(buf, "summary: "
+                     "%llu %llu %llu "
+                     "%llu %llu %llu "
+                     "%llu %llu %llu\n", 
+                     Ir_total.a, Ir_total.m1, Ir_total.m2,
+                     Dr_total.a, Dr_total.m1, Dr_total.m2,
+                     Dw_total.a, Dw_total.m1, Dw_total.m2);
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   VG_(close)(fd);
+}
+
+/* Adds commas to ULong, right justifying in a field field_width wide, returns
+ * the string in buf. */
+Int commify(ULong n, int field_width, char buf[COMMIFY_BUF_LEN])
+{
+   int len, n_commas, i, j, new_len, space;
+
+   VG_(sprintf)(buf, "%lu", n);
+   len = VG_(strlen)(buf);
+   n_commas = (len - 1) / 3;
+   new_len = len + n_commas;
+   space = field_width - new_len;
+
+   /* Allow for printing a number in a field_width smaller than it's size */
+   if (space < 0) space = 0;    
+
+   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
+    * of three. */
+   for (j = -1, i = len ; i >= 0; i--) {
+      buf[i + n_commas + space] = buf[i];
+
+      if (3 == ++j) {
+         j = 0;
+         n_commas--;
+         buf[i + n_commas + space] = ',';
+      }
+   }
+   /* Right justify in field. */
+   for (i = 0; i < space; i++)  buf[i] = ' ';
+   return new_len;
+}
+
+void percentify(Int n, Int pow, Int field_width, char buf[]) 
+{
+   int i, len, space;
+    
+   VG_(sprintf)(buf, "%d.%d%%", n / pow, n % pow);
+   len = VG_(strlen)(buf);
+   space = field_width - len;
+   i = len;
+
+   /* Right justify in field */
+   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
+   for (i = 0; i < space; i++)  buf[i] = ' ';
+}
+
+void VG_(show_cachesim_results)(Int client_argc, Char** client_argv)
+{
+   CC D_total;
+   ULong L2_total_m, L2_total_mr, L2_total_mw; 
+   char buf1[RESULTS_BUF], 
+        buf2[RESULTS_BUF], 
+        buf3[RESULTS_BUF];
+   Int l1, l2, l3;
+   Int p;
+
+   fprint_BBCC_table_and_calc_totals(client_argc, client_argv);
+
+   /* I cache results.  Use the I_refs value to determine the first column
+    * width. */
+   l1 = commify(Ir_total.a, 0, buf1);
+   VG_(message)(Vg_UserMsg, "I   refs:      %s", buf1);
+
+   commify(Ir_total.m1, l1, buf1);
+   VG_(message)(Vg_UserMsg, "I1  misses:    %s", buf1);
+
+   commify(Ir_total.m2, l1, buf1);
+   VG_(message)(Vg_UserMsg, "L2  misses:    %s", buf1);
+
+   p = 100;
+
+   percentify(Ir_total.m1 * 100 * p / Ir_total.a, p, l1+1, buf1);
+   VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);
+                
+   percentify(Ir_total.m2 * 100 * p / Ir_total.a, p, l1+1, buf1);
+   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
+   VG_(message)(Vg_UserMsg, "");
+
+   /* D cache results.  Use the D_refs.rd and D_refs.wr values to determine the
+    * width of columns 2 & 3. */
+   D_total.a  = Dr_total.a  + Dw_total.a;
+   D_total.m1 = Dr_total.m1 + Dw_total.m1;
+   D_total.m2 = Dr_total.m2 + Dw_total.m2;
+       
+        commify( D_total.a, 0, buf1);
+   l2 = commify(Dr_total.a, 0, buf2);
+   l3 = commify(Dw_total.a, 0, buf3);
+   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)",
+                buf1,  buf2,  buf3);
+
+   commify( D_total.m1, l1, buf1);
+   commify(Dr_total.m1, l2, buf2);
+   commify(Dw_total.m1, l3, buf3);
+   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   commify( D_total.m2, l1, buf1);
+   commify(Dr_total.m2, l2, buf2);
+   commify(Dw_total.m2, l3, buf3);
+   VG_(message)(Vg_UserMsg, "L2  misses:    %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   p = 10;
+   
+   percentify( D_total.m1 * 100 * p / D_total.a,  p, l1+1, buf1);
+   percentify(Dr_total.m1 * 100 * p / Dr_total.a, p, l2+1, buf2);
+   percentify(Dw_total.m1 * 100 * p / Dw_total.a, p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
+
+   percentify( D_total.m2 * 100 * p / D_total.a,  p, l1+1, buf1);
+   percentify(Dr_total.m2 * 100 * p / Dr_total.a, p, l2+1, buf2);
+   percentify(Dw_total.m2 * 100 * p / Dw_total.a, p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
+   VG_(message)(Vg_UserMsg, "");
+
+   /* L2 overall results */
+   L2_total_m  = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
+   L2_total_mr = Dr_total.m2 + Ir_total.m2;
+   L2_total_mw = Dw_total.m2;
+
+   commify(L2_total_m,  l1, buf1);
+   commify(L2_total_mr, l2, buf2);
+   commify(L2_total_mw, l3, buf3);
+   VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   percentify(L2_total_m  * 100 * p / (Ir_total.a + D_total.a),  p, l1+1, buf1);
+   percentify(L2_total_mr * 100 * p / (Ir_total.a + Dr_total.a), p, l2+1, buf2);
+   percentify(L2_total_mw * 100 * p / Dw_total.a, p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )", buf1, buf2,buf3);
+            
+
+   /* Hash table stats */
+   if (VG_(clo_verbosity) > 1) {
+       int BB_lookups = full_debug_BBs      + fn_name_debug_BBs +
+                        file_line_debug_BBs + no_debug_BBs;
+      
+       VG_(message)(Vg_DebugMsg, "");
+       VG_(message)(Vg_DebugMsg, "Distinct files:   %d", distinct_files);
+       VG_(message)(Vg_DebugMsg, "Distinct fns:     %d", distinct_fns);
+       VG_(message)(Vg_DebugMsg, "BB lookups:       %d", BB_lookups);
+       VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)", 
+                    full_debug_BBs    * 100 / BB_lookups,
+                    full_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)", 
+                    file_line_debug_BBs * 100 / BB_lookups,
+                    file_line_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)", 
+                    fn_name_debug_BBs * 100 / BB_lookups,
+                    fn_name_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)", 
+                    no_debug_BBs      * 100 / BB_lookups,
+                    no_debug_BBs);
+       VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations);
+       VG_(message)(Vg_DebugMsg, "Distinct instrs:  %d", distinct_instrs);
+   }
+   VGP_POPCC;
+}
+
diff --git a/vg_cachesim_D1.c b/vg_cachesim_D1.c
new file mode 100644
index 0000000000..c2b3d4cc3e
--- /dev/null
+++ b/vg_cachesim_D1.c
@@ -0,0 +1,93 @@
+/*  D1 cache simulator, generated by vg_cachegen.
+ *     total size    = 65536 bytes
+ *     line size     = 64 bytes
+ *     associativity = 2-way associative
+ *
+ *  This file should be #include-d into vg_cachesim.c
+ */
+
+static char D1_desc_line[] = 
+    "desc: D1 cache:         65536 B, 64 B, 2-way associative\n";
+
+static UInt D1_tags[512][2];
+
+static void cachesim_D1_initcache(void)
+{
+   UInt set, way;
+   for (set = 0; set < 512; set++)
+      for (way = 0; way < 2; way++)
+         D1_tags[set][way] = 0;
+}
+
+static __inline__ 
+void cachesim_D1_doref(Addr a, UChar size, ULong* m1, ULong *m2)
+{
+   register UInt set1 = ( a         >> 6) & (512-1);
+   register UInt set2 = ((a + size) >> 6) & (512-1);
+   register UInt tag  = a >> (6 + 9);
+
+   if (set1 == set2) {
+
+      if (tag == D1_tags[set1][0]) {
+         return;
+      }
+      else if (tag == D1_tags[set1][1]) {
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+         return;
+      }
+      else {
+         /* A miss */
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else if ((set1 + 1) % 512 == set2) {
+
+      Bool is_D1_miss = False;
+
+      /* Block one */
+      if (tag == D1_tags[set1][0]) {
+      }
+      else if (tag == D1_tags[set1][1]) {
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+      }
+      else {
+         /* A miss */
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+
+         is_D1_miss = True;
+      }
+
+      /* Block two */
+      if (tag == D1_tags[set2][0]) {
+      }
+      else if (tag == D1_tags[set2][1]) {
+         D1_tags[set2][1] = D1_tags[set2][0];
+         D1_tags[set2][0] = tag;
+      }
+      else {
+         /* A miss */
+         D1_tags[set2][1] = D1_tags[set2][0];
+         D1_tags[set2][0] = tag;
+
+         is_D1_miss = True;
+      }
+
+      /* Miss treatment */
+      if (is_D1_miss) {
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else {
+      VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size);
+      VG_(printf)("sets %d and %d.\n", set1, set2);
+      VG_(panic)("D1 cache set mismatch");
+   }
+}
diff --git a/vg_cachesim_I1.c b/vg_cachesim_I1.c
new file mode 100644
index 0000000000..32b89b51b0
--- /dev/null
+++ b/vg_cachesim_I1.c
@@ -0,0 +1,93 @@
+/*  I1 cache simulator, generated by vg_cachegen.
+ *     total size    = 65536 bytes
+ *     line size     = 64 bytes
+ *     associativity = 2-way associative
+ *
+ *  This file should be #include-d into vg_cachesim.c
+ */
+
+static char I1_desc_line[] = 
+    "desc: I1 cache:         65536 B, 64 B, 2-way associative\n";
+
+static UInt I1_tags[512][2];
+
+static void cachesim_I1_initcache(void)
+{
+   UInt set, way;
+   for (set = 0; set < 512; set++)
+      for (way = 0; way < 2; way++)
+         I1_tags[set][way] = 0;
+}
+
+static __inline__ 
+void cachesim_I1_doref(Addr a, UChar size, ULong* m1, ULong *m2)
+{
+   register UInt set1 = ( a         >> 6) & (512-1);
+   register UInt set2 = ((a + size) >> 6) & (512-1);
+   register UInt tag  = a >> (6 + 9);
+
+   if (set1 == set2) {
+
+      if (tag == I1_tags[set1][0]) {
+         return;
+      }
+      else if (tag == I1_tags[set1][1]) {
+         I1_tags[set1][1] = I1_tags[set1][0];
+         I1_tags[set1][0] = tag;
+         return;
+      }
+      else {
+         /* A miss */
+         I1_tags[set1][1] = I1_tags[set1][0];
+         I1_tags[set1][0] = tag;
+
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else if ((set1 + 1) % 512 == set2) {
+
+      Bool is_I1_miss = False;
+
+      /* Block one */
+      if (tag == I1_tags[set1][0]) {
+      }
+      else if (tag == I1_tags[set1][1]) {
+         I1_tags[set1][1] = I1_tags[set1][0];
+         I1_tags[set1][0] = tag;
+      }
+      else {
+         /* A miss */
+         I1_tags[set1][1] = I1_tags[set1][0];
+         I1_tags[set1][0] = tag;
+
+         is_I1_miss = True;
+      }
+
+      /* Block two */
+      if (tag == I1_tags[set2][0]) {
+      }
+      else if (tag == I1_tags[set2][1]) {
+         I1_tags[set2][1] = I1_tags[set2][0];
+         I1_tags[set2][0] = tag;
+      }
+      else {
+         /* A miss */
+         I1_tags[set2][1] = I1_tags[set2][0];
+         I1_tags[set2][0] = tag;
+
+         is_I1_miss = True;
+      }
+
+      /* Miss treatment */
+      if (is_I1_miss) {
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else {
+      VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size);
+      VG_(printf)("sets %d and %d.\n", set1, set2);
+      VG_(panic)("I1 cache set mismatch");
+   }
+}
diff --git a/vg_cachesim_L2.c b/vg_cachesim_L2.c
new file mode 100644
index 0000000000..bb685326ab
--- /dev/null
+++ b/vg_cachesim_L2.c
@@ -0,0 +1,250 @@
+/*  L2 cache simulator, generated by vg_cachegen.
+ *     total size    = 262144 bytes
+ *     line size     = 64 bytes
+ *     associativity = 8-way associative
+ *
+ *  This file should be #include-d into vg_cachesim.c
+ */
+
+static char L2_desc_line[] = 
+    "desc: L2 cache:         262144 B, 64 B, 8-way associative\n";
+
+static UInt L2_tags[512][8];
+
+static void cachesim_L2_initcache(void)
+{
+   UInt set, way;
+   for (set = 0; set < 512; set++)
+      for (way = 0; way < 8; way++)
+         L2_tags[set][way] = 0;
+}
+
+static __inline__ 
+void cachesim_L2_doref(Addr a, UChar size, ULong *m2)
+{
+   register UInt set1 = ( a         >> 6) & (512-1);
+   register UInt set2 = ((a + size) >> 6) & (512-1);
+   register UInt tag  = a >> (6 + 9);
+
+   if (set1 == set2) {
+
+      if (tag == L2_tags[set1][0]) {
+         return;
+      }
+      else if (tag == L2_tags[set1][1]) {
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][2]) {
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][3]) {
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][4]) {
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][5]) {
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][6]) {
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else if (tag == L2_tags[set1][7]) {
+         L2_tags[set1][7] = L2_tags[set1][6];
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+         return;
+      }
+      else {
+         /* A miss */
+         L2_tags[set1][7] = L2_tags[set1][6];
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+
+         (*m2)++;
+      }
+
+   } else if ((set1 + 1) % 512 == set2) {
+
+      Bool is_L2_miss = False;
+
+      /* Block one */
+      if (tag == L2_tags[set1][0]) {
+      }
+      else if (tag == L2_tags[set1][1]) {
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][2]) {
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][3]) {
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][4]) {
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][5]) {
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][6]) {
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else if (tag == L2_tags[set1][7]) {
+         L2_tags[set1][7] = L2_tags[set1][6];
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+      }
+      else {
+         /* A miss */
+         L2_tags[set1][7] = L2_tags[set1][6];
+         L2_tags[set1][6] = L2_tags[set1][5];
+         L2_tags[set1][5] = L2_tags[set1][4];
+         L2_tags[set1][4] = L2_tags[set1][3];
+         L2_tags[set1][3] = L2_tags[set1][2];
+         L2_tags[set1][2] = L2_tags[set1][1];
+         L2_tags[set1][1] = L2_tags[set1][0];
+         L2_tags[set1][0] = tag;
+
+         is_L2_miss = True;
+      }
+
+      /* Block two */
+      if (tag == L2_tags[set2][0]) {
+      }
+      else if (tag == L2_tags[set2][1]) {
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][2]) {
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][3]) {
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][4]) {
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][5]) {
+         L2_tags[set2][5] = L2_tags[set2][4];
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][6]) {
+         L2_tags[set2][6] = L2_tags[set2][5];
+         L2_tags[set2][5] = L2_tags[set2][4];
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else if (tag == L2_tags[set2][7]) {
+         L2_tags[set2][7] = L2_tags[set2][6];
+         L2_tags[set2][6] = L2_tags[set2][5];
+         L2_tags[set2][5] = L2_tags[set2][4];
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+      }
+      else {
+         /* A miss */
+         L2_tags[set2][7] = L2_tags[set2][6];
+         L2_tags[set2][6] = L2_tags[set2][5];
+         L2_tags[set2][5] = L2_tags[set2][4];
+         L2_tags[set2][4] = L2_tags[set2][3];
+         L2_tags[set2][3] = L2_tags[set2][2];
+         L2_tags[set2][2] = L2_tags[set2][1];
+         L2_tags[set2][1] = L2_tags[set2][0];
+         L2_tags[set2][0] = tag;
+
+         is_L2_miss = True;
+      }
+
+      /* Miss treatment */
+      if (is_L2_miss) {
+         (*m2)++;
+      }
+
+   } else {
+      VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size);
+      VG_(printf)("sets %d and %d.\n", set1, set2);
+      VG_(panic)("L2 cache set mismatch");
+   }
+}
diff --git a/vg_include.h b/vg_include.h
index b51ef059a8..08a1ab20de 100644
--- a/vg_include.h
+++ b/vg_include.h
@@ -234,6 +234,8 @@ extern Bool  VG_(clo_optimise);
 extern Bool  VG_(clo_instrument);
 /* DEBUG: clean up instrumented code?  default: YES */
 extern Bool  VG_(clo_cleanup);
+/* Cache simulation instrumentation?  default: NO */
+extern Bool  VG_(clo_cachesim);
 /* SMC write checks?  default: SOME (1,2,4 byte movs to mem) */
 extern Int   VG_(clo_smc_check);
 /* DEBUG: print system calls?  default: NO */
@@ -287,7 +289,7 @@ extern void VG_(shutdown_logging) ( void );
 
 #define VGP_M_STACK 10
 
-#define VGP_M_CCS 20  /* == the # of elems in VGP_LIST */
+#define VGP_M_CCS 24  /* == the # of elems in VGP_LIST */
 #define VGP_LIST \
    VGP_PAIR(VgpRun=0,      "running"),                \
    VGP_PAIR(VgpMalloc,     "low-lev malloc/free"),    \
@@ -307,6 +309,10 @@ extern void VG_(shutdown_logging) ( void );
    VGP_PAIR(VgpAddToT,     "add-to-transtab"),        \
    VGP_PAIR(VgpSARP,       "set-addr-range-perms"),   \
    VGP_PAIR(VgpSyscall,    "syscall wrapper"),        \
+   VGP_PAIR(VgpCacheInstrument, "cache instrument"),  \
+   VGP_PAIR(VgpCacheGetBBCC,"cache get BBCC"),        \
+   VGP_PAIR(VgpCacheSimulate, "cache simulate"),      \
+   VGP_PAIR(VgpCacheDump,  "cache stats dump"),       \
    VGP_PAIR(VgpSpare1,     "spare 1"),                \
    VGP_PAIR(VgpSpare2,     "spare 2")
 
@@ -718,8 +724,10 @@ extern void VG_(assert_fail) ( Char* expr, Char* file,
                                Int line, Char* fn )
             __attribute__ ((__noreturn__));
 
-/* Reading files. */
+/* Reading and writing files. */
 extern Int  VG_(open_read) ( Char* pathname );
+extern Int  VG_(open_write)       ( Char* pathname );
+extern Int  VG_(create_and_write) ( Char* pathname );
 extern void VG_(close)     ( Int fd );
 extern Int  VG_(read)      ( Int fd, void* buf, Int count);
 extern Int  VG_(write)     ( Int fd, void* buf, Int count);
@@ -955,7 +963,9 @@ typedef
       UChar   tag3:4;     /* third  operand tag */
       UChar   extra4b:4;  /* Spare field, used by WIDEN for src
                              -size, and by LEA2 for scale 
-                             (1,2,4 or 8) */
+                             (1,2,4 or 8), and by unconditional JMPs for
+                             orig x86 instr size if --cachesim=yes */
+
 
       /* word 5 */
       UChar   cond;            /* condition, for jumps */
@@ -1044,6 +1054,10 @@ extern Bool VG_(anyFlagUse) ( UInstr* u );
 extern void  VG_(ppUInstr)        ( Int instrNo, UInstr* u );
 extern void  VG_(ppUCodeBlock)    ( UCodeBlock* cb, Char* title );
 
+extern UCodeBlock* VG_(allocCodeBlock) ( void );
+extern void  VG_(freeCodeBlock)        ( UCodeBlock* cb );
+extern void  VG_(copyUInstr)                ( UCodeBlock* cb, UInstr* instr );
+
 extern Char* VG_(nameCondcode)    ( Condcode cond );
 extern Bool  VG_(saneUInstr)      ( Bool beforeRA, UInstr* u );
 extern Bool  VG_(saneUCodeBlock)  ( UCodeBlock* cb );
@@ -1184,6 +1198,11 @@ extern void VG_(what_obj_and_fun_is_this)
                                      ( Addr a,
                                        Char* obj_buf, Int n_obj_buf,
                                        Char* fun_buf, Int n_fun_buf );
+extern Bool VG_(what_line_is_this) ( Addr a,
+                                     UChar* filename, Int n_filename,
+                                     UInt* lineno );
+extern Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a,
+                                     Char* fn_name, Int n_fn_name);
 
 extern void VG_(symtab_notify_munmap) ( Addr start, UInt length );
 
@@ -1590,6 +1609,20 @@ extern void VG_(helper_value_check0_fail);
 extern void VG_(signalreturn_bogusRA)( void );
 extern void VG_(pthreadreturn_bogusRA)( void );
 
+/* ---------------------------------------------------------------------
+   Exports of vg_cachesim.c
+   ------------------------------------------------------------------ */
+
+extern UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr);
+
+typedef struct  _iCC  iCC;
+typedef struct _idCC idCC;
+
+extern void VG_(init_cachesim)        ( void );
+extern void VG_(show_cachesim_results)( Int client_argc, Char** client_argv );
+
+extern void VG_(cachesim_log_non_mem_instr)(  iCC* cc );
+extern void VG_(cachesim_log_mem_instr)    ( idCC* cc, Addr data_addr );
 
 /* ---------------------------------------------------------------------
    The state of the simulated CPU.
@@ -1724,7 +1757,8 @@ extern Int VGOFF_(handle_esp_assignment); /* :: Addr -> void */
 extern Int VGOFF_(fpu_write_check);       /* :: Addr -> Int -> void */
 extern Int VGOFF_(fpu_read_check);        /* :: Addr -> Int -> void */
 
-
+extern Int VGOFF_(cachesim_log_non_mem_instr);
+extern Int VGOFF_(cachesim_log_mem_instr);
 
 #endif /* ndef __VG_INCLUDE_H */
 
diff --git a/vg_main.c b/vg_main.c
index 5f7fe59c34..ee875e226b 100644
--- a/vg_main.c
+++ b/vg_main.c
@@ -103,7 +103,8 @@ Int VGOFF_(helperc_STOREV1) = INVALID_OFFSET;
 Int VGOFF_(handle_esp_assignment) = INVALID_OFFSET;
 Int VGOFF_(fpu_write_check) = INVALID_OFFSET;
 Int VGOFF_(fpu_read_check) = INVALID_OFFSET;
-
+Int VGOFF_(cachesim_log_non_mem_instr) = INVALID_OFFSET;
+Int VGOFF_(cachesim_log_mem_instr)     = INVALID_OFFSET;
 
 /* This is the actual defn of baseblock. */
 UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS];
@@ -165,6 +166,13 @@ static void vg_init_baseBlock ( void )
    /* 16  */ VGOFF_(sh_edi)    = alloc_BaB(1);
    /* 17  */ VGOFF_(sh_eflags) = alloc_BaB(1);
 
+   /* 17a */ 
+   VGOFF_(cachesim_log_non_mem_instr)  
+      = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_non_mem_instr) );
+   /* 17b */ 
+   VGOFF_(cachesim_log_mem_instr)  
+      = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_mem_instr) );
+
    /* 18  */ 
    VGOFF_(helper_value_check4_fail) 
       = alloc_BaB_1_set( (Addr) & VG_(helper_value_check4_fail) );
@@ -419,6 +427,7 @@ Bool   VG_(clo_single_step);
 Bool   VG_(clo_optimise);
 Bool   VG_(clo_instrument);
 Bool   VG_(clo_cleanup);
+Bool   VG_(clo_cachesim);
 Int    VG_(clo_smc_check);
 Bool   VG_(clo_trace_syscalls);
 Bool   VG_(clo_trace_signals);
@@ -745,6 +754,11 @@ static void process_cmd_line_options ( void )
       else if (STREQ(argv[i], "--cleanup=no"))
          VG_(clo_cleanup) = False;
 
+      else if (STREQ(argv[i], "--cachesim=yes"))
+         VG_(clo_cachesim) = True;     
+      else if (STREQ(argv[i], "--cachesim=no"))
+         VG_(clo_cachesim) = False;
+
       else if (STREQ(argv[i], "--smc-check=none"))
          VG_(clo_smc_check) = VG_CLO_SMC_NONE;
       else if (STREQ(argv[i], "--smc-check=some"))
@@ -821,6 +835,11 @@ static void process_cmd_line_options ( void )
 
    VG_(clo_logfile_fd) = eventually_logfile_fd;
 
+   /* Don't do memory checking if simulating the cache. */
+   if (VG_(clo_cachesim)) {
+       VG_(clo_instrument) = False;
+   }
+
    if (VG_(clo_verbosity > 0))
       VG_(message)(Vg_UserMsg, 
                    "valgrind-%s, a memory error detector for x86 GNU/Linux.",
@@ -978,7 +997,7 @@ void VG_(main) ( void )
       attach GDB in another shell. */
    /* {extern unsigned int sleep(unsigned int seconds); sleep(10);} */
 
-   if (VG_(clo_instrument)) {
+   if (VG_(clo_instrument) || VG_(clo_cachesim)) {
       VGP_PUSHCC(VgpInitAudit);
       VGM_(init_memory_audit)();
       VGP_POPCC;
@@ -1012,6 +1031,9 @@ void VG_(main) ( void )
       VGM_(make_readable) ( (Addr)&VG_(clo_sloppy_malloc), 1 );
    }
 
+   if (VG_(clo_cachesim)) 
+      VG_(init_cachesim)();
+
    if (VG_(clo_verbosity) > 0)
       VG_(message)(Vg_UserMsg, "");
 
@@ -1039,6 +1061,9 @@ void VG_(main) ( void )
    }
    VG_(running_on_simd_CPU) = False;
 
+   if (VG_(clo_cachesim))
+      VG_(show_cachesim_results)(VG_(client_argc), VG_(client_argv));
+
    VG_(do_sanity_checks)( 1 /* root thread */, 
                           True /*include expensive checks*/ );
 
diff --git a/vg_mylibc.c b/vg_mylibc.c
index 740b21e3d1..77dd153a2e 100644
--- a/vg_mylibc.c
+++ b/vg_mylibc.c
@@ -885,7 +885,49 @@ Int VG_(open_read) ( Char* pathname )
    if (VG_(is_kerror)(fd)) fd = -1;
    return fd;
 }
+
+/* Returns -1 on failure. */
+static Int VG_(chmod_u_rw) ( Int fd )
+{
+   Int res;
+   const int O_IRUSR_IWUSR = 000600; /* See /usr/include/cpio.h */
+   res = vg_do_syscall2(__NR_fchmod, fd, O_IRUSR_IWUSR);
+   if (VG_(is_kerror)(res)) res = -1;
+   return res;
+}
+ 
+/* Returns -1 on failure. */
+Int VG_(create_and_write) ( Char* pathname )
+{
+   Int fd;
+
+   const int O_CR_AND_WR_ONLY = 0101; /* See /usr/include/bits/fcntl.h */
+   fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_CR_AND_WR_ONLY, 0);
+   /* VG_(printf)("result = %d\n", fd); */
+   if (VG_(is_kerror)(fd)) {
+      fd = -1;
+   } else {
+      VG_(chmod_u_rw)(fd);
+      if (VG_(is_kerror)(fd)) {
+         fd = -1;
+      }
+   }
+   return fd;
+}
  
+/* Returns -1 on failure. */
+Int VG_(open_write) ( Char* pathname )
+{  
+   Int fd;
+
+   const int O_WRONLY_AND_TRUNC = 01001; /* See /usr/include/bits/fcntl.h */
+   fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_WRONLY_AND_TRUNC, 0);
+   /* VG_(printf)("result = %d\n", fd); */
+   if (VG_(is_kerror)(fd)) {
+      fd = -1;
+   } 
+   return fd;
+}
 
 void VG_(close) ( Int fd )
 {
diff --git a/vg_symtab2.c b/vg_symtab2.c
index 74cada4e82..d9da8fff45 100644
--- a/vg_symtab2.c
+++ b/vg_symtab2.c
@@ -1,4 +1,3 @@
-
 /*--------------------------------------------------------------------*/
 /*--- Management of symbols and debugging information.             ---*/
 /*---                                                 vg_symtab2.c ---*/
@@ -50,6 +49,23 @@
 /*--- Structs n stuff                                      ---*/
 /*------------------------------------------------------------*/
 
+/* Stabs entry types, from:
+ *   The "stabs" debug format
+ *   Menapace, Kingdon and MacKenzie
+ *   Cygnus Support
+ */
+typedef enum { N_FUN   = 36,    /* Function start or end            */
+               N_STSYM = 38,    /* Data segment file-scope variable */
+               N_LCSYM = 40,    /* BSS segment file-scope variable  */
+               N_RSYM  = 64,    /* Register variable                */
+               N_SLINE = 68,    /* Source line number               */
+               N_SO    = 100,   /* Source file path and name        */
+               N_LSYM  = 128,   /* Stack variable or type           */
+               N_SOL   = 132,   /* Include file name                */
+               N_LBRAC = 192,   /* Start of lexical block           */
+               N_RBRAC = 224    /* End   of lexical block           */
+             } stab_types;
+      
 /* A structure to hold an ELF symbol (very crudely). */
 typedef 
    struct { 
@@ -535,12 +551,7 @@ void addLineInfo ( SegInfo* si,
 {
    RiLoc loc;
    UInt size = end - start + 1;
-#  if 0
-   if (size > 10000)
-   VG_(printf)( "line %4d: %p .. %p, in %s\n",
-                lineno, start, end, 
-                &si->strtab[fnmoff] );
-#  endif
+
    /* Sanity ... */
    if (size > 10000) return;
 
@@ -577,8 +588,8 @@ void vg_read_lib_symbols ( SegInfo* si )
    /* for the .stabs reader */
    Int    curr_filenmoff;
    Addr   curr_fnbaseaddr;
-   Addr   range_startAddr;
-   Int    range_lineno;
+   Char*  curr_file_name;
+   Int    n_stab_entries;
 
    oimage = (Addr)NULL;
    if (VG_(clo_verbosity) > 1)
@@ -880,25 +891,23 @@ void vg_read_lib_symbols ( SegInfo* si )
    /* Ok.  It all looks plausible.  Go on and read debug data. 
          stab kinds: 100   N_SO     a source file name
                       68   N_SLINE  a source line number
-                      36   N_FUN ?  start of a function
-
-      In this loop, we maintain a current file name, updated
-      as N_SOs appear, and a current function base address,
-      updated as N_FUNs appear.  Based on that, address ranges
-      for N_SLINEs are calculated, and stuffed into the 
-      line info table.
-
-      N_SLINE indicates the start of a source line.  Functions are
-      delimited by N_FUNS, at the start with a non-empty string and at
-      the end with an empty string.  The latter facilitates detecting
-      where to close the last N_SLINE for a function. 
+                      36   N_FUN    start of a function
+
+      In this loop, we maintain a current file name, updated as 
+      N_SO/N_SOLs appear, and a current function base address, 
+      updated as N_FUNs appear.  Based on that, address ranges for 
+      N_SLINEs are calculated, and stuffed into the line info table.
+
+      Finding the instruction address range covered by an N_SLINE is
+      complicated;  see the N_SLINE case below.
    */
    curr_filenmoff  = addStr(si,"???");
    curr_fnbaseaddr = (Addr)NULL;
-   range_startAddr = 0;
-   range_lineno    = 0;
+   curr_file_name  = (Char*)NULL;
 
-   for (i = 0; i < stab_sz/(int)sizeof(struct nlist); i++) {
+   n_stab_entries = stab_sz/(int)sizeof(struct nlist);
+
+   for (i = 0; i < n_stab_entries; i++) {
 #     if 0
       VG_(printf) ( "   %2d  ", i );
       VG_(printf) ( "type=0x%x   othr=%d   desc=%d   value=0x%x   strx=%d  %s",
@@ -910,61 +919,95 @@ void vg_read_lib_symbols ( SegInfo* si )
 #     endif
 
       switch (stab[i].n_type) {
-
-         case 68: { /* N_SLINE */
-            /* flush the current line, if any, and start a new one */
-            Addr range_endAddr 
-               = curr_fnbaseaddr 
-                    + (UInt)stab[i].n_value - 1;
-            if (range_startAddr != 0) {
-               addLineInfo ( si,
-                             curr_filenmoff,
-                             range_startAddr,
-                             range_endAddr,
-                             range_lineno );
+         UInt next_addr;
+
+         /* To compute the instr address range covered by a single line, find
+          * the address of the next thing and compute the difference.  The
+          * approach used depends on what kind of entry/entries follow... */
+         case N_SLINE: {
+            Int lineno = stab[i].n_desc;              
+            Int this_addr = (UInt)stab[i].n_value;
+
+            LOOP:
+            vg_assert(i+1 < n_stab_entries);    /* Haven't reached end */
+            switch (stab[i+1].n_type) {
+               /* Easy, common case: use address of next entry */
+               case N_SLINE: case N_SO:
+                  next_addr = (UInt)stab[i+1].n_value;
+                  break;
+
+               /* Boring one: skip, look for something more useful. */
+               case N_RSYM: case N_LSYM: case N_LBRAC: case N_RBRAC: 
+               case N_STSYM: case N_LCSYM:
+                  i++;
+                  goto LOOP;
+                  
+               /* Should be an end of fun entry, use its address */
+               case N_FUN: 
+                  if ('\0' == * (stabstr + stab[i+1].n_un.n_strx) ) {
+                     next_addr = (UInt)stab[i+1].n_value;
+                  } else {
+                     VG_(printf)("unhandled stabs case: N_FUN start %d %s\n",
+                                i, (stabstr + stab[i+1].n_un.n_strx) );
+                     VG_(panic)("argh");
+                  }
+                  break;
+
+               /* N_SOL should be followed by an N_SLINE which can be used */
+               case N_SOL:
+                  if (i+2 < n_stab_entries && N_SLINE == stab[i+2].n_type) {
+                     next_addr = (UInt)stab[i+2].n_value;
+                     break;
+                  } else {
+                     VG_(printf)("unhandled N_SOL stabs case: %d %d %d", 
+                                 stab[i+1].n_type, i, n_stab_entries);
+                     VG_(panic)("argh");
+                  }
+
+               default:
+                  VG_(printf)("unhandled stabs case: %d %d", 
+                              stab[i+1].n_type,i);
+                  VG_(panic)("argh");
             }
-            range_startAddr = range_endAddr + 1;
-            range_lineno = stab[i].n_desc;              
+            
+            //Int offset2 = (i+1 < n_stab_entries && 68 == stab[i+1].n_type
+            //              ? (UInt)stab[i+1].n_value - 1
+            //              : offset + 1);
+            //if (i+1 < n_stab_entries) {
+            //    int x;
+            //    if (68 != (x = stab[i+1].n_type)) {
+            //        VG_(printf)("%d  ", x);
+            //    }
+            //}
+
+            addLineInfo ( si, curr_filenmoff, curr_fnbaseaddr + this_addr, 
+                          curr_fnbaseaddr + next_addr - 1, lineno );
             break;
          }
 
-         case 36: { /* N_FUN */
-            if ('\0' == * (stabstr + stab[i].n_un.n_strx) ) {
-               /* N_FUN with no name -- indicates the end of a fn.
-                  Flush the current line, if any, but don't start a
-                  new one. */
-               Addr range_endAddr 
-                  = curr_fnbaseaddr 
-                       + (UInt)stab[i].n_value - 1;
-               if (range_startAddr != 0) {
-                  addLineInfo ( si,
-                                curr_filenmoff,
-                                range_startAddr,
-                                range_endAddr,
-                                range_lineno );
-               }
-               range_startAddr = 0;
-            } else {
+         case N_FUN: {
+            if ('\0' != (stabstr + stab[i].n_un.n_strx)[0] ) {
                /* N_FUN with a name -- indicates the start of a fn.  */
-               curr_fnbaseaddr = si->offset
-                                 + (Addr)stab[i].n_value;
-               range_startAddr = curr_fnbaseaddr;
+               curr_fnbaseaddr = si->offset + (Addr)stab[i].n_value;
             }
             break;
          }
 
-         case 100: /* N_SO */
-         case 132: /* N_SOL */
+         case N_SO: case N_SOL:
          /* seems to give lots of locations in header files */
          /* case 130: */ /* BINCL */
          { 
             UChar* nm = stabstr + stab[i].n_un.n_strx;
             UInt len = VG_(strlen)(nm);
-            if (len > 0 && nm[len-1] != '/')
+            
+            if (len > 0 && nm[len-1] != '/') {
                curr_filenmoff = addStr ( si, nm );
+               curr_file_name = stabstr + stab[i].n_un.n_strx;
+            }
             else
                if (len == 0)
                   curr_filenmoff = addStr ( si, "?1\0" );
+
             break;
          }
 
@@ -1071,8 +1114,8 @@ void read_symtab_callback (
    which happen to correspond to the munmap()d area.  */
 void VG_(read_symbols) ( void )
 {
-   if (! VG_(clo_instrument)) 
-     return;
+   if (! VG_(clo_instrument) && ! VG_(clo_cachesim)) 
+      return;
 
    VG_(read_procselfmaps) ( read_symtab_callback );
 
@@ -1251,9 +1294,8 @@ static void search_all_loctabs ( Addr ptr, SegInfo** psi, Int* locno )
    Caller supplies buf and nbuf.  If no_demangle is True, don't do
    demangling, regardless of vg_clo_demangle -- probably because the
    call has come from vg_what_fn_or_object_is_this. */
-static
-Bool vg_what_fn_is_this ( Bool no_demangle, Addr a, 
-                          Char* buf, Int nbuf )
+Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a, 
+                            Char* buf, Int nbuf )
 {
    SegInfo* si;
    Int      sno;
@@ -1297,17 +1339,16 @@ void VG_(what_obj_and_fun_is_this) ( Addr a,
                                      Char* fun_buf, Int n_fun_buf )
 {
    (void)vg_what_object_is_this ( a, obj_buf, n_obj_buf );
-   (void)vg_what_fn_is_this ( True, a, fun_buf, n_fun_buf );
+   (void)VG_(what_fn_is_this) ( True, a, fun_buf, n_fun_buf );
 }
 
 
 /* Map a code address to a (filename, line number) pair.  
    Returns True if successful.
 */
-static
-Bool vg_what_line_is_this ( Addr a, 
-                            UChar* filename, Int n_filename, 
-                            UInt* lineno )
+Bool VG_(what_line_is_this)( Addr a, 
+                             UChar* filename, Int n_filename, 
+                             UInt* lineno )
 {
    SegInfo* si;
    Int      locno;
@@ -1317,6 +1358,7 @@ Bool vg_what_line_is_this ( Addr a,
    VG_(strncpy_safely)(filename, & si->strtab[si->loctab[locno].fnmoff], 
                        n_filename);
    *lineno = si->loctab[locno].lineno;
+
    return True;
 }
 
@@ -1348,11 +1390,11 @@ void VG_(mini_stack_dump) ( ExeContext* ec )
 
    n = 0;
 
-   know_fnname  = vg_what_fn_is_this(False,ec->eips[0], buf_fn, M_VG_ERRTXT);
+   know_fnname  = VG_(what_fn_is_this)(False,ec->eips[0], buf_fn, M_VG_ERRTXT);
    know_objname = vg_what_object_is_this(ec->eips[0], buf_obj, M_VG_ERRTXT);
-   know_srcloc  = vg_what_line_is_this(ec->eips[0], 
-                                       buf_srcloc, M_VG_ERRTXT, 
-                                       &lineno);
+   know_srcloc  = VG_(what_line_is_this)(ec->eips[0], 
+                                         buf_srcloc, M_VG_ERRTXT, 
+                                         &lineno);
 
    APPEND("   at ");
    VG_(sprintf)(ibuf,"0x%x: ", ec->eips[0]);
@@ -1383,9 +1425,9 @@ void VG_(mini_stack_dump) ( ExeContext* ec )
 
    clueless = 0;
    for (i = 1; i < stop_at; i++) {
-      know_fnname  = vg_what_fn_is_this(False,ec->eips[i], buf_fn, M_VG_ERRTXT);
+      know_fnname  = VG_(what_fn_is_this)(False,ec->eips[i], buf_fn, M_VG_ERRTXT);
       know_objname = vg_what_object_is_this(ec->eips[i],buf_obj, M_VG_ERRTXT);
-      know_srcloc  = vg_what_line_is_this(ec->eips[i], 
+      know_srcloc  = VG_(what_line_is_this)(ec->eips[i], 
                                           buf_srcloc, M_VG_ERRTXT, 
                                           &lineno);
       n = 0;
diff --git a/vg_to_ucode.c b/vg_to_ucode.c
index f06689d956..691ef26d67 100644
--- a/vg_to_ucode.c
+++ b/vg_to_ucode.c
@@ -4437,43 +4437,76 @@ Int VG_(disBB) ( UCodeBlock* cb, Addr eip0 )
    Addr eip   = eip0;
    Bool isEnd = False;
    Bool block_sane;
+   Int INCEIP_allowed_lag = 4;
+   Int delta = 0;
+
    if (dis) VG_(printf)("\n");
 
+   /* When cache simulating, to ensure cache misses are attributed to the
+    * correct line we ensure EIP is always correct.   This is done by:
+    *
+    * a) Using eager INCEIP updating to cope with all instructions except those
+    *    at the end of a basic block.
+    *
+    * b) Patching in the size of the original x86 instr in the `extra4b' field
+    *    of JMPs at the end of a basic block.  Two cases:
+    *       - Jcond followed by Juncond:  patch the Jcond
+    *       - Juncond alone:              patch the Juncond
+    *
+    * See vg_cachesim_instrument() for how this is used. 
+    */
+   if (VG_(clo_cachesim)) {
+       INCEIP_allowed_lag = 0;
+   }
+
    if (VG_(clo_single_step)) {
       eip = disInstr ( cb, eip, &isEnd );
-      uInstr1(cb, JMP, 0, Literal, 0);
-      uLiteral(cb, eip);
-      uCond(cb, CondAlways);
-      if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
+
+      /* Add a JMP to the next (single x86 instruction) BB if it doesn't
+       * already end with a JMP instr. We also need to check for no UCode,
+       * which occurs if the x86 instr was a nop */
+      if (cb->used == 0 || LAST_UINSTR(cb).opcode != JMP) {
+         uInstr1(cb, JMP, 0, Literal, 0);
+         uLiteral(cb, eip);
+         uCond(cb, CondAlways);
+         if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
+      }
+      delta = eip - eip0;
+
    } else {
-      Int delta = 0;
       Addr eip2;
-      while (True) {
-         if (isEnd) break;
+      while (!isEnd) {
          eip2 = disInstr ( cb, eip, &isEnd );
          delta += (eip2 - eip);
          eip = eip2;
-         if (delta > 4 && !isEnd) {
-            uInstr1(cb, INCEIP, 0, Lit16, delta);
-            if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
-            delta = 0;
-         }
          /* Split up giant basic blocks into pieces, so the
             translations fall within 64k. */
-         if (eip - eip0 > 2000) {
+         if (eip - eip0 > 2000 && !isEnd) {
             if (VG_(clo_verbosity) > 0)
-               VG_(message)(Vg_DebugMsg, 
+               VG_(message)(Vg_DebugMsg,
                   "Warning: splitting giant basic block into pieces");
             uInstr1(cb, JMP, 0, Literal, 0);
             uLiteral(cb, eip);
             uCond(cb, CondAlways);
             if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
-            if (dis) VG_(printf)("\n");
-            break;
+            isEnd = True;
+
+         } else if (delta > INCEIP_allowed_lag && !isEnd) {
+            uInstr1(cb, INCEIP, 0, Lit16, delta);
+            if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
+            delta = 0;
          }
          if (dis) VG_(printf)("\n");
       }
    }
+   if (VG_(clo_cachesim)) {
+      /* Patch instruction size into earliest JMP. */
+      if (cb->used >= 2 && JMP == cb->instrs[cb->used - 2].opcode) {
+         cb->instrs[cb->used - 2].extra4b = delta;
+      } else {
+         LAST_UINSTR(cb).extra4b = delta;
+      }
+   }
 
    block_sane = VG_(saneUCodeBlock)(cb);
    if (!block_sane) {
diff --git a/vg_translate.c b/vg_translate.c
index 73cad5e9bc..76c6ef8a9b 100644
--- a/vg_translate.c
+++ b/vg_translate.c
@@ -104,7 +104,7 @@ void VG_(jitfree) ( void* ptr )
 /*--- Basics                                               ---*/
 /*------------------------------------------------------------*/
 
-static UCodeBlock* allocCodeBlock ( void )
+UCodeBlock* VG_(allocCodeBlock) ( void )
 {
    UCodeBlock* cb = VG_(malloc)(VG_AR_PRIVATE, sizeof(UCodeBlock));
    cb->used = cb->size = cb->nextTemp = 0;
@@ -113,7 +113,7 @@ static UCodeBlock* allocCodeBlock ( void )
 }
 
 
-static void freeCodeBlock ( UCodeBlock* cb )
+void VG_(freeCodeBlock) ( UCodeBlock* cb )
 {
    if (cb->instrs) VG_(free)(VG_AR_PRIVATE, cb->instrs);
    VG_(free)(VG_AR_PRIVATE, cb);
@@ -239,17 +239,15 @@ void VG_(newUInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz )
    ui->size   = sz;
 }
 
-
 /* Copy an instruction into the given codeblock. */
-static __inline__ 
-void copyUInstr ( UCodeBlock* cb, UInstr* instr )
+__inline__ 
+void VG_(copyUInstr) ( UCodeBlock* cb, UInstr* instr )
 {
    ensureUInstr(cb);
    cb->instrs[cb->used] = *instr;
    cb->used++;
 }
 
-
 /* Copy auxiliary info from one uinstr to another. */
 static __inline__ 
 void copyAuxInfoFromTo ( UInstr* src, UInstr* dst )
@@ -1654,7 +1652,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
 
    /* Resulting code goes here.  We generate it all in a forwards
       pass. */
-   c2 = allocCodeBlock();
+   c2 = VG_(allocCodeBlock)();
 
    /* At the start, no TempRegs are assigned to any real register.
       Correspondingly, all temps claim to be currently resident in
@@ -1841,7 +1839,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
       for (j = 0; j < k; j++)
          tempUse[j].realNo 
             = VG_(rankToRealRegNo)(temp_info[tempUse[j].tempNo].real_no);
-      copyUInstr(c2, &c1->instrs[i]);
+      VG_(copyUInstr)(c2, &c1->instrs[i]);
       patchUInstr(&LAST_UINSTR(c2), &tempUse[0], k);
 
       if (VG_(disassemble)) {
@@ -1853,7 +1851,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
    if (temp_info != NULL)
       VG_(jitfree)(temp_info);
 
-   freeCodeBlock(c1);
+   VG_(freeCodeBlock)(c1);
 
    if (spill_reqd) 
       VG_(translations_needing_spill)++;
@@ -2099,7 +2097,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
    Int         i, j;
    UInstr*     u_in;
    Int         qs, qd, qt, qtt;
-   cb = allocCodeBlock();
+   cb = VG_(allocCodeBlock)();
    cb->nextTemp = cb_in->nextTemp;
 
    for (i = 0; i < cb_in->used; i++) {
@@ -2115,7 +2113,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             break;
 
          case INCEIP:
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Loads and stores.  Test the V bits for the address.  24
@@ -2136,7 +2134,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             uInstr2(cb, LOADV, u_in->size, 
                         TempReg, u_in->val1,
                         TempReg, SHADOW(u_in->val2));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          case STORE:
             if (VG_(clo_check_addrVs)) {
@@ -2146,7 +2144,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             uInstr2(cb, STOREV, u_in->size,
                         TempReg, SHADOW(u_in->val1), 
                         TempReg, u_in->val2);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Moving stuff around.  Make the V bits follow accordingly,
@@ -2156,24 +2154,24 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             uInstr2(cb, GETV, u_in->size,
                         ArchReg, u_in->val1,
                         TempReg, SHADOW(u_in->val2));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          case PUT:
             uInstr2(cb, PUTV, u_in->size, 
                         TempReg, SHADOW(u_in->val1),
                         ArchReg, u_in->val2);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          case GETF:
             /* This is not the smartest way to do it, but should work. */
             qd = create_GETVF(cb, u_in->size);
             uInstr2(cb, MOV, 4, TempReg, qd, TempReg, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          case PUTF:
             create_PUTVF(cb, u_in->size, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          case MOV:
@@ -2190,7 +2188,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
                default: 
                   VG_(panic)("vg_instrument: MOV");
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Special case of add, where one of the operands is a literal.
@@ -2203,7 +2201,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             qd = SHADOW(u_in->val2);
             uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qd);
             create_Left(cb, u_in->size, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Another form of add.  
@@ -2234,7 +2232,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             }
             create_UifU(cb, 4, qs, qd);
             create_Left(cb, u_in->size, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          }
 
@@ -2244,7 +2242,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             create_Left(cb, u_in->size, qd);
             if (u_in->flags_w != FlagsEmpty)
                create_PUTVF(cb, u_in->size, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* This is a HACK (approximation :-) */
@@ -2279,7 +2277,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             create_UifU(cb, 0, qs, qd);
             create_PUTVF(cb, 0, qd);
             create_PCast(cb, 0, u_in->size, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* for OP in shl shr sar rol ror
@@ -2329,7 +2327,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qt);
             create_PCast(cb, 1, u_in->size, qt);
             create_UifU(cb, u_in->size, qt, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          }
 
@@ -2338,20 +2336,20 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             vg_assert(u_in->tag1 == TempReg);
             create_Widen(cb, u_in->signed_widen, u_in->extra4b, u_in->size, 
                              SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* not#(x) = x (since bitwise independent) */
          case NOT:
             vg_assert(u_in->tag1 == TempReg);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* neg#(x) = left(x) (derivable from case for SUB) */
          case NEG:
             vg_assert(u_in->tag1 == TempReg);
             create_Left(cb, u_in->size, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* bswap#(x) = bswap(x) */
@@ -2360,7 +2358,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             vg_assert(u_in->size == 4);
             qd = SHADOW(u_in->val1);
             uInstr1(cb, BSWAP, 4, TempReg, qd);
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* cc2val#(qd) = pcast-0-to-size(eflags#) */
@@ -2369,7 +2367,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             vg_assert(u_in->flags_r != FlagsEmpty);
             qt = create_GETVF(cb, u_in->size);
             uInstr2(cb, MOV, 4, TempReg, qt, TempReg, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* cmov#(qs,qd) = cmov(qs,qd)
@@ -2395,7 +2393,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             LAST_UINSTR(cb).cond    = u_in->cond;
             LAST_UINSTR(cb).flags_r = u_in->flags_r;
 
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* add#/sub#(qs,qd) 
@@ -2421,7 +2419,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             if (u_in->flags_w != FlagsEmpty) {
                create_PUTVF(cb, u_in->size, qd);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* xor#(qs,qd) = qs `UifU` qd */
@@ -2432,7 +2430,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             if (u_in->flags_w != FlagsEmpty) {
                create_PUTVF(cb, u_in->size, qd);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* and#/or#(qs,qd) 
@@ -2476,7 +2474,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             if (u_in->flags_w != FlagsEmpty) {
                create_PUTVF(cb, u_in->size, qd);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Machinery to do with supporting CALLM.  Copy the start and
@@ -2484,7 +2482,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             (debug); they generate no code and have no effect. 
          */
          case CALLM_S: case CALLM_E:
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Copy PUSH and POP verbatim.  Arg/result absval
@@ -2493,7 +2491,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             needs to be copied.  
          */
          case PUSH: case POP: case CLEAR:
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* In short:
@@ -2562,7 +2560,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
                   ... */
                uInstr1(cb, SETV, 0, TempReg, qt);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
          }
          /* Whew ... */
@@ -2582,13 +2580,13 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
                   ... */
                uInstr1(cb, SETV, 0, TempReg, qt);
             }
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          case JIFZ:
             uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1));
             uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val1));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* Emit a check on the address used.  For FPU_R, the value
@@ -2598,12 +2596,12 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
             vg_assert(u_in->tag2 == TempReg);
             uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val2));
             uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val2));
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          /* For FPU insns not referencing memory, just copy thru. */
          case FPU: 
-            copyUInstr(cb, u_in);
+            VG_(copyUInstr)(cb, u_in);
             break;
 
          default:
@@ -2614,7 +2612,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
 
    } /* end of for loop */
 
-   freeCodeBlock(cb_in);
+   VG_(freeCodeBlock)(cb_in);
    return cb;
 }
 
@@ -3083,7 +3081,7 @@ void VG_(translate) ( ThreadState* tst,
               VG_(overall_in_osize), VG_(overall_in_tsize),
               orig_addr );
    }
-   cb = allocCodeBlock();
+   cb = VG_(allocCodeBlock)();
 
    /* Disassemble this basic block into cb. */
    VGP_PUSHCC(VgpToUCode);
@@ -3117,6 +3115,19 @@ void VG_(translate) ( ThreadState* tst,
       }
    }
 
+   //VG_(disassemble) = True;
+
+   /* Add cache simulation code. */
+   if (VG_(clo_cachesim)) {
+      VGP_PUSHCC(VgpCacheInstrument);
+      cb = VG_(cachesim_instrument)(cb, orig_addr);
+      VGP_POPCC;
+      if (VG_(disassemble)) 
+         VG_(ppUCodeBlock) ( cb, "Cachesim instrumented code:" );
+   }
+   
+   //VG_(disassemble) = False;
+   
    /* Allocate registers. */
    VGP_PUSHCC(VgpRegAlloc);
    cb = vg_do_register_allocation ( cb );
@@ -3132,7 +3143,7 @@ void VG_(translate) ( ThreadState* tst,
       and so must be VG_(jitfree)'d. */
    final_code = VG_(emit_code)(cb, &final_code_size );
    VGP_POPCC;
-   freeCodeBlock(cb);
+   VG_(freeCodeBlock)(cb);
 
    if (debugging_translation) {
       /* Only done for debugging -- throw away final result. */
-- 
2.47.2