From fcc45a9e8580826d539fcc4c60cd26b57954da46 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 29 Apr 2002 16:03:24 +0000 Subject: [PATCH] New files: - vg_cachesim.c - vg_cachesim_{I1,D1,L2}.c - vg_annotate.in - vg_cachegen.in Changes to existing files: - valgrind/valgrind.in, added option: --cachesim=no|yes [no] - Makefile/Makefile.am: * added vg_cachesim.c to valgrind_so_SOURCES var * added vg_cachesim_I1.c, vg_cachesim_D1.c, vg_cachesim_L2.c to noinst_HEADERS var * added vg_annotate, vg_cachegen to 'bin_SCRIPTS' var, and added empty targets for them - vg_main.c: * added two offsets for cache sim functions (put in positions 17a,17b) * added option handling (detection of --cachesim=yes, which turns off --instrument); * added calls to cachesim initialisation/finalisation functions - vg_mylibc.c: added some system call wrappers (for chmod, open_write, etc) for file writing - vg_symtab2.c: * allow it to read symbols if either of --instrument or --cachesim is used * made vg_symtab2.c:vg_what_{line,fn}_is_this extern, renaming it as VG_(what_line_is_this) (and added to vg_include.h) * completely rewrote the read loop in vg_read_lib_symbols, fixing several bugs. Much better now, although probably not perfect. It's also relatively fragile -- I'm using the "die immediately if anything unexpected happens" approach. - vg_to_ucode.c: * in VG_(disBB), patching the x86 instruction size into the extra4b field of JMP instructions at the end of basic blocks if --cachesim=yes. Shifted things around to do this; also had to fiddle around with single-step stuff to get this to work, by not sticking extra JMPs on the end of the single-instruction block if there was already one there (to avoid breaking an assertion in vg_cachesim.c). Did a similar thing to avoid an extra JMP on huge basic blocks that are split. 
- vg_translate.c: * if --cachesim=yes, call the cachesim instrumentation phase * made some functions extern and renamed: allocCodeBlock() --> VG_(allocCodeBlock)() freeCodeBlock() --> VG_(freeCodeBlock)() copyUInstr() --> VG_(copyUInstr)() (added to vg_include.h too) - vg_include.h: declared * cachesim offsets * exports of vg_cachesim.c * added four new profiling events (increasing VGP_M_CCS to 24 -- I kept the spare ones) * added comment about UInstr.extra4b field being used for instr size in JMPs for cache simulation - docs/manual.html: * Added --cachesim option to section 2.5. * Added cache profiling stuff as section 7. git-svn-id: svn://svn.valgrind.org/valgrind/trunk@168 --- Makefile.am | 11 +- addrcheck/Makefile.am | 11 +- cachegrind/Makefile.am | 11 +- cachegrind/cg_annotate.in | 892 +++++++++++++++++++++++++++++ cachegrind/cg_main.c | 1068 +++++++++++++++++++++++++++++++++++ cachegrind/cg_sim_D1.c | 93 +++ cachegrind/cg_sim_I1.c | 93 +++ cachegrind/cg_sim_L2.c | 250 ++++++++ cachegrind/docs/manual.html | 636 ++++++++++++++++++++- corecheck/Makefile.am | 11 +- coregrind/Makefile.am | 11 +- coregrind/docs/manual.html | 636 ++++++++++++++++++++- coregrind/valgrind.in | 3 + coregrind/vg_include.h | 42 +- coregrind/vg_main.c | 29 +- coregrind/vg_mylibc.c | 42 ++ coregrind/vg_symtab2.c | 196 ++++--- coregrind/vg_to_ucode.c | 65 ++- coregrind/vg_translate.c | 95 ++-- docs/manual.html | 636 ++++++++++++++++++++- helgrind/Makefile.am | 11 +- lackey/Makefile.am | 11 +- memcheck/Makefile.am | 11 +- memcheck/docs/manual.html | 636 ++++++++++++++++++++- none/Makefile.am | 11 +- valgrind.in | 3 + vg_annotate.in | 892 +++++++++++++++++++++++++++++ vg_cachegen.in | 300 ++++++++++ vg_cachesim.c | 1068 +++++++++++++++++++++++++++++++++++ vg_cachesim_D1.c | 93 +++ vg_cachesim_I1.c | 93 +++ vg_cachesim_L2.c | 250 ++++++++ vg_include.h | 42 +- vg_main.c | 29 +- vg_mylibc.c | 42 ++ vg_symtab2.c | 196 ++++--- vg_to_ucode.c | 65 ++- vg_translate.c | 95 ++-- 38 files changed, 8375 
insertions(+), 304 deletions(-) create mode 100644 cachegrind/cg_annotate.in create mode 100644 cachegrind/cg_main.c create mode 100644 cachegrind/cg_sim_D1.c create mode 100644 cachegrind/cg_sim_I1.c create mode 100644 cachegrind/cg_sim_L2.c create mode 100644 vg_annotate.in create mode 100755 vg_cachegen.in create mode 100644 vg_cachesim.c create mode 100644 vg_cachesim_D1.c create mode 100644 vg_cachesim_I1.c create mode 100644 vg_cachesim_L2.c diff --git a/Makefile.am b/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- a/Makefile.am +++ b/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/addrcheck/Makefile.am b/addrcheck/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- a/addrcheck/Makefile.am +++ b/addrcheck/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h 
noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/cachegrind/Makefile.am b/cachegrind/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- a/cachegrind/Makefile.am +++ b/cachegrind/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/cachegrind/cg_annotate.in b/cachegrind/cg_annotate.in new file mode 100644 index 0000000000..b3b091cc23 --- /dev/null +++ b/cachegrind/cg_annotate.in @@ -0,0 +1,892 @@ +#! /usr/bin/perl -w +##--------------------------------------------------------------------## +##--- The cache simulation framework: instrumentation, recording ---## +##--- and results printing. ---## +##--- vg_annotate ---## +##--------------------------------------------------------------------## + +# This file is part of Valgrind, an x86 protected-mode emulator +# designed for debugging and profiling binaries on x86-Unixes. 
+# +# Copyright (C) 2000-2002 Julian Seward +# jseward@acm.org +# Julian_Seward@muraroa.demon.co.uk +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +# 02111-1307, USA. +# +# The GNU General Public License is contained in the file LICENSE. + +#---------------------------------------------------------------------------- +# Annotator for valgrind --cachesim=yes. +# +# Input file has the following format: +# ::= * +# + +# ::= "desc:" ? +# ::= "cmd:" ? +# ::= "events:" ? ( )+ +# ::= | | +# ::= ("fl=" | "fi=" | "fe=" ) +# ::= "fn=" +# ::= ? ( )+ +# ::= "summary:" ? ( )+ +# ::= | "." +# +# where +# +# is any string not containing a newline +# is a command line invocation +# and can be anything +# and are decimal numbers +# is whitespace. +# is a newline +# +# The contents of the "desc:" lines is printed out at the top of the summary. +# This is a generic way of providing simulation specific information, eg. for +# giving the cache configuration for cache simulation. +# +# Counts can be ".", to represent "N/A". +# +# The number of counts in each and the should not exceed +# the number of events in the . If the number in each is +# less, we use "." for the the missing counts (the last however-many). +# +# A changes the current file name. A changes the current +# function name. 
A contains counts that pertain to the current +# filename/fn_name. A "fn=" and a must appear before any +# s to give the context of the first s. +# +# Each should be immediately followed by a . "fi=" +# are used to switch filenames for inlined functions; "fe=" +# are similar, but are put at the end of a basic block in which +# the file name hasn't been switched back to the original file name. (fi and +# fe lines behave the same, they are only distinguished to help debugging.) +# +#---------------------------------------------------------------------------- +# Performance improvements record, using cachegrind.out for cacheprof, doing no +# source annotation (irrelevant ones removed): +# user time +# 1. turned off warnings in add_hash_a_to_b() 3.81 --> 3.48s +# [now add_array_a_to_b()] +# 6. make line_to_CC() return a ref instead of a hash 3.01 --> 2.77s +# +#10. changed file format to avoid file/fn name repetition 2.40s +# (not sure why higher; maybe due to new '.' entries?) +#11. changed file format to drop unnecessary end-line "."s 2.36s +# (shrunk file by about 37%) +#12. switched from hash CCs to array CCs 1.61s +#13. only adding b[i] to a[i] if b[i] defined (was doing it if +# either a[i] or b[i] was defined, but if b[i] was undefined +# it just added 0) 1.48s +#14. Stopped converting "." entries to undef and then back 1.16s +#15. Using foreach $i (x..y) instead of for ($i = 0...) in +# add_array_a_to_b() 1.11s +# +# Auto-annotating primes: +#16. 
Finding count lengths by int((length-1)/3), not by +# commifying (halves the number of commify calls) 1.68s --> 1.47s + +use strict; + +#---------------------------------------------------------------------------- +# Overview: the running example in the comments is for: +# - events = A,B,C,D +# - --show=C,A,D +# - --sort=D,C +#---------------------------------------------------------------------------- + +#---------------------------------------------------------------------------- +# Global variables, main data structures +#---------------------------------------------------------------------------- +# CCs are arrays, the counts corresponding to @events, with 'undef' +# representing '.'. This makes things fast (faster than using hashes for CCs) +# but we have to use @sort_order and @show_order below to handle the --sort and +# --show options, which is a bit tricky. +#---------------------------------------------------------------------------- + +# Total counts for summary (an array reference). +my $summary_CC; + +# Totals for each function, for overall summary. +# hash(filename:fn_name => CC array) +my %fn_totals; + +# Individual CCs, organised by filename and line_num for easy annotation. +# hash(filename => hash(line_num => CC array)) +my %all_ind_CCs; + +# Files chosen for annotation on the command line. +# key = basename (trimmed of any directory), value = full filename +my %user_ann_files; + +# Generic description string. +my $desc = ""; + +# Command line of profiled program. +my $cmd; + +# Events in input file, eg. (A,B,C,D) +my @events; + +# Events to show, from command line, eg. (C,A,D) +my @show_events; + +# Map from @show_events indices to @events indices, eg. (2,0,3). Gives the +# order in which we must traverse @events in order to show the @show_events, +# eg. (@events[$show_order[1]], @events[$show_order[2]]...) = @show_events. +# (Might help to think of it like a hash (0 => 2, 1 => 0, 2 => 3).) 
+my @show_order; + +# Print out the function totals sorted by these events, eg. (D,C). +my @sort_events; + +# Map from @sort_events indices to @events indices, eg. (3,2). Same idea as +# for @show_order +my @sort_order; + +# Threshold; whatever event is the primary sort, we print out functions +# representing more than this proportion of 'event' events. +my $threshold = 99; + +# If on, automatically annotates all files that are involved in getting over +# the threshold count of the primary sort event. +my $auto_annotate = 0; + +# Number of lines to show around each annotated line. +my $context = 8; + +# Directories in which to look for annotation files. +my @include_dirs = (""); + +# Input file name +my $input_file = "cachegrind.out"; + +# Version number +my $version = "@VERSION@"; + +# Usage message. +my $usage = < percentage of counts (of primary sort event) we + are interested in [$threshold%] + --auto=yes|no annotate all source files containing functions + that helped reach the event count threshold [no] + --context=N print N lines of context before and after + annotated lines [8] + -I --include= add to list of directories to search for + source files + + Valgrind is Copyright (C) 2000-2002 Julian Seward + and licensed under the GNU General Public License, version 2. + Bug reports, feedback, admiration, abuse, etc, to: jseward\@acm.org. + +END +; + +# Used in various places of output. +my $fancy = '-' x 80 . 
"\n"; + +#----------------------------------------------------------------------------- +# Argument and option handling +#----------------------------------------------------------------------------- +sub process_cmd_line() +{ + for my $arg (@ARGV) { + + # Option handling + if ($arg =~ /^-/) { + + # --version + if ($arg =~ /^-v$|^--version$/) { + die("vg_annotate$version\n"); + + # --show=A,B,C + } elsif ($arg =~ /^--show=(.*)$/) { + @show_events = split(/,/, $1); + + # --sort=A,B,C + } elsif ($arg =~ /^--sort=(.*)$/) { + @sort_events = split(/,/, $1); + + # --threshold=X (tolerates a trailing '%') + } elsif ($arg =~ /^--threshold=([\d\.]+)%?$/) { + $threshold = $1; + if ($threshold < 0 || $threshold > 100) { + die($usage); + } + + # --auto=yes|no + } elsif ($arg =~ /^--auto=(yes|no)$/) { + $auto_annotate = 1 if ($1 eq "yes"); + $auto_annotate = 0 if ($1 eq "no"); + + # --context=N + } elsif ($arg =~ /^--context=([\d\.]+)$/) { + $context = $1; + if ($context < 0) { + die($usage); + } + + # --include=A,B,C + } elsif ($arg =~ /^(-I|--include)=(.*)$/) { + my $inc = $2; + $inc =~ s|/$||; # trim trailing '/' + push(@include_dirs, "$inc/"); + + } else { # -h and --help fall under this case + die($usage); + } + + # Argument handling -- annotation file checking and selection. + # Stick filenames into a hash for quick 'n easy lookup throughout + } else { + my $readable = 0; + foreach my $include_dir (@include_dirs) { + if (-r $include_dir . $arg) { + $readable = 1; + } + } + $readable or die("File $arg not found in any of: @include_dirs\n"); + $user_ann_files{$arg} = 1; + } + } +} + +#----------------------------------------------------------------------------- +# Reading of input file +#----------------------------------------------------------------------------- +sub max ($$) +{ + my ($x, $y) = @_; + return ($x > $y ? $x : $y); +} + +# Add the two arrays; any '.' entries are ignored. Two tricky things: +# 1. 
If $a2->[$i] is undefined, it defaults to 0 which is what we want; we turn +# off warnings to allow this. This makes things about 10% faster than +# checking for definedness ourselves. +# 2. We don't add a ".", even though it's value is 0, because we don't want to +# make an $a2->[$i] that is undef become 0 unnecessarily. +sub add_array_a_to_b ($$) +{ + my ($a1, $a2) = @_; + + my $n = max(scalar @$a1, scalar @$a2); + $^W = 0; + foreach my $i (0 .. $n-1) { + $a2->[$i] += $a1->[$i] if ("." ne $a1->[$i]); + } + $^W = 1; +} + +# Add each event count to the CC array. '.' counts become undef, as do +# missing entries (implicitly). +sub line_to_CC ($) +{ + my @CC = (split /\s+/, $_[0]); + (@CC <= @events) or die("Line $.: too many event counts\n"); + return \@CC; +} + +sub read_input_file() +{ + open(INPUTFILE, "< $input_file") || die "File $input_file not opened\n"; + + # Read "desc:" lines. + my $line; + # This gives a "uninitialized value in substitution (s///)" warning; hmm... + #while ($line = && $line =~ s/desc:\s+//) { + # $desc .= "$line\n"; + #} + while (1) { + $line = ; + if ($line =~ s/desc:\s+//) { + $desc .= $line; + } else { + last; + } + } + + # Read "cmd:" line (Nb: will already be in $line from "desc:" loop above). + ($line =~ s/cmd:\s+//) or die("Line $.: missing command line\n"); + $cmd = $line; + chomp($cmd); # Remove newline + + # Read "events:" line. We make a temporary hash in which the Nth event's + # value is N, which is useful for handling --show/--sort options below. + $line = ; + ($line =~ s/events:\s+//) or die("Line $.: missing events line\n"); + @events = split(/\s+/, $line); + my %events; + my $n = 0; + foreach my $event (@events) { + $events{$event} = $n; + $n++ + } + + # If no --show arg give, default to showing all events in the file. + # If --show option is used, check all specified events appeared in the + # "events:" line. Then initialise @show_order. 
+ if (@show_events) { + foreach my $show_event (@show_events) { + (defined $events{$show_event}) or + die("--show event `$show_event' did not appear in input\n"); + } + } else { + @show_events = @events; + } + foreach my $show_event (@show_events) { + push(@show_order, $events{$show_event}); + } + + # Do as for --show, but if no --sort arg given, default to sorting by + # column order (ie. first column event is primary sort key, 2nd column is + # 2ndary key, etc). + if (@sort_events) { + foreach my $sort_event (@sort_events) { + (defined $events{$sort_event}) or + die("--sort event `$sort_event' did not appear in input\n"); + } + } else { + @sort_events = @events; + } + foreach my $sort_event (@sort_events) { + push(@sort_order, $events{$sort_event}); + } + + my $curr_file; + my $curr_fn; + my $curr_name; + + my $curr_fn_CC = []; + my $curr_file_ind_CCs = {}; # hash(line_num => CC) + + # Read body of input file. + while () { + s/#.*$//; # remove comments + if (s/^(\d+)\s+//) { + my $line_num = $1; + my $CC = line_to_CC($_); + add_array_a_to_b($CC, $curr_fn_CC); + + # If curr_file is selected, add CC to curr_file list. We look for + # full filename matches; or, if auto-annotating, we have to + # remember everything -- we won't know until the end what's needed. 
+ if ($auto_annotate || defined $user_ann_files{$curr_file}) { + my $tmp = $curr_file_ind_CCs->{$line_num}; + $tmp = [] unless defined $tmp; + add_array_a_to_b($CC, $tmp); + $curr_file_ind_CCs->{$line_num} = $tmp; + } + + } elsif (s/^fn=(.*)$//) { + # Commit result from previous function + $fn_totals{$curr_name} = $curr_fn_CC if (defined $curr_name); + + # Setup new one + $curr_fn = $1; + $curr_name = "$curr_file:$curr_fn"; + $curr_fn_CC = $fn_totals{$curr_name}; + $curr_fn_CC = [] unless (defined $curr_fn_CC); + + } elsif (s/^fl=(.*)$//) { + $all_ind_CCs{$curr_file} = $curr_file_ind_CCs + if (defined $curr_file); + + $curr_file = $1; + $curr_file_ind_CCs = $all_ind_CCs{$curr_file}; + $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs); + + } elsif (s/^(fi|fe)=(.*)$//) { + (defined $curr_name) or die("Line $.: Unexpected fi/fe line\n"); + $fn_totals{$curr_name} = $curr_fn_CC; + $all_ind_CCs{$curr_file} = $curr_file_ind_CCs; + + $curr_file = $2; + $curr_name = "$curr_file:$curr_fn"; + $curr_file_ind_CCs = $all_ind_CCs{$curr_file}; + $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs); + $curr_fn_CC = $fn_totals{$curr_name}; + $curr_fn_CC = [] unless (defined $curr_fn_CC); + + } elsif (s/^\s*$//) { + # blank, do nothing + + } elsif (s/^summary:\s+//) { + # Finish up handling final filename/fn_name counts + $fn_totals{"$curr_file:$curr_fn"} = $curr_fn_CC + if (defined $curr_file && defined $curr_fn); + $all_ind_CCs{$curr_file} = + $curr_file_ind_CCs if (defined $curr_file); + + $summary_CC = line_to_CC($_); + (scalar(@$summary_CC) == @events) + or die("Line $.: summary event and total event mismatch\n"); + + } else { + warn("WARNING: line $. 
malformed, ignoring\n"); + } + } + + # Check if summary line was present + if (not defined $summary_CC) { + warn("WARNING: missing final summary line, no summary will be printed\n"); + } + + close(INPUTFILE); +} + +#----------------------------------------------------------------------------- +# Print options used +#----------------------------------------------------------------------------- +sub print_options () +{ + print($fancy); + print($desc); + print("Command: $cmd\n"); + print("Events recorded: @events\n"); + print("Events shown: @show_events\n"); + print("Event sort order: @sort_events\n"); + print("Threshold: $threshold%\n"); + + my @include_dirs2 = @include_dirs; # copy @include_dirs + shift(@include_dirs2); # remove "" entry, which is always the first + unshift(@include_dirs2, "") if (0 == @include_dirs2); + my $include_dir = shift(@include_dirs2); + print("Include dirs: $include_dir\n"); + foreach my $include_dir (@include_dirs2) { + print(" $include_dir\n"); + } + + my @user_ann_files = keys %user_ann_files; + unshift(@user_ann_files, "") if (0 == @user_ann_files); + my $user_ann_file = shift(@user_ann_files); + print("User annotated: $user_ann_file\n"); + foreach $user_ann_file (@user_ann_files) { + print(" $user_ann_file\n"); + } + + my $is_on = ($auto_annotate ? "on" : "off"); + print("Auto-annotation: $is_on\n"); + print("\n"); +} + +#----------------------------------------------------------------------------- +# Print summary and sorted function totals +#----------------------------------------------------------------------------- +sub mycmp ($$) +{ + my ($c, $d) = @_; + + # Iterate through sort events (eg. 
3,2); return result if two are different + foreach my $i (@sort_order) { + my ($x, $y); + $x = $c->[$i]; + $y = $d->[$i]; + $x = -1 unless defined $x; + $y = -1 unless defined $y; + + my $cmp = $y <=> $x; # reverse sort + if (0 != $cmp) { + return $cmp; + } + } + # Exhausted events, equal + return 0; +} + +sub commify ($) { + my ($val) = @_; + 1 while ($val =~ s/^(\d+)(\d{3})/$1,$2/); + return $val; +} + +# Because the counts can get very big, and we don't want to waste screen space +# and make lines too long, we compute exactly how wide each column needs to be +# by finding the widest entry for each one. +sub compute_CC_col_widths (@) +{ + my @CCs = @_; + my $CC_col_widths = []; + + # Initialise with minimum widths (from event names) + foreach my $event (@events) { + push(@$CC_col_widths, length($event)); + } + + # Find maximum width count for each column. @CC_col_width positions + # correspond to @CC positions. + foreach my $CC (@CCs) { + foreach my $i (0 .. scalar(@$CC)-1) { + if (defined $CC->[$i]) { + # Find length, accounting for commas that will be added + my $length = length $CC->[$i]; + my $clength = $length + int(($length - 1) / 3); + $CC_col_widths->[$i] = max($CC_col_widths->[$i], $clength); + } + } + } + return $CC_col_widths; +} + +# Print the CC with each column's size dictated by $CC_col_widths. +sub print_CC ($$) +{ + my ($CC, $CC_col_widths) = @_; + + foreach my $i (@show_order) { + my $count = (defined $CC->[$i] ? 
commify($CC->[$i]) : "."); + my $space = ' ' x ($CC_col_widths->[$i] - length($count)); + print("$space$count "); + } +} + +sub print_events ($) +{ + my ($CC_col_widths) = @_; + + foreach my $i (@show_order) { + my $event = $events[$i]; + my $event_width = length($event); + my $col_width = $CC_col_widths->[$i]; + my $space = ' ' x ($col_width - $event_width); + print("$event$space "); + } +} + +# Prints summary and function totals (with separate column widths, so that +# function names aren't pushed over unnecessarily by huge summary figures). +# Also returns a hash containing all the files that are involved in getting the +# events count above the threshold (ie. all the interesting ones). +sub print_summary_and_fn_totals () +{ + my @fn_fullnames = keys %fn_totals; + + # Work out the size of each column for printing (summary and functions + # separately). + my $summary_CC_col_widths = compute_CC_col_widths($summary_CC); + my $fn_CC_col_widths = compute_CC_col_widths(values %fn_totals); + + # Header and counts for summary + print($fancy); + print_events($summary_CC_col_widths); + print("\n"); + print($fancy); + print_CC($summary_CC, $summary_CC_col_widths); + print(" PROGRAM TOTALS\n"); + print("\n"); + + # Header for functions + print($fancy); + print_events($fn_CC_col_widths); + print(" file:function\n"); + print($fancy); + + # Sort function names into order dictated by --sort option. + @fn_fullnames = sort { + mycmp($fn_totals{$a}, $fn_totals{$b}) + } @fn_fullnames; + + # The thresholded event is the one that is the primary sort event. + my $threshold_files = {}; + my $threshold_event_index = $sort_order[0]; + my $threshold_total = $summary_CC->[$threshold_event_index]; + my $curr_total = 0; + + # Print functions, stopping when the threshold has been reached. 
+ foreach my $fn_name (@fn_fullnames) { + + # Stop when we've reached the threshold + last if ($curr_total * 100 / $threshold_total >= $threshold); + + # Print function results + my $fn_CC = $fn_totals{$fn_name}; + print_CC($fn_CC, $fn_CC_col_widths); + print(" $fn_name\n"); + + # Update the threshold counting + my $filename = $fn_name; + $filename =~ s/:[^:]+$//; # remove function name + $threshold_files->{$filename} = 1; + $curr_total += $fn_CC->[$threshold_event_index] + if (defined $fn_CC->[$threshold_event_index]); + } + print("\n"); + + return $threshold_files; +} + +#----------------------------------------------------------------------------- +# Annotate selected files +#----------------------------------------------------------------------------- + +# Issue a warning that the source file is more recent than the input file. +sub warning_on_src_more_recent_than_inputfile ($) +{ + my $src_file = $_[0]; + + my $warning = <{"???"}; + %all_ann_files = (%user_ann_files, %$threshold_files) + } else { + %all_ann_files = %user_ann_files; + } + + # Track if we did any annotations. + my $did_annotations = 0; + + LOOP: + foreach my $src_file (keys %all_ann_files) { + + my $opened_file = ""; + my $full_file_name = ""; + foreach my $include_dir (@include_dirs) { + my $try_name = $include_dir . $src_file; + if (open(INPUTFILE, "< $try_name")) { + $opened_file = $try_name; + $full_file_name = ($include_dir eq "" + ? $src_file + : "$include_dir + $src_file"); + last; + } + } + + if (not $opened_file) { + # Failed to open the file. If chosen on the command line, die. + # If arose from auto-annotation, print a little message. + if (defined $user_ann_files{$src_file}) { + die("File $src_file not opened in any of: @include_dirs\n"); + + } else { + push(@unfound_auto_annotate_files, $src_file); + } + + } else { + # File header (distinguish between user- and auto-selected files). + print("$fancy"); + my $ann_type = + (defined $user_ann_files{$src_file} ? 
"User" : "Auto"); + print("-- $ann_type-annotated source: $full_file_name\n"); + print("$fancy"); + + # Get file's CCs + my $src_file_CCs = $all_ind_CCs{$src_file}; + if (!defined $src_file_CCs) { + print(" No information has been collected for $src_file\n\n"); + next LOOP; + } + + $did_annotations = 1; + + # Numeric, not lexicographic sort! + my @line_nums = sort {$a <=> $b} keys %$src_file_CCs; + + # If $src_file more recent than cachegrind.out, issue warning + my $src_more_recent_than_inputfile = 0; + if ((stat $opened_file)[9] > (stat $input_file)[9]) { + $src_more_recent_than_inputfile = 1; + warning_on_src_more_recent_than_inputfile($src_file); + } + + # Work out the size of each column for printing + my $CC_col_widths = compute_CC_col_widths(values %$src_file_CCs); + + # Events header + print_events($CC_col_widths); + print("\n\n"); + + # Shift out 0 if it's in the line numbers (from unknown entries, + # likely due to bugs in Valgrind's stabs debug info reader) + shift(@line_nums) if (0 == $line_nums[0]); + + # Finds interesting line ranges -- all lines with a CC, and all + # lines within $context lines of a line with a CC. + my $n = @line_nums; + my @pairs; + for (my $i = 0; $i < $n; $i++) { + push(@pairs, $line_nums[$i] - $context); # lower marker + while ($i < $n-1 && + $line_nums[$i] + 2*$context >= $line_nums[$i+1]) { + $i++; + } + push(@pairs, $line_nums[$i] + $context); # upper marker + } + + # Annotate chosen lines, tracking total counts of lines printed + $pairs[0] = 1 if ($pairs[0] < 1); + while (@pairs) { + my $low = shift @pairs; + my $high = shift @pairs; + while ($. < $low-1) { + my $tmp = ; + last unless (defined $tmp); # hack to detect EOF + } + my $src_line; + # Print line number, unless start of file + print("-- line $low " . '-' x 40 . "\n") if ($low != 1); + while (($. < $high) && ($src_line = )) { + if (defined $line_nums[0] && $. 
== $line_nums[0]) { + print_CC($src_file_CCs->{$.}, $CC_col_widths); + add_array_a_to_b($src_file_CCs->{$.}, + $printed_totals_CC); + shift(@line_nums); + + } else { + print_CC( [], $CC_col_widths); + } + + print(" $src_line"); + } + # Print line number, unless EOF + if ($src_line) { + print("-- line $high " . '-' x 40 . "\n"); + } else { + last; + } + } + + # If there was info on lines past the end of the file... + if (@line_nums) { + foreach my $line_num (@line_nums) { + print_CC($src_file_CCs->{$line_num}, $CC_col_widths); + print(" \n"); + } + print("\n"); + warning_on_nonexistent_lines($src_more_recent_than_inputfile, + $src_file, \@line_nums); + } + print("\n"); + + # Print summary of counts attributed to file but not to any + # particular line (due to incomplete debug info). + if ($src_file_CCs->{0}) { + print_CC($src_file_CCs->{0}, $CC_col_widths); + print(" \n\n"); + } + + close(INPUTFILE); + } + } + + # Print list of unfound auto-annotate selected files. + if (@unfound_auto_annotate_files) { + print("$fancy"); + print("The following files chosen for auto-annotation could not be found:\n"); + print($fancy); + foreach my $f (@unfound_auto_annotate_files) { + print(" $f\n"); + } + print("\n"); + } + + # If we did any annotating, print what proportion of events were covered by + # annotated lines above. 
+ if ($did_annotations) { + my $percent_printed_CC; + foreach (my $i = 0; $i < @$summary_CC; $i++) { + $percent_printed_CC->[$i] = + sprintf("%.0f", + $printed_totals_CC->[$i] / $summary_CC->[$i] * 100); + } + my $pp_CC_col_widths = compute_CC_col_widths($percent_printed_CC); + print($fancy); + print_events($pp_CC_col_widths); + print("\n"); + print($fancy); + print_CC($percent_printed_CC, $pp_CC_col_widths); + print(" percentage of events annotated\n\n"); + } +} + +#---------------------------------------------------------------------------- +# "main()" +#---------------------------------------------------------------------------- +process_cmd_line(); +read_input_file(); +print_options(); +my $threshold_files = print_summary_and_fn_totals(); +annotate_ann_files($threshold_files); + diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c new file mode 100644 index 0000000000..ea0cb410a2 --- /dev/null +++ b/cachegrind/cg_main.c @@ -0,0 +1,1068 @@ +/*--------------------------------------------------------------------*/ +/*--- The cache simulation framework: instrumentation, recording ---*/ +/*--- and results printing. ---*/ +/*--- vg_cachesim.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, an x86 protected-mode emulator + designed for debugging and profiling binaries on x86-Unixes. + + Copyright (C) 2000-2002 Julian Seward + jseward@acm.org + Julian_Seward@muraroa.demon.co.uk + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307, USA.
+
+   The GNU General Public License is contained in the file LICENSE.
+*/
+
+/* NOTE(review): the header name of the next #include is missing in this
+ * copy of the patch.  The file calls strcmp() unprefixed, so it was
+ * presumably <string.h> -- confirm against the original commit. */
+#include
+
+#include "vg_include.h"
+
+/* The generated simulators are textually included.  L2 comes first because
+ * the I1 and D1 simulators call cachesim_L2_doref() on a miss. */
+#include "vg_cachesim_L2.c"
+#include "vg_cachesim_I1.c"
+#include "vg_cachesim_D1.c"
+
+
+/* According to IA-32 Intel Architecture Software Developer's Manual: Vol 2 */
+#define MAX_x86_INSTR_SIZE 16
+
+/* Size of various buffers used for storing strings */
+#define FILENAME_LEN 256
+#define FN_NAME_LEN 256
+#define BUF_LEN 512
+#define COMMIFY_BUF_LEN 128
+#define RESULTS_BUF 128
+
+/*------------------------------------------------------------*/
+/*--- Output file related stuff ---*/
+/*------------------------------------------------------------*/
+
+#define OUT_FILE "cachegrind.out"
+
+/* Report that OUT_FILE could not be opened and terminate via VG_(exit)(1). */
+static void file_err()
+{
+   VG_(message)(Vg_UserMsg,
+                "FATAL: can't open cache simulation output file `%s'",
+                OUT_FILE );
+   VG_(exit)(1);
+}
+
+/*------------------------------------------------------------*/
+/*--- Cost center types, operations ---*/
+/*------------------------------------------------------------*/
+
+/* One set of counters.  The log helpers bump `a' once per simulated
+ * reference and hand `m1'/`m2' to the first-level and L2 simulators, which
+ * count misses in them. */
+typedef struct _CC CC;
+struct _CC {
+   ULong a;    /* accesses */
+   ULong m1;   /* first-level (I1 or D1) misses */
+   ULong m2;   /* L2 misses */
+};
+
+/* Zero all three counters of `cc'. */
+static __inline__ void initCC(CC* cc) {
+   cc->a = 0;
+   cc->m1 = 0;
+   cc->m2 = 0;
+}
+
+
+/* Tag stored in the first byte of every cost centre: instruction only,
+ * instruction + data read, instruction + data write, or instruction +
+ * read-and-write ("modify"). */
+typedef enum { INSTR_CC, READ_CC, WRITE_CC, MOD_CC } CC_type;
+
+/* Instruction-level cost-centres. The typedefs for these structs are in
+ * vg_include.h
+ *
+ * WARNING: the 'tag' field *must* be the first byte of both CC types.
+ * the 'instr_addr' *must* be the second word of both CC types.
+ *
+ * This is because we use them when we don't know what type of CC we're dealing
+ * with.
+ */
+struct _iCC {
+   /* word 1 */
+   UChar tag;
+   UChar instr_size;
+
+   /* words 2+ */
+   Addr instr_addr;
+   CC I;
+};
+
+struct _idCC {
+   /* word 1 */
+   UChar tag;
+   UChar instr_size;
+   UChar data_size;
+
+   /* words 2+ */
+   Addr instr_addr;
+   CC I;
+   CC D;
+};
+
+/* Set up `cc' as an instruction-only cost centre for the instruction of
+ * `instr_size' bytes at `instr_addr', with zeroed counters. */
+static void init_iCC(iCC* cc, Addr instr_addr, UInt instr_size)
+{
+   initCC(&cc->I);
+   cc->instr_addr = instr_addr;
+   cc->instr_size = instr_size;
+   cc->tag        = INSTR_CC;
+}
+
+/* Set up `cc' as a memory-referencing cost centre of kind `X_CC' for the
+ * instruction of `instr_size' bytes at `instr_addr' touching `data_size'
+ * bytes of data.  Both counter sets start at zero. */
+static void init_idCC(CC_type X_CC, idCC* cc, Addr instr_addr,
+                      UInt instr_size, UInt data_size)
+{
+   initCC(&cc->I);
+   initCC(&cc->D);
+   cc->instr_addr = instr_addr;
+   cc->data_size  = data_size;
+   cc->instr_size = instr_size;
+   cc->tag        = X_CC;
+}
+
+/* Format one output-file line for an instruction-only CC: the line number
+ * followed by the three instruction counters. */
+static __inline__ void sprint_iCC(Char buf[BUF_LEN], UInt ln, iCC* cc)
+{
+   CC* instr = &cc->I;
+
+   VG_(sprintf)(buf, "%u %llu %llu %llu\n",
+                     ln, instr->a, instr->m1, instr->m2);
+}
+
+/* Format one output-file line for a read or modify CC: instruction counters
+ * immediately followed by the data counters. */
+static __inline__ void sprint_read_or_mod_CC(Char buf[BUF_LEN], UInt ln,
+                                             idCC* cc)
+{
+   CC* instr = &cc->I;
+   CC* data  = &cc->D;
+
+   VG_(sprintf)(buf, "%u %llu %llu %llu %llu %llu %llu\n",
+                     ln, instr->a, instr->m1, instr->m2,
+                         data->a,  data->m1,  data->m2);
+}
+
+/* Format one output-file line for a write CC: the three read columns are
+ * printed as dots so the data counters land in the write columns. */
+static __inline__ void sprint_write_CC(Char buf[BUF_LEN], UInt ln, idCC* cc)
+{
+   CC* instr = &cc->I;
+   CC* data  = &cc->D;
+
+   VG_(sprintf)(buf, "%u %llu %llu %llu . . . %llu %llu %llu\n",
+                     ln, instr->a, instr->m1, instr->m2,
+                         data->a,  data->m1,  data->m2);
+}
+
+/*------------------------------------------------------------*/
+/*--- BBCC hash table stuff ---*/
+/*------------------------------------------------------------*/
+
+/* The table of BBCCs is of the form hash(filename, hash(fn_name,
+ * hash(BBCCs))). Each hash table is separately chained. The sizes below work
+ * fairly well for Konqueror. */
+
+#define N_FILE_ENTRIES 251
+#define N_FN_ENTRIES 53
+#define N_BBCC_ENTRIES 37
+
+/* The cost centres for a basic block are stored in a contiguous array.
+ * They are distinguishable by their tag field.
*/
+typedef struct _BBCC BBCC;
+struct _BBCC {
+   Addr orig_addr;
+   UInt array_size; /* byte-size of variable length array */
+   BBCC* next;
+   Addr array[0]; /* variable length array */
+};
+
+typedef struct _fn_node fn_node;
+struct _fn_node {
+   Char* fn_name;
+   BBCC* BBCCs[N_BBCC_ENTRIES];
+   fn_node* next;
+};
+
+typedef struct _file_node file_node;
+struct _file_node {
+   Char* filename;
+   fn_node* fns[N_FN_ENTRIES];
+   file_node* next;
+};
+
+/* BBCC_table structure: list(filename, list(fn_name, list(BBCC))) */
+file_node *BBCC_table[N_FILE_ENTRIES];
+
+Int distinct_files = 0;
+Int distinct_fns = 0;
+
+Int distinct_instrs = 0;
+Int full_debug_BBs = 0;
+Int file_line_debug_BBs = 0;
+Int fn_name_debug_BBs = 0;
+Int no_debug_BBs = 0;
+
+Int BB_retranslations = 0;
+
+/* Empty every bucket of the top-level (per-file) hash table. */
+static void init_BBCC_table()
+{
+   Int bucket = N_FILE_ENTRIES;
+
+   while (bucket-- > 0)
+      BBCC_table[bucket] = NULL;
+}
+
+/* Look up the source file and function containing `instr_addr'.  Whichever
+ * of the two cannot be found is reported as "???".  Exactly one of the four
+ * debug-info counters is bumped, according to what was available. */
+static void get_file_fn_names(Addr instr_addr, Char filename[FILENAME_LEN],
+                              Char fn_name[FN_NAME_LEN])
+{
+   UInt unused_line_num;
+   Bool have_file, have_fn;
+
+   have_file = VG_(what_line_is_this)(instr_addr, filename,
+                                      FILENAME_LEN, &unused_line_num);
+   /* First argument False: same "no_demangle" setting as always. */
+   have_fn   = VG_(what_fn_is_this)(False, instr_addr, fn_name, FN_NAME_LEN);
+
+   if (!have_file) VG_(strcpy)(filename, "???");
+   if (!have_fn)   VG_(strcpy)(fn_name, "???");
+
+   if (have_file) {
+      if (have_fn)
+         full_debug_BBs++;
+      else
+         file_line_debug_BBs++;
+   } else {
+      if (have_fn)
+         fn_name_debug_BBs++;
+      else
+         no_debug_BBs++;
+   }
+}
+
+/* Forward declaration.
*/
+static Int compute_BBCC_array_size(UCodeBlock* cb);
+
+/* Allocate a file_node for `filename' (the name is duplicated) with an
+ * empty function table, chained in front of `next'. */
+static __inline__
+file_node* new_file_node(Char filename[FILENAME_LEN], file_node* next)
+{
+   Int i;
+   file_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(file_node));
+   new->filename = VG_(strdup)(VG_AR_PRIVATE, filename);
+   for (i = 0; i < N_FN_ENTRIES; i++) {
+      new->fns[i] = NULL;
+   }
+   new->next = next;
+   return new;
+}
+
+/* Allocate a fn_node for `fn_name' (the name is duplicated) with an empty
+ * BBCC table, chained in front of `next'.  The parameter is sized with
+ * FN_NAME_LEN, matching the buffers callers pass in. */
+static __inline__
+fn_node* new_fn_node(Char fn_name[FN_NAME_LEN], fn_node* next)
+{
+   Int i;
+   fn_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(fn_node));
+   new->fn_name = VG_(strdup)(VG_AR_PRIVATE, fn_name);
+   for (i = 0; i < N_BBCC_ENTRIES; i++) {
+      new->BBCCs[i] = NULL;
+   }
+   new->next = next;
+   return new;
+}
+
+/* Allocate a BBCC for the basic block at `bb_orig_addr', with room for the
+ * cost-centre array whose byte size is computed from `cb', chained in front
+ * of `next'.  The array contents are NOT initialised here. */
+static __inline__
+BBCC* new_BBCC(Addr bb_orig_addr, UCodeBlock* cb, BBCC* next)
+{
+   Int BBCC_array_size = compute_BBCC_array_size(cb);
+   BBCC* new;
+
+   new = (BBCC*)VG_(malloc)(VG_AR_PRIVATE, sizeof(BBCC) + BBCC_array_size);
+   new->orig_addr = bb_orig_addr;
+   new->array_size = BBCC_array_size;
+   new->next = next;
+
+   return new;
+}
+
+#define HASH_CONSTANT 256
+
+/* Hash the NUL-terminated string `s' to a bucket index below `table_size'.
+ * The running value is reduced modulo table_size after every character. */
+static UInt hash(Char *s, UInt table_size)
+{
+   int hash_value = 0;
+   for ( ; *s; s++)
+      hash_value = (HASH_CONSTANT * hash_value + *s) % table_size;
+   return hash_value;
+}
+
+/* Do a three step traversal: by filename, then fn_name, then instr_addr.
+ * In all cases prepends new nodes to their chain. Returns a pointer to the
+ * cost centre. Also sets BB_seen_before by reference.
+ */
+static __inline__ BBCC* get_BBCC(Addr bb_orig_addr, UCodeBlock* cb,
+                                 Bool *BB_seen_before)
+{
+   file_node *curr_file_node;
+   fn_node   *curr_fn_node;
+   BBCC      *curr_BBCC;
+   Char       filename[FILENAME_LEN], fn_name[FN_NAME_LEN];
+   UInt       filename_hash, fnname_hash, BBCC_hash;
+
+   get_file_fn_names(bb_orig_addr, filename, fn_name);
+
+   VGP_PUSHCC(VgpCacheGetBBCC);
+
+   /* Step 1: find (or create) the node for this source file.  String
+    * comparisons go through VG_(strcmp), like the rest of this file's
+    * string handling. */
+   filename_hash  = hash(filename, N_FILE_ENTRIES);
+   curr_file_node = BBCC_table[filename_hash];
+   while (NULL != curr_file_node &&
+          VG_(strcmp)(filename, curr_file_node->filename) != 0) {
+      curr_file_node = curr_file_node->next;
+   }
+   if (NULL == curr_file_node) {
+      BBCC_table[filename_hash] = curr_file_node =
+         new_file_node(filename, BBCC_table[filename_hash]);
+      distinct_files++;
+   }
+
+   /* Step 2: find (or create) the node for this function within the file. */
+   fnname_hash  = hash(fn_name, N_FN_ENTRIES);
+   curr_fn_node = curr_file_node->fns[fnname_hash];
+   while (NULL != curr_fn_node &&
+          VG_(strcmp)(fn_name, curr_fn_node->fn_name) != 0) {
+      curr_fn_node = curr_fn_node->next;
+   }
+   if (NULL == curr_fn_node) {
+      curr_file_node->fns[fnname_hash] = curr_fn_node =
+         new_fn_node(fn_name, curr_file_node->fns[fnname_hash]);
+      distinct_fns++;
+   }
+
+   /* Step 3: find (or create) the BBCC itself, keyed by the block's
+    * original address. */
+   BBCC_hash = bb_orig_addr % N_BBCC_ENTRIES;
+   curr_BBCC = curr_fn_node->BBCCs[BBCC_hash];
+   while (NULL != curr_BBCC && bb_orig_addr != curr_BBCC->orig_addr) {
+      curr_BBCC = curr_BBCC->next;
+   }
+   if (curr_BBCC == NULL) {
+      curr_fn_node->BBCCs[BBCC_hash] = curr_BBCC =
+         new_BBCC(bb_orig_addr, cb, curr_fn_node->BBCCs[BBCC_hash]);
+      *BB_seen_before = False;
+
+   } else {
+      /* Already known: the block is being retranslated, so its existing
+       * cost centres (and their counts) are reused. */
+      vg_assert(bb_orig_addr == curr_BBCC->orig_addr);
+      vg_assert(curr_BBCC->array_size > 0 && curr_BBCC->array_size < 1000000);
+      if (VG_(clo_verbosity) > 1) {
+         VG_(message)(Vg_DebugMsg, "BB retranslation, retrieving from BBCC table");
+      }
+      *BB_seen_before = True;
+      BB_retranslations++;
+   }
+   VGP_POPCC;
+   return curr_BBCC;
+}
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation instrumentation phase ---*/
+/*------------------------------------------------------------*/
+
+#define uInstr1 VG_(newUInstr1)
+#define uInstr2 VG_(newUInstr2)
+#define uInstr3 VG_(newUInstr3)
+#define dis VG_(disassemble)
+#define uLiteral VG_(setLiteralField)
+#define newTemp VG_(getNewTemp)
+
+/* Counting pass over a basic block: returns the number of bytes needed for
+ * its cost-centre array.  Every x86 instruction (ended by an INCEIP or an
+ * unconditional JMP) contributes one idCC if a LOAD, STORE, FPU_R or FPU_W
+ * was seen since the previous instruction end, and one iCC otherwise. */
+static Int compute_BBCC_array_size(UCodeBlock* cb)
+{
+   Int  idx, CC_size, BBCC_size = 0;
+   Bool is_LOAD = False, is_STORE = False, is_FPU_R = False, is_FPU_W = False;
+
+   for (idx = 0; idx < cb->used; idx++) {
+      UInstr* u_in      = &cb->instrs[idx];
+      Bool    ends_instr = False;
+
+      switch (u_in->opcode) {
+
+         case INCEIP:
+            ends_instr = True;
+            break;
+
+         case JMP:
+            /* Only an unconditional jump closes an instruction here. */
+            ends_instr = (u_in->cond == CondAlways);
+            break;
+
+         case LOAD:
+            /* Two LDBs are possible for a single instruction */
+            vg_assert(/*!is_LOAD &&*/ !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_LOAD = True;
+            break;
+
+         case STORE:
+            /* Multiple STOREs are possible for 'pushal' */
+            vg_assert( /*!is_STORE &&*/ !is_FPU_R && !is_FPU_W);
+            is_STORE = True;
+            break;
+
+         case FPU_R:
+            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_FPU_R = True;
+            break;
+
+         case FPU_W:
+            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
+            is_FPU_W = True;
+            break;
+
+         default:
+            break;
+      }
+
+      if (ends_instr) {
+         if (is_LOAD || is_STORE || is_FPU_R || is_FPU_W)
+            CC_size = sizeof(idCC);
+         else
+            CC_size = sizeof(iCC);
+
+         BBCC_size += CC_size;
+         is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
+      }
+   }
+
+   return BBCC_size;
+}
+
+/* Use this rather than eg. -1 because it's stored as a UInt.
*/
+#define INVALID_DATA_SIZE 999999
+
+/* Instrumentation pass for one basic block.  Builds a new UCodeBlock that
+ * contains cb_in's uinstrs (minus NOP, CALLM_S and CALLM_E, which are
+ * dropped) plus, at the end of every x86 instruction, a call to one of the
+ * two logging helpers with the address of that instruction's cost centre
+ * (and, for memory-referencing instructions, the data address).  Cost
+ * centres are initialised only the first time the block is seen.
+ * cb_in is freed; the new block is returned. */
+UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr)
+{
+   UCodeBlock* cb;
+   Int i;
+   UInstr* u_in;
+   BBCC* BBCC_node;
+   Int t_CC_addr, t_read_addr, t_write_addr, t_data_addr;
+   Int CC_size = -1; /* Shut gcc warnings up */
+   Addr instr_addr = orig_addr;
+   UInt instr_size, data_size = INVALID_DATA_SIZE;
+   Int helper = -1; /* Shut gcc warnings up */
+   UInt stack_used;
+   Bool BB_seen_before = False;
+   Bool prev_instr_was_Jcond = False;
+   Addr BBCC_ptr0, BBCC_ptr;
+
+   /* Get BBCC (creating if necessary -- requires a counting pass over the BB
+    * if it's the first time it's been seen), and point to start of the
+    * BBCC array. */
+   BBCC_node = get_BBCC(orig_addr, cb_in, &BB_seen_before);
+   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
+
+   cb = VG_(allocCodeBlock)();
+   cb->nextTemp = cb_in->nextTemp;
+
+   t_CC_addr = t_read_addr = t_write_addr = t_data_addr = INVALID_TEMPREG;
+
+   for (i = 0; i < cb_in->used; i++) {
+      u_in = &cb_in->instrs[i];
+
+      //VG_(ppUInstr)(0, u_in);
+
+      /* What this is all about: we want to instrument each x86 instruction
+       * translation. The end of these are marked in three ways. The three
+       * ways, and the way we instrument them, are as follows:
+       *
+       * 1. UCode, INCEIP --> UCode, Instrumentation, INCEIP
+       * 2. UCode, Juncond --> UCode, Instrumentation, Juncond
+       * 3. UCode, Jcond, Juncond --> UCode, Instrumentation, Jcond, Juncond
+       *
+       * We must put the instrumentation before the jumps so that it is always
+       * executed. We don't have to put the instrumentation before the INCEIP
+       * (it could go after) but we do so for consistency.
+       *
+       * Junconds are always the last instruction in a basic block. Jconds are
+       * always the 2nd last, and must be followed by a Juncond. We check this
+       * with various assertions.
+       *
+       * Note that in VG_(disBB) we patched the `extra4b' field of the first
+       * occurring JMP in a block with the size of its x86 instruction. This
+       * is used now.
+       *
+       * Note that we don't have to treat JIFZ specially; unlike JMPs, JIFZ
+       * occurs in the middle of a BB and gets an INCEIP after it.
+       *
+       * The instrumentation is just a call to the appropriate helper function,
+       * passing it the address of the instruction's CC.
+       */
+      if (prev_instr_was_Jcond) vg_assert(u_in->opcode == JMP);
+
+      switch (u_in->opcode) {
+
+         case INCEIP:
+            instr_size = u_in->val1;
+            goto case_for_end_of_x86_instr;
+
+         case JMP:
+            if (u_in->cond == CondAlways) {
+               vg_assert(i+1 == cb_in->used);
+
+               /* Don't instrument if previous instr was a Jcond. */
+               if (prev_instr_was_Jcond) {
+                  vg_assert(0 == u_in->extra4b);
+                  VG_(copyUInstr)(cb, u_in);
+                  break;
+               }
+               prev_instr_was_Jcond = False;
+
+            } else {
+               vg_assert(i+2 == cb_in->used); /* 2nd last instr in block */
+               prev_instr_was_Jcond = True;
+            }
+
+            /* Ah, the first JMP... instrument, please. */
+            instr_size = u_in->extra4b;
+            goto case_for_end_of_x86_instr;
+
+            /* Shared code that is executed at the end of an x86 translation
+             * block, marked by either an INCEIP or an unconditional JMP. */
+         case_for_end_of_x86_instr:
+
+/* True if a read/write address temp was recorded for this instruction. */
+#define IS_(X) (INVALID_TEMPREG != t_##X##_addr)
+
+            /* Initialise the CC in the BBCC array appropriately if it hasn't
+             * been initialised before.
+             * Then call appropriate sim function, passing it the CC address.
+             * Note that CALLM_S/CALLM_E aren't required here; by this point,
+             * the checking related to them has already happened. */
+            stack_used = 0;
+
+            vg_assert(instr_size >= 1 && instr_size <= MAX_x86_INSTR_SIZE);
+            vg_assert(0 != instr_addr);
+
+            /* Save the caller-save registers before we push our args */
+            uInstr1(cb, PUSH, 4, RealReg, R_EAX);
+            uInstr1(cb, PUSH, 4, RealReg, R_ECX);
+            uInstr1(cb, PUSH, 4, RealReg, R_EDX);
+
+            if (!IS_(read) && !IS_(write)) {
+               /* No memory reference: plain instruction cost centre. */
+               iCC* CC_ptr = (iCC*)(BBCC_ptr);
+               vg_assert(INVALID_DATA_SIZE == data_size);
+               vg_assert(INVALID_TEMPREG == t_read_addr &&
+                         INVALID_TEMPREG == t_write_addr);
+               CC_size = sizeof(iCC);
+               if (!BB_seen_before)
+                  init_iCC(CC_ptr, instr_addr, instr_size);
+
+               helper = VGOFF_(cachesim_log_non_mem_instr);
+
+            } else {
+               CC_type X_CC;
+               idCC* CC_ptr = (idCC*)(BBCC_ptr);
+
+               vg_assert(4 == data_size || 2 == data_size || 1 == data_size ||
+                         8 == data_size || 10 == data_size);
+
+               CC_size = sizeof(idCC);
+               helper = VGOFF_(cachesim_log_mem_instr);
+
+               if (IS_(read) && !IS_(write)) {
+                  X_CC = READ_CC;
+                  vg_assert(INVALID_TEMPREG != t_read_addr &&
+                            INVALID_TEMPREG == t_write_addr);
+                  t_data_addr = t_read_addr;
+
+               } else if (!IS_(read) && IS_(write)) {
+                  X_CC = WRITE_CC;
+                  vg_assert(INVALID_TEMPREG == t_read_addr &&
+                            INVALID_TEMPREG != t_write_addr);
+                  t_data_addr = t_write_addr;
+
+               } else {
+                  /* Both seen: a modify; the read address is passed on. */
+                  vg_assert(IS_(read) && IS_(write));
+                  X_CC = MOD_CC;
+                  vg_assert(INVALID_TEMPREG != t_read_addr &&
+                            INVALID_TEMPREG != t_write_addr);
+                  t_data_addr = t_read_addr;
+               }
+
+               if (!BB_seen_before)
+                  init_idCC(X_CC, CC_ptr, instr_addr, instr_size, data_size);
+
+               /* 2nd arg: data addr */
+               uInstr1(cb, PUSH, 4, TempReg, t_data_addr);
+               stack_used += 4;
+            }
+#undef IS_
+
+            /* 1st arg: CC addr */
+            t_CC_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, Literal, 0, TempReg, t_CC_addr);
+            uLiteral(cb, BBCC_ptr);
+            uInstr1(cb, PUSH, 4, TempReg, t_CC_addr);
+            stack_used += 4;
+
+            /* Call function and return. */
+            uInstr1(cb, CALLM, 0, Lit16, helper);
+            uInstr1(cb, CLEAR, 0, Lit16, stack_used);
+
+            /* Restore the caller-save registers now the call is done */
+            uInstr1(cb, POP, 4, RealReg, R_EDX);
+            uInstr1(cb, POP, 4, RealReg, R_ECX);
+            uInstr1(cb, POP, 4, RealReg, R_EAX);
+
+            VG_(copyUInstr)(cb, u_in);
+
+            /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */
+            BBCC_ptr += CC_size;
+            instr_addr += instr_size;
+            t_CC_addr = t_read_addr = t_write_addr =
+                        t_data_addr = INVALID_TEMPREG;
+            data_size = INVALID_DATA_SIZE;
+            break;
+
+
+         /* For memory-ref instrs, copy the data_addr into a temporary to be
+          * passed to the cachesim_log_function at the end of the instruction.
+          */
+         case LOAD:
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val1, TempReg, t_read_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         case FPU_R:
+            t_read_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_read_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         /* Note that we must set t_write_addr even for mod instructions;
+          * that's how the code above determines whether it does a write;
+          * without it, it would think a mod instruction is a read.
+          * As for the MOV, if it's a mod instruction it's redundant, but it's
+          * not expensive and mod instructions are rare anyway. */
+         case STORE:
+         case FPU_W:
+            t_write_addr = newTemp(cb);
+            uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_write_addr);
+            data_size = u_in->size;
+            VG_(copyUInstr)(cb, u_in);
+            break;
+
+         /* These are not copied into the instrumented block. */
+         case NOP: case CALLM_E: case CALLM_S:
+            break;
+
+         default:
+            VG_(copyUInstr)(cb, u_in);
+            break;
+      }
+   }
+
+   /* Just check everything looks ok */
+   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
+
+   VG_(freeCodeBlock)(cb_in);
+   return cb;
+}
+
+/*------------------------------------------------------------*/
+/*--- Cache simulation stuff ---*/
+/*------------------------------------------------------------*/
+
+/* Total reads/writes/misses. Calculated during CC traversal at the end. */
+static CC Ir_total;
+static CC Dr_total;
+static CC Dw_total;
+
+/* One-time set-up: check that OUT_FILE can be opened for writing (creating
+ * it if the plain open fails, and bailing out through file_err() if that
+ * fails too), then zero the totals, the three caches and the BBCC table. */
+void VG_(init_cachesim)(void)
+{
+   /* Make sure the output file can be written. */
+   Int fd = VG_(open_write)(OUT_FILE);
+   if (-1 == fd) {
+      fd = VG_(create_and_write)(OUT_FILE);
+      if (-1 == fd) {
+         file_err();
+      }
+   }
+   VG_(close)(fd);
+
+   initCC(&Ir_total);
+   initCC(&Dr_total);
+   initCC(&Dw_total);
+
+   cachesim_I1_initcache();
+   cachesim_D1_initcache();
+   cachesim_L2_initcache();
+
+   init_BBCC_table();
+}
+
+/* Helper called from instrumented code for an instruction without a memory
+ * reference: simulate the instruction fetch and count one access. */
+void VG_(cachesim_log_non_mem_instr)(iCC* cc)
+{
+   //VG_(printf)("sim I: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
+   // cc, cc->instr_addr, cc->instr_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
+   cc->I.a++;
+   VGP_POPCC;
+}
+
+/* Helper called from instrumented code for a memory-referencing
+ * instruction: simulate the instruction fetch, then the data reference at
+ * `data_addr', counting one access for each. */
+void VG_(cachesim_log_mem_instr)(idCC* cc, Addr data_addr)
+{
+   //VG_(printf)("sim D: CCaddr=0x%x, iaddr=0x%x, isize=%u, daddr=0x%x, dsize=%u\n",
+   // cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size)
+   VGP_PUSHCC(VgpCacheSimulate);
+   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
+   cc->I.a++;
+
+   cachesim_D1_doref(data_addr, cc->data_size, &cc->D.m1, &cc->D.m2);
+   cc->D.a++;
+   VGP_POPCC;
+}
+
+/*------------------------------------------------------------*/
+/*--- Printing of output
file and summary stats ---*/
+/*------------------------------------------------------------*/
+
+/* Source line number for `instr_addr', or 0 if no line info is available. */
+int get_line_num(Addr instr_addr)
+{
+   Char filename[FILENAME_LEN] = "???";
+   UInt line_num;
+   Bool found;
+
+   found = VG_(what_line_is_this)(instr_addr, filename,
+                                  FILENAME_LEN, &line_num);
+   if (!found) {
+      line_num = 0;
+   }
+   return line_num;
+}
+
+/* Write one output line per cost centre of `BBCC_node' to `fd' and add the
+ * counters to the global totals.  `first_instr_fl' / `first_instr_fn' are
+ * the file and function the BBCC is filed under; a "fi=" line is emitted
+ * whenever an instruction belongs to a different file, and a final "fe="
+ * line switches back if the block ended in another file.
+ *
+ * NOTE(review): get_file_fn_names() also bumps the *_debug_BBs counters, so
+ * the "BB lookups" statistics printed afterwards include one lookup per
+ * instruction dumped here -- confirm whether that is intended. */
+static void fprint_BBCC(Int fd, BBCC* BBCC_node, Char *first_instr_fl,
+                        Char *first_instr_fn)
+{
+   Addr BBCC_ptr0, BBCC_ptr;
+   Char buf[BUF_LEN], curr_file[BUF_LEN], fbuf[BUF_LEN+4];
+   UInt line_num;
+
+   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
+
+   VG_(write)(fd, (void*)"\n", 1);
+
+   VG_(strcpy)(curr_file, first_instr_fl);
+
+   while (BBCC_ptr - BBCC_ptr0 < BBCC_node->array_size) {
+
+      /* We pretend the CC is an iCC for getting the tag. This is ok
+       * because both CC types have tag as their first byte. Once we know
+       * the type, we can cast and act appropriately. */
+
+      Char fl_buf[FILENAME_LEN];
+      Char fn_buf[FN_NAME_LEN];
+
+      /* Assumes instr_addr position is same for both CCs. */
+      Addr instr_addr = ((iCC*)BBCC_ptr)->instr_addr;
+      get_file_fn_names(instr_addr, fl_buf, fn_buf);
+
+      /* Allow for filename switching in the middle of a BB; if this happens,
+       * must print the new filename with the function name. */
+      if (0 != VG_(strcmp)(fl_buf, curr_file)) {
+         VG_(strcpy)(curr_file, fl_buf);
+         VG_(sprintf)(fbuf, "fi=%s\n", curr_file);
+         VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
+      }
+
+      switch ( ((iCC*)BBCC_ptr)->tag ) {
+
+#define ADD_CC_TO(CC_type, cc, total) \
+   total.a += ((CC_type*)BBCC_ptr)->cc.a; \
+   total.m1 += ((CC_type*)BBCC_ptr)->cc.m1; \
+   total.m2 += ((CC_type*)BBCC_ptr)->cc.m2;
+
+         case INSTR_CC:
+            line_num = get_line_num(((iCC*)BBCC_ptr)->instr_addr);
+            sprint_iCC(buf, line_num, (iCC*)BBCC_ptr);
+            ADD_CC_TO(iCC, I, Ir_total);
+            BBCC_ptr += sizeof(iCC);
+            break;
+
+         case READ_CC:
+         case MOD_CC:
+            line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr);
+            sprint_read_or_mod_CC(buf, line_num, (idCC*)BBCC_ptr);
+            ADD_CC_TO(idCC, I, Ir_total);
+            ADD_CC_TO(idCC, D, Dr_total);
+            BBCC_ptr += sizeof(idCC);
+            break;
+
+         case WRITE_CC:
+            line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr);
+            sprint_write_CC(buf, line_num, (idCC*)BBCC_ptr);
+            ADD_CC_TO(idCC, I, Ir_total);
+            ADD_CC_TO(idCC, D, Dw_total);
+            BBCC_ptr += sizeof(idCC);
+            break;
+
+#undef ADD_CC_TO
+
+         default:
+            VG_(panic)("Unknown CC type in fprint_BBCC()\n");
+            break;
+      }
+      distinct_instrs++;
+
+      /* If the function name for this instruction doesn't match that of the
+       * first instruction in the BB, print out a warning. */
+      if (VG_(clo_trace_symtab) && 0 != VG_(strcmp)(fn_buf, first_instr_fn)) {
+         VG_(printf)("Mismatched function names\n");
+         VG_(printf)(" filenames: BB:%s, instr:%s; "
+                     "fn_names: BB:%s, instr:%s; "
+                     "line: %d\n",
+                     first_instr_fl, fl_buf,
+                     first_instr_fn, fn_buf,
+                     line_num);
+      }
+
+      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   }
+   /* If we switched filenames in the middle of the BB without switching back,
+    * switch back now because the subsequent BB may be relying on falling under
+    * the original file name. */
+   if (0 != VG_(strcmp)(first_instr_fl, curr_file)) {
+      VG_(sprintf)(fbuf, "fe=%s\n", first_instr_fl);
+      VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
+   }
+   //VG_(write)(fd, (void*)"#}\n", 3);
+
+   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
+}
+
+/* Write the whole results file: "desc:" lines, the "cmd:" line, the
+ * "events:" line, every BBCC grouped under "fl=" / "fn=" lines, and finally
+ * the "summary:" line.  The global totals are accumulated as a side effect
+ * of the traversal.  Pushes the VgpCacheDump profiling centre; the matching
+ * pop is at the end of VG_(show_cachesim_results). */
+static void fprint_BBCC_table_and_calc_totals(Int client_argc,
+                                              Char** client_argv)
+{
+   Int fd;
+   Char buf[BUF_LEN];
+   file_node *curr_file_node;
+   fn_node *curr_fn_node;
+   BBCC *curr_BBCC;
+   Int i,j,k;
+
+   VGP_PUSHCC(VgpCacheDump);
+   fd = VG_(open_write)(OUT_FILE);
+   if (-1 == fd) { file_err(); }
+
+   /* "desc:" lines (giving I1/D1/L2 cache configuration) */
+   VG_(write)(fd, (void*)I1_desc_line, VG_(strlen)(I1_desc_line));
+   VG_(write)(fd, (void*)D1_desc_line, VG_(strlen)(D1_desc_line));
+   VG_(write)(fd, (void*)L2_desc_line, VG_(strlen)(L2_desc_line));
+
+   /* "cmd:" line.  Arguments are written straight to the file rather than
+    * formatted into buf: their length is not bounded by BUF_LEN. */
+   VG_(strcpy)(buf, "cmd:");
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   for (i = 0; i < client_argc; i++) {
+      VG_(write)(fd, (void*)" ", 1);
+      VG_(write)(fd, (void*)client_argv[i], VG_(strlen)(client_argv[i]));
+   }
+   /* "events:" line */
+   VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw\n");
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+   /* Six loops here: three for the hash table arrays, and three for the
+    * chains hanging off the hash table arrays. */
+   for (i = 0; i < N_FILE_ENTRIES; i++) {
+      curr_file_node = BBCC_table[i];
+      while (curr_file_node != NULL) {
+         VG_(sprintf)(buf, "fl=%s\n", curr_file_node->filename);
+         VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+         for (j = 0; j < N_FN_ENTRIES; j++) {
+            curr_fn_node = curr_file_node->fns[j];
+            while (curr_fn_node != NULL) {
+               VG_(sprintf)(buf, "fn=%s\n", curr_fn_node->fn_name);
+               VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+
+               for (k = 0; k < N_BBCC_ENTRIES; k++) {
+                  curr_BBCC = curr_fn_node->BBCCs[k];
+                  while (curr_BBCC != NULL) {
+                     fprint_BBCC(fd, curr_BBCC,
+
+                                 curr_file_node->filename,
+                                 curr_fn_node->fn_name);
+
+                     curr_BBCC = curr_BBCC->next;
+                  }
+               }
+               curr_fn_node = curr_fn_node->next;
+            }
+         }
+         curr_file_node = curr_file_node->next;
+      }
+   }
+
+   /* Summary stats must come after rest of table, since we calculate them
+    * during traversal. */
+   VG_(sprintf)(buf, "summary: "
+                     "%llu %llu %llu "
+                     "%llu %llu %llu "
+                     "%llu %llu %llu\n",
+                     Ir_total.a, Ir_total.m1, Ir_total.m2,
+                     Dr_total.a, Dr_total.m1, Dr_total.m2,
+                     Dw_total.a, Dw_total.m1, Dw_total.m2);
+   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
+   VG_(close)(fd);
+}
+
+/* Adds commas to ULong, right justifying in a field field_width wide, returns
+ * the string in buf.  The return value is the length of the commified
+ * number itself, not counting any padding. */
+Int commify(ULong n, int field_width, char buf[COMMIFY_BUF_LEN])
+{
+   int len, n_commas, i, j, new_len, space;
+
+   /* n is 64 bits wide, so it needs %llu (as the output-file code uses). */
+   VG_(sprintf)(buf, "%llu", n);
+   len = VG_(strlen)(buf);
+   n_commas = (len - 1) / 3;
+   new_len = len + n_commas;
+   space = field_width - new_len;
+
+   /* Allow for printing a number in a field_width smaller than its size */
+   if (space < 0) space = 0;
+
+   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
+    * of three. */
+   for (j = -1, i = len ; i >= 0; i--) {
+      buf[i + n_commas + space] = buf[i];
+
+      /* No comma in front of the leading digit: without the i > 0 test a
+       * number whose length is a multiple of three wrote one at
+       * buf[space - 1], i.e. before the buffer when space is 0. */
+      if (i > 0 && 3 == ++j) {
+         j = 0;
+         n_commas--;
+         buf[i + n_commas + space] = ',';
+      }
+   }
+   /* Right justify in field. */
+   for (i = 0; i < space; i++) buf[i] = ' ';
+   return new_len;
+}
+
+/* Format n / pow as a percentage ("12.05%"), right justified in a field
+ * field_width wide, into buf.  pow is expected to be a power of ten (callers
+ * pass 10 or 100) and n non-negative; the fraction is zero-padded to the
+ * number of digits pow implies. */
+void percentify(Int n, Int pow, Int field_width, char buf[])
+{
+   int i, len, space, frac, div;
+
+   VG_(sprintf)(buf, "%d.", n / pow);
+   len = VG_(strlen)(buf);
+
+   /* Emit the fractional digits one by one so that e.g. 5/100 comes out as
+    * "05" rather than "5". */
+   frac = n % pow;
+   for (div = pow / 10; div > 0; div /= 10)
+      buf[len++] = '0' + (frac / div) % 10;
+   if (pow < 10) buf[len++] = '0';   /* keep the "N.0%" shape for pow == 1 */
+   buf[len++] = '%';
+   buf[len] = '\0';
+
+   space = field_width - len;
+   /* Allow for a field_width smaller than the text, as commify() does. */
+   if (space < 0) space = 0;
+   i = len;
+
+   /* Right justify in field */
+   for ( ; i >= 0; i--) buf[i + space] = buf[i];
+   for (i = 0; i < space; i++) buf[i] = ' ';
+}
+
+/* misses * 100 * pow / refs in the form percentify() expects, or 0 when
+ * there were no references at all (avoids dividing by zero, e.g. for a
+ * client that never wrote memory). */
+static Int miss_rate_scaled(ULong misses, ULong refs, Int pow)
+{
+   if (0 == refs) return 0;
+   return (Int)(misses * 100 * pow / refs);
+}
+
+/* Dump the results file, then print the I1 / D1 / L2 summary to the user
+ * and, at verbosity > 1, statistics about the BBCC table. */
+void VG_(show_cachesim_results)(Int client_argc, Char** client_argv)
+{
+   CC D_total;
+   ULong L2_total_m, L2_total_mr, L2_total_mw;
+   char buf1[RESULTS_BUF],
+        buf2[RESULTS_BUF],
+        buf3[RESULTS_BUF];
+   Int l1, l2, l3;
+   Int p;
+
+   fprint_BBCC_table_and_calc_totals(client_argc, client_argv);
+
+   /* I cache results. Use the I_refs value to determine the first column
+    * width. */
+   l1 = commify(Ir_total.a, 0, buf1);
+   VG_(message)(Vg_UserMsg, "I refs: %s", buf1);
+
+   commify(Ir_total.m1, l1, buf1);
+   VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1);
+
+   commify(Ir_total.m2, l1, buf1);
+   VG_(message)(Vg_UserMsg, "L2 misses: %s", buf1);
+
+   p = 100;
+
+   percentify(miss_rate_scaled(Ir_total.m1, Ir_total.a, p), p, l1+1, buf1);
+   VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1);
+
+   percentify(miss_rate_scaled(Ir_total.m2, Ir_total.a, p), p, l1+1, buf1);
+   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
+   VG_(message)(Vg_UserMsg, "");
+
+   /* D cache results. Use the D_refs.rd and D_refs.wr values to determine the
+    * width of columns 2 & 3. */
+   D_total.a = Dr_total.a + Dw_total.a;
+   D_total.m1 = Dr_total.m1 + Dw_total.m1;
+   D_total.m2 = Dr_total.m2 + Dw_total.m2;
+
+   commify( D_total.a, 0, buf1);
+   l2 = commify(Dr_total.a, 0, buf2);
+   l3 = commify(Dw_total.a, 0, buf3);
+   VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   commify( D_total.m1, l1, buf1);
+   commify(Dr_total.m1, l2, buf2);
+   commify(Dw_total.m1, l3, buf3);
+   VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   commify( D_total.m2, l1, buf1);
+   commify(Dr_total.m2, l2, buf2);
+   commify(Dw_total.m2, l3, buf3);
+   VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   p = 10;
+
+   percentify(miss_rate_scaled( D_total.m1,  D_total.a, p), p, l1+1, buf1);
+   percentify(miss_rate_scaled(Dr_total.m1, Dr_total.a, p), p, l2+1, buf2);
+   percentify(miss_rate_scaled(Dw_total.m1, Dw_total.a, p), p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3);
+
+   percentify(miss_rate_scaled( D_total.m2,  D_total.a, p), p, l1+1, buf1);
+   percentify(miss_rate_scaled(Dr_total.m2, Dr_total.a, p), p, l2+1, buf2);
+   percentify(miss_rate_scaled(Dw_total.m2, Dw_total.a, p), p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3);
+   VG_(message)(Vg_UserMsg, "");
+
+   /* L2 overall results */
+   L2_total_m = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
+   L2_total_mr = Dr_total.m2 + Ir_total.m2;
+   L2_total_mw = Dw_total.m2;
+
+   commify(L2_total_m, l1, buf1);
+   commify(L2_total_mr, l2, buf2);
+   commify(L2_total_mw, l3, buf3);
+   VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)",
+                buf1, buf2, buf3);
+
+   percentify(miss_rate_scaled(L2_total_m,  Ir_total.a + D_total.a,  p),
+              p, l1+1, buf1);
+   percentify(miss_rate_scaled(L2_total_mr, Ir_total.a + Dr_total.a, p),
+              p, l2+1, buf2);
+   percentify(miss_rate_scaled(L2_total_mw, Dw_total.a, p), p, l3+1, buf3);
+   VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )", buf1, buf2,buf3);
+
+
+   /* Hash table stats */
+   if (VG_(clo_verbosity) > 1) {
+      int BB_lookups = full_debug_BBs + fn_name_debug_BBs +
+                       file_line_debug_BBs + no_debug_BBs;
+      /* Divisor for the percentages below; never zero. */
+      int BB_divisor = (0 == BB_lookups ? 1 : BB_lookups);
+
+      VG_(message)(Vg_DebugMsg, "");
+      VG_(message)(Vg_DebugMsg, "Distinct files: %d", distinct_files);
+      VG_(message)(Vg_DebugMsg, "Distinct fns: %d", distinct_fns);
+      VG_(message)(Vg_DebugMsg, "BB lookups: %d", BB_lookups);
+      VG_(message)(Vg_DebugMsg, "With full debug info:%3d%% (%d)",
+                   full_debug_BBs * 100 / BB_divisor,
+                   full_debug_BBs);
+      VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)",
+                   file_line_debug_BBs * 100 / BB_divisor,
+                   file_line_debug_BBs);
+      VG_(message)(Vg_DebugMsg, "With fn name debug info:%3d%% (%d)",
+                   fn_name_debug_BBs * 100 / BB_divisor,
+                   fn_name_debug_BBs);
+      VG_(message)(Vg_DebugMsg, "With no debug info:%3d%% (%d)",
+                   no_debug_BBs * 100 / BB_divisor,
+                   no_debug_BBs);
+      VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations);
+      VG_(message)(Vg_DebugMsg, "Distinct instrs: %d", distinct_instrs);
+   }
+   VGP_POPCC;
+}
+
diff --git a/cachegrind/cg_sim_D1.c b/cachegrind/cg_sim_D1.c
new file mode 100644
index 0000000000..c2b3d4cc3e
--- /dev/null
+++ b/cachegrind/cg_sim_D1.c
@@ -0,0 +1,93 @@
+/* D1 cache simulator, generated by vg_cachegen.
+ * total size = 65536 bytes
+ * line size = 64 bytes
+ * associativity = 2-way associative
+ *
+ * This file should be #include-d into vg_cachesim.c
+ */
+
+static char D1_desc_line[] =
+   "desc: D1 cache: 65536 B, 64 B, 2-way associative\n";
+
+/* One tag per way for each of the 512 sets.  Way 0 holds the most recently
+ * used line: hits in way 1 and misses both move their tag to way 0. */
+static UInt D1_tags[512][2];
+
+/* Reset every tag to 0. */
+static void cachesim_D1_initcache(void)
+{
+   UInt set, way;
+   for (set = 0; set < 512; set++)
+      for (way = 0; way < 2; way++)
+         D1_tags[set][way] = 0;
+}
+
+/* Simulate one data reference of `size' bytes at address `a'.  On a D1 miss
+ * *m1 is incremented and the reference is passed on to the L2 simulator
+ * together with `m2'.  A reference that straddles two cache lines touches
+ * both sets but counts as at most one miss.
+ *
+ * NOTE(review): this file is generated by vg_cachegen and the I1/L2
+ * simulators use the same address arithmetic; the last-byte fix below
+ * should be carried over to the generator as well. */
+static __inline__
+void cachesim_D1_doref(Addr a, UChar size, ULong* m1, ULong *m2)
+{
+   /* The reference covers bytes a .. a+size-1, so the second set and tag
+    * are taken from the last byte touched, not from a+size (which made a
+    * reference ending exactly on a line boundary look like a straddle). */
+   register UInt set1 = ( a             >> 6) & (512-1);
+   register UInt set2 = ((a + size - 1) >> 6) & (512-1);
+   register UInt tag  =   a             >> (6 + 9);
+   register UInt tag2 =  (a + size - 1) >> (6 + 9);
+
+   if (set1 == set2) {
+
+      if (tag == D1_tags[set1][0]) {
+         return;
+      }
+      else if (tag == D1_tags[set1][1]) {
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+         return;
+      }
+      else {
+         /* A miss */
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else if ((set1 + 1) % 512 == set2) {
+
+      Bool is_D1_miss = False;
+
+      /* Block one */
+      if (tag == D1_tags[set1][0]) {
+      }
+      else if (tag == D1_tags[set1][1]) {
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+      }
+      else {
+         /* A miss */
+         D1_tags[set1][1] = D1_tags[set1][0];
+         D1_tags[set1][0] = tag;
+
+         is_D1_miss = True;
+      }
+
+      /* Block two.  It has its own tag: when set1 is the last set the
+       * second line wraps to set 0 and its tag is one higher. */
+      if (tag2 == D1_tags[set2][0]) {
+      }
+      else if (tag2 == D1_tags[set2][1]) {
+         D1_tags[set2][1] = D1_tags[set2][0];
+         D1_tags[set2][0] = tag2;
+      }
+      else {
+         /* A miss */
+         D1_tags[set2][1] = D1_tags[set2][0];
+         D1_tags[set2][0] = tag2;
+
+         is_D1_miss = True;
+      }
+
+      /* Miss treatment */
+      if (is_D1_miss) {
+         (*m1)++;
+         cachesim_L2_doref(a, size, m2);
+      }
+
+   } else {
+      VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size);
+      VG_(printf)("sets %d and %d.\n", set1, set2);
+      VG_(panic)("D1 cache set mismatch");
+   }
+}
diff --git a/cachegrind/cg_sim_I1.c b/cachegrind/cg_sim_I1.c
new file mode 100644
index
0000000000..32b89b51b0 --- /dev/null +++ b/cachegrind/cg_sim_I1.c @@ -0,0 +1,93 @@ +/* I1 cache simulator, generated by vg_cachegen. + * total size = 65536 bytes + * line size = 64 bytes + * associativity = 2-way associative + * + * This file should be #include-d into vg_cachesim.c + */ + +static char I1_desc_line[] = + "desc: I1 cache: 65536 B, 64 B, 2-way associative\n"; + +static UInt I1_tags[512][2]; + +static void cachesim_I1_initcache(void) +{ + UInt set, way; + for (set = 0; set < 512; set++) + for (way = 0; way < 2; way++) + I1_tags[set][way] = 0; +} + +static __inline__ +void cachesim_I1_doref(Addr a, UChar size, ULong* m1, ULong *m2) +{ + register UInt set1 = ( a >> 6) & (512-1); + register UInt set2 = ((a + size) >> 6) & (512-1); + register UInt tag = a >> (6 + 9); + + if (set1 == set2) { + + if (tag == I1_tags[set1][0]) { + return; + } + else if (tag == I1_tags[set1][1]) { + I1_tags[set1][1] = I1_tags[set1][0]; + I1_tags[set1][0] = tag; + return; + } + else { + /* A miss */ + I1_tags[set1][1] = I1_tags[set1][0]; + I1_tags[set1][0] = tag; + + (*m1)++; + cachesim_L2_doref(a, size, m2); + } + + } else if ((set1 + 1) % 512 == set2) { + + Bool is_I1_miss = False; + + /* Block one */ + if (tag == I1_tags[set1][0]) { + } + else if (tag == I1_tags[set1][1]) { + I1_tags[set1][1] = I1_tags[set1][0]; + I1_tags[set1][0] = tag; + } + else { + /* A miss */ + I1_tags[set1][1] = I1_tags[set1][0]; + I1_tags[set1][0] = tag; + + is_I1_miss = True; + } + + /* Block two */ + if (tag == I1_tags[set2][0]) { + } + else if (tag == I1_tags[set2][1]) { + I1_tags[set2][1] = I1_tags[set2][0]; + I1_tags[set2][0] = tag; + } + else { + /* A miss */ + I1_tags[set2][1] = I1_tags[set2][0]; + I1_tags[set2][0] = tag; + + is_I1_miss = True; + } + + /* Miss treatment */ + if (is_I1_miss) { + (*m1)++; + cachesim_L2_doref(a, size, m2); + } + + } else { + VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size); + VG_(printf)("sets %d and %d.\n", set1, set2); + 
VG_(panic)("I1 cache set mismatch"); + } +} diff --git a/cachegrind/cg_sim_L2.c b/cachegrind/cg_sim_L2.c new file mode 100644 index 0000000000..bb685326ab --- /dev/null +++ b/cachegrind/cg_sim_L2.c @@ -0,0 +1,250 @@ +/* L2 cache simulator, generated by vg_cachegen. + * total size = 262144 bytes + * line size = 64 bytes + * associativity = 8-way associative + * + * This file should be #include-d into vg_cachesim.c + */ + +static char L2_desc_line[] = + "desc: L2 cache: 262144 B, 64 B, 8-way associative\n"; + +static UInt L2_tags[512][8]; + +static void cachesim_L2_initcache(void) +{ + UInt set, way; + for (set = 0; set < 512; set++) + for (way = 0; way < 8; way++) + L2_tags[set][way] = 0; +} + +static __inline__ +void cachesim_L2_doref(Addr a, UChar size, ULong *m2) +{ + register UInt set1 = ( a >> 6) & (512-1); + register UInt set2 = ((a + size) >> 6) & (512-1); + register UInt tag = a >> (6 + 9); + + if (set1 == set2) { + + if (tag == L2_tags[set1][0]) { + return; + } + else if (tag == L2_tags[set1][1]) { + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][2]) { + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][3]) { + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][4]) { + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][5]) { + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][6]) { + 
L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][7]) { + L2_tags[set1][7] = L2_tags[set1][6]; + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else { + /* A miss */ + L2_tags[set1][7] = L2_tags[set1][6]; + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + + (*m2)++; + } + + } else if ((set1 + 1) % 512 == set2) { + + Bool is_L2_miss = False; + + /* Block one */ + if (tag == L2_tags[set1][0]) { + } + else if (tag == L2_tags[set1][1]) { + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][2]) { + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][3]) { + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][4]) { + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][5]) { + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + 
L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][6]) { + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][7]) { + L2_tags[set1][7] = L2_tags[set1][6]; + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else { + /* A miss */ + L2_tags[set1][7] = L2_tags[set1][6]; + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + + is_L2_miss = True; + } + + /* Block two */ + if (tag == L2_tags[set2][0]) { + } + else if (tag == L2_tags[set2][1]) { + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][2]) { + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][3]) { + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][4]) { + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][5]) { + L2_tags[set2][5] = L2_tags[set2][4]; + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; 
+ } + else if (tag == L2_tags[set2][6]) { + L2_tags[set2][6] = L2_tags[set2][5]; + L2_tags[set2][5] = L2_tags[set2][4]; + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][7]) { + L2_tags[set2][7] = L2_tags[set2][6]; + L2_tags[set2][6] = L2_tags[set2][5]; + L2_tags[set2][5] = L2_tags[set2][4]; + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else { + /* A miss */ + L2_tags[set2][7] = L2_tags[set2][6]; + L2_tags[set2][6] = L2_tags[set2][5]; + L2_tags[set2][5] = L2_tags[set2][4]; + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + + is_L2_miss = True; + } + + /* Miss treatment */ + if (is_L2_miss) { + (*m2)++; + } + + } else { + VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size); + VG_(printf)("sets %d and %d.\n", set1, set2); + VG_(panic)("L2 cache set mismatch"); + } +} diff --git a/cachegrind/docs/manual.html b/cachegrind/docs/manual.html index c441db1620..a97c2f9fec 100644 --- a/cachegrind/docs/manual.html +++ b/cachegrind/docs/manual.html @@ -78,7 +78,9 @@ Linux-x86 executables.

An example

-

The design and implementation of Valgrind

+

Cache profiling

+ +

The design and implementation of Valgrind


@@ -515,6 +517,11 @@ follows: buggy, so you may need to issue this flag if you use 3.0.4.

+

  • --cachesim=no [default]
    + --cachesim=yes +

    When enabled, turns off memory checking, and turns on cache profiling. + Cache profiling is described in detail in Section 7. +

  • There are also some options for debugging Valgrind itself. You @@ -1763,5 +1770,632 @@ sewardj@phoenix:~/newmat10$

    The GCC folks fixed this about a week before gcc-3.0 shipped.


    + + + + +

    7  Cache profiling

    +As well as memory debugging, Valgrind also allows you to do cache simulations +and annotate your source line-by-line with the number of cache misses. In +particular, it records: +
      +
    • L1 instruction cache reads and misses; +
    • L1 data cache reads and read misses, writes and write misses; +
    • L2 unified cache reads and read misses, writes and write misses. +
    +On a modern x86 machine, an L1 miss will typically cost around 10 cycles, +and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be +very useful for improving the performance of your program. + +Please note that this is an experimental feature. Any feedback, bug-fixes, +suggestions, etc, welcome. + + +

    7.1  Overview

    +First off, as for normal Valgrind use, you probably want to turn on debugging +info (the -g flag). But by contrast with normal Valgrind use, you +probably do want to turn optimisation on, since you should profile your +program as it will be normally run. + +The three steps are: +
      +
    1. Generate a cache simulator for your machine's cache configuration with + `vg_cachegen' and recompile Valgrind with make install. + Valgrind comes with a default simulator, but it is unlikely to be correct + for your system, so you should generate a simulator yourself.
    2. +
    3. Run your program with valgrind --cachesim=yes in front of + the normal command line invocation. When the program finishes, Valgrind + will print summary cache statistics. It also collects line-by-line + information in a file cachegrind.out.
    4. +
    5. Generate a function-by-function summary, and possibly annotate source + files with 'vg_annotate'. Source files to annotate can be specified + manually on the command line, or "interesting" source files + can be annotated automatically with the --auto=yes option. + You can annotate C/C++ files or assembly language files equally + easily.
    6. +
    + +Step 1 only needs to be done once, unless you are +interested in simulating different cache configurations (eg. first +concentrating on instruction cache misses, then on data cache misses).

    + +Step 2 should be done every time you want to collect +information about a new program, a changed program, or about the same program +with different input.

    + +Step 3 can be performed as many times as you like for +each Step 2; you may want to do multiple annotations showing different +information each time.

    + +The steps are described in detail in the following sections.

    + + + +

    7.3  Generating a cache simulator

    +Although Valgrind comes with a pre-generated cache simulator, it most likely +won't match the cache configuration of your machine, so you should generate +a new simulator.

    + +You need to generate three files, one for each of the I1, D1 and L2 caches. +For each cache, you need to know the: +

      +
    • Cache size (bytes); +
    • Line size (bytes); +
    • Associativity. +
    + +vg_cachegen takes three options: +
      +
    • --I1=size,line_size,associativity +
    • --D1=size,line_size,associativity +
    • --L2=size,line_size,associativity +
    + +You can specify one, two or all three caches per invocation of vg_cachegen. It +checks that the configuration is sensible before generating the simulators; to +see the allowed values, run vg_cachegen -h.

    + +An example invocation would be: + +

    + vg_cachegen --I1=65536,64,2 --D1=65536,64,2 --L2=262144,64,8 +
    + +This simulates a machine with a 128KB split L1 2-way associative cache, and a +256KB unified 8-way associative L2 cache. Both caches have 64B lines.

    + +If you don't know your cache configuration, you'll have to find it out. +(Ideally vg_cachegen could auto-identify your cache configuration using the +CPUID instruction, which could be done automatically during installation, and +this whole step could be skipped...)

    + + +

    7.4  Cache simulation specifics

    +vg_cachegen only generates simulations for a machine with a split L1 cache and +a unified L2 cache. This configuration is used for all x86-based machines we +are aware of.

    + +The more specific characteristics of the simulation are as follows. + +

      +
    • Write-allocate: when a write miss occurs, the block written to is brought + into the D1 cache. Most modern caches have this property.
    • + +

    • Bit-selection hash function: the line(s) in the cache to which a memory + block maps is chosen by the middle bits M--(M+N-1) of the byte address, + where: +
        +
      •  line size = 2^M bytes 
      • +
      • (cache size / (line size * associativity)) = 2^N sets
      • +
    • + +

    • Inclusive L2 cache: the L2 cache replicates all the entries of the L1 + cache. This is standard on Pentium chips, but AMD Athlons use an + exclusive L2 cache that only holds blocks evicted from L1.
    • +

    + +Other noteworthy behaviour: + +
      +
    • References that straddle two cache lines are treated as follows:
    • +
        +
      • If both blocks hit --> counted as one hit
      • +
      • If one block hits, the other misses --> counted as one miss
      • +
      • If both blocks miss --> counted as one miss (not two)
      • +

      + +

    • Instructions that modify a memory location (eg. inc and + dec) are counted as doing just a read, ie. a single data + reference. This may seem strange, but since the write can never cause a + miss (the read guarantees the block is in the cache) it's not very + interesting.

      + + Thus it measures not the number of times the data cache is accessed, but + the number of times a data cache miss could occur.

      +

    • +
    + +If you are interested in simulating a cache with different properties, it is +not particularly hard to write your own cache simulator, or to modify existing +ones in vg_cachesim_I1.c, vg_cachesim_D1.c and +vg_cachesim_L2.c. We'd be interested to hear from anyone who +does. + + + +

    7.5  Profiling programs

    +Cache profiling is enabled by using the --cachesim=yes option to +Valgrind. This automatically turns off Valgrind's memory checking functions, +since the cache simulation is slow enough already, and you probably don't want +to do both at once.

    + +To gather cache profiling information about the program ls -l, type: valgrind --cachesim=yes ls -l + +The program will execute (slowly). Upon completion, summary statistics +that look like this will be printed: + +

    +==31751== I   refs:      27,742,716
    +==31751== I1  misses:           276
    +==31751== L2  misses:           275
    +==31751== I1  miss rate:        0.0%
    +==31751== L2i miss rate:        0.0%
    +==31751== 
    +==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
    +==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
    +==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
    +==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
    +==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
    +==31751== 
    +==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
    +==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
    +
    + +Cache accesses for instruction fetches are summarised first, giving the +number of fetches made (this is the number of instructions executed, which +can be useful to know in its own right), the number of I1 misses, and the +number of L2 instruction (L2i) misses.

    + +Cache accesses for data follow. The information is similar to that of the +instruction fetches, except that the values are also shown split between reads +and writes (note each row's rd and wr values add up +to the row's total).

    + +Combined instruction and data figures for the L2 cache follow that.

    + + +

    7.6  Output file

    +As well as printing summary information, Valgrind also writes line-by-line +cache profiling information to a file named cachegrind.out . This +file is human-readable, but is best interpreted by the accompanying program +vg_annotate, described in the next section.

    + +Things to note about the cachegrind.out file: +

      +
    • It is written every time valgrind --cachesim=yes is run; it + will automatically overwrite any existing cachegrind.out in + the current directory.
    • +
    • It can be quite large: ls -l generates a file of about + 350KB; browsing a few files and web pages with Konqueror generates a file + of around 10MB.
    • +
    + + + +

    7.7  Annotating C/C++ programs

    +Before using vg_annotate, it is worth widening your window to be at least +120-characters wide if possible, as the output lines can be quite long.

    + +To get a function-by-function summary, run vg_annotate in a +directory containing a cachegrind.out file. The output looks like +this: + +

    +--------------------------------------------------------------------------------
    +I1 cache:              65536 B, 64 B, 2-way associative
    +D1 cache:              65536 B, 64 B, 2-way associative
    +L2 cache:              262144 B, 64 B, 8-way associative
    +Command:               concord vg_to_ucode.c
    +Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Threshold:             99%
    +Chosen for annotation:
    +Auto-annotation:       on
    +
    +--------------------------------------------------------------------------------
    +Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
    +--------------------------------------------------------------------------------
    +27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
    +
    +--------------------------------------------------------------------------------
    +Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
    +--------------------------------------------------------------------------------
    +8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
    +5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
    +2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
    +2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
    +2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
    +1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
    +  897,991   51   51   897,831    95    30        62      1      1  ???:???
    +  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
    +  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
    +  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
    +  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
    +  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
    +  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
    +  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
    +  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
    +  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
    +   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
    +   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
    +
    + +First up is a summary of the annotation options: + +
      +
    • I1 cache, D1 cache, L2 cache: cache configuration. So you know the + configuration with which these results were obtained.
    • + +

    • Command: the command line invocation of the program under + examination.
    • + +

    • Events recorded: event abbreviations are:

      +

        +
      • Ir : I cache reads (ie. instructions executed)
      • +
      • I1mr: I1 cache read misses
      • +
      • I2mr: L2 cache instruction read misses
      • +
      • Dr : D cache reads (ie. memory reads)
      • +
      • D1mr: D1 cache read misses
      • +
      • D2mr: L2 cache data read misses
      • +
      • Dw : D cache writes (ie. memory writes)
      • +
      • D1mw: D1 cache write misses
      • +
      • D2mw: L2 cache data write misses
      • +

      + Note that D1 total misses is given by D1mr + + D1mw, and that L2 total misses is given by + I2mr + D2mr + D2mw.

    • + +

    • Events shown: the events shown (a subset of events gathered). This can + be adjusted with the --show option.
    • + +

    • Event sort order: the sort order in which functions are shown. For + example, in this case the functions are sorted from highest + Ir counts to lowest. If two functions have identical + Ir counts, they will then be sorted by I1mr + counts, and so on. This order can be adjusted with the + --sort option.

      + + Note that this dictates the order the functions appear. It is not + the order in which the columns appear; that is dictated by the "events + shown" line (and can be changed with the --show option). +

    • + +

    • Threshold: vg_annotate by default omits functions that cause very low + numbers of misses to avoid drowning you in information. In this case, + vg_annotate shows summaries for the functions that account for 99% of the + Ir counts; Ir is chosen as the threshold event + since it is the primary sort event. The threshold can be adjusted with + the --threshold option.
    • + +

    • Chosen for annotation: names of files specified manually for annotation; + in this case none.
    • + +

    • Auto-annotation: whether auto-annotation was requested via the + --auto=yes option. In this case no.
    • +

    + +Then follows summary statistics for the whole program. These are similar +to the summary provided when running valgrind --cachesim=yes.

    + +Then follows function-by-function statistics. Each function is identified by a +file_name:function_name pair. If a column contains only a +`.' it means the function never performs that event (eg. the third row shows +that strcmp() contains no instructions that write to memory). The +name ??? is used if the file name and/or function name could +not be determined from debugging information. (If most of the entries have the +form ???:??? the program probably wasn't compiled with +-g.)

    + +It is worth noting that functions will come from three types of source files: +

      +
    1. From the profiled program (concord.c in this example).
    2. +
    3. From libraries (eg. getc.c)
    4. +
    5. From Valgrind's implementation of some libc functions (eg. + vg_clientmalloc.c:malloc). These are recognisable because + the filename begins with vg_, and is probably one of + vg_main.c, vg_clientmalloc.c or + vg_mylibc.c. +
    6. +
    + +There are two ways to annotate source files -- by choosing them manually, or +with the --auto=yes option. To do it manually, just +specify the filenames as arguments to vg_annotate. For example, the output from +running vg_annotate concord.c for our example produces the same +output as above followed by an annotated version of concord.c, a +section of which looks like: + +
    +--------------------------------------------------------------------------------
    +-- User-annotated source: concord.c
    +--------------------------------------------------------------------------------
    +Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
    +
    +[snip]
    +
    +        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
    +        3    1    1       .     .     .       1      0      0  {
    +        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
    +        .    .    .       .     .     .       .      .      .      Word_Info *data;
    +        1    0    0       .     .     .       1      1      1      int line = 1, i;
    +        .    .    .       .     .     .       .      .      .
    +        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
    +        .    .    .       .     .     .       .      .      .
    +    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
    +    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
    +        .    .    .       .     .     .       .      .      .
    +        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
    +        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
    +        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
    +        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
    +        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
    +        .    .    .       .     .     .       .      .      .      }
    +        .    .    .       .     .     .       .      .      .
    +  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
    +  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->word, data->line, table);
    +        .    .    .       .     .     .       .      .      .
    +        4    0    0       1     0     0       2      0      0      free(data);
    +        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
    +        3    0    0       2     0     0       .      .      .  }
    +
    + +(Although column widths are automatically minimised, a wide terminal is clearly +useful.)

    + +Each source file is clearly marked (User-annotated source) as +having been chosen manually for annotation. If the file was found in one of +the directories specified with the -I/--include +option, the directory and file are both given.

    + +Each line is annotated with its event counts. Events not applicable for a line +are represented by a `.'; this is useful for distinguishing between an event +which cannot happen, and one which can but did not.

    + +Sometimes only a small section of a source file is executed. To minimise +uninteresting output, Valgrind only shows annotated lines and lines within a +small distance of annotated lines. Gaps are marked with the line numbers so +you know which part of a file the shown code comes from, eg: + +

    +(figures and code for line 704)
    +-- line 704 ----------------------------------------
    +-- line 878 ----------------------------------------
    +(figures and code for line 878)
    +
    + +The amount of context to show around annotated lines is controlled by the +--context option.

    + +To get automatic annotation, run vg_annotate --auto=yes. +vg_annotate will automatically annotate every source file it can find that is +mentioned in the function-by-function summary. Therefore, the files chosen for +auto-annotation are affected by the --sort and +--threshold options. Each source file is clearly marked +(Auto-annotated source) as being chosen automatically. Any files +that could not be found are mentioned at the end of the output, eg: + +

    +--------------------------------------------------------------------------------
    +The following files chosen for auto-annotation could not be found:
    +--------------------------------------------------------------------------------
    +  getc.c
    +  ctype.c
    +  ../sysdeps/generic/lockfile.c
    +
    + +This is quite common for library files, since libraries are usually compiled +with debugging information, but the source files are often not present on a +system. If a file is chosen for annotation both manually and +automatically, it is marked as User-annotated source. + +Use the -I/--include option to tell Valgrind where to look for +source files if the filenames found from the debugging information aren't +specific enough. + +Beware that vg_annotate can take some time to digest large +cachegrind.out files, eg. 30 seconds or more. Also beware that +auto-annotation can produce a lot of output if your program is large! + + +

    7.8  Annotating assembler programs

    +Valgrind can annotate assembler programs too, or annotate the assembler +generated for your C program. Sometimes this is useful for understanding what +is really happening when an interesting line of C code is translated into +multiple instructions.

    + +To do this, you just need to assemble your .s files with +assembler-level debug information. gcc doesn't do this, but you can use GNU as +with the --gstabs option to generate object files with this +information, eg: + +

    as --gstabs foo.s
    + +You can then profile and annotate source files in the same way as for C/C++ +programs. + + +

    7.9  vg_annotate options

    +
      +
    • -h, --help
    • +

    • -v, --version

      + + Help and version, as usual.

    • + +
    • --sort=A,B,C [default: order in + cachegrind.out]

      + Specifies the events upon which the sorting of the function-by-function + entries will be based. Useful if you want to concentrate on eg. I cache + misses (--sort=I1mr,I2mr), or D cache misses + (--sort=D1mr,D2mr), or L2 misses + (--sort=D2mr,I2mr).

    • + +

    • --show=A,B,C [default: all, using order in + cachegrind.out]

      + Specifies which events to show (and the column order). Default is to use + all present in the cachegrind.out file (and use the order in + the file).

    • + +

    • --threshold=X [default: 99%]

      + Sets the threshold for the function-by-function summary. Functions are + shown that account for more than X% of all the primary sort events. If + auto-annotating, also affects which files are annotated.

    • + +

    • --auto=no [default]
      + --auto=yes

      + When enabled, automatically annotates every file that is mentioned in the + function-by-function summary that can be found. Also gives a list of + those that couldn't be found. + +

    • --context=N [default: 8]

      + Print N lines of context before and after each annotated line. Avoids + printing large sections of source files that were not executed. Use a + large number (eg. 10,000) to show all source lines. +

    • + +

    • -I=<dir>, --include=<dir> + [default: empty string]

      + Adds a directory to the list in which to search for files. Multiple + -I/--include options can be given to add multiple directories. +

    + + +

    7.10  Warnings

    +There are a couple of situations in which vg_annotate issues warnings. + +
      +
    • If a source file is more recent than the cachegrind.out + file. This is because the information in cachegrind.out is + only recorded with line numbers, so if the line numbers change at all in + the source (eg. lines added, deleted, swapped), any annotations will be + incorrect.

      + +

    • If information is recorded about line numbers past the end of a file. + This can be caused by the above problem, ie. shortening the source file + while using an old cachegrind.out file. If this happens, + the figures for the bogus lines are printed anyway (clearly marked as + bogus) in case they are important.
    • +

    + + +

    7.11  Things to watch out for

    +Some odd things that can occur during annotation: + +
      +
    • If annotating at the assembler level, you might see something like this: + +
      +      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
      +      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
      +      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
      +      .    .    .  .    .    .  .    .    .          .align 4,0x90
      +      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
      +      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
      +      
      + + How can the third instruction be executed twice when the others are + executed only once? As it turns out, it isn't. Here's a dump of the + executable, from objdump: + +
      +      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
      +      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
      +      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
      +      8048f32:       89 f6                   mov    %esi,%esi
      +      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
      +      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
      +      
      + + Notice the extra mov %esi,%esi instruction. Where did this + come from? The GNU assembler inserted it to serve as the two bytes of + padding needed to align the movl $.LnrB,%eax instruction on + a four-byte boundary, but pretended it didn't exist when adding debug + information. Thus when Valgrind reads the debug info it thinks that the + movl $0x1,0xffffffec(%ebp) instruction covers the address + range 0x8048f2b--0x8048f33 by itself, and attributes the counts for the + mov %esi,%esi to it.

      +

    • + +
    • + Inlined functions can cause strange results in the function-by-function + summary. If a function inline_me() is defined in + foo.h and inlined in the functions f1(), + f2() and f3() in bar.c, there will + not be a foo.h:inline_me() function entry. Instead, there + will be separate function entries for each inlining site, ie. + foo.h:f1(), foo.h:f2() and + foo.h:f3(). To find the total counts for + foo.h:inline_me(), add up the counts from each entry.

      + + The reason for this is that although the debug info output by gcc + indicates the switch from bar.c to foo.h, it + doesn't indicate the name of the function in foo.h, so + Valgrind keeps using the old one.

      + +

    • + Sometimes, the same filename might be represented with a relative name + and with an absolute name in different parts of the debug info, eg: + /home/user/proj/proj.h and ../proj.h. In this + case, if you use auto-annotation, the file will be annotated twice with + the counts split between the two.

      +

    • +
    + +Note: stabs is not an easy format to read. If you come across bizarre +annotations that look like they might be caused by a bug in the stabs reader, +please let us know. + + +

    7.11  Accuracy

    +Valgrind's cache profiling has a number of shortcomings: + +
      +
    • It doesn't account for kernel activity -- the effect of system calls on + the cache contents is ignored.
    • + +

    • It doesn't account for other process activity (although this is probably + desirable when considering a single program).
    • + +

    • It doesn't account for virtual-to-physical address mappings; hence the + entire simulation is not a true representation of what's happening in the + cache.
    • + +

    • It doesn't account for cache misses not visible at the instruction level, + eg. those arising from TLB misses, or speculative execution.
    • +

    + +Another thing worth noting is that results are very sensitive. Changing the +size of the valgrind.so file, the size of the program being +profiled, or even the length of its name can perturb the results. Variations +will be small, but don't expect perfectly repeatable results if your program +changes at all.

    + +While these factors mean you shouldn't trust the results to be super-accurate, +hopefully they should be close enough to be useful.

    + + +

    7.12  Todo

    +
      +
    • Use CPUID instruction to auto-identify cache configuration during + installation. This would save the user from having to know their cache + configuration and using vg_cachegen.
    • +

    • Program start-up/shut-down calls a lot of functions that aren't + interesting and just complicate the output. Would be nice to exclude + these somehow.
    • +

    +
    + diff --git a/corecheck/Makefile.am b/corecheck/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- a/corecheck/Makefile.am +++ b/corecheck/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- a/coregrind/Makefile.am +++ b/coregrind/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/coregrind/docs/manual.html b/coregrind/docs/manual.html index 
c441db1620..a97c2f9fec 100644 --- a/coregrind/docs/manual.html +++ b/coregrind/docs/manual.html @@ -78,7 +78,9 @@ Linux-x86 executables.

    An example

    -

    The design and implementation of Valgrind

    +

    Cache profiling

    + +

    The design and implementation of Valgrind


    @@ -515,6 +517,11 @@ follows: buggy, so you may need to issue this flag if you use 3.0.4.

    +

  • --cachesim=no [default]
    + --cachesim=yes +

    When enabled, turns off memory checking, and turns on cache profiling. + Cache profiling is described in detail in Section 7. +

  • There are also some options for debugging Valgrind itself. You @@ -1763,5 +1770,632 @@ sewardj@phoenix:~/newmat10$

    The GCC folks fixed this about a week before gcc-3.0 shipped.


    + + + + +

    7  Cache profiling

    +As well as memory debugging, Valgrind also allows you to do cache simulations +and annotate your source line-by-line with the number of cache misses. In +particular, it records: +
      +
    • L1 instruction cache reads and misses; +
    • L1 data cache reads and read misses, writes and write misses; +
    • L2 unified cache reads and read misses, writes and write misses. +
    +On a modern x86 machine, an L1 miss will typically cost around 10 cycles, +and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be +very useful for improving the performance of your program. + +Please note that this is an experimental feature. Any feedback, bug-fixes, +suggestions, etc, welcome. + + +

    7.1  Overview

    +First off, as for normal Valgrind use, you probably want to turn on debugging +info (the -g flag). But by contrast with normal Valgrind use, you +probably do want to turn optimisation on, since you should profile your +program as it will be normally run. + +The three steps are: +
      +
    1. Generate a cache simulator for your machine's cache configuration with + `vg_cachegen' and recompile Valgrind with make install. + Valgrind comes with a default simulator, but it is unlikely to be correct + for your system, so you should generate a simulator yourself.
    2. +
    3. Run your program with valgrind --cachesim=yes in front of + the normal command line invocation. When the program finishes, Valgrind + will print summary cache statistics. It also collects line-by-line + information in a file cachegrind.out.
    4. +
    5. Generate a function-by-function summary, and possibly annotate source + files with 'vg_annotate'. Source files to annotate can be specified + manually on the command line, or "interesting" source files + can be annotated automatically with the --auto=yes option. + You can annotate C/C++ files or assembly language files equally + easily.
    6. +
    + +Step 1 only needs to be done once, unless you are +interested in simulating different cache configurations (eg. first +concentrating on instruction cache misses, then on data cache misses).

    + +Step 2 should be done every time you want to collect +information about a new program, a changed program, or about the same program +with different input.

    + +Step 3 can be performed as many times as you like for +each Step 2; you may want to do multiple annotations showing different +information each time.

    + +The steps are described in detail in the following sections.

    + + + +

    7.3  Generating a cache simulator

    +Although Valgrind comes with a pre-generated cache simulator, it most likely +won't match the cache configuration of your machine, so you should generate +a new simulator.

    + +You need to generate three files, one for each of the I1, D1 and L2 caches. +For each cache, you need to know the: +

      +
    • Cache size (bytes); +
    • Line size (bytes); +
    • Associativity. +
    + +vg_cachegen takes three options: +
      +
    • --I1=size,line_size,associativity +
    • --D1=size,line_size,associativity +
    • --L2=size,line_size,associativity +
    + +You can specify one, two or all three caches per invocation of vg_cachegen. It +checks that the configuration is sensible before generating the simulators; to +see the allowed values, run vg_cachegen -h.

    + +An example invocation would be: + +

    + vg_cachegen --I1=65536,64,2 --D1=65536,64,2 --L2=262144,64,8 +
    + +This simulates a machine with a 128KB split L1 2-way associative cache, and a +256KB unified 8-way associative L2 cache. Both caches have 64B lines.

    + +If you don't know your cache configuration, you'll have to find it out. +(Ideally vg_cachegen could auto-identify your cache configuration using the +CPUID instruction, which could be done automatically during installation, and +this whole step could be skipped...)

    + + +

    7.4  Cache simulation specifics

    +vg_cachegen only generates simulations for a machine with a split L1 cache and +a unified L2 cache. This configuration is used for all x86-based machines we +are aware of.

    + +The more specific characteristics of the simulation are as follows. + +

      +
    • Write-allocate: when a write miss occurs, the block written to is brought + into the D1 cache. Most modern caches have this property.
    • + +

    • Bit-selection hash function: the line(s) in the cache to which a memory + block maps is chosen by the middle bits M--(M+N-1) of the byte address, + where: +
        +
      •  line size = 2^M bytes 
      • +
      • (cache size / line size) = 2^N bytes
      • +
    • + +

    • Inclusive L2 cache: the L2 cache replicates all the entries of the L1 + cache. This is standard on Pentium chips, but AMD Athlons use an + exclusive L2 cache that only holds blocks evicted from L1.
    • +

    + +Other noteworthy behaviour: + +
      +
    • References that straddle two cache lines are treated as follows:
    • +
        +
      • If both blocks hit --> counted as one hit
      • +
      • If one block hits, the other misses --> counted as one miss
      • +
      • If both blocks miss --> counted as one miss (not two)
      • +

      + +

    • Instructions that modify a memory location (eg. inc and + dec) are counted as doing just a read, ie. a single data + reference. This may seem strange, but since the write can never cause a + miss (the read guarantees the block is in the cache) it's not very + interesting.

      + + Thus it measures not the number of times the data cache is accessed, but + the number of times a data cache miss could occur.

      +

    • +
    + +If you are interested in simulating a cache with different properties, it is +not particularly hard to write your own cache simulator, or to modify existing +ones in vg_cachesim_I1.c, vg_cachesim_D1.c and +vg_cachesim_L2.c. We'd be interested to hear from anyone who +does. + + + +

    7.5  Profiling programs

    +Cache profiling is enabled by using the --cachesim=yes option to +Valgrind. This automatically turns off Valgrind's memory checking functions, +since the cache simulation is slow enough already, and you probably don't want +to do both at once.

    + +To gather cache profiling information about the program ls -l, type: valgrind --cachesim=yes ls -l + +The program will execute (slowly). Upon completion, summary statistics +that look like this will be printed: + +

    +==31751== I   refs:      27,742,716
    +==31751== I1  misses:           276
    +==31751== L2  misses:           275
    +==31751== I1  miss rate:        0.0%
    +==31751== L2i miss rate:        0.0%
    +==31751== 
    +==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
    +==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
    +==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
    +==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
    +==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
    +==31751== 
    +==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
    +==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
    +
    + +Cache accesses for instruction fetches are summarised first, giving the +number of fetches made (this is the number of instructions executed, which +can be useful to know in its own right), the number of I1 misses, and the +number of L2 instruction (L2i) misses.

    + +Cache accesses for data follow. The information is similar to that of the +instruction fetches, except that the values are also shown split between reads +and writes (note each row's rd and wr values add up +to the row's total).

    + +Combined instruction and data figures for the L2 cache follow that.

    + + +

    7.6  Output file

    +As well as printing summary information, Valgrind also writes line-by-line +cache profiling information to a file named cachegrind.out. This +file is human-readable, but is best interpreted by the accompanying program +vg_annotate, described in the next section.

    + +Things to note about the cachegrind.out file: +

      +
    • It is written every time valgrind --cachesim=yes is run; it + will automatically overwrite any existing cachegrind.out in + the current directory.
    • +
    • It can be quite large: ls -l generates a file of about + 350KB; browsing a few files and web pages with Konqueror generates a file + of around 10MB.
    • +
    + + + +

    7.7  Annotating C/C++ programs

    +Before using vg_annotate, it is worth widening your window to be at least +120-characters wide if possible, as the output lines can be quite long.

    + +To get a function-by-function summary, run vg_annotate in +a directory containing a cachegrind.out file. The output looks like +this: + +

    +--------------------------------------------------------------------------------
    +I1 cache:              65536 B, 64 B, 2-way associative
    +D1 cache:              65536 B, 64 B, 2-way associative
    +L2 cache:              262144 B, 64 B, 8-way associative
    +Command:               concord vg_to_ucode.c
    +Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Threshold:             99%
    +Chosen for annotation:
    +Auto-annotation:       on
    +
    +--------------------------------------------------------------------------------
    +Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
    +--------------------------------------------------------------------------------
    +27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
    +
    +--------------------------------------------------------------------------------
    +Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
    +--------------------------------------------------------------------------------
    +8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
    +5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
    +2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
    +2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
    +2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
    +1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
    +  897,991   51   51   897,831    95    30        62      1      1  ???:???
    +  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
    +  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
    +  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
    +  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
    +  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
    +  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
    +  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
    +  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
    +  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
    +   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
    +   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
    +
    + +First up is a summary of the annotation options: + +
      +
    • I1 cache, D1 cache, L2 cache: cache configuration. So you know the + configuration with which these results were obtained.
    • + +

    • Command: the command line invocation of the program under + examination.
    • + +

    • Events recorded: event abbreviations are:

      +

        +
      • Ir : I cache reads (ie. instructions executed)
      • +
      • I1mr: I1 cache read misses
      • +
      • I2mr: L2 cache instruction read misses
      • +
      • Dr : D cache reads (ie. memory reads)
      • +
      • D1mr: D1 cache read misses
      • +
      • D2mr: L2 cache data read misses
      • +
      • Dw : D cache writes (ie. memory writes)
      • +
      • D1mw: D1 cache write misses
      • +
      • D2mw: L2 cache data write misses
      • +

      + Note that D1 total misses is given by D1mr + + D1mw, and that L2 total misses is given by + I2mr + D2mr + D2mw.

    • + +

    • Events shown: the events shown (a subset of events gathered). This can + be adjusted with the --show option.
    • + +

    • Event sort order: the sort order in which functions are shown. For + example, in this case the functions are sorted from highest + Ir counts to lowest. If two functions have identical + Ir counts, they will then be sorted by I1mr + counts, and so on. This order can be adjusted with the + --sort option.

      + + Note that this dictates the order the functions appear. It is not + the order in which the columns appear; that is dictated by the "events + shown" line (and can be changed with the --show option). +

    • + +

    • Threshold: vg_annotate by default omits functions that cause very low + numbers of misses to avoid drowning you in information. In this case, + vg_annotate shows summaries of the functions that account for 99% of the + Ir counts; Ir is chosen as the threshold event + since it is the primary sort event. The threshold can be adjusted with + the --threshold option.
    • + +

    • Chosen for annotation: names of files specified manually for annotation; + in this case none.
    • + +

    • Auto-annotation: whether auto-annotation was requested via the + --auto=yes option. In this case no.
    • +

    + +Then follows summary statistics for the whole program. These are similar +to the summary provided when running valgrind --cachesim=yes.

    + +Then follows function-by-function statistics. Each function is identified by a +file_name:function_name pair. If a column contains only a +`.' it means the function never performs that event (eg. the third row shows +that strcmp() contains no instructions that write to memory). The +name ??? is used if the file name and/or function name could +not be determined from debugging information. (If most of the entries have the +form ???:??? the program probably wasn't compiled with +-g.)

    + +It is worth noting that functions will come from three types of source files: +

      +
    1. From the profiled program (concord.c in this example).
    2. +
    3. From libraries (eg. getc.c)
    4. +
    5. From Valgrind's implementation of some libc functions (eg. + vg_clientmalloc.c:malloc). These are recognisable because + the filename begins with vg_, and is probably one of + vg_main.c, vg_clientmalloc.c or + vg_mylibc.c. +
    6. +
    + +There are two ways to annotate source files -- by choosing them manually, or +with the --auto=yes option. To do it manually, just +specify the filenames as arguments to vg_annotate. For example, the output from +running vg_annotate concord.c for our example produces the same +output as above followed by an annotated version of concord.c, a +section of which looks like: + +
    +--------------------------------------------------------------------------------
    +-- User-annotated source: concord.c
    +--------------------------------------------------------------------------------
    +Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
    +
    +[snip]
    +
    +        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
    +        3    1    1       .     .     .       1      0      0  {
    +        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
    +        .    .    .       .     .     .       .      .      .      Word_Info *data;
    +        1    0    0       .     .     .       1      1      1      int line = 1, i;
    +        .    .    .       .     .     .       .      .      .
    +        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
    +        .    .    .       .     .     .       .      .      .
    +    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
    +    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
    +        .    .    .       .     .     .       .      .      .
    +        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
    +        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
    +        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
    +        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
    +        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
    +        .    .    .       .     .     .       .      .      .      }
    +        .    .    .       .     .     .       .      .      .
    +  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
    +  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->word, data->line, table);
    +        .    .    .       .     .     .       .      .      .
    +        4    0    0       1     0     0       2      0      0      free(data);
    +        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
    +        3    0    0       2     0     0       .      .      .  }
    +
    + +(Although column widths are automatically minimised, a wide terminal is clearly +useful.)

    + +Each source file is clearly marked (User-annotated source) as +having been chosen manually for annotation. If the file was found in one of +the directories specified with the -I/--include +option, the directory and file are both given.

    + +Each line is annotated with its event counts. Events not applicable for a line +are represented by a `.'; this is useful for distinguishing between an event +which cannot happen, and one which can but did not.

    + +Sometimes only a small section of a source file is executed. To minimise +uninteresting output, Valgrind only shows annotated lines and lines within a +small distance of annotated lines. Gaps are marked with the line numbers so +you know which part of a file the shown code comes from, eg: + +

    +(figures and code for line 704)
    +-- line 704 ----------------------------------------
    +-- line 878 ----------------------------------------
    +(figures and code for line 878)
    +
    + +The amount of context to show around annotated lines is controlled by the +--context option.

    + +To get automatic annotation, run vg_annotate --auto=yes. +vg_annotate will automatically annotate every source file it can find that is +mentioned in the function-by-function summary. Therefore, the files chosen for +auto-annotation are affected by the --sort and +--threshold options. Each source file is clearly marked +(Auto-annotated source) as being chosen automatically. Any files +that could not be found are mentioned at the end of the output, eg: + +

    +--------------------------------------------------------------------------------
    +The following files chosen for auto-annotation could not be found:
    +--------------------------------------------------------------------------------
    +  getc.c
    +  ctype.c
    +  ../sysdeps/generic/lockfile.c
    +
    + +This is quite common for library files, since libraries are usually compiled +with debugging information, but the source files are often not present on a +system. If a file is chosen for annotation both manually and +automatically, it is marked as User-annotated source. + +Use the -I/--include option to tell Valgrind where to look for +source files if the filenames found from the debugging information aren't +specific enough. + +Beware that vg_annotate can take some time to digest large +cachegrind.out files, eg. 30 seconds or more. Also beware that +auto-annotation can produce a lot of output if your program is large! + + +

    7.8  Annotating assembler programs

    +Valgrind can annotate assembler programs too, or annotate the assembler +generated for your C program. Sometimes this is useful for understanding what +is really happening when an interesting line of C code is translated into +multiple instructions.

    + +To do this, you just need to assemble your .s files with +assembler-level debug information. gcc doesn't do this, but you can use GNU as +with the --gstabs option to generate object files with this +information, eg: + +

    as --gstabs foo.s
    + +You can then profile and annotate source files in the same way as for C/C++ +programs. + + +

    7.9  vg_annotate options

    +
      +
    • -h, --help
    • +

    • -v, --version

      + + Help and version, as usual.

    • + +
    • --sort=A,B,C [default: order in + cachegrind.out]

      + Specifies the events upon which the sorting of the function-by-function + entries will be based. Useful if you want to concentrate on eg. I cache + misses (--sort=I1mr,I2mr), or D cache misses + (--sort=D1mr,D2mr), or L2 misses + (--sort=D2mr,I2mr).

    • + +

    • --show=A,B,C [default: all, using order in + cachegrind.out]

      + Specifies which events to show (and the column order). Default is to use + all present in the cachegrind.out file (and use the order in + the file).

    • + +

    • --threshold=X [default: 99%]

      + Sets the threshold for the function-by-function summary. Functions are + shown that account for more than X% of all the primary sort events. If + auto-annotating, also affects which files are annotated.

    • + +

    • --auto=no [default]
      + --auto=yes

      + When enabled, automatically annotates every file that is mentioned in the + function-by-function summary that can be found. Also gives a list of + those that couldn't be found. + +

    • --context=N [default: 8]

      + Print N lines of context before and after each annotated line. Avoids + printing large sections of source files that were not executed. Use a + large number (eg. 10,000) to show all source lines. +

    • + +

    • -I=<dir>, --include=<dir> + [default: empty string]

      + Adds a directory to the list in which to search for files. Multiple + -I/--include options can be given to add multiple directories. +

    + + +

    7.10  Warnings

    +There are a couple of situations in which vg_annotate issues warnings. + +
      +
    • If a source file is more recent than the cachegrind.out + file. This is because the information in cachegrind.out is + only recorded with line numbers, so if the line numbers change at all in + the source (eg. lines added, deleted, swapped), any annotations will be + incorrect.

      + +

    • If information is recorded about line numbers past the end of a file. + This can be caused by the above problem, ie. shortening the source file + while using an old cachegrind.out file. If this happens, + the figures for the bogus lines are printed anyway (clearly marked as + bogus) in case they are important.
    • +

    + + +

    7.10  Things to watch out for

    +Some odd things that can occur during annotation: + +
      +
    • If annotating at the assembler level, you might see something like this: + +
      +      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
      +      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
      +      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
      +      .    .    .  .    .    .  .    .    .          .align 4,0x90
      +      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
      +      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
      +      
      + + How can the third instruction be executed twice when the others are + executed only once? As it turns out, it isn't. Here's a dump of the + executable, from objdump: + +
      +      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
      +      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
      +      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
      +      8048f32:       89 f6                   mov    %esi,%esi
      +      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
      +      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
      +      
      + + Notice the extra mov %esi,%esi instruction. Where did this + come from? The GNU assembler inserted it to serve as the two bytes of + padding needed to align the movl $.LnrB,%eax instruction on + a four-byte boundary, but pretended it didn't exist when adding debug + information. Thus when Valgrind reads the debug info it thinks that the + movl $0x1,0xffffffec(%ebp) instruction covers the address + range 0x8048f2b--0x8048f33 by itself, and attributes the counts for the + mov %esi,%esi to it.

      +

    • + +
    • + Inlined functions can cause strange results in the function-by-function + summary. If a function inline_me() is defined in + foo.h and inlined in the functions f1(), + f2() and f3() in bar.c, there will + not be a foo.h:inline_me() function entry. Instead, there + will be separate function entries for each inlining site, ie. + foo.h:f1(), foo.h:f2() and + foo.h:f3(). To find the total counts for + foo.h:inline_me(), add up the counts from each entry.

      + + The reason for this is that although the debug info output by gcc + indicates the switch from bar.c to foo.h, it + doesn't indicate the name of the function in foo.h, so + Valgrind keeps using the old one.

      + +

    • + Sometimes, the same filename might be represented with a relative name + and with an absolute name in different parts of the debug info, eg: + /home/user/proj/proj.h and ../proj.h. In this + case, if you use auto-annotation, the file will be annotated twice with + the counts split between the two.

      +

    • +
    + +Note: stabs is not an easy format to read. If you come across bizarre +annotations that look like they might be caused by a bug in the stabs reader, +please let us know. + + +

    7.11  Accuracy

    +Valgrind's cache profiling has a number of shortcomings: + +
      +
    • It doesn't account for kernel activity -- the effect of system calls on + the cache contents is ignored.
    • + +

    • It doesn't account for other process activity (although this is probably + desirable when considering a single program).
    • + +

    • It doesn't account for virtual-to-physical address mappings; hence the + entire simulation is not a true representation of what's happening in the + cache.
    • + +

    • It doesn't account for cache misses not visible at the instruction level, + eg. those arising from TLB misses, or speculative execution.
    • +

    + +Another thing worth noting is that results are very sensitive. Changing the +size of the valgrind.so file, the size of the program being +profiled, or even the length of its name can perturb the results. Variations +will be small, but don't expect perfectly repeatable results if your program +changes at all.

    + +While these factors mean you shouldn't trust the results to be super-accurate, +hopefully they should be close enough to be useful.

    + + +

    7.12  Todo

    +
      +
    • Use CPUID instruction to auto-identify cache configuration during + installation. This would save the user from having to know their cache + configuration and using vg_cachegen.
    • +

    • Program start-up/shut-down calls a lot of functions that aren't + interesting and just complicate the output. Would be nice to exclude + these somehow.
    • +

    +
    + diff --git a/coregrind/valgrind.in b/coregrind/valgrind.in index 7f18890796..b10f48795b 100755 --- a/coregrind/valgrind.in +++ b/coregrind/valgrind.in @@ -65,6 +65,8 @@ do --workaround-gcc296-bugs=yes) vgopts="$vgopts $arg"; shift;; --freelist-vol=*) vgopts="$vgopts $arg"; shift;; --suppressions=*) vgopts="$vgopts $arg"; shift;; + --cachesim=yes) vgopts="$vgopts $arg"; shift;; + --cachesim=no) vgopts="$vgopts $arg"; shift;; # options for debugging Valgrind --sanity-level=*) vgopts="$vgopts $arg"; shift;; --single-step=yes) vgopts="$vgopts $arg"; shift;; @@ -129,6 +131,7 @@ if [ $# = 0 ] || [ z"$dousage" = z1 ]; then echo " suppressions file " echo " --check-addrVs=no|yes experimental lighterweight checking? [yes]" echo " yes == Valgrind's original behaviour" + echo " --cachesim=no|yes do cache profiling? [no]" echo echo " options for debugging Valgrind itself are:" echo " --sanity-level= level of sanity checking to do [1]" diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h index b51ef059a8..08a1ab20de 100644 --- a/coregrind/vg_include.h +++ b/coregrind/vg_include.h @@ -234,6 +234,8 @@ extern Bool VG_(clo_optimise); extern Bool VG_(clo_instrument); /* DEBUG: clean up instrumented code? default: YES */ extern Bool VG_(clo_cleanup); +/* Cache simulation instrumentation? default: NO */ +extern Bool VG_(clo_cachesim); /* SMC write checks? default: SOME (1,2,4 byte movs to mem) */ extern Int VG_(clo_smc_check); /* DEBUG: print system calls? 
default: NO */ @@ -287,7 +289,7 @@ extern void VG_(shutdown_logging) ( void ); #define VGP_M_STACK 10 -#define VGP_M_CCS 20 /* == the # of elems in VGP_LIST */ +#define VGP_M_CCS 24 /* == the # of elems in VGP_LIST */ #define VGP_LIST \ VGP_PAIR(VgpRun=0, "running"), \ VGP_PAIR(VgpMalloc, "low-lev malloc/free"), \ @@ -307,6 +309,10 @@ extern void VG_(shutdown_logging) ( void ); VGP_PAIR(VgpAddToT, "add-to-transtab"), \ VGP_PAIR(VgpSARP, "set-addr-range-perms"), \ VGP_PAIR(VgpSyscall, "syscall wrapper"), \ + VGP_PAIR(VgpCacheInstrument, "cache instrument"), \ + VGP_PAIR(VgpCacheGetBBCC,"cache get BBCC"), \ + VGP_PAIR(VgpCacheSimulate, "cache simulate"), \ + VGP_PAIR(VgpCacheDump, "cache stats dump"), \ VGP_PAIR(VgpSpare1, "spare 1"), \ VGP_PAIR(VgpSpare2, "spare 2") @@ -718,8 +724,10 @@ extern void VG_(assert_fail) ( Char* expr, Char* file, Int line, Char* fn ) __attribute__ ((__noreturn__)); -/* Reading files. */ +/* Reading and writing files. */ extern Int VG_(open_read) ( Char* pathname ); +extern Int VG_(open_write) ( Char* pathname ); +extern Int VG_(create_and_write) ( Char* pathname ); extern void VG_(close) ( Int fd ); extern Int VG_(read) ( Int fd, void* buf, Int count); extern Int VG_(write) ( Int fd, void* buf, Int count); @@ -955,7 +963,9 @@ typedef UChar tag3:4; /* third operand tag */ UChar extra4b:4; /* Spare field, used by WIDEN for src -size, and by LEA2 for scale - (1,2,4 or 8) */ + (1,2,4 or 8), and by unconditional JMPs for + orig x86 instr size if --cachesim=yes */ + /* word 5 */ UChar cond; /* condition, for jumps */ @@ -1044,6 +1054,10 @@ extern Bool VG_(anyFlagUse) ( UInstr* u ); extern void VG_(ppUInstr) ( Int instrNo, UInstr* u ); extern void VG_(ppUCodeBlock) ( UCodeBlock* cb, Char* title ); +extern UCodeBlock* VG_(allocCodeBlock) ( void ); +extern void VG_(freeCodeBlock) ( UCodeBlock* cb ); +extern void VG_(copyUInstr) ( UCodeBlock* cb, UInstr* instr ); + extern Char* VG_(nameCondcode) ( Condcode cond ); extern Bool VG_(saneUInstr) ( Bool 
beforeRA, UInstr* u ); extern Bool VG_(saneUCodeBlock) ( UCodeBlock* cb ); @@ -1184,6 +1198,11 @@ extern void VG_(what_obj_and_fun_is_this) ( Addr a, Char* obj_buf, Int n_obj_buf, Char* fun_buf, Int n_fun_buf ); +extern Bool VG_(what_line_is_this) ( Addr a, + UChar* filename, Int n_filename, + UInt* lineno ); +extern Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a, + Char* fn_name, Int n_fn_name); extern void VG_(symtab_notify_munmap) ( Addr start, UInt length ); @@ -1590,6 +1609,20 @@ extern void VG_(helper_value_check0_fail); extern void VG_(signalreturn_bogusRA)( void ); extern void VG_(pthreadreturn_bogusRA)( void ); +/* --------------------------------------------------------------------- + Exports of vg_cachesim.c + ------------------------------------------------------------------ */ + +extern UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr); + +typedef struct _iCC iCC; +typedef struct _idCC idCC; + +extern void VG_(init_cachesim) ( void ); +extern void VG_(show_cachesim_results)( Int client_argc, Char** client_argv ); + +extern void VG_(cachesim_log_non_mem_instr)( iCC* cc ); +extern void VG_(cachesim_log_mem_instr) ( idCC* cc, Addr data_addr ); /* --------------------------------------------------------------------- The state of the simulated CPU. 
@@ -1724,7 +1757,8 @@ extern Int VGOFF_(handle_esp_assignment); /* :: Addr -> void */ extern Int VGOFF_(fpu_write_check); /* :: Addr -> Int -> void */ extern Int VGOFF_(fpu_read_check); /* :: Addr -> Int -> void */ - +extern Int VGOFF_(cachesim_log_non_mem_instr); +extern Int VGOFF_(cachesim_log_mem_instr); #endif /* ndef __VG_INCLUDE_H */ diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c index 5f7fe59c34..ee875e226b 100644 --- a/coregrind/vg_main.c +++ b/coregrind/vg_main.c @@ -103,7 +103,8 @@ Int VGOFF_(helperc_STOREV1) = INVALID_OFFSET; Int VGOFF_(handle_esp_assignment) = INVALID_OFFSET; Int VGOFF_(fpu_write_check) = INVALID_OFFSET; Int VGOFF_(fpu_read_check) = INVALID_OFFSET; - +Int VGOFF_(cachesim_log_non_mem_instr) = INVALID_OFFSET; +Int VGOFF_(cachesim_log_mem_instr) = INVALID_OFFSET; /* This is the actual defn of baseblock. */ UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS]; @@ -165,6 +166,13 @@ static void vg_init_baseBlock ( void ) /* 16 */ VGOFF_(sh_edi) = alloc_BaB(1); /* 17 */ VGOFF_(sh_eflags) = alloc_BaB(1); + /* 17a */ + VGOFF_(cachesim_log_non_mem_instr) + = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_non_mem_instr) ); + /* 17b */ + VGOFF_(cachesim_log_mem_instr) + = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_mem_instr) ); + /* 18 */ VGOFF_(helper_value_check4_fail) = alloc_BaB_1_set( (Addr) & VG_(helper_value_check4_fail) ); @@ -419,6 +427,7 @@ Bool VG_(clo_single_step); Bool VG_(clo_optimise); Bool VG_(clo_instrument); Bool VG_(clo_cleanup); +Bool VG_(clo_cachesim); Int VG_(clo_smc_check); Bool VG_(clo_trace_syscalls); Bool VG_(clo_trace_signals); @@ -745,6 +754,11 @@ static void process_cmd_line_options ( void ) else if (STREQ(argv[i], "--cleanup=no")) VG_(clo_cleanup) = False; + else if (STREQ(argv[i], "--cachesim=yes")) + VG_(clo_cachesim) = True; + else if (STREQ(argv[i], "--cachesim=no")) + VG_(clo_cachesim) = False; + else if (STREQ(argv[i], "--smc-check=none")) VG_(clo_smc_check) = VG_CLO_SMC_NONE; else if (STREQ(argv[i], "--smc-check=some")) 
@@ -821,6 +835,11 @@ static void process_cmd_line_options ( void ) VG_(clo_logfile_fd) = eventually_logfile_fd; + /* Don't do memory checking if simulating the cache. */ + if (VG_(clo_cachesim)) { + VG_(clo_instrument) = False; + } + if (VG_(clo_verbosity > 0)) VG_(message)(Vg_UserMsg, "valgrind-%s, a memory error detector for x86 GNU/Linux.", @@ -978,7 +997,7 @@ void VG_(main) ( void ) attach GDB in another shell. */ /* {extern unsigned int sleep(unsigned int seconds); sleep(10);} */ - if (VG_(clo_instrument)) { + if (VG_(clo_instrument) || VG_(clo_cachesim)) { VGP_PUSHCC(VgpInitAudit); VGM_(init_memory_audit)(); VGP_POPCC; @@ -1012,6 +1031,9 @@ void VG_(main) ( void ) VGM_(make_readable) ( (Addr)&VG_(clo_sloppy_malloc), 1 ); } + if (VG_(clo_cachesim)) + VG_(init_cachesim)(); + if (VG_(clo_verbosity) > 0) VG_(message)(Vg_UserMsg, ""); @@ -1039,6 +1061,9 @@ void VG_(main) ( void ) } VG_(running_on_simd_CPU) = False; + if (VG_(clo_cachesim)) + VG_(show_cachesim_results)(VG_(client_argc), VG_(client_argv)); + VG_(do_sanity_checks)( 1 /* root thread */, True /*include expensive checks*/ ); diff --git a/coregrind/vg_mylibc.c b/coregrind/vg_mylibc.c index 740b21e3d1..77dd153a2e 100644 --- a/coregrind/vg_mylibc.c +++ b/coregrind/vg_mylibc.c @@ -885,7 +885,49 @@ Int VG_(open_read) ( Char* pathname ) if (VG_(is_kerror)(fd)) fd = -1; return fd; } + +/* Returns -1 on failure. */ +static Int VG_(chmod_u_rw) ( Int fd ) +{ + Int res; + const int O_IRUSR_IWUSR = 000600; /* See /usr/include/cpio.h */ + res = vg_do_syscall2(__NR_fchmod, fd, O_IRUSR_IWUSR); + if (VG_(is_kerror)(res)) res = -1; + return res; +} + +/* Returns -1 on failure. 
*/ +Int VG_(create_and_write) ( Char* pathname ) +{ + Int fd; + + const int O_CR_AND_WR_ONLY = 0101; /* See /usr/include/bits/fcntl.h */ + fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_CR_AND_WR_ONLY, 0); + /* VG_(printf)("result = %d\n", fd); */ + if (VG_(is_kerror)(fd)) { + fd = -1; + } else { + VG_(chmod_u_rw)(fd); + if (VG_(is_kerror)(fd)) { + fd = -1; + } + } + return fd; +} +/* Returns -1 on failure. */ +Int VG_(open_write) ( Char* pathname ) +{ + Int fd; + + const int O_WRONLY_AND_TRUNC = 01001; /* See /usr/include/bits/fcntl.h */ + fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_WRONLY_AND_TRUNC, 0); + /* VG_(printf)("result = %d\n", fd); */ + if (VG_(is_kerror)(fd)) { + fd = -1; + } + return fd; +} void VG_(close) ( Int fd ) { diff --git a/coregrind/vg_symtab2.c b/coregrind/vg_symtab2.c index 74cada4e82..d9da8fff45 100644 --- a/coregrind/vg_symtab2.c +++ b/coregrind/vg_symtab2.c @@ -1,4 +1,3 @@ - /*--------------------------------------------------------------------*/ /*--- Management of symbols and debugging information. ---*/ /*--- vg_symtab2.c ---*/ @@ -50,6 +49,23 @@ /*--- Structs n stuff ---*/ /*------------------------------------------------------------*/ +/* Stabs entry types, from: + * The "stabs" debug format + * Menapace, Kingdon and MacKenzie + * Cygnus Support + */ +typedef enum { N_FUN = 36, /* Function start or end */ + N_STSYM = 38, /* Data segment file-scope variable */ + N_LCSYM = 40, /* BSS segment file-scope variable */ + N_RSYM = 64, /* Register variable */ + N_SLINE = 68, /* Source line number */ + N_SO = 100, /* Source file path and name */ + N_LSYM = 128, /* Stack variable or type */ + N_SOL = 132, /* Include file name */ + N_LBRAC = 192, /* Start of lexical block */ + N_RBRAC = 224 /* End of lexical block */ + } stab_types; + /* A structure to hold an ELF symbol (very crudely). 
*/ typedef struct { @@ -535,12 +551,7 @@ void addLineInfo ( SegInfo* si, { RiLoc loc; UInt size = end - start + 1; -# if 0 - if (size > 10000) - VG_(printf)( "line %4d: %p .. %p, in %s\n", - lineno, start, end, - &si->strtab[fnmoff] ); -# endif + /* Sanity ... */ if (size > 10000) return; @@ -577,8 +588,8 @@ void vg_read_lib_symbols ( SegInfo* si ) /* for the .stabs reader */ Int curr_filenmoff; Addr curr_fnbaseaddr; - Addr range_startAddr; - Int range_lineno; + Char* curr_file_name; + Int n_stab_entries; oimage = (Addr)NULL; if (VG_(clo_verbosity) > 1) @@ -880,25 +891,23 @@ void vg_read_lib_symbols ( SegInfo* si ) /* Ok. It all looks plausible. Go on and read debug data. stab kinds: 100 N_SO a source file name 68 N_SLINE a source line number - 36 N_FUN ? start of a function - - In this loop, we maintain a current file name, updated - as N_SOs appear, and a current function base address, - updated as N_FUNs appear. Based on that, address ranges - for N_SLINEs are calculated, and stuffed into the - line info table. - - N_SLINE indicates the start of a source line. Functions are - delimited by N_FUNS, at the start with a non-empty string and at - the end with an empty string. The latter facilitates detecting - where to close the last N_SLINE for a function. + 36 N_FUN start of a function + + In this loop, we maintain a current file name, updated as + N_SO/N_SOLs appear, and a current function base address, + updated as N_FUNs appear. Based on that, address ranges for + N_SLINEs are calculated, and stuffed into the line info table. + + Finding the instruction address range covered by an N_SLINE is + complicated; see the N_SLINE case below. 
*/ curr_filenmoff = addStr(si,"???"); curr_fnbaseaddr = (Addr)NULL; - range_startAddr = 0; - range_lineno = 0; + curr_file_name = (Char*)NULL; - for (i = 0; i < stab_sz/(int)sizeof(struct nlist); i++) { + n_stab_entries = stab_sz/(int)sizeof(struct nlist); + + for (i = 0; i < n_stab_entries; i++) { # if 0 VG_(printf) ( " %2d ", i ); VG_(printf) ( "type=0x%x othr=%d desc=%d value=0x%x strx=%d %s", @@ -910,61 +919,95 @@ void vg_read_lib_symbols ( SegInfo* si ) # endif switch (stab[i].n_type) { - - case 68: { /* N_SLINE */ - /* flush the current line, if any, and start a new one */ - Addr range_endAddr - = curr_fnbaseaddr - + (UInt)stab[i].n_value - 1; - if (range_startAddr != 0) { - addLineInfo ( si, - curr_filenmoff, - range_startAddr, - range_endAddr, - range_lineno ); + UInt next_addr; + + /* To compute the instr address range covered by a single line, find + * the address of the next thing and compute the difference. The + * approach used depends on what kind of entry/entries follow... */ + case N_SLINE: { + Int lineno = stab[i].n_desc; + Int this_addr = (UInt)stab[i].n_value; + + LOOP: + vg_assert(i+1 < n_stab_entries); /* Haven't reached end */ + switch (stab[i+1].n_type) { + /* Easy, common case: use address of next entry */ + case N_SLINE: case N_SO: + next_addr = (UInt)stab[i+1].n_value; + break; + + /* Boring one: skip, look for something more useful. 
*/ + case N_RSYM: case N_LSYM: case N_LBRAC: case N_RBRAC: + case N_STSYM: case N_LCSYM: + i++; + goto LOOP; + + /* Should be an end of fun entry, use its address */ + case N_FUN: + if ('\0' == * (stabstr + stab[i+1].n_un.n_strx) ) { + next_addr = (UInt)stab[i+1].n_value; + } else { + VG_(printf)("unhandled stabs case: N_FUN start %d %s\n", + i, (stabstr + stab[i+1].n_un.n_strx) ); + VG_(panic)("argh"); + } + break; + + /* N_SOL should be followed by an N_SLINE which can be used */ + case N_SOL: + if (i+2 < n_stab_entries && N_SLINE == stab[i+2].n_type) { + next_addr = (UInt)stab[i+2].n_value; + break; + } else { + VG_(printf)("unhandled N_SOL stabs case: %d %d %d", + stab[i+1].n_type, i, n_stab_entries); + VG_(panic)("argh"); + } + + default: + VG_(printf)("unhandled stabs case: %d %d", + stab[i+1].n_type,i); + VG_(panic)("argh"); } - range_startAddr = range_endAddr + 1; - range_lineno = stab[i].n_desc; + + //Int offset2 = (i+1 < n_stab_entries && 68 == stab[i+1].n_type + // ? (UInt)stab[i+1].n_value - 1 + // : offset + 1); + //if (i+1 < n_stab_entries) { + // int x; + // if (68 != (x = stab[i+1].n_type)) { + // VG_(printf)("%d ", x); + // } + //} + + addLineInfo ( si, curr_filenmoff, curr_fnbaseaddr + this_addr, + curr_fnbaseaddr + next_addr - 1, lineno ); break; } - case 36: { /* N_FUN */ - if ('\0' == * (stabstr + stab[i].n_un.n_strx) ) { - /* N_FUN with no name -- indicates the end of a fn. - Flush the current line, if any, but don't start a - new one. */ - Addr range_endAddr - = curr_fnbaseaddr - + (UInt)stab[i].n_value - 1; - if (range_startAddr != 0) { - addLineInfo ( si, - curr_filenmoff, - range_startAddr, - range_endAddr, - range_lineno ); - } - range_startAddr = 0; - } else { + case N_FUN: { + if ('\0' != (stabstr + stab[i].n_un.n_strx)[0] ) { /* N_FUN with a name -- indicates the start of a fn. 
*/ - curr_fnbaseaddr = si->offset - + (Addr)stab[i].n_value; - range_startAddr = curr_fnbaseaddr; + curr_fnbaseaddr = si->offset + (Addr)stab[i].n_value; } break; } - case 100: /* N_SO */ - case 132: /* N_SOL */ + case N_SO: case N_SOL: /* seems to give lots of locations in header files */ /* case 130: */ /* BINCL */ { UChar* nm = stabstr + stab[i].n_un.n_strx; UInt len = VG_(strlen)(nm); - if (len > 0 && nm[len-1] != '/') + + if (len > 0 && nm[len-1] != '/') { curr_filenmoff = addStr ( si, nm ); + curr_file_name = stabstr + stab[i].n_un.n_strx; + } else if (len == 0) curr_filenmoff = addStr ( si, "?1\0" ); + break; } @@ -1071,8 +1114,8 @@ void read_symtab_callback ( which happen to correspond to the munmap()d area. */ void VG_(read_symbols) ( void ) { - if (! VG_(clo_instrument)) - return; + if (! VG_(clo_instrument) && ! VG_(clo_cachesim)) + return; VG_(read_procselfmaps) ( read_symtab_callback ); @@ -1251,9 +1294,8 @@ static void search_all_loctabs ( Addr ptr, SegInfo** psi, Int* locno ) Caller supplies buf and nbuf. If no_demangle is True, don't do demangling, regardless of vg_clo_demangle -- probably because the call has come from vg_what_fn_or_object_is_this. */ -static -Bool vg_what_fn_is_this ( Bool no_demangle, Addr a, - Char* buf, Int nbuf ) +Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a, + Char* buf, Int nbuf ) { SegInfo* si; Int sno; @@ -1297,17 +1339,16 @@ void VG_(what_obj_and_fun_is_this) ( Addr a, Char* fun_buf, Int n_fun_buf ) { (void)vg_what_object_is_this ( a, obj_buf, n_obj_buf ); - (void)vg_what_fn_is_this ( True, a, fun_buf, n_fun_buf ); + (void)VG_(what_fn_is_this) ( True, a, fun_buf, n_fun_buf ); } /* Map a code address to a (filename, line number) pair. Returns True if successful. 
*/ -static -Bool vg_what_line_is_this ( Addr a, - UChar* filename, Int n_filename, - UInt* lineno ) +Bool VG_(what_line_is_this)( Addr a, + UChar* filename, Int n_filename, + UInt* lineno ) { SegInfo* si; Int locno; @@ -1317,6 +1358,7 @@ Bool vg_what_line_is_this ( Addr a, VG_(strncpy_safely)(filename, & si->strtab[si->loctab[locno].fnmoff], n_filename); *lineno = si->loctab[locno].lineno; + return True; } @@ -1348,11 +1390,11 @@ void VG_(mini_stack_dump) ( ExeContext* ec ) n = 0; - know_fnname = vg_what_fn_is_this(False,ec->eips[0], buf_fn, M_VG_ERRTXT); + know_fnname = VG_(what_fn_is_this)(False,ec->eips[0], buf_fn, M_VG_ERRTXT); know_objname = vg_what_object_is_this(ec->eips[0], buf_obj, M_VG_ERRTXT); - know_srcloc = vg_what_line_is_this(ec->eips[0], - buf_srcloc, M_VG_ERRTXT, - &lineno); + know_srcloc = VG_(what_line_is_this)(ec->eips[0], + buf_srcloc, M_VG_ERRTXT, + &lineno); APPEND(" at "); VG_(sprintf)(ibuf,"0x%x: ", ec->eips[0]); @@ -1383,9 +1425,9 @@ void VG_(mini_stack_dump) ( ExeContext* ec ) clueless = 0; for (i = 1; i < stop_at; i++) { - know_fnname = vg_what_fn_is_this(False,ec->eips[i], buf_fn, M_VG_ERRTXT); + know_fnname = VG_(what_fn_is_this)(False,ec->eips[i], buf_fn, M_VG_ERRTXT); know_objname = vg_what_object_is_this(ec->eips[i],buf_obj, M_VG_ERRTXT); - know_srcloc = vg_what_line_is_this(ec->eips[i], + know_srcloc = VG_(what_line_is_this)(ec->eips[i], buf_srcloc, M_VG_ERRTXT, &lineno); n = 0; diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c index f06689d956..691ef26d67 100644 --- a/coregrind/vg_to_ucode.c +++ b/coregrind/vg_to_ucode.c @@ -4437,43 +4437,76 @@ Int VG_(disBB) ( UCodeBlock* cb, Addr eip0 ) Addr eip = eip0; Bool isEnd = False; Bool block_sane; + Int INCEIP_allowed_lag = 4; + Int delta = 0; + if (dis) VG_(printf)("\n"); + /* When cache simulating, to ensure cache misses are attributed to the + * correct line we ensure EIP is always correct. 
This is done by: + * + * a) Using eager INCEIP updating to cope with all instructions except those + * at the end of a basic block. + * + * b) Patching in the size of the original x86 instr in the `extra4b' field + * of JMPs at the end of a basic block. Two cases: + * - Jcond followed by Juncond: patch the Jcond + * - Juncond alone: patch the Juncond + * + * See vg_cachesim_instrument() for how this is used. + */ + if (VG_(clo_cachesim)) { + INCEIP_allowed_lag = 0; + } + if (VG_(clo_single_step)) { eip = disInstr ( cb, eip, &isEnd ); - uInstr1(cb, JMP, 0, Literal, 0); - uLiteral(cb, eip); - uCond(cb, CondAlways); - if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); + + /* Add a JMP to the next (single x86 instruction) BB if it doesn't + * already end with a JMP instr. We also need to check for no UCode, + * which occurs if the x86 instr was a nop */ + if (cb->used == 0 || LAST_UINSTR(cb).opcode != JMP) { + uInstr1(cb, JMP, 0, Literal, 0); + uLiteral(cb, eip); + uCond(cb, CondAlways); + if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); + } + delta = eip - eip0; + } else { - Int delta = 0; Addr eip2; - while (True) { - if (isEnd) break; + while (!isEnd) { eip2 = disInstr ( cb, eip, &isEnd ); delta += (eip2 - eip); eip = eip2; - if (delta > 4 && !isEnd) { - uInstr1(cb, INCEIP, 0, Lit16, delta); - if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); - delta = 0; - } /* Split up giant basic blocks into pieces, so the translations fall within 64k. 
*/ - if (eip - eip0 > 2000) { + if (eip - eip0 > 2000 && !isEnd) { if (VG_(clo_verbosity) > 0) - VG_(message)(Vg_DebugMsg, + VG_(message)(Vg_DebugMsg, "Warning: splitting giant basic block into pieces"); uInstr1(cb, JMP, 0, Literal, 0); uLiteral(cb, eip); uCond(cb, CondAlways); if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); - if (dis) VG_(printf)("\n"); - break; + isEnd = True; + + } else if (delta > INCEIP_allowed_lag && !isEnd) { + uInstr1(cb, INCEIP, 0, Lit16, delta); + if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); + delta = 0; } if (dis) VG_(printf)("\n"); } } + if (VG_(clo_cachesim)) { + /* Patch instruction size into earliest JMP. */ + if (cb->used >= 2 && JMP == cb->instrs[cb->used - 2].opcode) { + cb->instrs[cb->used - 2].extra4b = delta; + } else { + LAST_UINSTR(cb).extra4b = delta; + } + } block_sane = VG_(saneUCodeBlock)(cb); if (!block_sane) { diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c index 73cad5e9bc..76c6ef8a9b 100644 --- a/coregrind/vg_translate.c +++ b/coregrind/vg_translate.c @@ -104,7 +104,7 @@ void VG_(jitfree) ( void* ptr ) /*--- Basics ---*/ /*------------------------------------------------------------*/ -static UCodeBlock* allocCodeBlock ( void ) +UCodeBlock* VG_(allocCodeBlock) ( void ) { UCodeBlock* cb = VG_(malloc)(VG_AR_PRIVATE, sizeof(UCodeBlock)); cb->used = cb->size = cb->nextTemp = 0; @@ -113,7 +113,7 @@ static UCodeBlock* allocCodeBlock ( void ) } -static void freeCodeBlock ( UCodeBlock* cb ) +void VG_(freeCodeBlock) ( UCodeBlock* cb ) { if (cb->instrs) VG_(free)(VG_AR_PRIVATE, cb->instrs); VG_(free)(VG_AR_PRIVATE, cb); @@ -239,17 +239,15 @@ void VG_(newUInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz ) ui->size = sz; } - /* Copy an instruction into the given codeblock. 
*/ -static __inline__ -void copyUInstr ( UCodeBlock* cb, UInstr* instr ) +__inline__ +void VG_(copyUInstr) ( UCodeBlock* cb, UInstr* instr ) { ensureUInstr(cb); cb->instrs[cb->used] = *instr; cb->used++; } - /* Copy auxiliary info from one uinstr to another. */ static __inline__ void copyAuxInfoFromTo ( UInstr* src, UInstr* dst ) @@ -1654,7 +1652,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 ) /* Resulting code goes here. We generate it all in a forwards pass. */ - c2 = allocCodeBlock(); + c2 = VG_(allocCodeBlock)(); /* At the start, no TempRegs are assigned to any real register. Correspondingly, all temps claim to be currently resident in @@ -1841,7 +1839,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 ) for (j = 0; j < k; j++) tempUse[j].realNo = VG_(rankToRealRegNo)(temp_info[tempUse[j].tempNo].real_no); - copyUInstr(c2, &c1->instrs[i]); + VG_(copyUInstr)(c2, &c1->instrs[i]); patchUInstr(&LAST_UINSTR(c2), &tempUse[0], k); if (VG_(disassemble)) { @@ -1853,7 +1851,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 ) if (temp_info != NULL) VG_(jitfree)(temp_info); - freeCodeBlock(c1); + VG_(freeCodeBlock)(c1); if (spill_reqd) VG_(translations_needing_spill)++; @@ -2099,7 +2097,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) Int i, j; UInstr* u_in; Int qs, qd, qt, qtt; - cb = allocCodeBlock(); + cb = VG_(allocCodeBlock)(); cb->nextTemp = cb_in->nextTemp; for (i = 0; i < cb_in->used; i++) { @@ -2115,7 +2113,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) break; case INCEIP: - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Loads and stores. Test the V bits for the address. 
24 @@ -2136,7 +2134,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) uInstr2(cb, LOADV, u_in->size, TempReg, u_in->val1, TempReg, SHADOW(u_in->val2)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case STORE: if (VG_(clo_check_addrVs)) { @@ -2146,7 +2144,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) uInstr2(cb, STOREV, u_in->size, TempReg, SHADOW(u_in->val1), TempReg, u_in->val2); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Moving stuff around. Make the V bits follow accordingly, @@ -2156,24 +2154,24 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) uInstr2(cb, GETV, u_in->size, ArchReg, u_in->val1, TempReg, SHADOW(u_in->val2)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case PUT: uInstr2(cb, PUTV, u_in->size, TempReg, SHADOW(u_in->val1), ArchReg, u_in->val2); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case GETF: /* This is not the smartest way to do it, but should work. */ qd = create_GETVF(cb, u_in->size); uInstr2(cb, MOV, 4, TempReg, qd, TempReg, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case PUTF: create_PUTVF(cb, u_in->size, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case MOV: @@ -2190,7 +2188,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) default: VG_(panic)("vg_instrument: MOV"); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Special case of add, where one of the operands is a literal. @@ -2203,7 +2201,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) qd = SHADOW(u_in->val2); uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qd); create_Left(cb, u_in->size, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Another form of add. 
@@ -2234,7 +2232,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) } create_UifU(cb, 4, qs, qd); create_Left(cb, u_in->size, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; } @@ -2244,7 +2242,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) create_Left(cb, u_in->size, qd); if (u_in->flags_w != FlagsEmpty) create_PUTVF(cb, u_in->size, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* This is a HACK (approximation :-) */ @@ -2279,7 +2277,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) create_UifU(cb, 0, qs, qd); create_PUTVF(cb, 0, qd); create_PCast(cb, 0, u_in->size, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* for OP in shl shr sar rol ror @@ -2329,7 +2327,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qt); create_PCast(cb, 1, u_in->size, qt); create_UifU(cb, u_in->size, qt, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; } @@ -2338,20 +2336,20 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) vg_assert(u_in->tag1 == TempReg); create_Widen(cb, u_in->signed_widen, u_in->extra4b, u_in->size, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* not#(x) = x (since bitwise independent) */ case NOT: vg_assert(u_in->tag1 == TempReg); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* neg#(x) = left(x) (derivable from case for SUB) */ case NEG: vg_assert(u_in->tag1 == TempReg); create_Left(cb, u_in->size, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* bswap#(x) = bswap(x) */ @@ -2360,7 +2358,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) vg_assert(u_in->size == 4); qd = SHADOW(u_in->val1); uInstr1(cb, BSWAP, 4, TempReg, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* cc2val#(qd) = pcast-0-to-size(eflags#) */ @@ -2369,7 +2367,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) 
vg_assert(u_in->flags_r != FlagsEmpty); qt = create_GETVF(cb, u_in->size); uInstr2(cb, MOV, 4, TempReg, qt, TempReg, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* cmov#(qs,qd) = cmov(qs,qd) @@ -2395,7 +2393,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) LAST_UINSTR(cb).cond = u_in->cond; LAST_UINSTR(cb).flags_r = u_in->flags_r; - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* add#/sub#(qs,qd) @@ -2421,7 +2419,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) if (u_in->flags_w != FlagsEmpty) { create_PUTVF(cb, u_in->size, qd); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* xor#(qs,qd) = qs `UifU` qd */ @@ -2432,7 +2430,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) if (u_in->flags_w != FlagsEmpty) { create_PUTVF(cb, u_in->size, qd); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* and#/or#(qs,qd) @@ -2476,7 +2474,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) if (u_in->flags_w != FlagsEmpty) { create_PUTVF(cb, u_in->size, qd); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Machinery to do with supporting CALLM. Copy the start and @@ -2484,7 +2482,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) (debug); they generate no code and have no effect. */ case CALLM_S: case CALLM_E: - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Copy PUSH and POP verbatim. Arg/result absval @@ -2493,7 +2491,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) needs to be copied. */ case PUSH: case POP: case CLEAR: - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* In short: @@ -2562,7 +2560,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) ... */ uInstr1(cb, SETV, 0, TempReg, qt); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; } /* Whew ... */ @@ -2582,13 +2580,13 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) ... 
*/ uInstr1(cb, SETV, 0, TempReg, qt); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case JIFZ: uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1)); uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Emit a check on the address used. For FPU_R, the value @@ -2598,12 +2596,12 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) vg_assert(u_in->tag2 == TempReg); uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val2)); uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val2)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* For FPU insns not referencing memory, just copy thru. */ case FPU: - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; default: @@ -2614,7 +2612,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) } /* end of for loop */ - freeCodeBlock(cb_in); + VG_(freeCodeBlock)(cb_in); return cb; } @@ -3083,7 +3081,7 @@ void VG_(translate) ( ThreadState* tst, VG_(overall_in_osize), VG_(overall_in_tsize), orig_addr ); } - cb = allocCodeBlock(); + cb = VG_(allocCodeBlock)(); /* Disassemble this basic block into cb. */ VGP_PUSHCC(VgpToUCode); @@ -3117,6 +3115,19 @@ void VG_(translate) ( ThreadState* tst, } } + //VG_(disassemble) = True; + + /* Add cache simulation code. */ + if (VG_(clo_cachesim)) { + VGP_PUSHCC(VgpCacheInstrument); + cb = VG_(cachesim_instrument)(cb, orig_addr); + VGP_POPCC; + if (VG_(disassemble)) + VG_(ppUCodeBlock) ( cb, "Cachesim instrumented code:" ); + } + + //VG_(disassemble) = False; + /* Allocate registers. */ VGP_PUSHCC(VgpRegAlloc); cb = vg_do_register_allocation ( cb ); @@ -3132,7 +3143,7 @@ void VG_(translate) ( ThreadState* tst, and so must be VG_(jitfree)'d. */ final_code = VG_(emit_code)(cb, &final_code_size ); VGP_POPCC; - freeCodeBlock(cb); + VG_(freeCodeBlock)(cb); if (debugging_translation) { /* Only done for debugging -- throw away final result. 
*/ diff --git a/docs/manual.html b/docs/manual.html index c441db1620..a97c2f9fec 100644 --- a/docs/manual.html +++ b/docs/manual.html @@ -78,7 +78,9 @@ Linux-x86 executables.

    An example

    -

    The design and implementation of Valgrind

    +

    Cache profiling

    + +

    The design and implementation of Valgrind


    @@ -515,6 +517,11 @@ follows: buggy, so you may need to issue this flag if you use 3.0.4.

    +

  • --cachesim=no [default]
    + --cachesim=yes +

    When enabled, turns off memory checking, and turns on cache profiling. + Cache profiling is described in detail in Section 7. +

  • There are also some options for debugging Valgrind itself. You @@ -1763,5 +1770,632 @@ sewardj@phoenix:~/newmat10$

    The GCC folks fixed this about a week before gcc-3.0 shipped.


    + + + + +

    7  Cache profiling

    +As well as memory debugging, Valgrind also allows you to do cache simulations +and annotate your source line-by-line with the number of cache misses. In +particular, it records: +
      +
    • L1 instruction cache reads and misses; +
    • L1 data cache reads and read misses, writes and write misses; +
    • L2 unified cache reads and read misses, writes and write misses. +
    +On a modern x86 machine, an L1 miss will typically cost around 10 cycles, +and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be +very useful for improving the performance of your program. + +Please note that this is an experimental feature. Any feedback, bug-fixes, +suggestions, etc, welcome. + + +

    7.1  Overview

    +First off, as for normal Valgrind use, you probably want to turn on debugging +info (the -g flag). But by contrast with normal Valgrind use, you +probably do want to turn optimisation on, since you should profile your +program as it will be normally run. + +The three steps are: +
      +
    1. Generate a cache simulator for your machine's cache configuration with + `vg_cachegen' and recompile Valgrind with make install. + Valgrind comes with a default simulator, but it is unlikely to be correct + for your system, so you should generate a simulator yourself.
    2. +
    3. Run your program with valgrind --cachesim=yes in front of + the normal command line invocation. When the program finishes, Valgrind + will print summary cache statistics. It also collects line-by-line + information in a file cachegrind.out.
    4. +
    5. Generate a function-by-function summary, and possibly annotate source + files with 'vg_annotate'. Source files to annotate can be specified + manually on the command line, or "interesting" source files + can be annotated automatically with the --auto=yes option. + You can annotate C/C++ files or assembly language files equally + easily.
    6. +
    + +Step 1 only needs to be done once, unless you are +interested in simulating different cache configurations (eg. first +concentrating on instruction cache misses, then on data cache misses).

    + +Step 2 should be done every time you want to collect +information about a new program, a changed program, or about the same program +with different input.

    + +Step 3 can be performed as many times as you like for +each Step 2; you may want to do multiple annotations showing different +information each time.

    + +The steps are described in detail in the following sections.

    + + + +

    7.3  Generating a cache simulator

    +Although Valgrind comes with a pre-generated cache simulator, it most likely +won't match the cache configuration of your machine, so you should generate +a new simulator.

    + +You need to generate three files, one for each of the I1, D1 and L2 caches. +For each cache, you need to know the: +

      +
    • Cache size (bytes); +
    • Line size (bytes); +
    • Associativity. +
    + +vg_cachegen takes three options: +
      +
    • --I1=size,line_size,associativity +
    • --D1=size,line_size,associativity +
    • --L2=size,line_size,associativity +
    + +You can specify one, two or all three caches per invocation of vg_cachegen. It +checks that the configuration is sensible before generating the simulators; to +see the allowed values, run vg_cachegen -h.

    + +An example invocation would be: + +

    + vg_cachegen --I1=65536,64,2 --D1=65536,64,2 --L2=262144,64,8 +
    + +This simulates a machine with a 128KB split L1 2-way associative cache, and a +256KB unified 8-way associative L2 cache. Both caches have 64B lines.

    + +If you don't know your cache configuration, you'll have to find it out. +(Ideally vg_cachegen could auto-identify your cache configuration using the +CPUID instruction, which could be done automatically during installation, and +this whole step could be skipped...)

    + + +

    7.4  Cache simulation specifics

    +vg_cachegen only generates simulations for a machine with a split L1 cache and +a unified L2 cache. This configuration is used for all x86-based machines we +are aware of.

    + +The more specific characteristics of the simulation are as follows. + +

      +
    • Write-allocate: when a write miss occurs, the block written to is brought + into the D1 cache. Most modern caches have this property.
    • + +

    • Bit-selection hash function: the line(s) in the cache to which a memory + block maps is chosen by the middle bits M--(M+N-1) of the byte address, + where: +
        +
      •  line size = 2^M bytes 
      • +
      • (cache size / line size) = 2^N lines
      • +
    • + +

    • Inclusive L2 cache: the L2 cache replicates all the entries of the L1 + cache. This is standard on Pentium chips, but AMD Athlons use an + exclusive L2 cache that only holds blocks evicted from L1.
    • +

    + +Other noteworthy behaviour: + +
      +
    • References that straddle two cache lines are treated as follows:
    • +
        +
      • If both blocks hit --> counted as one hit
      • +
      • If one block hits, the other misses --> counted as one miss
      • +
      • If both blocks miss --> counted as one miss (not two)
      • +

      + +

    • Instructions that modify a memory location (eg. inc and + dec) are counted as doing just a read, ie. a single data + reference. This may seem strange, but since the write can never cause a + miss (the read guarantees the block is in the cache) it's not very + interesting.

      + + Thus it measures not the number of times the data cache is accessed, but + the number of times a data cache miss could occur.

      +

    • +
    + +If you are interested in simulating a cache with different properties, it is +not particularly hard to write your own cache simulator, or to modify existing +ones in vg_cachesim_I1.c, vg_cachesim_D1.c and +vg_cachesim_L2.c. We'd be interested to hear from anyone who +does. + + + +

    7.5  Profiling programs

    +Cache profiling is enabled by using the --cachesim=yes option to +Valgrind. This automatically turns off Valgrind's memory checking functions, +since the cache simulation is slow enough already, and you probably don't want +to do both at once.

    + +To gather cache profiling information about the program ls -l, type: valgrind --cachesim=yes ls -l + +The program will execute (slowly). Upon completion, summary statistics +that look like this will be printed: + +

    +==31751== I   refs:      27,742,716
    +==31751== I1  misses:           276
    +==31751== L2  misses:           275
    +==31751== I1  miss rate:        0.0%
    +==31751== L2i miss rate:        0.0%
    +==31751== 
    +==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
    +==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
    +==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
    +==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
    +==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
    +==31751== 
    +==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
    +==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
    +
    + +Cache accesses for instruction fetches are summarised first, giving the +number of fetches made (this is the number of instructions executed, which +can be useful to know in its own right), the number of I1 misses, and the +number of L2 instruction (L2i) misses.

    + +Cache accesses for data follow. The information is similar to that of the +instruction fetches, except that the values are also shown split between reads +and writes (note each row's rd and wr values add up +to the row's total).

    + +Combined instruction and data figures for the L2 cache follow that.

    + + +

    7.6  Output file

    +As well as printing summary information, Valgrind also writes line-by-line +cache profiling information to a file named cachegrind.out . This +file is human-readable, but is best interpreted by the accompanying program +vg_annotate, described in the next section.

    + +Things to note about the cachegrind.out file: +

      +
    • It is written every time valgrind --cachesim=yes is run; it + will automatically overwrite any existing cachegrind.out in + the current directory.
    • +
    • It can be quite large: ls -l generates a file of about + 350KB; browsing a few files and web pages with Konqueror generates a file + of around 10MB.
    • +
    + + + +

    7.7  Annotating C/C++ programs

    +Before using vg_annotate, it is worth widening your window to be at least +120-characters wide if possible, as the output lines can be quite long.

    + +To get a function-by-function summary, run vg_annotate in a +directory containing a cachegrind.out file. The output looks like +this: + +

    +--------------------------------------------------------------------------------
    +I1 cache:              65536 B, 64 B, 2-way associative
    +D1 cache:              65536 B, 64 B, 2-way associative
    +L2 cache:              262144 B, 64 B, 8-way associative
    +Command:               concord vg_to_ucode.c
    +Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Threshold:             99%
    +Chosen for annotation:
    +Auto-annotation:       on
    +
    +--------------------------------------------------------------------------------
    +Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
    +--------------------------------------------------------------------------------
    +27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
    +
    +--------------------------------------------------------------------------------
    +Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
    +--------------------------------------------------------------------------------
    +8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
    +5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
    +2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
    +2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
    +2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
    +1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
    +  897,991   51   51   897,831    95    30        62      1      1  ???:???
    +  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
    +  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
    +  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
    +  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
    +  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
    +  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
    +  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
    +  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
    +  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
    +   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
    +   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
    +
    + +First up is a summary of the annotation options: + +
      +
    • I1 cache, D1 cache, L2 cache: cache configuration. So you know the + configuration with which these results were obtained.
    • + +

    • Command: the command line invocation of the program under + examination.
    • + +

    • Events recorded: event abbreviations are:

      +

        +
      • Ir : I cache reads (ie. instructions executed)
      • +
      • I1mr: I1 cache read misses
      • +
      • I2mr: L2 cache instruction read misses
      • +
      • Dr : D cache reads (ie. memory reads)
      • +
      • D1mr: D1 cache read misses
      • +
      • D2mr: L2 cache data read misses
      • +
      • Dw : D cache writes (ie. memory writes)
      • +
      • D1mw: D1 cache write misses
      • +
      • D2mw: L2 cache data write misses
      • +

      + Note that D1 total misses is given by D1mr + + D1mw, and that L2 total misses is given by + I2mr + D2mr + D2mw.

    • + +

    • Events shown: the events shown (a subset of events gathered). This can + be adjusted with the --show option.
    • + +

    • Event sort order: the sort order in which functions are shown. For + example, in this case the functions are sorted from highest + Ir counts to lowest. If two functions have identical + Ir counts, they will then be sorted by I1mr + counts, and so on. This order can be adjusted with the + --sort option.

      + + Note that this dictates the order the functions appear. It is not + the order in which the columns appear; that is dictated by the "events + shown" line (and can be changed with the --show option). +

    • + +

    • Threshold: vg_annotate by default omits functions that cause very low + numbers of misses to avoid drowning you in information. In this case, + vg_annotate shows summaries of the functions that account for 99% of the + Ir counts; Ir is chosen as the threshold event + since it is the primary sort event. The threshold can be adjusted with + the --threshold option.
    • + +

    • Chosen for annotation: names of files specified manually for annotation; + in this case none.
    • + +

    • Auto-annotation: whether auto-annotation was requested via the + --auto=yes option. In this case no.
    • +

    + +Then follows summary statistics for the whole program. These are similar +to the summary provided when running valgrind --cachesim=yes.

    + +Then follow function-by-function statistics. Each function is identified by a +file_name:function_name pair. If a column contains only a +`.' it means the function never performs that event (eg. the third row shows +that strcmp() contains no instructions that write to memory). The +name ??? is used if the file name and/or function name could +not be determined from debugging information. (If most of the entries have the +form ???:??? the program probably wasn't compiled with +-g.)

    + +It is worth noting that functions will come from three types of source files: +

      +
    1. From the profiled program (concord.c in this example).
    2. +
    3. From libraries (eg. getc.c)
    4. +
    5. From Valgrind's implementation of some libc functions (eg. + vg_clientmalloc.c:malloc). These are recognisable because + the filename begins with vg_, and is probably one of + vg_main.c, vg_clientmalloc.c or + vg_mylibc.c. +
    6. +
    + +There are two ways to annotate source files -- by choosing them manually, or +with the --auto=yes option. To do it manually, just +specify the filenames as arguments to vg_annotate. For example, the output from +running vg_annotate concord.c for our example produces the same +output as above followed by an annotated version of concord.c, a +section of which looks like: + +
    +--------------------------------------------------------------------------------
    +-- User-annotated source: concord.c
    +--------------------------------------------------------------------------------
    +Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
    +
    +[snip]
    +
    +        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
    +        3    1    1       .     .     .       1      0      0  {
    +        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
    +        .    .    .       .     .     .       .      .      .      Word_Info *data;
    +        1    0    0       .     .     .       1      1      1      int line = 1, i;
    +        .    .    .       .     .     .       .      .      .
    +        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
    +        .    .    .       .     .     .       .      .      .
    +    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
    +    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
    +        .    .    .       .     .     .       .      .      .
    +        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
    +        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
    +        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
    +        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
    +        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
    +        .    .    .       .     .     .       .      .      .      }
    +        .    .    .       .     .     .       .      .      .
    +  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
    +  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->word, data->line, table);
    +        .    .    .       .     .     .       .      .      .
    +        4    0    0       1     0     0       2      0      0      free(data);
    +        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
    +        3    0    0       2     0     0       .      .      .  }
    +
    + +(Although column widths are automatically minimised, a wide terminal is clearly +useful.)

    + +Each source file is clearly marked (User-annotated source) as +having been chosen manually for annotation. If the file was found in one of +the directories specified with the -I/--include +option, the directory and file are both given.

    + +Each line is annotated with its event counts. Events not applicable for a line +are represented by a `.'; this is useful for distinguishing between an event +which cannot happen, and one which can but did not.

    + +Sometimes only a small section of a source file is executed. To minimise +uninteresting output, Valgrind only shows annotated lines and lines within a +small distance of annotated lines. Gaps are marked with the line numbers so +you know which part of a file the shown code comes from, eg: + +

    +(figures and code for line 704)
    +-- line 704 ----------------------------------------
    +-- line 878 ----------------------------------------
    +(figures and code for line 878)
    +
    + +The amount of context to show around annotated lines is controlled by the +--context option.

    + +To get automatic annotation, run vg_annotate --auto=yes. +vg_annotate will automatically annotate every source file it can find that is +mentioned in the function-by-function summary. Therefore, the files chosen for +auto-annotation are affected by the --sort and +--threshold options. Each source file is clearly marked +(Auto-annotated source) as being chosen automatically. Any files +that could not be found are mentioned at the end of the output, eg: + +

    +--------------------------------------------------------------------------------
    +The following files chosen for auto-annotation could not be found:
    +--------------------------------------------------------------------------------
    +  getc.c
    +  ctype.c
    +  ../sysdeps/generic/lockfile.c
    +
    + +This is quite common for library files, since libraries are usually compiled +with debugging information, but the source files are often not present on a +system. If a file is chosen for annotation both manually and +automatically, it is marked as User-annotated source. + +Use the -I/--include option to tell Valgrind where to look for +source files if the filenames found from the debugging information aren't +specific enough. + +Beware that vg_annotate can take some time to digest large +cachegrind.out files, eg. 30 seconds or more. Also beware that +auto-annotation can produce a lot of output if your program is large! + + +

    7.8  Annotating assembler programs

    +Valgrind can annotate assembler programs too, or annotate the assembler +generated for your C program. Sometimes this is useful for understanding what +is really happening when an interesting line of C code is translated into +multiple instructions.

    + +To do this, you just need to assemble your .s files with +assembler-level debug information. gcc doesn't do this, but you can use GNU as +with the --gstabs option to generate object files with this +information, eg: + +

    as --gstabs foo.s
    + +You can then profile and annotate source files in the same way as for C/C++ +programs. + + +

    7.9  vg_annotate options

    +
      +
    • -h, --help
    • +

    • -v, --version

      + + Help and version, as usual.

    • + +
    • --sort=A,B,C [default: order in + cachegrind.out]

      + Specifies the events upon which the sorting of the function-by-function + entries will be based. Useful if you want to concentrate on eg. I cache + misses (--sort=I1mr,I2mr), or D cache misses + (--sort=D1mr,D2mr), or L2 misses + (--sort=D2mr,I2mr).

    • + +

    • --show=A,B,C [default: all, using order in + cachegrind.out]

      + Specifies which events to show (and the column order). Default is to use + all present in the cachegrind.out file (and use the order in + the file).

    • + +

    • --threshold=X [default: 99%]

      + Sets the threshold for the function-by-function summary. Functions are + shown that account for more than X% of all the primary sort events. If + auto-annotating, also affects which files are annotated.

    • + +

    • --auto=no [default]
      + --auto=yes

      + When enabled, automatically annotates every file that is mentioned in the + function-by-function summary that can be found. Also gives a list of + those that couldn't be found. + +

    • --context=N [default: 8]

      + Print N lines of context before and after each annotated line. Avoids + printing large sections of source files that were not executed. Use a + large number (eg. 10,000) to show all source lines. +

    • + +

    • -I=<dir>, --include=<dir> + [default: empty string]

      + Adds a directory to the list in which to search for files. Multiple + -I/--include options can be given to add multiple directories. +

    + + +

    7.10  Warnings

    +There are a couple of situations in which vg_annotate issues warnings. + +
      +
    • If a source file is more recent than the cachegrind.out + file. This is because the information in cachegrind.out is + only recorded with line numbers, so if the line numbers change at all in + the source (eg. lines added, deleted, swapped), any annotations will be + incorrect.

      + +

    • If information is recorded about line numbers past the end of a file. + This can be caused by the above problem, ie. shortening the source file + while using an old cachegrind.out file. If this happens, + the figures for the bogus lines are printed anyway (clearly marked as + bogus) in case they are important.
    • +

    + + +

    7.10  Things to watch out for

    +Some odd things that can occur during annotation: + +
      +
    • If annotating at the assembler level, you might see something like this: + +
      +      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
      +      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
      +      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
      +      .    .    .  .    .    .  .    .    .          .align 4,0x90
      +      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
      +      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
      +      
      + + How can the third instruction be executed twice when the others are + executed only once? As it turns out, it isn't. Here's a dump of the + executable, from objdump: + +
      +      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
      +      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
      +      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
      +      8048f32:       89 f6                   mov    %esi,%esi
      +      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
      +      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
      +      
      + + Notice the extra mov %esi,%esi instruction. Where did this + come from? The GNU assembler inserted it to serve as the two bytes of + padding needed to align the movl $.LnrB,%eax instruction on + a four-byte boundary, but pretended it didn't exist when adding debug + information. Thus when Valgrind reads the debug info it thinks that the + movl $0x1,0xffffffec(%ebp) instruction covers the address + range 0x8048f2b--0x8048f33 by itself, and attributes the counts for the + mov %esi,%esi to it.

      +

    • + +
    • + Inlined functions can cause strange results in the function-by-function + summary. If a function inline_me() is defined in + foo.h and inlined in the functions f1(), + f2() and f3() in bar.c, there will + not be a foo.h:inline_me() function entry. Instead, there + will be separate function entries for each inlining site, ie. + foo.h:f1(), foo.h:f2() and + foo.h:f3(). To find the total counts for + foo.h:inline_me(), add up the counts from each entry.

      + + The reason for this is that although the debug info output by gcc + indicates the switch from bar.c to foo.h, it + doesn't indicate the name of the function in foo.h, so + Valgrind keeps using the old one.

      + +

    • + Sometimes, the same filename might be represented with a relative name + and with an absolute name in different parts of the debug info, eg: + /home/user/proj/proj.h and ../proj.h. In this + case, if you use auto-annotation, the file will be annotated twice with + the counts split between the two.

      +

    • +
    + +Note: stabs is not an easy format to read. If you come across bizarre +annotations that look like they might be caused by a bug in the stabs reader, +please let us know. + + +

    7.11  Accuracy

    +Valgrind's cache profiling has a number of shortcomings: + +
      +
    • It doesn't account for kernel activity -- the effect of system calls on + the cache contents is ignored.
    • + +

    • It doesn't account for other process activity (although this is probably + desirable when considering a single program).
    • + +

    • It doesn't account for virtual-to-physical address mappings; hence the + entire simulation is not a true representation of what's happening in the + cache.
    • + +

    • It doesn't account for cache misses not visible at the instruction level, + eg. those arising from TLB misses, or speculative execution.
    • +

    + +Another thing worth noting is that results are very sensitive. Changing the +size of the valgrind.so file, the size of the program being +profiled, or even the length of its name can perturb the results. Variations +will be small, but don't expect perfectly repeatable results if your program +changes at all.

    + +While these factors mean you shouldn't trust the results to be super-accurate, +hopefully they should be close enough to be useful.

    + + +

    7.12  Todo

    +
      +
    • Use CPUID instruction to auto-identify cache configuration during + installation. This would save the user from having to know their cache + configuration and using vg_cachegen.
    • +

    • Program start-up/shut-down calls a lot of functions that aren't + interesting and just complicate the output. Would be nice to exclude + these somehow.
    • +

    +
    + diff --git a/helgrind/Makefile.am b/helgrind/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- a/helgrind/Makefile.am +++ b/helgrind/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/lackey/Makefile.am b/lackey/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- a/lackey/Makefile.am +++ b/lackey/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/memcheck/Makefile.am b/memcheck/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- 
a/memcheck/Makefile.am +++ b/memcheck/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/memcheck/docs/manual.html b/memcheck/docs/manual.html index c441db1620..a97c2f9fec 100644 --- a/memcheck/docs/manual.html +++ b/memcheck/docs/manual.html @@ -78,7 +78,9 @@ Linux-x86 executables.

    An example

    -

    The design and implementation of Valgrind

    +

    Cache profiling

    + +

    The design and implementation of Valgrind


    @@ -515,6 +517,11 @@ follows: buggy, so you may need to issue this flag if you use 3.0.4.

    +

  • --cachesim=no [default]
    + --cachesim=yes +

    When enabled, turns off memory checking, and turns on cache profiling. + Cache profiling is described in detail in Section 7. +

  • There are also some options for debugging Valgrind itself. You @@ -1763,5 +1770,632 @@ sewardj@phoenix:~/newmat10$

    The GCC folks fixed this about a week before gcc-3.0 shipped.


    + + + + +

    7  Cache profiling

    +As well as memory debugging, Valgrind also allows you to do cache simulations +and annotate your source line-by-line with the number of cache misses. In +particular, it records: +
      +
    • L1 instruction cache reads and misses; +
    • L1 data cache reads and read misses, writes and write misses; +
    • L2 unified cache reads and read misses, writes and write misses. +
    +On a modern x86 machine, an L1 miss will typically cost around 10 cycles, +and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be +very useful for improving the performance of your program. + +Please note that this is an experimental feature. Any feedback, bug-fixes, +suggestions, etc, welcome. + + +

    7.1  Overview

    +First off, as for normal Valgrind use, you probably want to turn on debugging +info (the -g flag). But by contrast with normal Valgrind use, you +probably do want to turn optimisation on, since you should profile your +program as it will be normally run. + +The three steps are: +
      +
    1. Generate a cache simulator for your machine's cache configuration with + `vg_cachegen' and recompile Valgrind with make install. + Valgrind comes with a default simulator, but it is unlikely to be correct + for your system, so you should generate a simulator yourself.
    2. +
    3. Run your program with valgrind --cachesim=yes in front of + the normal command line invocation. When the program finishes, Valgrind + will print summary cache statistics. It also collects line-by-line + information in a file cachegrind.out.
    4. +
    5. Generate a function-by-function summary, and possibly annotate source + files with 'vg_annotate'. Source files to annotate can be specified + manually on the command line, or "interesting" source files + can be annotated automatically with the --auto=yes option. + You can annotate C/C++ files or assembly language files equally + easily.
    6. +
    + +Step 1 only needs to be done once, unless you are +interested in simulating different cache configurations (eg. first +concentrating on instruction cache misses, then on data cache misses).

    + +Step 2 should be done every time you want to collect +information about a new program, a changed program, or about the same program +with different input.

    + +Step 3 can be performed as many times as you like for +each Step 2; you may want to do multiple annotations showing different +information each time.

    + +The steps are described in detail in the following sections.

    + + + +

    7.3  Generating a cache simulator

    +Although Valgrind comes with a pre-generated cache simulator, it most likely +won't match the cache configuration of your machine, so you should generate +a new simulator.

    + +You need to generate three files, one for each of the I1, D1 and L2 caches. +For each cache, you need to know the: +

      +
    • Cache size (bytes); +
    • Line size (bytes); +
    • Associativity. +
    + +vg_cachegen takes three options: +
      +
    • --I1=size,line_size,associativity +
    • --D1=size,line_size,associativity +
    • --L2=size,line_size,associativity +
    + +You can specify one, two or all three caches per invocation of vg_cachegen. It +checks that the configuration is sensible before generating the simulators; to +see the allowed values, run vg_cachegen -h.

    + +An example invocation would be: + +

    + vg_cachegen --I1=65536,64,2 --D1=65536,64,2 --L2=262144,64,8 +
    + +This simulates a machine with a 128KB split L1 2-way associative cache, and a +256KB unified 8-way associative L2 cache. Both caches have 64B lines.

    + +If you don't know your cache configuration, you'll have to find it out. +(Ideally vg_cachegen could auto-identify your cache configuration using the +CPUID instruction, which could be done automatically during installation, and +this whole step could be skipped...)

    + + +

    7.4  Cache simulation specifics

    +vg_cachegen only generates simulations for a machine with a split L1 cache and +a unified L2 cache. This configuration is used for all x86-based machines we +are aware of.

    + +The more specific characteristics of the simulation are as follows. + +

      +
    • Write-allocate: when a write miss occurs, the block written to is brought + into the D1 cache. Most modern caches have this property.
    • + +

    • Bit-selection hash function: the line(s) in the cache to which a memory + block maps is chosen by the middle bits M--(M+N-1) of the byte address, + where: +
        +
      •  line size = 2^M bytes 
      • +
      • (cache size / line size) = 2^N
      • +
    • + +

    • Inclusive L2 cache: the L2 cache replicates all the entries of the L1 + cache. This is standard on Pentium chips, but AMD Athlons use an + exclusive L2 cache that only holds blocks evicted from L1.
    • +

    + +Other noteworthy behaviour: + +
      +
    • References that straddle two cache lines are treated as follows:
    • +
        +
      • If both blocks hit --> counted as one hit
      • +
      • If one block hits, the other misses --> counted as one miss
      • +
      • If both blocks miss --> counted as one miss (not two)
      • +

      + +

    • Instructions that modify a memory location (eg. inc and + dec) are counted as doing just a read, ie. a single data + reference. This may seem strange, but since the write can never cause a + miss (the read guarantees the block is in the cache) it's not very + interesting.

      + + Thus it measures not the number of times the data cache is accessed, but + the number of times a data cache miss could occur.

      +

    • +
    + +If you are interested in simulating a cache with different properties, it is +not particularly hard to write your own cache simulator, or to modify existing +ones in vg_cachesim_I1.c, vg_cachesim_D1.c and +vg_cachesim_L2.c. We'd be interested to hear from anyone who +does. + + + +

    7.5  Profiling programs

    +Cache profiling is enabled by using the --cachesim=yes option to +Valgrind. This automatically turns off Valgrind's memory checking functions, +since the cache simulation is slow enough already, and you probably don't want +to do both at once.

    + +To gather cache profiling information about the program ls -l, type: + + valgrind --cachesim=yes ls -l + +The program will execute (slowly). Upon completion, summary statistics +that look like this will be printed: + +

    +==31751== I   refs:      27,742,716
    +==31751== I1  misses:           276
    +==31751== L2  misses:           275
    +==31751== I1  miss rate:        0.0%
    +==31751== L2i miss rate:        0.0%
    +==31751== 
    +==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
    +==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
    +==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
    +==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
    +==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
    +==31751== 
    +==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
    +==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
    +
    + +Cache accesses for instruction fetches are summarised first, giving the +number of fetches made (this is the number of instructions executed, which +can be useful to know in its own right), the number of I1 misses, and the +number of L2 instruction (L2i) misses.

    + +Cache accesses for data follow. The information is similar to that of the +instruction fetches, except that the values are also shown split between reads +and writes (note each row's rd and wr values add up +to the row's total).

    + +Combined instruction and data figures for the L2 cache follow that.

    + + +

    7.6  Output file

    +As well as printing summary information, Valgrind also writes line-by-line +cache profiling information to a file named cachegrind.out . This +file is human-readable, but is best interpreted by the accompanying program +vg_annotate, described in the next section.

    + +Things to note about the cachegrind.out file: +

      +
    • It is written every time valgrind --cachesim=yes is run; it + will automatically overwrite any existing cachegrind.out in + the current directory.
    • +
    • It can be quite large: ls -l generates a file of about + 350KB; browsing a few files and web pages with Konqueror generates a file + of around 10MB.
    • +
    + + + +

    7.7  Annotating C/C++ programs

    +Before using vg_annotate, it is worth widening your window to be at least +120 characters wide if possible, as the output lines can be quite long.

    + +To get a function-by-function summary, run vg_annotate in +a directory containing a cachegrind.out file. The output looks like +this: + +

    +--------------------------------------------------------------------------------
    +I1 cache:              65536 B, 64 B, 2-way associative
    +D1 cache:              65536 B, 64 B, 2-way associative
    +L2 cache:              262144 B, 64 B, 8-way associative
    +Command:               concord vg_to_ucode.c
    +Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
    +Threshold:             99%
    +Chosen for annotation:
    +Auto-annotation:       on
    +
    +--------------------------------------------------------------------------------
    +Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
    +--------------------------------------------------------------------------------
    +27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
    +
    +--------------------------------------------------------------------------------
    +Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
    +--------------------------------------------------------------------------------
    +8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
    +5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
    +2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
    +2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
    +2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
    +1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
    +  897,991   51   51   897,831    95    30        62      1      1  ???:???
    +  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
    +  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
    +  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
    +  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
    +  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
    +  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
    +  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
    +  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
    +  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
    +   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
    +   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
    +
    + +First up is a summary of the annotation options: + +
      +
    • I1 cache, D1 cache, L2 cache: cache configuration. So you know the + configuration with which these results were obtained.
    • + +

    • Command: the command line invocation of the program under + examination.
    • + +

    • Events recorded: event abbreviations are:

      +

        +
      • Ir : I cache reads (ie. instructions executed)
      • +
      • I1mr: I1 cache read misses
      • +
      • I2mr: L2 cache instruction read misses
      • +
      • Dr : D cache reads (ie. memory reads)
      • +
      • D1mr: D1 cache read misses
      • +
      • D2mr: L2 cache data read misses
      • +
      • Dw : D cache writes (ie. memory writes)
      • +
      • D1mw: D1 cache write misses
      • +
      • D2mw: L2 cache data write misses
      • +

      + Note that D1 total misses are given by D1mr + + D1mw, and that L2 total misses are given by + I2mr + D2mr + D2mw.

    • + +

    • Events shown: the events shown (a subset of events gathered). This can + be adjusted with the --show option.
    • + +

    • Event sort order: the sort order in which functions are shown. For + example, in this case the functions are sorted from highest + Ir counts to lowest. If two functions have identical + Ir counts, they will then be sorted by I1mr + counts, and so on. This order can be adjusted with the + --sort option.

      + + Note that this dictates the order the functions appear. It is not + the order in which the columns appear; that is dictated by the "events + shown" line (and can be changed with the --show option). +

    • + +

    • Threshold: vg_annotate by default omits functions that cause very low + numbers of misses to avoid drowning you in information. In this case, + vg_annotate shows summaries of the functions that account for 99% of the + Ir counts; Ir is chosen as the threshold event + since it is the primary sort event. The threshold can be adjusted with + the --threshold option.
    • + +

    • Chosen for annotation: names of files specified manually for annotation; + in this case none.
    • + +

    • Auto-annotation: whether auto-annotation was requested via the + --auto=yes option. In this case no.
    • +

    + +Then follows summary statistics for the whole program. These are similar +to the summary provided when running valgrind --cachesim=yes.

    + +Then follow function-by-function statistics. Each function is identified by a +file_name:function_name pair. If a column contains only a +`.' it means the function never performs that event (eg. the third row shows +that strcmp() contains no instructions that write to memory). The +name ??? is used if the file name and/or function name could +not be determined from debugging information. (If most of the entries have the +form ???:??? the program probably wasn't compiled with +-g.)

    + +It is worth noting that functions will come from three types of source files: +

      +
    1. From the profiled program (concord.c in this example).
    2. +
    3. From libraries (eg. getc.c)
    4. +
    5. From Valgrind's implementation of some libc functions (eg. + vg_clientmalloc.c:malloc). These are recognisable because + the filename begins with vg_, and is probably one of + vg_main.c, vg_clientmalloc.c or + vg_mylibc.c. +
    6. +
    + +There are two ways to annotate source files -- by choosing them manually, or +with the --auto=yes option. To do it manually, just +specify the filenames as arguments to vg_annotate. For example, the output from +running vg_annotate concord.c for our example produces the same +output as above followed by an annotated version of concord.c, a +section of which looks like: + +
    +--------------------------------------------------------------------------------
    +-- User-annotated source: concord.c
    +--------------------------------------------------------------------------------
    +Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
    +
    +[snip]
    +
    +        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
    +        3    1    1       .     .     .       1      0      0  {
    +        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
    +        .    .    .       .     .     .       .      .      .      Word_Info *data;
    +        1    0    0       .     .     .       1      1      1      int line = 1, i;
    +        .    .    .       .     .     .       .      .      .
    +        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
    +        .    .    .       .     .     .       .      .      .
    +    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
    +    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
    +        .    .    .       .     .     .       .      .      .
    +        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
    +        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
    +        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
    +        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
    +        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
    +        .    .    .       .     .     .       .      .      .      }
    +        .    .    .       .     .     .       .      .      .
    +  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
    +  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->word, data->line, table);
    +        .    .    .       .     .     .       .      .      .
    +        4    0    0       1     0     0       2      0      0      free(data);
    +        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
    +        3    0    0       2     0     0       .      .      .  }
    +
    + +(Although column widths are automatically minimised, a wide terminal is clearly +useful.)

    + +Each source file is clearly marked (User-annotated source) as +having been chosen manually for annotation. If the file was found in one of +the directories specified with the -I/--include +option, the directory and file are both given.

    + +Each line is annotated with its event counts. Events not applicable for a line +are represented by a `.'; this is useful for distinguishing between an event +which cannot happen, and one which can but did not.

    + +Sometimes only a small section of a source file is executed. To minimise +uninteresting output, Valgrind only shows annotated lines and lines within a +small distance of annotated lines. Gaps are marked with the line numbers so +you know which part of a file the shown code comes from, eg: + +

    +(figures and code for line 704)
    +-- line 704 ----------------------------------------
    +-- line 878 ----------------------------------------
    +(figures and code for line 878)
    +
    + +The amount of context to show around annotated lines is controlled by the +--context option.

    + +To get automatic annotation, run vg_annotate --auto=yes. +vg_annotate will automatically annotate every source file it can find that is +mentioned in the function-by-function summary. Therefore, the files chosen for +auto-annotation are affected by the --sort and +--threshold options. Each source file is clearly marked +(Auto-annotated source) as being chosen automatically. Any files +that could not be found are mentioned at the end of the output, eg: + +

    +--------------------------------------------------------------------------------
    +The following files chosen for auto-annotation could not be found:
    +--------------------------------------------------------------------------------
    +  getc.c
    +  ctype.c
    +  ../sysdeps/generic/lockfile.c
    +
    + +This is quite common for library files, since libraries are usually compiled +with debugging information, but the source files are often not present on a +system. If a file is chosen for annotation both manually and +automatically, it is marked as User-annotated source. + +Use the -I/--include option to tell Valgrind where to look for +source files if the filenames found from the debugging information aren't +specific enough. + +Beware that vg_annotate can take some time to digest large +cachegrind.out files, eg. 30 seconds or more. Also beware that +auto-annotation can produce a lot of output if your program is large! + + +

    7.8  Annotating assembler programs

    +Valgrind can annotate assembler programs too, or annotate the assembler +generated for your C program. Sometimes this is useful for understanding what +is really happening when an interesting line of C code is translated into +multiple instructions.

    + +To do this, you just need to assemble your .s files with +assembler-level debug information. gcc doesn't do this, but you can use GNU as +with the --gstabs option to generate object files with this +information, eg: + +

    as --gstabs foo.s
    + +You can then profile and annotate source files in the same way as for C/C++ +programs. + + +

    7.9  vg_annotate options

    +
      +
    • -h, --help
    • +

    • -v, --version

      + + Help and version, as usual.

    • + +
    • --sort=A,B,C [default: order in + cachegrind.out]

      + Specifies the events upon which the sorting of the function-by-function + entries will be based. Useful if you want to concentrate on eg. I cache + misses (--sort=I1mr,I2mr), or D cache misses + (--sort=D1mr,D2mr), or L2 misses + (--sort=D2mr,I2mr).

    • + +

    • --show=A,B,C [default: all, using order in + cachegrind.out]

      + Specifies which events to show (and the column order). Default is to use + all present in the cachegrind.out file (and use the order in + the file).

    • + +

    • --threshold=X [default: 99%]

      + Sets the threshold for the function-by-function summary. Functions are + shown that account for more than X% of all the primary sort events. If + auto-annotating, also affects which files are annotated.

    • + +

    • --auto=no [default]
      + --auto=yes

      + When enabled, automatically annotates every file that is mentioned in the + function-by-function summary that can be found. Also gives a list of + those that couldn't be found. + +

    • --context=N [default: 8]

      + Print N lines of context before and after each annotated line. Avoids + printing large sections of source files that were not executed. Use a + large number (eg. 10,000) to show all source lines. +

    • + +

    • -I=<dir>, --include=<dir> + [default: empty string]

      + Adds a directory to the list in which to search for files. Multiple + -I/--include options can be given to add multiple directories. +

    + + +

    7.10  Warnings

    +There are a couple of situations in which vg_annotate issues warnings. + +
      +
    • If a source file is more recent than the cachegrind.out + file. This is because the information in cachegrind.out is + only recorded with line numbers, so if the line numbers change at all in + the source (eg. lines added, deleted, swapped), any annotations will be + incorrect.

      + +

    • If information is recorded about line numbers past the end of a file. + This can be caused by the above problem, ie. shortening the source file + while using an old cachegrind.out file. If this happens, + the figures for the bogus lines are printed anyway (clearly marked as + bogus) in case they are important.
    • +

    + + +

    7.10  Things to watch out for

    +Some odd things that can occur during annotation: + +
      +
    • If annotating at the assembler level, you might see something like this: + +
      +      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
      +      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
      +      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
      +      .    .    .  .    .    .  .    .    .          .align 4,0x90
      +      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
      +      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
      +      
      + + How can the third instruction be executed twice when the others are + executed only once? As it turns out, it isn't. Here's a dump of the + executable, from objdump: + +
      +      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
      +      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
      +      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
      +      8048f32:       89 f6                   mov    %esi,%esi
      +      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
      +      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
      +      
      + + Notice the extra mov %esi,%esi instruction. Where did this + come from? The GNU assembler inserted it to serve as the two bytes of + padding needed to align the movl $.LnrB,%eax instruction on + a four-byte boundary, but pretended it didn't exist when adding debug + information. Thus when Valgrind reads the debug info it thinks that the + movl $0x1,0xffffffec(%ebp) instruction covers the address + range 0x8048f2b--0x8048f33 by itself, and attributes the counts for the + mov %esi,%esi to it.

      +

    • + +
    • + Inlined functions can cause strange results in the function-by-function + summary. If a function inline_me() is defined in + foo.h and inlined in the functions f1(), + f2() and f3() in bar.c, there will + not be a foo.h:inline_me() function entry. Instead, there + will be separate function entries for each inlining site, ie. + foo.h:f1(), foo.h:f2() and + foo.h:f3(). To find the total counts for + foo.h:inline_me(), add up the counts from each entry.

      + + The reason for this is that although the debug info output by gcc + indicates the switch from bar.c to foo.h, it + doesn't indicate the name of the function in foo.h, so + Valgrind keeps using the old one.

      + +

    • + Sometimes, the same filename might be represented with a relative name + and with an absolute name in different parts of the debug info, eg: + /home/user/proj/proj.h and ../proj.h. In this + case, if you use auto-annotation, the file will be annotated twice with + the counts split between the two.

      +

    • +
    + +Note: stabs is not an easy format to read. If you come across bizarre +annotations that look like they might be caused by a bug in the stabs reader, +please let us know. + + +

    7.11  Accuracy

    +Valgrind's cache profiling has a number of shortcomings: + +
      +
    • It doesn't account for kernel activity -- the effect of system calls on + the cache contents is ignored.
    • + +

    • It doesn't account for other process activity (although this is probably + desirable when considering a single program).
    • + +

    • It doesn't account for virtual-to-physical address mappings; hence the + entire simulation is not a true representation of what's happening in the + cache.
    • + +

    • It doesn't account for cache misses not visible at the instruction level, + eg. those arising from TLB misses, or speculative execution.
    • +

    + +Another thing worth noting is that results are very sensitive. Changing the +size of the valgrind.so file, the size of the program being +profiled, or even the length of its name can perturb the results. Variations +will be small, but don't expect perfectly repeatable results if your program +changes at all.

    + +While these factors mean you shouldn't trust the results to be super-accurate, +hopefully they should be close enough to be useful.

    + + +

    7.12  Todo

    +
      +
    • Use CPUID instruction to auto-identify cache configuration during + installation. This would save the user from having to know their cache + configuration and using vg_cachegen.
    • +

    • Program start-up/shut-down calls a lot of functions that aren't + interesting and just complicate the output. Would be nice to exclude + these somehow.
    • +

    +
    + diff --git a/none/Makefile.am b/none/Makefile.am index b13d2439e7..fcbfae9b57 100644 --- a/none/Makefile.am +++ b/none/Makefile.am @@ -8,7 +8,7 @@ LDFLAGS = -Wl,-z -Wl,initfirst INCLUDES = -I$(srcdir)/demangle -bin_SCRIPTS = valgrind +bin_SCRIPTS = valgrind vg_annotate vg_cachegen SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp @@ -35,6 +35,7 @@ valgrinq_so_SOURCES = vg_valgrinq_dummy.c valgrind_so_SOURCES = \ vg_clientfuncs.c \ vg_scheduler.c \ + vg_cachesim.c \ vg_clientmalloc.c \ vg_clientperms.c \ vg_demangle.c \ @@ -69,14 +70,20 @@ valgrind_so_LDADD = \ include_HEADERS = valgrind.h noinst_HEADERS = \ + vg_cachesim_I1.c \ + vg_cachesim_D1.c \ + vg_cachesim_L2.c \ vg_kerneliface.h \ vg_include.h \ vg_constants.h \ vg_unsafe.h - MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) +vg_annotate: + +vg_cachegen: + vg_memory.o: vg_memory.c $(MANUAL_DEPS) $(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $< diff --git a/valgrind.in b/valgrind.in index 7f18890796..b10f48795b 100755 --- a/valgrind.in +++ b/valgrind.in @@ -65,6 +65,8 @@ do --workaround-gcc296-bugs=yes) vgopts="$vgopts $arg"; shift;; --freelist-vol=*) vgopts="$vgopts $arg"; shift;; --suppressions=*) vgopts="$vgopts $arg"; shift;; + --cachesim=yes) vgopts="$vgopts $arg"; shift;; + --cachesim=no) vgopts="$vgopts $arg"; shift;; # options for debugging Valgrind --sanity-level=*) vgopts="$vgopts $arg"; shift;; --single-step=yes) vgopts="$vgopts $arg"; shift;; @@ -129,6 +131,7 @@ if [ $# = 0 ] || [ z"$dousage" = z1 ]; then echo " suppressions file " echo " --check-addrVs=no|yes experimental lighterweight checking? [yes]" echo " yes == Valgrind's original behaviour" + echo " --cachesim=no|yes do cache profiling? [no]" echo echo " options for debugging Valgrind itself are:" echo " --sanity-level= level of sanity checking to do [1]" diff --git a/vg_annotate.in b/vg_annotate.in new file mode 100644 index 0000000000..b3b091cc23 --- /dev/null +++ b/vg_annotate.in @@ -0,0 +1,892 @@ +#! 
/usr/bin/perl -w +##--------------------------------------------------------------------## +##--- The cache simulation framework: instrumentation, recording ---## +##--- and results printing. ---## +##--- vg_annotate ---## +##--------------------------------------------------------------------## + +# This file is part of Valgrind, an x86 protected-mode emulator +# designed for debugging and profiling binaries on x86-Unixes. +# +# Copyright (C) 2000-2002 Julian Seward +# jseward@acm.org +# Julian_Seward@muraroa.demon.co.uk +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +# 02111-1307, USA. +# +# The GNU General Public License is contained in the file LICENSE. + +#---------------------------------------------------------------------------- +# Annotator for valgrind --cachesim=yes. +# +# Input file has the following format: +# ::= * +# + +# ::= "desc:" ? +# ::= "cmd:" ? +# ::= "events:" ? ( )+ +# ::= | | +# ::= ("fl=" | "fi=" | "fe=" ) +# ::= "fn=" +# ::= ? ( )+ +# ::= "summary:" ? ( )+ +# ::= | "." +# +# where +# +# is any string not containing a newline +# is a command line invocation +# and can be anything +# and are decimal numbers +# is whitespace. +# is a newline +# +# The contents of the "desc:" lines is printed out at the top of the summary. +# This is a generic way of providing simulation specific information, eg. 
for +# giving the cache configuration for cache simulation. +# +# Counts can be ".", to represent "N/A". +# +# The number of counts in each and the should not exceed +# the number of events in the . If the number in each is +# less, we use "." for the the missing counts (the last however-many). +# +# A changes the current file name. A changes the current +# function name. A contains counts that pertain to the current +# filename/fn_name. A "fn=" and a must appear before any +# s to give the context of the first s. +# +# Each should be immediately followed by a . "fi=" +# are used to switch filenames for inlined functions; "fe=" +# are similar, but are put at the end of a basic block in which +# the file name hasn't been switched back to the original file name. (fi and +# fe lines behave the same, they are only distinguished to help debugging.) +# +#---------------------------------------------------------------------------- +# Performance improvements record, using cachegrind.out for cacheprof, doing no +# source annotation (irrelevant ones removed): +# user time +# 1. turned off warnings in add_hash_a_to_b() 3.81 --> 3.48s +# [now add_array_a_to_b()] +# 6. make line_to_CC() return a ref instead of a hash 3.01 --> 2.77s +# +#10. changed file format to avoid file/fn name repetition 2.40s +# (not sure why higher; maybe due to new '.' entries?) +#11. changed file format to drop unnecessary end-line "."s 2.36s +# (shrunk file by about 37%) +#12. switched from hash CCs to array CCs 1.61s +#13. only adding b[i] to a[i] if b[i] defined (was doing it if +# either a[i] or b[i] was defined, but if b[i] was undefined +# it just added 0) 1.48s +#14. Stopped converting "." entries to undef and then back 1.16s +#15. Using foreach $i (x..y) instead of for ($i = 0...) in +# add_array_a_to_b() 1.11s +# +# Auto-annotating primes: +#16. 
Finding count lengths by int((length-1)/3), not by +# commifying (halves the number of commify calls) 1.68s --> 1.47s + +use strict; + +#---------------------------------------------------------------------------- +# Overview: the running example in the comments is for: +# - events = A,B,C,D +# - --show=C,A,D +# - --sort=D,C +#---------------------------------------------------------------------------- + +#---------------------------------------------------------------------------- +# Global variables, main data structures +#---------------------------------------------------------------------------- +# CCs are arrays, the counts corresponding to @events, with 'undef' +# representing '.'. This makes things fast (faster than using hashes for CCs) +# but we have to use @sort_order and @show_order below to handle the --sort and +# --show options, which is a bit tricky. +#---------------------------------------------------------------------------- + +# Total counts for summary (an array reference). +my $summary_CC; + +# Totals for each function, for overall summary. +# hash(filename:fn_name => CC array) +my %fn_totals; + +# Individual CCs, organised by filename and line_num for easy annotation. +# hash(filename => hash(line_num => CC array)) +my %all_ind_CCs; + +# Files chosen for annotation on the command line. +# key = basename (trimmed of any directory), value = full filename +my %user_ann_files; + +# Generic description string. +my $desc = ""; + +# Command line of profiled program. +my $cmd; + +# Events in input file, eg. (A,B,C,D) +my @events; + +# Events to show, from command line, eg. (C,A,D) +my @show_events; + +# Map from @show_events indices to @events indices, eg. (2,0,3). Gives the +# order in which we must traverse @events in order to show the @show_events, +# eg. (@events[$show_order[1]], @events[$show_order[2]]...) = @show_events. +# (Might help to think of it like a hash (0 => 2, 1 => 0, 2 => 3).) 
+my @show_order; + +# Print out the function totals sorted by these events, eg. (D,C). +my @sort_events; + +# Map from @sort_events indices to @events indices, eg. (3,2). Same idea as +# for @show_order +my @sort_order; + +# Threshold; whatever event is the primary sort, we print out functions +# representing more than this proportion of 'event' events. +my $threshold = 99; + +# If on, automatically annotates all files that are involved in getting over +# the threshold count of the primary sort event. +my $auto_annotate = 0; + +# Number of lines to show around each annotated line. +my $context = 8; + +# Directories in which to look for annotation files. +my @include_dirs = (""); + +# Input file name +my $input_file = "cachegrind.out"; + +# Version number +my $version = "@VERSION@"; + +# Usage message. +my $usage = < percentage of counts (of primary sort event) we + are interested in [$threshold%] + --auto=yes|no annotate all source files containing functions + that helped reach the event count threshold [no] + --context=N print N lines of context before and after + annotated lines [8] + -I --include= add to list of directories to search for + source files + + Valgrind is Copyright (C) 2000-2002 Julian Seward + and licensed under the GNU General Public License, version 2. + Bug reports, feedback, admiration, abuse, etc, to: jseward\@acm.org. + +END +; + +# Used in various places of output. +my $fancy = '-' x 80 . 
"\n"; + +#----------------------------------------------------------------------------- +# Argument and option handling +#----------------------------------------------------------------------------- +sub process_cmd_line() +{ + for my $arg (@ARGV) { + + # Option handling + if ($arg =~ /^-/) { + + # --version + if ($arg =~ /^-v$|^--version$/) { + die("vg_annotate$version\n"); + + # --show=A,B,C + } elsif ($arg =~ /^--show=(.*)$/) { + @show_events = split(/,/, $1); + + # --sort=A,B,C + } elsif ($arg =~ /^--sort=(.*)$/) { + @sort_events = split(/,/, $1); + + # --threshold=X (tolerates a trailing '%') + } elsif ($arg =~ /^--threshold=([\d\.]+)%?$/) { + $threshold = $1; + if ($threshold < 0 || $threshold > 100) { + die($usage); + } + + # --auto=yes|no + } elsif ($arg =~ /^--auto=(yes|no)$/) { + $auto_annotate = 1 if ($1 eq "yes"); + $auto_annotate = 0 if ($1 eq "no"); + + # --context=N + } elsif ($arg =~ /^--context=([\d\.]+)$/) { + $context = $1; + if ($context < 0) { + die($usage); + } + + # --include=A,B,C + } elsif ($arg =~ /^(-I|--include)=(.*)$/) { + my $inc = $2; + $inc =~ s|/$||; # trim trailing '/' + push(@include_dirs, "$inc/"); + + } else { # -h and --help fall under this case + die($usage); + } + + # Argument handling -- annotation file checking and selection. + # Stick filenames into a hash for quick 'n easy lookup throughout + } else { + my $readable = 0; + foreach my $include_dir (@include_dirs) { + if (-r $include_dir . $arg) { + $readable = 1; + } + } + $readable or die("File $arg not found in any of: @include_dirs\n"); + $user_ann_files{$arg} = 1; + } + } +} + +#----------------------------------------------------------------------------- +# Reading of input file +#----------------------------------------------------------------------------- +sub max ($$) +{ + my ($x, $y) = @_; + return ($x > $y ? $x : $y); +} + +# Add the two arrays; any '.' entries are ignored. Two tricky things: +# 1. 
If $a2->[$i] is undefined, it defaults to 0 which is what we want; we turn +# off warnings to allow this. This makes things about 10% faster than +# checking for definedness ourselves. +# 2. We don't add a ".", even though it's value is 0, because we don't want to +# make an $a2->[$i] that is undef become 0 unnecessarily. +sub add_array_a_to_b ($$) +{ + my ($a1, $a2) = @_; + + my $n = max(scalar @$a1, scalar @$a2); + $^W = 0; + foreach my $i (0 .. $n-1) { + $a2->[$i] += $a1->[$i] if ("." ne $a1->[$i]); + } + $^W = 1; +} + +# Add each event count to the CC array. '.' counts become undef, as do +# missing entries (implicitly). +sub line_to_CC ($) +{ + my @CC = (split /\s+/, $_[0]); + (@CC <= @events) or die("Line $.: too many event counts\n"); + return \@CC; +} + +sub read_input_file() +{ + open(INPUTFILE, "< $input_file") || die "File $input_file not opened\n"; + + # Read "desc:" lines. + my $line; + # This gives a "uninitialized value in substitution (s///)" warning; hmm... + #while ($line = && $line =~ s/desc:\s+//) { + # $desc .= "$line\n"; + #} + while (1) { + $line = ; + if ($line =~ s/desc:\s+//) { + $desc .= $line; + } else { + last; + } + } + + # Read "cmd:" line (Nb: will already be in $line from "desc:" loop above). + ($line =~ s/cmd:\s+//) or die("Line $.: missing command line\n"); + $cmd = $line; + chomp($cmd); # Remove newline + + # Read "events:" line. We make a temporary hash in which the Nth event's + # value is N, which is useful for handling --show/--sort options below. + $line = ; + ($line =~ s/events:\s+//) or die("Line $.: missing events line\n"); + @events = split(/\s+/, $line); + my %events; + my $n = 0; + foreach my $event (@events) { + $events{$event} = $n; + $n++ + } + + # If no --show arg give, default to showing all events in the file. + # If --show option is used, check all specified events appeared in the + # "events:" line. Then initialise @show_order. 
+ if (@show_events) { + foreach my $show_event (@show_events) { + (defined $events{$show_event}) or + die("--show event `$show_event' did not appear in input\n"); + } + } else { + @show_events = @events; + } + foreach my $show_event (@show_events) { + push(@show_order, $events{$show_event}); + } + + # Do as for --show, but if no --sort arg given, default to sorting by + # column order (ie. first column event is primary sort key, 2nd column is + # 2ndary key, etc). + if (@sort_events) { + foreach my $sort_event (@sort_events) { + (defined $events{$sort_event}) or + die("--sort event `$sort_event' did not appear in input\n"); + } + } else { + @sort_events = @events; + } + foreach my $sort_event (@sort_events) { + push(@sort_order, $events{$sort_event}); + } + + my $curr_file; + my $curr_fn; + my $curr_name; + + my $curr_fn_CC = []; + my $curr_file_ind_CCs = {}; # hash(line_num => CC) + + # Read body of input file. + while () { + s/#.*$//; # remove comments + if (s/^(\d+)\s+//) { + my $line_num = $1; + my $CC = line_to_CC($_); + add_array_a_to_b($CC, $curr_fn_CC); + + # If curr_file is selected, add CC to curr_file list. We look for + # full filename matches; or, if auto-annotating, we have to + # remember everything -- we won't know until the end what's needed. 
+ if ($auto_annotate || defined $user_ann_files{$curr_file}) { + my $tmp = $curr_file_ind_CCs->{$line_num}; + $tmp = [] unless defined $tmp; + add_array_a_to_b($CC, $tmp); + $curr_file_ind_CCs->{$line_num} = $tmp; + } + + } elsif (s/^fn=(.*)$//) { + # Commit result from previous function + $fn_totals{$curr_name} = $curr_fn_CC if (defined $curr_name); + + # Setup new one + $curr_fn = $1; + $curr_name = "$curr_file:$curr_fn"; + $curr_fn_CC = $fn_totals{$curr_name}; + $curr_fn_CC = [] unless (defined $curr_fn_CC); + + } elsif (s/^fl=(.*)$//) { + $all_ind_CCs{$curr_file} = $curr_file_ind_CCs + if (defined $curr_file); + + $curr_file = $1; + $curr_file_ind_CCs = $all_ind_CCs{$curr_file}; + $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs); + + } elsif (s/^(fi|fe)=(.*)$//) { + (defined $curr_name) or die("Line $.: Unexpected fi/fe line\n"); + $fn_totals{$curr_name} = $curr_fn_CC; + $all_ind_CCs{$curr_file} = $curr_file_ind_CCs; + + $curr_file = $2; + $curr_name = "$curr_file:$curr_fn"; + $curr_file_ind_CCs = $all_ind_CCs{$curr_file}; + $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs); + $curr_fn_CC = $fn_totals{$curr_name}; + $curr_fn_CC = [] unless (defined $curr_fn_CC); + + } elsif (s/^\s*$//) { + # blank, do nothing + + } elsif (s/^summary:\s+//) { + # Finish up handling final filename/fn_name counts + $fn_totals{"$curr_file:$curr_fn"} = $curr_fn_CC + if (defined $curr_file && defined $curr_fn); + $all_ind_CCs{$curr_file} = + $curr_file_ind_CCs if (defined $curr_file); + + $summary_CC = line_to_CC($_); + (scalar(@$summary_CC) == @events) + or die("Line $.: summary event and total event mismatch\n"); + + } else { + warn("WARNING: line $. 
malformed, ignoring\n"); + } + } + + # Check if summary line was present + if (not defined $summary_CC) { + warn("WARNING: missing final summary line, no summary will be printed\n"); + } + + close(INPUTFILE); +} + +#----------------------------------------------------------------------------- +# Print options used +#----------------------------------------------------------------------------- +sub print_options () +{ + print($fancy); + print($desc); + print("Command: $cmd\n"); + print("Events recorded: @events\n"); + print("Events shown: @show_events\n"); + print("Event sort order: @sort_events\n"); + print("Threshold: $threshold%\n"); + + my @include_dirs2 = @include_dirs; # copy @include_dirs + shift(@include_dirs2); # remove "" entry, which is always the first + unshift(@include_dirs2, "") if (0 == @include_dirs2); + my $include_dir = shift(@include_dirs2); + print("Include dirs: $include_dir\n"); + foreach my $include_dir (@include_dirs2) { + print(" $include_dir\n"); + } + + my @user_ann_files = keys %user_ann_files; + unshift(@user_ann_files, "") if (0 == @user_ann_files); + my $user_ann_file = shift(@user_ann_files); + print("User annotated: $user_ann_file\n"); + foreach $user_ann_file (@user_ann_files) { + print(" $user_ann_file\n"); + } + + my $is_on = ($auto_annotate ? "on" : "off"); + print("Auto-annotation: $is_on\n"); + print("\n"); +} + +#----------------------------------------------------------------------------- +# Print summary and sorted function totals +#----------------------------------------------------------------------------- +sub mycmp ($$) +{ + my ($c, $d) = @_; + + # Iterate through sort events (eg. 
3,2); return result if two are different + foreach my $i (@sort_order) { + my ($x, $y); + $x = $c->[$i]; + $y = $d->[$i]; + $x = -1 unless defined $x; + $y = -1 unless defined $y; + + my $cmp = $y <=> $x; # reverse sort + if (0 != $cmp) { + return $cmp; + } + } + # Exhausted events, equal + return 0; +} + +sub commify ($) { + my ($val) = @_; + 1 while ($val =~ s/^(\d+)(\d{3})/$1,$2/); + return $val; +} + +# Because the counts can get very big, and we don't want to waste screen space +# and make lines too long, we compute exactly how wide each column needs to be +# by finding the widest entry for each one. +sub compute_CC_col_widths (@) +{ + my @CCs = @_; + my $CC_col_widths = []; + + # Initialise with minimum widths (from event names) + foreach my $event (@events) { + push(@$CC_col_widths, length($event)); + } + + # Find maximum width count for each column. @CC_col_width positions + # correspond to @CC positions. + foreach my $CC (@CCs) { + foreach my $i (0 .. scalar(@$CC)-1) { + if (defined $CC->[$i]) { + # Find length, accounting for commas that will be added + my $length = length $CC->[$i]; + my $clength = $length + int(($length - 1) / 3); + $CC_col_widths->[$i] = max($CC_col_widths->[$i], $clength); + } + } + } + return $CC_col_widths; +} + +# Print the CC with each column's size dictated by $CC_col_widths. +sub print_CC ($$) +{ + my ($CC, $CC_col_widths) = @_; + + foreach my $i (@show_order) { + my $count = (defined $CC->[$i] ? 
commify($CC->[$i]) : "."); + my $space = ' ' x ($CC_col_widths->[$i] - length($count)); + print("$space$count "); + } +} + +sub print_events ($) +{ + my ($CC_col_widths) = @_; + + foreach my $i (@show_order) { + my $event = $events[$i]; + my $event_width = length($event); + my $col_width = $CC_col_widths->[$i]; + my $space = ' ' x ($col_width - $event_width); + print("$event$space "); + } +} + +# Prints summary and function totals (with separate column widths, so that +# function names aren't pushed over unnecessarily by huge summary figures). +# Also returns a hash containing all the files that are involved in getting the +# events count above the threshold (ie. all the interesting ones). +sub print_summary_and_fn_totals () +{ + my @fn_fullnames = keys %fn_totals; + + # Work out the size of each column for printing (summary and functions + # separately). + my $summary_CC_col_widths = compute_CC_col_widths($summary_CC); + my $fn_CC_col_widths = compute_CC_col_widths(values %fn_totals); + + # Header and counts for summary + print($fancy); + print_events($summary_CC_col_widths); + print("\n"); + print($fancy); + print_CC($summary_CC, $summary_CC_col_widths); + print(" PROGRAM TOTALS\n"); + print("\n"); + + # Header for functions + print($fancy); + print_events($fn_CC_col_widths); + print(" file:function\n"); + print($fancy); + + # Sort function names into order dictated by --sort option. + @fn_fullnames = sort { + mycmp($fn_totals{$a}, $fn_totals{$b}) + } @fn_fullnames; + + # The thresholded event is the one that is the primary sort event. + my $threshold_files = {}; + my $threshold_event_index = $sort_order[0]; + my $threshold_total = $summary_CC->[$threshold_event_index]; + my $curr_total = 0; + + # Print functions, stopping when the threshold has been reached. 
+ foreach my $fn_name (@fn_fullnames) { + + # Stop when we've reached the threshold + last if ($curr_total * 100 / $threshold_total >= $threshold); + + # Print function results + my $fn_CC = $fn_totals{$fn_name}; + print_CC($fn_CC, $fn_CC_col_widths); + print(" $fn_name\n"); + + # Update the threshold counting + my $filename = $fn_name; + $filename =~ s/:[^:]+$//; # remove function name + $threshold_files->{$filename} = 1; + $curr_total += $fn_CC->[$threshold_event_index] + if (defined $fn_CC->[$threshold_event_index]); + } + print("\n"); + + return $threshold_files; +} + +#----------------------------------------------------------------------------- +# Annotate selected files +#----------------------------------------------------------------------------- + +# Issue a warning that the source file is more recent than the input file. +sub warning_on_src_more_recent_than_inputfile ($) +{ + my $src_file = $_[0]; + + my $warning = <{"???"}; + %all_ann_files = (%user_ann_files, %$threshold_files) + } else { + %all_ann_files = %user_ann_files; + } + + # Track if we did any annotations. + my $did_annotations = 0; + + LOOP: + foreach my $src_file (keys %all_ann_files) { + + my $opened_file = ""; + my $full_file_name = ""; + foreach my $include_dir (@include_dirs) { + my $try_name = $include_dir . $src_file; + if (open(INPUTFILE, "< $try_name")) { + $opened_file = $try_name; + $full_file_name = ($include_dir eq "" + ? $src_file + : "$include_dir + $src_file"); + last; + } + } + + if (not $opened_file) { + # Failed to open the file. If chosen on the command line, die. + # If arose from auto-annotation, print a little message. + if (defined $user_ann_files{$src_file}) { + die("File $src_file not opened in any of: @include_dirs\n"); + + } else { + push(@unfound_auto_annotate_files, $src_file); + } + + } else { + # File header (distinguish between user- and auto-selected files). + print("$fancy"); + my $ann_type = + (defined $user_ann_files{$src_file} ? 
"User" : "Auto"); + print("-- $ann_type-annotated source: $full_file_name\n"); + print("$fancy"); + + # Get file's CCs + my $src_file_CCs = $all_ind_CCs{$src_file}; + if (!defined $src_file_CCs) { + print(" No information has been collected for $src_file\n\n"); + next LOOP; + } + + $did_annotations = 1; + + # Numeric, not lexicographic sort! + my @line_nums = sort {$a <=> $b} keys %$src_file_CCs; + + # If $src_file more recent than cachegrind.out, issue warning + my $src_more_recent_than_inputfile = 0; + if ((stat $opened_file)[9] > (stat $input_file)[9]) { + $src_more_recent_than_inputfile = 1; + warning_on_src_more_recent_than_inputfile($src_file); + } + + # Work out the size of each column for printing + my $CC_col_widths = compute_CC_col_widths(values %$src_file_CCs); + + # Events header + print_events($CC_col_widths); + print("\n\n"); + + # Shift out 0 if it's in the line numbers (from unknown entries, + # likely due to bugs in Valgrind's stabs debug info reader) + shift(@line_nums) if (0 == $line_nums[0]); + + # Finds interesting line ranges -- all lines with a CC, and all + # lines within $context lines of a line with a CC. + my $n = @line_nums; + my @pairs; + for (my $i = 0; $i < $n; $i++) { + push(@pairs, $line_nums[$i] - $context); # lower marker + while ($i < $n-1 && + $line_nums[$i] + 2*$context >= $line_nums[$i+1]) { + $i++; + } + push(@pairs, $line_nums[$i] + $context); # upper marker + } + + # Annotate chosen lines, tracking total counts of lines printed + $pairs[0] = 1 if ($pairs[0] < 1); + while (@pairs) { + my $low = shift @pairs; + my $high = shift @pairs; + while ($. < $low-1) { + my $tmp = ; + last unless (defined $tmp); # hack to detect EOF + } + my $src_line; + # Print line number, unless start of file + print("-- line $low " . '-' x 40 . "\n") if ($low != 1); + while (($. < $high) && ($src_line = )) { + if (defined $line_nums[0] && $. 
== $line_nums[0]) { + print_CC($src_file_CCs->{$.}, $CC_col_widths); + add_array_a_to_b($src_file_CCs->{$.}, + $printed_totals_CC); + shift(@line_nums); + + } else { + print_CC( [], $CC_col_widths); + } + + print(" $src_line"); + } + # Print line number, unless EOF + if ($src_line) { + print("-- line $high " . '-' x 40 . "\n"); + } else { + last; + } + } + + # If there was info on lines past the end of the file... + if (@line_nums) { + foreach my $line_num (@line_nums) { + print_CC($src_file_CCs->{$line_num}, $CC_col_widths); + print(" \n"); + } + print("\n"); + warning_on_nonexistent_lines($src_more_recent_than_inputfile, + $src_file, \@line_nums); + } + print("\n"); + + # Print summary of counts attributed to file but not to any + # particular line (due to incomplete debug info). + if ($src_file_CCs->{0}) { + print_CC($src_file_CCs->{0}, $CC_col_widths); + print(" \n\n"); + } + + close(INPUTFILE); + } + } + + # Print list of unfound auto-annotate selected files. + if (@unfound_auto_annotate_files) { + print("$fancy"); + print("The following files chosen for auto-annotation could not be found:\n"); + print($fancy); + foreach my $f (@unfound_auto_annotate_files) { + print(" $f\n"); + } + print("\n"); + } + + # If we did any annotating, print what proportion of events were covered by + # annotated lines above. 
+ if ($did_annotations) { + my $percent_printed_CC; + foreach (my $i = 0; $i < @$summary_CC; $i++) { + $percent_printed_CC->[$i] = + sprintf("%.0f", + $printed_totals_CC->[$i] / $summary_CC->[$i] * 100); + } + my $pp_CC_col_widths = compute_CC_col_widths($percent_printed_CC); + print($fancy); + print_events($pp_CC_col_widths); + print("\n"); + print($fancy); + print_CC($percent_printed_CC, $pp_CC_col_widths); + print(" percentage of events annotated\n\n"); + } +} + +#---------------------------------------------------------------------------- +# "main()" +#---------------------------------------------------------------------------- +process_cmd_line(); +read_input_file(); +print_options(); +my $threshold_files = print_summary_and_fn_totals(); +annotate_ann_files($threshold_files); + diff --git a/vg_cachegen.in b/vg_cachegen.in new file mode 100755 index 0000000000..e887f1bab3 --- /dev/null +++ b/vg_cachegen.in @@ -0,0 +1,300 @@ +#! /usr/bin/perl -w +##--------------------------------------------------------------------## +##--- The cache simulator generator ---## +##--- vg_cachegen ---## +##--------------------------------------------------------------------## + +# This file is part of Valgrind, an x86 protected-mode emulator +# designed for debugging and profiling binaries on x86-Unixes. +# +# Copyright (C) 2000-2002 Julian Seward +# jseward@acm.org +# Julian_Seward@muraroa.demon.co.uk +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA +# 02111-1307, USA. +# +# The GNU General Public License is contained in the file LICENSE. + +#---------------------------------------------------------------------------- +# Cache simulator generator, creates files vg_cachesim_{I1,D1,L2}.c to be +# #included in vg_cachesim.c. +# +# Notes: +# - simulates a write-allocate cache +# - (block --> set) hash function uses simple bit selection +# - handling references straddling two cache blocks: +# - counts as only one cache access (not two) +# - both blocks hit --> one hit +# - one block hits, the other misses --> one miss +# - both blocks miss --> one miss (not two) + +use strict; + +#---------------------------------------------------------------------------- +# Global variables +#---------------------------------------------------------------------------- +my %log2 = + ( 1 => 0, + 2 => 1, + 4 => 2, + 8 => 3, + 16 => 4, + 32 => 5, + 64 => 6, + 128 => 7, + 256 => 8, + 512 => 9, + 1024 => 10, + 2048 => 11, + 4096 => 12, + 8192 => 13, + 16384 => 14, + 32768 => 15, + 65536 => 16, + 131072 => 17, + 262144 => 18, + 524288 => 19, + 1048576 => 20, + 2097152 => 21, + 4194304 => 22, + 8388608 => 23, + 16777216 => 24, + 33554432 => 25, + 67108864 => 26, + 134217728 => 27, + 268435456 => 28, + 536870912 => 29, + 1073741824 => 30); + +my @valid_types = + ("I1", "D1", "L2"); + +my @valid_sizes = + (1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, + 1048576, 2097152, 4194304, 8388608, 16777216, 67108864, 134217728); + +my @valid_linesizes = + (8, 16, 32, 64, 128); + +my @valid_ways = + (1, 2, 4, 8, 256); + +my %valid_types; for my $i (@valid_types) { $valid_types{$i} = 1; } +my %valid_sizes; for my $i (@valid_sizes) { $valid_sizes{$i} = 1; } +my %valid_linesizes; for my $i (@valid_linesizes) { $valid_linesizes{$i} = 1; } 
+my %valid_ways; for my $i (@valid_ways) { $valid_ways{$i} = 1; } + +my ($L, $size, $line_size, $n_ways); + +my @caches; + +#---------------------------------------------------------------------------- +# Argument and option handling +#---------------------------------------------------------------------------- +sub process_cmd_line () +{ +#---- + my $usage = < 0) or die($usage); + for my $arg (@ARGV) { + + if ($arg =~ /--(I1|D1|L2)=(\d+),(\d+),(\d+)/) { + my ($L, $size, $line_size, $n_ways) = ($1, $2, $3, $4); + + (defined $valid_sizes{$size} && + defined $valid_linesizes{$line_size} && + defined $valid_ways{$n_ways}) or die($usage); + + # Remember cache type too, and save + push(@caches, [ $L, $size, $line_size, $n_ways ]); + + } + + else { # -h and --help fall under this case + die($usage); + } + } +} + +#---------------------------------------------------------------------------- +# Printing machinery (I) +#---------------------------------------------------------------------------- +sub shiftSequence ($$$) +{ + my $s = ""; + my ($i, $n, $L) = @_; + + while ($i > 0) { + $s .= " ${L}_tags[set$n][$i] = "; + $i--; + $s .= "${L}_tags[set$n][$i];\n" + } + return $s; +} + +sub trySet ($$$$$) +{ + my $s = ""; + my ($k, $n, $L, $hit_ending, $miss_ending) = @_; + + for (my $i = 0; $i < $k; $i++) { + $s .= " " + . (0 == $i ? "if" : "else if") + . " (tag == ${L}_tags[set$n][$i]) {\n" + . shiftSequence($i, $n, $L) + . (0 == $i ? "" : " ${L}_tags[set$n][0] = tag;\n") + . ("" eq $hit_ending ? "" : " $hit_ending\n") + . " }\n"; + } + + $s .= " else {\n" + . " /* A miss */\n" + . shiftSequence($k - 1, $n, $L) # not if exclusive + . " ${L}_tags[set$n][0] = tag;\n" # not if exclusive + . "\n" + . " $miss_ending\n" + . 
" }\n"; + return $s; +} + +sub print_cache_simulator (@) +{ + my ($L, $size, $line_size, $n_ways) = @_; + + my $n_lines = $size / $line_size; + my $n_sets = $n_lines / $n_ways; + my $n_line_bits = $log2{$line_size}; + my $n_set_bits = $log2{$n_sets}; + + my $assoc = (1 == $n_ways ? "direct-mapped" : "$n_ways-way associative"); + my $L1_args = "Addr a, UChar size, ULong* m1, ULong *m2"; + my $L2_args = "Addr a, UChar size, ULong *m2"; + my $L_args = ($L ne "L2" ? $L1_args : $L2_args); + +#---- + my $comments_cache_init_desc_and_doref_start = <> $n_line_bits) & ($n_sets-1); + register UInt set2 = ((a + size) >> $n_line_bits) & ($n_sets-1); + register UInt tag = a >> ($n_line_bits + $n_set_bits); + + if (set1 == set2) { + +END +; +#---- + my $doref_middle = < $outfile") or die("Couldn't open $outfile for writing\n"); + + print(OUTFILE $comments_cache_init_desc_and_doref_start); + print(OUTFILE trySet($n_ways, 1, $L, "return;", $L_miss_treatment)); + print(OUTFILE $doref_middle); + print(OUTFILE " /* Block one */\n"); + print(OUTFILE trySet($n_ways, 1, $L, "", "is_${L}_miss = True;")); + print(OUTFILE "\n"); + print(OUTFILE " /* Block two */\n"); + print(OUTFILE trySet($n_ways, 2, $L, "", "is_${L}_miss = True;")); + print(OUTFILE "\n"); + print(OUTFILE $straddle_case_miss_treatment); + print(OUTFILE $doref_end); + + close(OUTFILE); +} + +#---------------------------------------------------------------------------- +# main() +#---------------------------------------------------------------------------- +process_cmd_line(); +foreach my $cache (@caches) { + print_cache_simulator(@$cache); +} + diff --git a/vg_cachesim.c b/vg_cachesim.c new file mode 100644 index 0000000000..ea0cb410a2 --- /dev/null +++ b/vg_cachesim.c @@ -0,0 +1,1068 @@ +/*--------------------------------------------------------------------*/ +/*--- The cache simulation framework: instrumentation, recording ---*/ +/*--- and results printing. 
---*/ +/*--- vg_cachesim.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, an x86 protected-mode emulator + designed for debugging and profiling binaries on x86-Unixes. + + Copyright (C) 2000-2002 Julian Seward + jseward@acm.org + Julian_Seward@muraroa.demon.co.uk + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307, USA. + + The GNU General Public License is contained in the file LICENSE. 
+*/ + +#include + +#include "vg_include.h" + +#include "vg_cachesim_L2.c" +#include "vg_cachesim_I1.c" +#include "vg_cachesim_D1.c" + + +/* According to IA-32 Intel Architecture Software Developer's Manual: Vol 2 */ +#define MAX_x86_INSTR_SIZE 16 + +/* Size of various buffers used for storing strings */ +#define FILENAME_LEN 256 +#define FN_NAME_LEN 256 +#define BUF_LEN 512 +#define COMMIFY_BUF_LEN 128 +#define RESULTS_BUF 128 + +/*------------------------------------------------------------*/ +/*--- Output file related stuff ---*/ +/*------------------------------------------------------------*/ + +#define OUT_FILE "cachegrind.out" + +static void file_err() +{ + VG_(message)(Vg_UserMsg, + "FATAL: can't open cache simulation output file `%s'", + OUT_FILE ); + VG_(exit)(1); +} + +/*------------------------------------------------------------*/ +/*--- Cost center types, operations ---*/ +/*------------------------------------------------------------*/ + +typedef struct _CC CC; +struct _CC { + ULong a; + ULong m1; + ULong m2; +}; + +static __inline__ void initCC(CC* cc) { + cc->a = 0; + cc->m1 = 0; + cc->m2 = 0; +} + + +typedef enum { INSTR_CC, READ_CC, WRITE_CC, MOD_CC } CC_type; + +/* Instruction-level cost-centres. The typedefs for these structs are in + * vg_include.c + * + * WARNING: the 'tag' field *must* be the first byte of both CC types. + * the 'instr_addr' *must* be the second word of both CC types. + * + * This is because we use them when we don't know what type of CC we're dealing + * with. 
+ */ +struct _iCC { + /* word 1 */ + UChar tag; + UChar instr_size; + + /* words 2+ */ + Addr instr_addr; + CC I; +}; + +struct _idCC { + /* word 1 */ + UChar tag; + UChar instr_size; + UChar data_size; + + /* words 2+ */ + Addr instr_addr; + CC I; + CC D; +}; + +static void init_iCC(iCC* cc, Addr instr_addr, UInt instr_size) +{ + cc->tag = INSTR_CC; + cc->instr_size = instr_size; + cc->instr_addr = instr_addr; + initCC(&cc->I); +} + +static void init_idCC(CC_type X_CC, idCC* cc, Addr instr_addr, + UInt instr_size, UInt data_size) +{ + cc->tag = X_CC; + cc->instr_size = instr_size; + cc->data_size = data_size; + cc->instr_addr = instr_addr; + initCC(&cc->I); + initCC(&cc->D); +} + +static __inline__ void sprint_iCC(Char buf[BUF_LEN], UInt ln, iCC* cc) +{ + VG_(sprintf)(buf, "%u %llu %llu %llu\n", + ln, cc->I.a, cc->I.m1, cc->I.m2/*, cc->instr_addr*/); +} + +static __inline__ void sprint_read_or_mod_CC(Char buf[BUF_LEN], UInt ln, + idCC* cc) +{ + VG_(sprintf)(buf, "%u %llu %llu %llu %llu %llu %llu\n", + ln, cc->I.a, cc->I.m1, cc->I.m2, + cc->D.a, cc->D.m1, cc->D.m2/*, cc->instr_addr*/); +} + +static __inline__ void sprint_write_CC(Char buf[BUF_LEN], UInt ln, idCC* cc) +{ + VG_(sprintf)(buf, "%u %llu %llu %llu . . . %llu %llu %llu\n", + ln, cc->I.a, cc->I.m1, cc->I.m2, + cc->D.a, cc->D.m1, cc->D.m2/*, cc->instr_addr*/); +} + +/*------------------------------------------------------------*/ +/*--- BBCC hash table stuff ---*/ +/*------------------------------------------------------------*/ + +/* The table of BBCCs is of the form hash(filename, hash(fn_name, + * hash(BBCCs))). Each hash table is separately chained. The sizes below work + * fairly well for Konqueror. */ + +#define N_FILE_ENTRIES 251 +#define N_FN_ENTRIES 53 +#define N_BBCC_ENTRIES 37 + +/* The cost centres for a basic block are stored in a contiguous array. + * They are distinguishable by their tag field. 
*/ +typedef struct _BBCC BBCC; +struct _BBCC { + Addr orig_addr; + UInt array_size; /* byte-size of variable length array */ + BBCC* next; + Addr array[0]; /* variable length array */ +}; + +typedef struct _fn_node fn_node; +struct _fn_node { + Char* fn_name; + BBCC* BBCCs[N_BBCC_ENTRIES]; + fn_node* next; +}; + +typedef struct _file_node file_node; +struct _file_node { + Char* filename; + fn_node* fns[N_FN_ENTRIES]; + file_node* next; +}; + +/* BBCC_table structure: list(filename, list(fn_name, list(BBCC))) */ +file_node *BBCC_table[N_FILE_ENTRIES]; + +Int distinct_files = 0; +Int distinct_fns = 0; + +Int distinct_instrs = 0; +Int full_debug_BBs = 0; +Int file_line_debug_BBs = 0; +Int fn_name_debug_BBs = 0; +Int no_debug_BBs = 0; + +Int BB_retranslations = 0; + +static void init_BBCC_table() +{ + Int i; + for (i = 0; i < N_FILE_ENTRIES; i++) + BBCC_table[i] = NULL; +} + +static void get_file_fn_names(Addr instr_addr, Char filename[FILENAME_LEN], + Char fn_name[FN_NAME_LEN]) +{ + UInt dummy_line_num; + Bool found1, found2, no_demangle = False; + + found1 = VG_(what_line_is_this)(instr_addr, filename, + FILENAME_LEN, &dummy_line_num); + found2 = VG_(what_fn_is_this)(no_demangle, instr_addr, fn_name, FN_NAME_LEN); + + if (!found1 && !found2) { + no_debug_BBs++; + VG_(strcpy)(filename, "???"); + VG_(strcpy)(fn_name, "???"); + + } else if ( found1 && found2) { + full_debug_BBs++; + + } else if ( found1 && !found2) { + file_line_debug_BBs++; + VG_(strcpy)(fn_name, "???"); + + } else /*(!found1 && found2)*/ { + fn_name_debug_BBs++; + VG_(strcpy)(filename, "???"); + } +} + +/* Forward declaration. 
*/ +static Int compute_BBCC_array_size(UCodeBlock* cb); + +static __inline__ +file_node* new_file_node(Char filename[FILENAME_LEN], file_node* next) +{ + Int i; + file_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(file_node)); + new->filename = VG_(strdup)(VG_AR_PRIVATE, filename); + for (i = 0; i < N_FN_ENTRIES; i++) { + new->fns[i] = NULL; + } + new->next = next; + return new; +} + +static __inline__ +fn_node* new_fn_node(Char fn_name[FILENAME_LEN], fn_node* next) +{ + Int i; + fn_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(fn_node)); + new->fn_name = VG_(strdup)(VG_AR_PRIVATE, fn_name); + for (i = 0; i < N_BBCC_ENTRIES; i++) { + new->BBCCs[i] = NULL; + } + new->next = next; + return new; +} + +static __inline__ +BBCC* new_BBCC(Addr bb_orig_addr, UCodeBlock* cb, BBCC* next) +{ + Int BBCC_array_size = compute_BBCC_array_size(cb); + BBCC* new; + + new = (BBCC*)VG_(malloc)(VG_AR_PRIVATE, sizeof(BBCC) + BBCC_array_size); + new->orig_addr = bb_orig_addr; + new->array_size = BBCC_array_size; + new->next = next; + + return new; +} + +#define HASH_CONSTANT 256 + +static UInt hash(Char *s, UInt table_size) +{ + int hash_value = 0; + for ( ; *s; s++) + hash_value = (HASH_CONSTANT * hash_value + *s) % table_size; + return hash_value; +} + +/* Do a three step traversal: by filename, then fn_name, then instr_addr. + * In all cases prepends new nodes to their chain. Returns a pointer to the + * cost centre. Also sets BB_seen_before by reference. 
+ */ +static __inline__ BBCC* get_BBCC(Addr bb_orig_addr, UCodeBlock* cb, + Bool *BB_seen_before) +{ + file_node *curr_file_node; + fn_node *curr_fn_node; + BBCC *curr_BBCC; + Char filename[FILENAME_LEN], fn_name[FN_NAME_LEN]; + UInt filename_hash, fnname_hash, BBCC_hash; + + get_file_fn_names(bb_orig_addr, filename, fn_name); + + VGP_PUSHCC(VgpCacheGetBBCC); + filename_hash = hash(filename, N_FILE_ENTRIES); + curr_file_node = BBCC_table[filename_hash]; + while (NULL != curr_file_node && + strcmp(filename, curr_file_node->filename) != 0) { + curr_file_node = curr_file_node->next; + } + if (NULL == curr_file_node) { + BBCC_table[filename_hash] = curr_file_node = + new_file_node(filename, BBCC_table[filename_hash]); + distinct_files++; + } + + fnname_hash = hash(fn_name, N_FN_ENTRIES); + curr_fn_node = curr_file_node->fns[fnname_hash]; + while (NULL != curr_fn_node && + strcmp(fn_name, curr_fn_node->fn_name) != 0) { + curr_fn_node = curr_fn_node->next; + } + if (NULL == curr_fn_node) { + curr_file_node->fns[fnname_hash] = curr_fn_node = + new_fn_node(fn_name, curr_file_node->fns[fnname_hash]); + distinct_fns++; + } + + BBCC_hash = bb_orig_addr % N_BBCC_ENTRIES; + curr_BBCC = curr_fn_node->BBCCs[BBCC_hash]; + while (NULL != curr_BBCC && bb_orig_addr != curr_BBCC->orig_addr) { + curr_BBCC = curr_BBCC->next; + } + if (curr_BBCC == NULL) { + curr_fn_node->BBCCs[BBCC_hash] = curr_BBCC = + new_BBCC(bb_orig_addr, cb, curr_fn_node->BBCCs[BBCC_hash]); + *BB_seen_before = False; + + } else { + vg_assert(bb_orig_addr == curr_BBCC->orig_addr); + vg_assert(curr_BBCC->array_size > 0 && curr_BBCC->array_size < 1000000); + if (VG_(clo_verbosity) > 1) { + VG_(message)(Vg_DebugMsg, "BB retranslation, retrieving from BBCC table"); + } + *BB_seen_before = True; + BB_retranslations++; + } + VGP_POPCC; + return curr_BBCC; +} + +/*------------------------------------------------------------*/ +/*--- Cache simulation instrumentation phase ---*/ 
+/*------------------------------------------------------------*/ + +#define uInstr1 VG_(newUInstr1) +#define uInstr2 VG_(newUInstr2) +#define uInstr3 VG_(newUInstr3) +#define dis VG_(disassemble) +#define uLiteral VG_(setLiteralField) +#define newTemp VG_(getNewTemp) + +static Int compute_BBCC_array_size(UCodeBlock* cb) +{ + UInstr* u_in; + Int i, CC_size, BBCC_size = 0; + Bool is_LOAD, is_STORE, is_FPU_R, is_FPU_W; + + is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False; + + for (i = 0; i < cb->used; i++) { + //VG_(ppUInstr)(0, &cb->instrs[i]); + + u_in = &cb->instrs[i]; + switch(u_in->opcode) { + + case INCEIP: + goto case_for_end_of_instr; + + case JMP: + if (u_in->cond != CondAlways) break; + + goto case_for_end_of_instr; + + case_for_end_of_instr: + + CC_size = (is_LOAD || is_STORE || is_FPU_R || is_FPU_W + ? sizeof(idCC) : sizeof(iCC)); + + BBCC_size += CC_size; + is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False; + break; + + case LOAD: + /* Two LDBs are possible for a single instruction */ + vg_assert(/*!is_LOAD &&*/ !is_STORE && !is_FPU_R && !is_FPU_W); + is_LOAD = True; + break; + + case STORE: + /* Multiple STOREs are possible for 'pushal' */ + vg_assert( /*!is_STORE &&*/ !is_FPU_R && !is_FPU_W); + is_STORE = True; + break; + + case FPU_R: + vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W); + is_FPU_R = True; + break; + + case FPU_W: + vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W); + is_FPU_W = True; + break; + + default: + break; + } + } + + return BBCC_size; +} + +/* Use this rather than eg. -1 because it's stored as a UInt. 
*/ +#define INVALID_DATA_SIZE 999999 + +UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr) +{ + UCodeBlock* cb; + Int i; + UInstr* u_in; + BBCC* BBCC_node; + Int t_CC_addr, t_read_addr, t_write_addr, t_data_addr; + Int CC_size = -1; /* Shut gcc warnings up */ + Addr instr_addr = orig_addr; + UInt instr_size, data_size = INVALID_DATA_SIZE; + Int helper = -1; /* Shut gcc warnings up */ + UInt stack_used; + Bool BB_seen_before = False; + Bool prev_instr_was_Jcond = False; + Addr BBCC_ptr0, BBCC_ptr; + + /* Get BBCC (creating if necessary -- requires a counting pass over the BB + * if it's the first time it's been seen), and point to start of the + * BBCC array. */ + BBCC_node = get_BBCC(orig_addr, cb_in, &BB_seen_before); + BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array); + + cb = VG_(allocCodeBlock)(); + cb->nextTemp = cb_in->nextTemp; + + t_CC_addr = t_read_addr = t_write_addr = t_data_addr = INVALID_TEMPREG; + + for (i = 0; i < cb_in->used; i++) { + u_in = &cb_in->instrs[i]; + + //VG_(ppUInstr)(0, u_in); + + /* What this is all about: we want to instrument each x86 instruction + * translation. The end of these are marked in three ways. The three + * ways, and the way we instrument them, are as follows: + * + * 1. UCode, INCEIP --> UCode, Instrumentation, INCEIP + * 2. UCode, Juncond --> UCode, Instrumentation, Juncond + * 3. UCode, Jcond, Juncond --> UCode, Instrumentation, Jcond, Juncond + * + * We must put the instrumentation before the jumps so that it is always + * executed. We don't have to put the instrumentation before the INCEIP + * (it could go after) but we do so for consistency. + * + * Junconds are always the last instruction in a basic block. Jconds are + * always the 2nd last, and must be followed by a Jcond. We check this + * with various assertions. + * + * Note that in VG_(disBB) we patched the `extra4b' field of the first + * occurring JMP in a block with the size of its x86 instruction. This + * is used now. 
+ * + * Note that we don't have to treat JIFZ specially; unlike JMPs, JIFZ + * occurs in the middle of a BB and gets an INCEIP after it. + * + * The instrumentation is just a call to the appropriate helper function, + * passing it the address of the instruction's CC. + */ + if (prev_instr_was_Jcond) vg_assert(u_in->opcode == JMP); + + switch (u_in->opcode) { + + case INCEIP: + instr_size = u_in->val1; + goto case_for_end_of_x86_instr; + + case JMP: + if (u_in->cond == CondAlways) { + vg_assert(i+1 == cb_in->used); + + /* Don't instrument if previous instr was a Jcond. */ + if (prev_instr_was_Jcond) { + vg_assert(0 == u_in->extra4b); + VG_(copyUInstr)(cb, u_in); + break; + } + prev_instr_was_Jcond = False; + + } else { + vg_assert(i+2 == cb_in->used); /* 2nd last instr in block */ + prev_instr_was_Jcond = True; + } + + /* Ah, the first JMP... instrument, please. */ + instr_size = u_in->extra4b; + goto case_for_end_of_x86_instr; + + /* Shared code that is executed at the end of an x86 translation + * block, marked by either an INCEIP or an unconditional JMP. */ + case_for_end_of_x86_instr: + +#define IS_(X) (INVALID_TEMPREG != t_##X##_addr) + + /* Initialise the CC in the BBCC array appropriately if it hasn't + * been initialised before. + * Then call appropriate sim function, passing it the CC address. + * Note that CALLM_S/CALL_E aren't required here; by this point, + * the checking related to them has already happened. 
*/ + stack_used = 0; + + vg_assert(instr_size >= 1 && instr_size <= MAX_x86_INSTR_SIZE); + vg_assert(0 != instr_addr); + + /* Save the caller-save registers before we push our args */ + uInstr1(cb, PUSH, 4, RealReg, R_EAX); + uInstr1(cb, PUSH, 4, RealReg, R_ECX); + uInstr1(cb, PUSH, 4, RealReg, R_EDX); + + if (!IS_(read) && !IS_(write)) { + iCC* CC_ptr = (iCC*)(BBCC_ptr); + vg_assert(INVALID_DATA_SIZE == data_size); + vg_assert(INVALID_TEMPREG == t_read_addr && + INVALID_TEMPREG == t_write_addr); + CC_size = sizeof(iCC); + if (!BB_seen_before) + init_iCC(CC_ptr, instr_addr, instr_size); + + helper = VGOFF_(cachesim_log_non_mem_instr); + + } else { + CC_type X_CC; + idCC* CC_ptr = (idCC*)(BBCC_ptr); + + vg_assert(4 == data_size || 2 == data_size || 1 == data_size || + 8 == data_size || 10 == data_size); + + CC_size = sizeof(idCC); + helper = VGOFF_(cachesim_log_mem_instr); + + if (IS_(read) && !IS_(write)) { + X_CC = READ_CC; + vg_assert(INVALID_TEMPREG != t_read_addr && + INVALID_TEMPREG == t_write_addr); + t_data_addr = t_read_addr; + + } else if (!IS_(read) && IS_(write)) { + X_CC = WRITE_CC; + vg_assert(INVALID_TEMPREG == t_read_addr && + INVALID_TEMPREG != t_write_addr); + t_data_addr = t_write_addr; + + } else { + vg_assert(IS_(read) && IS_(write)); + X_CC = MOD_CC; + vg_assert(INVALID_TEMPREG != t_read_addr && + INVALID_TEMPREG != t_write_addr); + t_data_addr = t_read_addr; + } + + if (!BB_seen_before) + init_idCC(X_CC, CC_ptr, instr_addr, instr_size, data_size); + + /* 2nd arg: data addr */ + uInstr1(cb, PUSH, 4, TempReg, t_data_addr); + stack_used += 4; + } +#undef IS_ + + /* 1st arg: CC addr */ + t_CC_addr = newTemp(cb); + uInstr2(cb, MOV, 4, Literal, 0, TempReg, t_CC_addr); + uLiteral(cb, BBCC_ptr); + uInstr1(cb, PUSH, 4, TempReg, t_CC_addr); + stack_used += 4; + + /* Call function and return. 
*/ + uInstr1(cb, CALLM, 0, Lit16, helper); + uInstr1(cb, CLEAR, 0, Lit16, stack_used); + + /* Restore the caller-save registers now the call is done */ + uInstr1(cb, POP, 4, RealReg, R_EDX); + uInstr1(cb, POP, 4, RealReg, R_ECX); + uInstr1(cb, POP, 4, RealReg, R_EAX); + + VG_(copyUInstr)(cb, u_in); + + /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */ + BBCC_ptr += CC_size; + instr_addr += instr_size; + t_CC_addr = t_read_addr = t_write_addr = + t_data_addr = INVALID_TEMPREG; + data_size = INVALID_DATA_SIZE; + break; + + + /* For memory-ref instrs, copy the data_addr into a temporary to be + * passed to the cachesim_log_function at the end of the instruction. + */ + case LOAD: + t_read_addr = newTemp(cb); + uInstr2(cb, MOV, 4, TempReg, u_in->val1, TempReg, t_read_addr); + data_size = u_in->size; + VG_(copyUInstr)(cb, u_in); + break; + + case FPU_R: + t_read_addr = newTemp(cb); + uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_read_addr); + data_size = u_in->size; + VG_(copyUInstr)(cb, u_in); + break; + + /* Note that we must set t_write_addr even for mod instructions; + * that's how the code above determines whether it does a write; + * without it, it would think a mod instruction is a read. + * As for the MOV, if it's a mod instruction it's redundant, but it's + * not expensive and mod instructions are rare anyway. 
*/ + case STORE: + case FPU_W: + t_write_addr = newTemp(cb); + uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_write_addr); + data_size = u_in->size; + VG_(copyUInstr)(cb, u_in); + break; + + case NOP: case CALLM_E: case CALLM_S: + break; + + default: + VG_(copyUInstr)(cb, u_in); + break; + } + } + + /* Just check everything looks ok */ + vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size); + + VG_(freeCodeBlock)(cb_in); + return cb; +} + +/*------------------------------------------------------------*/ +/*--- Cache simulation stuff ---*/ +/*------------------------------------------------------------*/ + +/* Total reads/writes/misses. Calculated during CC traversal at the end. */ +static CC Ir_total; +static CC Dr_total; +static CC Dw_total; + +void VG_(init_cachesim)(void) +{ + /* Make sure the output file can be written. */ + Int fd = VG_(open_write)(OUT_FILE); + if (-1 == fd) { + fd = VG_(create_and_write)(OUT_FILE); + if (-1 == fd) { + file_err(); + } + } + VG_(close)(fd); + + initCC(&Ir_total); + initCC(&Dr_total); + initCC(&Dw_total); + + cachesim_I1_initcache(); + cachesim_D1_initcache(); + cachesim_L2_initcache(); + + init_BBCC_table(); +} + +void VG_(cachesim_log_non_mem_instr)(iCC* cc) +{ + //VG_(printf)("sim I: CCaddr=0x%x, iaddr=0x%x, isize=%u\n", + // cc, cc->instr_addr, cc->instr_size) + VGP_PUSHCC(VgpCacheSimulate); + cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2); + cc->I.a++; + VGP_POPCC; +} + +void VG_(cachesim_log_mem_instr)(idCC* cc, Addr data_addr) +{ + //VG_(printf)("sim D: CCaddr=0x%x, iaddr=0x%x, isize=%u, daddr=0x%x, dsize=%u\n", + // cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size) + VGP_PUSHCC(VgpCacheSimulate); + cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2); + cc->I.a++; + + cachesim_D1_doref(data_addr, cc->data_size, &cc->D.m1, &cc->D.m2); + cc->D.a++; + VGP_POPCC; +} + +/*------------------------------------------------------------*/ +/*--- Printing of output 
file and summary stats ---*/ +/*------------------------------------------------------------*/ + +int get_line_num(Addr instr_addr) +{ + Char filename[FILENAME_LEN] = "???"; + UInt line_num; + Bool found; + + found = VG_(what_line_is_this)(instr_addr, filename, + FILENAME_LEN, &line_num); + if (!found) { + line_num = 0; + } + return line_num; +} + +static void fprint_BBCC(Int fd, BBCC* BBCC_node, Char *first_instr_fl, + Char *first_instr_fn) +{ + Addr BBCC_ptr0, BBCC_ptr; + Char buf[BUF_LEN], curr_file[BUF_LEN], fbuf[BUF_LEN+4]; + UInt line_num; + + BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array); + + VG_(write)(fd, (void*)"\n", 1); + + VG_(strcpy)(curr_file, first_instr_fl); + + while (BBCC_ptr - BBCC_ptr0 < BBCC_node->array_size) { + + /* We pretend the CC is an iCC for getting the tag. This is ok + * because both CC types have tag as their first byte. Once we know + * the type, we can cast and act appropriately. */ + + Char fl_buf[FILENAME_LEN]; + Char fn_buf[FN_NAME_LEN]; + + /* Assumes instr_addr position is same for both CCs. */ + Addr instr_addr = ((iCC*)BBCC_ptr)->instr_addr; + get_file_fn_names(instr_addr, fl_buf, fn_buf); + + /* Allow for filename switching in the middle of a BB; if this happens, + * must print the new filename with the function name. 
*/ + if (0 != strcmp(fl_buf, curr_file)) { + VG_(strcpy)(curr_file, fl_buf); + VG_(sprintf)(fbuf, "fi=%s\n", curr_file); + VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf)); + } + + switch ( ((iCC*)BBCC_ptr)->tag ) { + +#define ADD_CC_TO(CC_type, cc, total) \ + total.a += ((CC_type*)BBCC_ptr)->cc.a; \ + total.m1 += ((CC_type*)BBCC_ptr)->cc.m1; \ + total.m2 += ((CC_type*)BBCC_ptr)->cc.m2; + + case INSTR_CC: + line_num = get_line_num(((iCC*)BBCC_ptr)->instr_addr); + sprint_iCC(buf, line_num, (iCC*)BBCC_ptr); + ADD_CC_TO(iCC, I, Ir_total); + BBCC_ptr += sizeof(iCC); + break; + + case READ_CC: + case MOD_CC: + line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr); + sprint_read_or_mod_CC(buf, line_num, (idCC*)BBCC_ptr); + ADD_CC_TO(idCC, I, Ir_total); + ADD_CC_TO(idCC, D, Dr_total); + BBCC_ptr += sizeof(idCC); + break; + + case WRITE_CC: + line_num = get_line_num(((idCC*)BBCC_ptr)->instr_addr); + sprint_write_CC(buf, line_num, (idCC*)BBCC_ptr); + ADD_CC_TO(idCC, I, Ir_total); + ADD_CC_TO(idCC, D, Dw_total); + BBCC_ptr += sizeof(idCC); + break; + +#undef ADD_CC_TO + + default: + VG_(panic)("Unknown CC type in fprint_BBCC()\n"); + break; + } + distinct_instrs++; + + /* If the function name for this instruction doesn't match that of the + * first instruction in the BB, print out a warning. */ + if (VG_(clo_trace_symtab) && 0 != strcmp(fn_buf, first_instr_fn)) { + VG_(printf)("Mismatched function names\n"); + VG_(printf)(" filenames: BB:%s, instr:%s; " + "fn_names: BB:%s, instr:%s; " + "line: %d\n", + first_instr_fl, fl_buf, + first_instr_fn, fn_buf, + line_num); + } + + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + } + /* If we switched filenames in the middle of the BB without switching back, + * switch back now because the subsequent BB may be relying on falling under + * the original file name. 
*/ + if (0 != VG_(strcmp)(first_instr_fl, curr_file)) { + VG_(sprintf)(fbuf, "fe=%s\n", first_instr_fl); + VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf)); + } + //VG_(write)(fd, (void*)"#}\n", 3); + + vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size); +} + +static void fprint_BBCC_table_and_calc_totals(Int client_argc, + Char** client_argv) +{ + Int fd; + Char buf[BUF_LEN]; + file_node *curr_file_node; + fn_node *curr_fn_node; + BBCC *curr_BBCC; + Int i,j,k; + + VGP_PUSHCC(VgpCacheDump); + fd = VG_(open_write)(OUT_FILE); + if (-1 == fd) { file_err(); } + + /* "desc:" lines (giving I1/D1/L2 cache configuration) */ + VG_(write)(fd, (void*)I1_desc_line, VG_(strlen)(I1_desc_line)); + VG_(write)(fd, (void*)D1_desc_line, VG_(strlen)(D1_desc_line)); + VG_(write)(fd, (void*)L2_desc_line, VG_(strlen)(L2_desc_line)); + + /* "cmd:" line */ + VG_(strcpy)(buf, "cmd:"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + for (i = 0; i < client_argc; i++) { + VG_(sprintf)(buf, " %s", client_argv[i]); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + } + /* "events:" line */ + VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw\n"); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + /* Six loops here: three for the hash table arrays, and three for the + * chains hanging off the hash table arrays. 
*/ + for (i = 0; i < N_FILE_ENTRIES; i++) { + curr_file_node = BBCC_table[i]; + while (curr_file_node != NULL) { + VG_(sprintf)(buf, "fl=%s\n", curr_file_node->filename); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + for (j = 0; j < N_FN_ENTRIES; j++) { + curr_fn_node = curr_file_node->fns[j]; + while (curr_fn_node != NULL) { + VG_(sprintf)(buf, "fn=%s\n", curr_fn_node->fn_name); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + + for (k = 0; k < N_BBCC_ENTRIES; k++) { + curr_BBCC = curr_fn_node->BBCCs[k]; + while (curr_BBCC != NULL) { + fprint_BBCC(fd, curr_BBCC, + + curr_file_node->filename, + curr_fn_node->fn_name); + + curr_BBCC = curr_BBCC->next; + } + } + curr_fn_node = curr_fn_node->next; + } + } + curr_file_node = curr_file_node->next; + } + } + + /* Summary stats must come after rest of table, since we calculate them + * during traversal. */ + VG_(sprintf)(buf, "summary: " + "%llu %llu %llu " + "%llu %llu %llu " + "%llu %llu %llu\n", + Ir_total.a, Ir_total.m1, Ir_total.m2, + Dr_total.a, Dr_total.m1, Dr_total.m2, + Dw_total.a, Dw_total.m1, Dw_total.m2); + VG_(write)(fd, (void*)buf, VG_(strlen)(buf)); + VG_(close)(fd); +} + +/* Adds commas to ULong, right justifying in a field field_width wide, returns + * the string in buf. */ +Int commify(ULong n, int field_width, char buf[COMMIFY_BUF_LEN]) +{ + int len, n_commas, i, j, new_len, space; + + VG_(sprintf)(buf, "%lu", n); + len = VG_(strlen)(buf); + n_commas = (len - 1) / 3; + new_len = len + n_commas; + space = field_width - new_len; + + /* Allow for printing a number in a field_width smaller than it's size */ + if (space < 0) space = 0; + + /* Make j = -1 because we copy the '\0' before doing the numbers in groups + * of three. */ + for (j = -1, i = len ; i >= 0; i--) { + buf[i + n_commas + space] = buf[i]; + + if (3 == ++j) { + j = 0; + n_commas--; + buf[i + n_commas + space] = ','; + } + } + /* Right justify in field. 
*/ + for (i = 0; i < space; i++) buf[i] = ' '; + return new_len; +} + +void percentify(Int n, Int pow, Int field_width, char buf[]) +{ + int i, len, space; + + VG_(sprintf)(buf, "%d.%d%%", n / pow, n % pow); + len = VG_(strlen)(buf); + space = field_width - len; + i = len; + + /* Right justify in field */ + for ( ; i >= 0; i--) buf[i + space] = buf[i]; + for (i = 0; i < space; i++) buf[i] = ' '; +} + +void VG_(show_cachesim_results)(Int client_argc, Char** client_argv) +{ + CC D_total; + ULong L2_total_m, L2_total_mr, L2_total_mw; + char buf1[RESULTS_BUF], + buf2[RESULTS_BUF], + buf3[RESULTS_BUF]; + Int l1, l2, l3; + Int p; + + fprint_BBCC_table_and_calc_totals(client_argc, client_argv); + + /* I cache results. Use the I_refs value to determine the first column + * width. */ + l1 = commify(Ir_total.a, 0, buf1); + VG_(message)(Vg_UserMsg, "I refs: %s", buf1); + + commify(Ir_total.m1, l1, buf1); + VG_(message)(Vg_UserMsg, "I1 misses: %s", buf1); + + commify(Ir_total.m2, l1, buf1); + VG_(message)(Vg_UserMsg, "L2 misses: %s", buf1); + + p = 100; + + percentify(Ir_total.m1 * 100 * p / Ir_total.a, p, l1+1, buf1); + VG_(message)(Vg_UserMsg, "I1 miss rate: %s", buf1); + + percentify(Ir_total.m2 * 100 * p / Ir_total.a, p, l1+1, buf1); + VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1); + VG_(message)(Vg_UserMsg, ""); + + /* D cache results. Use the D_refs.rd and D_refs.wr values to determine the + * width of columns 2 & 3. 
*/ + D_total.a = Dr_total.a + Dw_total.a; + D_total.m1 = Dr_total.m1 + Dw_total.m1; + D_total.m2 = Dr_total.m2 + Dw_total.m2; + + commify( D_total.a, 0, buf1); + l2 = commify(Dr_total.a, 0, buf2); + l3 = commify(Dw_total.a, 0, buf3); + VG_(message)(Vg_UserMsg, "D refs: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + commify( D_total.m1, l1, buf1); + commify(Dr_total.m1, l2, buf2); + commify(Dw_total.m1, l3, buf3); + VG_(message)(Vg_UserMsg, "D1 misses: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + commify( D_total.m2, l1, buf1); + commify(Dr_total.m2, l2, buf2); + commify(Dw_total.m2, l3, buf3); + VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + p = 10; + + percentify( D_total.m1 * 100 * p / D_total.a, p, l1+1, buf1); + percentify(Dr_total.m1 * 100 * p / Dr_total.a, p, l2+1, buf2); + percentify(Dw_total.m1 * 100 * p / Dw_total.a, p, l3+1, buf3); + VG_(message)(Vg_UserMsg, "D1 miss rate: %s (%s + %s )", buf1, buf2,buf3); + + percentify( D_total.m2 * 100 * p / D_total.a, p, l1+1, buf1); + percentify(Dr_total.m2 * 100 * p / Dr_total.a, p, l2+1, buf2); + percentify(Dw_total.m2 * 100 * p / Dw_total.a, p, l3+1, buf3); + VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )", buf1, buf2,buf3); + VG_(message)(Vg_UserMsg, ""); + + /* L2 overall results */ + L2_total_m = Dr_total.m2 + Dw_total.m2 + Ir_total.m2; + L2_total_mr = Dr_total.m2 + Ir_total.m2; + L2_total_mw = Dw_total.m2; + + commify(L2_total_m, l1, buf1); + commify(L2_total_mr, l2, buf2); + commify(L2_total_mw, l3, buf3); + VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)", + buf1, buf2, buf3); + + percentify(L2_total_m * 100 * p / (Ir_total.a + D_total.a), p, l1+1, buf1); + percentify(L2_total_mr * 100 * p / (Ir_total.a + Dr_total.a), p, l2+1, buf2); + percentify(L2_total_mw * 100 * p / Dw_total.a, p, l3+1, buf3); + VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )", buf1, buf2,buf3); + + + /* Hash table stats */ + if (VG_(clo_verbosity) > 1) { + int BB_lookups = 
full_debug_BBs + fn_name_debug_BBs + + file_line_debug_BBs + no_debug_BBs; + + VG_(message)(Vg_DebugMsg, ""); + VG_(message)(Vg_DebugMsg, "Distinct files: %d", distinct_files); + VG_(message)(Vg_DebugMsg, "Distinct fns: %d", distinct_fns); + VG_(message)(Vg_DebugMsg, "BB lookups: %d", BB_lookups); + VG_(message)(Vg_DebugMsg, "With full debug info:%3d%% (%d)", + full_debug_BBs * 100 / BB_lookups, + full_debug_BBs); + VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)", + file_line_debug_BBs * 100 / BB_lookups, + file_line_debug_BBs); + VG_(message)(Vg_DebugMsg, "With fn name debug info:%3d%% (%d)", + fn_name_debug_BBs * 100 / BB_lookups, + fn_name_debug_BBs); + VG_(message)(Vg_DebugMsg, "With no debug info:%3d%% (%d)", + no_debug_BBs * 100 / BB_lookups, + no_debug_BBs); + VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations); + VG_(message)(Vg_DebugMsg, "Distinct instrs: %d", distinct_instrs); + } + VGP_POPCC; +} + diff --git a/vg_cachesim_D1.c b/vg_cachesim_D1.c new file mode 100644 index 0000000000..c2b3d4cc3e --- /dev/null +++ b/vg_cachesim_D1.c @@ -0,0 +1,93 @@ +/* D1 cache simulator, generated by vg_cachegen. 
+ * total size = 65536 bytes + * line size = 64 bytes + * associativity = 2-way associative + * + * This file should be #include-d into vg_cachesim.c + */ + +static char D1_desc_line[] = + "desc: D1 cache: 65536 B, 64 B, 2-way associative\n"; + +static UInt D1_tags[512][2]; + +static void cachesim_D1_initcache(void) +{ + UInt set, way; + for (set = 0; set < 512; set++) + for (way = 0; way < 2; way++) + D1_tags[set][way] = 0; +} + +static __inline__ +void cachesim_D1_doref(Addr a, UChar size, ULong* m1, ULong *m2) +{ + register UInt set1 = ( a >> 6) & (512-1); + register UInt set2 = ((a + size) >> 6) & (512-1); + register UInt tag = a >> (6 + 9); + + if (set1 == set2) { + + if (tag == D1_tags[set1][0]) { + return; + } + else if (tag == D1_tags[set1][1]) { + D1_tags[set1][1] = D1_tags[set1][0]; + D1_tags[set1][0] = tag; + return; + } + else { + /* A miss */ + D1_tags[set1][1] = D1_tags[set1][0]; + D1_tags[set1][0] = tag; + + (*m1)++; + cachesim_L2_doref(a, size, m2); + } + + } else if ((set1 + 1) % 512 == set2) { + + Bool is_D1_miss = False; + + /* Block one */ + if (tag == D1_tags[set1][0]) { + } + else if (tag == D1_tags[set1][1]) { + D1_tags[set1][1] = D1_tags[set1][0]; + D1_tags[set1][0] = tag; + } + else { + /* A miss */ + D1_tags[set1][1] = D1_tags[set1][0]; + D1_tags[set1][0] = tag; + + is_D1_miss = True; + } + + /* Block two */ + if (tag == D1_tags[set2][0]) { + } + else if (tag == D1_tags[set2][1]) { + D1_tags[set2][1] = D1_tags[set2][0]; + D1_tags[set2][0] = tag; + } + else { + /* A miss */ + D1_tags[set2][1] = D1_tags[set2][0]; + D1_tags[set2][0] = tag; + + is_D1_miss = True; + } + + /* Miss treatment */ + if (is_D1_miss) { + (*m1)++; + cachesim_L2_doref(a, size, m2); + } + + } else { + VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size); + VG_(printf)("sets %d and %d.\n", set1, set2); + VG_(panic)("D1 cache set mismatch"); + } +} diff --git a/vg_cachesim_I1.c b/vg_cachesim_I1.c new file mode 100644 index 
0000000000..32b89b51b0 --- /dev/null +++ b/vg_cachesim_I1.c @@ -0,0 +1,93 @@ +/* I1 cache simulator, generated by vg_cachegen. + * total size = 65536 bytes + * line size = 64 bytes + * associativity = 2-way associative + * + * This file should be #include-d into vg_cachesim.c + */ + +static char I1_desc_line[] = + "desc: I1 cache: 65536 B, 64 B, 2-way associative\n"; + +static UInt I1_tags[512][2]; + +static void cachesim_I1_initcache(void) +{ + UInt set, way; + for (set = 0; set < 512; set++) + for (way = 0; way < 2; way++) + I1_tags[set][way] = 0; +} + +static __inline__ +void cachesim_I1_doref(Addr a, UChar size, ULong* m1, ULong *m2) +{ + register UInt set1 = ( a >> 6) & (512-1); + register UInt set2 = ((a + size) >> 6) & (512-1); + register UInt tag = a >> (6 + 9); + + if (set1 == set2) { + + if (tag == I1_tags[set1][0]) { + return; + } + else if (tag == I1_tags[set1][1]) { + I1_tags[set1][1] = I1_tags[set1][0]; + I1_tags[set1][0] = tag; + return; + } + else { + /* A miss */ + I1_tags[set1][1] = I1_tags[set1][0]; + I1_tags[set1][0] = tag; + + (*m1)++; + cachesim_L2_doref(a, size, m2); + } + + } else if ((set1 + 1) % 512 == set2) { + + Bool is_I1_miss = False; + + /* Block one */ + if (tag == I1_tags[set1][0]) { + } + else if (tag == I1_tags[set1][1]) { + I1_tags[set1][1] = I1_tags[set1][0]; + I1_tags[set1][0] = tag; + } + else { + /* A miss */ + I1_tags[set1][1] = I1_tags[set1][0]; + I1_tags[set1][0] = tag; + + is_I1_miss = True; + } + + /* Block two */ + if (tag == I1_tags[set2][0]) { + } + else if (tag == I1_tags[set2][1]) { + I1_tags[set2][1] = I1_tags[set2][0]; + I1_tags[set2][0] = tag; + } + else { + /* A miss */ + I1_tags[set2][1] = I1_tags[set2][0]; + I1_tags[set2][0] = tag; + + is_I1_miss = True; + } + + /* Miss treatment */ + if (is_I1_miss) { + (*m1)++; + cachesim_L2_doref(a, size, m2); + } + + } else { + VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size); + VG_(printf)("sets %d and %d.\n", set1, set2); + 
VG_(panic)("I1 cache set mismatch"); + } +} diff --git a/vg_cachesim_L2.c b/vg_cachesim_L2.c new file mode 100644 index 0000000000..bb685326ab --- /dev/null +++ b/vg_cachesim_L2.c @@ -0,0 +1,250 @@ +/* L2 cache simulator, generated by vg_cachegen. + * total size = 262144 bytes + * line size = 64 bytes + * associativity = 8-way associative + * + * This file should be #include-d into vg_cachesim.c + */ + +static char L2_desc_line[] = + "desc: L2 cache: 262144 B, 64 B, 8-way associative\n"; + +static UInt L2_tags[512][8]; + +static void cachesim_L2_initcache(void) +{ + UInt set, way; + for (set = 0; set < 512; set++) + for (way = 0; way < 8; way++) + L2_tags[set][way] = 0; +} + +static __inline__ +void cachesim_L2_doref(Addr a, UChar size, ULong *m2) +{ + register UInt set1 = ( a >> 6) & (512-1); + register UInt set2 = ((a + size) >> 6) & (512-1); + register UInt tag = a >> (6 + 9); + + if (set1 == set2) { + + if (tag == L2_tags[set1][0]) { + return; + } + else if (tag == L2_tags[set1][1]) { + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][2]) { + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][3]) { + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][4]) { + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][5]) { + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][6]) { + L2_tags[set1][6] = 
L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else if (tag == L2_tags[set1][7]) { + L2_tags[set1][7] = L2_tags[set1][6]; + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + return; + } + else { + /* A miss */ + L2_tags[set1][7] = L2_tags[set1][6]; + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + + (*m2)++; + } + + } else if ((set1 + 1) % 512 == set2) { + + Bool is_L2_miss = False; + + /* Block one */ + if (tag == L2_tags[set1][0]) { + } + else if (tag == L2_tags[set1][1]) { + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][2]) { + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][3]) { + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][4]) { + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][5]) { + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + 
else if (tag == L2_tags[set1][6]) { + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else if (tag == L2_tags[set1][7]) { + L2_tags[set1][7] = L2_tags[set1][6]; + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + } + else { + /* A miss */ + L2_tags[set1][7] = L2_tags[set1][6]; + L2_tags[set1][6] = L2_tags[set1][5]; + L2_tags[set1][5] = L2_tags[set1][4]; + L2_tags[set1][4] = L2_tags[set1][3]; + L2_tags[set1][3] = L2_tags[set1][2]; + L2_tags[set1][2] = L2_tags[set1][1]; + L2_tags[set1][1] = L2_tags[set1][0]; + L2_tags[set1][0] = tag; + + is_L2_miss = True; + } + + /* Block two */ + if (tag == L2_tags[set2][0]) { + } + else if (tag == L2_tags[set2][1]) { + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][2]) { + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][3]) { + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][4]) { + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][5]) { + L2_tags[set2][5] = L2_tags[set2][4]; + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == 
L2_tags[set2][6]) { + L2_tags[set2][6] = L2_tags[set2][5]; + L2_tags[set2][5] = L2_tags[set2][4]; + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else if (tag == L2_tags[set2][7]) { + L2_tags[set2][7] = L2_tags[set2][6]; + L2_tags[set2][6] = L2_tags[set2][5]; + L2_tags[set2][5] = L2_tags[set2][4]; + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + } + else { + /* A miss */ + L2_tags[set2][7] = L2_tags[set2][6]; + L2_tags[set2][6] = L2_tags[set2][5]; + L2_tags[set2][5] = L2_tags[set2][4]; + L2_tags[set2][4] = L2_tags[set2][3]; + L2_tags[set2][3] = L2_tags[set2][2]; + L2_tags[set2][2] = L2_tags[set2][1]; + L2_tags[set2][1] = L2_tags[set2][0]; + L2_tags[set2][0] = tag; + + is_L2_miss = True; + } + + /* Miss treatment */ + if (is_L2_miss) { + (*m2)++; + } + + } else { + VG_(printf)("\nERROR: Data item 0x%x of size %u bytes is in two non-adjacent\n", a, size); + VG_(printf)("sets %d and %d.\n", set1, set2); + VG_(panic)("L2 cache set mismatch"); + } +} diff --git a/vg_include.h b/vg_include.h index b51ef059a8..08a1ab20de 100644 --- a/vg_include.h +++ b/vg_include.h @@ -234,6 +234,8 @@ extern Bool VG_(clo_optimise); extern Bool VG_(clo_instrument); /* DEBUG: clean up instrumented code? default: YES */ extern Bool VG_(clo_cleanup); +/* Cache simulation instrumentation? default: NO */ +extern Bool VG_(clo_cachesim); /* SMC write checks? default: SOME (1,2,4 byte movs to mem) */ extern Int VG_(clo_smc_check); /* DEBUG: print system calls? 
default: NO */ @@ -287,7 +289,7 @@ extern void VG_(shutdown_logging) ( void ); #define VGP_M_STACK 10 -#define VGP_M_CCS 20 /* == the # of elems in VGP_LIST */ +#define VGP_M_CCS 24 /* == the # of elems in VGP_LIST */ #define VGP_LIST \ VGP_PAIR(VgpRun=0, "running"), \ VGP_PAIR(VgpMalloc, "low-lev malloc/free"), \ @@ -307,6 +309,10 @@ extern void VG_(shutdown_logging) ( void ); VGP_PAIR(VgpAddToT, "add-to-transtab"), \ VGP_PAIR(VgpSARP, "set-addr-range-perms"), \ VGP_PAIR(VgpSyscall, "syscall wrapper"), \ + VGP_PAIR(VgpCacheInstrument, "cache instrument"), \ + VGP_PAIR(VgpCacheGetBBCC,"cache get BBCC"), \ + VGP_PAIR(VgpCacheSimulate, "cache simulate"), \ + VGP_PAIR(VgpCacheDump, "cache stats dump"), \ VGP_PAIR(VgpSpare1, "spare 1"), \ VGP_PAIR(VgpSpare2, "spare 2") @@ -718,8 +724,10 @@ extern void VG_(assert_fail) ( Char* expr, Char* file, Int line, Char* fn ) __attribute__ ((__noreturn__)); -/* Reading files. */ +/* Reading and writing files. */ extern Int VG_(open_read) ( Char* pathname ); +extern Int VG_(open_write) ( Char* pathname ); +extern Int VG_(create_and_write) ( Char* pathname ); extern void VG_(close) ( Int fd ); extern Int VG_(read) ( Int fd, void* buf, Int count); extern Int VG_(write) ( Int fd, void* buf, Int count); @@ -955,7 +963,9 @@ typedef UChar tag3:4; /* third operand tag */ UChar extra4b:4; /* Spare field, used by WIDEN for src -size, and by LEA2 for scale - (1,2,4 or 8) */ + (1,2,4 or 8), and by unconditional JMPs for + orig x86 instr size if --cachesim=yes */ + /* word 5 */ UChar cond; /* condition, for jumps */ @@ -1044,6 +1054,10 @@ extern Bool VG_(anyFlagUse) ( UInstr* u ); extern void VG_(ppUInstr) ( Int instrNo, UInstr* u ); extern void VG_(ppUCodeBlock) ( UCodeBlock* cb, Char* title ); +extern UCodeBlock* VG_(allocCodeBlock) ( void ); +extern void VG_(freeCodeBlock) ( UCodeBlock* cb ); +extern void VG_(copyUInstr) ( UCodeBlock* cb, UInstr* instr ); + extern Char* VG_(nameCondcode) ( Condcode cond ); extern Bool VG_(saneUInstr) ( Bool 
beforeRA, UInstr* u ); extern Bool VG_(saneUCodeBlock) ( UCodeBlock* cb ); @@ -1184,6 +1198,11 @@ extern void VG_(what_obj_and_fun_is_this) ( Addr a, Char* obj_buf, Int n_obj_buf, Char* fun_buf, Int n_fun_buf ); +extern Bool VG_(what_line_is_this) ( Addr a, + UChar* filename, Int n_filename, + UInt* lineno ); +extern Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a, + Char* fn_name, Int n_fn_name); extern void VG_(symtab_notify_munmap) ( Addr start, UInt length ); @@ -1590,6 +1609,20 @@ extern void VG_(helper_value_check0_fail); extern void VG_(signalreturn_bogusRA)( void ); extern void VG_(pthreadreturn_bogusRA)( void ); +/* --------------------------------------------------------------------- + Exports of vg_cachesim.c + ------------------------------------------------------------------ */ + +extern UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr); + +typedef struct _iCC iCC; +typedef struct _idCC idCC; + +extern void VG_(init_cachesim) ( void ); +extern void VG_(show_cachesim_results)( Int client_argc, Char** client_argv ); + +extern void VG_(cachesim_log_non_mem_instr)( iCC* cc ); +extern void VG_(cachesim_log_mem_instr) ( idCC* cc, Addr data_addr ); /* --------------------------------------------------------------------- The state of the simulated CPU. 
@@ -1724,7 +1757,8 @@ extern Int VGOFF_(handle_esp_assignment); /* :: Addr -> void */ extern Int VGOFF_(fpu_write_check); /* :: Addr -> Int -> void */ extern Int VGOFF_(fpu_read_check); /* :: Addr -> Int -> void */ - +extern Int VGOFF_(cachesim_log_non_mem_instr); +extern Int VGOFF_(cachesim_log_mem_instr); #endif /* ndef __VG_INCLUDE_H */ diff --git a/vg_main.c b/vg_main.c index 5f7fe59c34..ee875e226b 100644 --- a/vg_main.c +++ b/vg_main.c @@ -103,7 +103,8 @@ Int VGOFF_(helperc_STOREV1) = INVALID_OFFSET; Int VGOFF_(handle_esp_assignment) = INVALID_OFFSET; Int VGOFF_(fpu_write_check) = INVALID_OFFSET; Int VGOFF_(fpu_read_check) = INVALID_OFFSET; - +Int VGOFF_(cachesim_log_non_mem_instr) = INVALID_OFFSET; +Int VGOFF_(cachesim_log_mem_instr) = INVALID_OFFSET; /* This is the actual defn of baseblock. */ UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS]; @@ -165,6 +166,13 @@ static void vg_init_baseBlock ( void ) /* 16 */ VGOFF_(sh_edi) = alloc_BaB(1); /* 17 */ VGOFF_(sh_eflags) = alloc_BaB(1); + /* 17a */ + VGOFF_(cachesim_log_non_mem_instr) + = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_non_mem_instr) ); + /* 17b */ + VGOFF_(cachesim_log_mem_instr) + = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_mem_instr) ); + /* 18 */ VGOFF_(helper_value_check4_fail) = alloc_BaB_1_set( (Addr) & VG_(helper_value_check4_fail) ); @@ -419,6 +427,7 @@ Bool VG_(clo_single_step); Bool VG_(clo_optimise); Bool VG_(clo_instrument); Bool VG_(clo_cleanup); +Bool VG_(clo_cachesim); Int VG_(clo_smc_check); Bool VG_(clo_trace_syscalls); Bool VG_(clo_trace_signals); @@ -745,6 +754,11 @@ static void process_cmd_line_options ( void ) else if (STREQ(argv[i], "--cleanup=no")) VG_(clo_cleanup) = False; + else if (STREQ(argv[i], "--cachesim=yes")) + VG_(clo_cachesim) = True; + else if (STREQ(argv[i], "--cachesim=no")) + VG_(clo_cachesim) = False; + else if (STREQ(argv[i], "--smc-check=none")) VG_(clo_smc_check) = VG_CLO_SMC_NONE; else if (STREQ(argv[i], "--smc-check=some")) @@ -821,6 +835,11 @@ static void 
process_cmd_line_options ( void ) VG_(clo_logfile_fd) = eventually_logfile_fd; + /* Don't do memory checking if simulating the cache. */ + if (VG_(clo_cachesim)) { + VG_(clo_instrument) = False; + } + if (VG_(clo_verbosity > 0)) VG_(message)(Vg_UserMsg, "valgrind-%s, a memory error detector for x86 GNU/Linux.", @@ -978,7 +997,7 @@ void VG_(main) ( void ) attach GDB in another shell. */ /* {extern unsigned int sleep(unsigned int seconds); sleep(10);} */ - if (VG_(clo_instrument)) { + if (VG_(clo_instrument) || VG_(clo_cachesim)) { VGP_PUSHCC(VgpInitAudit); VGM_(init_memory_audit)(); VGP_POPCC; @@ -1012,6 +1031,9 @@ void VG_(main) ( void ) VGM_(make_readable) ( (Addr)&VG_(clo_sloppy_malloc), 1 ); } + if (VG_(clo_cachesim)) + VG_(init_cachesim)(); + if (VG_(clo_verbosity) > 0) VG_(message)(Vg_UserMsg, ""); @@ -1039,6 +1061,9 @@ void VG_(main) ( void ) } VG_(running_on_simd_CPU) = False; + if (VG_(clo_cachesim)) + VG_(show_cachesim_results)(VG_(client_argc), VG_(client_argv)); + VG_(do_sanity_checks)( 1 /* root thread */, True /*include expensive checks*/ ); diff --git a/vg_mylibc.c b/vg_mylibc.c index 740b21e3d1..77dd153a2e 100644 --- a/vg_mylibc.c +++ b/vg_mylibc.c @@ -885,7 +885,49 @@ Int VG_(open_read) ( Char* pathname ) if (VG_(is_kerror)(fd)) fd = -1; return fd; } + +/* Returns -1 on failure. */ +static Int VG_(chmod_u_rw) ( Int fd ) +{ + Int res; + const int O_IRUSR_IWUSR = 000600; /* See /usr/include/cpio.h */ + res = vg_do_syscall2(__NR_fchmod, fd, O_IRUSR_IWUSR); + if (VG_(is_kerror)(res)) res = -1; + return res; +} + +/* Returns -1 on failure. */ +Int VG_(create_and_write) ( Char* pathname ) +{ + Int fd; + + const int O_CR_AND_WR_ONLY = 0101; /* See /usr/include/bits/fcntl.h */ + fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_CR_AND_WR_ONLY, 0); + /* VG_(printf)("result = %d\n", fd); */ + if (VG_(is_kerror)(fd)) { + fd = -1; + } else { + VG_(chmod_u_rw)(fd); + if (VG_(is_kerror)(fd)) { + fd = -1; + } + } + return fd; +} +/* Returns -1 on failure. 
*/ +Int VG_(open_write) ( Char* pathname ) +{ + Int fd; + + const int O_WRONLY_AND_TRUNC = 01001; /* See /usr/include/bits/fcntl.h */ + fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_WRONLY_AND_TRUNC, 0); + /* VG_(printf)("result = %d\n", fd); */ + if (VG_(is_kerror)(fd)) { + fd = -1; + } + return fd; +} void VG_(close) ( Int fd ) { diff --git a/vg_symtab2.c b/vg_symtab2.c index 74cada4e82..d9da8fff45 100644 --- a/vg_symtab2.c +++ b/vg_symtab2.c @@ -1,4 +1,3 @@ - /*--------------------------------------------------------------------*/ /*--- Management of symbols and debugging information. ---*/ /*--- vg_symtab2.c ---*/ @@ -50,6 +49,23 @@ /*--- Structs n stuff ---*/ /*------------------------------------------------------------*/ +/* Stabs entry types, from: + * The "stabs" debug format + * Menapace, Kingdon and MacKenzie + * Cygnus Support + */ +typedef enum { N_FUN = 36, /* Function start or end */ + N_STSYM = 38, /* Data segment file-scope variable */ + N_LCSYM = 40, /* BSS segment file-scope variable */ + N_RSYM = 64, /* Register variable */ + N_SLINE = 68, /* Source line number */ + N_SO = 100, /* Source file path and name */ + N_LSYM = 128, /* Stack variable or type */ + N_SOL = 132, /* Include file name */ + N_LBRAC = 192, /* Start of lexical block */ + N_RBRAC = 224 /* End of lexical block */ + } stab_types; + /* A structure to hold an ELF symbol (very crudely). */ typedef struct { @@ -535,12 +551,7 @@ void addLineInfo ( SegInfo* si, { RiLoc loc; UInt size = end - start + 1; -# if 0 - if (size > 10000) - VG_(printf)( "line %4d: %p .. %p, in %s\n", - lineno, start, end, - &si->strtab[fnmoff] ); -# endif + /* Sanity ... 
*/ if (size > 10000) return; @@ -577,8 +588,8 @@ void vg_read_lib_symbols ( SegInfo* si ) /* for the .stabs reader */ Int curr_filenmoff; Addr curr_fnbaseaddr; - Addr range_startAddr; - Int range_lineno; + Char* curr_file_name; + Int n_stab_entries; oimage = (Addr)NULL; if (VG_(clo_verbosity) > 1) @@ -880,25 +891,23 @@ void vg_read_lib_symbols ( SegInfo* si ) /* Ok. It all looks plausible. Go on and read debug data. stab kinds: 100 N_SO a source file name 68 N_SLINE a source line number - 36 N_FUN ? start of a function - - In this loop, we maintain a current file name, updated - as N_SOs appear, and a current function base address, - updated as N_FUNs appear. Based on that, address ranges - for N_SLINEs are calculated, and stuffed into the - line info table. - - N_SLINE indicates the start of a source line. Functions are - delimited by N_FUNS, at the start with a non-empty string and at - the end with an empty string. The latter facilitates detecting - where to close the last N_SLINE for a function. + 36 N_FUN start of a function + + In this loop, we maintain a current file name, updated as + N_SO/N_SOLs appear, and a current function base address, + updated as N_FUNs appear. Based on that, address ranges for + N_SLINEs are calculated, and stuffed into the line info table. + + Finding the instruction address range covered by an N_SLINE is + complicated; see the N_SLINE case below. 
*/ curr_filenmoff = addStr(si,"???"); curr_fnbaseaddr = (Addr)NULL; - range_startAddr = 0; - range_lineno = 0; + curr_file_name = (Char*)NULL; - for (i = 0; i < stab_sz/(int)sizeof(struct nlist); i++) { + n_stab_entries = stab_sz/(int)sizeof(struct nlist); + + for (i = 0; i < n_stab_entries; i++) { # if 0 VG_(printf) ( " %2d ", i ); VG_(printf) ( "type=0x%x othr=%d desc=%d value=0x%x strx=%d %s", @@ -910,61 +919,95 @@ void vg_read_lib_symbols ( SegInfo* si ) # endif switch (stab[i].n_type) { - - case 68: { /* N_SLINE */ - /* flush the current line, if any, and start a new one */ - Addr range_endAddr - = curr_fnbaseaddr - + (UInt)stab[i].n_value - 1; - if (range_startAddr != 0) { - addLineInfo ( si, - curr_filenmoff, - range_startAddr, - range_endAddr, - range_lineno ); + UInt next_addr; + + /* To compute the instr address range covered by a single line, find + * the address of the next thing and compute the difference. The + * approach used depends on what kind of entry/entries follow... */ + case N_SLINE: { + Int lineno = stab[i].n_desc; + Int this_addr = (UInt)stab[i].n_value; + + LOOP: + vg_assert(i+1 < n_stab_entries); /* Haven't reached end */ + switch (stab[i+1].n_type) { + /* Easy, common case: use address of next entry */ + case N_SLINE: case N_SO: + next_addr = (UInt)stab[i+1].n_value; + break; + + /* Boring one: skip, look for something more useful. 
*/ + case N_RSYM: case N_LSYM: case N_LBRAC: case N_RBRAC: + case N_STSYM: case N_LCSYM: + i++; + goto LOOP; + + /* Should be an end of fun entry, use its address */ + case N_FUN: + if ('\0' == * (stabstr + stab[i+1].n_un.n_strx) ) { + next_addr = (UInt)stab[i+1].n_value; + } else { + VG_(printf)("unhandled stabs case: N_FUN start %d %s\n", + i, (stabstr + stab[i+1].n_un.n_strx) ); + VG_(panic)("argh"); + } + break; + + /* N_SOL should be followed by an N_SLINE which can be used */ + case N_SOL: + if (i+2 < n_stab_entries && N_SLINE == stab[i+2].n_type) { + next_addr = (UInt)stab[i+2].n_value; + break; + } else { + VG_(printf)("unhandled N_SOL stabs case: %d %d %d", + stab[i+1].n_type, i, n_stab_entries); + VG_(panic)("argh"); + } + + default: + VG_(printf)("unhandled stabs case: %d %d", + stab[i+1].n_type,i); + VG_(panic)("argh"); } - range_startAddr = range_endAddr + 1; - range_lineno = stab[i].n_desc; + + //Int offset2 = (i+1 < n_stab_entries && 68 == stab[i+1].n_type + // ? (UInt)stab[i+1].n_value - 1 + // : offset + 1); + //if (i+1 < n_stab_entries) { + // int x; + // if (68 != (x = stab[i+1].n_type)) { + // VG_(printf)("%d ", x); + // } + //} + + addLineInfo ( si, curr_filenmoff, curr_fnbaseaddr + this_addr, + curr_fnbaseaddr + next_addr - 1, lineno ); break; } - case 36: { /* N_FUN */ - if ('\0' == * (stabstr + stab[i].n_un.n_strx) ) { - /* N_FUN with no name -- indicates the end of a fn. - Flush the current line, if any, but don't start a - new one. */ - Addr range_endAddr - = curr_fnbaseaddr - + (UInt)stab[i].n_value - 1; - if (range_startAddr != 0) { - addLineInfo ( si, - curr_filenmoff, - range_startAddr, - range_endAddr, - range_lineno ); - } - range_startAddr = 0; - } else { + case N_FUN: { + if ('\0' != (stabstr + stab[i].n_un.n_strx)[0] ) { /* N_FUN with a name -- indicates the start of a fn. 
*/ - curr_fnbaseaddr = si->offset - + (Addr)stab[i].n_value; - range_startAddr = curr_fnbaseaddr; + curr_fnbaseaddr = si->offset + (Addr)stab[i].n_value; } break; } - case 100: /* N_SO */ - case 132: /* N_SOL */ + case N_SO: case N_SOL: /* seems to give lots of locations in header files */ /* case 130: */ /* BINCL */ { UChar* nm = stabstr + stab[i].n_un.n_strx; UInt len = VG_(strlen)(nm); - if (len > 0 && nm[len-1] != '/') + + if (len > 0 && nm[len-1] != '/') { curr_filenmoff = addStr ( si, nm ); + curr_file_name = stabstr + stab[i].n_un.n_strx; + } else if (len == 0) curr_filenmoff = addStr ( si, "?1\0" ); + break; } @@ -1071,8 +1114,8 @@ void read_symtab_callback ( which happen to correspond to the munmap()d area. */ void VG_(read_symbols) ( void ) { - if (! VG_(clo_instrument)) - return; + if (! VG_(clo_instrument) && ! VG_(clo_cachesim)) + return; VG_(read_procselfmaps) ( read_symtab_callback ); @@ -1251,9 +1294,8 @@ static void search_all_loctabs ( Addr ptr, SegInfo** psi, Int* locno ) Caller supplies buf and nbuf. If no_demangle is True, don't do demangling, regardless of vg_clo_demangle -- probably because the call has come from vg_what_fn_or_object_is_this. */ -static -Bool vg_what_fn_is_this ( Bool no_demangle, Addr a, - Char* buf, Int nbuf ) +Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a, + Char* buf, Int nbuf ) { SegInfo* si; Int sno; @@ -1297,17 +1339,16 @@ void VG_(what_obj_and_fun_is_this) ( Addr a, Char* fun_buf, Int n_fun_buf ) { (void)vg_what_object_is_this ( a, obj_buf, n_obj_buf ); - (void)vg_what_fn_is_this ( True, a, fun_buf, n_fun_buf ); + (void)VG_(what_fn_is_this) ( True, a, fun_buf, n_fun_buf ); } /* Map a code address to a (filename, line number) pair. Returns True if successful. 
*/ -static -Bool vg_what_line_is_this ( Addr a, - UChar* filename, Int n_filename, - UInt* lineno ) +Bool VG_(what_line_is_this)( Addr a, + UChar* filename, Int n_filename, + UInt* lineno ) { SegInfo* si; Int locno; @@ -1317,6 +1358,7 @@ Bool vg_what_line_is_this ( Addr a, VG_(strncpy_safely)(filename, & si->strtab[si->loctab[locno].fnmoff], n_filename); *lineno = si->loctab[locno].lineno; + return True; } @@ -1348,11 +1390,11 @@ void VG_(mini_stack_dump) ( ExeContext* ec ) n = 0; - know_fnname = vg_what_fn_is_this(False,ec->eips[0], buf_fn, M_VG_ERRTXT); + know_fnname = VG_(what_fn_is_this)(False,ec->eips[0], buf_fn, M_VG_ERRTXT); know_objname = vg_what_object_is_this(ec->eips[0], buf_obj, M_VG_ERRTXT); - know_srcloc = vg_what_line_is_this(ec->eips[0], - buf_srcloc, M_VG_ERRTXT, - &lineno); + know_srcloc = VG_(what_line_is_this)(ec->eips[0], + buf_srcloc, M_VG_ERRTXT, + &lineno); APPEND(" at "); VG_(sprintf)(ibuf,"0x%x: ", ec->eips[0]); @@ -1383,9 +1425,9 @@ void VG_(mini_stack_dump) ( ExeContext* ec ) clueless = 0; for (i = 1; i < stop_at; i++) { - know_fnname = vg_what_fn_is_this(False,ec->eips[i], buf_fn, M_VG_ERRTXT); + know_fnname = VG_(what_fn_is_this)(False,ec->eips[i], buf_fn, M_VG_ERRTXT); know_objname = vg_what_object_is_this(ec->eips[i],buf_obj, M_VG_ERRTXT); - know_srcloc = vg_what_line_is_this(ec->eips[i], + know_srcloc = VG_(what_line_is_this)(ec->eips[i], buf_srcloc, M_VG_ERRTXT, &lineno); n = 0; diff --git a/vg_to_ucode.c b/vg_to_ucode.c index f06689d956..691ef26d67 100644 --- a/vg_to_ucode.c +++ b/vg_to_ucode.c @@ -4437,43 +4437,76 @@ Int VG_(disBB) ( UCodeBlock* cb, Addr eip0 ) Addr eip = eip0; Bool isEnd = False; Bool block_sane; + Int INCEIP_allowed_lag = 4; + Int delta = 0; + if (dis) VG_(printf)("\n"); + /* When cache simulating, to ensure cache misses are attributed to the + * correct line we ensure EIP is always correct. 
This is done by: + * + * a) Using eager INCEIP updating to cope with all instructions except those + * at the end of a basic block. + * + * b) Patching in the size of the original x86 instr in the `extra4b' field + * of JMPs at the end of a basic block. Two cases: + * - Jcond followed by Juncond: patch the Jcond + * - Juncond alone: patch the Juncond + * + * See vg_cachesim_instrument() for how this is used. + */ + if (VG_(clo_cachesim)) { + INCEIP_allowed_lag = 0; + } + if (VG_(clo_single_step)) { eip = disInstr ( cb, eip, &isEnd ); - uInstr1(cb, JMP, 0, Literal, 0); - uLiteral(cb, eip); - uCond(cb, CondAlways); - if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); + + /* Add a JMP to the next (single x86 instruction) BB if it doesn't + * already end with a JMP instr. We also need to check for no UCode, + * which occurs if the x86 instr was a nop */ + if (cb->used == 0 || LAST_UINSTR(cb).opcode != JMP) { + uInstr1(cb, JMP, 0, Literal, 0); + uLiteral(cb, eip); + uCond(cb, CondAlways); + if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); + } + delta = eip - eip0; + } else { - Int delta = 0; Addr eip2; - while (True) { - if (isEnd) break; + while (!isEnd) { eip2 = disInstr ( cb, eip, &isEnd ); delta += (eip2 - eip); eip = eip2; - if (delta > 4 && !isEnd) { - uInstr1(cb, INCEIP, 0, Lit16, delta); - if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); - delta = 0; - } /* Split up giant basic blocks into pieces, so the translations fall within 64k. 
*/ - if (eip - eip0 > 2000) { + if (eip - eip0 > 2000 && !isEnd) { if (VG_(clo_verbosity) > 0) - VG_(message)(Vg_DebugMsg, + VG_(message)(Vg_DebugMsg, "Warning: splitting giant basic block into pieces"); uInstr1(cb, JMP, 0, Literal, 0); uLiteral(cb, eip); uCond(cb, CondAlways); if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); - if (dis) VG_(printf)("\n"); - break; + isEnd = True; + + } else if (delta > INCEIP_allowed_lag && !isEnd) { + uInstr1(cb, INCEIP, 0, Lit16, delta); + if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]); + delta = 0; } if (dis) VG_(printf)("\n"); } } + if (VG_(clo_cachesim)) { + /* Patch instruction size into earliest JMP. */ + if (cb->used >= 2 && JMP == cb->instrs[cb->used - 2].opcode) { + cb->instrs[cb->used - 2].extra4b = delta; + } else { + LAST_UINSTR(cb).extra4b = delta; + } + } block_sane = VG_(saneUCodeBlock)(cb); if (!block_sane) { diff --git a/vg_translate.c b/vg_translate.c index 73cad5e9bc..76c6ef8a9b 100644 --- a/vg_translate.c +++ b/vg_translate.c @@ -104,7 +104,7 @@ void VG_(jitfree) ( void* ptr ) /*--- Basics ---*/ /*------------------------------------------------------------*/ -static UCodeBlock* allocCodeBlock ( void ) +UCodeBlock* VG_(allocCodeBlock) ( void ) { UCodeBlock* cb = VG_(malloc)(VG_AR_PRIVATE, sizeof(UCodeBlock)); cb->used = cb->size = cb->nextTemp = 0; @@ -113,7 +113,7 @@ static UCodeBlock* allocCodeBlock ( void ) } -static void freeCodeBlock ( UCodeBlock* cb ) +void VG_(freeCodeBlock) ( UCodeBlock* cb ) { if (cb->instrs) VG_(free)(VG_AR_PRIVATE, cb->instrs); VG_(free)(VG_AR_PRIVATE, cb); @@ -239,17 +239,15 @@ void VG_(newUInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz ) ui->size = sz; } - /* Copy an instruction into the given codeblock. 
*/ -static __inline__ -void copyUInstr ( UCodeBlock* cb, UInstr* instr ) +__inline__ +void VG_(copyUInstr) ( UCodeBlock* cb, UInstr* instr ) { ensureUInstr(cb); cb->instrs[cb->used] = *instr; cb->used++; } - /* Copy auxiliary info from one uinstr to another. */ static __inline__ void copyAuxInfoFromTo ( UInstr* src, UInstr* dst ) @@ -1654,7 +1652,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 ) /* Resulting code goes here. We generate it all in a forwards pass. */ - c2 = allocCodeBlock(); + c2 = VG_(allocCodeBlock)(); /* At the start, no TempRegs are assigned to any real register. Correspondingly, all temps claim to be currently resident in @@ -1841,7 +1839,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 ) for (j = 0; j < k; j++) tempUse[j].realNo = VG_(rankToRealRegNo)(temp_info[tempUse[j].tempNo].real_no); - copyUInstr(c2, &c1->instrs[i]); + VG_(copyUInstr)(c2, &c1->instrs[i]); patchUInstr(&LAST_UINSTR(c2), &tempUse[0], k); if (VG_(disassemble)) { @@ -1853,7 +1851,7 @@ UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 ) if (temp_info != NULL) VG_(jitfree)(temp_info); - freeCodeBlock(c1); + VG_(freeCodeBlock)(c1); if (spill_reqd) VG_(translations_needing_spill)++; @@ -2099,7 +2097,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) Int i, j; UInstr* u_in; Int qs, qd, qt, qtt; - cb = allocCodeBlock(); + cb = VG_(allocCodeBlock)(); cb->nextTemp = cb_in->nextTemp; for (i = 0; i < cb_in->used; i++) { @@ -2115,7 +2113,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) break; case INCEIP: - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Loads and stores. Test the V bits for the address. 
24 @@ -2136,7 +2134,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) uInstr2(cb, LOADV, u_in->size, TempReg, u_in->val1, TempReg, SHADOW(u_in->val2)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case STORE: if (VG_(clo_check_addrVs)) { @@ -2146,7 +2144,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) uInstr2(cb, STOREV, u_in->size, TempReg, SHADOW(u_in->val1), TempReg, u_in->val2); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Moving stuff around. Make the V bits follow accordingly, @@ -2156,24 +2154,24 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) uInstr2(cb, GETV, u_in->size, ArchReg, u_in->val1, TempReg, SHADOW(u_in->val2)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case PUT: uInstr2(cb, PUTV, u_in->size, TempReg, SHADOW(u_in->val1), ArchReg, u_in->val2); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case GETF: /* This is not the smartest way to do it, but should work. */ qd = create_GETVF(cb, u_in->size); uInstr2(cb, MOV, 4, TempReg, qd, TempReg, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case PUTF: create_PUTVF(cb, u_in->size, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case MOV: @@ -2190,7 +2188,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) default: VG_(panic)("vg_instrument: MOV"); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Special case of add, where one of the operands is a literal. @@ -2203,7 +2201,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) qd = SHADOW(u_in->val2); uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qd); create_Left(cb, u_in->size, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Another form of add. 
@@ -2234,7 +2232,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) } create_UifU(cb, 4, qs, qd); create_Left(cb, u_in->size, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; } @@ -2244,7 +2242,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) create_Left(cb, u_in->size, qd); if (u_in->flags_w != FlagsEmpty) create_PUTVF(cb, u_in->size, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* This is a HACK (approximation :-) */ @@ -2279,7 +2277,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) create_UifU(cb, 0, qs, qd); create_PUTVF(cb, 0, qd); create_PCast(cb, 0, u_in->size, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* for OP in shl shr sar rol ror @@ -2329,7 +2327,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qt); create_PCast(cb, 1, u_in->size, qt); create_UifU(cb, u_in->size, qt, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; } @@ -2338,20 +2336,20 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) vg_assert(u_in->tag1 == TempReg); create_Widen(cb, u_in->signed_widen, u_in->extra4b, u_in->size, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* not#(x) = x (since bitwise independent) */ case NOT: vg_assert(u_in->tag1 == TempReg); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* neg#(x) = left(x) (derivable from case for SUB) */ case NEG: vg_assert(u_in->tag1 == TempReg); create_Left(cb, u_in->size, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* bswap#(x) = bswap(x) */ @@ -2360,7 +2358,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) vg_assert(u_in->size == 4); qd = SHADOW(u_in->val1); uInstr1(cb, BSWAP, 4, TempReg, qd); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* cc2val#(qd) = pcast-0-to-size(eflags#) */ @@ -2369,7 +2367,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) 
vg_assert(u_in->flags_r != FlagsEmpty); qt = create_GETVF(cb, u_in->size); uInstr2(cb, MOV, 4, TempReg, qt, TempReg, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* cmov#(qs,qd) = cmov(qs,qd) @@ -2395,7 +2393,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) LAST_UINSTR(cb).cond = u_in->cond; LAST_UINSTR(cb).flags_r = u_in->flags_r; - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* add#/sub#(qs,qd) @@ -2421,7 +2419,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) if (u_in->flags_w != FlagsEmpty) { create_PUTVF(cb, u_in->size, qd); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* xor#(qs,qd) = qs `UifU` qd */ @@ -2432,7 +2430,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) if (u_in->flags_w != FlagsEmpty) { create_PUTVF(cb, u_in->size, qd); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* and#/or#(qs,qd) @@ -2476,7 +2474,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) if (u_in->flags_w != FlagsEmpty) { create_PUTVF(cb, u_in->size, qd); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Machinery to do with supporting CALLM. Copy the start and @@ -2484,7 +2482,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) (debug); they generate no code and have no effect. */ case CALLM_S: case CALLM_E: - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Copy PUSH and POP verbatim. Arg/result absval @@ -2493,7 +2491,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) needs to be copied. */ case PUSH: case POP: case CLEAR: - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* In short: @@ -2562,7 +2560,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) ... */ uInstr1(cb, SETV, 0, TempReg, qt); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; } /* Whew ... */ @@ -2582,13 +2580,13 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) ... 
*/ uInstr1(cb, SETV, 0, TempReg, qt); } - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; case JIFZ: uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1)); uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val1)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* Emit a check on the address used. For FPU_R, the value @@ -2598,12 +2596,12 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) vg_assert(u_in->tag2 == TempReg); uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val2)); uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val2)); - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; /* For FPU insns not referencing memory, just copy thru. */ case FPU: - copyUInstr(cb, u_in); + VG_(copyUInstr)(cb, u_in); break; default: @@ -2614,7 +2612,7 @@ static UCodeBlock* vg_instrument ( UCodeBlock* cb_in ) } /* end of for loop */ - freeCodeBlock(cb_in); + VG_(freeCodeBlock)(cb_in); return cb; } @@ -3083,7 +3081,7 @@ void VG_(translate) ( ThreadState* tst, VG_(overall_in_osize), VG_(overall_in_tsize), orig_addr ); } - cb = allocCodeBlock(); + cb = VG_(allocCodeBlock)(); /* Disassemble this basic block into cb. */ VGP_PUSHCC(VgpToUCode); @@ -3117,6 +3115,19 @@ void VG_(translate) ( ThreadState* tst, } } + //VG_(disassemble) = True; + + /* Add cache simulation code. */ + if (VG_(clo_cachesim)) { + VGP_PUSHCC(VgpCacheInstrument); + cb = VG_(cachesim_instrument)(cb, orig_addr); + VGP_POPCC; + if (VG_(disassemble)) + VG_(ppUCodeBlock) ( cb, "Cachesim instrumented code:" ); + } + + //VG_(disassemble) = False; + /* Allocate registers. */ VGP_PUSHCC(VgpRegAlloc); cb = vg_do_register_allocation ( cb ); @@ -3132,7 +3143,7 @@ void VG_(translate) ( ThreadState* tst, and so must be VG_(jitfree)'d. */ final_code = VG_(emit_code)(cb, &final_code_size ); VGP_POPCC; - freeCodeBlock(cb); + VG_(freeCodeBlock)(cb); if (debugging_translation) { /* Only done for debugging -- throw away final result. */ -- 2.47.2