From: no author <no_author@no_author>
Date: Fri, 26 Jul 2002 11:34:39 +0000 (+0000)
Subject: This commit was manufactured by cvs2svn to create branch
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5b579c5e34d0ee1f35a88898dbb1c307aeb73829;p=thirdparty%2Fvalgrind.git

This commit was manufactured by cvs2svn to create branch
'VALGRIND_1_0_BRANCH'.

git-svn-id: svn://svn.valgrind.org/valgrind/branches/VALGRIND_1_0_BRANCH@543
---

diff --git a/addrcheck/Makefile.am b/addrcheck/Makefile.am
deleted file mode 100644
index 60553ddac6..0000000000
--- a/addrcheck/Makefile.am
+++ /dev/null
@@ -1,110 +0,0 @@
-SUBDIRS = demangle . docs tests
-
-# Flags for every object built in this directory.  VG_LIBDIR reaches the C
-# code as a string literal, so the inner quotes are escaped for the shell.
-CFLAGS = $(WERROR) -DVG_LIBDIR="\"$(libdir)\"" \
-		-Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-
-# Private directory for the shared objects and the suppression files.
-valdir = $(libdir)/valgrind
-
-LDFLAGS = -Wl,-z -Wl,initfirst
-
-INCLUDES = -I$(srcdir)/demangle
-
-bin_SCRIPTS = valgrind cachegrind vg_annotate
-
-SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
-
-val_DATA = $(SUPP_FILES) default.supp
-
-BUILT_SOURCES = default.supp
-
-# No recipe is attached in this file; the rule only declares the dependency.
-default.supp: $(SUPP_FILES)
-
-# bzdist names a command, not a file: declare it phony so that a stray file
-# called "bzdist" cannot make the target look up to date.
-.PHONY: bzdist
-
-# Re-compress the gzip tarball produced by "dist" as a .tar.bz2.
-bzdist: dist
-	gunzip -c $(PACKAGE)-$(VERSION).tar.gz | bzip2 > $(PACKAGE)-$(VERSION).tar.bz2
-
-EXTRA_DIST = $(val_DATA) \
-	PATCHES_APPLIED ACKNOWLEDGEMENTS \
-	README_KDE3_FOLKS README_PACKAGERS \
-	README_MISSING_SYSCALL_OR_IOCTL TODO dosyms vg_libpthread.vs \
-	valgrind.spec valgrind.spec.in
-
-# The shared objects are declared as "programs" under the val_ prefix; the
-# rules that actually link them are written by hand further down.
-val_PROGRAMS = valgrind.so valgrinq.so libpthread.so
-
-libpthread_so_SOURCES = vg_libpthread.c vg_libpthread_unimp.c
-
-valgrinq_so_SOURCES = vg_valgrinq_dummy.c
-
-# Sources (C and assembler) that make up valgrind.so.
-valgrind_so_SOURCES = \
-	vg_clientfuncs.c \
-	vg_scheduler.c \
-        vg_cachesim.c \
-	vg_clientmalloc.c \
-	vg_clientperms.c \
-	vg_demangle.c \
-	vg_dispatch.S \
-	vg_errcontext.c \
-	vg_execontext.c \
-	vg_from_ucode.c \
-	vg_helpers.S \
-	vg_main.c \
-	vg_malloc2.c \
-	vg_memory.c \
-	vg_messages.c \
-	vg_mylibc.c \
-	vg_procselfmaps.c \
-	vg_profile.c \
-	vg_signals.c \
-	vg_startup.S \
-	vg_symtab2.c \
-	vg_syscall_mem.c \
-	vg_syscall.S \
-	vg_to_ucode.c \
-	vg_translate.c \
-	vg_transtab.c \
-	vg_vtagops.c
-
-# Demangler objects from the demangle/ subdirectory that are linked into
-# valgrind.so.  No rule in this file builds them.
-valgrind_so_LDADD = \
-	demangle/cp-demangle.o \
-	demangle/cplus-dem.o \
-	demangle/dyn-string.o \
-	demangle/safe-ctype.o
-
-include_HEADERS = valgrind.h
-
-# NOTE(review): the vg_cachesim_*.c entries are listed with the headers,
-# presumably because they are #included rather than compiled on their own --
-# confirm against vg_cachesim.c.
-noinst_HEADERS = \
-        vg_cachesim_gen.c       \
-        vg_cachesim_I1.c        \
-        vg_cachesim_D1.c        \
-        vg_cachesim_L2.c        \
-        vg_kerneliface.h        \
-        vg_include.h            \
-        vg_constants.h          \
-        vg_unsafe.h
-
-# Every header, used as a blanket prerequisite by the hand-written compile
-# rules below.
-MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) 
-
-# These objects need non-default flags, so their compile rules are spelled
-# out by hand and depend on every header in MANUAL_DEPS.
-vg_memory.o: vg_memory.c $(MANUAL_DEPS)
-	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
-
-vg_clientfuncs.o: vg_clientfuncs.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-vg_libpthread.o: vg_libpthread.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-# The demangler objects named in valgrind_so_LDADD go into the link, so they
-# are prerequisites as well: when one of them is rebuilt, valgrind.so must be
-# relinked.
-valgrind.so$(EXEEXT): $(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o valgrind.so \
-		$(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-
-valgrinq.so$(EXEEXT): $(valgrinq_so_OBJECTS)
-	$(CC) $(CFLAGS) -shared -o valgrinq.so $(valgrinq_so_OBJECTS)
-
-# libpthread.so is linked with its own fixed flags and the symbol version
-# script, which is therefore a prerequisite too.
-libpthread.so$(EXEEXT): $(libpthread_so_OBJECTS) $(srcdir)/vg_libpthread.vs
-	$(CC) -Wall -Werror -g -O -shared -fpic -o libpthread.so \
-		$(libpthread_so_OBJECTS) \
-		-Wl,-version-script $(srcdir)/vg_libpthread.vs
-
-# After installation, (re)create the libpthread.so.0 symlink next to the
-# installed libpthread.so.
-install-exec-hook:
-	$(mkinstalldirs) $(DESTDIR)$(valdir)
-	rm -f $(DESTDIR)$(valdir)/libpthread.so.0
-	$(LN_S) libpthread.so $(DESTDIR)$(valdir)/libpthread.so.0
diff --git a/cachegrind/Makefile.am b/cachegrind/Makefile.am
deleted file mode 100644
index 60553ddac6..0000000000
--- a/cachegrind/Makefile.am
+++ /dev/null
@@ -1,110 +0,0 @@
-SUBDIRS = demangle . docs tests
-
-# Flags for every object built in this directory.  VG_LIBDIR reaches the C
-# code as a string literal, so the inner quotes are escaped for the shell.
-CFLAGS = $(WERROR) -DVG_LIBDIR="\"$(libdir)\"" \
-		-Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-
-# Private directory for the shared objects and the suppression files.
-valdir = $(libdir)/valgrind
-
-LDFLAGS = -Wl,-z -Wl,initfirst
-
-INCLUDES = -I$(srcdir)/demangle
-
-bin_SCRIPTS = valgrind cachegrind vg_annotate
-
-SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
-
-val_DATA = $(SUPP_FILES) default.supp
-
-BUILT_SOURCES = default.supp
-
-# No recipe is attached in this file; the rule only declares the dependency.
-default.supp: $(SUPP_FILES)
-
-# bzdist names a command, not a file: declare it phony so that a stray file
-# called "bzdist" cannot make the target look up to date.
-.PHONY: bzdist
-
-# Re-compress the gzip tarball produced by "dist" as a .tar.bz2.
-bzdist: dist
-	gunzip -c $(PACKAGE)-$(VERSION).tar.gz | bzip2 > $(PACKAGE)-$(VERSION).tar.bz2
-
-EXTRA_DIST = $(val_DATA) \
-	PATCHES_APPLIED ACKNOWLEDGEMENTS \
-	README_KDE3_FOLKS README_PACKAGERS \
-	README_MISSING_SYSCALL_OR_IOCTL TODO dosyms vg_libpthread.vs \
-	valgrind.spec valgrind.spec.in
-
-# The shared objects are declared as "programs" under the val_ prefix; the
-# rules that actually link them are written by hand further down.
-val_PROGRAMS = valgrind.so valgrinq.so libpthread.so
-
-libpthread_so_SOURCES = vg_libpthread.c vg_libpthread_unimp.c
-
-valgrinq_so_SOURCES = vg_valgrinq_dummy.c
-
-# Sources (C and assembler) that make up valgrind.so.
-valgrind_so_SOURCES = \
-	vg_clientfuncs.c \
-	vg_scheduler.c \
-        vg_cachesim.c \
-	vg_clientmalloc.c \
-	vg_clientperms.c \
-	vg_demangle.c \
-	vg_dispatch.S \
-	vg_errcontext.c \
-	vg_execontext.c \
-	vg_from_ucode.c \
-	vg_helpers.S \
-	vg_main.c \
-	vg_malloc2.c \
-	vg_memory.c \
-	vg_messages.c \
-	vg_mylibc.c \
-	vg_procselfmaps.c \
-	vg_profile.c \
-	vg_signals.c \
-	vg_startup.S \
-	vg_symtab2.c \
-	vg_syscall_mem.c \
-	vg_syscall.S \
-	vg_to_ucode.c \
-	vg_translate.c \
-	vg_transtab.c \
-	vg_vtagops.c
-
-# Demangler objects from the demangle/ subdirectory that are linked into
-# valgrind.so.  No rule in this file builds them.
-valgrind_so_LDADD = \
-	demangle/cp-demangle.o \
-	demangle/cplus-dem.o \
-	demangle/dyn-string.o \
-	demangle/safe-ctype.o
-
-include_HEADERS = valgrind.h
-
-# NOTE(review): the vg_cachesim_*.c entries are listed with the headers,
-# presumably because they are #included rather than compiled on their own --
-# confirm against vg_cachesim.c.
-noinst_HEADERS = \
-        vg_cachesim_gen.c       \
-        vg_cachesim_I1.c        \
-        vg_cachesim_D1.c        \
-        vg_cachesim_L2.c        \
-        vg_kerneliface.h        \
-        vg_include.h            \
-        vg_constants.h          \
-        vg_unsafe.h
-
-# Every header, used as a blanket prerequisite by the hand-written compile
-# rules below.
-MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) 
-
-# These objects need non-default flags, so their compile rules are spelled
-# out by hand and depend on every header in MANUAL_DEPS.
-vg_memory.o: vg_memory.c $(MANUAL_DEPS)
-	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
-
-vg_clientfuncs.o: vg_clientfuncs.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-vg_libpthread.o: vg_libpthread.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-# The demangler objects named in valgrind_so_LDADD go into the link, so they
-# are prerequisites as well: when one of them is rebuilt, valgrind.so must be
-# relinked.
-valgrind.so$(EXEEXT): $(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o valgrind.so \
-		$(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-
-valgrinq.so$(EXEEXT): $(valgrinq_so_OBJECTS)
-	$(CC) $(CFLAGS) -shared -o valgrinq.so $(valgrinq_so_OBJECTS)
-
-# libpthread.so is linked with its own fixed flags and the symbol version
-# script, which is therefore a prerequisite too.
-libpthread.so$(EXEEXT): $(libpthread_so_OBJECTS) $(srcdir)/vg_libpthread.vs
-	$(CC) -Wall -Werror -g -O -shared -fpic -o libpthread.so \
-		$(libpthread_so_OBJECTS) \
-		-Wl,-version-script $(srcdir)/vg_libpthread.vs
-
-# After installation, (re)create the libpthread.so.0 symlink next to the
-# installed libpthread.so.
-install-exec-hook:
-	$(mkinstalldirs) $(DESTDIR)$(valdir)
-	rm -f $(DESTDIR)$(valdir)/libpthread.so.0
-	$(LN_S) libpthread.so $(DESTDIR)$(valdir)/libpthread.so.0
diff --git a/cachegrind/cg_annotate.in b/cachegrind/cg_annotate.in
deleted file mode 100644
index 11821901c2..0000000000
--- a/cachegrind/cg_annotate.in
+++ /dev/null
@@ -1,893 +0,0 @@
-#! /usr/bin/perl -w
-##--------------------------------------------------------------------##
-##--- The cache simulation framework: instrumentation, recording   ---##
-##--- and results printing.                                        ---##
-##---                                                  vg_annotate ---##
-##--------------------------------------------------------------------##
-
-#  This file is part of Valgrind, an x86 protected-mode emulator 
-#  designed for debugging and profiling binaries on x86-Unixes.
-#
-#  Copyright (C) 2002 Nicholas Nethercote
-#     njn25@cam.ac.uk
-#
-#  This program is free software; you can redistribute it and/or
-#  modify it under the terms of the GNU General Public License as
-#  published by the Free Software Foundation; either version 2 of the
-#  License, or (at your option) any later version.
-#
-#  This program is distributed in the hope that it will be useful, but
-#  WITHOUT ANY WARRANTY; without even the implied warranty of
-#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-#  General Public License for more details.
-#
-#  You should have received a copy of the GNU General Public License
-#  along with this program; if not, write to the Free Software
-#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-#  02111-1307, USA.
-#
-#  The GNU General Public License is contained in the file LICENSE.
-
-#----------------------------------------------------------------------------
-# Annotator for cachegrind. 
-#
-# File format is described in /docs/techdocs.html.
-#
-# Performance improvements record, using cachegrind.out for cacheprof, doing no
-# source annotation (irrelevant ones removed):
-#                                                               user time
-# 1. turned off warnings in add_hash_a_to_b()                   3.81 --> 3.48s
-#    [now add_array_a_to_b()]
-# 6. make line_to_CC() return a ref instead of a hash           3.01 --> 2.77s
-#
-#10. changed file format to avoid file/fn name repetition       2.40s
-#    (not sure why higher;  maybe due to new '.' entries?)
-#11. changed file format to drop unnecessary end-line "."s      2.36s
-#    (shrunk file by about 37%)
-#12. switched from hash CCs to array CCs                        1.61s
-#13. only adding b[i] to a[i] if b[i] defined (was doing it if
-#    either a[i] or b[i] was defined, but if b[i] was undefined
-#    it just added 0)                                           1.48s
-#14. Stopped converting "." entries to undef and then back      1.16s
-#15. Using foreach $i (x..y) instead of for ($i = 0...) in
-#    add_array_a_to_b()                                         1.11s
-#
-# Auto-annotating primes:
-#16. Finding count lengths by int((length-1)/3), not by
-#    commifying (halves the number of commify calls)            1.68s --> 1.47s
-
-use strict;
-
-#----------------------------------------------------------------------------
-# Overview: the running example in the comments is for:
-#   - events = A,B,C,D
-#   - --show=C,A,D
-#   - --sort=D,C
-#----------------------------------------------------------------------------
-
-#----------------------------------------------------------------------------
-# Global variables, main data structures
-#----------------------------------------------------------------------------
-# CCs are arrays, the counts corresponding to @events, with 'undef'
-# representing '.'.  This makes things fast (faster than using hashes for CCs)
-# but we have to use @sort_order and @show_order below to handle the --sort and
-# --show options, which is a bit tricky.
-#----------------------------------------------------------------------------
-
-# Total counts for summary (an array reference).
-my $summary_CC;
-
-# Totals for each function, for overall summary.
-# hash(filename:fn_name => CC array)
-my %fn_totals;
-
-# Individual CCs, organised by filename and line_num for easy annotation.
-# hash(filename => hash(line_num => CC array))
-my %all_ind_CCs;
-
-# Files chosen for annotation on the command line.
-# key = filename exactly as given on the command line, value = 1 (ie. a set)
-my %user_ann_files;
-
-# Generic description string.
-my $desc = "";
-
-# Command line of profiled program.
-my $cmd;
-
-# Events in input file, eg. (A,B,C,D)
-my @events;
-
-# Events to show, from command line, eg. (C,A,D)
-my @show_events;
-
-# Map from @show_events indices to @events indices, eg. (2,0,3).  Gives the
-# order in which we must traverse @events in order to show the @show_events,
-# eg. ($events[$show_order[0]], $events[$show_order[1]], ...) = @show_events.
-# (Might help to think of it like a hash (0 => 2, 1 => 0, 2 => 3).)
-my @show_order;
-
-# Print out the function totals sorted by these events, eg. (D,C).
-my @sort_events;
-
-# Map from @sort_events indices to @events indices, eg. (3,2).  Same idea as
-# for @show_order.
-my @sort_order;
-
-# Thresholds, one for each sort event (or default to 1 if no sort events
-# specified).  We print out functions and do auto-annotations until we've
-# handled this proportion of all the events thresholded.
-my @thresholds;
-
-my $default_threshold = 99;
-
-my $single_threshold  = $default_threshold;
-
-# If on, automatically annotates all files that are involved in getting over
-# all the threshold counts.
-my $auto_annotate = 0;
-
-# Number of lines to show around each annotated line.
-my $context = 8;
-
-# Directories in which to look for annotation files.
-my @include_dirs = ("");
-
-# Input file name
-my $input_file = "cachegrind.out";
-
-# Version number
-my $version = "@VERSION@";
-
-# Usage message.
-my $usage = <<END
-usage: vg_annotate [options] [source-files]
-
-  options for the user, with defaults in [ ], are:
-    -h --help             show this message
-    -v --version          show version
-    --show=A,B,C          only show figures for events A,B,C [all]
-    --sort=A,B,C          sort columns by events A,B,C [event column order]
-    --threshold=<0--100>  percentage of counts (of primary sort event) we
-                          are interested in [$default_threshold%]
-    --auto=yes|no         annotate all source files containing functions
-                          that helped reach the event count threshold [no]
-    --context=N           print N lines of context before and after
-                          annotated lines [8]
-    -I --include=<dir>    add <dir> to list of directories to search for 
-                          source files
-
-  Valgrind is Copyright (C) 2000-2002 Julian Seward
-  and licensed under the GNU General Public License, version 2.
-  Bug reports, feedback, admiration, abuse, etc, to: jseward\@acm.org.
-
-END
-;
-
-# Used in various places of output.
-my $fancy = '-' x 80 . "\n";
-
-#-----------------------------------------------------------------------------
-# Argument and option handling
-#-----------------------------------------------------------------------------
-# Parse @ARGV.  Arguments starting with '-' are options and update the
-# global settings: @show_events, @sort_events and @thresholds,
-# $single_threshold, $auto_annotate, $context and @include_dirs.  Every other
-# argument is a source file to annotate; it is recorded in %user_ann_files
-# once it has been found readable in one of the include dirs seen so far.
-# -v/--version dies with the version string; -h/--help, unknown options and
-# out-of-range values die with the usage message.
-sub process_cmd_line()
-{
-    for my $arg (@ARGV) {
-
-        # Option handling
-        if ($arg =~ /^-/) {
-
-            # --version
-            if ($arg =~ /^-v$|^--version$/) {
-                die("vg_annotate-$version\n");
-
-            # --show=A,B,C
-            } elsif ($arg =~ /^--show=(.*)$/) {
-                @show_events = split(/,/, $1);
-
-            # --sort=A,B,C  (each event may carry its own ":N" threshold,
-            # with an optional trailing '%')
-            } elsif ($arg =~ /^--sort=(.*)$/) {
-                @sort_events = split(/,/, $1);
-                my $th_specified = 0;
-                foreach my $i (0 .. $#sort_events) {
-                    if ($sort_events[$i] =~ /.*:([\d\.]+)%?$/) {
-                        my $th = $1;
-                        ($th >= 0 && $th <= 100) or die($usage);
-                        $sort_events[$i] =~ s/:.*//;
-                        $thresholds[$i] = $th;
-                        $th_specified = 1;
-                    } else {
-                        $thresholds[$i] = 0;
-                    }
-                }
-                # With no ":N" given anywhere, leave @thresholds empty so
-                # that read_input_file() applies --threshold (or the default)
-                # to the primary sort event.  An all-zero list would count as
-                # "reached" before the first function had been printed.
-                @thresholds = () unless $th_specified;
-
-            # --threshold=X (tolerates a trailing '%')
-            } elsif ($arg =~ /^--threshold=([\d\.]+)%?$/) {
-                $single_threshold = $1;
-                ($1 >= 0 && $1 <= 100) or die($usage);
-
-            # --auto=yes|no
-            } elsif ($arg =~ /^--auto=(yes|no)$/) {
-                $auto_annotate = 1 if ($1 eq "yes");
-                $auto_annotate = 0 if ($1 eq "no");
-
-            # --context=N
-            } elsif ($arg =~ /^--context=([\d\.]+)$/) {
-                $context = $1;
-                if ($context < 0) {
-                    die($usage);
-                }
-
-            # -I=<dir> or --include=<dir>
-            } elsif ($arg =~ /^(-I|--include)=(.*)$/) {
-                my $inc = $2;
-                $inc =~ s|/$||;         # trim trailing '/'
-                push(@include_dirs, "$inc/");
-
-            } else {            # -h and --help fall under this case
-                die($usage);
-            }
-
-        # Argument handling -- annotation file checking and selection.
-        # Stick filenames into a hash for quick 'n easy lookup throughout
-        } else {
-            my $readable = 0;
-            foreach my $include_dir (@include_dirs) {
-                if (-r $include_dir . $arg) {
-                    $readable = 1;
-                    last;       # one readable copy is enough
-                }
-            }
-            $readable or die("File $arg not found in any of: @include_dirs\n");
-            $user_ann_files{$arg} = 1;
-        }
-    }
-}
-
-#-----------------------------------------------------------------------------
-# Reading of input file
-#-----------------------------------------------------------------------------
-# Return the numerically larger of two values (the second one on a tie).
-sub max ($$)
-{
-    my ($first, $second) = @_;
-    return $first if ($first > $second);
-    return $second;
-}
-
-# Add the counts in array a1 into array a2, element by element.  Notes:
-# 1. Entries of a1 that are undefined or "." are skipped, even though their
-#    value is 0, so that an undefined a2 entry does not needlessly become 0.
-# 2. An undefined a2 entry that does receive a count starts from 0; warnings
-#    are switched off around the loop to allow this.
-# 3. Only the indices of a1 are visited: anything beyond its end is undefined
-#    and would be skipped anyway.
-sub add_array_a_to_b ($$)
-{
-    my ($a1, $a2) = @_;
-
-    # 'local' puts the caller's warning setting back on every exit path,
-    # instead of forcing warnings on afterwards.
-    local $^W = 0;
-    foreach my $i (0 .. $#$a1) {
-        $a2->[$i] += $a1->[$i] if (defined $a1->[$i] && "." ne $a1->[$i]);
-    }
-}
-
-# Split a line of whitespace-separated event counts into a CC array and
-# return a reference to it.  "." entries are kept as they are
-# (add_array_a_to_b() skips them).  Dies if the line holds more counts than
-# there are entries in @events.
-sub line_to_CC ($)
-{
-    my ($counts) = @_;
-    my @CC = split(/\s+/, $counts);
-    die("Line $.: too many event counts\n") if (@CC > @events);
-    return \@CC;
-}
-
-# Read $input_file: the "desc:" lines, the "cmd:" line and the "events:"
-# line, followed by the body of per-line counts.  Fills in $desc, $cmd,
-# @events, @show_events/@show_order, @sort_events/@sort_order, @thresholds,
-# %fn_totals, %all_ind_CCs and $summary_CC.  Dies if the file cannot be
-# opened, if the "cmd:" or "events:" line is missing, if a --show/--sort
-# event is not in the file, or on the format errors checked in the body loop.
-sub read_input_file() 
-{
-    open(INPUTFILE, "< $input_file") || die "File $input_file not opened\n";
-
-    # Read "desc:" lines.
-    my $line;
-    # This gives a "uninitialized value in substitution (s///)" warning; hmm...
-    #while ($line = <INPUTFILE> && $line =~ s/desc:\s+//) {
-    #    $desc .= "$line\n";
-    #}
-    while (1) {
-        $line = <INPUTFILE>;
-        if ($line =~ s/desc:\s+//) {
-            $desc .= $line;
-        } else {
-            last;
-        }
-    }
-
-    # Read "cmd:" line (Nb: will already be in $line from "desc:" loop above).
-    ($line =~ s/cmd:\s+//) or die("Line $.: missing command line\n");
-    $cmd = $line;
-    chomp($cmd);    # Remove newline
-
-    # Read "events:" line.  We make a temporary hash in which the Nth event's
-    # value is N, which is useful for handling --show/--sort options below.
-    $line = <INPUTFILE>;
-    ($line =~ s/events:\s+//) or die("Line $.: missing events line\n");
-    @events = split(/\s+/, $line);
-    my %events;
-    my $n = 0;
-    foreach my $event (@events) {
-        $events{$event} = $n;
-        $n++
-    }
-
-    # If no --show arg given, default to showing all events in the file.
-    # If --show option is used, check all specified events appeared in the
-    # "events:" line.  Then initialise @show_order.
-    if (@show_events) {
-        foreach my $show_event (@show_events) {
-            (defined $events{$show_event}) or 
-                die("--show event `$show_event' did not appear in input\n");
-        }
-    } else {
-        @show_events = @events;
-    }
-    foreach my $show_event (@show_events) {
-        push(@show_order, $events{$show_event});
-    }
-
-    # Do as for --show, but if no --sort arg given, default to sorting by
-    # column order (ie. first column event is primary sort key, 2nd column is
-    # 2ndary key, etc).
-    if (@sort_events) {
-        foreach my $sort_event (@sort_events) {
-            (defined $events{$sort_event}) or 
-                die("--sort event `$sort_event' did not appear in input\n");
-        }
-    } else {
-        @sort_events = @events;
-    }
-    foreach my $sort_event (@sort_events) {
-        push(@sort_order, $events{$sort_event});
-    }
-
-    # If multiple threshold args weren't given via --sort, stick in the single
-    # threshold (either from --threshold if used, or the default otherwise) for
-    # the primary sort event, and 0% for the rest.
-    if (not @thresholds) {
-        foreach my $e (@sort_order) {
-            push(@thresholds, 0);
-        }
-        $thresholds[0] = $single_threshold;
-    }
-
-    # Parser state: the file and function the following count lines belong
-    # to, and the "file:function" key used in %fn_totals.
-    my $curr_file;
-    my $curr_fn;
-    my $curr_name;
-
-    my $curr_fn_CC = [];
-    my $curr_file_ind_CCs = {};     # hash(line_num => CC)
-
-    # Read body of input file.
-    while (<INPUTFILE>) {
-        s/#.*$//;   # remove comments
-        if (s/^(\d+)\s+//) {
-            # Count line: "<line_num> <count> <count> ..."
-            my $line_num = $1;
-            my $CC = line_to_CC($_);
-            add_array_a_to_b($CC, $curr_fn_CC);
-            
-            # If curr_file is selected, add CC to curr_file list.  We look for
-            # full filename matches;  or, if auto-annotating, we have to
-            # remember everything -- we won't know until the end what's needed.
-            if ($auto_annotate || defined $user_ann_files{$curr_file}) {
-                my $tmp = $curr_file_ind_CCs->{$line_num};
-                $tmp = [] unless defined $tmp;
-                add_array_a_to_b($CC, $tmp);
-                $curr_file_ind_CCs->{$line_num} = $tmp;
-            }
-
-        } elsif (s/^fn=(.*)$//) {
-            # Commit result from previous function
-            $fn_totals{$curr_name} = $curr_fn_CC if (defined $curr_name);
-
-            # Setup new one
-            $curr_fn = $1;
-            $curr_name = "$curr_file:$curr_fn";
-            $curr_fn_CC = $fn_totals{$curr_name};
-            $curr_fn_CC = [] unless (defined $curr_fn_CC);
-
-        } elsif (s/^fl=(.*)$//) {
-            # New file: store the per-line CCs of the previous one, then pick
-            # up (or start) the table for the new one.
-            $all_ind_CCs{$curr_file} = $curr_file_ind_CCs 
-                if (defined $curr_file);
-
-            $curr_file = $1;
-            $curr_file_ind_CCs = $all_ind_CCs{$curr_file};
-            $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs);
-
-        } elsif (s/^(fi|fe)=(.*)$//) {
-            # fi=/fe= change the current file while keeping the current
-            # function: store the running totals under the old names, then
-            # load (or start) those for the new file and file:function pair.
-            (defined $curr_name) or die("Line $.: Unexpected fi/fe line\n");
-            $fn_totals{$curr_name} = $curr_fn_CC;
-            $all_ind_CCs{$curr_file} = $curr_file_ind_CCs;
-
-            $curr_file = $2;
-            $curr_name = "$curr_file:$curr_fn";
-            $curr_file_ind_CCs = $all_ind_CCs{$curr_file};
-            $curr_file_ind_CCs = {} unless (defined $curr_file_ind_CCs);
-            $curr_fn_CC = $fn_totals{$curr_name};
-            $curr_fn_CC = [] unless (defined $curr_fn_CC);
-
-        } elsif (s/^\s*$//) {
-            # blank, do nothing
-        
-        } elsif (s/^summary:\s+//) {
-            # Finish up handling final filename/fn_name counts
-            $fn_totals{"$curr_file:$curr_fn"} = $curr_fn_CC 
-                if (defined $curr_file && defined $curr_fn);
-            $all_ind_CCs{$curr_file} = 
-                $curr_file_ind_CCs if (defined $curr_file);
-
-            $summary_CC = line_to_CC($_);
-            (scalar(@$summary_CC) == @events) 
-                or die("Line $.: summary event and total event mismatch\n");
-
-        } else {
-            warn("WARNING: line $. malformed, ignoring\n");
-        }
-    }
-
-    # Check if summary line was present
-    if (not defined $summary_CC) {
-        warn("WARNING: missing final summary line, no summary will be printed\n");
-    }
-
-    close(INPUTFILE);
-}
-
-#-----------------------------------------------------------------------------
-# Print options used
-#-----------------------------------------------------------------------------
-# Print the separator, the description read from the input file, the
-# profiled command and the options in effect for this run.
-sub print_options ()
-{
-    print($fancy);
-    print($desc);
-    print("Command:          $cmd\n");
-    print("Events recorded:  @events\n");
-    print("Events shown:     @show_events\n");
-    print("Event sort order: @sort_events\n");
-    print("Thresholds:       @thresholds\n");
-
-    # Include dirs, minus the "" entry that is always first.  The first one
-    # shares a line with the label (blank if there are none); the rest are
-    # lined up underneath.
-    my @dirs = @include_dirs[1 .. $#include_dirs];
-    push(@dirs, "") unless (@dirs);
-    my $first_dir = shift(@dirs);
-    print("Include dirs:     $first_dir\n");
-    print("                  $_\n") foreach (@dirs);
-
-    # Same layout for the files named on the command line.
-    my @files = keys %user_ann_files;
-    push(@files, "") unless (@files);
-    my $first_file = shift(@files);
-    print("User annotated:   $first_file\n");
-    print("                  $_\n") foreach (@files);
-
-    if ($auto_annotate) {
-        print("Auto-annotation:  on\n");
-    } else {
-        print("Auto-annotation:  off\n");
-    }
-    print("\n");
-}
-
-#-----------------------------------------------------------------------------
-# Print summary and sorted function totals
-#-----------------------------------------------------------------------------
-# Comparison routine that orders two CCs with the bigger counts first.  The
-# events are compared in @sort_order sequence (eg. 3,2) and the first
-# difference decides; an undefined count is treated as -1.
-sub mycmp ($$)
-{
-    my ($left, $right) = @_;
-
-    foreach my $idx (@sort_order) {
-        my $l = (defined $left->[$idx]  ? $left->[$idx]  : -1);
-        my $r = (defined $right->[$idx] ? $right->[$idx] : -1);
-
-        # Operands are swapped on purpose: this is a reverse sort.
-        my $order = ($r <=> $l);
-        return $order if ($order);
-    }
-
-    # Every sort event compared equal.
-    return 0;
-}
-
-# Insert thousands separators into a number, eg. 1234567 --> 1,234,567.
-sub commify ($) {
-    my ($text) = @_;
-
-    # Each pass splits the last three digits off the leading run of digits.
-    while ($text =~ s/^(\d+)(\d{3})/$1,$2/) {
-        # nothing more to do: the substitution is the work
-    }
-    return $text;
-}
-
-# Work out how wide each column has to be: at least as wide as its event
-# name, and wide enough for the largest count in it once the commas have been
-# added.  Keeping columns this tight saves screen space when counts get big.
-# Takes a list of CCs; returns a reference to an array of widths whose
-# positions correspond to @events.
-sub compute_CC_col_widths (@)
-{
-    my @CCs = @_;
-
-    # Minimum widths come from the event names.
-    my @widths = map { length($_) } @events;
-
-    foreach my $CC (@CCs) {
-        my $last_col = scalar(@$CC) - 1;
-        foreach my $col (0 .. $last_col) {
-            next unless (defined $CC->[$col]);
-
-            # Printed width: the digits plus one comma for every full group
-            # of three digits after the first.
-            my $digits = length $CC->[$col];
-            my $commas = int(($digits - 1) / 3);
-            $widths[$col] = max($widths[$col], $digits + $commas);
-        }
-    }
-    return \@widths;
-}
-
-# Print one CC, showing the columns listed in @show_order and right-aligning
-# each to the width given for it in $CC_col_widths.  An undefined count is
-# shown as ".".
-sub print_CC ($$)
-{
-    my ($CC, $CC_col_widths) = @_;
-
-    foreach my $col (@show_order) {
-        my $text = ".";
-        $text = commify($CC->[$col]) if (defined $CC->[$col]);
-
-        my $pad = ' ' x ($CC_col_widths->[$col] - length($text));
-        print("$pad$text ");
-    }
-}
-
-# Print the names of the shown events as column headings, right-aligned to
-# the widths in $CC_col_widths.
-sub print_events ($)
-{
-    my ($CC_col_widths) = @_;
-
-    foreach my $col (@show_order) {
-        my $name = $events[$col];
-        my $pad  = ' ' x ($CC_col_widths->[$col] - length($name));
-        print("$pad$name ");
-    }
-}
-
-# Prints summary and function totals (with separate column widths, so that
-# function names aren't pushed over unnecessarily by huge summary figures).
-# Functions are printed in --sort order until every threshold is reached.
-# Returns a hash ref whose keys are the files of the functions printed (ie.
-# all the interesting ones).
-#
-# The summary section is left out when the input had no "summary:" line.  An
-# event whose summary total is missing or zero is given a proportion of 0
-# rather than being divided by.
-sub print_summary_and_fn_totals ()
-{
-    my @fn_fullnames = keys %fn_totals;
-
-    # Column widths for the function totals.
-    my $fn_CC_col_widths = compute_CC_col_widths(values %fn_totals);
-
-    # Header and counts for summary, with column widths of their own.
-    if (defined $summary_CC) {
-        my $summary_CC_col_widths = compute_CC_col_widths($summary_CC);
-        print($fancy);
-        print_events($summary_CC_col_widths);
-        print("\n");
-        print($fancy);
-        print_CC($summary_CC, $summary_CC_col_widths);
-        print(" PROGRAM TOTALS\n");
-        print("\n");
-    }
-
-    # Header for functions
-    print($fancy);
-    print_events($fn_CC_col_widths);
-    print(" file:function\n");
-    print($fancy);
-
-    # Sort function names into order dictated by --sort option.
-    @fn_fullnames = sort {
-        mycmp($fn_totals{$a}, $fn_totals{$b})
-    } @fn_fullnames;
-
-    # Assertion
-    (scalar @sort_order == scalar @thresholds) or
-        die("sort_order length != thresholds length:\n",
-            "  @sort_order\n  @thresholds\n");
-
-    my $threshold_files = {};
-    # @curr_totals has the same shape as @sort_order and @thresholds
-    my @curr_totals = (0) x scalar(@thresholds);
-
-    # Print functions, stopping when the threshold has been reached.
-    foreach my $fn_name (@fn_fullnames) {
-
-        # Stop when we've reached all the thresholds
-        my $reached_all_thresholds = 1;
-        foreach my $i (0 .. $#thresholds) {
-            my $total = (defined $summary_CC
-                         ? $summary_CC->[$sort_order[$i]]
-                         : undef);
-            # No usable total (no summary line, or nothing counted for this
-            # event): the proportion stays 0 instead of dividing by zero.
-            my $prop = 0;
-            if (defined $total && $total > 0) {
-                $prop = $curr_totals[$i] * 100 / $total;
-            }
-            $reached_all_thresholds &&= ($prop >= $thresholds[$i]);
-        }
-        last if $reached_all_thresholds;
-
-        # Print function results
-        my $fn_CC = $fn_totals{$fn_name};
-        print_CC($fn_CC, $fn_CC_col_widths);
-        print(" $fn_name\n");
-
-        # Update the threshold counts
-        my $filename = $fn_name;
-        $filename =~ s/:.+$//;    # remove function name
-        $threshold_files->{$filename} = 1;
-        foreach my $i (0 .. $#sort_order) {
-            $curr_totals[$i] += $fn_CC->[$sort_order[$i]]
-                if (defined $fn_CC->[$sort_order[$i]]);
-        }
-    }
-    print("\n");
-
-    return $threshold_files;
-}
-
-#-----------------------------------------------------------------------------
-# Annotate selected files
-#-----------------------------------------------------------------------------
-
-# Print a boxed warning saying that the given source file is more recent
-# than the input file, so the annotations may not be correct.
-sub warning_on_src_more_recent_than_inputfile ($)
-{
-    my ($src_file) = @_;
-
-    print(<<END);
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-@@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-@ Source file '$src_file' is more recent than input file '$input_file'.
-@ Annotations may not be correct.
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-END
-}
-
-# Print a boxed warning saying that counts were recorded for lines past the
-# end of $src_file, together with the most likely cause: the source changed
-# after profiling, a ".h" file, or unknown.  The third argument is accepted
-# but not used.
-sub warning_on_nonexistent_lines ($$$)
-{
-    my ($src_more_recent_than_inputfile, $src_file, $excess_line_nums) = @_;
-
-    # Fallback explanation, replaced below when a likelier cause is known.
-    my $cause_and_solution = <<END;
-@@ cause:    not sure, sorry
-END
-
-    if ($src_more_recent_than_inputfile) {
-        $cause_and_solution = <<END;
-@@ cause:    '$src_file' has changed since information was gathered.
-@@           If so, a warning will have already been issued about this.
-@@ solution: Recompile program and rerun under "valgrind --cachesim=yes" to 
-@@           gather new information.
-END
-    } elsif ($src_file =~ /\.h$/) {
-        # Header files get an explanation of their own.
-        $cause_and_solution = <<END;
-@@ cause:    bug in the Valgrind's debug info reader that screws up with .h
-@@           files sometimes
-@@ solution: none, sorry
-END
-    }
-
-    print(<<END);
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-@@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@ WARNING @@
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-@@
-@@ Information recorded about lines past the end of '$src_file'.
-@@
-@@ Probable cause and solution:
-$cause_and_solution@@
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-END
-}
-
-# Print annotated source for every file chosen on the command line and, when
-# auto-annotation is enabled, for every file in $threshold_files.
-#
-#   $threshold_files - hash ref keyed by source file name; its "???" entry is
-#                      deleted here when auto-annotating.
-#
-# Each file is looked up in @include_dirs.  A user-selected file that cannot
-# be opened is fatal; an auto-selected one is only listed at the end.  After
-# all files, the percentage of each event covered by the printed lines is
-# shown.
-sub annotate_ann_files($)
-{
-    my ($threshold_files) = @_;
-
-    my %all_ann_files;
-    my @unfound_auto_annotate_files;
-    my $printed_totals_CC = [];
-
-    # If auto-annotating, add interesting files (but not "???")
-    if ($auto_annotate) {
-        delete $threshold_files->{"???"};
-        %all_ann_files = (%user_ann_files, %$threshold_files);
-    } else {
-        %all_ann_files = %user_ann_files;
-    }
-
-    # Track if we did any annotations.
-    my $did_annotations = 0;
-
-    LOOP:
-    foreach my $src_file (keys %all_ann_files) {
-
-        # Undefined until a candidate path opens successfully, so that a
-        # file whose name is false in boolean context (e.g. "0") still works.
-        my $opened_file;
-        my $full_file_name = "";
-        foreach my $include_dir (@include_dirs) {
-            my $try_name = $include_dir . $src_file;
-            # Three-argument open: the name is never parsed for mode
-            # characters or surrounding whitespace.
-            if (open(INPUTFILE, "<", $try_name)) {
-                $opened_file    = $try_name;
-                $full_file_name = ($include_dir eq "" 
-                                  ? $src_file 
-                                  : "$include_dir + $src_file"); 
-                last;
-            }
-        }
-        
-        if (not defined $opened_file) {
-            # Failed to open the file.  If chosen on the command line, die.
-            # If arose from auto-annotation, print a little message.
-            if (defined $user_ann_files{$src_file}) {
-                die("File $src_file not opened in any of: @include_dirs\n");
-
-            } else {
-                push(@unfound_auto_annotate_files, $src_file);
-            }
-
-        } else {
-            # File header (distinguish between user- and auto-selected files).
-            print("$fancy");
-            my $ann_type = 
-                (defined $user_ann_files{$src_file} ? "User" : "Auto");
-            print("-- $ann_type-annotated source: $full_file_name\n");
-            print("$fancy");
-
-            # Get file's CCs
-            my $src_file_CCs = $all_ind_CCs{$src_file};
-            if (!defined $src_file_CCs) {
-                print("  No information has been collected for $src_file\n\n");
-                # Don't leave the handle open when skipping this file.
-                close(INPUTFILE);
-                next LOOP;
-            }
-        
-            $did_annotations = 1;
-            
-            # Numeric, not lexicographic sort!
-            my @line_nums = sort {$a <=> $b} keys %$src_file_CCs;  
-
-            # If $src_file more recent than cachegrind.out, issue warning
-            my $src_more_recent_than_inputfile = 0;
-            if ((stat $opened_file)[9] > (stat $input_file)[9]) {
-                $src_more_recent_than_inputfile = 1;
-                warning_on_src_more_recent_than_inputfile($src_file);
-            }
-
-            # Work out the size of each column for printing
-            my $CC_col_widths = compute_CC_col_widths(values %$src_file_CCs);
-
-            # Events header
-            print_events($CC_col_widths);
-            print("\n\n");
-
-            # Shift out 0 if it's in the line numbers (from unknown entries,
-            # likely due to bugs in Valgrind's stabs debug info reader)
-            shift(@line_nums) if (@line_nums && 0 == $line_nums[0]);
-
-            # Finds interesting line ranges -- all lines with a CC, and all
-            # lines within $context lines of a line with a CC.
-            my $n = @line_nums;
-            my @pairs;
-            for (my $i = 0; $i < $n; $i++) {
-                push(@pairs, $line_nums[$i] - $context);   # lower marker
-                while ($i < $n-1 && 
-                       $line_nums[$i] + 2*$context >= $line_nums[$i+1]) {
-                    $i++;
-                }
-                push(@pairs, $line_nums[$i] + $context);   # upper marker
-            }
-
-            # Annotate chosen lines, tracking total counts of lines printed.
-            # (@pairs is empty when the only recorded line number was 0.)
-            $pairs[0] = 1 if (@pairs && $pairs[0] < 1);
-            while (@pairs) {
-                my $low  = shift @pairs;
-                my $high = shift @pairs;
-                while ($. < $low-1) {
-                    my $tmp = <INPUTFILE>;
-                    last unless (defined $tmp);     # hack to detect EOF
-                }
-                my $src_line;
-                # Print line number, unless start of file
-                print("-- line $low " . '-' x 40 . "\n") if ($low != 1);
-                # Test definedness, not truth: a final line "0" without a
-                # trailing newline is false but is still a real line.
-                while (($. < $high) && defined($src_line = <INPUTFILE>)) {
-                    if (defined $line_nums[0] && $. == $line_nums[0]) {
-                        print_CC($src_file_CCs->{$.}, $CC_col_widths);
-                        add_array_a_to_b($src_file_CCs->{$.}, 
-                                         $printed_totals_CC);
-                        shift(@line_nums);
-
-                    } else {
-                        print_CC( [], $CC_col_widths);
-                    }
-
-                    print(" $src_line");
-                }
-                # Print line number, unless EOF
-                if (defined $src_line) {
-                    print("-- line $high " . '-' x 40 . "\n");
-                } else {
-                    last;
-                }
-            }
-
-            # If there was info on lines past the end of the file...
-            if (@line_nums) {
-                foreach my $line_num (@line_nums) {
-                    print_CC($src_file_CCs->{$line_num}, $CC_col_widths);
-                    print(" <bogus line $line_num>\n");
-                }
-                print("\n");
-                warning_on_nonexistent_lines($src_more_recent_than_inputfile,
-                                             $src_file, \@line_nums);
-            }
-            print("\n");
-
-            # Print summary of counts attributed to file but not to any
-            # particular line (due to incomplete debug info).
-            if ($src_file_CCs->{0}) {
-                print_CC($src_file_CCs->{0}, $CC_col_widths);
-                print(" <counts for unidentified lines in $src_file>\n\n");
-            }
-            
-            close(INPUTFILE);
-        }
-    }
-
-    # Print list of unfound auto-annotate selected files.
-    if (@unfound_auto_annotate_files) {
-        print("$fancy");
-        print("The following files chosen for auto-annotation could not be found:\n");
-        print($fancy);
-        foreach my $f (@unfound_auto_annotate_files) {
-            print("  $f\n");
-        }
-        print("\n");
-    }
-
-    # If we did any annotating, print what proportion of events were covered by
-    # annotated lines above.
-    if ($did_annotations) {
-        my $percent_printed_CC;
-        for (my $i = 0; $i < @$summary_CC; $i++) {
-            my $total   = $summary_CC->[$i];
-            my $printed = $printed_totals_CC->[$i] || 0;
-            # An event whose overall total is zero would otherwise die with
-            # "Illegal division by zero"; report 0% for it instead.
-            $percent_printed_CC->[$i] = 
-                sprintf("%.0f", $total ? $printed / $total * 100 : 0);
-        }
-        my $pp_CC_col_widths = compute_CC_col_widths($percent_printed_CC);
-        print($fancy);
-        print_events($pp_CC_col_widths);
-        print("\n");
-        print($fancy);
-        print_CC($percent_printed_CC, $pp_CC_col_widths);
-        print(" percentage of events annotated\n\n");
-    }
-}
-
-#----------------------------------------------------------------------------
-# "main()"
-#----------------------------------------------------------------------------
-# Top-level driver.  The calls run in this order; the value returned by
-# print_summary_and_fn_totals() is what annotate_ann_files() receives as its
-# set of auto-annotation candidate files.
-process_cmd_line();
-read_input_file();
-print_options();
-my $threshold_files = print_summary_and_fn_totals();
-annotate_ann_files($threshold_files);
-
-##--------------------------------------------------------------------##
-##--- end                                           vg_annotate.in ---##
-##--------------------------------------------------------------------##
-
-
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
deleted file mode 100644
index 4f1bf10716..0000000000
--- a/cachegrind/cg_main.c
+++ /dev/null
@@ -1,1602 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- The cache simulation framework: instrumentation, recording   ---*/
-/*--- and results printing.                                        ---*/
-/*---                                                vg_cachesim.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2002 Nicholas Nethercote
-      njn25@cam.ac.uk
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-#include "vg_cachesim_L2.c"
-#include "vg_cachesim_I1.c"
-#include "vg_cachesim_D1.c"
-
-
-/* According to IA-32 Intel Architecture Software Developer's Manual: Vol 2 */
-/* Used as the upper bound when asserting on an instruction's size. */
-#define MAX_x86_INSTR_SIZE              16
-
-/* Size of various buffers used for storing strings */
-/* FILENAME_LEN / FN_NAME_LEN: file-name and function-name buffers filled in
- * by get_debug_info().  BUF_LEN: the buffers written by the sprint_*CC
- * functions. */
-#define FILENAME_LEN                    256
-#define FN_NAME_LEN                     256
-#define BUF_LEN                         512
-#define COMMIFY_BUF_LEN                 128
-#define RESULTS_BUF_LEN                 128
-#define LINE_BUF_LEN                     64
-
-
-/*------------------------------------------------------------*/
-/*--- Generic utility stuff                                ---*/
-/*------------------------------------------------------------*/
-
-/* Return n such that x == 2^n, or -1 if x is not an exact power of two
- * (which includes zero and every negative value other than the one whose
- * bit pattern is 1 << 31). */
-Int VG_(log2) ( Int x ) 
-{
-   Int i;
-   /* Any more than 32 and we overflow anyway...  The comparison is done in
-    * unsigned arithmetic because shifting a signed 1 into bit 31 is
-    * undefined behaviour in C. */
-   for (i = 0; i < 32; i++) {
-      if ((1U << i) == (UInt)x) return i;
-   }
-   return -1;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Output file related stuff                            ---*/
-/*------------------------------------------------------------*/
-
-#define OUT_FILE        "cachegrind.out"
-
-static void file_err()
-{
-   VG_(message)(Vg_UserMsg,
-                "error: can't open cache simulation output file `%s'",
-                OUT_FILE );
-   VG_(exit)(1);
-}
-
-/*------------------------------------------------------------*/
-/*--- Cost center types, operations                        ---*/
-/*------------------------------------------------------------*/
-
-/* A cost centre: three 64-bit counters.  Going by the result totals they
- * are summed into, `a' counts accesses and `m1'/`m2' count misses. */
-typedef struct _CC {
-   ULong a;
-   ULong m1;
-   ULong m2;
-} CC;
-
-/* Reset all three counters of *cc to zero. */
-static __inline__ void initCC(CC* cc) {
-    cc->a = cc->m1 = cc->m2 = 0;
-}
-
-/* Kind of instruction-level cost centre.  The value is stored in the `tag'
- * field of an iCC (always INSTR_CC) or an idCC (one of the other three). */
-typedef enum { INSTR_CC, READ_CC, WRITE_CC, MOD_CC } CC_type;
-
-/* Instruction-level cost-centres.  The typedefs for these structs (iCC and
- * idCC) are not in this file; presumably they live in vg_include.h, the
- * project header included above.
- *
- * WARNING:  the 'tag' field *must* be the first byte of both CC types.
- *
- * This is because we use it to work out what kind of CC we're dealing with.
- */ 
-/* Cost centre for an instruction that makes no data memory reference. */
-struct _iCC {
-   /* word 1 */
-   UChar tag;           /* set to INSTR_CC by init_iCC() */
-   UChar instr_size;    /* size of the instruction */
-   /* 2 bytes padding */
-
-   /* words 2+ */
-   Addr instr_addr;     /* address of the instruction */
-   CC I;                /* instruction cost centre */
-};
-
-/* Cost centre for an instruction that reads, writes or modifies data memory.
- * The first two fields mirror struct _iCC so `tag' can be inspected before
- * the concrete type is known. */
-struct _idCC {
-   /* word 1 */
-   UChar tag;           /* READ_CC, WRITE_CC or MOD_CC; see init_idCC() */
-   UChar instr_size;    /* size of the instruction */
-   UChar data_size;     /* size of the data reference */
-   /* 1 byte padding */
-
-   /* words 2+ */
-   Addr instr_addr;     /* address of the instruction */
-   CC I;                /* instruction cost centre */
-   CC D;                /* data cost centre */
-};
-
-/* Set up *cc as a fresh INSTR_CC cost centre for the instruction at
- * instr_addr with the given size; its counters start at zero. */
-static void init_iCC(iCC* cc, Addr instr_addr, UInt instr_size)
-{
-   initCC(&cc->I);
-
-   cc->instr_addr = instr_addr;
-   cc->instr_size = instr_size;
-   cc->tag        = INSTR_CC;
-}
-
-/* Set up *cc as a fresh memory-referencing cost centre of kind X_CC for the
- * instruction at instr_addr; both counter sets start at zero. */
-static void init_idCC(CC_type X_CC, idCC* cc, Addr instr_addr,
-                      UInt instr_size, UInt data_size)
-{
-   initCC(&cc->I);
-   initCC(&cc->D);
-
-   cc->instr_addr = instr_addr;
-   cc->instr_size = instr_size;
-   cc->data_size  = data_size;
-   cc->tag        = X_CC;
-}
-
-/* Add the three counters of member `cc' of the cost centre at BBCC_ptr
- * (viewed as a CC_type) into `total'.
- * NOTE: relies on a variable named BBCC_ptr being in scope at the point of
- * use, and expands to three separate statements (the last already ends in
- * `;'), so it is not safe as the sole body of an unbraced if/else. */
-#define ADD_CC_TO(CC_type, cc, total)           \
-   total.a  += ((CC_type*)BBCC_ptr)->cc.a;      \
-   total.m1 += ((CC_type*)BBCC_ptr)->cc.m1;     \
-   total.m2 += ((CC_type*)BBCC_ptr)->cc.m2;
-          
-/* If 1, address of each instruction is printed as a comment after its counts
- * in cachegrind.out */
-#define PRINT_INSTR_ADDRS 0
-
-/* Format the three instruction counters of *cc into buf as one text line;
- * when PRINT_INSTR_ADDRS is set the instruction address is appended as a
- * trailing comment. */
-static __inline__ void sprint_iCC(Char buf[BUF_LEN], iCC* cc)
-{
-   CC* instr = &cc->I;
-
-#if PRINT_INSTR_ADDRS
-   VG_(sprintf)(buf, "%llu %llu %llu # %x\n",
-                instr->a, instr->m1, instr->m2,
-                cc->instr_addr);
-#else
-   VG_(sprintf)(buf, "%llu %llu %llu\n",
-                instr->a, instr->m1, instr->m2);
-#endif
-}
-
-/* Format the instruction counters followed by the data counters of *cc into
- * buf as one text line; when PRINT_INSTR_ADDRS is set the instruction
- * address is appended as a trailing comment. */
-static __inline__ void sprint_read_or_mod_CC(Char buf[BUF_LEN], idCC* cc)
-{
-   CC* instr = &cc->I;
-   CC* data  = &cc->D;
-
-#if PRINT_INSTR_ADDRS
-   VG_(sprintf)(buf, "%llu %llu %llu %llu %llu %llu # %x\n",
-                instr->a, instr->m1, instr->m2,
-                data->a,  data->m1,  data->m2,
-                cc->instr_addr);
-#else
-   VG_(sprintf)(buf, "%llu %llu %llu %llu %llu %llu\n",
-                instr->a, instr->m1, instr->m2,
-                data->a,  data->m1,  data->m2);
-#endif
-}
-
-/* Like sprint_read_or_mod_CC, but three ". " placeholder columns are emitted
- * between the instruction counters and the data counters. */
-static __inline__ void sprint_write_CC(Char buf[BUF_LEN], idCC* cc)
-{
-   CC* instr = &cc->I;
-   CC* data  = &cc->D;
-
-#if PRINT_INSTR_ADDRS
-   VG_(sprintf)(buf, "%llu %llu %llu . . . %llu %llu %llu # %x\n",
-                instr->a, instr->m1, instr->m2,
-                data->a,  data->m1,  data->m2,
-                cc->instr_addr);
-#else
-   VG_(sprintf)(buf, "%llu %llu %llu . . . %llu %llu %llu\n",
-                instr->a, instr->m1, instr->m2,
-                data->a,  data->m1,  data->m2);
-#endif
-}
-
-/*------------------------------------------------------------*/
-/*--- BBCC hash table stuff                                ---*/
-/*------------------------------------------------------------*/
-
-/* The table of BBCCs is of the form hash(filename, hash(fn_name,
- * hash(BBCCs))).  Each hash table is separately chained.  The sizes below work
- * fairly well for Konqueror.
- *
- * N_FILE_ENTRIES: number of buckets in the top-level, per-filename table.
- * N_FN_ENTRIES:   number of buckets in each file's per-function table.
- * N_BBCC_ENTRIES: number of buckets in each function's per-BBCC table. */
-
-#define N_FILE_ENTRIES        251
-#define   N_FN_ENTRIES         53
-#define N_BBCC_ENTRIES         37
-
-/* The cost centres for a basic block are stored in a contiguous array.
- * They are distinguishable by their tag field. */
-typedef struct _BBCC BBCC;
-struct _BBCC {
-   Addr  orig_addr;     /* basic block address; the lookup key in get_BBCC() */
-   UInt  array_size;    /* byte-size of variable length array */
-   BBCC* next;          /* next BBCC in the same hash chain */
-   Addr  array[0];      /* variable length array */
-};
-
-/* Second-level node: one per distinct function name within a file. */
-typedef struct _fn_node fn_node;
-struct _fn_node {
-   Char*    fn_name;                   /* private copy made with VG_(strdup) */
-   BBCC*    BBCCs[N_BBCC_ENTRIES];     /* hash buckets of BBCC chains */
-   fn_node* next;                      /* next fn_node in the same chain */
-};
-
-/* Top-level node: one per distinct source file name. */
-typedef struct _file_node file_node;
-struct _file_node {
-   Char*      filename;                /* private copy made with VG_(strdup) */
-   fn_node*   fns[N_FN_ENTRIES];       /* hash buckets of fn_node chains */
-   file_node* next;                    /* next file_node in the same chain */
-};
-
-/* BBCC_table structure:  list(filename, list(fn_name, list(BBCC))) */
-static file_node *BBCC_table[N_FILE_ENTRIES];
-
-/* Bumped in get_BBCC() each time a new file_node / fn_node is created. */
-static Int  distinct_files      = 0;
-static Int  distinct_fns        = 0;
-
-/* The four *_debug_BBs counters are bumped in get_debug_info() according to
- * which of file/line and function-name information was found. */
-static Int  distinct_instrs     = 0;
-static Int  full_debug_BBs      = 0;
-static Int  file_line_debug_BBs = 0;
-static Int  fn_name_debug_BBs   = 0;
-static Int  no_debug_BBs        = 0;
-
-/* Bumped in get_BBCC() when an already-known BB is looked up without being
- * removed. */
-static Int  BB_retranslations   = 0;
-
-/* NOTE(review): presumably totals for cost centres that were discarded;
- * they are not updated in this part of the file -- confirm. */
-static CC Ir_discards;
-static CC Dr_discards;
-static CC Dw_discards;
-
-/* Empty every bucket of the top-level BBCC hash table. */
-static void init_BBCC_table()
-{
-   Int slot = N_FILE_ENTRIES;
-
-   while (slot-- > 0) {
-      BBCC_table[slot] = NULL;
-   }
-}
-
-/* Look up the source file, line number and function name for instr_addr.
- * Whatever cannot be found is replaced by "???" (names) or 0 (line number),
- * and the matching *_debug_BBs counter is incremented. */
-static void get_debug_info(Addr instr_addr, Char filename[FILENAME_LEN],
-                           Char fn_name[FN_NAME_LEN], Int* line_num)
-{
-   Bool no_demangle = False;
-   Bool have_line, have_fn;
-
-   have_line = VG_(what_line_is_this)(instr_addr, filename,
-                                      FILENAME_LEN, line_num);
-   have_fn   = VG_(what_fn_is_this)(no_demangle, instr_addr,
-                                    fn_name, FN_NAME_LEN);
-
-   if (have_line && have_fn) {
-      full_debug_BBs++;
-      return;
-   }
-
-   if (have_line) {
-      /* File and line known, function unknown. */
-      file_line_debug_BBs++;
-      VG_(strcpy)(fn_name, "???");
-      return;
-   }
-
-   /* No file/line information from here on. */
-   VG_(strcpy)(filename, "???");
-   *line_num = 0;
-
-   if (have_fn) {
-      fn_name_debug_BBs++;
-   } else {
-      no_debug_BBs++;
-      VG_(strcpy)(fn_name, "???");
-   }
-}
-
-/* Forward declaration. */
-static Int compute_BBCC_array_size(UCodeBlock* cb);
-
-/* Allocate a file_node holding a private copy of filename, with all of its
- * function buckets empty, linked in front of `next'. */
-static __inline__ 
-file_node* new_file_node(Char filename[FILENAME_LEN], file_node* next)
-{
-   file_node* node = VG_(malloc)(VG_AR_PRIVATE, sizeof(file_node));
-   Int        slot = 0;
-
-   node->filename = VG_(strdup)(VG_AR_PRIVATE, filename);
-   while (slot < N_FN_ENTRIES) {
-      node->fns[slot] = NULL;
-      slot++;
-   }
-   node->next = next;
-
-   return node;
-}
-
-/* Allocate a fn_node holding a private copy of fn_name, with all of its BBCC
- * buckets empty, linked in front of `next'.
- * The parameter is bounded by FN_NAME_LEN (not FILENAME_LEN) to match the
- * function-name buffers declared in get_debug_info() and get_BBCC(). */
-static __inline__ 
-fn_node* new_fn_node(Char fn_name[FN_NAME_LEN], fn_node* next)
-{
-   Int i;
-   fn_node* new = VG_(malloc)(VG_AR_PRIVATE, sizeof(fn_node));
-   new->fn_name = VG_(strdup)(VG_AR_PRIVATE, fn_name);
-   for (i = 0; i < N_BBCC_ENTRIES; i++) {
-      new->BBCCs[i] = NULL;
-   }
-   new->next    = next;
-   return new;
-}
-
-/* Allocate a BBCC for the basic block at bb_orig_addr, with room for the
- * cost-centre array sized by compute_BBCC_array_size(cb), linked in front of
- * `next'.  The array contents are not initialised here. */
-static __inline__ 
-BBCC* new_BBCC(Addr bb_orig_addr, UCodeBlock* cb, BBCC* next)
-{
-   Int   n_array_bytes = compute_BBCC_array_size(cb);
-   BBCC* node = (BBCC*)VG_(malloc)(VG_AR_PRIVATE,
-                                   sizeof(BBCC) + n_array_bytes);
-
-   node->next       = next;
-   node->array_size = n_array_bytes;
-   node->orig_addr  = bb_orig_addr;
-
-   return node;
-}
-
-#define HASH_CONSTANT   256
-
-/* Hash the NUL-terminated string s into the range [0, table_size): each
- * character is folded in as (HASH_CONSTANT * h + c) % table_size. */
-static UInt hash(Char *s, UInt table_size)
-{
-    int hash_value = 0;
-
-    while (*s) {
-        hash_value = (HASH_CONSTANT * hash_value + *s) % table_size;
-        s++;
-    }
-
-    return hash_value;
-}
-
-/* Find the BBCC for the basic block at bb_orig_addr, walking three levels of
- * hash table: file name, then function name, then block address.  Missing
- * file and function nodes are created and prepended to their chains, as is a
- * missing BBCC (which is only permitted when `remove' is False).
- *
- * *BB_seen_before is set to True if the BBCC already existed, else False.
- * If `remove' is True an existing BBCC is unlinked from its chain before
- * being returned.
- */ 
-static __inline__ BBCC* get_BBCC(Addr bb_orig_addr, UCodeBlock* cb, 
-                                 Bool remove, Bool *BB_seen_before)
-{
-   Char        filename[FILENAME_LEN], fn_name[FN_NAME_LEN];
-   Int         unused_line_num;
-   UInt        file_slot, fn_slot, bb_slot;
-   file_node*  file;
-   fn_node*    fn;
-   BBCC**      link;           /* the pointer that points at curr_BBCC */
-   BBCC*       curr_BBCC;
-
-   get_debug_info(bb_orig_addr, filename, fn_name, &unused_line_num);
-
-   VGP_PUSHCC(VgpCacheGetBBCC);
-
-   /* Level 1: file name. */
-   file_slot = hash(filename, N_FILE_ENTRIES);
-   for (file = BBCC_table[file_slot]; NULL != file; file = file->next) {
-      if (0 == VG_(strcmp)(filename, file->filename)) break;
-   }
-   if (NULL == file) {
-      file = new_file_node(filename, BBCC_table[file_slot]);
-      BBCC_table[file_slot] = file;
-      distinct_files++;
-   }
-
-   /* Level 2: function name. */
-   fn_slot = hash(fn_name, N_FN_ENTRIES);
-   for (fn = file->fns[fn_slot]; NULL != fn; fn = fn->next) {
-      if (0 == VG_(strcmp)(fn_name, fn->fn_name)) break;
-   }
-   if (NULL == fn) {
-      fn = new_fn_node(fn_name, file->fns[fn_slot]);
-      file->fns[fn_slot] = fn;
-      distinct_fns++;
-   }
-
-   /* Level 3: basic block address. */
-   bb_slot = bb_orig_addr % N_BBCC_ENTRIES;
-   link    = &(fn->BBCCs[bb_slot]);
-   while (NULL != *link && bb_orig_addr != (*link)->orig_addr) {
-      link = &((*link)->next);
-   }
-   curr_BBCC = *link;
-
-   if (NULL == curr_BBCC) {
-      /* Not present: a removal request for an unknown block is a bug. */
-      vg_assert(False == remove);
-
-      curr_BBCC = new_BBCC(bb_orig_addr, cb, fn->BBCCs[bb_slot]);
-      fn->BBCCs[bb_slot] = curr_BBCC;
-      *BB_seen_before = False;
-
-   } else {
-      vg_assert(bb_orig_addr == curr_BBCC->orig_addr);
-      vg_assert(curr_BBCC->array_size > 0 && curr_BBCC->array_size < 1000000);
-      if (VG_(clo_verbosity) > 2) {
-          VG_(message)(Vg_DebugMsg, 
-            "BB retranslation, retrieving from BBCC table");
-      }
-      *BB_seen_before = True;
-
-      if (True == remove) {
-          /* Unlink curr_BBCC; the caller uses it and then frees it. */
-          *link = curr_BBCC->next;
-      } else {
-          BB_retranslations++;
-      }
-   }
-
-   VGP_POPCC;
-   return curr_BBCC;
-}
-
-/*------------------------------------------------------------*/
-/*--- Cache simulation instrumentation phase               ---*/
-/*------------------------------------------------------------*/
-
-/* Short local aliases for the UCode-building functions used below. */
-#define uInstr1   VG_(newUInstr1)
-#define uInstr2   VG_(newUInstr2)
-#define uInstr3   VG_(newUInstr3)
-#define dis       VG_(disassemble)
-#define uLiteral  VG_(setLiteralField)
-#define newTemp   VG_(getNewTemp)
-
-/* Return the number of bytes of cost-centre storage the basic block `cb'
- * needs: one idCC for every x86 instruction that touches data memory
- * (LOAD, STORE, FPU_R or FPU_W seen since the previous instruction end) and
- * one iCC for every other x86 instruction.  An x86 instruction is taken to
- * end at each INCEIP and at an unconditional JMP. */
-static Int compute_BBCC_array_size(UCodeBlock* cb)
-{
-   Int  i, BBCC_size = 0;
-   Bool is_LOAD  = False, is_STORE = False;
-   Bool is_FPU_R = False, is_FPU_W = False;
-
-   for (i = 0; i < cb->used; i++) {
-      UInstr* u_in           = &cb->instrs[i];
-      Bool    ends_x86_instr = False;
-
-      /* VG_(ppUInstr)(0, &cb->instrs[i]); */
-
-      switch (u_in->opcode) {
-
-         case INCEIP:
-            ends_x86_instr = True;
-            break;
-
-         case JMP:
-            /* Conditional jumps do not terminate the instruction. */
-            if (u_in->cond == CondAlways) ends_x86_instr = True;
-            break;
-
-         case LOAD:
-            /* Two LDBs are possible for a single instruction */
-            /* Also, a STORE can come after a LOAD for bts/btr/btc */
-            vg_assert(/*!is_LOAD &&*/ /* !is_STORE && */ 
-                      !is_FPU_R && !is_FPU_W);
-            is_LOAD = True;
-            break;
-
-         case STORE:
-            /* Multiple STOREs are possible for 'pushal' */
-            vg_assert(            /*!is_STORE &&*/ !is_FPU_R && !is_FPU_W);
-            is_STORE = True;
-            break;
-
-         case FPU_R:
-            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
-            is_FPU_R = True;
-            break;
-
-         case FPU_W:
-            vg_assert(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
-            is_FPU_W = True;
-            break;
-
-         default:
-            break;
-      }
-
-      if (ends_x86_instr) {
-         Int CC_size;
-
-         if (is_LOAD || is_STORE || is_FPU_R || is_FPU_W) {
-            CC_size = sizeof(idCC);
-         } else {
-            CC_size = sizeof(iCC);
-         }
-         BBCC_size += CC_size;
-
-         /* Start afresh for the next x86 instruction. */
-         is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
-      }
-   }
-
-   return BBCC_size;
-}
-
-/* Use this rather than eg. -1 because it's stored as a UInt. */
-#define INVALID_DATA_SIZE   999999
-
-/* Instrument the basic block cb_in (originally at orig_addr) for cache
- * simulation.  A new code block is built containing the original UCode plus,
- * at the end of each x86 instruction, a call to a cachesim logging helper
- * that is passed the address of that instruction's cost centre (and, for
- * memory-referencing instructions, the data address).  cb_in is freed and
- * the new block is returned. */
-UCodeBlock* VG_(cachesim_instrument)(UCodeBlock* cb_in, Addr orig_addr)
-{
-   UCodeBlock* cb;
-   Int         i;
-   UInstr*     u_in;
-   BBCC*       BBCC_node;
-   Int         t_CC_addr, t_read_addr, t_write_addr, t_data_addr;
-   Int         CC_size = -1;    /* Shut gcc warnings up */
-   Addr        instr_addr = orig_addr;
-   UInt        instr_size, data_size = INVALID_DATA_SIZE;
-   Int         helper = -1;     /* Shut gcc warnings up */
-   UInt        stack_used;
-   Bool        BB_seen_before       = False;
-   Bool        prev_instr_was_Jcond = False;
-   Addr        BBCC_ptr0, BBCC_ptr; 
-
-   /* Get BBCC (creating if necessary -- requires a counting pass over the BB
-    * if it's the first time it's been seen), and point to start of the 
-    * BBCC array.  */
-   BBCC_node = get_BBCC(orig_addr, cb_in, False, &BB_seen_before);
-   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
-
-   cb = VG_(allocCodeBlock)();
-   cb->nextTemp = cb_in->nextTemp;
-
-   t_CC_addr = t_read_addr = t_write_addr = t_data_addr = INVALID_TEMPREG;
-
-   for (i = 0; i < cb_in->used; i++) {
-      u_in = &cb_in->instrs[i];
-
-      //VG_(ppUInstr)(0, u_in);
-
-      /* What this is all about:  we want to instrument each x86 instruction 
-       * translation.  The end of these are marked in three ways.  The three
-       * ways, and the way we instrument them, are as follows:
-       *
-       * 1. UCode, INCEIP         --> UCode, Instrumentation, INCEIP
-       * 2. UCode, Juncond        --> UCode, Instrumentation, Juncond
-       * 3. UCode, Jcond, Juncond --> UCode, Instrumentation, Jcond, Juncond
-       *
-       * We must put the instrumentation before the jumps so that it is always
-       * executed.  We don't have to put the instrumentation before the INCEIP
-       * (it could go after) but we do so for consistency.
-       *
-       * Junconds are always the last instruction in a basic block.  Jconds are
-       * always the 2nd last, and must be followed by a Juncond.  We check this
-       * with various assertions.
-       *
-       * Note that in VG_(disBB) we patched the `extra4b' field of the first
-       * occurring JMP in a block with the size of its x86 instruction.  This
-       * is used now.
-       *
-       * Note that we don't have to treat JIFZ specially;  unlike JMPs, JIFZ
-       * occurs in the middle of a BB and gets an INCEIP after it.
-       *
-       * The instrumentation is just a call to the appropriate helper function,
-       * passing it the address of the instruction's CC.
-       */
-      if (prev_instr_was_Jcond) vg_assert(u_in->opcode == JMP);
-
-      switch (u_in->opcode) {
-
-         case INCEIP:
-            instr_size = u_in->val1;
-            goto case_for_end_of_x86_instr;
-
-         case JMP:
-            if (u_in->cond == CondAlways) {
-               vg_assert(i+1 == cb_in->used); 
-
-               /* Don't instrument if previous instr was a Jcond. */
-               if (prev_instr_was_Jcond) {
-                  vg_assert(0 == u_in->extra4b);
-                  VG_(copyUInstr)(cb, u_in);
-                  break;
-               }
-               prev_instr_was_Jcond = False;
-
-            } else {
-               vg_assert(i+2 == cb_in->used);  /* 2nd last instr in block */
-               prev_instr_was_Jcond = True;
-            }
-
-            /* Ah, the first JMP... instrument, please. */
-            instr_size = u_in->extra4b;
-            goto case_for_end_of_x86_instr;
-
-            /* Shared code that is executed at the end of an x86 translation
-             * block, marked by either an INCEIP or the first JMP (conditional
-             * or not) in the block. */
-            case_for_end_of_x86_instr:
-
-/* True if a read/write address temp was recorded for this instruction. */
-#define IS_(X)      (INVALID_TEMPREG != t_##X##_addr)
-             
-            /* Initialise the CC in the BBCC array appropriately if it hasn't
-             * been initialised before.
-             * Then call appropriate sim function, passing it the CC address.
-             * Note that CALLM_S/CALLM_E aren't required here;  by this point,
-             * the checking related to them has already happened. */
-            stack_used = 0;
-
-            vg_assert(instr_size >= 1 && instr_size <= MAX_x86_INSTR_SIZE);
-            vg_assert(0 != instr_addr);
-
-            /* Save the caller-save registers before we push our args */
-            uInstr1(cb, PUSH, 4, RealReg, R_EAX);
-            uInstr1(cb, PUSH, 4, RealReg, R_ECX);
-            uInstr1(cb, PUSH, 4, RealReg, R_EDX);
-
-            if (!IS_(read) && !IS_(write)) {
-               /* No data reference: plain instruction cost centre. */
-               iCC* CC_ptr = (iCC*)(BBCC_ptr);
-               vg_assert(INVALID_DATA_SIZE == data_size);
-               vg_assert(INVALID_TEMPREG == t_read_addr && 
-                         INVALID_TEMPREG == t_write_addr);
-               CC_size = sizeof(iCC);
-               if (!BB_seen_before)
-                   init_iCC(CC_ptr, instr_addr, instr_size);
-
-               helper = VGOFF_(cachesim_log_non_mem_instr);
-
-            } else { 
-               CC_type X_CC;
-               idCC* CC_ptr = (idCC*)(BBCC_ptr);
-                
-               vg_assert(4 == data_size || 2  == data_size || 1 == data_size || 
-                         8 == data_size || 10 == data_size);
-               
-               CC_size = sizeof(idCC);
-               helper = VGOFF_(cachesim_log_mem_instr);
-
-               if (IS_(read) && !IS_(write)) {
-                  X_CC = READ_CC;
-                  vg_assert(INVALID_TEMPREG != t_read_addr && 
-                            INVALID_TEMPREG == t_write_addr);
-                  t_data_addr = t_read_addr;
-
-               } else if (!IS_(read) && IS_(write)) {
-                  X_CC = WRITE_CC;
-                  vg_assert(INVALID_TEMPREG == t_read_addr && 
-                            INVALID_TEMPREG != t_write_addr);
-                  t_data_addr = t_write_addr;
-
-               } else {
-                  /* Both a read and a write: the read address is the one
-                   * passed to the helper. */
-                  vg_assert(IS_(read) && IS_(write));
-                  X_CC = MOD_CC;
-                  vg_assert(INVALID_TEMPREG != t_read_addr && 
-                            INVALID_TEMPREG != t_write_addr);
-                  t_data_addr = t_read_addr;
-               }
-
-               if (!BB_seen_before)
-                  init_idCC(X_CC, CC_ptr, instr_addr, instr_size, data_size);
-
-               /* 2nd arg: data addr */
-               uInstr1(cb, PUSH,  4, TempReg, t_data_addr);
-               stack_used += 4;
-            }
-#undef IS_
-
-            /* 1st arg: CC addr */
-            t_CC_addr = newTemp(cb);
-            uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
-            uLiteral(cb, BBCC_ptr);
-            uInstr1(cb, PUSH,  4, TempReg, t_CC_addr);
-            stack_used += 4;
-
-            /* Call function and return. */
-            uInstr1(cb, CALLM, 0, Lit16,   helper);
-            uInstr1(cb, CLEAR, 0, Lit16,   stack_used);
-
-            /* Restore the caller-save registers now the call is done */
-            uInstr1(cb, POP, 4, RealReg, R_EDX);
-            uInstr1(cb, POP, 4, RealReg, R_ECX);
-            uInstr1(cb, POP, 4, RealReg, R_EAX);
-
-            VG_(copyUInstr)(cb, u_in);
-
-            /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */
-            BBCC_ptr   += CC_size; 
-            instr_addr += instr_size;
-            t_CC_addr = t_read_addr = t_write_addr = 
-                                      t_data_addr  = INVALID_TEMPREG;
-            data_size = INVALID_DATA_SIZE;
-            break;
-
-
-         /* For memory-ref instrs, copy the data_addr into a temporary to be
-          * passed to the cachesim_log_function at the end of the instruction.
-          */
-         case LOAD: 
-            t_read_addr = newTemp(cb);
-            uInstr2(cb, MOV, 4, TempReg, u_in->val1,  TempReg, t_read_addr);
-            data_size = u_in->size;
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         case FPU_R:
-            t_read_addr = newTemp(cb);
-            uInstr2(cb, MOV, 4, TempReg, u_in->val2,  TempReg, t_read_addr);
-            data_size = u_in->size;
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* Note that we must set t_write_addr even for mod instructions;
-          * that's how the code above determines whether it does a write;
-          * without it, it would think a mod instruction is a read.
-          * As for the MOV, if it's a mod instruction it's redundant, but it's
-          * not expensive and mod instructions are rare anyway. */
-         case STORE:
-         case FPU_W:
-            t_write_addr = newTemp(cb);
-            uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_write_addr);
-            data_size = u_in->size;
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* These three are dropped: they are not copied to the new block. */
-         case NOP:  case CALLM_E:  case CALLM_S:
-            break;
-
-         default:
-            VG_(copyUInstr)(cb, u_in);
-            break;
-      }
-   }
-
-   /* Just check everything looks ok: the bytes of cost centre consumed must
-    * equal the size recorded for this block's BBCC array. */
-   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
-
-   VG_(freeCodeBlock)(cb_in);
-   return cb;
-}
-
-/*------------------------------------------------------------*/
-/*--- Cache simulation stuff                               ---*/
-/*------------------------------------------------------------*/
-
-#define MIN_LINE_SIZE   16
-
-/* Total reads/writes/misses.  Calculated during CC traversal at the end. */
-static CC Ir_total;
-static CC Dr_total;
-static CC Dw_total;
-
-/* All CPUID info taken from sandpile.org/a32/cpuid.htm */
-/* Probably only works for Intel and AMD chips, and probably only for some of
- * them. 
- */
-
-/* Execute the CPUID instruction with n in %eax and store the resulting
- * %eax, %ebx, %ecx and %edx in *a, *b, *c and *d respectively. */
-static __inline__ void cpuid(Int n, Int *a, Int *b, Int *c, Int *d)
-{
-   __asm__ __volatile__ (
-    "cpuid"
-    : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)      /* output */
-    : "0" (n)         /* input: same register as output operand 0 (%eax) */
-    );
-}
-
-/* Emits a two-line debug warning for a Pentium whose instruction cache is a
- * micro-op trace cache: the first line reports the capacity that was detected
- * (actual_size, in K micro-ops), the second the cache that will be simulated
- * in its place (used_size KB with line_size-byte lines). */
-static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
-{
-    VG_(message)(Vg_DebugMsg, 
-       "warning: Pentium with %d K micro_op instruction trace cache", 
-       actual_size);
-    VG_(message)(Vg_DebugMsg, 
-       "         Simulating a %d KB cache with %d B lines", 
-       used_size, line_size);
-}
-
-/* Intel method is truly wretched.  CPUID leaf 2 returns up to 16 one-byte
- * descriptors packed into %eax..%edx, and each descriptor has to be looked up
- * in a table of pre-defined configurations for various parts of the memory
- * hierarchy.
- *
- * level is the highest standard CPUID leaf the processor reported.  Any of
- * I1c, D1c and L2c for which a known descriptor is found is overwritten with
- * { size in KB, associativity, line size in bytes }; the others are left
- * untouched.
- *
- * Returns 0 on success, -1 (after a warning) if leaf 2 is not available or
- * would have to be queried more than once.
- */
-static
-Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
-{
-   UChar info[16];
-   Int   i, trials;
-
-   if (level < 2) {
-      VG_(message)(Vg_DebugMsg, 
-         "warning: CPUID level < 2 for Intel processor (%d)", 
-         level);
-      return -1;
-   }
-
-   /* info[0..3] = %eax, info[4..7] = %ebx, info[8..11] = %ecx,
-    * info[12..15] = %edx, each stored least significant byte first. */
-   cpuid(2, (Int*)&info[0], (Int*)&info[4], 
-            (Int*)&info[8], (Int*)&info[12]);
-   trials  = info[0] - 1;   /* AL register - bits 0..7 of %eax */
-   info[0] = 0x0;           /* reset AL */
-
-   if (0 != trials) {
-      VG_(message)(Vg_DebugMsg, 
-         "warning: non-zero CPUID trials for Intel processor (%d)",
-         trials);
-      return -1;
-   }
-
-   /* A register with bit 31 set is reserved and carries no descriptors.
-    * Blank it so its bytes fall into the "ignore zeros" case below instead
-    * of being decoded as cache configurations. */
-   for (i = 0; i < 16; i += 4) {
-      if (info[i + 3] & 0x80) {
-         info[i] = info[i + 1] = info[i + 2] = info[i + 3] = 0x0;
-      }
-   }
-
-   for (i = 0; i < 16; i++) {
-
-      switch (info[i]) {
-
-      case 0x0:       /* ignore zeros */
-          break;
-          
-      case 0x01: case 0x02: case 0x03: case 0x04:     /* TLB info, ignore */
-      case 0x90: case 0x96: case 0x9b:
-          break;      
-
-      case 0x06: *I1c = (cache_t) {  8, 4, 32 }; break;
-      case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break;
-
-      case 0x0a: *D1c = (cache_t) {  8, 2, 32 }; break;
-      case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
-
-      case 0x22: case 0x23: case 0x25: case 0x29: 
-      case 0x88: case 0x89: case 0x8a:
-          VG_(message)(Vg_DebugMsg, 
-             "warning: L3 cache detected but ignored\n");
-          break;
-
-      case 0x40: 
-          VG_(message)(Vg_DebugMsg, 
-             "warning: L2 cache not installed, ignore L2 results.");
-          break;
-
-      case 0x41: *L2c = (cache_t) {  128, 4, 32 };    break;
-      case 0x42: *L2c = (cache_t) {  256, 4, 32 };    break;
-      case 0x43: *L2c = (cache_t) {  512, 4, 32 };    break;
-      case 0x44: *L2c = (cache_t) { 1024, 4, 32 };    break;
-      case 0x45: *L2c = (cache_t) { 2048, 4, 32 };    break;
-
-      /* These are sectored, whatever that means */
-      case 0x66: *D1c = (cache_t) {  8, 4, 64 };  break;      /* sectored */
-      case 0x67: *D1c = (cache_t) { 16, 4, 64 };  break;      /* sectored */
-      case 0x68: *D1c = (cache_t) { 32, 4, 64 };  break;      /* sectored */
-
-      /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
-       * conversion to byte size is a total guess;  treat the 12K and 16K
-       * cases the same since the cache byte size must be a power of two for
-       * everything to work!.  Also guessing 32 bytes for the line size... 
-       */
-      case 0x70:    /* 12K micro-ops, 8-way */
-         *I1c = (cache_t) { 16, 8, 32 };  
-         micro_ops_warn(12, 16, 32);
-         break;  
-      case 0x71:    /* 16K micro-ops, 8-way */
-         *I1c = (cache_t) { 16, 8, 32 };  
-         micro_ops_warn(16, 16, 32); 
-         break;  
-      case 0x72:    /* 32K micro-ops, 8-way */
-         *I1c = (cache_t) { 32, 8, 32 };  
-         micro_ops_warn(32, 32, 32); 
-         break;  
-
-      case 0x79: *L2c = (cache_t) {  128, 8, 64 };    break;  /* sectored */
-      case 0x7a: *L2c = (cache_t) {  256, 8, 64 };    break;  /* sectored */
-      case 0x7b: *L2c = (cache_t) {  512, 8, 64 };    break;  /* sectored */
-      case 0x7c: *L2c = (cache_t) { 1024, 8, 64 };    break;  /* sectored */
-
-      case 0x81: *L2c = (cache_t) {  128, 8, 32 };    break;
-      case 0x82: *L2c = (cache_t) {  256, 8, 32 };    break;
-      case 0x83: *L2c = (cache_t) {  512, 8, 32 };    break;
-      case 0x84: *L2c = (cache_t) { 1024, 8, 32 };    break;
-      case 0x85: *L2c = (cache_t) { 2048, 8, 32 };    break;
-
-      default:
-          VG_(message)(Vg_DebugMsg, 
-             "warning: Unknown Intel cache config value "
-             "(0x%x), ignoring\n", info[i]);
-          break;
-      }
-   }
-   return 0;
-}
-
-/* AMD method is straightforward, just extract appropriate bits from the
- * result registers.
- *
- * Bits, for D1 and I1:
- *  31..24  data L1 cache size in KBs    
- *  23..16  data L1 cache associativity (FFh=full)    
- *  15.. 8  data L1 cache lines per tag    
- *   7.. 0  data L1 cache line size in bytes
- *
- * Bits, for L2:
- *  31..16  unified L2 cache size in KBs
- *  15..12  unified L2 cache associativity (0=off, FFh=full)
- *  11.. 8  unified L2 cache lines per tag    
- *   7.. 0  unified L2 cache line size in bytes
- *
- * #3  The AMD K7 processor's L2 cache must be configured prior to relying 
- *     upon this information. (Whatever that means -- njn)
- *
- * Returns 0 on success, non-zero on failure.
- */
-static
-Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
-{
-   Int max_ext_leaf, unused;
-   Int l1d_reg, l1i_reg, l2_reg;
-
-   /* %eax of leaf 0x80000000 gives the highest extended leaf available;
-    * leaves 0x80000005 and 0x80000006 are both needed below. */
-   cpuid(0x80000000, &max_ext_leaf, &unused, &unused, &unused);
-
-   if (max_ext_leaf < 0x80000006 || 0 == (max_ext_leaf & 0x80000000)) {
-      VG_(message)(Vg_UserMsg, 
-         "warning: ext_level < 0x80000006 for AMD processor (0x%x)", 
-         max_ext_leaf);
-      return -1;
-   }
-
-   /* L1 data comes back in %ecx and L1 instruction in %edx of leaf
-    * 0x80000005; the unified L2 comes back in %ecx of leaf 0x80000006. */
-   cpuid(0x80000005, &unused, &unused, &l1d_reg, &l1i_reg);
-   cpuid(0x80000006, &unused, &unused, &l2_reg,  &unused);
-
-   /* Both L1 words share one layout: size / associativity / line size. */
-   I1c->line_size =  l1i_reg        & 0xff;
-   I1c->assoc     = (l1i_reg >> 16) & 0xff;
-   I1c->size      = (l1i_reg >> 24) & 0xff;
-
-   D1c->line_size =  l1d_reg        & 0xff;
-   D1c->assoc     = (l1d_reg >> 16) & 0xff;
-   D1c->size      = (l1d_reg >> 24) & 0xff;
-
-   /* Nb: the L2 word uses a wider size field and a narrower assoc field. */
-   L2c->line_size =  l2_reg         & 0xff;
-   L2c->assoc     = (l2_reg  >> 12) & 0xf;
-   L2c->size      = (l2_reg  >> 16) & 0xffff;
-
-   return 0;
-}
-
-/* Jump target armed by get_caches_from_CPUID() before it first executes
- * CPUID; the SIGILL handler below jumps back to it. */
-static jmp_buf cpuid_jmpbuf;
-
-/* SIGILL handler installed while probing for CPUID support.  It never
- * returns normally: it jumps back to the __builtin_setjmp() point, which
- * then sees a non-zero result.  signum is unused. */
-static
-void cpuid_SIGILL_handler(int signum)
-{
-   __builtin_longjmp(cpuid_jmpbuf, 1);
-}
-
-/* Tries to fill in I1c, D1c and L2c from the CPUID instruction.
- *
- * A temporary SIGILL handler is installed around the first CPUID so that a
- * processor without the instruction is reported rather than killing the
- * process; the previous handler is restored on both paths.  Only the
- * "GenuineIntel" and "AuthenticAMD" vendors are handled.
- *
- * Returns 0 on success, in which case the sizes have been converted from KB
- * to bytes.  Returns non-zero on any failure; the caches must then be
- * treated as unset by the caller.
- */
-static 
-Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
-{
-   Int  level, res, ret;
-   Char vendor_id[13];
-   vki_ksigaction sigill_new, sigill_saved;
-
-   /* Install own SIGILL handler */
-   sigill_new.ksa_handler  = cpuid_SIGILL_handler;
-   sigill_new.ksa_flags    = 0;
-   sigill_new.ksa_restorer = NULL;
-   res = VG_(ksigemptyset)( &sigill_new.ksa_mask );
-   vg_assert(res == 0);
-
-   res = VG_(ksigaction)( VKI_SIGILL, &sigill_new, &sigill_saved );
-   vg_assert(res == 0);
-
-   /* Trap for illegal instruction, in case it's a really old processor that
-    * doesn't support CPUID. */
-   if (__builtin_setjmp(cpuid_jmpbuf) == 0) {
-      /* The vendor string is returned in %ebx, %edx, %ecx order, hence the
-       * 0 / 8 / 4 offsets for the b / c / d outputs. */
-      cpuid(0, &level, (int*)&vendor_id[0], 
-                       (int*)&vendor_id[8], (int*)&vendor_id[4]);    
-      vendor_id[12] = '\0';
-
-      /* Restore old SIGILL handler */
-      res = VG_(ksigaction)( VKI_SIGILL, &sigill_saved, NULL );
-      vg_assert(res == 0);
-
-   } else  {
-      VG_(message)(Vg_DebugMsg, "CPUID instruction not supported");
-
-      /* Restore old SIGILL handler */
-      res = VG_(ksigaction)( VKI_SIGILL, &sigill_saved, NULL );
-      vg_assert(res == 0);
-      return -1;
-   }
-
-   if (0 == level) {
-      VG_(message)(Vg_DebugMsg, "CPUID level is 0, early Pentium?\n");
-      return -1;
-   }
-
-   /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
-   if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
-      ret = Intel_cache_info(level, I1c, D1c, L2c);
-
-   } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
-      ret = AMD_cache_info(I1c, D1c, L2c);
-
-   } else {
-      VG_(message)(Vg_DebugMsg, "CPU vendor ID not recognised (%s)",
-                   vendor_id);
-      return -1;
-   }
-
-   /* Convert sizes from KB to bytes, but only if the vendor-specific probe
-    * actually succeeded; on failure the caches hold nothing meaningful to
-    * scale. */
-   if (0 == ret) {
-      I1c->size *= 1024;
-      D1c->size *= 1024;
-      L2c->size *= 1024;
-   }
-      
-   return ret;
-}
-
-/* Validates one cache configuration and repairs it in place, warning the
- * user about every repair.  dflt supplies the replacement for any field
- * that is not a power of two; name is the label used in the warnings. */
-static 
-void check_cache(cache_t* cache, cache_t* dflt, Char *name)
-{
-   /* Step 1: size, associativity and line size must each be a power of
-    * two, otherwise the corresponding default is substituted. */
-   if (VG_(log2)(cache->size) == -1) {
-      VG_(message)(Vg_UserMsg,
-         "warning: %s size of %dB not a power of two; "
-         "defaulting to %dB",
-         name, cache->size, dflt->size);
-      cache->size = dflt->size;
-   }
-   if (VG_(log2)(cache->assoc) == -1) {
-      VG_(message)(Vg_UserMsg,
-         "warning: %s associativity of %d not a power of two; "
-         "defaulting to %d-way",
-         name, cache->assoc, dflt->assoc);
-      cache->assoc = dflt->assoc;
-   }
-   if (VG_(log2)(cache->line_size) == -1) {
-      VG_(message)(Vg_UserMsg,
-         "warning: %s line size of %dB not a power of two; "
-         "defaulting to %dB",
-         name, cache->line_size, dflt->line_size);
-      cache->line_size = dflt->line_size;
-   }
-
-   /* Step 2: lines shorter than MIN_LINE_SIZE would let one instruction
-    * straddle three cache lines, which breaks a simulation assertion. */
-   if (MIN_LINE_SIZE > cache->line_size) {
-      VG_(message)(Vg_UserMsg,
-         "warning: %s line size of %dB too small; "
-         "increasing to %dB",
-         name, cache->line_size, MIN_LINE_SIZE);
-      cache->line_size = MIN_LINE_SIZE;
-   }
-
-   /* Step 3: the cache must be bigger than a single line (seg faults
-    * otherwise); grow it to two lines. */
-   if (cache->line_size >= cache->size) {
-      VG_(message)(Vg_UserMsg,
-         "warning: %s cache size of %dB <= line size of %dB; "
-         "increasing to %dB",
-         name, cache->size, cache->line_size, cache->line_size * 2);
-      cache->size = cache->line_size * 2;
-   }
-
-   /* Step 4: there must be at least as many lines as ways (seg faults
-    * otherwise); grow the cache to exactly one line per way. */
-   if ((cache->size / cache->line_size) < cache->assoc) {
-      VG_(message)(Vg_UserMsg,
-         "warning: %s associativity > (size / line size); "
-         "increasing size to %dB",
-         name, cache->assoc * cache->line_size);
-      cache->size = cache->assoc * cache->line_size;
-   }
-}
-
-/* Decides the three cache configurations to simulate.  The arguments are
- * pure outputs.  Precedence per cache: a command-line setting first, then
- * what CPUID reports, then a built-in default.  The results are sanity
- * checked (and fixed up if necessary) before returning, and printed when
- * verbosity is above 1. */
-static 
-void get_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
-{
-   /* Defaults are for a model 3 or 4 Athlon */
-   cache_t I1_dflt = (cache_t) {  65536, 2, 64 };
-   cache_t D1_dflt = (cache_t) {  65536, 2, 64 };
-   cache_t L2_dflt = (cache_t) { 262144, 8, 64 };
-
-   /* A cache counts as given on the command line as soon as any one of its
-    * three fields differs from the -1 "unset" marker. */
-#define CMD_LINE_DEFINED(L)                 \
-   (-1 != VG_(clo_##L##_cache).size  ||     \
-    -1 != VG_(clo_##L##_cache).assoc ||     \
-    -1 != VG_(clo_##L##_cache).line_size)
-
-   Bool I1_given = CMD_LINE_DEFINED(I1);
-   Bool D1_given = CMD_LINE_DEFINED(D1);
-   Bool L2_given = CMD_LINE_DEFINED(L2);
-
-#undef CMD_LINE_DEFINED
-
-   /* Start from whatever the command line said. */
-   *I1c = VG_(clo_I1_cache);
-   *D1c = VG_(clo_D1_cache);
-   *L2c = VG_(clo_L2_cache);
-
-   /* Something is still missing: ask the processor. */
-   if (! (I1_given && D1_given && L2_given)) { 
-
-      if (0 != get_caches_from_CPUID(I1c, D1c, L2c)) {
-         /* CPUID failed, fall back to the defaults */
-         VG_(message)(Vg_DebugMsg, 
-                      "Couldn't detect cache configuration, using one "
-                      "or more defaults ");
-
-         *I1c = I1_dflt;
-         *D1c = D1_dflt;
-         *L2c = L2_dflt;
-      }
-
-      /* Either way, a cache given on the command line wins over what was
-       * just filled in. */
-      if (I1_given) *I1c = VG_(clo_I1_cache);
-      if (D1_given) *D1c = VG_(clo_D1_cache);
-      if (L2_given) *L2c = VG_(clo_L2_cache);
-   }
-
-   check_cache(I1c, &I1_dflt, "I1");
-   check_cache(D1c, &D1_dflt, "D1");
-   check_cache(L2c, &L2_dflt, "L2");
-
-   if (VG_(clo_verbosity) > 1) {
-      VG_(message)(Vg_UserMsg, "Cache configuration used:");
-      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
-                   I1c->size, I1c->assoc, I1c->line_size);
-      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines",
-                   D1c->size, D1c->assoc, D1c->line_size);
-      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines",
-                   L2c->size, L2c->assoc, L2c->line_size);
-   }
-}
-
-/* One-time start-up of the cache simulator: verifies the output file is
- * writable, zeroes the total and discard cost centres, picks the cache
- * configurations, initialises the three simulated caches and finally the
- * BBCC table. */
-void VG_(init_cachesim)(void)
-{
-   cache_t I1c, D1c, L2c; 
-   Int     fd;
-
-   /* Make sure the output file can be written: open it, or create it if
-    * opening fails; complain if neither works. */
-   fd = VG_(open_write)(OUT_FILE);
-   if (-1 == fd)
-      fd = VG_(create_and_write)(OUT_FILE);
-   if (-1 == fd)
-      file_err(); 
-   VG_(close)(fd);
-
-   /* Instruction-read, data-read and data-write counters, for both the
-    * running totals and the discarded-translation totals. */
-   initCC(&Ir_total);
-   initCC(&Ir_discards);
-   initCC(&Dr_total);
-   initCC(&Dr_discards);
-   initCC(&Dw_total);
-   initCC(&Dw_discards);
-
-   get_caches(&I1c, &D1c, &L2c);
-
-   cachesim_I1_initcache(I1c);
-   cachesim_D1_initcache(D1c);
-   cachesim_L2_initcache(L2c);
-
-   init_BBCC_table();
-}
-
-/* Records one execution of an instruction that has no data access: passes
- * its instruction fetch (cc->instr_addr, cc->instr_size) to the I1
- * simulation, handing it the cost centre's I.m1 / I.m2 miss counters, and
- * bumps the instruction access count cc->I.a. */
-void VG_(cachesim_log_non_mem_instr)(iCC* cc)
-{
-   //VG_(printf)("sim  I: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
-   //            cc, cc->instr_addr, cc->instr_size)
-   VGP_PUSHCC(VgpCacheSimulate);
-   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
-   cc->I.a++;
-   VGP_POPCC;
-}
-
-/* Records one execution of an instruction that accesses memory.  The
- * instruction fetch goes through the I1 simulation exactly as for a
- * non-memory instruction; the data access of cc->data_size bytes at
- * data_addr then goes through the D1 simulation with the D.m1 / D.m2 miss
- * counters.  Both access counts (I.a and D.a) are incremented. */
-void VG_(cachesim_log_mem_instr)(idCC* cc, Addr data_addr)
-{
-   //VG_(printf)("sim  D: CCaddr=0x%x, iaddr=0x%x, isize=%u, daddr=0x%x, dsize=%u\n",
-   //            cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size)
-   VGP_PUSHCC(VgpCacheSimulate);
-   cachesim_I1_doref(cc->instr_addr, cc->instr_size, &cc->I.m1, &cc->I.m2);
-   cc->I.a++;
-
-   cachesim_D1_doref(data_addr,      cc->data_size,  &cc->D.m1, &cc->D.m2);
-   cc->D.a++;
-   VGP_POPCC;
-}
-
-/*------------------------------------------------------------*/
-/*--- Printing of output file and summary stats            ---*/
-/*------------------------------------------------------------*/
-
-/* Writes every cost centre of one basic block to fd, one line per
- * instruction ("<line number> <cost centre text>"), preceded by a blank
- * line marking the start of the block.
- *
- * first_instr_fl / first_instr_fn are the file and function names under
- * which the caller has already filed this block.  If an instruction's debug
- * info names a different file, an "fi=" line announces the switch, and an
- * "fe=" line switches back at the end of the block if needed.
- *
- * Side effects: each cost centre is passed to ADD_CC_TO with Ir_total,
- * Dr_total or Dw_total (the macro is defined elsewhere in this file;
- * presumably it accumulates the CC into that total -- confirm there), and
- * distinct_instrs is incremented once per cost centre.
- */
-static void fprint_BBCC(Int fd, BBCC* BBCC_node, Char *first_instr_fl, 
-                                                 Char *first_instr_fn)
-{
-   Addr BBCC_ptr0, BBCC_ptr;
-   Char buf[BUF_LEN], curr_file[BUF_LEN], 
-        fbuf[BUF_LEN+4], lbuf[LINE_BUF_LEN];
-   UInt line_num;
-
-   /* BBCC_ptr walks the block's CC array byte-wise; BBCC_ptr0 remembers
-    * where it started so the distance covered can be checked. */
-   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
-
-   /* Mark start of basic block in output, just to ease debugging */
-   VG_(write)(fd, (void*)"\n", 1);  
-
-   VG_(strcpy)(curr_file, first_instr_fl);
-   
-   while (BBCC_ptr - BBCC_ptr0 < BBCC_node->array_size) {
-
-      /* We pretend the CC is an iCC for getting the tag.  This is ok
-       * because both CC types have tag as their first byte.  Once we know
-       * the type, we can cast and act appropriately. */
-
-      Char fl_buf[FILENAME_LEN];
-      Char fn_buf[FN_NAME_LEN];
-
-      Addr instr_addr;
-      switch ( ((iCC*)BBCC_ptr)->tag ) {
-
-         /* Instruction with no data access: instruction counts only. */
-         case INSTR_CC:
-            instr_addr = ((iCC*)BBCC_ptr)->instr_addr;
-            sprint_iCC(buf, (iCC*)BBCC_ptr);
-            ADD_CC_TO(iCC, I, Ir_total);
-            BBCC_ptr += sizeof(iCC);
-            break;
-
-         /* Reads and modifies both have their data part counted as a read. */
-         case READ_CC:
-         case  MOD_CC:
-            instr_addr = ((idCC*)BBCC_ptr)->instr_addr;
-            sprint_read_or_mod_CC(buf, (idCC*)BBCC_ptr);
-            ADD_CC_TO(idCC, I, Ir_total);
-            ADD_CC_TO(idCC, D, Dr_total);
-            BBCC_ptr += sizeof(idCC);
-            break;
-
-         /* Writes have their data part counted as a write. */
-         case WRITE_CC:
-            instr_addr = ((idCC*)BBCC_ptr)->instr_addr;
-            sprint_write_CC(buf, (idCC*)BBCC_ptr);
-            ADD_CC_TO(idCC, I, Ir_total);
-            ADD_CC_TO(idCC, D, Dw_total);
-            BBCC_ptr += sizeof(idCC);
-            break;
-
-         default:
-            VG_(panic)("Unknown CC type in fprint_BBCC()\n");
-            break;
-      }
-      distinct_instrs++;
-      
-      get_debug_info(instr_addr, fl_buf, fn_buf, &line_num);
-
-      /* Allow for filename switching in the middle of a BB;  if this happens,
-       * must print the new filename with the function name. */
-      if (0 != VG_(strcmp)(fl_buf, curr_file)) {
-         VG_(strcpy)(curr_file, fl_buf);
-         VG_(sprintf)(fbuf, "fi=%s\n", curr_file);
-         VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
-      }
-
-      /* If the function name for this instruction doesn't match that of the
-       * first instruction in the BB, print warning. */
-      if (VG_(clo_trace_symtab) && 0 != VG_(strcmp)(fn_buf, first_instr_fn)) {
-         VG_(printf)("Mismatched function names\n");
-         VG_(printf)("  filenames: BB:%s, instr:%s;"
-                     "  fn_names:  BB:%s, instr:%s;"
-                     "  line: %d\n", 
-                     first_instr_fl, fl_buf, 
-                     first_instr_fn, fn_buf, 
-                     line_num);
-      }
-
-      VG_(sprintf)(lbuf, "%u ", line_num);
-      VG_(write)(fd, (void*)lbuf, VG_(strlen)(lbuf));   /* line number */
-      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));     /* cost centre */
-   }
-   /* If we switched filenames in the middle of the BB without switching back,
-    * switch back now because the subsequent BB may be relying on falling under
-    * the original file name. */
-   if (0 != VG_(strcmp)(first_instr_fl, curr_file)) {
-      VG_(sprintf)(fbuf, "fe=%s\n", first_instr_fl);
-      VG_(write)(fd, (void*)fbuf, VG_(strlen)(fbuf));
-   }
-
-   /* Mark end of basic block */
-   /* VG_(write)(fd, (void*)"#}\n", 3); */
-
-   /* The walk must have consumed the CC array exactly. */
-   vg_assert(BBCC_ptr - BBCC_ptr0 == BBCC_node->array_size);
-}
-
-/* Writes the whole profile to OUT_FILE and, as a side effect of walking
- * every cost centre, computes Ir_total, Dr_total and Dw_total.
- *
- * Layout of the file: three "desc:" lines (cache configurations), a "cmd:"
- * line built from client_argv[0 .. client_argc-1], an "events:" line, then
- * for every file / function / basic block in BBCC_table an "fl=" line, an
- * "fn=" line and the block's cost centres, then one "(discarded)" entry if
- * any translations were discarded, and finally the "summary:" line.
- *
- * Note that this function pushes the VgpCacheDump profiling cost centre and
- * does not pop it.
- */
-static void fprint_BBCC_table_and_calc_totals(Int client_argc, 
-                                              Char** client_argv)
-{
-   Int        fd;
-   Char       buf[BUF_LEN];
-   file_node *curr_file_node;
-   fn_node   *curr_fn_node;
-   BBCC      *curr_BBCC;
-   Int        i,j,k;
-
-   VGP_PUSHCC(VgpCacheDump);
-   fd = VG_(open_write)(OUT_FILE);
-   if (-1 == fd) { file_err(); }
-
-   /* "desc:" lines (giving I1/D1/L2 cache configuration) */
-   VG_(sprintf)(buf, "desc: I1 cache:         %s\n", I1.desc_line);
-   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-   VG_(sprintf)(buf, "desc: D1 cache:         %s\n", D1.desc_line);
-   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-   VG_(sprintf)(buf, "desc: L2 cache:         %s\n", L2.desc_line);
-   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-
-   /* "cmd:" line.  Each argument is written straight from client_argv
-    * instead of being formatted into buf first: an argument can be
-    * arbitrarily long and would overflow the fixed-size buffer. */
-   VG_(strcpy)(buf, "cmd:");
-   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-   for (i = 0; i < client_argc; i++) {
-       VG_(write)(fd, (void*)" ", 1);
-       VG_(write)(fd, (void*)client_argv[i], VG_(strlen)(client_argv[i]));
-   }
-   /* "events:" line */
-   VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw\n");
-   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-
-   /* Six loops here:  three for the hash table arrays, and three for the
-    * chains hanging off the hash table arrays. */
-   for (i = 0; i < N_FILE_ENTRIES; i++) {
-      curr_file_node = BBCC_table[i];
-      while (curr_file_node != NULL) {
-         VG_(sprintf)(buf, "fl=%s\n", curr_file_node->filename);
-         VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-
-         for (j = 0; j < N_FN_ENTRIES; j++) {
-            curr_fn_node = curr_file_node->fns[j];
-            while (curr_fn_node != NULL) {
-               VG_(sprintf)(buf, "fn=%s\n", curr_fn_node->fn_name);
-               VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-
-               for (k = 0; k < N_BBCC_ENTRIES; k++) {
-                  curr_BBCC = curr_fn_node->BBCCs[k];
-                  while (curr_BBCC != NULL) {
-                     fprint_BBCC(fd, curr_BBCC, 
-                             
-                             curr_file_node->filename,
-                             curr_fn_node->fn_name);
-
-                     curr_BBCC = curr_BBCC->next;
-                  }
-               }
-               curr_fn_node = curr_fn_node->next;
-            }
-         }
-         curr_file_node = curr_file_node->next;
-      }
-   }
-
-   /* Print stats from any discarded basic blocks */
-   if (0 != Ir_discards.a) {
-
-      VG_(sprintf)(buf, "fl=(discarded)\n");
-      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-      VG_(sprintf)(buf, "fn=(discarded)\n");
-      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-
-      /* Use 0 as line number */
-      VG_(sprintf)(buf, "0 %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
-                   Ir_discards.a, Ir_discards.m1, Ir_discards.m2, 
-                   Dr_discards.a, Dr_discards.m1, Dr_discards.m2, 
-                   Dw_discards.a, Dw_discards.m1, Dw_discards.m2);
-      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-
-      /* The discarded counts belong in the grand totals as well. */
-      Ir_total.a  += Ir_discards.a;
-      Ir_total.m1 += Ir_discards.m1;
-      Ir_total.m2 += Ir_discards.m2;
-      Dr_total.a  += Dr_discards.a;
-      Dr_total.m1 += Dr_discards.m1;
-      Dr_total.m2 += Dr_discards.m2;
-      Dw_total.a  += Dw_discards.a;
-      Dw_total.m1 += Dw_discards.m1;
-      Dw_total.m2 += Dw_discards.m2;
-   }
-
-   /* Summary stats must come after rest of table, since we calculate them
-    * during traversal.  */ 
-   VG_(sprintf)(buf, "summary: "
-                     "%llu %llu %llu "
-                     "%llu %llu %llu "
-                     "%llu %llu %llu\n", 
-                     Ir_total.a, Ir_total.m1, Ir_total.m2,
-                     Dr_total.a, Dr_total.m1, Dr_total.m2,
-                     Dw_total.a, Dw_total.m1, Dw_total.m2);
-   VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
-   VG_(close)(fd);
-}
-
-/* Formats n in decimal with a comma between each group of three digits,
- * right-justified with leading spaces in a field field_width characters
- * wide, and leaves the NUL-terminated result in buf.  A number wider than
- * the field is printed in full.
- *
- * Returns the length of the commified number itself, excluding any padding
- * (so calling with field_width 0 measures the width a column needs). */
-static
-Int commify(ULong n, int field_width, char buf[COMMIFY_BUF_LEN])
-{
-   int len, n_commas, i, j, new_len, space;
-
-   /* n is 64 bits wide, so use the same %llu conversion as the other ULong
-    * output in this file. */
-   VG_(sprintf)(buf, "%llu", n);
-   len = VG_(strlen)(buf);
-   n_commas = (len - 1) / 3;
-   new_len = len + n_commas;
-   space = field_width - new_len;
-
-   /* Allow for printing a number in a field_width smaller than its size */
-   if (space < 0) space = 0;    
-
-   /* Shift the digits right, working from the end of the string, dropping a
-    * comma in after every third digit.  Make j = -1 because we copy the
-    * '\0' before doing the numbers in groups of three. */
-   for (j = -1, i = len ; i >= 0; i--) {
-      buf[i + n_commas + space] = buf[i];
-
-      /* No comma goes in front of the leading digit.  Without the i > 0
-       * test, a digit count that is a multiple of three put one at
-       * buf[space - 1], i.e. just before the start of buf when space is 0. */
-      if (i > 0 && 3 == ++j) {
-         j = 0;
-         n_commas--;
-         buf[i + n_commas + space] = ',';
-      }
-   }
-   /* Right justify in field. */
-   for (i = 0; i < space; i++)  buf[i] = ' ';
-   return new_len;
-}
-
-/* Formats the fixed-point value n / pow as a percentage, e.g. n = 505 with
- * pow = 100 gives "5.05%", right-justified with leading spaces in a field
- * field_width characters wide, and leaves the NUL-terminated result in buf.
- * pow is expected to be a power of ten; it sets the number of decimal
- * places (10 -> one, 100 -> two).  A result wider than the field is left
- * unpadded. */
-static
-void percentify(Int n, Int pow, Int field_width, char buf[]) 
-{
-   int i, len, space, scale;
-   Int frac = n % pow;
-    
-   /* Integer part and decimal point. */
-   VG_(sprintf)(buf, "%d.", n / pow);
-   len = VG_(strlen)(buf);
-
-   /* Leading zeros of the fractional part: one for every decimal place that
-    * frac is too small to reach, so that 5/100 prints as ".05", not ".5". */
-   for (scale = pow / 10; scale > 1 && frac < scale; scale /= 10)
-      buf[len++] = '0';
-
-   VG_(sprintf)(buf + len, "%d%%", frac);
-   len = VG_(strlen)(buf);
-   space = field_width - len;
-
-   /* A field narrower than the text must not shift the text to the left of
-    * buf's start. */
-   if (space < 0) space = 0;
-
-   /* Right justify in field */
-   for (i = len; i >= 0;    i--)  buf[i + space] = buf[i];
-   for (i = 0; i < space; i++)  buf[i] = ' ';
-}
-
-/* Returns misses as a percentage of refs, scaled by p (the fixed-point
- * form percentify() takes), or 0 when there were no references at all. */
-static
-Int scaled_miss_rate(ULong misses, ULong refs, Int p)
-{
-   if (0 == refs) return 0;
-   return (Int)(misses * 100 * p / refs);
-}
-
-/* Writes the output file (which also computes the totals) and then, unless
- * verbosity is 0, prints the summary of I1, D1 and L2 references, misses
- * and miss rates.  With verbosity above 1 the hash table / debug info
- * statistics are printed as well.
- *
- * client_argc / client_argv are only passed on for the output file's
- * "cmd:" line.
- *
- * This function pops the profiling cost centre that
- * fprint_BBCC_table_and_calc_totals() pushed. */
-void VG_(do_cachesim_results)(Int client_argc, Char** client_argv)
-{
-   CC D_total;
-   ULong L2_total_m, L2_total_mr, L2_total_mw,
-         L2_total, L2_total_r, L2_total_w;
-   char buf1[RESULTS_BUF_LEN], 
-        buf2[RESULTS_BUF_LEN], 
-        buf3[RESULTS_BUF_LEN];
-   Int l1, l2, l3;
-   Int p;
-
-   fprint_BBCC_table_and_calc_totals(client_argc, client_argv);
-
-   if (VG_(clo_verbosity) == 0) {
-      /* Keep the profiling stack balanced on this path too. */
-      VGP_POPCC;
-      return;
-   }
-
-   /* I cache results.  Use the I_refs value to determine the first column
-    * width. */
-   l1 = commify(Ir_total.a, 0, buf1);
-   VG_(message)(Vg_UserMsg, "I   refs:      %s", buf1);
-
-   commify(Ir_total.m1, l1, buf1);
-   VG_(message)(Vg_UserMsg, "I1  misses:    %s", buf1);
-
-   commify(Ir_total.m2, l1, buf1);
-   VG_(message)(Vg_UserMsg, "L2i misses:    %s", buf1);
-
-   /* Instruction miss rates are shown with two decimal places. */
-   p = 100;
-
-   percentify(scaled_miss_rate(Ir_total.m1, Ir_total.a, p), p, l1+1, buf1);
-   VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);
-                
-   percentify(scaled_miss_rate(Ir_total.m2, Ir_total.a, p), p, l1+1, buf1);
-   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
-   VG_(message)(Vg_UserMsg, "");
-
-   /* D cache results.  Use the D_refs.rd and D_refs.wr values to determine the
-    * width of columns 2 & 3. */
-   D_total.a  = Dr_total.a  + Dw_total.a;
-   D_total.m1 = Dr_total.m1 + Dw_total.m1;
-   D_total.m2 = Dr_total.m2 + Dw_total.m2;
-       
-        commify( D_total.a, l1, buf1);
-   l2 = commify(Dr_total.a, 0,  buf2);
-   l3 = commify(Dw_total.a, 0,  buf3);
-   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)",
-                buf1,  buf2,  buf3);
-
-   commify( D_total.m1, l1, buf1);
-   commify(Dr_total.m1, l2, buf2);
-   commify(Dw_total.m1, l3, buf3);
-   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)",
-                buf1, buf2, buf3);
-
-   commify( D_total.m2, l1, buf1);
-   commify(Dr_total.m2, l2, buf2);
-   commify(Dw_total.m2, l3, buf3);
-   VG_(message)(Vg_UserMsg, "L2d misses:    %s  (%s rd + %s wr)",
-                buf1, buf2, buf3);
-
-   /* Data and overall L2 miss rates are shown with one decimal place. */
-   p = 10;
-   
-   percentify(scaled_miss_rate( D_total.m1,  D_total.a, p), p, l1+1, buf1);
-   percentify(scaled_miss_rate(Dr_total.m1, Dr_total.a, p), p, l2+1, buf2);
-   percentify(scaled_miss_rate(Dw_total.m1, Dw_total.a, p), p, l3+1, buf3);
-   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
-
-   percentify(scaled_miss_rate( D_total.m2,  D_total.a, p), p, l1+1, buf1);
-   percentify(scaled_miss_rate(Dr_total.m2, Dr_total.a, p), p, l2+1, buf2);
-   percentify(scaled_miss_rate(Dw_total.m2, Dw_total.a, p), p, l3+1, buf3);
-   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
-   VG_(message)(Vg_UserMsg, "");
-
-   /* L2 overall results.  Every first-level miss is an L2 reference;
-    * instruction misses are counted with the reads. */
-
-   L2_total   = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
-   L2_total_r = Dr_total.m1 + Ir_total.m1;
-   L2_total_w = Dw_total.m1;
-   commify(L2_total,   l1, buf1);
-   commify(L2_total_r, l2, buf2);
-   commify(L2_total_w, l3, buf3);
-   VG_(message)(Vg_UserMsg, "L2 refs:       %s  (%s rd + %s wr)",
-                buf1, buf2, buf3);
-
-   L2_total_m  = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
-   L2_total_mr = Dr_total.m2 + Ir_total.m2;
-   L2_total_mw = Dw_total.m2;
-   commify(L2_total_m,  l1, buf1);
-   commify(L2_total_mr, l2, buf2);
-   commify(L2_total_mw, l3, buf3);
-   VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)",
-                buf1, buf2, buf3);
-
-   /* The overall L2 miss rate is relative to all memory references, not to
-    * the number of L2 references. */
-   percentify(scaled_miss_rate(L2_total_m,  Ir_total.a + D_total.a,  p),
-              p, l1+1, buf1);
-   percentify(scaled_miss_rate(L2_total_mr, Ir_total.a + Dr_total.a, p),
-              p, l2+1, buf2);
-   percentify(scaled_miss_rate(L2_total_mw, Dw_total.a, p), p, l3+1, buf3);
-   VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )", buf1, buf2,buf3);
-            
-
-   /* Hash table stats */
-   if (VG_(clo_verbosity) > 1) {
-       int BB_lookups = full_debug_BBs      + fn_name_debug_BBs +
-                        file_line_debug_BBs + no_debug_BBs;
-       /* Divisor for the percentages below; 1 stands in when there were no
-        * lookups so that they print as 0% instead of dividing by zero. */
-       int BB_divisor = (0 == BB_lookups) ? 1 : BB_lookups;
-      
-       VG_(message)(Vg_DebugMsg, "");
-       VG_(message)(Vg_DebugMsg, "Distinct files:   %d", distinct_files);
-       VG_(message)(Vg_DebugMsg, "Distinct fns:     %d", distinct_fns);
-       VG_(message)(Vg_DebugMsg, "BB lookups:       %d", BB_lookups);
-       VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)", 
-                    full_debug_BBs    * 100 / BB_divisor,
-                    full_debug_BBs);
-       VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)", 
-                    file_line_debug_BBs * 100 / BB_divisor,
-                    file_line_debug_BBs);
-       VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)", 
-                    fn_name_debug_BBs * 100 / BB_divisor,
-                    fn_name_debug_BBs);
-       VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)", 
-                    no_debug_BBs      * 100 / BB_divisor,
-                    no_debug_BBs);
-       VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d", BB_retranslations);
-       VG_(message)(Vg_DebugMsg, "Distinct instrs:  %d", distinct_instrs);
-   }
-   VGP_POPCC;
-}
-
-
-/* Called when a translation is invalidated due to self-modifying code or
- * unloading of a shared object.
- *
- * Finds the BBCC in the table, removes it, adds the counts to the discard
- * counters, and then frees the BBCC.
- *
- * tte identifies the discarded translation; only its orig_addr is used for
- * the lookup (orig_size appears only in the disabled debug print). */
-void VG_(cachesim_notify_discard) ( TTEntry* tte )
-{
-   BBCC *BBCC_node;
-   Addr BBCC_ptr0, BBCC_ptr;
-   Bool BB_seen_before;
-    
-   /* Debug print, disabled by the constant condition. */
-   if (0)
-   VG_(printf)( "cachesim_notify_discard: %p for %d\n", 
-                tte->orig_addr, (Int)tte->orig_size);
-
-   /* 2nd arg won't be used since BB should have been seen before (assertions
-    * ensure this). */
-   BBCC_node = get_BBCC(tte->orig_addr, NULL, True, &BB_seen_before);
-   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);
-
-   vg_assert(True == BB_seen_before);
-
-   /* Walk the block's CC array; the ADD_CC_TO macro (defined elsewhere in
-    * this file) is applied to each CC with the matching discard counter. */
-   while (BBCC_ptr - BBCC_ptr0 < BBCC_node->array_size) {
-
-      /* We pretend the CC is an iCC for getting the tag.  This is ok
-       * because both CC types have tag as their first byte.  Once we know
-       * the type, we can cast and act appropriately. */
-
-      switch ( ((iCC*)BBCC_ptr)->tag ) {
-
-         case INSTR_CC:
-            ADD_CC_TO(iCC, I, Ir_discards);
-            BBCC_ptr += sizeof(iCC);
-            break;
-
-         /* Reads and modifies both have their data part counted as a read. */
-         case READ_CC:
-         case  MOD_CC:
-            ADD_CC_TO(idCC, I, Ir_discards);
-            ADD_CC_TO(idCC, D, Dr_discards);
-            BBCC_ptr += sizeof(idCC);
-            break;
-
-         case WRITE_CC:
-            ADD_CC_TO(idCC, I, Ir_discards);
-            ADD_CC_TO(idCC, D, Dw_discards);
-            BBCC_ptr += sizeof(idCC);
-            break;
-
-         default:
-            VG_(panic)("Unknown CC type in VG_(cachesim_notify_discard)()\n");
-            break;
-      }
-   }
-
-   VG_(free)(VG_AR_PRIVATE, BBCC_node);
-}
-
-/*--------------------------------------------------------------------*/
-/*--- end                                            vg_cachesim.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/cachegrind/cg_sim_D1.c b/cachegrind/cg_sim_D1.c
deleted file mode 100644
index 7b8a8da155..0000000000
--- a/cachegrind/cg_sim_D1.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*--------------------------------------------------------------------*/
-/*--- D1 cache simulation.                                         ---*/
-/*---                                             vg_cachesim_D1.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2002 Nicholas Nethercote
-      njn25@cam.ac.uk
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_cachesim_gen.c"
-
-CACHESIM(D1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
-
-/*--------------------------------------------------------------------*/
-/*--- end                                         vg_cachesim_D1.c ---*/
-/*--------------------------------------------------------------------*/
-
diff --git a/cachegrind/cg_sim_I1.c b/cachegrind/cg_sim_I1.c
deleted file mode 100644
index 26db3b3488..0000000000
--- a/cachegrind/cg_sim_I1.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*--------------------------------------------------------------------*/
-/*--- I1 cache simulation.                                         ---*/
-/*---                                             vg_cachesim_I1.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2002 Nicholas Nethercote
-      njn25@cam.ac.uk
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_cachesim_gen.c"
-/* Define the I1 cache: an I1 miss bumps *m1 and is then looked up in L2. */
-CACHESIM(I1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
-
-/*--------------------------------------------------------------------*/
-/*--- end                                         vg_cachesim_I1.c ---*/
-/*--------------------------------------------------------------------*/
-
diff --git a/cachegrind/cg_sim_L2.c b/cachegrind/cg_sim_L2.c
deleted file mode 100644
index ec89027429..0000000000
--- a/cachegrind/cg_sim_L2.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*--------------------------------------------------------------------*/
-/*--- L2 cache simulation.                                         ---*/
-/*---                                             vg_cachesim_L2.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2002 Nicholas Nethercote
-      njn25@cam.ac.uk
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_cachesim_gen.c"
-/* Define the L2 cache: an L2 miss only increments *m2. */
-CACHESIM(L2, (*m2)++ );
-
-/*--------------------------------------------------------------------*/
-/*--- end                                         vg_cachesim_L2.c ---*/
-/*--------------------------------------------------------------------*/
-
diff --git a/cachegrind/cg_sim_gen.c b/cachegrind/cg_sim_gen.c
deleted file mode 100644
index f938bc4b0f..0000000000
--- a/cachegrind/cg_sim_gen.c
+++ /dev/null
@@ -1,212 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Generic stuff shared by all cache simulation files.          ---*/
-/*---                                            vg_cachesim_gen.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2002 Nicholas Nethercote
-      njn25@cam.ac.uk
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-/* Notes:
-  - simulates a write-allocate cache
-  - (block --> set) hash function uses simple bit selection
-  - handling of references straddling two cache blocks:
-      - counts as only one cache access (not two)
-      - both blocks hit                  --> one hit
-      - one block hits, the other misses --> one miss
-      - both blocks miss                 --> one miss (not two)
-*/
-
-#ifndef __VG_CACHESIM_GEN_C
-#define __VG_CACHESIM_GEN_C
-
-typedef struct {                        /* state of one simulated cache */
-   int          size;                   /* bytes */
-   int          assoc;                  /* lines (ways) per set */
-   int          line_size;              /* bytes */
-   int          sets;                   /* (size / line_size) / assoc */
-   int          sets_min_1;             /* sets - 1; masks a set index */
-   int          assoc_bits;             /* log2(assoc) */
-   int          line_size_bits;         /* log2(line_size) */
-   int          tag_shift;              /* line_size_bits + log2(sets) */
-   char         desc_line[128];         /* printable summary of the config */
-   int*         tags;                   /* assoc tags per set, MRU first */
-} cache_t2;
-
-/* Fill in *c from config; size/assoc/line_size have already been checked. */
-static void cachesim_initcache(cache_t config, cache_t2* c)
-{
-   int n_tags;
-
-   c->line_size = config.line_size;
-   c->assoc     = config.assoc;
-   c->size      = config.size;
-   /* Derived geometry: set count, way/line shift amounts, set-index mask. */
-   c->sets           = (c->size / c->line_size) / c->assoc;
-   c->assoc_bits     = VG_(log2)(c->assoc);
-   c->line_size_bits = VG_(log2)(c->line_size);
-   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
-   c->sets_min_1     = c->sets - 1;
-   /* Printable one-line summary of the configuration. */
-   if (c->assoc != 1) {
-      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative",
-                                 c->size, c->line_size, c->assoc);
-   } else {
-      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped",
-                                 c->size, c->line_size);
-   }
-   /* One tag slot per cache line, every slot starting out as zero. */
-   n_tags  = c->sets * c->assoc;
-   c->tags = VG_(malloc)(VG_AR_PRIVATE, sizeof(UInt) * c->sets * c->assoc);
-
-   while (n_tags > 0)
-      c->tags[--n_tags] = 0;
-}
-
-#if 0
-static void print_cache(cache_t2* c)
-{
-   UInt s, w, idx = 0;
-
-   /* Dump every tag in hex, one output row per set; idx walks c->tags. */
-   for (s = 0; s < c->sets; s++) {
-      for (w = 0; w < c->assoc; w++) {
-         VG_(printf)("%8x ", c->tags[idx++]);
-      }
-      VG_(printf)("\n");
-   }
-}
-#endif 
-
-/* CACHESIM(L, MISS_TREATMENT) defines a static cache L together with its
- * initcache and doref functions; MISS_TREATMENT runs when a reference misses.
- * XXX: a macro rather than a cache_t2 arg: the indirection costs 3-5%. */
-
-#define CACHESIM(L, MISS_TREATMENT)                                         \
-/* The cache and associated bits and pieces. */                             \
-static cache_t2 L;                                                          \
-                                                                            \
-static void cachesim_##L##_initcache(cache_t config)                        \
-{                                                                           \
-    cachesim_initcache(config, &L);                                         \
-}                                                                           \
-/* Simulate a 'size'-byte access at 'a'; m1/m2 are for MISS_TREATMENT. */   \
-static __inline__                                                           \
-void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2)         \
-{                                                                           \
-   register UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);  \
-   register UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);  \
-   register UInt tag  = a >> L.tag_shift;                                   \
-   int i, j;                                                                \
-   Bool is_miss = False;               /* straddle case: some line missed */\
-   int* set;                   /* ways of the set being probed, MRU first */\
-                                                                            \
-   /* First case: word entirely within line. */                             \
-   if (set1 == set2) {                                                      \
-      /* Shifting is a bit faster than multiplying */                       \
-      set = &(L.tags[set1 << L.assoc_bits]);                                \
-                                                                            \
-      /* This loop is unrolled for just the first case, which is the most */\
-      /* common.  We can't unroll any further because it would screw up   */\
-      /* if we have a direct-mapped (1-way) cache.                        */\
-      if (tag == set[0]) {                                                  \
-         return;                                                            \
-      }                                                                     \
-      /* If the tag is one other than the MRU, move it into the MRU spot  */\
-      /* and shuffle the rest down.                                       */\
-      for (i = 1; i < L.assoc; i++) {                                       \
-         if (tag == set[i]) {                                               \
-            for (j = i; j > 0; j--) {                                       \
-               set[j] = set[j - 1];                                         \
-            }                                                               \
-            set[0] = tag;                                                   \
-            return;                                                         \
-         }                                                                  \
-      }                                                                     \
-                                                                            \
-      /* A miss;  install this tag as MRU, shuffle rest down. */            \
-      for (j = L.assoc - 1; j > 0; j--) {                                   \
-         set[j] = set[j - 1];                                               \
-      }                                                                     \
-      set[0] = tag;                                                         \
-      MISS_TREATMENT;                                                       \
-      return;                                                               \
-                                                                            \
-   /* Second case: word straddles two lines. */                             \
-   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
-   } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
-      /* Tag of the 2nd line; differs from 'tag' when set2 wraps to 0. */   \
-      UInt tag2 = (a+size-1) >> L.tag_shift;                                \
-      set = &(L.tags[set1 << L.assoc_bits]);                                \
-      if (tag == set[0]) {                                                  \
-         goto block2;                                                       \
-      }                                                                     \
-      for (i = 1; i < L.assoc; i++) {                                       \
-         if (tag == set[i]) {                                               \
-            for (j = i; j > 0; j--) {                                       \
-               set[j] = set[j - 1];                                         \
-            }                                                               \
-            set[0] = tag;                                                   \
-            goto block2;                                                    \
-         }                                                                  \
-      }                                                                     \
-      for (j = L.assoc - 1; j > 0; j--) {                                   \
-         set[j] = set[j - 1];                                               \
-      }                                                                     \
-      set[0] = tag;                                                         \
-      is_miss = True;                                                       \
-block2:                                                                     \
-      set = &(L.tags[set2 << L.assoc_bits]);                                \
-      if (tag2 == set[0]) {                                                 \
-         goto miss_treatment;                                               \
-      }                                                                     \
-      for (i = 1; i < L.assoc; i++) {                                       \
-         if (tag2 == set[i]) {                                              \
-            for (j = i; j > 0; j--) {                                       \
-               set[j] = set[j - 1];                                         \
-            }                                                               \
-            set[0] = tag2;                                                  \
-            goto miss_treatment;                                            \
-         }                                                                  \
-      }                                                                     \
-      for (j = L.assoc - 1; j > 0; j--) {                                   \
-         set[j] = set[j - 1];                                               \
-      }                                                                     \
-      set[0] = tag2;                                                        \
-      is_miss = True;                                                       \
-miss_treatment:                                                             \
-      if (is_miss) { MISS_TREATMENT; }                                      \
-   } else {                                                                 \
-       VG_(panic)("item straddles more than two cache sets");               \
-   }                                                                        \
-   return;                                                                  \
-}
-
-#endif  /* ndef __VG_CACHESIM_GEN_C */
-
-/*--------------------------------------------------------------------*/
-/*--- end                                        vg_cachesim_gen.c ---*/
-/*--------------------------------------------------------------------*/
-
diff --git a/cachegrind/docs/Makefile.am b/cachegrind/docs/Makefile.am
deleted file mode 100644
index e8a58fa18e..0000000000
--- a/cachegrind/docs/Makefile.am
+++ /dev/null
@@ -1,5 +0,0 @@
-docdir = $(datadir)/doc/valgrind
-# HTML manual pages; automake installs doc_DATA files into $(docdir).
-doc_DATA = index.html manual.html nav.html techdocs.html
-# Ship the same files in the distribution tarball.
-EXTRA_DIST = $(doc_DATA)
diff --git a/cachegrind/docs/index.html b/cachegrind/docs/index.html
deleted file mode 100644
index 1111702565..0000000000
--- a/cachegrind/docs/index.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<html>
-
-<head>
-  <meta http-equiv="Content-Type"     
-        content="text/html; charset=iso-8859-1">
-  <meta http-equiv="Content-Language" content="en-gb">
-  <meta name="generator" 
-        content="Mozilla/4.76 (X11; U; Linux 2.4.1-0.1.9 i586) [Netscape]">
-  <meta name="author" content="Julian Seward <jseward@acm.org>">
-  <meta name="description" content="say what this prog does">
-  <meta name="keywords" content="Valgrind, memory checker, x86, GPL">
-  <title>Valgrind's user manual</title>
-</head>
-
-<frameset cols="150,*">
-  <frame name="nav" target="main" src="nav.html">
-  <frame name="main" src="manual.html" scrolling="auto">
-  <noframes>
-    <body>
-     <p>This page uses frames, but your browser doesn't support them.</p>
-    </body>
-  </noframes>
-</frameset>
-
-</html>
diff --git a/cachegrind/docs/manual.html b/cachegrind/docs/manual.html
deleted file mode 100644
index b715ee3dfe..0000000000
--- a/cachegrind/docs/manual.html
+++ /dev/null
@@ -1,2702 +0,0 @@
-<html>
-  <head>
-    <style type="text/css">
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-    <title>Valgrind</title>
-  </head>
-
-<body bgcolor="#ffffff">
-
-<a name="title">&nbsp;</a>
-<h1 align=center>Valgrind, version 1.0.0</h1>
-<center>This manual was last updated on 20020726</center>
-<p>
-
-<center>
-<a href="mailto:jseward@acm.org">jseward@acm.org</a><br>
-Copyright &copy; 2000-2002 Julian Seward
-<p>
-Valgrind is licensed under the GNU General Public License, 
-version 2<br>
-An open-source tool for finding memory-management problems in
-Linux-x86 executables.
-</center>
-
-<p>
-
-<hr width="100%">
-<a name="contents"></a>
-<h2>Contents of this manual</h2>
-
-<h4>1&nbsp; <a href="#intro">Introduction</a></h4>
-    1.1&nbsp; <a href="#whatfor">What Valgrind is for</a><br>
-    1.2&nbsp; <a href="#whatdoes">What it does with your program</a>
-
-<h4>2&nbsp; <a href="#howtouse">How to use it, and how to make sense 
-    of the results</a></h4>
-    2.1&nbsp; <a href="#starta">Getting started</a><br>
-    2.2&nbsp; <a href="#comment">The commentary</a><br>
-    2.3&nbsp; <a href="#report">Reporting of errors</a><br>
-    2.4&nbsp; <a href="#suppress">Suppressing errors</a><br>
-    2.5&nbsp; <a href="#flags">Command-line flags</a><br>
-    2.6&nbsp; <a href="#errormsgs">Explanation of error messages</a><br>
-    2.7&nbsp; <a href="#suppfiles">Writing suppressions files</a><br>
-    2.8&nbsp; <a href="#clientreq">The Client Request mechanism</a><br>
-    2.9&nbsp; <a href="#pthreads">Support for POSIX pthreads</a><br>
-    2.10&nbsp; <a href="#install">Building and installing</a><br>
-    2.11&nbsp; <a href="#problems">If you have problems</a><br>
-
-<h4>3&nbsp; <a href="#machine">Details of the checking machinery</a></h4>
-    3.1&nbsp; <a href="#vvalue">Valid-value (V) bits</a><br>
-    3.2&nbsp; <a href="#vaddress">Valid-address (A)&nbsp;bits</a><br>
-    3.3&nbsp; <a href="#together">Putting it all together</a><br>
-    3.4&nbsp; <a href="#signals">Signals</a><br>
-    3.5&nbsp; <a href="#leaks">Memory leak detection</a><br>
-
-<h4>4&nbsp; <a href="#limits">Limitations</a></h4>
-
-<h4>5&nbsp; <a href="#howitworks">How it works -- a rough overview</a></h4>
-    5.1&nbsp; <a href="#startb">Getting started</a><br>
-    5.2&nbsp; <a href="#engine">The translation/instrumentation engine</a><br>
-    5.3&nbsp; <a href="#track">Tracking the status of memory</a><br>
-    5.4&nbsp; <a href="#sys_calls">System calls</a><br>
-    5.5&nbsp; <a href="#sys_signals">Signals</a><br>
-
-<h4>6&nbsp; <a href="#example">An example</a></h4>
-
-<h4>7&nbsp; <a href="#cache">Cache profiling</a></h4>
-
-<h4>8&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
-
-<hr width="100%">
-
-<a name="intro"></a>
-<h2>1&nbsp; Introduction</h2>
-
-<a name="whatfor"></a>
-<h3>1.1&nbsp; What Valgrind is for</h3>
-
-Valgrind is a tool to help you find memory-management problems in your
-programs. When a program is run under Valgrind's supervision, all
-reads and writes of memory are checked, and calls to
-malloc/new/free/delete are intercepted. As a result, Valgrind can
-detect problems such as:
-<ul>
-  <li>Use of uninitialised memory</li>
-  <li>Reading/writing memory after it has been free'd</li>
-  <li>Reading/writing off the end of malloc'd blocks</li>
-  <li>Reading/writing inappropriate areas on the stack</li>
-  <li>Memory leaks -- where pointers to malloc'd blocks are lost
-  forever</li>
-  <li>Mismatched use of malloc/new/new [] vs free/delete/delete
-  []</li>
-  <li>Some misuses of the POSIX pthreads API</li>
-</ul>
-
-Problems like these can be difficult to find by other means, often
-lying undetected for long periods, then causing occasional,
-difficult-to-diagnose crashes.
-
-<p>
-Valgrind is closely tied to details of the CPU, operating system and
-to a lesser extent, compiler and basic C libraries. This makes it
-difficult to make it portable, so I have chosen at the outset to
-concentrate on what I believe to be a widely used platform: Linux on
-x86s.  Valgrind uses the standard Unix <code>./configure</code>,
-<code>make</code>, <code>make install</code> mechanism, and I have
-attempted to ensure that it works on machines with kernel 2.2 or 2.4
-and glibc 2.1.X or 2.2.X.  This should cover the vast majority of
-modern Linux installations.
-
-
-<p>
-Valgrind is licensed under the GNU General Public License, version
-2. Read the file LICENSE in the source distribution for details.  Some
-of the PThreads test cases, <code>test/pth_*.c</code>, are taken from
-"Pthreads Programming" by Bradford Nichols, Dick Buttlar &amp; Jacqueline
-Proulx Farrell, ISBN 1-56592-115-1, published by O'Reilly &amp;
-Associates, Inc.
-
-
-<a name="whatdoes"></a>
-<h3>1.2&nbsp; What it does with your program</h3>
-
-Valgrind is designed to be as non-intrusive as possible. It works
-directly with existing executables. You don't need to recompile,
-relink, or otherwise modify, the program to be checked. Simply place
-the word <code>valgrind</code> at the start of the command line
-normally used to run the program. So, for example, if you want to run
-the command <code>ls -l</code> on Valgrind, simply issue the
-command: <code>valgrind ls -l</code>.
-
-<p>Valgrind takes control of your program before it starts. Debugging
-information is read from the executable and associated libraries, so
-that error messages can be phrased in terms of source code
-locations. Your program is then run on a synthetic x86 CPU which
-checks every memory access. All detected errors are written to a
-log. When the program finishes, Valgrind searches for and reports on
-leaked memory.
-
-<p>You can run pretty much any dynamically linked ELF x86 executable
-using Valgrind. Programs run 25 to 50 times slower, and take a lot
-more memory, than they usually would. It works well enough to run
-large programs. For example, the Konqueror web browser from the KDE
-Desktop Environment, version 3.0, runs slowly but usably on Valgrind.
-
-<p>Valgrind simulates every single instruction your program executes.
-Because of this, it finds errors not only in your application but also
-in all supporting dynamically-linked (<code>.so</code>-format)
-libraries, including the GNU C library, the X client libraries, Qt, if
-you work with KDE, and so on. That often includes libraries, for
-example the GNU C library, which contain memory access violations, but
-which you cannot or do not want to fix.
-
-<p>Rather than swamping you with errors in which you are not
-interested, Valgrind allows you to selectively suppress errors, by
-recording them in a suppressions file which is read when Valgrind
-starts up.  The build mechanism attempts to select suppressions which
-give reasonable behaviour for the libc and XFree86 versions detected
-on your machine.
-
-
-<p><a href="#example">Section 6</a> shows an example of use.
-<p>
-<hr width="100%">
-
-<a name="howtouse"></a>
-<h2>2&nbsp; How to use it, and how to make sense of the results</h2>
-
-<a name="starta"></a>
-<h3>2.1&nbsp; Getting started</h3>
-
-First off, consider whether it might be beneficial to recompile your
-application and supporting libraries with optimisation disabled and
-debugging info enabled (the <code>-g</code> flag).  You don't have to
-do this, but doing so helps Valgrind produce more accurate and less
-confusing error reports.  Chances are you're set up like this already,
-if you intended to debug your program with GNU gdb, or some other
-debugger.  
-
-<p>
-A plausible compromise is to use <code>-g -O</code>.
-Optimisation levels above <code>-O</code> have been observed, on very
-rare occasions, to cause gcc to generate code which fools Valgrind's
-error tracking machinery into wrongly reporting uninitialised value
-errors.  <code>-O</code> gets you the vast majority of the benefits of
-higher optimisation levels anyway, so you don't lose much there.
-
-<p>
-Valgrind understands both the older "stabs" debugging format, used by
-gcc versions prior to 3.1, and the newer DWARF2 format used by gcc 3.1
-and later.
-
-<p>
-Then just run your application, but place the word
-<code>valgrind</code> in front of your usual command-line invocation.
-Note that you should run the real (machine-code) executable here.  If
-your application is started by, for example, a shell or perl script,
-you'll need to modify it to invoke Valgrind on the real executables.
-Running such scripts directly under Valgrind will result in you
-getting error reports pertaining to <code>/bin/sh</code>,
-<code>/usr/bin/perl</code>, or whatever interpreter you're using.
-This almost certainly isn't what you want and can be confusing.
-
-<a name="comment"></a>
-<h3>2.2&nbsp; The commentary</h3>
-
-Valgrind writes a commentary, detailing error reports and other
-significant events.  The commentary goes to standard output by
-default.  This may interfere with your program, so you can ask for it
-to be directed elsewhere.
-
-<p>All lines in the commentary are of the following form:<br>
-<pre>
-  ==12345== some-message-from-Valgrind
-</pre>
-<p>The <code>12345</code>  is the process ID.  This scheme makes it easy
-to distinguish program output from Valgrind commentary, and also easy
-to differentiate commentaries from different processes which have
-become merged together, for whatever reason.
-
-<p>By default, Valgrind writes only essential messages to the commentary,
-so as to avoid flooding you with information of secondary importance.
-If you want more information about what is happening, re-run, passing
-the <code>-v</code> flag to Valgrind.
-
-
-<a name="report"></a>
-<h3>2.3&nbsp; Reporting of errors</h3>
-
-When Valgrind detects something bad happening in the program, an error
-message is written to the commentary.  For example:<br>
-<pre>
-  ==25832== Invalid read of size 4
-  ==25832==    at 0x8048724: BandMatrix::ReSize(int, int, int) (bogon.cpp:45)
-  ==25832==    by 0x80487AF: main (bogon.cpp:66)
-  ==25832==    by 0x40371E5E: __libc_start_main (libc-start.c:129)
-  ==25832==    by 0x80485D1: (within /home/sewardj/newmat10/bogon)
-  ==25832==    Address 0xBFFFF74C is not stack'd, malloc'd or free'd
-</pre>
-
-<p>This message says that the program did an illegal 4-byte read of
-address 0xBFFFF74C, which, as far as it can tell, is not a valid stack
-address, nor corresponds to any currently malloc'd or free'd blocks.
-The read is happening at line 45 of <code>bogon.cpp</code>, called
-from line 66 of the same file, etc.  For errors associated with an
-identified malloc'd/free'd block, for example reading free'd memory,
-Valgrind reports not only the location where the error happened, but
-also where the associated block was malloc'd/free'd.
-
-<p>Valgrind remembers all error reports.  When an error is detected,
-it is compared against old reports, to see if it is a duplicate.  If
-so, the error is noted, but no further commentary is emitted.  This
-avoids you being swamped with bazillions of duplicate error reports.
-
-<p>If you want to know how many times each error occurred, run with
-the <code>-v</code> option.  When execution finishes, all the reports
-are printed out, along with, and sorted by, their occurrence counts.
-This makes it easy to see which errors have occurred most frequently.
-
-<p>Errors are reported before the associated operation actually
-happens.  For example, if your program decides to read from address
-zero, Valgrind will emit a message to this effect, and the program
-will then duly die with a segmentation fault.
-
-<p>In general, you should try and fix errors in the order that they
-are reported.  Not doing so can be confusing.  For example, a program
-which copies uninitialised values to several memory locations, and
-later uses them, will generate several error messages.  The first such
-error message may well give the most direct clue to the root cause of
-the problem.
-
-<p>The process of detecting duplicate errors is quite an expensive
-one and can become a significant performance overhead if your program
-generates huge quantities of errors.  To avoid serious problems here,
-Valgrind will simply stop collecting errors after 300 different errors
-have been seen, or 30000 errors in total have been seen.  In this
-situation you might as well stop your program and fix it, because
-Valgrind won't tell you anything else useful after this.  Note that
-the 300/30000 limits apply after suppressed errors are removed.  These
-limits are defined in <code>vg_include.h</code> and can be increased
-if necessary.
-
-<p>To avoid this cutoff you can use the
-<code>--error-limit=no</code> flag.  Then valgrind will always show
-errors, regardless of how many there are.  Use this flag carefully,
-since it may have a dire effect on performance.
-
-
-<a name="suppress"></a>
-<h3>2.4&nbsp; Suppressing errors</h3>
-
-Valgrind detects numerous problems in the base libraries, such as the
-GNU C library, and the XFree86 client libraries, which come
-pre-installed on your GNU/Linux system.  You can't easily fix these,
-but you don't want to see these errors (and yes, there are many!)  So
-Valgrind reads a list of errors to suppress at startup.  
-A default suppression file is cooked up by the
-<code>./configure</code> script.
-
-<p>You can modify and add to the suppressions file at your leisure,
-or, better, write your own.  Multiple suppression files are allowed.
-This is useful if part of your project contains errors you can't or
-don't want to fix, yet you don't want to continuously be reminded of
-them.
-
-<p>Each error to be suppressed is described very specifically, to
-minimise the possibility that a suppression-directive inadvertently
-suppresses a bunch of similar errors which you did want to see.  The
-suppression mechanism is designed to allow precise yet flexible
-specification of errors to suppress.
-
-<p>If you use the <code>-v</code> flag, at the end of execution, Valgrind
-prints out one line for each used suppression, giving its name and the
-number of times it got used.  Here's the suppressions used by a run of
-<code>ls -l</code>:
-<pre>
-  --27579-- supp: 1 socketcall.connect(serv_addr)/__libc_connect/__nscd_getgrgid_r
-  --27579-- supp: 1 socketcall.connect(serv_addr)/__libc_connect/__nscd_getpwuid_r
-  --27579-- supp: 6 strrchr/_dl_map_object_from_fd/_dl_map_object
-</pre>
-
-<a name="flags"></a>
-<h3>2.5&nbsp; Command-line flags</h3>
-
-You invoke Valgrind like this:
-<pre>
-  valgrind [options-for-Valgrind] your-prog [options for your-prog]
-</pre>
-
-<p>Note that Valgrind also reads options from the environment variable
-<code>$VALGRIND</code>, and processes them before the command-line
-options.
-
-<p>Valgrind's default settings succeed in giving reasonable behaviour
-in most cases.  Available options, in no particular order, are as
-follows:
-<ul>
-  <li><code>--help</code></li><br>
-
-  <li><code>--version</code><br>
-      <p>The usual deal.</li><br><p>
-
-  <li><code>-v --verbose</code><br>
-      <p>Be more verbose.  Gives extra information on various aspects
-      of your program, such as: the shared objects loaded, the
-      suppressions used, the progress of the instrumentation engine,
-      and warnings about unusual behaviour.
-      </li><br><p>
-
-  <li><code>-q --quiet</code><br>
-      <p>Run silently, and only print error messages.  Useful if you
-      are running regression tests or have some other automated test
-      machinery.
-      </li><br><p>
-
-  <li><code>--demangle=no</code><br>
-      <code>--demangle=yes</code> [the default]
-      <p>Disable/enable automatic demangling (decoding) of C++ names.
-      Enabled by default.  When enabled, Valgrind will attempt to
-      translate encoded C++ procedure names back to something
-      approaching the original.  The demangler handles symbols mangled
-      by g++ versions 2.X and 3.X.
-
-      <p>An important fact about demangling is that function
-      names mentioned in suppressions files should be in their mangled
-      form.  Valgrind does not demangle function names when searching
-      for applicable suppressions, because to do otherwise would make
-      suppressions file contents dependent on the state of Valgrind's
-      demangling machinery, and would also be slow and pointless.
-      </li><br><p>
-
-  <li><code>--num-callers=&lt;number&gt;</code> [default=4]<br>
-      <p>By default, Valgrind shows four levels of function call names
-      to help you identify program locations.  You can change that
-      number with this option.  This can help in determining the
-      program's location in deeply-nested call chains.  Note that errors
-      are commoned up using only the top three function locations (the
-      place in the current function, and that of its two immediate
-      callers).  So this doesn't affect the total number of errors
-      reported.  
-      <p>
-      The maximum value for this is 50.  Note that higher settings
-      will make Valgrind run a bit more slowly and take a bit more
-      memory, but can be useful when working with programs with
-      deeply-nested call chains.  
-      </li><br><p>
-
-  <li><code>--gdb-attach=no</code> [the default]<br>
-      <code>--gdb-attach=yes</code>
-      <p>When enabled, Valgrind will pause after every error shown,
-      and print the line
-      <br>
-      <code>---- Attach to GDB ? --- [Return/N/n/Y/y/C/c] ----</code>
-      <p>
-      Pressing <code>Ret</code>, or <code>N</code> <code>Ret</code>
-      or <code>n</code> <code>Ret</code>, causes Valgrind not to
-      start GDB for this error.
-      <p>
-      <code>Y</code> <code>Ret</code>
-      or <code>y</code> <code>Ret</code> causes Valgrind to
-      start GDB, for the program at this point.  When you have
-      finished with GDB, quit from it, and the program will continue.
-      Trying to continue from inside GDB doesn't work.
-      <p>
-      <code>C</code> <code>Ret</code>
-      or <code>c</code> <code>Ret</code> causes Valgrind not to
-      start GDB, and not to ask again.
-      <p>
-      <code>--gdb-attach=yes</code> conflicts with
-      <code>--trace-children=yes</code>.  You can't use them together.
-      Valgrind refuses to start up in this situation.  1 May 2002:
-      this is a historical relic which could be easily fixed if it
-      gets in your way.  Mail me and complain if this is a problem for
-      you.  </li><br><p>
-     
-  <li><code>--partial-loads-ok=yes</code> [the default]<br>
-      <code>--partial-loads-ok=no</code>
-      <p>Controls how Valgrind handles word (4-byte) loads from
-      addresses for which some bytes are addressable and others
-      are not.  When <code>yes</code> (the default), such loads
-      do not elicit an address error.  Instead, the loaded V bytes
-      corresponding to the illegal addresses indicate undefined, and
-      those corresponding to legal addresses are loaded from shadow 
-      memory, as usual.
-      <p>
-      When <code>no</code>, loads from partially
-      invalid addresses are treated the same as loads from completely
-      invalid addresses: an illegal-address error is issued,
-      and the resulting V bytes indicate valid data.
-      </li><br><p>
-
-  <li><code>--sloppy-malloc=no</code> [the default]<br>
-      <code>--sloppy-malloc=yes</code>
-      <p>When enabled, all requests for malloc/calloc are rounded up
-      to a whole number of machine words -- in other words, made
-      divisible by 4.  For example, a request for 17 bytes of space
-      would result in a 20-byte area being made available.  This works
-      around bugs in sloppy libraries which assume that they can
-      safely rely on malloc/calloc requests being rounded up in this
-      fashion.  Without the workaround, these libraries tend to
-      generate large numbers of errors when they access the ends of
-      these areas.  
-      <p>
-      Valgrind snapshots dated 17 Feb 2002 and later are
-      cleverer about this problem, and you should no longer need to 
-      use this flag.  To put it bluntly, if you do need to use this
-      flag, your program violates the ANSI C semantics defined for
-      <code>malloc</code> and <code>free</code>, even if it appears to
-      work correctly, and you should fix it, at least if you hope for
-      maximum portability.
-      </li><br><p>
-
-  <li><code>--alignment=&lt;number></code> [default: 4]<br> <p>By
-      default valgrind's <code>malloc</code>, <code>realloc</code>,
-      etc, return 4-byte aligned addresses.  These are suitable for
-      any accesses on x86 processors. 
-      Some programs might however assume that <code>malloc</code> et
-      al return 8- or more aligned memory.
-      These programs are broken and should be fixed, but
-      if this is impossible for whatever reason the alignment can be
-      increased using this parameter.  The supplied value must be
-      between 4 and 4096 inclusive, and must be a power of two.</li><br><p>
-
-  <li><code>--trace-children=no</code> [the default]<br>
-      <code>--trace-children=yes</code>
-      <p>When enabled, Valgrind will trace into child processes.  This
-      is confusing and usually not what you want, so is disabled by
-      default.  As of 1 May 2002, tracing into a child process from a
-      parent which uses <code>libpthread.so</code> is probably broken
-      and is likely to cause breakage.  Please report any such
-      problems to me.  </li><br><p>
-
-  <li><code>--freelist-vol=&lt;number></code> [default: 1000000]
-      <p>When the client program releases memory using free (in C) or
-      delete (C++), that memory is not immediately made available for
-      re-allocation.  Instead it is marked inaccessible and placed in
-      a queue of freed blocks.  The purpose is to delay the point at
-      which freed-up memory comes back into circulation.  This
-      increases the chance that Valgrind will be able to detect
-      invalid accesses to blocks for some significant period of time
-      after they have been freed.  
-      <p>
-      This flag specifies the maximum total size, in bytes, of the
-      blocks in the queue.  The default value is one million bytes.
-      Increasing this increases the total amount of memory used by
-      Valgrind but may detect invalid uses of freed blocks which would
-      otherwise go undetected.</li><br><p>
-
-  <li><code>--logfile-fd=&lt;number></code> [default: 2, stderr]
-      <p>Specifies the file descriptor on which Valgrind communicates
-      all of its messages.  The default, 2, is the standard error
-      channel.  This may interfere with the client's own use of
-      stderr.  To dump Valgrind's commentary in a file without using
-      stderr, something like the following works well (sh/bash
-      syntax):<br>
-      <code>&nbsp;&nbsp;
-            valgrind --logfile-fd=9 my_prog 9> logfile</code><br>
-      That is: tell Valgrind to send all output to file descriptor 9,
-      and ask the shell to route file descriptor 9 to "logfile".
-      </li><br><p>
-
-  <li><code>--suppressions=&lt;filename></code> 
-      [default: $PREFIX/lib/valgrind/default.supp]
-      <p>Specifies an extra
-      file from which to read descriptions of errors to suppress.  You
-      may use as many extra suppressions files as you
-      like.</li><br><p>
-
-  <li><code>--leak-check=no</code> [default]<br>
-      <code>--leak-check=yes</code> 
-      <p>When enabled, search for memory leaks when the client program
-      finishes.  A memory leak means a malloc'd block, which has not
-      yet been free'd, but to which no pointer can be found.  Such a
-      block can never be free'd by the program, since no pointer to it
-      exists.  Leak checking is disabled by default because it tends
-      to generate dozens of error messages.  </li><br><p>
-
-  <li><code>--show-reachable=no</code> [default]<br>
-      <code>--show-reachable=yes</code> 
-      <p>When disabled, the memory leak detector only shows blocks for
-      which it cannot find a pointer to at all, or it can only find a
-      pointer to the middle of.  These blocks are prime candidates for
-      memory leaks.  When enabled, the leak detector also reports on
-      blocks which it could find a pointer to.  Your program could, at
-      least in principle, have freed such blocks before exit.
-      Contrast this to blocks for which no pointer, or only an
-      interior pointer could be found: they are more likely to
-      indicate memory leaks, because you do not actually have a
-      pointer to the start of the block which you can hand to
-      <code>free</code>, even if you wanted to.  </li><br><p>
-
-  <li><code>--leak-resolution=low</code> [default]<br>
-      <code>--leak-resolution=med</code> <br>
-      <code>--leak-resolution=high</code>
-      <p>When doing leak checking, determines how willing Valgrind is
-      to consider different backtraces to be the same.  When set to
-      <code>low</code>, the default, only the first two entries need
-      match.  When <code>med</code>, four entries have to match.  When
-      <code>high</code>, all entries need to match.  
-      <p>
-      For hardcore leak debugging, you probably want to use
-      <code>--leak-resolution=high</code> together with 
-      <code>--num-callers=40</code> or some such large number.  Note
-      however that this can give an overwhelming amount of
-      information, which is why the defaults are 4 callers and
-      low-resolution matching.
-      <p>
-      Note that the <code>--leak-resolution=</code> setting does not
-      affect Valgrind's ability to find leaks.  It only changes how
-      the results are presented.
-      </li><br><p>
-
-  <li><code>--workaround-gcc296-bugs=no</code> [default]<br>
-      <code>--workaround-gcc296-bugs=yes</code> <p>When enabled,
-      assume that reads and writes some small distance below the stack
-      pointer <code>%esp</code> are due to bugs in gcc 2.96, and do
-      not report them.  The "small distance" is 256 bytes by default.
-      Note that gcc 2.96 is the default compiler on some popular Linux
-      distributions (RedHat 7.X, Mandrake) and so you may well need to
-      use this flag.  Do not use it if you do not have to, as it can
-      cause real errors to be overlooked.  Another option is to use a
-      gcc/g++ which does not generate accesses below the stack
-      pointer.  2.95.3 seems to be a good choice in this respect.
-      <p>
-      Unfortunately (27 Feb 02) it looks like g++ 3.0.4 has a similar
-      bug, so you may need to issue this flag if you use 3.0.4.  A
-      while later (early Apr 02) this is confirmed as a scheduling bug
-      in g++-3.0.4.
-      </li><br><p>
-
-  <li><code>--error-limit=yes</code> [default]<br>
-      <code>--error-limit=no</code> <p>When enabled, valgrind stops
-      reporting errors after 30000 in total, or 300 different ones,
-      have been seen.  This is to stop the error tracking machinery
-      from becoming a huge performance overhead in programs with many
-      errors.  </li><br><p>
-
-  <li><code>--cachesim=no</code> [default]<br>
-      <code>--cachesim=yes</code> <p>When enabled, turns off memory
-      checking, and turns on cache profiling.  Cache profiling is
-      described in detail in <a href="#cache">Section 7</a>.
-      </li><br><p>
-
-  <li><code>--weird-hacks=hack1,hack2,...</code>
-      Pass miscellaneous hints to Valgrind which slightly modify the
-      simulated behaviour in nonstandard or dangerous ways, possibly
-      to help the simulation of strange features.  By default no hacks
-      are enabled.  Use with caution!  Currently known hacks are:
-      <p>
-      <ul>
-      <li><code>ioctl-VTIME</code> Use this if you have a program
-          which sets readable file descriptors to have a timeout by
-          doing <code>ioctl</code> on them with a
-          <code>TCSETA</code>-style command <b>and</b> a non-zero
-          <code>VTIME</code> timeout value.  This is considered
-          potentially dangerous and therefore is not engaged by
-          default, because it is (remotely) conceivable that it could
-          cause threads doing <code>read</code> to incorrectly block
-          the entire process.
-          <p>
-          You probably want to try this one if you have a program
-          which unexpectedly blocks in a <code>read</code> from a file
-          descriptor which you know to have been messed with by
-          <code>ioctl</code>.  This could happen, for example, if the
-          descriptor is used to read input from some kind of screen
-          handling library.
-          <p>
-          To find out if your program is blocking unexpectedly in the
-          <code>read</code> system call, run with
-          <code>--trace-syscalls=yes</code> flag.
-      <p>
-      <li><code>truncate-writes</code> Use this if you have a threaded
-          program which appears to unexpectedly block whilst writing
-          into a pipe.  The effect is to modify all calls to
-          <code>write()</code> so that requests to write more than
-          4096 bytes are treated as if they only requested a write of
-          4096 bytes.  Valgrind does this by changing the
-          <code>count</code> argument of <code>write()</code>, as
-          passed to the kernel, so that it is at most 4096.  The
-          amount of data written will then be less than the client
-          program asked for, but the client should have a loop around
-          its <code>write()</code> call to check whether the requested
-          number of bytes have been written.  If not, it should issue
-          further <code>write()</code> calls until all the data is
-          written.
-          <p>
-          This all sounds pretty dodgy to me, which is why I've made
-          this behaviour only happen on request.  It is not the
-          default behaviour.  At the time of writing this (30 June
-          2002) I have only seen one example where this is necessary,
-          so either the problem is extremely rare or nobody is using
-          Valgrind :-)
-          <p>
-          On experimentation I see that <code>truncate-writes</code>
-          doesn't interact well with <code>ioctl-VTIME</code>, so you
-          probably don't want to try both at once.
-          <p>
-          As above, to find out if your program is blocking
-          unexpectedly in the <code>write()</code> system call, you
-          may find the <code>--trace-syscalls=yes
-          --trace-sched=yes</code> flags useful.
-      </ul>
-
-      </li><p>
-</ul>
-
-There are also some options for debugging Valgrind itself.  You
-shouldn't need to use them in the normal run of things.  Nevertheless:
-
-<ul>
-
-  <li><code>--single-step=no</code> [default]<br>
-      <code>--single-step=yes</code>
-      <p>When enabled, each x86 insn is translated separately into
-      instrumented code.  When disabled, translation is done on a
-      per-basic-block basis, giving much better translations.</li><br>
-      <p>
-
-  <li><code>--optimise=no</code><br>
-      <code>--optimise=yes</code> [default]
-      <p>When enabled, various improvements are applied to the
-      intermediate code, mainly aimed at allowing the simulated CPU's
-      registers to be cached in the real CPU's registers over several
-      simulated instructions.</li><br>
-      <p>
-
-  <li><code>--instrument=no</code><br>
-      <code>--instrument=yes</code> [default]
-      <p>When disabled, the translations don't actually contain any
-      instrumentation.</li><br>
-      <p>
-
-  <li><code>--cleanup=no</code><br>
-      <code>--cleanup=yes</code> [default]
-      <p>When enabled, various improvements are applied to the
-      post-instrumented intermediate code, aimed at removing redundant
-      value checks.</li><br>
-      <p>
-
-  <li><code>--trace-syscalls=no</code> [default]<br>
-      <code>--trace-syscalls=yes</code>
-      <p>Enable/disable tracing of system call intercepts.</li><br>
-      <p>
-
-  <li><code>--trace-signals=no</code> [default]<br>
-      <code>--trace-signals=yes</code>
-      <p>Enable/disable tracing of signal handling.</li><br>
-      <p>
-
-  <li><code>--trace-sched=no</code> [default]<br>
-      <code>--trace-sched=yes</code>
-      <p>Enable/disable tracing of thread scheduling events.</li><br>
-      <p>
-
-  <li><code>--trace-pthread=none</code> [default]<br>
-      <code>--trace-pthread=some</code> <br>
-      <code>--trace-pthread=all</code>
-      <p>Specifies amount of trace detail for pthread-related events.</li><br>
-      <p>
-
-  <li><code>--trace-symtab=no</code> [default]<br>
-      <code>--trace-symtab=yes</code>
-      <p>Enable/disable tracing of symbol table reading.</li><br>
-      <p>
-
-  <li><code>--trace-malloc=no</code> [default]<br>
-      <code>--trace-malloc=yes</code>
-      <p>Enable/disable tracing of malloc/free (et al) intercepts.
-      </li><br>
-      <p>
-
-  <li><code>--stop-after=&lt;number></code> 
-      [default: infinity, more or less]
-      <p>After &lt;number> basic blocks have been executed, shut down
-      Valgrind and switch back to running the client on the real CPU.
-      </li><br>
-      <p>
-
-  <li><code>--dump-error=&lt;number></code> [default: inactive]
-      <p>After the program has exited, show gory details of the
-      translation of the basic block containing the &lt;number>'th
-      error context.  When used with <code>--single-step=yes</code>,
-      can show the exact x86 instruction causing an error.  This is
-      all fairly dodgy and doesn't work at all if threads are
-      involved.</li><br>
-      <p>
-</ul>
-
-
-<a name="errormsgs"></a>
-<h3>2.6&nbsp; Explanation of error messages</h3>
-
-Despite considerable sophistication under the hood, Valgrind can only
-really detect two kinds of errors, use of illegal addresses, and use
-of undefined values.  Nevertheless, this is enough to help you
-discover all sorts of memory-management nasties in your code.  This
-section presents a quick summary of what error messages mean.  The
-precise behaviour of the error-checking machinery is described in
-<a href="#machine">Section 4</a>.
-
-
-<h4>2.6.1&nbsp; Illegal read / Illegal write errors</h4>
-For example:
-<pre>
-  Invalid read of size 4
-     at 0x40F6BBCC: (within /usr/lib/libpng.so.2.1.0.9)
-     by 0x40F6B804: (within /usr/lib/libpng.so.2.1.0.9)
-     by 0x40B07FF4: read_png_image__FP8QImageIO (kernel/qpngio.cpp:326)
-     by 0x40AC751B: QImageIO::read() (kernel/qimage.cpp:3621)
-     Address 0xBFFFF0E0 is not stack'd, malloc'd or free'd
-</pre>
-
-<p>This happens when your program reads or writes memory at a place
-which Valgrind reckons it shouldn't.  In this example, the program did
-a 4-byte read at address 0xBFFFF0E0, somewhere within the
-system-supplied library libpng.so.2.1.0.9, which was called from
-somewhere else in the same library, called from line 326 of
-qpngio.cpp, and so on.
-
-<p>Valgrind tries to establish what the illegal address might relate
-to, since that's often useful.  So, if it points into a block of
-memory which has already been freed, you'll be informed of this, and
-also where the block was free'd at.  Likewise, if it should turn out
-to be just off the end of a malloc'd block, a common result of
-off-by-one-errors in array subscripting, you'll be informed of this
-fact, and also where the block was malloc'd.
-
-<p>In this example, Valgrind can't identify the address.  Actually the
-address is on the stack, but, for some reason, this is not a valid
-stack address -- it is below the stack pointer, %esp, and that isn't
-allowed.  In this particular case it's probably caused by gcc
-generating invalid code, a known bug in various flavours of gcc.
-
-<p>Note that Valgrind only tells you that your program is about to
-access memory at an illegal address.  It can't stop the access from
-happening.  So, if your program makes an access which normally would
-result in a segmentation fault, your program will still suffer the same
-fate -- but you will get a message from Valgrind immediately prior to
-this.  In this particular example, reading junk on the stack is
-non-fatal, and the program stays alive.
-
-
-<h4>2.6.2&nbsp; Use of uninitialised values</h4>
-For example:
-<pre>
-  Conditional jump or move depends on uninitialised value(s)
-     at 0x402DFA94: _IO_vfprintf (_itoa.h:49)
-     by 0x402E8476: _IO_printf (printf.c:36)
-     by 0x8048472: main (tests/manuel1.c:8)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-</pre>
-
-<p>An uninitialised-value use error is reported when your program uses
-a value which hasn't been initialised -- in other words, is undefined.
-Here, the undefined value is used somewhere inside the printf()
-machinery of the C library.  This error was reported when running the
-following small program:
-<pre>
-  int main()
-  {
-    int x;
-    printf ("x = %d\n", x);
-  }
-</pre>
-
-<p>It is important to understand that your program can copy around
-junk (uninitialised) data to its heart's content.  Valgrind observes
-this and keeps track of the data, but does not complain.  A complaint
-is issued only when your program attempts to make use of uninitialised
-data.  In this example, x is uninitialised.  Valgrind observes the
-value being passed to _IO_printf and thence to _IO_vfprintf, but makes
-no comment.  However, _IO_vfprintf has to examine the value of x so it
-can turn it into the corresponding ASCII string, and it is at this
-point that Valgrind complains.
-
-<p>Sources of uninitialised data tend to be:
-<ul>
-  <li>Local variables in procedures which have not been initialised,
-      as in the example above.</li><br><p>
-
-  <li>The contents of malloc'd blocks, before you write something
-      there.  In C++, the new operator is a wrapper round malloc, so
-      if you create an object with new, its fields will be
-      uninitialised until you fill them in, which is only Right and
-      Proper.</li>
-</ul>
-
-
-
-<h4>2.6.3&nbsp; Illegal frees</h4>
-For example:
-<pre>
-  Invalid free()
-     at 0x4004FFDF: free (ut_clientmalloc.c:577)
-     by 0x80484C7: main (tests/doublefree.c:10)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/doublefree)
-     Address 0x3807F7B4 is 0 bytes inside a block of size 177 free'd
-     at 0x4004FFDF: free (ut_clientmalloc.c:577)
-     by 0x80484C7: main (tests/doublefree.c:10)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/doublefree)
-</pre>
-<p>Valgrind keeps track of the blocks allocated by your program with
-malloc/new, so it can know exactly whether or not the argument to
-free/delete is legitimate.  Here, this test program has
-freed the same block twice.  As with the illegal read/write errors,
-Valgrind attempts to make sense of the address free'd.  If, as
-here, the address is one which has previously been freed, you will
-be told that -- making duplicate frees of the same block easy to spot.
-
-
-<h4>2.6.4&nbsp; When a block is freed with an inappropriate
-deallocation function</h4>
-In the following example, a block allocated with <code>new[]</code>
-has wrongly been deallocated with <code>free</code>:
-<pre>
-  Mismatched free() / delete / delete []
-     at 0x40043249: free (vg_clientfuncs.c:171)
-     by 0x4102BB4E: QGArray::~QGArray(void) (tools/qgarray.cpp:149)
-     by 0x4C261C41: PptDoc::~PptDoc(void) (include/qmemarray.h:60)
-     by 0x4C261F0E: PptXml::~PptXml(void) (pptxml.cc:44)
-     Address 0x4BB292A8 is 0 bytes inside a block of size 64 alloc'd
-     at 0x4004318C: __builtin_vec_new (vg_clientfuncs.c:152)
-     by 0x4C21BC15: KLaola::readSBStream(int) const (klaola.cc:314)
-     by 0x4C21C155: KLaola::stream(KLaola::OLENode const *) (klaola.cc:416)
-     by 0x4C21788F: OLEFilter::convert(QCString const &amp;) (olefilter.cc:272)
-</pre>
-The following was told to me by the KDE 3 developers.  I didn't know
-any of it myself.  They also implemented the check itself.
-<p>
-In C++ it's important to deallocate memory in a way compatible with
-how it was allocated.  The deal is:
-<ul>
-<li>If allocated with <code>malloc</code>, <code>calloc</code>,
-    <code>realloc</code>, <code>valloc</code> or
-    <code>memalign</code>, you must deallocate with <code>free</code>.
-<li>If allocated with <code>new[]</code>, you must deallocate with
-    <code>delete[]</code>.
-<li>If allocated with <code>new</code>, you must deallocate with
-    <code>delete</code>.
-</ul>
-The worst thing is that on Linux apparently it doesn't matter if you
-do muddle these up, and it all seems to work ok, but the same program
-may then crash on a different platform, Solaris for example.  So it's
-best to fix it properly.  According to the KDE folks "it's amazing how
-many C++ programmers don't know this".  
-<p>
-Pascal Massimino adds the following clarification:
-<code>delete[]</code> must be called associated with a
-<code>new[]</code> because the compiler stores the size of the array
-and the pointer-to-member to the destructor of the array's content
-just before the pointer actually returned.  This implies a
-variable-sized overhead in what's returned by <code>new</code> or
-<code>new[]</code>.  It is rather surprising how compilers [Ed:
-runtime-support libraries?] are robust to mismatch in
-<code>new</code>/<code>delete</code>
-<code>new[]</code>/<code>delete[]</code>.
-
-
-<h4>2.6.5&nbsp; Passing system call parameters with inadequate
-read/write permissions</h4>
-
-Valgrind checks all parameters to system calls.  If a system call
-needs to read from a buffer provided by your program, Valgrind checks
-that the entire buffer is addressable and has valid data, ie, it is
-readable.  And if the system call needs to write to a user-supplied
-buffer, Valgrind checks that the buffer is addressable.  After the
-system call, Valgrind updates its administrative information to
-precisely reflect any changes in memory permissions caused by the
-system call.
-
-<p>Here's an example of a system call with an invalid parameter:
-<pre>
-  #include &lt;stdlib.h>
-  #include &lt;unistd.h>
-  int main( void )
-  {
-    char* arr = malloc(10);
-    (void) write( 1 /* stdout */, arr, 10 );
-    return 0;
-  }
-</pre>
-
-<p>You get this complaint ...
-<pre>
-  Syscall param write(buf) contains uninitialised or unaddressable byte(s)
-     at 0x4035E072: __libc_write
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/badwrite)
-     by &lt;bogus frame pointer> ???
-     Address 0x3807E6D0 is 0 bytes inside a block of size 10 alloc'd
-     at 0x4004FEE6: malloc (ut_clientmalloc.c:539)
-     by 0x80484A0: main (tests/badwrite.c:6)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/badwrite)
-</pre>
-
-<p>... because the program has tried to write uninitialised junk from
-the malloc'd block to the standard output.
-
-
-<h4>2.6.6&nbsp; Warning messages you might see</h4>
-
-Most of these only appear if you run in verbose mode (enabled by
-<code>-v</code>):
-<ul>
-<li> <code>More than 50 errors detected.  Subsequent errors
-     will still be recorded, but in less detail than before.</code>
-     <br>
-     After 50 different errors have been shown, Valgrind becomes 
-     more conservative about collecting them.  It then requires only 
-     the program counters in the top two stack frames to match when
-     deciding whether or not two errors are really the same one.
-     Prior to this point, the PCs in the top four frames are required
-     to match.  This hack has the effect of slowing down the
-     appearance of new errors after the first 50.  The 50 constant can
-     be changed by recompiling Valgrind.
-<p>
-<li> <code>More than 300 errors detected.  I'm not reporting any more.
-     Final error counts may be inaccurate.  Go fix your
-     program!</code>
-     <br>
-     After 300 different errors have been detected, Valgrind ignores
-     any more.  It seems unlikely that collecting even more different
-     ones would be of practical help to anybody, and it avoids the
-     danger that Valgrind spends more and more of its time comparing
-     new errors against an ever-growing collection.  As above, the 300
-     number is a compile-time constant.
-<p>
-<li> <code>Warning: client switching stacks?</code>
-     <br>
-     Valgrind spotted such a large change in the stack pointer, %esp,
-     that it guesses the client is switching to a different stack.
-     At this point it makes a kludgey guess where the base of the new
-     stack is, and sets memory permissions accordingly.  You may get
-     many bogus error messages following this, if Valgrind guesses
-     wrong.  At the moment "large change" is defined as a change of
-     more than 2000000 in the value of the %esp (stack pointer)
-     register.
-<p>
-<li> <code>Warning: client attempted to close Valgrind's logfile fd &lt;number>
-     </code>
-     <br>
-     Valgrind doesn't allow the client
-     to close the logfile, because you'd never see any diagnostic
-     information after that point.  If you see this message,
-     you may want to use the <code>--logfile-fd=&lt;number></code> 
-     option to specify a different logfile file-descriptor number.
-<p>
-<li> <code>Warning: noted but unhandled ioctl &lt;number></code>
-     <br>
-     Valgrind observed a call to one of the vast family of
-     <code>ioctl</code> system calls, but did not modify its
-     memory status info (because I have not yet got round to it).
-     The call will still have gone through, but you may get spurious
-     errors after this as a result of the non-update of the memory info.
-<p>
-<li> <code>Warning: set address range perms: large range &lt;number></code>
-     <br> 
-     Diagnostic message, mostly for my benefit, to do with memory 
-     permissions.
-</ul>
-
-
-<a name="suppfiles"></a>
-<h3>2.7&nbsp; Writing suppressions files</h3>
-
-A suppression file describes a bunch of errors which, for one reason
-or another, you don't want Valgrind to tell you about.  Usually the
-reason is that the system libraries are buggy but unfixable, at least
-within the scope of the current debugging session.  Multiple
-suppressions files are allowed.  By default, Valgrind uses
-<code>$PREFIX/lib/valgrind/default.supp</code>.
-
-<p>
-You can ask to add suppressions from another file, by specifying
-<code>--suppressions=/path/to/file.supp</code>.
-
-<p>Each suppression has the following components:<br>
-<ul>
-
-  <li>Its name.  This merely gives a handy name to the suppression, by
-      which it is referred to in the summary of used suppressions
-      printed out when a program finishes.  It's not important what
-      the name is; any identifying string will do.
-      <p>
-
-  <li>The nature of the error to suppress.  Either: 
-      <code>Value1</code>, 
-      <code>Value2</code>,
-      <code>Value4</code> or
-      <code>Value8</code>,
-      meaning an uninitialised-value error when
-      using a value of 1, 2, 4 or 8 bytes.
-      Or
-      <code>Cond</code> (or its old name, <code>Value0</code>),
-      meaning use of an uninitialised CPU condition code.  Or: 
-      <code>Addr1</code>,
-      <code>Addr2</code>, 
-      <code>Addr4</code> or 
-      <code>Addr8</code>, meaning an invalid address during a
-      memory access of 1, 2, 4 or 8 bytes respectively.  Or 
-      <code>Param</code>,
-      meaning an invalid system call parameter error.  Or
-      <code>Free</code>, meaning an invalid or mismatching free.
-      Or <code>PThread</code>, meaning any kind of complaint to do
-      with the PThreads API.</li><br>
-      <p>
-
-  <li>The "immediate location" specification.  For Value and Addr
-      errors, is either the name of the function in which the error
-      occurred, or, failing that, the full path to the .so file
-      containing the error location.  For Param errors, is the name of
-      the offending system call parameter.  For Free errors, is the
-      name of the function doing the freeing (eg, <code>free</code>,
-      <code>__builtin_vec_delete</code>, etc)</li><br>
-      <p>
-
-  <li>The caller of the above "immediate location".  Again, either a
-      function or shared-object name.</li><br>
-      <p>
-
-  <li>Optionally, one or two extra calling-function or object names,
-      for greater precision.</li>
-</ul>
-
-<p>
-Locations may be either names of shared objects or wildcards matching
-function names.  They begin <code>obj:</code> and <code>fun:</code>
-respectively.  Function and object names to match against may use the 
-wildcard characters <code>*</code> and <code>?</code>.
-
-A suppression only suppresses an error when the error matches all the
-details in the suppression.  Here's an example:
-<pre>
-  {
-    __gconv_transform_ascii_internal/__mbrtowc/mbtowc
-    Value4
-    fun:__gconv_transform_ascii_internal
-    fun:__mbr*towc
-    fun:mbtowc
-  }
-</pre>
-
-<p>What it means is: suppress a use-of-uninitialised-value error, when
-the data size is 4, when it occurs in the function
-<code>__gconv_transform_ascii_internal</code>, when that is called
-from any function of name matching <code>__mbr*towc</code>, 
-when that is called from
-<code>mbtowc</code>.  It doesn't apply under any other circumstances.
-The string by which this suppression is identified to the user is
-__gconv_transform_ascii_internal/__mbrtowc/mbtowc.
-
-<p>Another example:
-<pre>
-  {
-    libX11.so.6.2/libX11.so.6.2/libXaw.so.7.0
-    Value4
-    obj:/usr/X11R6/lib/libX11.so.6.2
-    obj:/usr/X11R6/lib/libX11.so.6.2
-    obj:/usr/X11R6/lib/libXaw.so.7.0
-  }
-</pre>
-
-<p>Suppress any size 4 uninitialised-value error which occurs anywhere
-in <code>libX11.so.6.2</code>, when called from anywhere in the same
-library, when called from anywhere in <code>libXaw.so.7.0</code>.  The
-inexact specification of locations is regrettable, but is about all
-you can hope for, given that the X11 libraries shipped with Red Hat
-7.2 have had their symbol tables removed.
-
-<p>Note -- since the above two examples did not make it clear -- that
-you can freely mix the <code>obj:</code> and <code>fun:</code>
-styles of description within a single suppression record.
-
-
-<a name="clientreq"></a>
-<h3>2.8&nbsp; The Client Request mechanism</h3>
-
-Valgrind has a trapdoor mechanism via which the client program can
-pass all manner of requests and queries to Valgrind.  Internally, this
-is used extensively to make malloc, free, signals, threads, etc, work,
-although you don't see that.
-<p>
-For your convenience, a subset of these so-called client requests is
-provided to allow you to tell Valgrind facts about the behaviour of
-your program, and conversely to make queries.  In particular, your
-program can tell Valgrind about changes in memory range permissions
-that Valgrind would not otherwise know about, and so allows clients to
-get Valgrind to do arbitrary custom checks.
-<p>
-Clients need to include the header file <code>valgrind.h</code> to
-make this work.  The macros therein have the magical property that
-they generate code in-line which Valgrind can spot.  However, the code
-does nothing when not run on Valgrind, so you are not forced to run
-your program on Valgrind just because you use the macros in this file.
-Also, you are not required to link your program with any extra
-supporting libraries.
-<p>
-A brief description of the available macros:
-<ul>
-<li><code>VALGRIND_MAKE_NOACCESS</code>,
-    <code>VALGRIND_MAKE_WRITABLE</code> and
-    <code>VALGRIND_MAKE_READABLE</code>.  These mark address
-    ranges as completely inaccessible, accessible but containing
-    undefined data, and accessible and containing defined data,
-    respectively.  Subsequent errors may have their faulting
-    addresses described in terms of these blocks.  Returns a
-    "block handle".  Returns zero when not run on Valgrind.
-<p>
-<li><code>VALGRIND_DISCARD</code>: At some point you may want
-    Valgrind to stop reporting errors in terms of the blocks
-    defined by the previous three macros.  To do this, the above
-    macros return a small-integer "block handle".  You can pass
-    this block handle to <code>VALGRIND_DISCARD</code>.  After
-    doing so, Valgrind will no longer be able to relate
-    addressing errors to the user-defined block associated with
-    the handle.  The permissions settings associated with the
-    handle remain in place; this just affects how errors are
-    reported, not whether they are reported.  Returns 1 for an
-    invalid handle and 0 for a valid handle (although passing
-    invalid handles is harmless).  Always returns 0 when not run
-    on Valgrind.
-<p>
-<li><code>VALGRIND_CHECK_NOACCESS</code>,
-    <code>VALGRIND_CHECK_WRITABLE</code> and
-    <code>VALGRIND_CHECK_READABLE</code>: check immediately
-    whether or not the given address range has the relevant
-    property, and if not, print an error message.  Also, for the
-    convenience of the client, returns zero if the relevant
-    property holds; otherwise, the returned value is the address
-    of the first byte for which the property is not true.
-    Always returns 0 when not run on Valgrind.
-<p>
-<li><code>VALGRIND_CHECK_DEFINED</code>: a quick and easy way
-    to find out whether Valgrind thinks a particular variable
-    (lvalue, to be precise) is addressible and defined.  Prints
-    an error message if not.  Returns no value.
-<p>
-<li><code>VALGRIND_MAKE_NOACCESS_STACK</code>: a highly
-    experimental feature.  Similarly to
-    <code>VALGRIND_MAKE_NOACCESS</code>, this marks an address
-    range as inaccessible, so that subsequent accesses to an
-    address in the range give an error.  However, this macro
-    does not return a block handle.  Instead, all annotations
-    created like this are reviewed at each client
-    <code>ret</code> (subroutine return) instruction, and those
-    which now define an address range below the client's stack
-    pointer register (<code>%esp</code>) are automatically
-    deleted.
-    <p>
-    In other words, this macro allows the client to tell
-    Valgrind about red-zones on its own stack.  Valgrind
-    automatically discards this information when the stack
-    retreats past such blocks.  Beware: hacky and flaky, and
-    probably interacts badly with the new pthread support.
-<p>
-<li><code>RUNNING_ON_VALGRIND</code>: returns 1 if running on
-    Valgrind, 0 if running on the real CPU.
-<p>
-<li><code>VALGRIND_DO_LEAK_CHECK</code>: run the memory leak detector
-    right now.  Returns no value.  I guess this could be used to
-    incrementally check for leaks between arbitrary places in the
-    program's execution.  Warning: not properly tested!
-<p>
-<li><code>VALGRIND_DISCARD_TRANSLATIONS</code>: discard translations
-    of code in the specified address range.  Useful if you are
-    debugging a JITter or some other dynamic code generation system.
-    After this call, attempts to execute code in the invalidated
-    address range will cause valgrind to make new translations of that
-    code, which is probably the semantics you want.  Note that this is
-    implemented naively, and involves checking all 200191 entries in
-    the translation table to see if any of them overlap the specified
-    address range.  So try not to call it often, or performance will
-    nosedive.  Note that you can be clever about this: you only need
-    to call it when an area which previously contained code is
-    overwritten with new code.  You can choose to write code into
-    fresh memory, and just call this occasionally to discard large
-    chunks of old code all at once.
-    <p>
-    Warning: minimally tested, especially for the cache simulator.
-</ul>
-<p>
-
-
-<a name="pthreads"></a>
-<h3>2.9&nbsp; Support for POSIX Pthreads</h3>
-
-As of late April 02, Valgrind supports programs which use POSIX
-pthreads.  Doing this has proved technically challenging but is now
-mostly complete.  It works well enough for significant threaded
-applications to work.
-<p>
-It works as follows: threaded apps are (dynamically) linked against
-<code>libpthread.so</code>.  Usually this is the one installed with
-your Linux distribution.  Valgrind, however, supplies its own
-<code>libpthread.so</code> and automatically connects your program to
-it instead.
-<p>
-The fake <code>libpthread.so</code> and Valgrind cooperate to
-implement a user-space pthreads package.  This approach avoids the 
-horrible implementation problems of implementing a truly
-multiprocessor version of Valgrind, but it does mean that threaded
-apps run only on one CPU, even if you have a multiprocessor machine.
-<p>
-Valgrind schedules your threads in a round-robin fashion, with all
-threads having equal priority.  It switches threads every 50000 basic
-blocks (typically around 300000 x86 instructions), which means you'll
-get a much finer interleaving of thread executions than when run
-natively.  This in itself may cause your program to behave differently
-if you have some kind of concurrency, critical race, locking, or
-similar, bugs.
-<p>
-The current (valgrind-1.0 release) state of pthread support is as
-follows:
-<ul>
-<li>Mutexes, condition variables, thread-specific data,
-    <code>pthread_once</code>, reader-writer locks, semaphores,
-    cleanup stacks, cancellation and thread detaching currently work.
-    Various attribute-like calls are handled but ignored; you get a
-    warning message.
-<p>
-<li>Currently the following syscalls are thread-safe (nonblocking):
-    <code>write</code> <code>read</code> <code>nanosleep</code>
-    <code>sleep</code> <code>select</code> <code>poll</code> 
-    <code>recvmsg</code> and
-    <code>accept</code>.
-<p>
-<li>Signals in pthreads are now handled properly(ish): 
-    <code>pthread_sigmask</code>, <code>pthread_kill</code>,
-    <code>sigwait</code> and <code>raise</code> are now implemented.
-    Each thread has its own signal mask, as POSIX requires.
-    It's a bit kludgey -- there's a system-wide pending signal set,
-    rather than one for each thread.  But hey.
-</ul>
-
-
-As of 18 May 02, the following threaded programs now work fine on my
-RedHat 7.2 box: Opera 6.0Beta2, KNode in KDE 3.0, Mozilla-0.9.2.1 and
-Galeon-0.11.3, both as supplied with RedHat 7.2.  Also Mozilla 1.0RC2.
-OpenOffice 1.0.  MySQL 3.something (the current stable release).
-
-<a name="install"></a>
-<h3>2.10&nbsp; Building and installing</h3>
-
-We now use the standard Unix <code>./configure</code>,
-<code>make</code>, <code>make install</code> mechanism, and I have
-attempted to ensure that it works on machines with kernel 2.2 or 2.4
-and glibc 2.1.X or 2.2.X.  I don't think there is much else to say.
-There are no options apart from the usual <code>--prefix</code> that
-you should give to <code>./configure</code>.
-
-<p>
-The <code>configure</code> script tests the version of the X server
-currently indicated by the current <code>$DISPLAY</code>.  This is a
-known bug.  The intention was to detect the version of the current
-XFree86 client libraries, so that correct suppressions could be
-selected for them, but instead the test checks the server version.
-This is just plain wrong.
-
-<p>
-If you are building a binary package of Valgrind for distribution,
-please read <code>README_PACKAGERS</code>.  It contains some important
-information.
-
-<p>
-Apart from that there is no excitement here.  Let me know if you have
-build problems.
-
-
-
-<a name="problems"></a>
-<h3>2.11&nbsp; If you have problems</h3>
-Mail me (<a href="mailto:jseward@acm.org">jseward@acm.org</a>).
-
-<p>See <a href="#limits">Section 4</a> for the known limitations of
-Valgrind, and for a list of programs which are known not to work on
-it.
-
-<p>The translator/instrumentor has a lot of assertions in it.  They
-are permanently enabled, and I have no plans to disable them.  If one
-of these breaks, please mail me!
-
-<p>If you get an assertion failure on the expression
-<code>chunkSane(ch)</code> in <code>vg_free()</code> in
-<code>vg_malloc.c</code>, this may have happened because your program
-wrote off the end of a malloc'd block, or before its beginning.
-Valgrind should have emitted a proper message to that effect before
-dying in this way.  This is a known problem which I should fix.
-<p>
-
-<hr width="100%">
-
-<a name="machine"></a>
-<h2>3&nbsp; Details of the checking machinery</h2>
-
-Read this section if you want to know, in detail, exactly what and how
-Valgrind is checking.
-
-<a name="vvalue"></a>
-<h3>3.1&nbsp; Valid-value (V) bits</h3>
-
-It is simplest to think of Valgrind implementing a synthetic Intel x86
-CPU which is identical to a real CPU, except for one crucial detail.
-Every bit (literally) of data processed, stored and handled by the
-real CPU has, in the synthetic CPU, an associated "valid-value" bit,
-which says whether or not the accompanying bit has a legitimate value.
-In the discussions which follow, this bit is referred to as the V
-(valid-value) bit.
-
-<p>Each byte in the system therefore has 8 V bits which follow
-it wherever it goes.  For example, when the CPU loads a word-size item
-(4 bytes) from memory, it also loads the corresponding 32 V bits from
-a bitmap which stores the V bits for the process' entire address
-space.  If the CPU should later write the whole or some part of that
-value to memory at a different address, the relevant V bits will be
-stored back in the V-bit bitmap.
-
-<p>In short, each bit in the system has an associated V bit, which
-follows it around everywhere, even inside the CPU.  Yes, the CPU's
-(integer and <code>%eflags</code>) registers have their own V bit
-vectors.
-
-<p>Copying values around does not cause Valgrind to check for, or
-report on, errors.  However, when a value is used in a way which might
-conceivably affect the outcome of your program's computation, the
-associated V bits are immediately checked.  If any of these indicate
-that the value is undefined, an error is reported.
-
-<p>Here's an (admittedly nonsensical) example:
-<pre>
-  int i, j;
-  int a[10], b[10];
-  for (i = 0; i &lt; 10; i++) {
-    j = a[i];
-    b[i] = j;
-  }
-</pre>
-
-<p>Valgrind emits no complaints about this, since it merely copies
-uninitialised values from <code>a[]</code> into <code>b[]</code>, and
-doesn't use them in any way.  However, if the loop is changed to
-<pre>
-  for (i = 0; i &lt; 10; i++) {
-    j += a[i];
-  }
-  if (j == 77) 
-     printf("hello there\n");
-</pre>
-then Valgrind will complain, at the <code>if</code>, that the
-condition depends on uninitialised values.
-
-<p>Most low level operations, such as adds, cause Valgrind to 
-use the V bits for the operands to calculate the V bits for the
-result.  Even if the result is partially or wholly undefined,
-it does not complain.
-
-<p>Checks on definedness only occur in two places: when a value is
-used to generate a memory address, and where a control flow decision
-needs to be made.  Also, when a system call is detected, valgrind
-checks definedness of parameters as required.
-
-<p>If a check should detect undefinedness, an error message is
-issued.  The resulting value is subsequently regarded as well-defined.
-To do otherwise would give long chains of error messages.  In effect,
-we say that undefined values are non-infectious.
-
-<p>This sounds overcomplicated.  Why not just check all reads from
-memory, and complain if an undefined value is loaded into a CPU register? 
-Well, that doesn't work well, because perfectly legitimate C programs routinely
-copy uninitialised values around in memory, and we don't want endless complaints
-about that.  Here's the canonical example.  Consider a struct
-like this:
-<pre>
-  struct S { int x; char c; };
-  struct S s1, s2;
-  s1.x = 42;
-  s1.c = 'z';
-  s2 = s1;
-</pre>
-
-<p>The question to ask is: how large is <code>struct S</code>, in
-bytes?  An int is 4 bytes and a char one byte, so perhaps a struct S
-occupies 5 bytes?  Wrong.  All (non-toy) compilers I know of will
-round the size of <code>struct S</code> up to a whole number of words,
-in this case 8 bytes.  Not doing this forces compilers to generate
-truly appalling code for subscripting arrays of <code>struct
-S</code>'s.
-
-<p>So s1 occupies 8 bytes, yet only 5 of them will be initialised.
-For the assignment <code>s2 = s1</code>, gcc generates code to copy
-all 8 bytes wholesale into <code>s2</code> without regard for their
-meaning.  If Valgrind simply checked values as they came out of
-memory, it would yelp every time a structure assignment like this
-happened.  So the more complicated semantics described above is
-necessary.  This allows gcc to copy <code>s1</code> into
-<code>s2</code> any way it likes, and a warning will only be emitted
-if the uninitialised values are later used.
-
-<p>One final twist to this story.  The above scheme allows garbage to
-pass through the CPU's integer registers without complaint.  It does
-this by giving the integer registers V tags, passing these around in
-the expected way.  This is complicated and computationally expensive to
-do, but is necessary.  Valgrind is more simplistic about
-floating-point loads and stores.  In particular, V bits for data read
-as a result of floating-point loads are checked at the load
-instruction.  So if your program uses the floating-point registers to
-do memory-to-memory copies, you will get complaints about
-uninitialised values.  Fortunately, I have not yet encountered a
-program which (ab)uses the floating-point registers in this way.
-
-<a name="vaddress"></a>
-<h3>3.2&nbsp; Valid-address (A) bits</h3>
-
-Notice that the previous section describes how the validity of values
-is established and maintained without having to say whether the
-program does or does not have the right to access any particular
-memory location.  We now consider the latter issue.
-
-<p>As described above, every bit in memory or in the CPU has an
-associated valid-value (V) bit.  In addition, all bytes in memory, but
-not in the CPU, have an associated valid-address (A) bit.  This
-indicates whether or not the program can legitimately read or write
-that location.  It does not give any indication of the validity of the
-data at that location -- that's the job of the V bits -- only whether
-or not the location may be accessed.
-
-<p>Every time your program reads or writes memory, Valgrind checks the
-A bits associated with the address.  If any of them indicate an
-invalid address, an error is emitted.  Note that the reads and writes
-themselves do not change the A bits, only consult them.
-
-<p>So how do the A bits get set/cleared?  Like this:
-
-<ul>
-  <li>When the program starts, all the global data areas are marked as
-      accessible.</li><br>
-      <p>
-
-  <li>When the program does malloc/new, the A bits for exactly the
-      area allocated, and not a byte more, are marked as accessible.
-      Upon freeing the area the A bits are changed to indicate
-      inaccessibility.</li><br>
-      <p>
-
-  <li>When the stack pointer register (%esp) moves up or down, A bits
-      are set.  The rule is that the area from %esp up to the base of
-      the stack is marked as accessible, and below %esp is
-      inaccessible.  (If that sounds illogical, bear in mind that the
-      stack grows down, not up, on almost all Unix systems, including
-      GNU/Linux.)  Tracking %esp like this has the useful side-effect
-      that the section of stack used by a function for local variables
-      etc is automatically marked accessible on function entry and
-      inaccessible on exit.</li><br>
-      <p>
-
-  <li>When doing system calls, A bits are changed appropriately.  For
-      example, mmap() magically makes files appear in the process's
-      address space, so the A bits must be updated if mmap()
-      succeeds.</li><br>
-      <p>
-
-  <li>Optionally, your program can tell Valgrind about such changes
-      explicitly, using the client request mechanism described above.
-</ul>
-
-
-<a name="together"></a>
-<h3>3.3&nbsp; Putting it all together</h3>
-Valgrind's checking machinery can be summarised as follows:
-
-<ul>
-  <li>Each byte in memory has 8 associated V (valid-value) bits,
-      saying whether or not the byte has a defined value, and a single
-      A (valid-address) bit, saying whether or not the program
-      currently has the right to read/write that address.</li><br>
-      <p>
-
-  <li>When memory is read or written, the relevant A bits are
-      consulted.  If they indicate an invalid address, Valgrind emits
-      an Invalid read or Invalid write error.</li><br>
-      <p>
-
-  <li>When memory is read into the CPU's integer registers, the
-      relevant V bits are fetched from memory and stored in the
-      simulated CPU.  They are not consulted.</li><br>
-      <p>
-
-  <li>When an integer register is written out to memory, the V bits
-      for that register are written back to memory too.</li><br>
-      <p>
-
-  <li>When memory is read into the CPU's floating point registers, the
-      relevant V bits are read from memory and they are immediately
-      checked.  If any are invalid, an uninitialised value error is
-      emitted.  This precludes using the floating-point registers to
-      copy possibly-uninitialised memory, but simplifies Valgrind in
-      that it does not have to track the validity status of the
-      floating-point registers.</li><br>
-      <p>
-
-  <li>As a result, when a floating-point register is written to
-      memory, the associated V bits are set to indicate a valid
-      value.</li><br>
-      <p>
-
-  <li>When values in integer CPU registers are used to generate a
-      memory address, or to determine the outcome of a conditional
-      branch, the V bits for those values are checked, and an error
-      emitted if any of them are undefined.</li><br>
-      <p>
-
-  <li>When values in integer CPU registers are used for any other
-      purpose, Valgrind computes the V bits for the result, but does
-      not check them.</li><br>
-      <p>
-
-  <li>Once the V bits for a value in the CPU have been checked, they
-      are then set to indicate validity.  This avoids long chains of
-      errors.</li><br>
-      <p>
-
-  <li>When values are loaded from memory, valgrind checks the A bits
-      for that location and issues an illegal-address warning if
-      needed.  In that case, the V bits loaded are forced to indicate
-      Valid, despite the location being invalid.
-      <p>
-      This apparently strange choice reduces the amount of confusing
-      information presented to the user.  It avoids the
-      unpleasant phenomenon in which memory is read from a place which
-      is both unaddressible and contains invalid values, and, as a
-      result, you get not only an invalid-address (read/write) error,
-      but also a potentially large set of uninitialised-value errors,
-      one for every time the value is used.
-      <p>
-      There is a hazy boundary case to do with multi-byte loads from
-      addresses which are partially valid and partially invalid.  See
-      the description of the flag <code>--partial-loads-ok</code> for details.
-      </li><br>
-</ul>
-
-Valgrind intercepts calls to malloc, calloc, realloc, valloc,
-memalign, free, new and delete.  The behaviour you get is:
-
-<ul>
-
-  <li>malloc/new: the returned memory is marked as addressible but not
-      having valid values.  This means you have to write on it before
-      you can read it.</li><br>
-      <p>
-
-  <li>calloc: returned memory is marked both addressible and valid,
-      since calloc() clears the area to zero.</li><br>
-      <p>
-
-  <li>realloc: if the new size is larger than the old, the new section
-      is addressible but invalid, as with malloc.</li><br>
-      <p>
-
-  <li>If the new size is smaller, the dropped-off section is marked as
-      unaddressible.  You may only pass to realloc a pointer
-      previously issued to you by malloc/calloc/new/realloc.</li><br>
-      <p>
-
-  <li>free/delete: you may only pass to free a pointer previously
-      issued to you by malloc/calloc/new/realloc, or the value
-      NULL. Otherwise, Valgrind complains.  If the pointer is indeed
-      valid, Valgrind marks the entire area it points at as
-      unaddressible, and places the block in the freed-blocks-queue.
-      The aim is to defer as long as possible reallocation of this
-      block.  Until that happens, all attempts to access it will
-      elicit an invalid-address error, as you would hope.</li><br>
-</ul>
-
-
-
-<a name="signals"></a>
-<h3>3.4&nbsp; Signals</h3>
-
-Valgrind provides suitable handling of signals, so, provided you stick
-to POSIX stuff, you should be ok.  Basic sigaction() and sigprocmask()
-are handled.  Signal handlers may return in the normal way or do
-longjmp(); both should work ok.  As specified by POSIX, a signal is
-blocked in its own handler.  Default actions for signals should work
-as before.  Etc, etc.
-
-<p>Under the hood, dealing with signals is a real pain, and Valgrind's
-simulation leaves much to be desired.  If your program does
-way-strange stuff with signals, bad things may happen.  If so, let me
-know.  I don't promise to fix it, but I'd at least like to be aware of
-it.
-
-
-<a name="leaks"></a>
-<h3>3.5&nbsp; Memory leak detection</h3>
-
-Valgrind keeps track of all memory blocks issued in response to calls
-to malloc/calloc/realloc/new.  So when the program exits, it knows
-which blocks are still outstanding -- have not been returned, in other
-words.  Ideally, you want your program to have no blocks still in use
-at exit.  But many programs do.
-
-<p>For each such block, Valgrind scans the entire address space of the
-process, looking for pointers to the block.  One of three situations
-may result:
-
-<ul>
-  <li>A pointer to the start of the block is found.  This usually
-      indicates programming sloppiness; since the block is still
-      pointed at, the programmer could, at least in principle, have free'd
-      it before program exit.</li><br>
-      <p>
-
-  <li>A pointer to the interior of the block is found.  The pointer
-      might originally have pointed to the start and have been moved
-      along, or it might be entirely unrelated.  Valgrind deems such a
-      block as "dubious", that is, possibly leaked,
-      because it's unclear whether or
-      not a pointer to it still exists.</li><br>
-      <p>
-
-  <li>The worst outcome is that no pointer to the block can be found.
-      The block is classified as "leaked", because the
-      programmer could not possibly have free'd it at program exit,
-      since no pointer to it exists.  This might be a symptom of
-      having lost the pointer at some earlier point in the
-      program.</li>
-</ul>
-
-Valgrind reports summaries about leaked and dubious blocks.
-For each such block, it will also tell you where the block was
-allocated.  This should help you figure out why the pointer to it has
-been lost.  In general, you should attempt to ensure your programs do
-not have any leaked or dubious blocks at exit.
-
-<p>The precise area of memory in which Valgrind searches for pointers
-is: all naturally-aligned 4-byte words for which all A bits indicate
-addressibility and all V bits indicate that the stored value is
-actually valid.
-
-<p><hr width="100%">
-
-
-<a name="limits"></a>
-<h2>4&nbsp; Limitations</h2>
-
-The following list of limitations seems depressingly long.  However,
-most programs actually work fine.
-
-<p>Valgrind will run x86-GNU/Linux ELF dynamically linked binaries, on
-a kernel 2.2.X or 2.4.X system, subject to the following constraints:
-
-<ul>
-  <li>No MMX, SSE, SSE2, 3DNow instructions.  If the translator
-      encounters these, Valgrind will simply give up.  It may be
-      possible to add support for them at a later time. Intel added a
-      few instructions such as "cmov" to the integer instruction set
-      on Pentium and later processors, and these are supported.
-      Nevertheless it's safest to think of Valgrind as implementing
-      the 486 instruction set.</li><br>
-      <p>
-
-  <li>Pthreads support is improving, but there are still significant
-      limitations in that department.  See the section above on
-      Pthreads.  Note that your program must be dynamically linked
-      against <code>libpthread.so</code>, so that Valgrind can
-      substitute its own implementation at program startup time.  If
-      you're statically linked against it, things will fail
-      badly.</li><br>
-      <p>
-
-  <li>Valgrind assumes that the floating point registers are not used
-      as intermediaries in memory-to-memory copies, so it immediately
-      checks V bits in floating-point loads/stores.  If you want to
-      write code which copies around possibly-uninitialised values,
-      you must ensure these travel through the integer registers, not
-      the FPU.</li><br>
-      <p>
-
-  <li>If your program does its own memory management, rather than
-      using malloc/new/free/delete, it should still work, but
-      Valgrind's error checking won't be so effective.</li><br>
-      <p>
-
-  <li>Valgrind's signal simulation is not as robust as it could be.
-      Basic POSIX-compliant sigaction and sigprocmask functionality is
-      supplied, but it's conceivable that things could go badly awry
-      if you do weird things with signals.  Workaround: don't.
-      Programs that do non-POSIX signal tricks are in any case
-      inherently unportable, so should be avoided if
-      possible.</li><br>
-      <p>
-
-  <li>Programs which switch stacks are not well handled.  Valgrind
-      does have support for this, but I don't have great faith in it.
-      It's difficult -- there's no cast-iron way to decide whether a
-      large change in %esp is as a result of the program switching
-      stacks, or merely allocating a large object temporarily on the
-      current stack -- yet Valgrind needs to handle the two situations
-      differently.  1 May 02: this probably interacts badly with the
-      new pthread support.  I haven't checked properly.</li><br>
-      <p>
-
-  <li>x86 instructions, and system calls, have been implemented on
-      demand.  So it's possible, although unlikely, that a program
-      will fall over with a message to that effect.  If this happens,
-      please mail me ALL the details printed out, so I can try and
-      implement the missing feature.</li><br>
-      <p>
-
-  <li>x86 floating point works correctly, but floating-point code may
-      run even more slowly than integer code, due to my simplistic
-      approach to FPU emulation.</li><br>
-      <p>
-
-  <li>You can't Valgrind-ize statically linked binaries.  Valgrind
-      relies on the dynamic-link mechanism to gain control at
-      startup.</li><br>
-      <p>
-
-  <li>Memory consumption of your program is majorly increased whilst
-      running under Valgrind.  This is due to the large amount of
-      administrative information maintained behind the scenes.  Another
-      cause is that Valgrind dynamically translates the original
-      executable.  Translated, instrumented code is 14-16 times larger
-      than the original (!) so you can easily end up with 30+ MB of
-      translations when running (eg) a web browser.
-      </li>
-</ul>
-
-Programs which are known not to work are:
-
-<ul>
-  <li>emacs starts up but immediately concludes it is out of memory
-      and aborts.  Emacs has its own memory-management scheme, but I
-      don't understand why this should interact so badly with
-      Valgrind.  Emacs works fine if you build it to use the standard
-      malloc/free routines.</li><br>
-      <p>
-</ul>
-
-Known platform-specific limitations, as of release 1.0.0:
-
-<ul>
-  <li>On Red Hat 7.3, there have been reports of link errors (at
-      program start time) for threaded programs using
-      <code>__pthread_clock_gettime</code> and
-      <code>__pthread_clock_settime</code>.  This appears to be due to
-      <code>/lib/librt-2.2.5.so</code> needing them.  Unfortunately I
-      do not understand enough about this problem to fix it properly,
-      and I can't reproduce it on my test RedHat 7.3 system.  Please
-      mail me if you have more information / understanding.  </li><br>
-      <p>
-  <li>
-      1.0.0 now partially works on Red Hat 7.3.92 ("Limbo"
-      public beta).  However, don't expect a smooth ride.
-      Basically valgrind won't work as-is with any 
-      glibc-2.3 based system.  Limbo is just a little pre glibc-2.3 
-      and it just about works.  Limbo is also gcc-3.1 based and so
-      suffers from the problems in the following point.</li><br>
-      <p>
-  <li>
-      Inlining of string functions with gcc-3.1 or above causes a
-      large number of false reports of uninitialised value uses.  I
-      know what the problem is and roughly how to fix it, but I need
-      to devise a reasonably efficient fix.  Try to reduce the
-      optimisation level, or use <code>-fno-builtin-strlen</code> in
-      the meantime.  Or use an earlier gcc.</li><br>
-      <p>
-</ul>
-
-
-<p><hr width="100%">
-
-
-<a name="howitworks"></a>
-<h2>5&nbsp; How it works -- a rough overview</h2>
-Some gory details, for those with a passion for gory details.  You
-don't need to read this section if all you want to do is use Valgrind.
-
-<a name="startb"></a>
-<h3>5.1&nbsp; Getting started</h3>
-
-Valgrind is compiled into a shared object, valgrind.so.  The shell
-script valgrind sets the LD_PRELOAD environment variable to point to
-valgrind.so.  This causes the .so to be loaded as an extra library to
-any subsequently executed dynamically-linked ELF binary, viz, the
-program you want to debug.
-
-<p>The dynamic linker allows each .so in the process image to have an
-initialisation function which is run before main().  It also allows
-each .so to have a finalisation function run after main() exits.
-
-<p>When valgrind.so's initialisation function is called by the dynamic
-linker, the synthetic CPU starts up.  The real CPU remains locked
-in valgrind.so for the entire rest of the program, but the synthetic
-CPU returns from the initialisation function.  Startup of the program
-now continues as usual -- the dynamic linker calls all the other .so's
-initialisation routines, and eventually runs main().  This all runs on
-the synthetic CPU, not the real one, but the client program cannot
-tell the difference.
-
-<p>Eventually main() exits, so the synthetic CPU calls valgrind.so's
-finalisation function.  Valgrind detects this, and uses it as its cue
-to exit.  It prints summaries of all errors detected, possibly checks
-for memory leaks, and then exits the finalisation routine, but now on
-the real CPU.  The synthetic CPU has now lost control -- permanently
--- so the program exits back to the OS on the real CPU, just as it
-would have done anyway.
-
-<p>On entry, Valgrind switches stacks, so it runs on its own stack.
-On exit, it switches back.  This means that the client program
-continues to run on its own stack, so we can switch back and forth
-between running it on the simulated and real CPUs without difficulty.
-This was an important design decision, because it makes it easy (well,
-significantly less difficult) to debug the synthetic CPU.
-
-
-<a name="engine"></a>
-<h3>5.2&nbsp; The translation/instrumentation engine</h3>
-
-Valgrind does not directly run any of the original program's code.  Only
-instrumented translations are run.  Valgrind maintains a translation
-table, which allows it to find the translation quickly for any branch
-target (code address).  If no translation has yet been made, the
-translator - a just-in-time translator - is summoned.  This makes an
-instrumented translation, which is added to the collection of
-translations.  Subsequent jumps to that address will use this
-translation.
-
-<p>Valgrind no longer directly supports detection of self-modifying
-code.  Such checking is expensive, and in practice (fortunately)
-almost no applications need it.  However, to help people who are
-debugging dynamic code generation systems, there is a Client Request 
-(basically a macro you can put in your program) which directs Valgrind
-to discard translations in a given address range.  So Valgrind can
-still work in this situation provided the client tells it when
-code has become out-of-date and needs to be retranslated.
-
-<p>The JITter translates basic blocks -- blocks of straight-line-code
--- as single entities.  To minimise the considerable difficulties of
-dealing with the x86 instruction set, x86 instructions are first
-translated to a RISC-like intermediate code, similar to sparc code,
-but with an infinite number of virtual integer registers.  Initially
-each insn is translated separately, and there is no attempt at
-instrumentation.
-
-<p>The intermediate code is improved, mostly so as to try and cache
-the simulated machine's registers in the real machine's registers over
-several simulated instructions.  This is often very effective.  Also,
-we try to remove redundant updates of the simulated machine's
-condition-code register.
-
-<p>The intermediate code is then instrumented, giving more
-intermediate code.  There are a few extra intermediate-code operations
-to support instrumentation; it is all refreshingly simple.  After
-instrumentation there is a cleanup pass to remove redundant value
-checks.
-
-<p>This gives instrumented intermediate code which mentions arbitrary
-numbers of virtual registers.  A linear-scan register allocator is
-used to assign real registers and possibly generate spill code.  All
-of this is still phrased in terms of the intermediate code.  This
-machinery is inspired by the work of Reuben Thomas (MITE).
-
-<p>Then, and only then, is the final x86 code emitted.  The
-intermediate code is carefully designed so that x86 code can be
-generated from it without need for spare registers or other
-inconveniences.
-
-<p>The translations are managed using a traditional LRU-based caching
-scheme.  The translation cache has a default size of about 14MB.
-
-<a name="track"></a>
-
-<h3>5.3&nbsp; Tracking the status of memory</h3> Each byte in the
-process' address space has nine bits associated with it: one A bit and
-eight V bits.  The A and V bits for each byte are stored using a
-sparse array, which flexibly and efficiently covers arbitrary parts of
-the 32-bit address space without imposing significant space or
-performance overheads for the parts of the address space never
-visited.  The scheme used, and speedup hacks, are described in detail
-at the top of the source file vg_memory.c, so you should read that for
-the gory details.
-
-<a name="sys_calls"></a>
-
-<h3>5.4&nbsp; System calls</h3>
-All system calls are intercepted.  The memory status map is consulted
-before and updated after each call.  It's all rather tiresome.  See
-vg_syscall_mem.c for details.
-
-<a name="sys_signals"></a>
-
-<h3>5.5&nbsp; Signals</h3>
-All system calls to sigaction() and sigprocmask() are intercepted.  If
-the client program is trying to set a signal handler, Valgrind makes a
-note of the handler address and which signal it is for.  Valgrind then
-arranges for the same signal to be delivered to its own handler.
-
-<p>When such a signal arrives, Valgrind's own handler catches it, and
-notes the fact.  At a convenient safe point in execution, Valgrind
-builds a signal delivery frame on the client's stack and runs its
-handler.  If the handler longjmp()s, there is nothing more to be said.
-If the handler returns, Valgrind notices this, zaps the delivery
-frame, and carries on where it left off before delivering the signal.
-
-<p>The purpose of this nonsense is that setting signal handlers
-essentially amounts to giving callback addresses to the Linux kernel.
-We can't allow this to happen, because if it did, signal handlers
-would run on the real CPU, not the simulated one.  This means the
-checking machinery would not operate during the handler run, and,
-worse, memory permissions maps would not be updated, which could cause
-spurious error reports once the handler had returned.
-
-<p>An even worse thing would happen if the signal handler longjmp'd
-rather than returned: Valgrind would completely lose control of the
-client program.
-
-<p>Upshot: we can't allow the client to install signal handlers
-directly.  Instead, Valgrind must catch, on behalf of the client, any
-signal the client asks to catch, and must deliver it to the client on
-the simulated CPU, not the real one.  This involves considerable
-gruesome fakery; see vg_signals.c for details.
-<p>
-
-<hr width="100%">
-
-<a name="example"></a>
-<h2>6&nbsp; Example</h2>
-This is the log for a run of a small program. The program is in fact
-correct, and the reported error is as the result of a potentially serious
-code generation bug in GNU g++ (snapshot 20010527).
-<pre>
-sewardj@phoenix:~/newmat10$
-~/Valgrind-6/valgrind -v ./bogon 
-==25832== Valgrind 0.10, a memory error detector for x86 RedHat 7.1.
-==25832== Copyright (C) 2000-2001, and GNU GPL'd, by Julian Seward.
-==25832== Startup, with flags:
-==25832== --suppressions=/home/sewardj/Valgrind/redhat71.supp
-==25832== reading syms from /lib/ld-linux.so.2
-==25832== reading syms from /lib/libc.so.6
-==25832== reading syms from /mnt/pima/jrs/Inst/lib/libgcc_s.so.0
-==25832== reading syms from /lib/libm.so.6
-==25832== reading syms from /mnt/pima/jrs/Inst/lib/libstdc++.so.3
-==25832== reading syms from /home/sewardj/Valgrind/valgrind.so
-==25832== reading syms from /proc/self/exe
-==25832== loaded 5950 symbols, 142333 line number locations
-==25832== 
-==25832== Invalid read of size 4
-==25832==    at 0x8048724: _ZN10BandMatrix6ReSizeEiii (bogon.cpp:45)
-==25832==    by 0x80487AF: main (bogon.cpp:66)
-==25832==    by 0x40371E5E: __libc_start_main (libc-start.c:129)
-==25832==    by 0x80485D1: (within /home/sewardj/newmat10/bogon)
-==25832==    Address 0xBFFFF74C is not stack'd, malloc'd or free'd
-==25832==
-==25832== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
-==25832== malloc/free: in use at exit: 0 bytes in 0 blocks.
-==25832== malloc/free: 0 allocs, 0 frees, 0 bytes allocated.
-==25832== For a detailed leak analysis, rerun with: --leak-check=yes
-==25832==
-==25832== exiting, did 1881 basic blocks, 0 misses.
-==25832== 223 translations, 3626 bytes in, 56801 bytes out.
-</pre>
-<p>The GCC folks fixed this about a week before gcc-3.0 shipped.
-<hr width="100%">
-<p>
-
-
-
-<a name="cache"></a>
-<h2>7&nbsp; Cache profiling</h2>
-As well as memory debugging, Valgrind also allows you to do cache simulations
-and annotate your source line-by-line with the number of cache misses.  In
-particular, it records:
-<ul>
-  <li>L1 instruction cache reads and misses;
-  <li>L1 data cache reads and read misses, writes and write misses;
-  <li>L2 unified cache reads and read misses, writes and write misses.
-</ul>
-On a modern x86 machine, an L1 miss will typically cost around 10 cycles,
-and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be
-very useful for improving the performance of your program.<p>
-
-Also, since one instruction cache read is performed per instruction executed,
-you can find out how many instructions are executed per line, which can be
-useful for traditional profiling and test coverage.<p>
-
-Any feedback, bug-fixes, suggestions, etc, welcome.
-
-
-<h3>7.1&nbsp; Overview</h3>
-First off, as for normal Valgrind use, you probably want to turn on debugging
-info (the <code>-g</code> flag).  But by contrast with normal Valgrind use, you
-probably <b>do</b> want to turn optimisation on, since you should profile your
-program as it will be normally run.
-
-The two steps are:
-<ol>
-  <li>Run your program with <code>cachegrind</code> in front of the
-      normal command line invocation.  When the program finishes,
-      Valgrind will print summary cache statistics. It also collects
-      line-by-line information in a file <code>cachegrind.out</code>.
-      <p>
-      This step should be done every time you want to collect
-      information about a new program, a changed program, or about the
-      same program with different input.
-  </li>
-  <p>
-  <li>Generate a function-by-function summary, and possibly annotate
-      source files with 'vg_annotate'. Source files to annotate can be
-      specified manually on the command line, or
-      "interesting" source files can be annotated automatically with
-      the <code>--auto=yes</code> option.  You can annotate C/C++
-      files or assembly language files equally easily.
-      <p>
-      This step can be performed as many times as you like for each
-      Step 1.  You may want to do multiple annotations showing
-      different information each time.<p>
-  </li>
-</ol>
-
-The steps are described in detail in the following sections.<p>
-
-
-<h3>7.2&nbsp; Cache simulation specifics</h3>
-
-Cachegrind uses a simulation for a machine with a split L1 cache and a unified
-L2 cache.  This configuration is used for all (modern) x86-based machines we
-are aware of.  Old Cyrix CPUs had a unified I and D L1 cache, but they are
-ancient history now.<p>
-
-The more specific characteristics of the simulation are as follows.
-
-<ul>
-  <li>Write-allocate: when a write miss occurs, the block written to
-      is brought into the D1 cache.  Most modern caches have this
-      property.</li><p>
-
-  <li>Bit-selection hash function: the line(s) in the cache to which a
-      memory block maps is chosen by the middle bits M--(M+N-1) of the
-      byte address, where:
-      <ul>
-        <li>&nbsp;line size = 2^M bytes&nbsp;</li>
-        <li>(cache size / line size) = 2^N bytes</li>
-      </ul> </li><p>
-
-  <li>Inclusive L2 cache: the L2 cache replicates all the entries of
-      the L1 cache.  This is standard on Pentium chips, but AMD
-      Athlons use an exclusive L2 cache that only holds blocks evicted
-      from L1.  Ditto AMD Durons and most modern VIAs.</li><p>
-</ul>
-
-The cache configuration simulated (cache size, associativity and line size) is
-determined automagically using the CPUID instruction.  If you have an old
-machine that (a) doesn't support the CPUID instruction, or (b) supports it in
-an early incarnation that doesn't give any cache information, then Cachegrind
-will fall back to using a default configuration (that of a model 3/4 Athlon).
-Cachegrind will tell you if this happens.  You can manually specify one, two or
-all three levels (I1/D1/L2) of the cache from the command line using the
-<code>--I1</code>, <code>--D1</code> and <code>--L2</code> options.<p>
-
-Other noteworthy behaviour:
-
-<ul>
-  <li>References that straddle two cache lines are treated as follows:
-  <ul>
-    <li>If both blocks hit --&gt; counted as one hit</li>
-    <li>If one block hits, the other misses --&gt; counted as one miss</li>
-    <li>If both blocks miss --&gt; counted as one miss (not two)</li>
-  </ul><p></li>
-
-  <li>Instructions that modify a memory location (eg. <code>inc</code> and
-      <code>dec</code>) are counted as doing just a read, ie. a single data
-      reference.  This may seem strange, but since the write can never cause a
-      miss (the read guarantees the block is in the cache) it's not very
-      interesting.<p>
-
-      Thus it measures not the number of times the data cache is accessed, but
-      the number of times a data cache miss could occur.<p>
-      </li>
-</ul>
-
-If you are interested in simulating a cache with different properties, it is
-not particularly hard to write your own cache simulator, or to modify the
-existing ones in <code>vg_cachesim_I1.c</code>, <code>vg_cachesim_D1.c</code>,
-<code>vg_cachesim_L2.c</code> and <code>vg_cachesim_gen.c</code>.  We'd be
-interested to hear from anyone who does.
-
-<a name="profile"></a>
-<h3>7.3&nbsp; Profiling programs</h3>
-
-Cache profiling is enabled by using the <code>--cachesim=yes</code>
-option to the <code>valgrind</code> shell script.  Alternatively, it
-is probably more convenient to use the <code>cachegrind</code> script.
-Either way automatically turns off Valgrind's memory checking functions,
-since the cache simulation is slow enough already, and you probably
-don't want to do both at once.
-<p>
-To gather cache profiling information about the program <code>ls
--l</code>, type:
-
-<blockquote><code>cachegrind ls -l</code></blockquote>
-
-The program will execute (slowly).  Upon completion, summary statistics
-that look like this will be printed:
-
-<pre>
-==31751== I   refs:      27,742,716
-==31751== I1  misses:           276
-==31751== L2  misses:           275
-==31751== I1  miss rate:        0.0%
-==31751== L2i miss rate:        0.0%
-==31751== 
-==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
-==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
-==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
-==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
-==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
-==31751== 
-==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
-==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
-</pre>
-
-Cache accesses for instruction fetches are summarised first, giving the
-number of fetches made (this is the number of instructions executed, which
-can be useful to know in its own right), the number of I1 misses, and the
-number of L2 instruction (<code>L2i</code>) misses.<p>
-
-Cache accesses for data follow. The information is similar to that of the
-instruction fetches, except that the values are also shown split between reads
-and writes (note each row's <code>rd</code> and <code>wr</code> values add up
-to the row's total).<p>
-
-Combined instruction and data figures for the L2 cache follow that.<p>
-
-
-<h3>7.4&nbsp; Output file</h3>
-
-As well as printing summary information, Cachegrind also writes
-line-by-line cache profiling information to a file named
-<code>cachegrind.out</code>.  This file is human-readable, but is best
-interpreted by the accompanying program <code>vg_annotate</code>,
-described in the next section.
-<p>
-Things to note about the <code>cachegrind.out</code> file:
-<ul>
-  <li>It is written every time <code>valgrind --cachesim=yes</code> or
-      <code>cachegrind</code> is run, and will overwrite any existing
-      <code>cachegrind.out</code> in the current directory.</li>
-  <p>
-  <li>It can be huge: <code>ls -l</code> generates a file of about
-      350KB.  Browsing a few files and web pages with a Konqueror
-      built with full debugging information generates a file
-      of around 15 MB.</li>
-</ul>
-
-<a name="profileflags"></a>
-<h3>7.5&nbsp; Cachegrind options</h3>
-Cachegrind accepts all the options that Valgrind does, although some of them
-(ones related to memory checking) don't do anything when cache profiling.<p>
-
-The interesting cache-simulation specific options are:
-
-<ul>
-  <li><code>--I1=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><br>
-      <code>--D1=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><br> 
-      <code>--L2=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><p> 
-      [default: uses CPUID for automagic cache configuration]<p>
-
-      Manually specifies the I1/D1/L2 cache configuration, where
-      <code>size</code> and <code>line_size</code> are measured in bytes.  The
-      three items must be comma-separated, but with no spaces, eg:
-
-      <blockquote><code>cachegrind --I1=65536,2,64</code></blockquote>
-
-      You can specify one, two or three of the I1/D1/L2 caches.  Any level not
-      manually specified will be simulated using the configuration found in the
-      normal way (via the CPUID instruction, or failing that, via defaults).
-</ul>
-
-  
-<a name="annotate"></a>
-<h3>7.6&nbsp; Annotating C/C++ programs</h3>
-
-Before using <code>vg_annotate</code>, it is worth widening your
-window to be at least 120-characters wide if possible, as the output
-lines can be quite long.
-<p>
-To get a function-by-function summary, run <code>vg_annotate</code> in
-a directory containing a <code>cachegrind.out</code> file.  The output
-looks like this:
-
-<pre>
---------------------------------------------------------------------------------
-I1 cache:              65536 B, 64 B, 2-way associative
-D1 cache:              65536 B, 64 B, 2-way associative
-L2 cache:              262144 B, 64 B, 8-way associative
-Command:               concord vg_to_ucode.c
-Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Threshold:             99%
-Chosen for annotation:
-Auto-annotation:       on
-
---------------------------------------------------------------------------------
-Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
---------------------------------------------------------------------------------
-27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
-
---------------------------------------------------------------------------------
-Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
---------------------------------------------------------------------------------
-8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
-5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
-2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
-2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
-2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
-1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
-  897,991   51   51   897,831    95    30        62      1      1  ???:???
-  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
-  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
-  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
-  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
-  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
-  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
-  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
-  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
-  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
-   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
-   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
-</pre>
-
-First up is a summary of the annotation options:
-                    
-<ul>
-  <li>I1 cache, D1 cache, L2 cache: cache configuration.  So you know the
-      configuration with which these results were obtained.</li><p>
-
-  <li>Command: the command line invocation of the program under
-      examination.</li><p>
-
-  <li>Events recorded: event abbreviations are:<p>
-  <ul>
-    <li><code>Ir  </code>:  I cache reads (ie. instructions executed)</li>
-    <li><code>I1mr</code>: I1 cache read misses</li>
-    <li><code>I2mr</code>: L2 cache instruction read misses</li>
-    <li><code>Dr  </code>:  D cache reads (ie. memory reads)</li>
-    <li><code>D1mr</code>: D1 cache read misses</li>
-    <li><code>D2mr</code>: L2 cache data read misses</li>
-    <li><code>Dw  </code>:  D cache writes (ie. memory writes)</li>
-    <li><code>D1mw</code>: D1 cache write misses</li>
-    <li><code>D2mw</code>: L2 cache data write misses</li>
-  </ul><p>
-      Note that D1 total misses is given by <code>D1mr</code> +
-      <code>D1mw</code>, and that L2 total misses is given by
-      <code>I2mr</code> + <code>D2mr</code> + <code>D2mw</code>.</li><p>
-
-  <li>Events shown: the events shown (a subset of events gathered).  This can
-      be adjusted with the <code>--show</code> option.</li><p>
-
-  <li>Event sort order: the sort order in which functions are shown.  For
-      example, in this case the functions are sorted from highest
-      <code>Ir</code> counts to lowest.  If two functions have identical
-      <code>Ir</code> counts, they will then be sorted by <code>I1mr</code>
-      counts, and so on.  This order can be adjusted with the
-      <code>--sort</code> option.<p>
-
-      Note that this dictates the order the functions appear.  It is <b>not</b>
-      the order in which the columns appear;  that is dictated by the "events
-      shown" line (and can be changed with the <code>--show</code> option).
-      </li><p>
-
-  <li>Threshold: <code>vg_annotate</code> by default omits functions
-      that cause very low numbers of misses to avoid drowning you in
-      information.  In this case, vg_annotate shows summaries for the
-      functions that account for 99% of the <code>Ir</code> counts;
-      <code>Ir</code> is chosen as the threshold event since it is the
-      primary sort event.  The threshold can be adjusted with the
-      <code>--threshold</code> option.</li><p>
-
-  <li>Chosen for annotation: names of files specified manually for annotation; 
-      in this case none.</li><p>
-
-  <li>Auto-annotation: whether auto-annotation was requested via the 
-      <code>--auto=yes</code> option. In this case yes.</li><p>
-</ul>
-
-Then follow summary statistics for the whole program. These are similar
-to the summary provided when running <code>cachegrind</code>.<p>
-  
-Then follow function-by-function statistics. Each function is
-identified by a <code>file_name:function_name</code> pair. If a column
-contains only a dot it means the function never performs
-that event (eg. the third row shows that <code>strcmp()</code>
-contains no instructions that write to memory). The name
-<code>???</code> is used if the file name and/or function name
-could not be determined from debugging information. If most of the
-entries have the form <code>???:???</code> the program probably wasn't
-compiled with <code>-g</code>.  If any code was invalidated (either due to
-self-modifying code or unloading of shared objects) its counts are aggregated
-into a single cost centre written as <code>(discarded):(discarded)</code>.<p>
-
-It is worth noting that functions will come from three types of source files:
-<ol>
-  <li> From the profiled program (<code>concord.c</code> in this example).</li>
-  <li>From libraries (eg. <code>getc.c</code>)</li>
-  <li>From Valgrind's implementation of some libc functions (eg.
-      <code>vg_clientmalloc.c:malloc</code>).  These are recognisable because
-      the filename begins with <code>vg_</code>, and is probably one of
-      <code>vg_main.c</code>, <code>vg_clientmalloc.c</code> or
-      <code>vg_mylibc.c</code>.
-  </li>
-</ol>
-
-There are two ways to annotate source files -- by choosing them
-manually, or with the <code>--auto=yes</code> option. To do it
-manually, just specify the filenames as arguments to
-<code>vg_annotate</code>. For example, the output from running
-<code>vg_annotate concord.c</code> for our example produces the same
-output as above followed by an annotated version of
-<code>concord.c</code>, a section of which looks like:
-
-<pre>
---------------------------------------------------------------------------------
--- User-annotated source: concord.c
---------------------------------------------------------------------------------
-Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
-
-[snip]
-
-        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
-        3    1    1       .     .     .       1      0      0  {
-        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
-        .    .    .       .     .     .       .      .      .      Word_Info *data;
-        1    0    0       .     .     .       1      1      1      int line = 1, i;
-        .    .    .       .     .     .       .      .      .
-        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
-        .    .    .       .     .     .       .      .      .
-    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
-    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
-        .    .    .       .     .     .       .      .      .
-        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
-        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
-        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
-        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
-        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
-        .    .    .       .     .     .       .      .      .      }
-        .    .    .       .     .     .       .      .      .
-  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
-  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->word, data->line, table);
-        .    .    .       .     .     .       .      .      .
-        4    0    0       1     0     0       2      0      0      free(data);
-        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
-        3    0    0       2     0     0       .      .      .  }
-</pre>
-
-(Although column widths are automatically minimised, a wide terminal is clearly
-useful.)<p>
-  
-Each source file is clearly marked (<code>User-annotated source</code>) as
-having been chosen manually for annotation.  If the file was found in one of
-the directories specified with the <code>-I</code>/<code>--include</code>
-option, the directory and file are both given.<p>
-
-Each line is annotated with its event counts.  Events not applicable for a line
-are represented by a `.';  this is useful for distinguishing between an event
-which cannot happen, and one which can but did not.<p> 
-
-Sometimes only a small section of a source file is executed.  To minimise
-uninteresting output, Valgrind only shows annotated lines and lines within a
-small distance of annotated lines.  Gaps are marked with the line numbers so
-you know which part of a file the shown code comes from, eg:
-
-<pre>
-(figures and code for line 704)
--- line 704 ----------------------------------------
--- line 878 ----------------------------------------
-(figures and code for line 878)
-</pre>
-
-The amount of context to show around annotated lines is controlled by the
-<code>--context</code> option.<p>
-
-To get automatic annotation, run <code>vg_annotate --auto=yes</code>.
-vg_annotate will automatically annotate every source file it can find that is
-mentioned in the function-by-function summary.  Therefore, the files chosen for
-auto-annotation  are affected by the <code>--sort</code> and
-<code>--threshold</code> options.  Each source file is clearly marked
-(<code>Auto-annotated source</code>) as being chosen automatically.  Any files
-that could not be found are mentioned at the end of the output, eg:    
-
-<pre>
---------------------------------------------------------------------------------
-The following files chosen for auto-annotation could not be found:
---------------------------------------------------------------------------------
-  getc.c
-  ctype.c
-  ../sysdeps/generic/lockfile.c
-</pre>
-
-This is quite common for library files, since libraries are usually compiled
-with debugging information, but the source files are often not present on a
-system.  If a file is chosen for annotation <b>both</b> manually and
-automatically, it is marked as <code>User-annotated source</code>.
-
-Use the <code>-I/--include</code> option to tell Valgrind where to look for
-source files if the filenames found from the debugging information aren't
-specific enough.
-
-Beware that vg_annotate can take some time to digest large
-<code>cachegrind.out</code> files, eg. 30 seconds or more.  Also beware that
-auto-annotation can produce a lot of output if your program is large!
-
-
-<h3>7.7&nbsp; Annotating assembler programs</h3>
-
-Valgrind can annotate assembler programs too, or annotate the
-assembler generated for your C program.  Sometimes this is useful for
-understanding what is really happening when an interesting line of C
-code is translated into multiple instructions.<p>
-
-To do this, you just need to assemble your <code>.s</code> files with
-assembler-level debug information.  gcc doesn't do this, but you can
-use the GNU assembler with the <code>--gstabs</code> option to
-generate object files with this information, eg:
-
-<blockquote><code>as --gstabs foo.s</code></blockquote>
-
-You can then profile and annotate source files in the same way as for C/C++
-programs.
-
-
-<h3>7.8&nbsp; <code>vg_annotate</code> options</h3>
-<ul>
-  <li><code>-h, --help</code></li><p>
-  <li><code>-v, --version</code><p>
-
-      Help and version, as usual.</li>
-
-  <li><code>--sort=A,B,C</code> [default: order in 
-      <code>cachegrind.out</code>]<p>
-      Specifies the events upon which the sorting of the function-by-function
-      entries will be based.  Useful if you want to concentrate on eg. I cache
-      misses (<code>--sort=I1mr,I2mr</code>), or D cache misses
-      (<code>--sort=D1mr,D2mr</code>), or L2 misses
-      (<code>--sort=D2mr,I2mr</code>).</li><p>
-
-  <li><code>--show=A,B,C</code> [default: all, using order in
-      <code>cachegrind.out</code>]<p>
-      Specifies which events to show (and the column order). Default is to use
-      all present in the <code>cachegrind.out</code> file (and use the order in
-      the file).</li><p>
-
-  <li><code>--threshold=X</code> [default: 99%] <p>
-      Sets the threshold for the function-by-function summary.  Functions are
-      shown that account for more than X% of the primary sort event.  If
-      auto-annotating, also affects which files are annotated.
-      
-      Note: thresholds can be set for more than one of the events by appending
-      a colon and a number to any event given in the <code>--sort</code> option
-      (no spaces, though).  E.g. if you want to see the functions that cover
-      99% of L2 read misses and 99% of L2 write misses, use this option:
-      
-      <blockquote><code>--sort=D2mr:99,D2mw:99</code></blockquote>
-      </li><p>
-
-  <li><code>--auto=no</code> [default]<br>
-      <code>--auto=yes</code> <p>
-      When enabled, automatically annotates every file that is mentioned in the
-      function-by-function summary that can be found.  Also gives a list of
-      those that couldn't be found.
-
-  <li><code>--context=N</code> [default: 8]<p>
-      Print N lines of context before and after each annotated line.  Avoids
-      printing large sections of source files that were not executed.  Use a 
-      large number (eg. 10,000) to show all source lines.
-      </li><p>
-
-  <li><code>-I=&lt;dir&gt;, --include=&lt;dir&gt;</code> 
-      [default: empty string]<p>
-      Adds a directory to the list in which to search for files.  Multiple
-      -I/--include options can be given to add multiple directories.
-</ul>
-  
-
-<h3>7.9&nbsp; Warnings</h3>
-There are a couple of situations in which vg_annotate issues warnings.
-
-<ul>
-  <li>If a source file is more recent than the <code>cachegrind.out</code>
-      file.  This is because the information in <code>cachegrind.out</code> is
-      only recorded with line numbers, so if the line numbers change at all in
-      the source (eg. lines added, deleted, swapped), any annotations will be 
-      incorrect.<p>
-
-  <li>If information is recorded about line numbers past the end of a file.
-      This can be caused by the above problem, ie. shortening the source file
-      while using an old <code>cachegrind.out</code> file.  If this happens,
-      the figures for the bogus lines are printed anyway (clearly marked as
-      bogus) in case they are important.</li><p>
-</ul>
-
-
-<h3>7.10&nbsp; Things to watch out for</h3>
-Some odd things that can occur during annotation:
-
-<ul>
-  <li>If annotating at the assembler level, you might see something like this:
-
-      <pre>
-      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
-      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
-      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
-      .    .    .  .    .    .  .    .    .          .align 4,0x90
-      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
-      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
-      </pre>
-
-      How can the third instruction be executed twice when the others are
-      executed only once?  As it turns out, it isn't.  Here's a dump of the
-      executable, using <code>objdump -d</code>:
-
-      <pre>
-      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
-      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
-      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
-      8048f32:       89 f6                   mov    %esi,%esi
-      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
-      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
-      </pre>
-
-      Notice the extra <code>mov %esi,%esi</code> instruction.  Where did this
-      come from?  The GNU assembler inserted it to serve as the two bytes of
-      padding needed to align the <code>movl $.LnrB,%eax</code> instruction on
-      a four-byte boundary, but pretended it didn't exist when adding debug
-      information.  Thus when Valgrind reads the debug info it thinks that the
-      <code>movl $0x1,0xffffffec(%ebp)</code> instruction covers the address
-      range 0x8048f2b--0x8048f33 by itself, and attributes the counts for the
-      <code>mov %esi,%esi</code> to it.<p>
-  </li>
-
-  <li>Inlined functions can cause strange results in the function-by-function
-      summary.  If a function <code>inline_me()</code> is defined in
-      <code>foo.h</code> and inlined in the functions <code>f1()</code>,
-      <code>f2()</code> and <code>f3()</code> in <code>bar.c</code>, there will
-      not be a <code>foo.h:inline_me()</code> function entry.  Instead, there
-      will be separate function entries for each inlining site, ie.
-      <code>foo.h:f1()</code>, <code>foo.h:f2()</code> and
-      <code>foo.h:f3()</code>.  To find the total counts for
-      <code>foo.h:inline_me()</code>, add up the counts from each entry.<p>
-
-      The reason for this is that although the debug info output by gcc
-      indicates the switch from <code>bar.c</code> to <code>foo.h</code>, it
-      doesn't indicate the name of the function in <code>foo.h</code>, so
-      Valgrind keeps using the old one.<p>
-
-  <li>Sometimes, the same filename might be represented with a relative name
-      and with an absolute name in different parts of the debug info, eg:
-      <code>/home/user/proj/proj.h</code> and <code>../proj.h</code>.  In this
-      case, if you use auto-annotation, the file will be annotated twice with
-      the counts split between the two.<p>
-  </li>
-
-  <li>Files with more than 65,535 lines cause difficulties for the stabs debug
-      info reader.  This is because the line number in the <code>struct
-      nlist</code> defined in <code>a.out.h</code> under Linux is only a 16-bit
-      value.  Valgrind can handle some files with more than 65,535 lines
-      correctly by making some guesses to identify line number overflows.  But
-      some cases are beyond it, in which case you'll get a warning message
-      explaining that annotations for the file might be incorrect.<p>
-  </li>
-
-  <li>If you compile some files with <code>-g</code> and some without, some
-      events that take place in a file without debug info could be attributed
-      to the last line of a file with debug info (whichever one gets placed
-      before the non-debug-info file in the executable).<p>
-  </li>
-</ul>
-
-This list looks long, but these cases should be fairly rare.<p>
-
-Note: stabs is not an easy format to read.  If you come across bizarre
-annotations that look like they might be caused by a bug in the stabs reader,
-please let us know.<p>
-
-
-<h3>7.11&nbsp; Accuracy</h3>
-Valgrind's cache profiling has a number of shortcomings:
-
-<ul>
-  <li>It doesn't account for kernel activity -- the effect of system calls on
-      the cache contents is ignored.</li><p>
-
-  <li>It doesn't account for other process activity (although this is probably
-      desirable when considering a single program).</li><p>
-
-  <li>It doesn't account for virtual-to-physical address mappings;  hence the
-      entire simulation is not a true representation of what's happening in the
-      cache.</li><p>
-
-  <li>It doesn't account for cache misses not visible at the instruction level,
-      eg. those arising from TLB misses, or speculative execution.</li><p>
-
-  <li>Valgrind's custom <code>malloc()</code> will allocate memory in different
-      ways to the standard <code>malloc()</code>, which could warp the results.
-      </li><p>
-
-  <li>Valgrind's custom threads implementation will schedule threads
-      differently to the standard one.  This too could warp the results for
-      threaded programs.
-      </li><p>
-
-  <li>The instructions <code>bts</code>, <code>btr</code> and <code>btc</code>
-      will incorrectly be counted as doing a data read if both the arguments
-      are registers, eg:
-
-      <blockquote><code>btsl %eax, %edx</code></blockquote>
-
-      This should only happen rarely.
-</ul>
-
-Another thing worth noting is that results are very sensitive.  Changing the
-size of the <code>valgrind.so</code> file, the size of the program being
-profiled, or even the length of its name can perturb the results.  Variations
-will be small, but don't expect perfectly repeatable results if your program
-changes at all.<p>
-
-While these factors mean you shouldn't trust the results to be super-accurate,
-hopefully they should be close enough to be useful.<p>
-
-
-<h3>7.12&nbsp; Todo</h3>
-<ul>
-  <li>Program start-up/shut-down calls a lot of functions that aren't
-      interesting and just complicate the output.  Would be nice to exclude
-      these somehow.</li>
-  <p>
-</ul> 
-<hr width="100%">
-</body>
-</html>
-
diff --git a/cachegrind/docs/nav.html b/cachegrind/docs/nav.html
deleted file mode 100644
index ad920ad443..0000000000
--- a/cachegrind/docs/nav.html
+++ /dev/null
@@ -1,72 +0,0 @@
-<html>
-  <head>
-    <title>Valgrind</title>
-    <base target="main">
-    <style type="text/css">
-
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-  </head>
-
-  <body>
-    <br>
-    <a href="manual.html#contents"><b>Contents of this manual</b></a><br>
-    <a href="manual.html#intro">1 Introduction</a><br>
-    <a href="manual.html#whatfor">1.1 What Valgrind is for</a><br>
-    <a href="manual.html#whatdoes">1.2 What it does with
-       your program</a>
-    <p>
-    <a href="manual.html#howtouse">2 <b>How to use it, and how to
-       make sense of the results</b></a><br>
-    <a href="manual.html#starta">2.1 Getting started</a><br>
-    <a href="manual.html#comment">2.2 The commentary</a><br>
-    <a href="manual.html#report">2.3 Reporting of errors</a><br>
-    <a href="manual.html#suppress">2.4 Suppressing errors</a><br>
-    <a href="manual.html#flags">2.5 Command-line flags</a><br>
-    <a href="manual.html#errormsgs">2.6 Explanation of error messages</a><br>
-    <a href="manual.html#suppfiles">2.7 Writing suppressions files</a><br>
-    <a href="manual.html#clientreq">2.8 The Client Request mechanism</a><br>
-    <a href="manual.html#pthreads">2.9 Support for POSIX pthreads</a><br>
-    <a href="manual.html#install">2.10 Building and installing</a><br>
-    <a href="manual.html#problems">2.11 If you have problems</a>
-    <p>
-    <a href="manual.html#machine">3 <b>Details of the checking machinery</b></a><br>
-    <a href="manual.html#vvalue">3.1 Valid-value (V) bits</a><br>
-    <a href="manual.html#vaddress">3.2 Valid-address (A) bits</a><br>
-    <a href="manual.html#together">3.3 Putting it all together</a><br>
-    <a href="manual.html#signals">3.4 Signals</a><br>
-    <a href="manual.html#leaks">3.5 Memory leak detection</a>
-    <p>
-    <a href="manual.html#limits">4 <b>Limitations</b></a><br>
-    <p>
-    <a href="manual.html#howitworks">5 <b>How it works -- a rough overview</b></a><br>
-    <a href="manual.html#startb">5.1 Getting started</a><br>
-    <a href="manual.html#engine">5.2 The translation/instrumentation engine</a><br>
-    <a href="manual.html#track">5.3 Tracking the status of memory</a><br>
-    <a href="manual.html#sys_calls">5.4 System calls</a><br>
-    <a href="manual.html#sys_signals">5.5 Signals</a>
-    <p>
-    <a href="manual.html#example">6 <b>An example</b></a><br>
-    <p>
-    <a href="manual.html#cache">7 <b>Cache profiling</b></a><br>
-    <p>
-    <a href="techdocs.html">8 <b>The design and implementation of Valgrind</b></a><br>
-
-</body>
-</html>
diff --git a/cachegrind/docs/techdocs.html b/cachegrind/docs/techdocs.html
deleted file mode 100644
index 2e1cc8b7e9..0000000000
--- a/cachegrind/docs/techdocs.html
+++ /dev/null
@@ -1,2524 +0,0 @@
-<html>
-  <head>
-    <style type="text/css">
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-    <title>The design and implementation of Valgrind</title>
-  </head>
-
-<body bgcolor="#ffffff">
-
-<a name="title">&nbsp;</a>
-<h1 align=center>The design and implementation of Valgrind</h1>
-
-<center>
-Detailed technical notes for hackers, maintainers and the
-overly-curious<br>
-These notes pertain to snapshot 20020306<br>
-<p>
-<a href="mailto:jseward@acm.org">jseward@acm.org</a><br>
-<a href="http://developer.kde.org/~sewardj">http://developer.kde.org/~sewardj</a><br>
-Copyright &copy; 2000-2002 Julian Seward
-<p>
-Valgrind is licensed under the GNU General Public License, 
-version 2<br>
-An open-source tool for finding memory-management problems in
-x86 GNU/Linux executables.
-</center>
-
-<p>
-
-
-
-
-<hr width="100%">
-
-<h2>Introduction</h2>
-
-This document contains a detailed, highly-technical description of the
-internals of Valgrind.  This is not the user manual; if you are an
-end-user of Valgrind, you do not want to read this.  Conversely, if
-you really are a hacker-type and want to know how it works, I assume
-that you have read the user manual thoroughly.
-<p>
-You may need to read this document several times, and carefully.  Some
-important things, I only say once.
-
-
-<h3>History</h3>
-
-Valgrind came into public view in late Feb 2002.  However, it has been
-under contemplation for a very long time, perhaps seriously for about
-five years.  Somewhat over two years ago, I started working on the x86
-code generator for the Glasgow Haskell Compiler
-(http://www.haskell.org/ghc), gaining familiarity with x86 internals
-on the way.  I then did Cacheprof (http://www.cacheprof.org), gaining
-further x86 experience.  Some time around Feb 2000 I started
-experimenting with a user-space x86 interpreter for x86-Linux.  This
-worked, but it was clear that a JIT-based scheme would be necessary to
-give reasonable performance for Valgrind.  Design work for the JITter
-started in earnest in Oct 2000, and by early 2001 I had an x86-to-x86
-dynamic translator which could run quite large programs.  This
-translator was in a sense pointless, since it did not do any
-instrumentation or checking.
-
-<p>
-Most of the rest of 2001 was taken up designing and implementing the
-instrumentation scheme.  The main difficulty, which consumed a lot
-of effort, was to design a scheme which did not generate large numbers
-of false uninitialised-value warnings.  By late 2001 a satisfactory
-scheme had been arrived at, and I started to test it on ever-larger
-programs, with an eventual eye to making it work well enough so that
-it was helpful to folks debugging the upcoming version 3 of KDE.  I've
-used KDE since before version 1.0, and wanted Valgrind to be an
-indirect contribution to the KDE 3 development effort.  At the start of
-Feb 02 the kde-core-devel crew started using it, and gave a huge
-amount of helpful feedback and patches in the space of three weeks.
-Snapshot 20020306 is the result.
-
-<p>
-In the best Unix tradition, or perhaps in the spirit of Fred Brooks'
-depressing-but-completely-accurate epitaph "build one to throw away;
-you will anyway", much of Valgrind is a second or third rendition of
-the initial idea.  The instrumentation machinery
-(<code>vg_translate.c</code>, <code>vg_memory.c</code>) and core CPU
-simulation (<code>vg_to_ucode.c</code>, <code>vg_from_ucode.c</code>)
-have had three redesigns and rewrites; the register allocator,
-low-level memory manager (<code>vg_malloc2.c</code>) and symbol table
-reader (<code>vg_symtab2.c</code>) are on the second rewrite.  In a
-sense, this document serves to record some of the knowledge gained as
-a result.
-
-
-<h3>Design overview</h3>
-
-Valgrind is compiled into a Linux shared object,
-<code>valgrind.so</code>, and also a dummy one,
-<code>valgrinq.so</code>, of which more later.  The
-<code>valgrind</code> shell script adds <code>valgrind.so</code> to
-the <code>LD_PRELOAD</code> list of extra libraries to be
-loaded with any dynamically linked library.  This is a standard trick,
-one which I assume the <code>LD_PRELOAD</code> mechanism was developed
-to support.
-
-<p>
-<code>valgrind.so</code>
-is linked with the <code>-z initfirst</code> flag, which requests that
-its initialisation code is run before that of any other object in the
-executable image.  When this happens, valgrind gains control.  The
-real CPU becomes "trapped" in <code>valgrind.so</code> and the 
-translations it generates.  The synthetic CPU provided by Valgrind
-does, however, return from this initialisation function.  So the 
-normal startup actions, orchestrated by the dynamic linker
-<code>ld.so</code>, continue as usual, except on the synthetic CPU,
-not the real one.  Eventually <code>main</code> is run and returns,
-and then the finalisation code of the shared objects is run,
-presumably in inverse order to which they were initialised.  Remember,
-this is still all happening on the simulated CPU.  Eventually
-<code>valgrind.so</code>'s own finalisation code is called.  It spots
-this event, shuts down the simulated CPU, prints any error summaries
-and/or does leak detection, and returns from the initialisation code
-on the real CPU.  At this point, in effect the real and synthetic CPUs
-have merged back into one, Valgrind has lost control of the program,
-and the program finally <code>exit()s</code> back to the kernel in the
-usual way.
-
-<p>
-The normal course of activity, once Valgrind has started up, is as
-follows.  Valgrind never runs any part of your program (usually
-referred to as the "client"), not a single byte of it, directly.
-Instead it uses function <code>VG_(translate)</code> to translate
-basic blocks (BBs, straight-line sequences of code) into instrumented
-translations, and those are run instead.  The translations are stored
-in the translation cache (TC), <code>vg_tc</code>, with the
-translation table (TT), <code>vg_tt</code> supplying the
-original-to-translation code address mapping.  Auxiliary array
-<code>VG_(tt_fast)</code> is used as a direct-map cache for fast
-lookups in TT; it usually achieves a hit rate of around 98% and
-facilitates an orig-to-trans lookup in 4 x86 insns, which is not bad.
-
-<p>
-Function <code>VG_(dispatch)</code> in <code>vg_dispatch.S</code> is
-the heart of the JIT dispatcher.  Once a translated code address has
-been found, it is executed simply by an x86 <code>call</code>
-to the translation.  At the end of the translation, the next 
-original code addr is loaded into <code>%eax</code>, and the 
-translation then does a <code>ret</code>, taking it back to the
-dispatch loop, with, interestingly, zero branch mispredictions.  
-The address requested in <code>%eax</code> is looked up first in
-<code>VG_(tt_fast)</code>, and, if not found, by calling C helper
-<code>VG_(search_transtab)</code>.  If there is still no translation 
-available, <code>VG_(dispatch)</code> exits back to the top-level
-C dispatcher <code>VG_(toploop)</code>, which arranges for 
-<code>VG_(translate)</code> to make a new translation.  All fairly
-unsurprising, really.  There are various complexities described below.
-
-<p>
-The translator, orchestrated by <code>VG_(translate)</code>, is
-complicated but entirely self-contained.  It is described in great
-detail in subsequent sections.  Translations are stored in TC, with TT
-tracking administrative information.  The translations are subject to
-an approximate LRU-based management scheme.  With the current
-settings, the TC can hold at most about 15MB of translations, and LRU
-passes prune it to about 13.5MB.  Given that the
-orig-to-translation expansion ratio is about 13:1 to 14:1, this means
-TC holds translations for more or less a megabyte of original code,
-which generally comes to about 70000 basic blocks for C++ compiled
-with optimisation on.  Generating new translations is expensive, so it
-is worth having a large TC to minimise the (capacity) miss rate.
-
-<p>
-The dispatcher, <code>VG_(dispatch)</code>, receives hints from
-the translations which allow it to cheaply spot all control 
-transfers corresponding to x86 <code>call</code> and <code>ret</code>
-instructions.  It has to do this in order to spot some special events:
-<ul>
-<li>Calls to <code>VG_(shutdown)</code>.  This is Valgrind's cue to
-    exit.  NOTE: actually this is done a different way; it should be
-    cleaned up.
-<p>
-<li>Returns of signal handlers, to the return address 
-    <code>VG_(signalreturn_bogusRA)</code>.  The signal simulator
-    needs to know when a signal handler is returning, so we spot
-    jumps (returns) to this address.
-<p>
-<li>Calls to <code>vg_trap_here</code>.  All <code>malloc</code>,
-    <code>free</code>, etc calls that the client program makes are
-    eventually routed to a call to <code>vg_trap_here</code>,
-    and Valgrind does its own special thing with these calls.
-    In effect this provides a trapdoor, by which Valgrind can
-    intercept certain calls on the simulated CPU, run the call as it
-    sees fit itself (on the real CPU), and return the result to
-    the simulated CPU, quite transparently to the client program.
-</ul>
-Valgrind intercepts the client's <code>malloc</code>,
-<code>free</code>, etc,
-calls, so that it can store additional information.  Each block 
-<code>malloc</code>'d by the client gives rise to a shadow block
-in which Valgrind stores the call stack at the time of the
-<code>malloc</code>
-call.  When the client calls <code>free</code>, Valgrind tries to
-find the shadow block corresponding to the address passed to
-<code>free</code>, and emits an error message if none can be found.
-If it is found, the block is placed on the freed blocks queue 
-<code>vg_freed_list</code>, it is marked as inaccessible, and
-its shadow block now records the call stack at the time of the
-<code>free</code> call.  Keeping <code>free</code>'d blocks in
-this queue allows Valgrind to spot all (presumably invalid) accesses
-to them.  However, once the volume of blocks in the free queue 
-exceeds <code>VG_(clo_freelist_vol)</code>, blocks are finally
-removed from the queue.
-
-<p>
-Keeping track of A and V bits (note: if you don't know what these are,
-you haven't read the user guide carefully enough) for memory is done
-in <code>vg_memory.c</code>.  This implements a sparse array structure
-which covers the entire 4G address space in a way which is reasonably
-fast and reasonably space efficient.  The 4G address space is divided
-up into 64K sections, each covering 64KB of address space.  Given a
-32-bit address, the top 16 bits are used to select one of the 65536
-entries in <code>VG_(primary_map)</code>.  The resulting "secondary"
-(<code>SecMap</code>) holds A and V bits for the 64KB chunk of address
-space corresponding to the lower 16 bits of the address.
-
-
-<h3>Design decisions</h3>
-
-Some design decisions were motivated by the need to make Valgrind
-debuggable.  Imagine you are writing a CPU simulator.  It works fairly
-well.  However, you run some large program, like Netscape, and after
-tens of millions of instructions, it crashes.  How can you figure out
-where in your simulator the bug is?
-
-<p>
-Valgrind's answer is: cheat.  Valgrind is designed so that it is
-possible to switch back to running the client program on the real
-CPU at any point.  Using the <code>--stop-after= </code> flag, you can 
-ask Valgrind to run just some number of basic blocks, and then 
-run the rest of the way on the real CPU.  If you are searching for
-a bug in the simulated CPU, you can use this to do a binary search,
-which quickly leads you to the specific basic block which is
-causing the problem.  
-
-<p>
-This is all very handy.  It does constrain the design in certain
-unimportant ways.  Firstly, the layout of memory, when viewed from the
-client's point of view, must be identical regardless of whether it is
-running on the real or simulated CPU.  This means that Valgrind can't
-do pointer swizzling -- well, no great loss -- and it can't run on 
-the same stack as the client -- again, no great loss.  
-Valgrind operates on its own stack, <code>VG_(stack)</code>, which
-it switches to at startup, temporarily switching back to the client's
-stack when doing system calls for the client.
-
-<p>
-Valgrind also receives signals on its own stack,
-<code>VG_(sigstack)</code>, but for different gruesome reasons
-discussed below.
-
-<p>
-This nice clean switch-back-to-the-real-CPU-whenever-you-like story
-is muddied by signals.  Problem is that signals arrive at arbitrary
-times and tend to slightly perturb the basic block count, with the
-result that you can get close to the basic block causing a problem but
-can't home in on it exactly.  My kludgey hack is to define
-<code>SIGNAL_SIMULATION</code> to 1 towards the bottom of 
-<code>vg_syscall_mem.c</code>, so that signal handlers are run on the
-real CPU and don't change the BB counts.
-
-<p>
-A second hole in the switch-back-to-real-CPU story is that Valgrind's
-way of delivering signals to the client is different from that of the
-kernel.  Specifically, the layout of the signal delivery frame, and
-the mechanism used to detect a sighandler returning, are different.
-So you can't expect to make the transition inside a sighandler and
-still have things working, but in practice that's not much of a
-restriction.
-
-<p>
-Valgrind's implementation of <code>malloc</code>, <code>free</code>,
-etc, (in <code>vg_clientmalloc.c</code>, not the low-level stuff in
-<code>vg_malloc2.c</code>) is somewhat complicated by the need to 
-handle switching back at arbitrary points.  It does work tho.
-
-
-
-<h3>Correctness</h3>
-
-There's only one of me, and I have a Real Life (tm) as well as hacking
-Valgrind [allegedly :-].  That means I don't have time to waste
-chasing endless bugs in Valgrind.  My emphasis is therefore on doing
-everything as simply as possible, with correctness, stability and
-robustness being the number one priority, more important than
-performance or functionality.  As a result:
-<ul>
-<li>The code is absolutely loaded with assertions, and these are
-    <b>permanently enabled.</b>  I have no plan to remove or disable
-    them later.  Over the past couple of months, as valgrind has
-    become more widely used, they have shown their worth, pulling
-    up various bugs which would otherwise have appeared as
-    hard-to-find segmentation faults.
-    <p>
-    I am of the view that it's acceptable to spend 5% of the total
-    running time of your valgrindified program doing assertion checks
-    and other internal sanity checks.
-<p>
-<li>Aside from the assertions, valgrind contains various sets of
-    internal sanity checks, which get run at varying frequencies
-    during normal operation.  <code>VG_(do_sanity_checks)</code>
-    runs every 1000 basic blocks, which means 500 to 2000 times/second 
-    for typical machines at present.  It checks that Valgrind hasn't
-    overrun its private stack, and does some simple checks on the
-    memory permissions maps.  Once every 25 calls it does some more
-    extensive checks on those maps.  Etc, etc.
-    <p>
-    The following components also have sanity check code, which can
-    be enabled to aid debugging:
-    <ul>
-    <li>The low-level memory-manager
-        (<code>VG_(mallocSanityCheckArena)</code>).  This does a 
-        complete check of all blocks and chains in an arena, which
-        is very slow.  Is not engaged by default.
-    <p>
-    <li>The symbol table reader(s): various checks to ensure
-        uniqueness of mappings; see <code>VG_(read_symbols)</code>
-        for a start.  Is permanently engaged.
-    <p>
-    <li>The A and V bit tracking stuff in <code>vg_memory.c</code>.
-        This can be compiled with cpp symbol
-        <code>VG_DEBUG_MEMORY</code> defined, which removes all the
-        fast, optimised cases, and uses simple-but-slow fallbacks
-        instead.  Not engaged by default.
-    <p>
-    <li>Ditto <code>VG_DEBUG_LEAKCHECK</code>.
-    <p>
-    <li>The JITter parses x86 basic blocks into sequences of 
-        UCode instructions.  It then sanity checks each one with
-        <code>VG_(saneUInstr)</code> and sanity checks the sequence
-        as a whole with <code>VG_(saneUCodeBlock)</code>.  This stuff
-        is engaged by default, and has caught some way-obscure bugs
-        in the simulated CPU machinery in its time.
-    <p>
-    <li>The system call wrapper does
-        <code>VG_(first_and_last_secondaries_look_plausible)</code> after
-        every syscall; this is known to pick up bugs in the syscall
-        wrappers.  Engaged by default.
-    <p>
-    <li>The main dispatch loop, in <code>VG_(dispatch)</code>, checks
-        that translations do not set <code>%ebp</code> to any value
-        different from <code>VG_EBP_DISPATCH_CHECKED</code> or
-        <code>&amp; VG_(baseBlock)</code>.  In effect this test is free,
-        and is permanently engaged.
-    <p>
-    <li>There are a couple of ifdefed-out consistency checks I
-        inserted whilst debugging the new register allocator,
-        <code>vg_do_register_allocation</code>.
-    </ul>
-<p>
-<li>I try to avoid techniques, algorithms, mechanisms, etc, for which
-    I can supply neither a convincing argument that they are correct,
-    nor sanity-check code which might pick up bugs in my
-    implementation.  I don't always succeed in this, but I try.
-    Basically the idea is: avoid techniques which are, in practice,
-    unverifiable, in some sense.   When doing anything, always have in
-    mind: "how can I verify that this is correct?"
-</ul>
-
-<p>
-Some more specific things are:
-
-<ul>
-<li>Valgrind runs in the same namespace as the client, at least from
-    <code>ld.so</code>'s point of view, and it therefore absolutely
-    had better not export any symbol with a name which could clash
-    with that of the client or any of its libraries.  Therefore, all
-    globally visible symbols exported from <code>valgrind.so</code>
-    are defined using the <code>VG_</code> CPP macro.  As you'll see
-    from <code>vg_constants.h</code>, this prepends some arbitrary
-    prefix to the symbol, in order that it be, we hope, globally
-    unique.  Currently the prefix is <code>vgPlain_</code>.  For
-    convenience there are also <code>VGM_</code>, <code>VGP_</code>
-    and <code>VGOFF_</code>.  All locally defined symbols are declared
-    <code>static</code> and do not appear in the final shared object.
-    <p>
-    To check this, I periodically do 
-    <code>nm valgrind.so | grep " T "</code>, 
-    which shows you all the globally exported text symbols.
-    They should all have an approved prefix, except for those like
-    <code>malloc</code>, <code>free</code>, etc, which we deliberately
-    want to shadow and take precedence over the same names exported
-    from <code>glibc.so</code>, so that valgrind can intercept those
-    calls easily.  Similarly, <code>nm valgrind.so | grep " D "</code>
-    allows you to find any rogue data-segment symbol names.
-<p>
-<li>Valgrind tries, and almost succeeds, in being completely
-    independent of all other shared objects, in particular of
-    <code>glibc.so</code>.  For example, we have our own low-level
-    memory manager in <code>vg_malloc2.c</code>, which is a fairly
-    standard malloc/free scheme augmented with arenas, and
-    <code>vg_mylibc.c</code> exports reimplementations of various bits
-    and pieces you'd normally get from the C library.
-    <p>
-    Why all the hassle?  Because imagine the potential chaos of both
-    the simulated and real CPUs executing in <code>glibc.so</code>.
-    It just seems simpler and cleaner to be completely self-contained,
-    so that only the simulated CPU visits <code>glibc.so</code>.  In
-    practice it's not much hassle anyway.  Also, valgrind starts up
-    before glibc has a chance to initialise itself, and who knows what
-    difficulties that could lead to.  Finally, glibc has definitions
-    for some types, specifically <code>sigset_t</code>, which conflict
-    with (are different from) the Linux kernel's idea of same.  When
-    Valgrind wants to fiddle around with signal stuff, it wants to
-    use the kernel's definitions, not glibc's definitions.  So it's 
-    simplest just to keep glibc out of the picture entirely.
-    <p>
-    To find out which glibc symbols are used by Valgrind, reinstate
-    the link flags <code>-nostdlib -Wl,-no-undefined</code>.  This
-    causes linking to fail, but will tell you what you depend on.
-    I have mostly, but not entirely, got rid of the glibc
-    dependencies; what remains is, IMO, fairly harmless.  AFAIK the
-    current dependencies are: <code>memset</code>,
-    <code>memcmp</code>, <code>stat</code>, <code>system</code>,
-    <code>sbrk</code>, <code>setjmp</code> and <code>longjmp</code>.
-
-<p>
-<li>Similarly, valgrind should not really import any headers other
-    than the Linux kernel headers, since it knows of no API other than
-    the kernel interface to talk to.  At the moment this is really not
-    in a good state, and <code>vg_syscall_mem</code> imports, via
-    <code>vg_unsafe.h</code>, a significant number of C-library
-    headers so as to know the sizes of various structs passed across
-    the kernel boundary.  This is of course completely bogus, since
-    there is no guarantee that the C library's definitions of these
-    structs matches those of the kernel.  I have started to sort this
-    out using <code>vg_kerneliface.h</code>, into which I had intended
-    to copy all kernel definitions which valgrind could need, but this
-    has not gotten very far.  At the moment it mostly contains
-    definitions for <code>sigset_t</code> and <code>struct
-    sigaction</code>, since the kernel's definition for these really
-    does clash with glibc's.  I plan to use a <code>vki_</code> prefix
-    on all these types and constants, to denote the fact that they
-    pertain to <b>V</b>algrind's <b>K</b>ernel <b>I</b>nterface.
-    <p>
-    Another advantage of having a <code>vg_kerneliface.h</code> file
-    is that it makes it simpler to interface to a different kernel.
-    One can, for example, easily imagine writing a new
-    <code>vg_kerneliface.h</code> for FreeBSD, or x86 NetBSD.
-
-</ul>
-
-<h3>Current limitations</h3>
-
-No threads.  I think fixing this is close to a research-grade problem.
-<p>
-No MMX.  Fixing this should be relatively easy, using the same giant
-trick used for x86 FPU instructions.  See below.
-<p>
-Support for weird (non-POSIX) signal stuff is patchy.  Does anybody
-care?
-<p>
-
-
-
-
-<hr width="100%">
-
-<h2>The instrumenting JITter</h2>
-
-This really is the heart of the matter.  We begin with various side
-issues.
-
-<h3>Run-time storage, and the use of host registers</h3>
-
-Valgrind translates client (original) basic blocks into instrumented
-basic blocks, which live in the translation cache TC, until either the
-client finishes or the translations are ejected from TC to make room
-for newer ones.
-<p>
-Since it generates x86 code in memory, Valgrind has complete control
-of the use of registers in the translations.  Now pay attention.  I
-shall say this only once, and it is important you understand this.  In
-what follows I will refer to registers in the host (real) cpu using
-their standard names, <code>%eax</code>, <code>%edi</code>, etc.  I
-refer to registers in the simulated CPU by capitalising them:
-<code>%EAX</code>, <code>%EDI</code>, etc.  These two sets of
-registers usually bear no direct relationship to each other; there is
-no fixed mapping between them.  This naming scheme is used fairly
-consistently in the comments in the sources.
-<p>
-Host registers, once things are up and running, are used as follows:
-<ul>
-<li><code>%esp</code>, the real stack pointer, points
-    somewhere in Valgrind's private stack area,
-    <code>VG_(stack)</code> or, transiently, into its signal delivery
-    stack, <code>VG_(sigstack)</code>.
-<p>
-<li><code>%edi</code> is used as a temporary in code generation; it
-    is almost always dead, except when used for the <code>Left</code>
-    value-tag operations.
-<p>
-<li><code>%eax</code>, <code>%ebx</code>, <code>%ecx</code>,
-    <code>%edx</code> and <code>%esi</code> are available to
-    Valgrind's register allocator.  They are dead (carry unimportant
-    values) in between translations, and are live only in
-    translations.  The one exception to this is <code>%eax</code>,
-    which, as mentioned far above, has a special significance to the
-    dispatch loop <code>VG_(dispatch)</code>: when a translation
-    returns to the dispatch loop, <code>%eax</code> is expected to
-    contain the original-code-address of the next translation to run.
-    The register allocator is so good at minimising spill code that
-    using five regs and not having to save/restore <code>%edi</code>
-    actually gives better code than allocating to <code>%edi</code>
-    as well, but then having to push/pop it around special uses.
-<p>
-<li><code>%ebp</code> points permanently at
-    <code>VG_(baseBlock)</code>.  Valgrind's translations are
-    position-independent, partly because this is convenient, but also
-    because translations get moved around in TC as part of the LRUing
-    activity.  <b>All</b> static entities which need to be referred to
-    from generated code, whether data or helper functions, are stored
-    starting at <code>VG_(baseBlock)</code> and are therefore reached
-    by indexing from <code>%ebp</code>.  There is but one exception, 
-    which is that by placing the value
-    <code>VG_EBP_DISPATCH_CHECKED</code>
-    in <code>%ebp</code> just before a return to the dispatcher, 
-    the dispatcher is informed that the next address to run, 
-    in <code>%eax</code>, requires special treatment.
-<p>
-<li>The real machine's FPU state is pretty much unimportant, for
-    reasons which will become obvious.  Ditto its <code>%eflags</code>
-    register.
-</ul>
-
-<p>
-The state of the simulated CPU is stored in memory, in
-<code>VG_(baseBlock)</code>, which is a block of 200 words IIRC.
-Recall that <code>%ebp</code> points permanently at the start of this
-block.  Function <code>vg_init_baseBlock</code> decides what the
-offsets of various entities in <code>VG_(baseBlock)</code> are to be,
-and allocates word offsets for them.  The code generator then emits
-<code>%ebp</code> relative addresses to get at those things.  The
-sequence in which entities are allocated has been carefully chosen so
-that the 32 most popular entities come first, because this means 8-bit
-offsets can be used in the generated code.
-
-<p>
-If I was clever, I could make <code>%ebp</code> point 32 words along 
-<code>VG_(baseBlock)</code>, so that I'd have another 32 words of
-short-form offsets available, but that's just complicated, and it's
-not important -- the first 32 words take 99% (or whatever) of the
-traffic.
-
-<p>
-Currently, the sequence of stuff in <code>VG_(baseBlock)</code> is as
-follows:
-<ul>
-<li>9 words, holding the simulated integer registers,
-    <code>%EAX</code> .. <code>%EDI</code>, and the simulated flags,
-    <code>%EFLAGS</code>.
-<p>
-<li>Another 9 words, holding the V bit "shadows" for the above 9 regs.
-<p>
-<li>The <b>addresses</b> of various helper routines called from
-    generated code: 
-    <code>VG_(helper_value_check4_fail)</code>,
-    <code>VG_(helper_value_check0_fail)</code>,
-    which register V-check failures,
-    <code>VG_(helperc_STOREV4)</code>,
-    <code>VG_(helperc_STOREV1)</code>,
-    <code>VG_(helperc_LOADV4)</code>,
-    <code>VG_(helperc_LOADV1)</code>,
-    which do stores and loads of V bits to/from the 
-    sparse array which keeps track of V bits in memory,
-    and
-    <code>VGM_(handle_esp_assignment)</code>, which messes with
-    memory addressability resulting from changes in <code>%ESP</code>.
-<p>
-<li>The simulated <code>%EIP</code>.
-<p>
-<li>24 spill words, for when the register allocator can't make it work
-    with 5 measly registers.
-<p>
-<li>Addresses of helpers <code>VG_(helperc_STOREV2)</code>,
-    <code>VG_(helperc_LOADV2)</code>.  These are here because 2-byte
-    loads and stores are relatively rare, so are placed above the
-    magic 32-word offset boundary.
-<p>
-<li>For similar reasons, addresses of helper functions 
-    <code>VGM_(fpu_write_check)</code> and
-    <code>VGM_(fpu_read_check)</code>, which handle the A/V maps
-    testing and changes required by FPU writes/reads.  
-<p>
-<li>Some other boring helper addresses:
-    <code>VG_(helper_value_check2_fail)</code> and
-    <code>VG_(helper_value_check1_fail)</code>.  These are probably
-    never emitted now, and should be removed.
-<p>
-<li>The entire state of the simulated FPU, which I believe to be
-    108 bytes long.
-<p>
-<li>Finally, the addresses of various other helper functions in
-    <code>vg_helpers.S</code>, which deal with rare situations which
-    are tedious or difficult to generate code in-line for.
-</ul>
-
-<p>
-As a general rule, the simulated machine's state lives permanently in
-memory at <code>VG_(baseBlock)</code>.  However, the JITter does some
-optimisations which allow the simulated integer registers to be
-cached in real registers over multiple simulated instructions within
-the same basic block.  These are always flushed back into memory at
-the end of every basic block, so that the in-memory state is
-up-to-date between basic blocks.  (This flushing is implied by the
-statement above that the real machine's allocatable registers are
-dead in between simulated blocks).
-
-
-<h3>Startup, shutdown, and system calls</h3>
-
-Getting into Valgrind (<code>VG_(startup)</code>, called from
-<code>valgrind.so</code>'s initialisation section), really means
-copying the real CPU's state into <code>VG_(baseBlock)</code>, and
-then installing our own stack pointer, etc, into the real CPU, and
-then starting up the JITter.  Exiting valgrind involves copying the
-simulated state back to the real state.
-
-<p>
-Unfortunately, there's a complication at startup time.  Problem is
-that at the point where we need to take a snapshot of the real CPU's
-state, the offsets in <code>VG_(baseBlock)</code> are not set up yet,
-because to do so would involve disrupting the real machine's state
-significantly.  The way round this is to dump the real machine's state
-into a temporary, static block of memory,
-<code>VG_(m_state_static)</code>.  We can then set up the
-<code>VG_(baseBlock)</code> offsets at our leisure, and copy into it
-from <code>VG_(m_state_static)</code> at some convenient later time.
-This copying is done by
-<code>VG_(copy_m_state_static_to_baseBlock)</code>.
-
-<p>
-On exit, the inverse transformation is (rather unnecessarily) used:
-stuff in <code>VG_(baseBlock)</code> is copied to
-<code>VG_(m_state_static)</code>, and the assembly stub then copies
-from <code>VG_(m_state_static)</code> into the real machine registers.
-
-<p>
-Doing system calls on behalf of the client (<code>vg_syscall.S</code>)
-is something of a half-way house.  We have to make the world look
-sufficiently like that which the client would normally have to make
-the syscall actually work properly, but we can't afford to lose
-control.  So the trick is to copy all of the client's state, <b>except
-its program counter</b>, into the real CPU, do the system call, and
-copy the state back out.  Note that the client's state includes its
-stack pointer register, so one effect of this partial restoration is
-to cause the system call to be run on the client's stack, as it should
-be.
-
-<p>
-As ever there are complications.  We have to save some of our own state
-somewhere when restoring the client's state into the CPU, so that we
-can keep going sensibly afterwards.  In fact the only thing which is
-important is our own stack pointer, but for paranoia reasons I save 
-and restore our own FPU state as well, even though that's probably
-pointless.
-
-<p>
-The complication on the above complication is, that for horrible
-reasons to do with signals, we may have to handle a second client
-system call whilst the client is blocked inside some other system 
-call (unbelievable!).  That means there's two sets of places to 
-dump Valgrind's stack pointer and FPU state across the syscall,
-and we decide which to use by consulting
-<code>VG_(syscall_depth)</code>, which is in turn maintained by
-<code>VG_(wrap_syscall)</code>.
-
-
-
-<h3>Introduction to UCode</h3>
-
-UCode lies at the heart of the x86-to-x86 JITter.  The basic premise
-is that dealing with the x86 instruction set head-on is just too darn
-complicated, so we do the traditional compiler-writer's trick and
-translate it into a simpler, easier-to-deal-with form.
-
-<p>
-In normal operation, translation proceeds through six stages,
-coordinated by <code>VG_(translate)</code>:
-<ol>
-<li>Parsing of an x86 basic block into a sequence of UCode
-    instructions (<code>VG_(disBB)</code>).
-<p>
-<li>UCode optimisation (<code>vg_improve</code>), with the aim of
-    caching simulated registers in real registers over multiple
-    simulated instructions, and removing redundant simulated
-    <code>%EFLAGS</code> saving/restoring.
-<p>
-<li>UCode instrumentation (<code>vg_instrument</code>), which adds
-    value and address checking code.
-<p>
-<li>Post-instrumentation cleanup (<code>vg_cleanup</code>), removing
-    redundant value-check computations.
-<p>
-<li>Register allocation (<code>vg_do_register_allocation</code>),
-    which, note, is done on UCode.
-<p>
-<li>Emission of final instrumented x86 code
-    (<code>VG_(emit_code)</code>).
-</ol>
-
-<p>
-Notice how steps 2, 3, 4 and 5 are simple UCode-to-UCode
-transformation passes, all on straight-line blocks of UCode (type
-<code>UCodeBlock</code>).  Steps 2 and 4 are optimisation passes and
-can be disabled for debugging purposes, with
-<code>--optimise=no</code> and <code>--cleanup=no</code> respectively.
-
-<p>
-Valgrind can also run in a no-instrumentation mode, given
-<code>--instrument=no</code>.  This is useful for debugging the JITter
-quickly without having to deal with the complexity of the
-instrumentation mechanism too.  In this mode, steps 3 and 4 are
-omitted.
-
-<p>
-These flags combine, so that <code>--instrument=no</code> together with 
-<code>--optimise=no</code> means only steps 1, 5 and 6 are used.
-<code>--single-step=yes</code> causes each x86 instruction to be
-treated as a single basic block.  The translations are terrible but
-this is sometimes instructive.  
-
-<p>
-The <code>--stop-after=N</code> flag switches back to the real CPU
-after <code>N</code> basic blocks.  It also re-JITs the final basic
-block executed and prints the debugging info resulting, so this
-gives you a way to get a quick snapshot of how a basic block looks as
-it passes through the six stages mentioned above.  If you want to 
-see full information for every block translated (probably not, but
-still ...) find, in <code>VG_(translate)</code>, the lines
-<br><code>   dis = True;</code>
-<br><code>   dis = debugging_translation;</code>
-<br>
-and comment out the second line.  This will spew out debugging
-junk faster than you can possibly imagine.
-
-
-
-<h3>UCode operand tags: type <code>Tag</code></h3>
-
-UCode is, more or less, a simple two-address RISC-like code.  In
-keeping with the x86 AT&amp;T assembly syntax, generally speaking the
-first operand is the source operand, and the second is the destination
-operand, which is modified when the uinstr is notionally executed.
-
-<p>
-UCode instructions have up to three operand fields, each of which has
-a corresponding <code>Tag</code> describing it.  Possible values for
-the tag are:
-
-<ul>
-<li><code>NoValue</code>: indicates that the field is not in use.
-<p>
-<li><code>Lit16</code>: the field contains a 16-bit literal.
-<p>
-<li><code>Literal</code>: the field denotes a 32-bit literal, whose
-    value is stored in the <code>lit32</code> field of the uinstr
-    itself.  Since there is only one <code>lit32</code> for the whole
-    uinstr, only one operand field may contain this tag.
-<p>
-<li><code>SpillNo</code>: the field contains a spill slot number, in
-    the range 0 to 23 inclusive, denoting one of the spill slots
-    contained inside <code>VG_(baseBlock)</code>.  Such tags only
-    exist after register allocation.
-<p>
-<li><code>RealReg</code>: the field contains a number in the range 0
-    to 7 denoting an integer x86 ("real") register on the host.  The
-    number is the Intel encoding for integer registers.  Such tags
-    only exist after register allocation.
-<p>
-<li><code>ArchReg</code>: the field contains a number in the range 0
-    to 7 denoting an integer x86 register on the simulated CPU.  In
-    reality this means a reference to one of the first 8 words of
-    <code>VG_(baseBlock)</code>.  Such tags can exist at any point in
-    the translation process.
-<p>
-<li>Last, but not least, <code>TempReg</code>.  The field contains the
-    number of one of an infinite set of virtual (integer)
-    registers. <code>TempReg</code>s are used everywhere throughout
-    the translation process; you can have as many as you want.  The
-    register allocator maps as many as it can into
-    <code>RealReg</code>s and turns the rest into
-    <code>SpillNo</code>s, so <code>TempReg</code>s should not exist
-    after the register allocation phase.
-    <p>
-    <code>TempReg</code>s are always 32 bits long, even if the data
-    they hold is logically shorter.  In that case the upper unused
-    bits are required, and, I think, generally assumed, to be zero.  
-    <code>TempReg</code>s holding V bits for quantities shorter than 
-    32 bits are expected to have ones in the unused places, since a
-    one denotes "undefined".
-</ul>
-
-
-<h3>UCode instructions: type <code>UInstr</code></h3>
-
-<p>
-UCode was carefully designed to make it possible to do register
-allocation on UCode and then translate the result into x86 code
-without needing any extra registers ... well, that was the original
-plan, anyway.  Things have gotten a little more complicated since
-then.  In what follows, UCode instructions are referred to as uinstrs,
-to distinguish them from x86 instructions.  Uinstrs of course have
-uopcodes which are (naturally) different from x86 opcodes.
-
-<p>
-A uinstr (type <code>UInstr</code>) contains
-various fields, not all of which are used by any one uopcode:
-<ul>
-<li>Three 16-bit operand fields, <code>val1</code>, <code>val2</code>
-    and <code>val3</code>.
-<p>
-<li>Three tag fields, <code>tag1</code>, <code>tag2</code>
-    and <code>tag3</code>.  Each of these has a value of type
-    <code>Tag</code>,
-    and they describe what the <code>val1</code>, <code>val2</code>
-    and <code>val3</code> fields contain.
-<p>
-<li>A 32-bit literal field.
-<p>
-<li>Two <code>FlagSet</code>s, specifying which x86 condition codes are
-    read and written by the uinstr.
-<p>
-<li>An opcode byte, containing a value of type <code>Opcode</code>.
-<p>
-<li>A size field, indicating the data transfer size (1/2/4/8/10) in
-    cases where this makes sense, or zero otherwise.
-<p>
-<li>A condition-code field, which, for jumps, holds a
-    value of type <code>Condcode</code>, indicating the condition
-    which applies.  The encoding is as it is in the x86 insn stream,
-    except we add a 17th value <code>CondAlways</code> to indicate
-    an unconditional transfer.
-<p>
-<li>Various 1-bit flags, indicating whether this insn pertains to an
-    x86 CALL or RET instruction, whether a widening is signed or not,
-    etc.
-</ul>
-
-<p>
-UOpcodes (type <code>Opcode</code>) are divided into two groups: those
-necessary merely to express the functionality of the x86 code, and
-extra uopcodes needed to express the instrumentation.  The former
-group contains:
-<ul>
-<li><code>GET</code> and <code>PUT</code>, which move values from the
-    simulated CPU's integer registers (<code>ArchReg</code>s) into
-    <code>TempReg</code>s, and back.  <code>GETF</code> and
-    <code>PUTF</code> do the corresponding thing for the simulated
-    <code>%EFLAGS</code>.  There are no corresponding insns for the
-    FPU register stack, since we don't explicitly simulate its
-    registers.
-<p>
-<li><code>LOAD</code> and <code>STORE</code>, which, in RISC-like
-    fashion, are the only uinstrs able to interact with memory.
-<p>
-<li><code>MOV</code> and <code>CMOV</code> allow unconditional and
-    conditional moves of values between <code>TempReg</code>s.
-<p>
-<li>ALU operations.  Again in RISC-like fashion, these only operate on
-    <code>TempReg</code>s (before reg-alloc) or <code>RealReg</code>s
-    (after reg-alloc).  These are: <code>ADD</code>, <code>ADC</code>,
-    <code>AND</code>, <code>OR</code>, <code>XOR</code>,
-    <code>SUB</code>, <code>SBB</code>, <code>SHL</code>,
-    <code>SHR</code>, <code>SAR</code>, <code>ROL</code>,
-    <code>ROR</code>, <code>RCL</code>, <code>RCR</code>,
-    <code>NOT</code>, <code>NEG</code>, <code>INC</code>,
-    <code>DEC</code>, <code>BSWAP</code>, <code>CC2VAL</code> and
-    <code>WIDEN</code>.  <code>WIDEN</code> does signed or unsigned
-    value widening.  <code>CC2VAL</code> is used to convert condition
-    codes into a value, zero or one.  The rest are obvious.
-    <p>
-    To allow for more efficient code generation, we bend slightly the
-    restriction at the start of the previous para: for
-    <code>ADD</code>, <code>ADC</code>, <code>XOR</code>,
-    <code>SUB</code> and <code>SBB</code>, we allow the first (source)
-    operand to also be an <code>ArchReg</code>, that is, one of the
-    simulated machine's registers.  Also, many of these ALU ops allow
-    the source operand to be a literal.  See
-    <code>VG_(saneUInstr)</code> for the final word on the allowable
-    forms of uinstrs.
-<p>
-<li><code>LEA1</code> and <code>LEA2</code> are not strictly
-    necessary, but facilitate better translations.  They
-    record the fancy x86 addressing modes in a direct way, which
-    allows those amodes to be emitted back into the final
-    instruction stream more or less verbatim.
-<p>
-<li><code>CALLM</code> calls a machine-code helper, one of the methods
-    whose address is stored at some <code>VG_(baseBlock)</code>
-    offset.  <code>PUSH</code> and <code>POP</code> move values
-    to/from <code>TempReg</code> to the real (Valgrind's) stack, and
-    <code>CLEAR</code> removes values from the stack.
-    <code>CALLM_S</code> and <code>CALLM_E</code> delimit the
-    boundaries of call setups and clearings, for the benefit of the
-    instrumentation passes.  Getting this right is critical, and so
-    <code>VG_(saneUCodeBlock)</code> makes various checks on the use
-    of these uopcodes.
-    <p>
-    It is important to understand that these uopcodes have nothing to
-    do with the x86 <code>call</code>, <code>return</code>,
-    <code>push</code> or <code>pop</code> instructions, and are not
-    used to implement them.  Those guys turn into combinations of
-    <code>GET</code>, <code>PUT</code>, <code>LOAD</code>,
-    <code>STORE</code>, <code>ADD</code>, <code>SUB</code>, and
-    <code>JMP</code>.  What these uopcodes support is calling of
-    helper functions such as <code>VG_(helper_imul_32_64)</code>,
-    which do stuff which is too difficult or tedious to emit inline.
-<p>
-<li><code>FPU</code>, <code>FPU_R</code> and <code>FPU_W</code>.
-    Valgrind doesn't attempt to simulate the internal state of the
-    FPU at all.  Consequently it only needs to be able to distinguish
-    FPU ops which read and write memory from those that don't, and
-    for those which do, it needs to know the effective address and
-    data transfer size.  This is made easier because the x86 FP
-    instruction encoding is very regular, basically consisting of
-    16 bits for a non-memory FPU insn and 11 (IIRC) bits + an address mode
-    for a memory FPU insn.  So our <code>FPU</code> uinstr carries
-    the 16 bits in its <code>val1</code> field.  And
-    <code>FPU_R</code> and <code>FPU_W</code> carry 11 bits in that
-    field, together with the identity of a <code>TempReg</code> or
-    (later) <code>RealReg</code> which contains the address.
-<p>
-<li><code>JIFZ</code> is unique, in that it allows a control-flow
-    transfer which is not deemed to end a basic block.  It causes a
-    jump to a literal (original) address if the specified argument
-    is zero.
-<p>
-<li>Finally, <code>INCEIP</code> advances the simulated
-    <code>%EIP</code> by the specified literal amount.  This supports
-    lazy <code>%EIP</code> updating, as described below.
-</ul>
-
-<p>
-Stages 1 and 2 of the 6-stage translation process mentioned above
-deal purely with these uopcodes, and no others.  They are
-sufficient to express pretty much all the x86 32-bit protected-mode 
-instruction set, at
-least everything understood by a pre-MMX original Pentium (P54C). 
-
-<p>
-Stages 3, 4, 5 and 6 also deal with the following extra
-"instrumentation" uopcodes.  They are used to express all the
-definedness-tracking and -checking machinery which valgrind does.  In
-later sections we show how to create checking code for each of the
-uopcodes above.  Note that these instrumentation uopcodes, although
-some appearing complicated, have been carefully chosen so that
-efficient x86 code can be generated for them.  GNU superopt v2.5 did a
-great job helping out here.  Anyways, the uopcodes are as follows:
-
-<ul>
-<li><code>GETV</code> and <code>PUTV</code> are analogues to
-    <code>GET</code> and <code>PUT</code> above.  They are identical
-    except that they move the V bits for the specified values back and
-    forth to <code>TempRegs</code>, rather than moving the values
-    themselves.
-<p>
-<li>Similarly, <code>LOADV</code> and <code>STOREV</code> read and
-    write V bits from the synthesised shadow memory that Valgrind
-    maintains.  In fact they do more than that, since they also do
-    address-validity checks, and emit complaints if the read/written
-    addresses are unaddressable.
-<p>
-<li><code>TESTV</code>, whose parameters are a <code>TempReg</code>
-    and a size, tests the V bits in the <code>TempReg</code>, at the
-    specified operation size (0/1/2/4 byte) and emits an error if any
-    of them indicate undefinedness.  This is the only uopcode capable
-    of doing such tests.
-<p>
-<li><code>SETV</code>, whose parameters are also <code>TempReg</code>
-    and a size, makes the V bits in the <code>TempReg</code> indicate
-    definedness, at the specified operation size.  This is usually
-    used to generate the correct V bits for a literal value, which is
-    of course fully defined.
-<p>
-<li><code>GETVF</code> and <code>PUTVF</code> are analogues to
-    <code>GETF</code> and <code>PUTF</code>.  They move the single V
-    bit used to model definedness of <code>%EFLAGS</code> between its
-    home in <code>VG_(baseBlock)</code> and the specified
-    <code>TempReg</code>.
-<p>
-<li><code>TAG1</code> denotes one of a family of unary operations on
-    <code>TempReg</code>s containing V bits.  Similarly,
-    <code>TAG2</code> denotes one in a family of binary operations on
-    V bits.
-</ul>
-
-<p>
-These 10 uopcodes are sufficient to express Valgrind's entire
-definedness-checking semantics.  In fact most of the interesting magic
-is done by the <code>TAG1</code> and <code>TAG2</code>
-suboperations.
-
-<p>
-First, however, I need to explain about V-vector operation sizes.
-There are 4 sizes: 1, 2 and 4, which operate on groups of 8, 16 and 32
-V bits at a time, supporting the usual 1, 2 and 4 byte x86 operations.
-However there is also the mysterious size 0, which really means a
-single V bit.  Single V bits are used in various circumstances; in
-particular, the definedness of <code>%EFLAGS</code> is modelled with a
-single V bit.  Now might be a good time to also point out that for
-V bits, 1 means "undefined" and 0 means "defined".  Similarly, for A
-bits, 1 means "invalid address" and 0 means "valid address".  This
-seems counterintuitive (and so it is), but testing against zero on
-x86s saves instructions compared to testing against all 1s, because
-many ALU operations set the Z flag for free, so to speak.
-
-<p>
-With that in mind, the tag ops are:
-
-<ul>
-<li><b>(UNARY) Pessimising casts</b>: <code>VgT_PCast40</code>,
-    <code>VgT_PCast20</code>, <code>VgT_PCast10</code>,
-    <code>VgT_PCast01</code>, <code>VgT_PCast02</code> and
-    <code>VgT_PCast04</code>.  A "pessimising cast" takes a V-bit
-    vector at one size, and creates a new one at another size,
-    pessimised in the sense that if any of the bits in the source
-    vector indicate undefinedness, then all the bits in the result
-    indicate undefinedness.  In this case the casts are all to or from
-    a single V bit, so for example <code>VgT_PCast40</code> is a
-    pessimising cast from 32 bits to 1, whereas
-    <code>VgT_PCast04</code> simply copies the single source V bit
-    into all 32 bit positions in the result.  Surprisingly, these ops
-    can all be implemented very efficiently.
-    <p>
-    There are also the pessimising casts <code>VgT_PCast14</code>,
-    from 8 bits to 32, <code>VgT_PCast12</code>, from 8 bits to 16,
-    and <code>VgT_PCast11</code>, from 8 bits to 8.  This last one
-    seems nonsensical, but in fact it isn't a no-op because, as
-    mentioned above, any undefined (1) bits in the source infect the
-    entire result.
-<p>
-<li><b>(UNARY) Propagating undefinedness upwards in a word</b>:
-    <code>VgT_Left4</code>, <code>VgT_Left2</code> and
-    <code>VgT_Left1</code>.  These are used to simulate the worst-case
-    effects of carry propagation in adds and subtracts.  They return a
-    V vector identical to the original, except that if the original
-    contained any undefined bits, then all bits above the lowest
-    undefined bit are marked as undefined too.  Hence the Left bit in
-    the names.
-<p>
-<li><b>(UNARY) Signed and unsigned value widening</b>:
-     <code>VgT_SWiden14</code>, <code>VgT_SWiden24</code>,
-     <code>VgT_SWiden12</code>, <code>VgT_ZWiden14</code>,
-     <code>VgT_ZWiden24</code> and <code>VgT_ZWiden12</code>.  These
-     mimic the definedness effects of standard signed and unsigned
-     integer widening.  Unsigned widening creates zero bits in the new
-     positions, so <code>VgT_ZWiden*</code> accordingly mark
-     those parts of their argument as defined.  Signed widening copies
-     the sign bit into the new positions, so <code>VgT_SWiden*</code>
-     copies the definedness of the sign bit into the new positions.
-     Because 1 means undefined and 0 means defined, these operations
-     can (fascinatingly) be done by the same operations which they
-     mimic.  Go figure.
-<p>
-<li><b>(BINARY) Undefined-if-either-Undefined,
-     Defined-if-either-Defined</b>: <code>VgT_UifU4</code>,
-     <code>VgT_UifU2</code>, <code>VgT_UifU1</code>,
-     <code>VgT_UifU0</code>, <code>VgT_DifD4</code>,
-     <code>VgT_DifD2</code>, <code>VgT_DifD1</code>.  These do simple
-     bitwise operations on pairs of V-bit vectors, with
-     <code>UifU</code> giving undefined if either arg bit is
-     undefined, and <code>DifD</code> giving defined if either arg bit
-     is defined.  Abstract interpretation junkies, if any make it this
-     far, may like to think of them as meets and joins (or is it joins
-     and meets) in the definedness lattices.  
-<p>
-<li><b>(BINARY; one value, one V bits) Generate argument improvement
-    terms for AND and OR</b>: <code>VgT_ImproveAND4_TQ</code>,
-    <code>VgT_ImproveAND2_TQ</code>, <code>VgT_ImproveAND1_TQ</code>,
-    <code>VgT_ImproveOR4_TQ</code>, <code>VgT_ImproveOR2_TQ</code>,
-    <code>VgT_ImproveOR1_TQ</code>.  These help out with AND and OR
-    operations.  AND and OR have the inconvenient property that the
-    definedness of the result depends on the actual values of the
-    arguments as well as their definedness.  At the bit level:
-    <br><code>1 AND undefined = undefined</code>, but 
-    <br><code>0 AND undefined = 0</code>, and similarly 
-    <br><code>0 OR  undefined = undefined</code>, but 
-    <br><code>1 OR  undefined = 1</code>.
-    <br>
-    <p>
-    It turns out that gcc (quite legitimately) generates code which
-    relies on this fact, so we have to model it properly in order to
-    avoid flooding users with spurious value errors.  The ultimate
-    definedness result of AND and OR is calculated using
-    <code>UifU</code> on the definedness of the arguments, but we
-    also <code>DifD</code> in some "improvement" terms which 
-    take into account the above phenomena.  
-    <p>
-    <code>ImproveAND</code> takes as its arguments the actual
-    value of an argument to AND (the T) and the definedness of that
-    argument (the Q), and returns a V-bit vector which is defined (0)
-    for bits which have value 0 and are defined; this, when
-    <code>DifD</code>ed into the final result, causes those bits to be
-    defined even if the corresponding bit in the other argument is undefined.
-    <p>
-    The <code>ImproveOR</code> ops do the dual thing for OR
-    arguments.  Note that XOR does not have this property that one
-    argument can make the other irrelevant, so there is no need for
-    such complexity for XOR.
-</ul>
-
-<p>
-That's all the tag ops.  If you stare at this long enough, and then
-run Valgrind and stare at the pre- and post-instrumented ucode, it
-should be fairly obvious how the instrumentation machinery hangs
-together.
-
-<p>
-One point, if you do this: in order to make it easy to differentiate
-<code>TempReg</code>s carrying values from <code>TempReg</code>s
-carrying V bit vectors, Valgrind prints the former as (for example)
-<code>t28</code> and the latter as <code>q28</code>; the fact that
-they carry the same number serves to indicate their relationship.
-This is purely for the convenience of the human reader; the register
-allocator and code generator don't regard them as different.
-
-
-<h3>Translation into UCode</h3>
-
-<code>VG_(disBB)</code> allocates a new <code>UCodeBlock</code> and
-then uses <code>disInstr</code> to translate x86 instructions one at a
-time into UCode, dumping the result in the <code>UCodeBlock</code>.
-This goes on until a control-flow transfer instruction is encountered.
-
-<p>
-Despite the large size of <code>vg_to_ucode.c</code>, this translation
-is really very simple.  Each x86 instruction is translated entirely
-independently of its neighbours, merrily allocating new
-<code>TempReg</code>s as it goes.  The idea is to have a simple
-translator -- in reality, no more than a macro-expander -- and the
-resulting bad UCode translation is cleaned up by the UCode
-optimisation phase which follows.  To give you an idea of some x86
-instructions and their translations (this is a complete basic block,
-as Valgrind sees it):
-<pre>
-        0x40435A50:  incl %edx
-
-           0: GETL      %EDX, t0
-           1: INCL      t0  (-wOSZAP)
-           2: PUTL      t0, %EDX
-
-        0x40435A51:  movsbl (%edx),%eax
-
-           3: GETL      %EDX, t2
-           4: LDB       (t2), t2
-           5: WIDENL_Bs t2
-           6: PUTL      t2, %EAX
-
-        0x40435A54:  testb $0x20, 1(%ecx,%eax,2)
-
-           7: GETL      %EAX, t6
-           8: GETL      %ECX, t8
-           9: LEA2L     1(t8,t6,2), t4
-          10: LDB       (t4), t10
-          11: MOVB      $0x20, t12
-          12: ANDB      t12, t10  (-wOSZACP)
-          13: INCEIPo   $9
-
-        0x40435A59:  jnz-8 0x40435A50
-
-          14: Jnzo      $0x40435A50  (-rOSZACP)
-          15: JMPo      $0x40435A5B
-</pre>
-
-<p>
-Notice how the block always ends with an unconditional jump to the
-next block.  This is a bit unnecessary, but makes many things simpler.
-
-<p>
-Most x86 instructions turn into sequences of <code>GET</code>,
-<code>PUT</code>, <code>LEA1</code>, <code>LEA2</code>,
-<code>LOAD</code> and <code>STORE</code>.  Some complicated ones
-however rely on calling helper bits of code in 
-<code>vg_helpers.S</code>.  The ucode instructions <code>PUSH</code>,
-<code>POP</code>, <code>CALL</code>, <code>CALLM_S</code> and
-<code>CALLM_E</code> support this.  The calling convention is somewhat
-ad-hoc and is not the C calling convention.  The helper routines must 
-save all integer registers, and the flags, that they use.  Args are
-passed on the stack underneath the return address, as usual, and if 
-result(s) are to be returned, it (they) are either placed in dummy arg
-slots created by the ucode <code>PUSH</code> sequence, or just
-overwrite the incoming args.
-
-<p>
-In order that the instrumentation mechanism can handle calls to these
-helpers, <code>VG_(saneUCodeBlock)</code> enforces the following
-restrictions on calls to helpers:
-
-<ul>
-<li>Each <code>CALL</code> uinstr must be bracketed by a preceding
-    <code>CALLM_S</code> marker (dummy uinstr) and a trailing
-    <code>CALLM_E</code> marker.  These markers are used by the
-    instrumentation mechanism later to establish the boundaries of the
-    <code>PUSH</code>, <code>POP</code> and <code>CLEAR</code>
-    sequences for the call.
-<p>
-<li><code>PUSH</code>, <code>POP</code> and <code>CLEAR</code>
-    may only appear inside sections bracketed by <code>CALLM_S</code>
-    and <code>CALLM_E</code>, and nowhere else.
-<p>
-<li>In any such bracketed section, no two <code>PUSH</code> insns may
-    push the same <code>TempReg</code>.  Dually, no two
-    <code>POP</code>s may pop the same <code>TempReg</code>.
-<p>
-<li>Finally, although this is not checked, args should be removed from
-    the stack with <code>CLEAR</code>, rather than <code>POP</code>s
-    into a <code>TempReg</code> which is not subsequently used.  This
-    is because the instrumentation mechanism assumes that all values
-    <code>POP</code>ped from the stack are actually used.
-</ul>
-
-Some of the translations may appear to have redundant
-<code>TempReg</code>-to-<code>TempReg</code> moves.  This helps the
-next phase, UCode optimisation, to generate better code.
-
-
-
-<h3>UCode optimisation</h3>
-
-UCode is then subjected to an improvement pass
-(<code>vg_improve()</code>), which blurs the boundaries between the
-translations of the original x86 instructions.  It's pretty
-straightforward.  Three transformations are done:
-
-<ul>
-<li>Redundant <code>GET</code> elimination.  Actually, more general
-    than that -- eliminates redundant fetches of ArchRegs.  In our
-    running example, uinstr 3 <code>GET</code>s <code>%EDX</code> into
-    <code>t2</code> despite the fact that, by looking at the previous
-    uinstr, it is already in <code>t0</code>.  The <code>GET</code> is
-    therefore removed, and <code>t2</code> renamed to <code>t0</code>.
-    Assuming <code>t0</code> is allocated to a host register, it means
-    the simulated <code>%EDX</code> will exist in a host CPU register
-    for more than one simulated x86 instruction, which seems to me to
-    be a highly desirable property.
-    <p>
-    There is some mucking around to do with subregisters;
-    <code>%AL</code> vs <code>%AH</code> <code>%AX</code> vs
-    <code>%EAX</code> etc.  I can't remember how it works, but in
-    general we are very conservative, and these tend to invalidate the
-    caching. 
-<p>
-<li>Redundant <code>PUT</code> elimination.  This annuls
-    <code>PUT</code>s of values back to simulated CPU registers if a
-    later <code>PUT</code> would overwrite the earlier
-    <code>PUT</code> value, and there are no intervening reads of the
-    simulated register (<code>ArchReg</code>).
-    <p>
-    As before, we are paranoid when faced with subregister references.
-    Also, <code>PUT</code>s of <code>%ESP</code> are never annulled,
-    because it is vital the instrumenter always has an up-to-date
-    <code>%ESP</code> value available, since <code>%ESP</code> changes
-    affect addressibility of the memory around the simulated stack
-    pointer.
-    <p>
-    The implication of the above paragraph is that the simulated
-    machine's registers are only lazily updated once the above two
-    optimisation phases have run, with the exception of
-    <code>%ESP</code>.  <code>TempReg</code>s go dead at the end of
-    every basic block, from which it is inferrable that any
-    <code>TempReg</code> caching a simulated CPU reg is flushed (back
-    into the relevant <code>VG_(baseBlock)</code> slot) at the end of
-    every basic block.  The further implication is that the simulated
-    registers are only up-to-date in between basic blocks, and not
-    at arbitrary points inside basic blocks.  And the consequence of
-    that is that we can only deliver signals to the client in between
-    basic blocks.  None of this seems any problem in practice.
-<p>
-<li>Finally there is a simple def-use thing for condition codes.  If
-    an earlier uinstr writes the condition codes, and the next uinsn
-    along which actually cares about the condition codes writes the
-    same or larger set of them, but does not read any, the earlier
-    uinsn is marked as not writing any condition codes.  This saves 
-    a lot of redundant cond-code saving and restoring.
-</ul>
-
-The effect of these transformations on our short block is rather
-unexciting, and shown below.  On longer basic blocks they can
-dramatically improve code quality.
-
-<pre>
-at 3: delete GET, rename t2 to t0 in (4 .. 6)
-at 7: delete GET, rename t6 to t0 in (8 .. 9)
-at 1: annul flag write OSZAP due to later OSZACP
-
-Improved code:
-           0: GETL      %EDX, t0
-           1: INCL      t0
-           2: PUTL      t0, %EDX
-           4: LDB       (t0), t0
-           5: WIDENL_Bs t0
-           6: PUTL      t0, %EAX
-           8: GETL      %ECX, t8
-           9: LEA2L     1(t8,t0,2), t4
-          10: LDB       (t4), t10
-          11: MOVB      $0x20, t12
-          12: ANDB      t12, t10  (-wOSZACP)
-          13: INCEIPo   $9
-          14: Jnzo      $0x40435A50  (-rOSZACP)
-          15: JMPo      $0x40435A5B
-</pre>
-
-<h3>UCode instrumentation</h3>
-
-Once you understand the meaning of the instrumentation uinstrs,
-discussed in detail above, the instrumentation scheme is fairly
-straightforward.  Each uinstr is instrumented in isolation, and the
-instrumentation uinstrs are placed before the original uinstr.
-Our running example continues below.  I have placed a blank line 
-after every original ucode, to make it easier to see which
-instrumentation uinstrs correspond to which originals.
-
-<p>
-As mentioned somewhere above, <code>TempReg</code>s carrying values 
-have names like <code>t28</code>, and each one has a shadow carrying
-its V bits, with names like <code>q28</code>.  This pairing aids in
-reading instrumented ucode.
-
-<p>
-One decision about all this is where to have "observation points",
-that is, where to check that V bits are valid.  I use a minimalistic
-scheme, only checking where a failure of validity could cause the 
-original program to (seg)fault.  So the use of values as memory
-addresses causes a check, as do conditional jumps (these cause a check
-on the definedness of the condition codes).  And arguments
-<code>PUSH</code>ed for helper calls are checked, hence the weird
-restrictions on helper call preambles described above.
-
-<p>
-Another decision is that once a value is tested, it is thereafter
-regarded as defined, so that we do not emit multiple undefined-value
-errors for the same undefined value.  That means that
-<code>TESTV</code> uinstrs are always followed by <code>SETV</code> 
-on the same (shadow) <code>TempReg</code>s.  Most of these
-<code>SETV</code>s are redundant and are removed by the
-post-instrumentation cleanup phase.
-
-<p>
-The instrumentation for calling helper functions deserves further
-comment.  The definedness of results from a helper is modelled using
-just one V bit.  So, in short, we do pessimising casts of the
-definedness of all the args, down to a single bit, and then
-<code>UifU</code> these bits together.  So this single V bit will say
-"undefined" if any part of any arg is undefined.  This V bit is then
-pessimally cast back up to the result(s) sizes, as needed.  If, by
-seeing that all the args are got rid of with <code>CLEAR</code> and
-none with <code>POP</code>, Valgrind sees that the result of the call
-is not actually used, it immediately examines the result V bit with a
-<code>TESTV</code> -- <code>SETV</code> pair.  If it did not do this,
-there would be no observation point to detect that some of the
-args to the helper were undefined.  Of course, if the helper's results
-are indeed used, we don't do this, since the result usage will
-presumably cause the result definedness to be checked at some suitable
-future point.
-
-<p>
-In general Valgrind tries to track definedness on a bit-for-bit basis,
-but as the above para shows, for calls to helpers we throw in the
-towel and approximate down to a single bit.  This is because it's too
-complex and difficult to track bit-level definedness through complex
-ops such as integer multiply and divide, and in any case there are no
-reasonable code fragments which attempt to (eg) multiply two
-partially-defined values and end up with something meaningful, so
-there seems little point in modelling multiplies, divides, etc, in
-that level of detail.
-
-<p>
-Integer loads and stores are instrumented with firstly a test of the
-definedness of the address, followed by a <code>LOADV</code> or
-<code>STOREV</code> respectively.  These turn into calls to 
-(for example) <code>VG_(helperc_LOADV4)</code>.  These helpers do two
-things: they perform an address-valid check, and they load or store V
-bits from/to the relevant address in the (simulated V-bit) memory.
-
-<p>
-FPU loads and stores are different.  As above the definedness of the
-address is first tested.  However, the helper routine for FPU loads
-(<code>VGM_(fpu_read_check)</code>) emits an error if either the
-address is invalid or the referenced area contains undefined values.
-It has to do this because we do not simulate the FPU at all, and so
-cannot track definedness of values loaded into it from memory, so we
-have to check them as soon as they are loaded into the FPU, ie, at
-this point.  We notionally assume that everything in the FPU is
-defined.
-
-<p>
-It follows therefore that FPU writes first check the definedness of
-the address, then the validity of the address, and finally mark the
-written bytes as well-defined.
-
-<p>
-If anyone is inspired to extend Valgrind to MMX/SSE insns, I suggest
-you use the same trick.  It works provided that the FPU/MMX unit is
-not used merely as a conduit to copy partially undefined data from
-one place in memory to another.  Unfortunately the integer CPU is used
-like that (when copying C structs with holes, for example) and this is
-the cause of much of the elaborateness of the instrumentation here
-described.
-
-<p>
-<code>vg_instrument()</code> in <code>vg_translate.c</code> actually
-does the instrumentation.  There are comments explaining how each
-uinstr is handled, so we do not repeat that here.  As explained
-already, it is bit-accurate, except for calls to helper functions.
-Unfortunately the x86 insns <code>bt/bts/btc/btr</code> are done by
-helper fns, so bit-level accuracy is lost there.  This should be fixed
-by doing them inline; it will probably require adding a couple new
-uinstrs.  Also, left and right rotates through the carry flag (x86
-<code>rcl</code> and <code>rcr</code>) are approximated via a single
-V bit; so far this has not caused anyone to complain.  The
-non-carry rotates, <code>rol</code> and <code>ror</code>, are much
-more common and are done exactly.  Re-visiting the instrumentation for
-AND and OR, they seem rather verbose, and I wonder if it could be done
-more concisely now.
-
-<p>
-The lowercase <code>o</code> on many of the uopcodes in the running
-example indicates that the size field is zero, usually meaning a
-single-bit operation.
-
-<p>
-Anyroads, the post-instrumented version of our running example looks
-like this:
-
-<pre>
-Instrumented code:
-           0: GETVL     %EDX, q0
-           1: GETL      %EDX, t0
-
-           2: TAG1o     q0 = Left4 ( q0 )
-           3: INCL      t0
-
-           4: PUTVL     q0, %EDX
-           5: PUTL      t0, %EDX
-
-           6: TESTVL    q0
-           7: SETVL     q0
-           8: LOADVB    (t0), q0
-           9: LDB       (t0), t0
-
-          10: TAG1o     q0 = SWiden14 ( q0 )
-          11: WIDENL_Bs t0
-
-          12: PUTVL     q0, %EAX
-          13: PUTL      t0, %EAX
-
-          14: GETVL     %ECX, q8
-          15: GETL      %ECX, t8
-
-          16: MOVL      q0, q4
-          17: SHLL      $0x1, q4
-          18: TAG2o     q4 = UifU4 ( q8, q4 )
-          19: TAG1o     q4 = Left4 ( q4 )
-          20: LEA2L     1(t8,t0,2), t4
-
-          21: TESTVL    q4
-          22: SETVL     q4
-          23: LOADVB    (t4), q10
-          24: LDB       (t4), t10
-
-          25: SETVB     q12
-          26: MOVB      $0x20, t12
-
-          27: MOVL      q10, q14
-          28: TAG2o     q14 = ImproveAND1_TQ ( t10, q14 )
-          29: TAG2o     q10 = UifU1 ( q12, q10 )
-          30: TAG2o     q10 = DifD1 ( q14, q10 )
-          31: MOVL      q12, q14
-          32: TAG2o     q14 = ImproveAND1_TQ ( t12, q14 )
-          33: TAG2o     q10 = DifD1 ( q14, q10 )
-          34: MOVL      q10, q16
-          35: TAG1o     q16 = PCast10 ( q16 )
-          36: PUTVFo    q16
-          37: ANDB      t12, t10  (-wOSZACP)
-
-          38: INCEIPo   $9
-
-          39: GETVFo    q18
-          40: TESTVo    q18
-          41: SETVo     q18
-          42: Jnzo      $0x40435A50  (-rOSZACP)
-
-          43: JMPo      $0x40435A5B
-</pre>
-
-
-<h3>UCode post-instrumentation cleanup</h3>
-
-<p>
-This pass, coordinated by <code>vg_cleanup()</code>, removes redundant
-definedness computation created by the simplistic instrumentation
-pass.  It consists of two passes,
-<code>vg_propagate_definedness()</code> followed by
-<code>vg_delete_redundant_SETVs</code>.
-
-<p>
-<code>vg_propagate_definedness()</code> is a simple
-constant-propagation and constant-folding pass.  It tries to determine
-which <code>TempReg</code>s containing V bits will always indicate
-"fully defined", and it propagates this information as far as it can,
-and folds out as many operations as possible.  For example, the
-instrumentation for an ADD of a literal to a variable quantity will be
-reduced down so that the definedness of the result is simply the
-definedness of the variable quantity, since the literal is by
-definition fully defined.
-
-<p>
-<code>vg_delete_redundant_SETVs</code> removes <code>SETV</code>s on
-shadow <code>TempReg</code>s for which the next action is a write.
-I don't think there's anything else worth saying about this; it is
-simple.  Read the sources for details.
-
-<p>
-So the cleaned-up running example looks like this.  As above, I have
-inserted line breaks after every original (non-instrumentation) uinstr
-to aid readability.  As with straightforward ucode optimisation, the
-results in this block are undramatic because it is so short; longer
-blocks benefit more because they have more redundancy which gets
-eliminated.
-
-
-<pre>
-at 29: delete UifU1 due to defd arg1
-at 32: change ImproveAND1_TQ to MOV due to defd arg2
-at 41: delete SETV
-at 31: delete MOV
-at 25: delete SETV
-at 22: delete SETV
-at 7: delete SETV
-
-           0: GETVL     %EDX, q0
-           1: GETL      %EDX, t0
-
-           2: TAG1o     q0 = Left4 ( q0 )
-           3: INCL      t0
-
-           4: PUTVL     q0, %EDX
-           5: PUTL      t0, %EDX
-
-           6: TESTVL    q0
-           8: LOADVB    (t0), q0
-           9: LDB       (t0), t0
-
-          10: TAG1o     q0 = SWiden14 ( q0 )
-          11: WIDENL_Bs t0
-
-          12: PUTVL     q0, %EAX
-          13: PUTL      t0, %EAX
-
-          14: GETVL     %ECX, q8
-          15: GETL      %ECX, t8
-
-          16: MOVL      q0, q4
-          17: SHLL      $0x1, q4
-          18: TAG2o     q4 = UifU4 ( q8, q4 )
-          19: TAG1o     q4 = Left4 ( q4 )
-          20: LEA2L     1(t8,t0,2), t4
-
-          21: TESTVL    q4
-          23: LOADVB    (t4), q10
-          24: LDB       (t4), t10
-
-          26: MOVB      $0x20, t12
-
-          27: MOVL      q10, q14
-          28: TAG2o     q14 = ImproveAND1_TQ ( t10, q14 )
-          30: TAG2o     q10 = DifD1 ( q14, q10 )
-          32: MOVL      t12, q14
-          33: TAG2o     q10 = DifD1 ( q14, q10 )
-          34: MOVL      q10, q16
-          35: TAG1o     q16 = PCast10 ( q16 )
-          36: PUTVFo    q16
-          37: ANDB      t12, t10  (-wOSZACP)
-
-          38: INCEIPo   $9
-          39: GETVFo    q18
-          40: TESTVo    q18
-          42: Jnzo      $0x40435A50  (-rOSZACP)
-
-          43: JMPo      $0x40435A5B
-</pre>
-
-
-<h3>Translation from UCode</h3>
-
-This is all very simple, even though <code>vg_from_ucode.c</code>
-is a big file.  Position-independent x86 code is generated into 
-a dynamically allocated array <code>emitted_code</code>; this is
-doubled in size when it overflows.  Eventually the array is handed
-back to the caller of <code>VG_(translate)</code>, who must copy
-the result into TC and TT, and free the array.
-
-<p>
-This file is structured into four layers of abstraction, which,
-thankfully, are glued back together with extensive
-<code>__inline__</code> directives.  From the bottom upwards:
-
-<ul>
-<li>Address-mode emitters, <code>emit_amode_regmem_reg</code> et al.
-<p>
-<li>Emitters for specific x86 instructions.  There are quite a lot of
-    these, with names such as <code>emit_movv_offregmem_reg</code>.
-    The <code>v</code> suffix is Intel parlance for a 16/32 bit insn;
-    there are also <code>b</code> suffixes for 8 bit insns.
-<p>
-<li>The next level up are the <code>synth_*</code> functions, which
-    synthesise possibly a sequence of raw x86 instructions to do some
-    simple task.  Some of these are quite complex because they have to
-    work around Intel's silly restrictions on subregister naming.  See 
-    <code>synth_nonshiftop_reg_reg</code> for example.
-<p>
-<li>Finally, at the top of the heap, we have
-    <code>emitUInstr()</code>,
-    which emits code for a single uinstr.
-</ul>
-
-<p>
-Some comments:
-<ul>
-<li>The hack for FPU instructions becomes apparent here.  To do a
-    <code>FPU</code> ucode instruction, we load the simulated FPU's
-    state from <code>VG_(baseBlock)</code> into the real FPU
-    using an x86 <code>frstor</code> insn, do the ucode
-    <code>FPU</code> insn on the real CPU, and write the updated FPU
-    state back into <code>VG_(baseBlock)</code> using an
-    <code>fnsave</code> instruction.  This is pretty brutal, but is
-    simple and it works, and even seems tolerably efficient.  There is
-    no attempt to cache the simulated FPU state in the real FPU over
-    multiple back-to-back ucode FPU instructions.
-    <p>
-    <code>FPU_R</code> and <code>FPU_W</code> are also done this way,
-    with the minor complication that we need to patch in some
-    addressing mode bits so the resulting insn knows the effective
-    address to use.  This is easy because of the regularity of the x86
-    FPU instruction encodings.
-<p>
-<li>An analogous trick is done with ucode insns which claim, in their
-    <code>flags_r</code> and <code>flags_w</code> fields, that they
-    read or write the simulated <code>%EFLAGS</code>.  For such cases
-    we first copy the simulated <code>%EFLAGS</code> into the real
-    <code>%eflags</code>, then do the insn, then, if the insn says it
-    writes the flags, copy back to <code>%EFLAGS</code>.  This is a
-    bit expensive, which is why the ucode optimisation pass goes to
-    some effort to remove redundant flag-update annotations.
-</ul>
-
-<p>
-And so ... that's the end of the documentation for the instrumenting
-translator!  It's really not that complex, because it's composed as a
-sequence of simple(ish) self-contained transformations on
-straight-line blocks of code.
-
-
-<h3>Top-level dispatch loop</h3>
-
-Urk.  In <code>VG_(toploop)</code>.  This is basically boring and
-unsurprising, not to mention fiddly and fragile.  It needs to be
-cleaned up.  
-
-<p>
-The only surprise, perhaps, is that the whole thing is run
-on top of a <code>setjmp</code>-installed exception handler, because,
-supposing a translation got a segfault, we have to bail out of the
-Valgrind-supplied exception handler <code>VG_(oursignalhandler)</code>
-and immediately start running the client's segfault handler, if it has
-one.  In particular we can't finish the current basic block and then
-deliver the signal at some convenient future point, because signals
-like SIGILL, SIGSEGV and SIGBUS mean that the faulting insn should not
-simply be re-tried.  (I'm sure there is a clearer way to explain this).
-
-
-<h3>Exceptions, creating new translations</h3>
-<h3>Self-modifying code</h3>
-
-<h3>Lazy updates of the simulated program counter</h3>
-
-Simulated <code>%EIP</code> is not updated after every simulated x86
-insn as this was regarded as too expensive.  Instead ucode
-<code>INCEIP</code> insns move it along as and when necessary.
-Currently we don't allow it to fall more than 4 bytes behind reality
-(see <code>VG_(disBB)</code> for the way this works).
-<p>
-Note that <code>%EIP</code> is always brought up to date by the inner
-dispatch loop in <code>VG_(dispatch)</code>, so that if the client
-takes a fault we know at least which basic block this happened in.
-
-
-<h3>The translation cache and translation table</h3>
-
-<h3>Signals</h3>
-
-Horrible, horrible.  <code>vg_signals.c</code>.
-Basically, since we have to intercept all system
-calls anyway, we can see when the client tries to install a signal
-handler.  If it does so, we make a note of what the client asked to
-happen, and ask the kernel to route the signal to our own signal
-handler, <code>VG_(oursignalhandler)</code>.  This simply notes the
-delivery of signals, and returns.  
-
-<p>
-Every 1000 basic blocks, we see if more signals have arrived.  If so,
-<code>VG_(deliver_signals)</code> builds signal delivery frames on the
-client's stack, and allows their handlers to be run.  Valgrind places
-in these signal delivery frames a bogus return address,
-<code>VG_(signalreturn_bogusRA)</code>, and checks all jumps to see
-if any jump to it.  If so, this is a sign that a signal handler is
-returning, and if so Valgrind removes the relevant signal frame from
-the client's stack, restores from the signal frame the simulated
-state before the signal was delivered, and allows the client to run
-onwards.  We have to do it this way because some signal handlers never
-return, they just <code>longjmp()</code>, which nukes the signal
-delivery frame.
-
-<p>
-The Linux kernel has a different but equally horrible hack for
-detecting signal handler returns.  Discovering it is left as an
-exercise for the reader.
-
-
-
-<h3>Errors, error contexts, error reporting, suppressions</h3>
-<h3>Client malloc/free</h3>
-<h3>Low-level memory management</h3>
-<h3>A and V bitmaps</h3>
-<h3>Symbol table management</h3>
-<h3>Dealing with system calls</h3>
-<h3>Namespace management</h3>
-<h3>GDB attaching</h3>
-<h3>Non-dependence on glibc or anything else</h3>
-<h3>The leak detector</h3>
-<h3>Performance problems</h3>
-<h3>Continuous sanity checking</h3>
-<h3>Tracing, or not tracing, child processes</h3>
-<h3>Assembly glue for syscalls</h3>
-
-
-<hr width="100%">
-
-<h2>Extensions</h2>
-
-Some comments about Stuff To Do.
-
-<h3>Bugs</h3>
-
-Stephan Kulow and Marc Mutz report problems with kmail in KDE 3 CVS
-(RC2 ish) when run on Valgrind.  Stephan has it deadlocking; Marc has
-it looping at startup.  I can't repro either behaviour. Needs
-repro-ing and fixing.
-
-
-<h3>Threads</h3>
-
-Doing a good job of thread support strikes me as almost a
-research-level problem.  The central issues are how to do fast cheap
-locking of the <code>VG_(primary_map)</code> structure, whether or not
-accesses to the individual secondary maps need locking, what
-race-condition issues result, and whether the already-nasty mess that
-is the signal simulator needs further hackery.
-
-<p>
-I realise that threads are the most-frequently-requested feature, and
-I am thinking about it all.  If you have guru-level understanding of 
-fast mutual exclusion mechanisms and race conditions, I would be
-interested in hearing from you.
-
-
-<h3>Verification suite</h3>
-
-Directory <code>tests/</code> contains various ad-hoc tests for
-Valgrind.  However, there is no systematic verification or regression
-suite, that, for example, exercises all the stuff in
-<code>vg_memory.c</code>, to ensure that illegal memory accesses and
-undefined value uses are detected as they should be.  It would be good
-to have such a suite.
-
-
-<h3>Porting to other platforms</h3>
-
-It would be great if Valgrind was ported to FreeBSD and x86 NetBSD,
-and to x86 OpenBSD, if it's possible (doesn't OpenBSD use a.out-style
-executables, not ELF ?)
-
-<p>
-The main difficulties, for an x86-ELF platform, seem to be:
-
-<ul>
-<li>You'd need to rewrite the <code>/proc/self/maps</code> parser
-    (<code>vg_procselfmaps.c</code>).
-    Easy.
-<p>
-<li>You'd need to rewrite <code>vg_syscall_mem.c</code>, or, more
-    specifically, provide one for your OS.  This is tedious, but you
-    can implement syscalls on demand, and the Linux kernel interface
-    is, for the most part, going to look very similar to the *BSD
-    interfaces, so it's really a copy-paste-and-modify-on-demand job.
-    As part of this, you'd need to supply a new
-    <code>vg_kerneliface.h</code> file.
-<p>
-<li>You'd also need to change the syscall wrappers for Valgrind's
-    internal use, in <code>vg_mylibc.c</code>.
-</ul>
-
-All in all, I think a port to x86-ELF *BSDs is not really very
-difficult, and in some ways I would like to see it happen, because
-that would force a more clear factoring of Valgrind into platform
-dependent and independent pieces.  Not to mention, *BSD folks also
-deserve to use Valgrind just as much as the Linux crew do.
-
-
-<p>
-<hr width="100%">
-
-<h2>Easy stuff which ought to be done</h2>
-
-<h3>MMX instructions</h3>
-
-MMX insns should be supported, using the same trick as for FPU insns.
-If the MMX registers are not used to copy uninitialised junk from one
-place to another in memory, this means we don't have to actually
-simulate the internal MMX unit state, so the FPU hack applies.  This
-should be fairly easy.
-
-
-
-<h3>Fix stabs-info reader</h3>
-
-The machinery in <code>vg_symtab2.c</code> which reads "stabs" style
-debugging info is pretty weak.  It usually correctly translates 
-simulated program counter values into line numbers and procedure
-names, but the file name is often completely wrong.  I think the
-logic used to parse "stabs" entries is weak.  It should be fixed.
-The simplest solution, IMO, is to copy either the logic or simply the
-code out of GNU binutils which does this; since GDB can clearly get it
-right, binutils (or GDB?) must have code to do this somewhere.
-
-
-
-
-
-<h3>BT/BTC/BTS/BTR</h3>
-
-These are x86 instructions which test, complement, set, or reset, a
-single bit in a word.  At the moment they are both incorrectly
-implemented and incorrectly instrumented.
-
-<p>
-The incorrect instrumentation is due to use of helper functions.  This
-means we lose bit-level definedness tracking, which could wind up
-giving spurious uninitialised-value use errors.  The Right Thing to do
-is to invent a couple of new UOpcodes, I think <code>GET_BIT</code>
-and <code>SET_BIT</code>, which can be used to implement all 4 x86
-insns, get rid of the helpers, and give bit-accurate instrumentation
-rules for the two new UOpcodes.
-
-<p>
-I realised the other day that they are mis-implemented too.  The x86
-insns take a bit-index and a register or memory location to access.
-For registers the bit index clearly can only be in the range zero to
-register-width minus 1, and I assumed the same applied to memory
-locations too.  But evidently not; for memory locations the index can
-be arbitrary, and the processor will index arbitrarily into memory as
-a result.  This too should be fixed.  Sigh.  Presumably indexing
-outside the immediate word is not actually used by any programs yet
-tested on Valgrind, for otherwise they (presumably) would simply not
-work at all.  If you plan to hack on this, first check the Intel docs
-to make sure my understanding is really correct.
-
-
-
-<h3>Using PREFETCH instructions</h3>
-
-Here's a small but potentially interesting project for performance
-junkies.  Experiments with valgrind's code generator and optimiser(s)
-suggest that reducing the number of instructions executed in the
-translations and mem-check helpers gives disappointingly small
-performance improvements.  Perhaps this is because performance of
-Valgrindified code is limited by cache misses.  After all, each read
-in the original program now gives rise to at least three reads, one
-for the <code>VG_(primary_map)</code>, one for the resulting
-secondary, and the original.  Not to mention, the instrumented
-translations are 13 to 14 times larger than the originals.  All in all
-one would expect the memory system to be hammered to hell and then
-some.
-
-<p>
-So here's an idea.  An x86 insn involving a read from memory, after
-instrumentation, will turn into ucode of the following form:
-<pre>
-    ... calculate effective addr, into ta and qa ...
-    TESTVL qa             -- is the addr defined?
-    LOADV (ta), qloaded   -- fetch V bits for the addr
-    LOAD  (ta), tloaded   -- do the original load
-</pre>
-At the point where the <code>LOADV</code> is done, we know the actual
-address (<code>ta</code>) from which the real <code>LOAD</code> will
-be done.  We also know that the <code>LOADV</code> will take around
-20 x86 insns to do.  So it seems plausible that doing a prefetch of
-<code>ta</code> just before the <code>LOADV</code> might just avoid a
-miss at the <code>LOAD</code> point, and that might be a significant
-performance win.
-
-<p>
-Prefetch insns are notoriously temperamental, more often than not
-making things worse rather than better, so this would require
-considerable fiddling around.  It's complicated because Intels and
-AMDs have different prefetch insns with different semantics, so that
-too needs to be taken into account.  As a general rule, even placing
-the prefetches before the <code>LOADV</code> insn is too near the
-<code>LOAD</code>; the ideal distance is apparently circa 200 CPU
-cycles.  So it might be worth having another analysis/transformation
-pass which pushes prefetches as far back as possible, hopefully 
-immediately after the effective address becomes available.
-
-<p>
-Doing too many prefetches is also bad because they soak up bus
-bandwidth / cpu resources, so some cleverness in deciding which loads
-to prefetch and which to not might be helpful.  One can imagine not
-prefetching client-stack-relative (<code>%EBP</code> or
-<code>%ESP</code>) accesses, since the stack in general tends to show
-good locality anyway.
-
-<p>
-There's quite a lot of experimentation to do here, but I think it
-might make an interesting week's work for someone.
-
-<p>
-As of 15-ish March 2002, I've started to experiment with this, using
-the AMD <code>prefetch/prefetchw</code> insns.
-
-
-
-<h3>User-defined permission ranges</h3>
-
-This is quite a large project -- perhaps a month's hacking for a
-capable hacker to do a good job -- but it's potentially very
-interesting.  The outcome would be that Valgrind could detect a 
-whole class of bugs which it currently cannot.
-
-<p>
-The presentation falls into two pieces.
-
-<p>
-<b>Part 1: user-defined address-range permission setting</b>
-<p>
-
-Valgrind intercepts the client's <code>malloc</code>,
-<code>free</code>, etc calls, watches system calls, and watches the
-stack pointer move.  This is currently the only way it knows about
-which addresses are valid and which not.  Sometimes the client program
-knows extra information about its memory areas.  For example, the
-client could at some point know that all elements of an array are
-out-of-date.  We would like to be able to convey to Valgrind this
-information that the array is now addressable-but-uninitialised, so
-that Valgrind can then warn if elements are used before they get new
-values. 
-
-<p>
-What I would like are some macros like this:
-<pre>
-   VALGRIND_MAKE_NOACCESS(addr, len)
-   VALGRIND_MAKE_WRITABLE(addr, len)
-   VALGRIND_MAKE_READABLE(addr, len)
-</pre>
-and also, to check that memory is addressable/initialised,
-<pre>
-   VALGRIND_CHECK_ADDRESSIBLE(addr, len)
-   VALGRIND_CHECK_INITIALISED(addr, len)
-</pre>
-
-<p>
-I then include in my sources a header defining these macros, rebuild
-my app, run under Valgrind, and get user-defined checks.
-
-<p>
-Now here's a neat trick.  It's a nuisance to have to re-link the app
-with some new library which implements the above macros.  So the idea
-is to define the macros so that the resulting executable is still
-completely stand-alone, and can be run without Valgrind, in which case
-the macros do nothing, but when run on Valgrind, the Right Thing
-happens.  How to do this?  The idea is for these macros to turn into a
-piece of inline assembly code, which (1) has no effect when run on the
-real CPU, (2) is easily spotted by Valgrind's JITter, and (3) no sane
-person would ever write, which is important for avoiding false matches
-in (2).  So here's a suggestion:
-<pre>
-   VALGRIND_MAKE_NOACCESS(addr, len)
-</pre>
-becomes (roughly speaking)
-<pre>
-   movl addr, %eax
-   movl len,  %ebx
-   movl $1,   %ecx   -- 1 describes the action; MAKE_WRITABLE might be
-                     -- 2, etc
-   rorl $13, %ecx
-   rorl $19, %ecx
-   rorl $11, %eax
-   rorl $21, %eax
-</pre>
-The rotate sequences have no effect, and it's unlikely they would
-appear for any other reason, but they define a unique byte-sequence
-which the JITter can easily spot.  Using the operand constraints
-section at the end of a gcc inline-assembly statement, we can tell gcc
-that the assembly fragment kills <code>%eax</code>, <code>%ebx</code>,
-<code>%ecx</code> and the condition codes, so this fragment is made
-harmless when not running on Valgrind, runs quickly when not on
-Valgrind, and does not require any other library support.
-
-
-<p>
-<b>Part 2: using it to detect interference between stack variables</b>
-<p>
-
-Currently Valgrind cannot detect errors of the following form:
-<pre>
-void fooble ( void )
-{
-   int a[10];
-   int b[10];
-   a[10] = 99;
-}
-</pre>
-Now imagine rewriting this as
-<pre>
-void fooble ( void )
-{
-   int spacer0;
-   int a[10];
-   int spacer1;
-   int b[10];
-   int spacer2;
-   VALGRIND_MAKE_NOACCESS(&spacer0, sizeof(int));
-   VALGRIND_MAKE_NOACCESS(&spacer1, sizeof(int));
-   VALGRIND_MAKE_NOACCESS(&spacer2, sizeof(int));
-   a[10] = 99;
-}
-</pre>
-Now the invalid write is certain to hit <code>spacer0</code> or
-<code>spacer1</code>, so Valgrind will spot the error.
-
-<p>
-There are two complications.
-
-<p>
-The first is that we don't want to annotate sources by hand, so the
-Right Thing to do is to write a C/C++ parser, annotator, prettyprinter
-which does this automatically, and run it on post-CPP'd C/C++ source.
-See http://www.cacheprof.org for an example of a system which
-transparently inserts another phase into the gcc/g++ compilation
-route.  The parser/prettyprinter is probably not as hard as it sounds;
-I would write it in Haskell, a powerful functional language well
-suited to doing symbolic computation, with which I am intimately
-familiar.  There is already a C parser written in Haskell by someone in
-the Haskell community, and that would probably be a good starting
-point.
-
-<p>
-The second complication is how to get rid of these
-<code>NOACCESS</code> records inside Valgrind when the instrumented
-function exits; after all, these refer to stack addresses and will
-make no sense whatever when some other function happens to re-use the
-same stack address range, probably shortly afterwards.  I think I
-would be inclined to define a special stack-specific macro
-<pre>
-   VALGRIND_MAKE_NOACCESS_STACK(addr, len)
-</pre>
-which causes Valgrind to record the client's <code>%ESP</code> at the
-time it is executed.  Valgrind will then watch for changes in
-<code>%ESP</code> and discard such records as soon as the protected
-area is uncovered by an increase in <code>%ESP</code>.  I hesitate
-with this scheme only because it is potentially expensive, if there
-are hundreds of such records, and considering that changes in
-<code>%ESP</code> already require expensive messing with stack access
-permissions.
-
-<p>
-This is probably easier and more robust than for the instrumenter 
-program to try and spot all exit points for the procedure and place
-suitable deallocation annotations there.  Plus C++ procedures can 
-bomb out at any point if they get an exception, so spotting return
-points at the source level just won't work at all.
-
-<p>
-Although it is some work, it's all eminently doable, and it would make
-Valgrind into an even-more-useful tool.
-
-
-<p>
-
-
-<hr width="100%">
-
-<h2>Cache profiling</h2>
-Valgrind is a very nice platform for doing cache profiling and other kinds of
-simulation, because it converts horrible x86 instructions into nice clean
-RISC-like UCode.  For example, for cache profiling we are interested in
-instructions that read and write memory;  in UCode there are only four
-instructions that do this:  <code>LOAD</code>, <code>STORE</code>,
-<code>FPU_R</code> and <code>FPU_W</code>.  By contrast, because of the x86
-addressing modes, almost every instruction can read or write memory.<p>
-
-Most of the cache profiling machinery is in the file
-<code>vg_cachesim.c</code>.<p>
-
-These notes are a somewhat haphazard guide to how Valgrind's cache profiling
-works.<p>
-
-<h3>Cost centres</h3>
-Valgrind gathers cache profiling about every instruction executed,
-individually.  Each instruction has a <b>cost centre</b> associated with it.
-There are two kinds of cost centre: one for instructions that don't reference
-memory (<code>iCC</code>), and one for instructions that do
-(<code>idCC</code>):
-
-<pre>
-typedef struct _CC {
-   ULong a;
-   ULong m1;
-   ULong m2;
-} CC;
-
-typedef struct _iCC {
-   /* word 1 */
-   UChar tag;
-   UChar instr_size;
-
-   /* words 2+ */
-   Addr instr_addr;
-   CC I;
-} iCC;
-   
-typedef struct _idCC {
-   /* word 1 */
-   UChar tag;
-   UChar instr_size;
-   UChar data_size;
-
-   /* words 2+ */
-   Addr instr_addr;
-   CC I; 
-   CC D; 
-} idCC; 
-</pre>
-
-Each <code>CC</code> has three fields <code>a</code>, <code>m1</code>,
-<code>m2</code> for recording references, level 1 misses and level 2 misses.
-Each of these is a 64-bit <code>ULong</code> -- the numbers can get very large,
-ie. greater than 4.2 billion allowed by a 32-bit unsigned int.<p>
-
-A <code>iCC</code> has one <code>CC</code> for instruction cache accesses.  A
-<code>idCC</code> has two, one for instruction cache accesses, and one for data
-cache accesses.<p>
-
-The <code>iCC</code> and <code>idCC</code> structs also store unchanging
-information about the instruction:
-<ul>
-  <li>An instruction-type identification tag (explained below)</li><p>
-  <li>Instruction size</li><p>
-  <li>Data reference size (<code>idCC</code> only)</li><p>
-  <li>Instruction address</li><p>
-</ul>
-
-Note that data address is not one of the fields for <code>idCC</code>.  This is
-because for many memory-referencing instructions the data address can change
-each time it's executed (eg. if it uses register-offset addressing).  We have
-to give this item to the cache simulation in a different way (see
-Instrumentation section below). Some memory-referencing instructions do always
-reference the same address, but we don't try to treat them specially in order to
-keep things simple.<p>
-
-Also note that there is only room for recording info about one data cache
-access in an <code>idCC</code>.  So what about instructions that do a read then
-a write, such as:
-
-<blockquote><code>inc (%esi)</code></blockquote>
-
-In a write-allocate cache, as simulated by Valgrind, the write cannot miss,
-since it immediately follows the read which will drag the block into the cache
-if it's not already there.  So the write access isn't really interesting, and
-Valgrind doesn't record it.  This means that Valgrind doesn't measure
-memory references, but rather memory references that could miss in the cache.
-This behaviour is the same as that used by the AMD Athlon hardware counters.
-It also has the benefit of simplifying the implementation -- instructions that
-read and write memory can be treated like instructions that read memory.<p>
-
-<h3>Storing cost-centres</h3>
-Cost centres are stored in a way that makes them very cheap to look up, which is
-important since one is looked up for every original x86 instruction
-executed.<p>
-
-Valgrind does JIT translations at the basic block level, and cost centres are
-also set up and stored at the basic block level.  By doing things carefully, we
-store all the cost centres for a basic block in a contiguous array, and lookup
-comes almost for free.<p>
-
-Consider this part of a basic block (for exposition purposes, pretend it's an
-entire basic block):
-
-<pre>
-movl $0x0,%eax
-movl $0x99, -4(%ebp)
-</pre>
-
-The translation to UCode looks like this:
-                
-<pre>
-MOVL      $0x0, t20
-PUTL      t20, %EAX
-INCEIPo   $5
-
-LEA1L     -4(t4), t14
-MOVL      $0x99, t18
-STL       t18, (t14)
-INCEIPo   $7
-</pre>
-
-The first step is to allocate the cost centres.  This requires a preliminary
-pass to count how many x86 instructions were in the basic block, and their
-types (and thus sizes).  UCode translations for single x86 instructions are
-delimited by the <code>INCEIPo</code> instruction, the argument of which gives
-the byte size of the instruction (note that lazy INCEIP updating is turned off
-to allow this).<p>
-
-We can tell if an x86 instruction references memory by looking for
-<code>LDL</code> and <code>STL</code> UCode instructions, and thus what kind of
-cost centre is required.  From this we can determine how many cost centres we
-need for the basic block, and their sizes.  We can then allocate them in a
-single array.<p>
-
-Consider the example code above.  After the preliminary pass, we know we need
-two cost centres, one <code>iCC</code> and one <code>idCC</code>.  So we
-allocate an array to store these which looks like this:
-
-<pre>
-|(uninit)|      tag         (1 byte)
-|(uninit)|      instr_size  (1 byte)
-|(uninit)|      (padding)   (2 bytes)
-|(uninit)|      instr_addr  (4 bytes)
-|(uninit)|      I.a         (8 bytes)
-|(uninit)|      I.m1        (8 bytes)
-|(uninit)|      I.m2        (8 bytes)
-
-|(uninit)|      tag         (1 byte)
-|(uninit)|      instr_size  (1 byte)
-|(uninit)|      data_size   (1 byte)
-|(uninit)|      (padding)   (1 byte)
-|(uninit)|      instr_addr  (4 bytes)
-|(uninit)|      I.a         (8 bytes)
-|(uninit)|      I.m1        (8 bytes)
-|(uninit)|      I.m2        (8 bytes)
-|(uninit)|      D.a         (8 bytes)
-|(uninit)|      D.m1        (8 bytes)
-|(uninit)|      D.m2        (8 bytes)
-</pre>
-
-(We can see now why we need tags to distinguish between the two types of cost
-centres.)<p>
-
-We also record the size of the array.  We look up the debug info of the first
-instruction in the basic block, and then stick the array into a table indexed
-by filename and function name.  This makes it easy to dump the information
-quickly to file at the end.<p>
-
-<h3>Instrumentation</h3>
-The instrumentation pass has two main jobs:
-
-<ol>
-  <li>Fill in the gaps in the allocated cost centres.</li><p>
-  <li>Add UCode to call the cache simulator for each instruction.</li><p>
-</ol>
-
-The instrumentation pass steps through the UCode and the cost centres in
-tandem.  As each original x86 instruction's UCode is processed, the appropriate
-gaps in the instructions cost centre are filled in, for example:
-
-<pre>
-|INSTR_CC|      tag         (1 byte)
-|5       |      instr_size  (1 byte)
-|(uninit)|      (padding)   (2 bytes)
-|i_addr1 |      instr_addr  (4 bytes)
-|0       |      I.a         (8 bytes)
-|0       |      I.m1        (8 bytes)
-|0       |      I.m2        (8 bytes)
-
-|WRITE_CC|      tag         (1 byte)
-|7       |      instr_size  (1 byte)
-|4       |      data_size   (1 byte)
-|(uninit)|      (padding)   (1 byte)
-|i_addr2 |      instr_addr  (4 bytes)
-|0       |      I.a         (8 bytes)
-|0       |      I.m1        (8 bytes)
-|0       |      I.m2        (8 bytes)
-|0       |      D.a         (8 bytes)
-|0       |      D.m1        (8 bytes)
-|0       |      D.m2        (8 bytes)
-</pre>
-
-(Note that this step is not performed if a basic block is re-translated;  see
-<a href="#retranslations">here</a> for more information.)<p>
-
-GCC inserts padding before the <code>instr_addr</code> field so that it is word
-aligned.<p>
-
-The instrumentation added to call the cache simulation function looks like this
-(instrumentation is indented to distinguish it from the original UCode):
-
-<pre>
-MOVL      $0x0, t20
-PUTL      t20, %EAX
-  PUSHL     %eax
-  PUSHL     %ecx
-  PUSHL     %edx
-  MOVL      $0x4091F8A4, t46  # address of 1st CC
-  PUSHL     t46
-  CALLMo    $0x12             # first cachesim function
-  CLEARo    $0x4
-  POPL      %edx
-  POPL      %ecx
-  POPL      %eax
-INCEIPo   $5
-
-LEA1L     -4(t4), t14
-MOVL      $0x99, t18
-  MOVL      t14, t42
-STL       t18, (t14)
-  PUSHL     %eax
-  PUSHL     %ecx
-  PUSHL     %edx
-  PUSHL     t42
-  MOVL      $0x4091F8C4, t44  # address of 2nd CC
-  PUSHL     t44
-  CALLMo    $0x13             # second cachesim function
-  CLEARo    $0x8
-  POPL      %edx
-  POPL      %ecx
-  POPL      %eax
-INCEIPo   $7
-</pre>
-
-Consider the first instruction's UCode.  Each call is surrounded by three
-<code>PUSHL</code> and <code>POPL</code> instructions to save and restore the
-caller-save registers.  Then the address of the instruction's cost centre is
-pushed onto the stack, to be the first argument to the cache simulation
-function.  The address is known at this point because we are doing a
-simultaneous pass through the cost centre array.  This means the cost centre
-lookup for each instruction is almost free (just the cost of pushing an
-argument for a function call).  Then the call to the cache simulation function
-for non-memory-reference instructions is made (note that the
-<code>CALLMo</code> UInstruction takes an offset into a table of predefined
-functions;  it is not an absolute address), and the single argument is
-<code>CLEAR</code>ed from the stack.<p>
-
-The second instruction's UCode is similar.  The only difference is that, as
-mentioned before, we have to pass the address of the data item referenced to
-the cache simulation function too.  This explains the <code>MOVL t14,
-t42</code> and <code>PUSHL t42</code> UInstructions.  (Note that the seemingly
-redundant <code>MOV</code>ing will probably be optimised away during register
-allocation.)<p>
-
-Note that instead of storing unchanging information about each instruction
-(instruction size, data size, etc) in its cost centre, we could have passed in
-these arguments to the simulation function.  But this would slow the calls down
-(two or three extra arguments pushed onto the stack).  Also it would bloat the
-UCode instrumentation by amounts similar to the space required for them in the
-cost centre;  bloated UCode would also fill the translation cache more quickly,
-requiring more translations for large programs and slowing them down more.<p>
-
-<a name="retranslations"></a>
-<h3>Handling basic block retranslations</h3>
-The above description ignores one complication.  Valgrind has a limited size
-cache for basic block translations;  if it fills up, old translations are
-discarded.  If a discarded basic block is executed again, it must be
-re-translated.<p>
-
-However, we can't use this approach for profiling -- we can't throw away cost
-centres for instructions in the middle of execution!  So when a basic block is
-translated, we first look for its cost centre array in the hash table.  If
-there is no cost centre array, it must be the first translation, so we proceed
-as described above.  But if there is a cost centre array already, it must be a
-retranslation.  In this case, we skip the cost centre allocation and
-initialisation steps, but still do the UCode instrumentation step.<p>
-
-<h3>The cache simulation</h3>
-The cache simulation is fairly straightforward.  It just tracks which memory
-blocks are in the cache at the moment (it doesn't track the contents, since
-that is irrelevant).<p>
-
-The interface to the simulation is quite clean.  The functions called from the
-UCode contain calls to the simulation functions in the files
-<code>vg_cachesim_{I1,D1,L2}.c</code>;  these calls are inlined so that only
-one function call is done per simulated x86 instruction.  The file
-<code>vg_cachesim.c</code> simply <code>#include</code>s the three files
-containing the simulation, which makes plugging in new cache simulations
-very easy -- you just replace the three files and recompile.<p>
-
-<h3>Output</h3>
-Output is fairly straightforward, basically printing the cost centre for every
-instruction, grouped by files and functions.  Total counts (eg. total cache
-accesses, total L1 misses) are calculated when traversing this structure rather
-than during execution, to save time;  the cache simulation functions are called
-so often that even one or two extra adds can make a sizeable difference.<p>
-
-The output file (which is the input file for vg_annotate) has the following format:
-
-<pre>
-file         ::= desc_line* cmd_line events_line data_line+ summary_line
-desc_line    ::= "desc:" ws? non_nl_string
-cmd_line     ::= "cmd:" ws? cmd
-events_line  ::= "events:" ws? (event ws)+
-data_line    ::= file_line | fn_line | count_line
-file_line    ::= ("fl=" | "fi=" | "fe=") filename
-fn_line      ::= "fn=" fn_name
-count_line   ::= line_num ws? (count ws)+
-summary_line ::= "summary:" ws? (count ws)+
-count        ::= num | "."
-</pre>
-
-Where:
-
-<ul>
-  <li><code>non_nl_string</code> is any string not containing a newline.</li><p>
-  <li><code>cmd</code> is a command line invocation.</li><p>
-  <li><code>filename</code> and <code>fn_name</code> can be anything.</li><p>
-  <li><code>num</code> and <code>line_num</code> are decimal numbers.</li><p>
-  <li><code>ws</code> is whitespace.</li><p>
-  <li><code>nl</code> is a newline.</li><p>
-</ul>
-
-The contents of the "desc:" lines are printed out at the top of the summary.
-This is a generic way of providing simulation specific information, eg. for
-giving the cache configuration for cache simulation.<p>
-
-Counts can be "." to represent "N/A", eg. the number of write misses for an
-instruction that doesn't write to memory.<p>
-
-The number of counts in each <code>count_line</code> and the
-<code>summary_line</code> should not exceed the number of events in the
-<code>events_line</code>.  If the number in each <code>count_line</code> is less,
-vg_annotate treats those missing as though they were a "." entry.  <p>
-
-A <code>file_line</code> changes the current file name.  A <code>fn_line</code>
-changes the current function name.  A <code>count_line</code> contains counts
-that pertain to the current filename/fn_name.  A "fl=" <code>file_line</code>
-and a <code>fn_line</code> must appear before any <code>count_line</code>s to
-give the context of the first <code>count_line</code>s.<p>
-
-Each <code>file_line</code> should be immediately followed by a
-<code>fn_line</code>.  "fi=" <code>file_lines</code> are used to switch
-filenames for inlined functions; "fe=" <code>file_lines</code> are similar, but
-are put at the end of a basic block in which the file name hasn't been switched
-back to the original file name.  (fi and fe lines behave the same, they are
-only distinguished to help debugging.)<p>
-
-
-<h3>Summary of performance features</h3>
-Quite a lot of work has gone into making the profiling as fast as possible.
-This is a summary of the important features:
-
-<ul>
-  <li>The basic block-level cost centre storage allows almost free cost centre
-      lookup.</li><p>
-  
-  <li>Only one function call is made per instruction simulated;  even this
-      accounts for a sizeable percentage of execution time, but it seems
-      unavoidable if we want flexibility in the cache simulator.</li><p>
-
-  <li>Unchanging information about an instruction is stored in its cost centre,
-      avoiding unnecessary argument pushing, and minimising UCode
-      instrumentation bloat.</li><p>
-
-  <li>Summary counts are calculated at the end, rather than during
-      execution.</li><p>
-
-  <li>The <code>cachegrind.out</code> output files can contain huge amounts of
-      information; file format was carefully chosen to minimise file
-      sizes.</li><p>
-</ul>
-
-
-<h3>Annotation</h3>
-Annotation is done by vg_annotate.  It is a fairly straightforward Perl script
-that slurps up all the cost centres, and then runs through all the chosen
-source files, printing out cost centres with them.  It too has been carefully
-optimised.
-
-
-<h3>Similar work, extensions</h3>
-It would be relatively straightforward to do other simulations and obtain
-line-by-line information about interesting events.  A good example would be
-branch prediction -- all branches could be instrumented to interact with a
-branch prediction simulator, using very similar techniques to those described
-above.<p>
-
-In particular, vg_annotate would not need to change -- the file format is such
-that it is not specific to the cache simulation, but could be used for any kind
-of line-by-line information.  The only part of vg_annotate that is specific to
-the cache simulation is the name of the input file
-(<code>cachegrind.out</code>), although it would be very simple to add an
-option to control this.<p>
-
-</body>
-</html>
diff --git a/corecheck/Makefile.am b/corecheck/Makefile.am
deleted file mode 100644
index 60553ddac6..0000000000
--- a/corecheck/Makefile.am
+++ /dev/null
@@ -1,110 +0,0 @@
-SUBDIRS = demangle . docs tests
-
-CFLAGS = $(WERROR) -DVG_LIBDIR="\"$(libdir)"\" \
-		-Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-
-valdir = $(libdir)/valgrind
-
-LDFLAGS = -Wl,-z -Wl,initfirst
-
-INCLUDES = -I$(srcdir)/demangle
-
-bin_SCRIPTS = valgrind cachegrind vg_annotate
-
-SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
-
-val_DATA = $(SUPP_FILES) default.supp
-
-BUILT_SOURCES = default.supp
-
-default.supp: $(SUPP_FILES)
-
-bzdist: dist
-	gunzip -c $(PACKAGE)-$(VERSION).tar.gz | bzip2 > $(PACKAGE)-$(VERSION).tar.bz2
-
-EXTRA_DIST = $(val_DATA) \
-	PATCHES_APPLIED ACKNOWLEDGEMENTS \
-	README_KDE3_FOLKS README_PACKAGERS \
-	README_MISSING_SYSCALL_OR_IOCTL TODO dosyms vg_libpthread.vs \
-	valgrind.spec valgrind.spec.in
-
-val_PROGRAMS = valgrind.so valgrinq.so libpthread.so
-
-libpthread_so_SOURCES = vg_libpthread.c vg_libpthread_unimp.c
-
-valgrinq_so_SOURCES = vg_valgrinq_dummy.c
-
-valgrind_so_SOURCES = \
-	vg_clientfuncs.c \
-	vg_scheduler.c \
-        vg_cachesim.c \
-	vg_clientmalloc.c \
-	vg_clientperms.c \
-	vg_demangle.c \
-	vg_dispatch.S \
-	vg_errcontext.c \
-	vg_execontext.c \
-	vg_from_ucode.c \
-	vg_helpers.S \
-	vg_main.c \
-	vg_malloc2.c \
-	vg_memory.c \
-	vg_messages.c \
-	vg_mylibc.c \
-	vg_procselfmaps.c \
-	vg_profile.c \
-	vg_signals.c \
-	vg_startup.S \
-	vg_symtab2.c \
-	vg_syscall_mem.c \
-	vg_syscall.S \
-	vg_to_ucode.c \
-	vg_translate.c \
-	vg_transtab.c \
-	vg_vtagops.c
-
-valgrind_so_LDADD = \
-	demangle/cp-demangle.o \
-	demangle/cplus-dem.o \
-	demangle/dyn-string.o \
-	demangle/safe-ctype.o
-
-include_HEADERS = valgrind.h
-
-noinst_HEADERS = \
-        vg_cachesim_gen.c       \
-        vg_cachesim_I1.c        \
-        vg_cachesim_D1.c        \
-        vg_cachesim_L2.c        \
-        vg_kerneliface.h        \
-        vg_include.h            \
-        vg_constants.h          \
-        vg_unsafe.h
-
-MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) 
-
-vg_memory.o: vg_memory.c $(MANUAL_DEPS)
-	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
-
-vg_clientfuncs.o: vg_clientfuncs.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-vg_libpthread.o: vg_libpthread.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-valgrind.so$(EXEEXT): $(valgrind_so_OBJECTS)
-	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o valgrind.so \
-		$(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-
-valgrinq.so$(EXEEXT): $(valgrinq_so_OBJECTS)
-	$(CC) $(CFLAGS) -shared -o valgrinq.so $(valgrinq_so_OBJECTS)
-
-libpthread.so$(EXEEXT): $(libpthread_so_OBJECTS) $(srcdir)/vg_libpthread.vs
-	$(CC) -Wall -Werror -g -O -shared -fpic -o libpthread.so \
-		$(libpthread_so_OBJECTS) \
-		-Wl,-version-script $(srcdir)/vg_libpthread.vs
-
-# Automake install hook: make sure $(valdir) exists, then (re)create
-# the libpthread.so.0 symlink there, pointing at libpthread.so.  Any
-# previous libpthread.so.0 is removed first.
-install-exec-hook:
-	$(mkinstalldirs) $(DESTDIR)$(valdir)
-	rm -f $(DESTDIR)$(valdir)/libpthread.so.0
-	$(LN_S) libpthread.so $(DESTDIR)$(valdir)/libpthread.so.0
diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am
deleted file mode 100644
index 60553ddac6..0000000000
--- a/coregrind/Makefile.am
+++ /dev/null
@@ -1,110 +0,0 @@
-SUBDIRS = demangle . docs tests
-
-CFLAGS = $(WERROR) -DVG_LIBDIR="\"$(libdir)"\" \
-		-Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-
-valdir = $(libdir)/valgrind
-
-LDFLAGS = -Wl,-z -Wl,initfirst
-
-INCLUDES = -I$(srcdir)/demangle
-
-bin_SCRIPTS = valgrind cachegrind vg_annotate
-
-SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
-
-val_DATA = $(SUPP_FILES) default.supp
-
-BUILT_SOURCES = default.supp
-
-default.supp: $(SUPP_FILES)
-
-bzdist: dist
-	gunzip -c $(PACKAGE)-$(VERSION).tar.gz | bzip2 > $(PACKAGE)-$(VERSION).tar.bz2
-
-EXTRA_DIST = $(val_DATA) \
-	PATCHES_APPLIED ACKNOWLEDGEMENTS \
-	README_KDE3_FOLKS README_PACKAGERS \
-	README_MISSING_SYSCALL_OR_IOCTL TODO dosyms vg_libpthread.vs \
-	valgrind.spec valgrind.spec.in
-
-val_PROGRAMS = valgrind.so valgrinq.so libpthread.so
-
-libpthread_so_SOURCES = vg_libpthread.c vg_libpthread_unimp.c
-
-valgrinq_so_SOURCES = vg_valgrinq_dummy.c
-
-valgrind_so_SOURCES = \
-	vg_clientfuncs.c \
-	vg_scheduler.c \
-        vg_cachesim.c \
-	vg_clientmalloc.c \
-	vg_clientperms.c \
-	vg_demangle.c \
-	vg_dispatch.S \
-	vg_errcontext.c \
-	vg_execontext.c \
-	vg_from_ucode.c \
-	vg_helpers.S \
-	vg_main.c \
-	vg_malloc2.c \
-	vg_memory.c \
-	vg_messages.c \
-	vg_mylibc.c \
-	vg_procselfmaps.c \
-	vg_profile.c \
-	vg_signals.c \
-	vg_startup.S \
-	vg_symtab2.c \
-	vg_syscall_mem.c \
-	vg_syscall.S \
-	vg_to_ucode.c \
-	vg_translate.c \
-	vg_transtab.c \
-	vg_vtagops.c
-
-valgrind_so_LDADD = \
-	demangle/cp-demangle.o \
-	demangle/cplus-dem.o \
-	demangle/dyn-string.o \
-	demangle/safe-ctype.o
-
-include_HEADERS = valgrind.h
-
-noinst_HEADERS = \
-        vg_cachesim_gen.c       \
-        vg_cachesim_I1.c        \
-        vg_cachesim_D1.c        \
-        vg_cachesim_L2.c        \
-        vg_kerneliface.h        \
-        vg_include.h            \
-        vg_constants.h          \
-        vg_unsafe.h
-
-MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) 
-
-vg_memory.o: vg_memory.c $(MANUAL_DEPS)
-	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
-
-vg_clientfuncs.o: vg_clientfuncs.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-vg_libpthread.o: vg_libpthread.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-valgrind.so$(EXEEXT): $(valgrind_so_OBJECTS)
-	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o valgrind.so \
-		$(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-
-valgrinq.so$(EXEEXT): $(valgrinq_so_OBJECTS)
-	$(CC) $(CFLAGS) -shared -o valgrinq.so $(valgrinq_so_OBJECTS)
-
-libpthread.so$(EXEEXT): $(libpthread_so_OBJECTS) $(srcdir)/vg_libpthread.vs
-	$(CC) -Wall -Werror -g -O -shared -fpic -o libpthread.so \
-		$(libpthread_so_OBJECTS) \
-		-Wl,-version-script $(srcdir)/vg_libpthread.vs
-
-# Automake install hook: make sure $(valdir) exists, then (re)create
-# the libpthread.so.0 symlink there, pointing at libpthread.so.  Any
-# previous libpthread.so.0 is removed first.
-install-exec-hook:
-	$(mkinstalldirs) $(DESTDIR)$(valdir)
-	rm -f $(DESTDIR)$(valdir)/libpthread.so.0
-	$(LN_S) libpthread.so $(DESTDIR)$(valdir)/libpthread.so.0
diff --git a/coregrind/arch/x86-linux/vg_libpthread.c b/coregrind/arch/x86-linux/vg_libpthread.c
deleted file mode 100644
index 60c4dc95ff..0000000000
--- a/coregrind/arch/x86-linux/vg_libpthread.c
+++ /dev/null
@@ -1,2850 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A replacement for the standard libpthread.so.                ---*/
-/*---                                              vg_libpthread.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-/* ALL THIS CODE RUNS ON THE SIMULATED CPU.
-
-   This is a replacement for the standard libpthread.so.  It is loaded
-   as part of the client's image (if required) and directs pthread
-   calls through to Valgrind's request mechanism. 
-
-   A couple of caveats.
- 
-   1.  Since it's a binary-compatible replacement for an existing library, 
-       we must take care to used exactly the same data layouts, etc, as 
-       the standard pthread.so does.  
-
-   2.  Since this runs as part of the client, there are no specific
-       restrictions on what headers etc we can include, so long as
-       this libpthread.so does not end up having dependencies on .so's
-       which the real one doesn't.
-
-   Later ... it appears we cannot call file-related stuff in libc here,
-   perhaps fair enough.  Be careful what you call from here.  Even exit()
-   doesn't work (gives infinite recursion and then stack overflow); hence
-   myexit().  Also fprintf doesn't seem safe.
-*/
-
-#include "valgrind.h"    /* For the request-passing mechanism */
-#include "vg_include.h"  /* For the VG_USERREQ__* constants */
-
-#define __USE_UNIX98
-#include <sys/types.h>
-#include <pthread.h>
-#undef __USE_UNIX98
-
-#include <unistd.h>
-#include <string.h>
-#ifdef GLIBC_2_1
-#include <sys/time.h>
-#endif
-
-#include <stdio.h>
-
-
-/* ---------------------------------------------------------------------
-   Forwardses.
-   ------------------------------------------------------------------ */
-
-static void wait_for_fd_to_be_readable_or_erring ( int fd );
-
-static
-int my_do_syscall2 ( int syscallno, 
-                     int arg1, int arg2 );
-
-
-/* ---------------------------------------------------------------------
-   Helpers.  We have to be pretty self-sufficient.
-   ------------------------------------------------------------------ */
-
-/* Number of times any given error message is printed. */
-#define N_MOANS 3
-
-/* Extract from Valgrind the value of VG_(clo_trace_pthread_level).
-   Returns 0 (none) if not running on Valgrind. */
-static
-int get_pt_trace_level ( void )
-{
-   int res;
-   /* Client request with no arguments; res is preset to the default
-      (0), which is what the caller sees when not on Valgrind. */
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__GET_PTHREAD_TRACE_LEVEL,
-                           0, 0, 0, 0);
-   return res;
-}
-
-
-/* Terminate by issuing the __NR_exit system call directly via
-   int $0x80, with arg as the exit status.  Used in place of libc's
-   exit(), which does not work from inside this library. */
-static
-void my_exit ( int arg )
-{
-   int __res;
-   /* arg arrives in %ecx and is copied to %ebx before the trap;
-      the syscall number is preloaded into %eax. */
-   __asm__ volatile ("movl %%ecx, %%ebx ; int $0x80"
-                     : "=a" (__res)
-                     : "0" (__NR_exit),
-                       "c" (arg) );
-   /* We don't bother to mention the fact that this asm trashes %ebx,
-      since it won't return.  If you ever do let it return ... fix
-      this! */
-}
-
-
-/* We need this guy -- it's in valgrind.so. */
-extern void VG_(startup) ( void );
-
-
-/* Just start up Valgrind if it's not already going.  VG_(startup)()
-   detects and ignores second and subsequent calls. */
-static __inline__
-void ensure_valgrind ( char* caller )
-{
-   /* caller is the name of the calling function; it is accepted but
-      not currently used. */
-   VG_(startup)();
-}
-
-/* While we're at it ... hook our own startup function into this
-   game. */
-__asm__ (
-   ".section .init\n"
-   "\tcall vgPlain_startup"
-);
-
-
-/* Print a fatal diagnostic on stderr (fd 2), prefixed with the
-   library name, and terminate with exit status 1.  Never returns.
-   str is the NUL-terminated message; it may be of any length. */
-static
-__attribute__((noreturn))
-void barf ( char* str )
-{
-   /* Write the pieces straight out instead of strcat'ing them into a
-      fixed 100-byte stack buffer: callers such as
-      pthread_attr_setstacksize pass messages longer than that, which
-      overran the buffer. */
-   char* pfx = "\nvalgrind's libpthread.so: ";
-   char* sfx = "\n\n";
-   write(2, pfx, strlen(pfx));
-   write(2, str, strlen(str));
-   write(2, sfx, strlen(sfx));
-   my_exit(1);
-   /* We have to persuade gcc into believing this doesn't return. */
-   while (1) { };
-}
-
-
-/* Tell the user, on stderr, that a call to the function named by msg
-   was accepted but has no effect.  Silent only if the pthread trace
-   level is negative. */
-static void ignored ( char* msg )
-{
-   char* prefix = "valgrind's libpthread.so: IGNORED call to: ";
-   char* eol    = "\n";
-   if (get_pt_trace_level() < 0)
-      return;
-   write(2, prefix, strlen(prefix));
-   write(2, msg, strlen(msg));
-   write(2, eol, strlen(eol));
-}
-
-/* Tell the user, on stderr, that the function named by msg is only
-   approximated by this library.  Silent only if the pthread trace
-   level is negative. */
-static void kludged ( char* msg )
-{
-   char* prefix = "valgrind's libpthread.so: KLUDGED call to: ";
-   char* eol    = "\n";
-   if (get_pt_trace_level() < 0)
-      return;
-   write(2, prefix, strlen(prefix));
-   write(2, msg, strlen(msg));
-   write(2, eol, strlen(eol));
-}
-
-/* Called by the mutex wrappers when RUNNING_ON_VALGRIND is false.
-   msg (the wrapper's name) is not used; the only effect is to call
-   VG_(startup)(). */
-static void not_inside ( char* msg )
-{
-   VG_(startup)();
-}
-
-/* Report on stderr that the pthread function named by what is not
-   implemented, then abort through barf().  Never returns. */
-__attribute__((noreturn))
-void vgPlain_unimp ( char* what )
-{
-   char* banner = "valgrind's libpthread.so: UNIMPLEMENTED FUNCTION: ";
-   char* eol    = "\n";
-   write(2, banner, strlen(banner));
-   write(2, what, strlen(what));
-   write(2, eol, strlen(eol));
-   barf("Please report this bug to me at: jseward@acm.org");
-}
-
-
-/* Failure handler used by the my_assert() macro.  Prints the failed
-   expression together with its file, line and function on stderr,
-   followed by a bug-report address, then exits with status 1.
-   expr/file/fn are strings, line is the source line number. */
-static
-void my_assert_fail ( Char* expr, Char* file, Int line, Char* fn )
-{
-   /* Guard against being entered a second time (for instance from a
-      failure while reporting): in that case just exit with status 2. */
-   static Bool entered = False;
-   if (entered) 
-      my_exit(2);
-   entered = True;
-   fprintf(stderr, "\n%s: %s:%d (%s): Assertion `%s' failed.\n",
-                   "valgrind", file, line, fn, expr );
-   fprintf(stderr, "Please report this bug to me at: %s\n\n", 
-                   VG_EMAIL_ADDR);
-   my_exit(1);
-}
-
-#define MY__STRING(__str)  #__str
-
-#define my_assert(expr)                                               \
-  ((void) ((expr) ? 0 :						      \
-	   (my_assert_fail  (MY__STRING(expr),			      \
-			      __FILE__, __LINE__,                     \
-                              __PRETTY_FUNCTION__), 0)))
-
-
-/* ---------------------------------------------------------------------
-   Pass pthread_ calls to Valgrind's request mechanism.
-   ------------------------------------------------------------------ */
-
-#include <errno.h>
-#include <sys/time.h> /* gettimeofday */
-
-
-/* ---------------------------------------------------
-   Ummm ..
-   ------------------------------------------------ */
-
-/* Pass the error description msg to Valgrind via the
-   VG_USERREQ__PTHREAD_ERROR client request.  The request's result is
-   stored in res and then discarded. */
-static
-void pthread_error ( const char* msg )
-{
-   int res;
-   VALGRIND_MAGIC_SEQUENCE(res, 0,
-                           VG_USERREQ__PTHREAD_ERROR, 
-                           msg, 0, 0, 0);
-}
-
-
-/* ---------------------------------------------------
-   THREAD ATTRIBUTES
-   ------------------------------------------------ */
-
-/* Initialise *attr with this library's defaults.  Always returns 0. */
-int pthread_attr_init(pthread_attr_t *attr)
-{
-   /* Just initialise the fields which we might look at. */
-   attr->__detachstate = PTHREAD_CREATE_JOINABLE;
-   /* pthread_attr_getschedpolicy reads this field back, so give it a
-      defined value rather than leaving it uninitialised. */
-   attr->__schedpolicy = SCHED_OTHER;
-   return 0;
-}
-
-/* Store detachstate in *attr.  Only PTHREAD_CREATE_JOINABLE and
-   PTHREAD_CREATE_DETACHED are accepted; any other value is reported
-   via pthread_error and rejected with EINVAL.  Returns 0 on success. */
-int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate)
-{
-   switch (detachstate) {
-      case PTHREAD_CREATE_JOINABLE:
-      case PTHREAD_CREATE_DETACHED:
-         attr->__detachstate = detachstate;
-         return 0;
-      default:
-         pthread_error("pthread_attr_setdetachstate: "
-                       "detachstate is invalid");
-         return EINVAL;
-   }
-}
-
-/* Not implemented: both arguments are ignored.  The first N_MOANS
-   calls print an IGNORED notice; every call returns 0. */
-int pthread_attr_setinheritsched(pthread_attr_t *attr, int inherit)
-{
-   static int notices_left = N_MOANS;
-   if (notices_left > 0)
-      ignored("pthread_attr_setinheritsched");
-   notices_left--;
-   return 0;
-}
-
-/* Check a requested stack size against what this library can offer.
-   Nothing is stored in *__attr.  Returns 0 if __stacksize is below
-   the fixed limit (VG_PTHREAD_STACK_SIZE less the redzone and a
-   1000-byte safety margin); otherwise prints a diagnostic and
-   terminates through barf() -- it does not return in that case. */
-__attribute__((weak))
-int pthread_attr_setstacksize (pthread_attr_t *__attr,
-                               size_t __stacksize)
-{
-   size_t limit;
-   char buf[1024];
-   ensure_valgrind("pthread_attr_setstacksize");
-   limit = VG_PTHREAD_STACK_SIZE - VG_AR_CLIENT_STACKBASE_REDZONE_SZB 
-                                 - 1000; /* paranoia */
-   if (__stacksize < limit)
-      return 0;
-   /* size_t is unsigned and need not have the width of int, so print
-      it as unsigned long with an explicit cast rather than with %d. */
-   snprintf(buf, sizeof(buf), "pthread_attr_setstacksize: "
-            "requested size %lu >= VG_PTHREAD_STACK_SIZE\n   "
-            "edit vg_include.h and rebuild.",
-            (unsigned long)__stacksize);
-   buf[sizeof(buf)-1] = '\0'; /* Make sure it is zero terminated */
-   barf(buf);
-}
-
-
-/* This is completely bogus. */
-int  pthread_attr_getschedparam(const  pthread_attr_t  *attr,  
-                                struct sched_param *param)
-{
-   /* attr is never consulted; a priority of 0 is always reported.
-      The first N_MOANS calls print a KLUDGED notice. */
-   static int notices_left = N_MOANS;
-   if (notices_left > 0)
-      kludged("pthread_attr_getschedparam");
-   notices_left--;
-   if (param) {
-#     ifdef HAVE_SCHED_PRIORITY
-      param->sched_priority = 0; /* who knows */
-#     else
-      param->__sched_priority = 0; /* who knows */
-#     endif
-   }
-   return 0;
-}
-
-/* Not implemented: both arguments are ignored.  The first N_MOANS
-   calls print an IGNORED notice; every call returns 0. */
-int  pthread_attr_setschedparam(pthread_attr_t  *attr,
-                                const  struct sched_param *param)
-{
-   static int notices_left = N_MOANS;
-   if (notices_left > 0)
-      ignored("pthread_attr_setschedparam");
-   notices_left--;
-   return 0;
-}
-
-/* Nothing is released or modified here.  The first N_MOANS calls
-   print an IGNORED notice; every call returns 0. */
-int pthread_attr_destroy(pthread_attr_t *attr)
-{
-   static int notices_left = N_MOANS;
-   if (notices_left > 0)
-      ignored("pthread_attr_destroy");
-   notices_left--;
-   return 0;
-}
-
-/* These are no-ops, as with LinuxThreads. */
-int pthread_attr_setscope ( pthread_attr_t *attr, int scope )
-{
-   /* Only PTHREAD_SCOPE_SYSTEM is accepted (and nothing is stored in
-      attr).  Anything else is reported; PTHREAD_SCOPE_PROCESS yields
-      ENOTSUP, all other values EINVAL. */
-   ensure_valgrind("pthread_attr_setscope");
-   if (scope != PTHREAD_SCOPE_SYSTEM) {
-      pthread_error("pthread_attr_setscope: "
-                    "invalid or unsupported scope");
-      return (scope == PTHREAD_SCOPE_PROCESS) ? ENOTSUP : EINVAL;
-   }
-   return 0;
-}
-
-/* Report the contention scope.  attr is not consulted: if scope is
-   non-NULL, *scope is always set to PTHREAD_SCOPE_SYSTEM.
-   Always returns 0. */
-int pthread_attr_getscope ( const pthread_attr_t *attr, int *scope )
-{
-   /* Pass our own name (this used to say "pthread_attr_setscope"). */
-   ensure_valgrind("pthread_attr_getscope");
-   if (scope)
-      *scope = PTHREAD_SCOPE_SYSTEM;
-   return 0;
-}
-
-
-/* Pretty bogus.  Avoid if possible. */
-int pthread_getattr_np (pthread_t thread, pthread_attr_t *attr)
-{
-   int    detached;
-   size_t limit;
-   ensure_valgrind("pthread_getattr_np");
-   kludged("pthread_getattr_np");
-   limit = VG_PTHREAD_STACK_SIZE - VG_AR_CLIENT_STACKBASE_REDZONE_SZB 
-                                 - 1000; /* paranoia */
-   attr->__detachstate = PTHREAD_CREATE_JOINABLE;
-   attr->__schedpolicy = SCHED_OTHER;
-   attr->__schedparam.sched_priority = 0;
-   attr->__inheritsched = PTHREAD_EXPLICIT_SCHED;
-   attr->__scope = PTHREAD_SCOPE_SYSTEM;
-   attr->__guardsize = VKI_BYTES_PER_PAGE;
-   attr->__stackaddr = NULL;
-   attr->__stackaddr_set = 0;
-   attr->__stacksize = limit;
-   VALGRIND_MAGIC_SEQUENCE(detached, (-1) /* default */,
-                           VG_USERREQ__SET_OR_GET_DETACH, 
-                           2 /* get */, thread, 0, 0);
-   my_assert(detached == 0 || detached == 1);
-   if (detached)
-      attr->__detachstate = PTHREAD_CREATE_DETACHED;
-   return 0;
-}
-
-
-/* Bogus ... */
-int pthread_attr_getstackaddr ( const pthread_attr_t * attr,
-                                void ** stackaddr )
-{
-   /* attr is not examined: a NULL stack address is always reported,
-      and a KLUDGED notice is printed on every call. */
-   ensure_valgrind("pthread_attr_getstackaddr");
-   kludged("pthread_attr_getstackaddr");
-   if (stackaddr != NULL)
-      stackaddr[0] = NULL;
-   return 0;
-}
-
-/* Not bogus (!) */
-int pthread_attr_getstacksize ( const pthread_attr_t * _attr, 
-                                size_t * __stacksize )
-{
-   /* _attr is not examined: the size reported is the fixed limit,
-      i.e. VG_PTHREAD_STACK_SIZE less the redzone and a 1000-byte
-      safety margin. */
-   ensure_valgrind("pthread_attr_getstacksize");
-   if (__stacksize == NULL)
-      return 0;
-   *__stacksize = VG_PTHREAD_STACK_SIZE
-                  - VG_AR_CLIENT_STACKBASE_REDZONE_SZB
-                  - 1000; /* paranoia */
-   return 0;
-}
-
-/* Record the scheduling policy in *attr.  Only SCHED_OTHER,
-   SCHED_FIFO and SCHED_RR are accepted; anything else gives EINVAL
-   and leaves *attr untouched.  Returns 0 on success. */
-int pthread_attr_setschedpolicy(pthread_attr_t *attr, int policy)
-{
-  switch (policy) {
-    case SCHED_OTHER:
-    case SCHED_FIFO:
-    case SCHED_RR:
-      attr->__schedpolicy = policy;
-      return 0;
-    default:
-      return EINVAL;
-  }
-}
-
-/* Copy the scheduling policy stored in *attr into *policy.  Neither
-   pointer is checked for NULL.  Always returns 0. */
-int pthread_attr_getschedpolicy(const pthread_attr_t *attr, int *policy)
-{
-  *policy = attr->__schedpolicy;
-  return 0;
-}
-
-
-/* --------------------------------------------------- 
-   Helper functions for running a thread 
-   and for clearing up afterwards.
-   ------------------------------------------------ */
-
-/* All exiting threads eventually pass through here, bearing the
-   return value, or PTHREAD_CANCELED, in ret_val. */
-static
-__attribute__((noreturn))
-void thread_exit_wrapper ( void* ret_val )
-{
-   int           detached, res;
-   CleanupEntry  cu;
-   pthread_key_t key;
-
-   /* Run this thread's cleanup handlers.  Each CLEANUP_POP request
-      fills in cu; a result of -1 means the stack is empty, and any
-      result other than 0 or -1 trips the assertion. */
-   while (1) {
-      VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                              VG_USERREQ__CLEANUP_POP,
-                              &cu, 0, 0, 0);
-      if (res == -1) break; /* stack empty */
-      my_assert(res == 0);
-      if (0) printf("running exit cleanup handler");
-      cu.fn ( cu.arg );
-   }
-
-   /* Run this thread's key finalizers.  Really this should be run
-      PTHREAD_DESTRUCTOR_ITERATIONS times. */
-   for (key = 0; key < VG_N_THREAD_KEYS; key++) {
-      /* cu is reused here: fn receives the key's destructor and arg
-         this thread's value for the key. */
-      VALGRIND_MAGIC_SEQUENCE(res, (-2) /* default */,
-                              VG_USERREQ__GET_KEY_D_AND_S,
-                              key, &cu, 0, 0 );
-      if (res == 0) {
-         /* valid key */
-         /* Only call the destructor if both it and the value are
-            non-NULL. */
-         if (cu.fn && cu.arg)
-            cu.fn /* destructor for key */ 
-                  ( cu.arg /* specific for key for this thread */ );
-         continue;
-      }
-      /* Any result other than 0 must be -1. */
-      my_assert(res == -1);
-   }
-
-   /* Decide on my final disposition. */
-   VALGRIND_MAGIC_SEQUENCE(detached, (-1) /* default */,
-                           VG_USERREQ__SET_OR_GET_DETACH, 
-                           2 /* get */, pthread_self(), 0, 0);
-   my_assert(detached == 0 || detached == 1);
-
-   if (detached) {
-      /* Detached; I just quit right now. */
-      VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                              VG_USERREQ__QUIT, 0, 0, 0, 0);
-   } else {
-      /* Not detached; so I wait for a joiner. */
-      VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                              VG_USERREQ__WAIT_JOINER, ret_val, 0, 0, 0);
-   }
-   /* NOTREACHED */
-   /* Neither request above is expected to come back; if one does,
-      die loudly. */
-   barf("thread_exit_wrapper: still alive?!");
-}
-
-
-/* This function is a wrapper function for running a thread.  It runs
-   the root function specified in pthread_create, and then, should the
-   root function return a value, it arranges to run the thread's
-   cleanup handlers and exit correctly. */
-
-/* Struct used to convey info from pthread_create to thread_wrapper.
-   Must be careful not to pass to the child thread any pointers to
-   objects which might be on the parent's stack.  */
-typedef
-   struct {
-      /* Copy of attr->__detachstate as seen by pthread_create
-         (PTHREAD_CREATE_JOINABLE when no attr was supplied). */
-      int   attr__detachstate;
-      /* The thread's start routine ... */
-      void* (*root_fn) ( void* );
-      /* ... and the argument to pass to it. */
-      void* arg;
-   }
-   NewThreadInfo;
-
-
-/* This is passed to the VG_USERREQ__APPLY_IN_NEW_THREAD and so must
-   not return.  Note that this runs in the new thread, not the
-   parent. */
-static
-__attribute__((noreturn))
-void thread_wrapper ( NewThreadInfo* info )
-{
-   int   res;
-   int   attr__detachstate;
-   void* (*root_fn) ( void* );
-   void* arg;
-   void* ret_val;
-
-   /* Copy everything out of *info first: the block is freed
-      immediately below and must not be touched afterwards. */
-   attr__detachstate = info->attr__detachstate;
-   root_fn           = info->root_fn;
-   arg               = info->arg;
-
-   /* Free up the arg block that pthread_create malloced. */
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__FREE, info, 0, 0, 0);
-   my_assert(res == 0);
-
-   /* Minimally observe the attributes supplied.  An unrecognised
-      detach state is reported but otherwise treated as joinable. */
-   if (attr__detachstate != PTHREAD_CREATE_DETACHED
-       && attr__detachstate != PTHREAD_CREATE_JOINABLE)
-      pthread_error("thread_wrapper: invalid attr->__detachstate");
-   if (attr__detachstate == PTHREAD_CREATE_DETACHED)
-      pthread_detach(pthread_self());
-
-   /* The root function might not return.  But if it does we simply
-      move along to thread_exit_wrapper.  All other ways out for the
-      thread (cancellation, or calling pthread_exit) lead there
-      too. */
-   ret_val = root_fn(arg);
-   thread_exit_wrapper(ret_val);
-   /* NOTREACHED */
-}
-
-
-/* ---------------------------------------------------
-   THREADs
-   ------------------------------------------------ */
-
-/* Issue the VG_USERREQ__PTHREAD_YIELD client request.  The request's
-   result is ignored; this always returns 0. */
-__attribute__((weak))
-int pthread_yield ( void )
-{
-   int res;
-   ensure_valgrind("pthread_yield");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_YIELD, 0, 0, 0, 0);
-   return 0;
-}
-
-
-/* Compare two thread handles by value: 1 if they are the same,
-   0 otherwise. */
-int pthread_equal(pthread_t thread1, pthread_t thread2)
-{
-   if (thread1 == thread2)
-      return 1;
-   return 0;
-}
-
-
-/* Bundle up the args into a malloc'd block and create a new thread
-   consisting of thread_wrapper() applied to said malloc'd block. */
-/* Parameters: __thredd, if non-NULL, receives the new thread's id;
-   __attr, if non-NULL, supplies the detach state (no other attribute
-   is looked at); __start_routine and __arg are the thread's root
-   function and its argument.  Returns 0.  Failure to allocate the
-   arg block or to create the thread trips my_assert rather than
-   producing an error return. */
-int
-pthread_create (pthread_t *__restrict __thredd,
-                __const pthread_attr_t *__restrict __attr,
-                void *(*__start_routine) (void *),
-                void *__restrict __arg)
-{
-   int            tid_child;
-   NewThreadInfo* info;
-
-   ensure_valgrind("pthread_create");
-
-   /* Allocate space for the arg block.  thread_wrapper will free
-      it. */
-   VALGRIND_MAGIC_SEQUENCE(info, NULL /* default */,
-                           VG_USERREQ__MALLOC, 
-                           sizeof(NewThreadInfo), 0, 0, 0);
-   my_assert(info != NULL);
-
-   /* Copy the detach state by value so the child never needs to
-      look at *__attr. */
-   if (__attr)
-      info->attr__detachstate = __attr->__detachstate;
-   else 
-      info->attr__detachstate = PTHREAD_CREATE_JOINABLE;
-
-   info->root_fn = __start_routine;
-   info->arg     = __arg;
-   /* Start the child running thread_wrapper(info). */
-   VALGRIND_MAGIC_SEQUENCE(tid_child, VG_INVALID_THREADID /* default */,
-                           VG_USERREQ__APPLY_IN_NEW_THREAD,
-                           &thread_wrapper, info, 0, 0);
-   my_assert(tid_child != VG_INVALID_THREADID);
-
-   if (__thredd)
-      *__thredd = tid_child;
-   return 0; /* success */
-}
-
-
-/* Forward the join to Valgrind via VG_USERREQ__PTHREAD_JOIN, passing
-   the target thread and the location for its return value.  The
-   request's result is returned unchanged (0 if it is not serviced). */
-int 
-pthread_join (pthread_t __th, void **__thread_return)
-{
-   int res;
-   ensure_valgrind("pthread_join");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_JOIN,
-                           __th, __thread_return, 0, 0);
-   return res;
-}
-
-
-/* Terminate the calling thread with result retval by handing over to
-   thread_exit_wrapper, which does not return. */
-void pthread_exit(void *retval)
-{
-   ensure_valgrind("pthread_exit");
-   /* Simple! */
-   thread_exit_wrapper(retval);
-}
-
-
-/* Return the calling thread's id as reported by the
-   VG_USERREQ__PTHREAD_GET_THREADID request (default 1).  An id
-   outside 1 .. VG_N_THREADS-1 is fatal. */
-pthread_t pthread_self(void)
-{
-   int tid;
-   ensure_valgrind("pthread_self");
-   VALGRIND_MAGIC_SEQUENCE(tid, 1 /* default */,
-                           VG_USERREQ__PTHREAD_GET_THREADID,
-                           0, 0, 0, 0);
-   if (tid < 1 || tid >= VG_N_THREADS)
-      barf("pthread_self: invalid ThreadId");
-   return tid;
-}
-
-
-/* Mark thread th as detached.  Returns 0 on success, ESRCH if th is
-   not a known thread, EINVAL if it is already detached.  Any other
-   answer from the detach-state query is fatal. */
-int pthread_detach(pthread_t th)
-{
-   int res;
-   ensure_valgrind("pthread_detach");
-   /* Query the current detach state before changing anything. */
-   VALGRIND_MAGIC_SEQUENCE(res, (-2) /* default */,
-                           VG_USERREQ__SET_OR_GET_DETACH,
-                           2 /* get */, th, 0, 0);
-   switch (res) {
-      case -1: /* not found */
-         pthread_error("pthread_detach: "
-                       "invalid target thread");
-         return ESRCH;
-      case 1: /* already detached */
-         pthread_error("pthread_detach: "
-                       "target thread is already detached");
-         return EINVAL;
-      case 0: /* joinable: flip it to detached */
-         VALGRIND_MAGIC_SEQUENCE(res, (-2) /* default */,
-                                 VG_USERREQ__SET_OR_GET_DETACH,
-                                 1 /* set */, th, 0, 0);
-         my_assert(res == 0);
-         return 0;
-      default:
-         break;
-   }
-   barf("pthread_detach");
-}
-
-
-/* ---------------------------------------------------
-   CLEANUP STACKS
-   ------------------------------------------------ */
-
-/* Register the cleanup handler __routine(__arg) for the calling
-   thread via VG_USERREQ__CLEANUP_PUSH.  __buffer is not used.  A
-   result other than 0 from the request trips my_assert. */
-void _pthread_cleanup_push (struct _pthread_cleanup_buffer *__buffer,
-                            void (*__routine) (void *),
-                            void *__arg)
-{
-   int          res;
-   CleanupEntry cu;
-   ensure_valgrind("_pthread_cleanup_push");
-   cu.fn  = __routine;
-   cu.arg = __arg;
-   /* The entry is handed over by address. */
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__CLEANUP_PUSH,
-                           &cu, 0, 0, 0);
-   my_assert(res == 0);
-}
-
-
-void _pthread_cleanup_push_defer (struct _pthread_cleanup_buffer *__buffer,
-                                  void (*__routine) (void *),
-                                  void *__arg)
-{
-   /* As _pthread_cleanup_push, but first save the thread's original
-      cancellation type in __buffer and set it to Deferred. */
-   int orig_ctype;
-   ensure_valgrind("_pthread_cleanup_push_defer");
-   /* Set to Deferred, and put the old cancellation type in
-      orig_ctype.  The asserts check that -1, used below as the
-      "request failed" default, cannot be mistaken for a real
-      cancellation type, and that __buffer has room for an int. */
-   my_assert(-1 != PTHREAD_CANCEL_DEFERRED);
-   my_assert(-1 != PTHREAD_CANCEL_ASYNCHRONOUS);
-   my_assert(sizeof(struct _pthread_cleanup_buffer) >= sizeof(int));
-   VALGRIND_MAGIC_SEQUENCE(orig_ctype, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELTYPE,
-                           PTHREAD_CANCEL_DEFERRED, 0, 0, 0);   
-   my_assert(orig_ctype != -1);
-   /* The first int-sized word of *__buffer holds the saved type. */
-   *((int*)(__buffer)) = orig_ctype;
-   /* Now push the cleanup. */
-   _pthread_cleanup_push(NULL, __routine, __arg);
-}
-
-
-/* Pop one entry off the calling thread's cleanup stack via
-   VG_USERREQ__CLEANUP_POP and, if __execute is nonzero, run it.
-   __buffer is not used.  Popping an empty stack is silently
-   tolerated; any other unexpected result is fatal. */
-void _pthread_cleanup_pop (struct _pthread_cleanup_buffer *__buffer,
-                           int __execute)
-{
-   int          res;
-   CleanupEntry cu;
-   /* Pass our own name (this used to say "_pthread_cleanup_push"). */
-   ensure_valgrind("_pthread_cleanup_pop");
-   cu.fn = cu.arg = NULL; /* paranoia */
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__CLEANUP_POP,
-                           &cu, 0, 0, 0);
-   if (res == 0) {
-      /* pop succeeded */
-      if (__execute) {
-         cu.fn ( cu.arg );
-      }
-      return;
-   }
-   if (res == -1) {
-      /* stack underflow */
-      return;
-   }
-   barf("_pthread_cleanup_pop");
-}
-
-
-void _pthread_cleanup_pop_restore (struct _pthread_cleanup_buffer *__buffer,
-                                   int __execute)
-{
-   int orig_ctype, fake_ctype;
-   /* As _pthread_cleanup_pop, but after popping/running the handler,
-      restore the thread's original cancellation type from the first
-      word of __buffer. */
-   _pthread_cleanup_pop(NULL, __execute);
-   /* This is the value _pthread_cleanup_push_defer stored. */
-   orig_ctype = *((int*)(__buffer));
-   my_assert(orig_ctype == PTHREAD_CANCEL_DEFERRED
-          || orig_ctype == PTHREAD_CANCEL_ASYNCHRONOUS);
-   my_assert(-1 != PTHREAD_CANCEL_DEFERRED);
-   my_assert(-1 != PTHREAD_CANCEL_ASYNCHRONOUS);
-   my_assert(sizeof(struct _pthread_cleanup_buffer) >= sizeof(int));
-   /* Put the saved type back.  The request hands back the type that
-      was in force until now, which is asserted to be Deferred. */
-   VALGRIND_MAGIC_SEQUENCE(fake_ctype, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELTYPE,
-                           orig_ctype, 0, 0, 0); 
-   my_assert(fake_ctype == PTHREAD_CANCEL_DEFERRED);
-}
-
-
-/* ---------------------------------------------------
-   MUTEX ATTRIBUTES
-   ------------------------------------------------ */
-
-/* Initialise *attr.  The default mutex kind chosen here is
-   PTHREAD_MUTEX_ERRORCHECK_NP.  Always returns 0. */
-int __pthread_mutexattr_init(pthread_mutexattr_t *attr)
-{
-   attr->__mutexkind = PTHREAD_MUTEX_ERRORCHECK_NP;
-   return 0;
-}
-
-/* Store the mutex kind `type' in *attr.  Returns 0 if the kind is
-   one of those accepted below; otherwise reports the error and
-   returns EINVAL, leaving *attr unchanged. */
-int __pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type)
-{
-   switch (type) {
-      /* Which kinds are listed depends on whether GLIBC_2_1 is
-         defined. */
-#     ifndef GLIBC_2_1    
-      case PTHREAD_MUTEX_TIMED_NP:
-      case PTHREAD_MUTEX_ADAPTIVE_NP:
-#     endif
-#     ifdef GLIBC_2_1    
-      case PTHREAD_MUTEX_FAST_NP:
-#     endif
-      case PTHREAD_MUTEX_RECURSIVE_NP:
-      case PTHREAD_MUTEX_ERRORCHECK_NP:
-         attr->__mutexkind = type;
-         return 0;
-      default:
-         pthread_error("pthread_mutexattr_settype: "
-                       "invalid type");
-         return EINVAL;
-   }
-}
-
-/* Nothing to release: attr is left untouched.  Always returns 0. */
-int __pthread_mutexattr_destroy(pthread_mutexattr_t *attr)
-{
-   return 0;
-}
-
-
-/* ---------------------------------------------------
-   MUTEXes
-   ------------------------------------------------ */
-
-/* Initialise *mutex as unlocked with no owner.  The kind comes from
-   mutexattr when one is given and is PTHREAD_MUTEX_ERRORCHECK_NP
-   otherwise.  Always returns 0. */
-int __pthread_mutex_init(pthread_mutex_t *mutex, 
-                         const  pthread_mutexattr_t *mutexattr)
-{
-   mutex->__m_count = 0;
-   mutex->__m_owner = (_pthread_descr)VG_INVALID_THREADID;
-   if (mutexattr)
-      mutex->__m_kind = mutexattr->__mutexkind;
-   else
-      mutex->__m_kind = PTHREAD_MUTEX_ERRORCHECK_NP;
-   return 0;
-}
-
-
-/* Lock *mutex.  On Valgrind the request's result is returned as is.
-   Off Valgrind nothing is locked: not_inside() is called for the
-   first N_MOANS such calls and 0 is returned. */
-int __pthread_mutex_lock(pthread_mutex_t *mutex)
-{
-   static int notices_left = N_MOANS;
-   int        status;
-   if (!(RUNNING_ON_VALGRIND)) {
-      if (notices_left > 0)
-         not_inside("pthread_mutex_lock");
-      notices_left--;
-      return 0; /* success */
-   }
-   VALGRIND_MAGIC_SEQUENCE(status, 0 /* default */,
-                           VG_USERREQ__PTHREAD_MUTEX_LOCK,
-                           mutex, 0, 0, 0);
-   return status;
-}
-
-
-/* Try to lock *mutex.  On Valgrind the request's result is returned
-   as is.  Off Valgrind nothing is locked: not_inside() is called for
-   the first N_MOANS such calls and 0 is returned. */
-int __pthread_mutex_trylock(pthread_mutex_t *mutex)
-{
-   static int notices_left = N_MOANS;
-   int        status;
-   if (!(RUNNING_ON_VALGRIND)) {
-      if (notices_left > 0)
-         not_inside("pthread_mutex_trylock");
-      notices_left--;
-      return 0;
-   }
-   VALGRIND_MAGIC_SEQUENCE(status, 0 /* default */,
-                           VG_USERREQ__PTHREAD_MUTEX_TRYLOCK,
-                           mutex, 0, 0, 0);
-   return status;
-}
-
-
-/* Unlock *mutex.  On Valgrind the request's result is returned as
-   is.  Off Valgrind nothing is done: not_inside() is called for the
-   first N_MOANS such calls and 0 is returned. */
-int __pthread_mutex_unlock(pthread_mutex_t *mutex)
-{
-   static int notices_left = N_MOANS;
-   int        status;
-   if (!(RUNNING_ON_VALGRIND)) {
-      if (notices_left > 0)
-         not_inside("pthread_mutex_unlock");
-      notices_left--;
-      return 0;
-   }
-   VALGRIND_MAGIC_SEQUENCE(status, 0 /* default */,
-                           VG_USERREQ__PTHREAD_MUTEX_UNLOCK,
-                           mutex, 0, 0, 0);
-   return status;
-}
-
-
-/* Destroy *mutex.  Returns EBUSY (after reporting the error) if the
-   mutex's count is still above zero; otherwise resets the fields to
-   the same values __pthread_mutex_init uses by default and
-   returns 0. */
-int __pthread_mutex_destroy(pthread_mutex_t *mutex)
-{
-   /* Valgrind doesn't hold any resources on behalf of the mutex, so no
-      need to involve it. */
-   if (mutex->__m_count > 0) {
-       pthread_error("pthread_mutex_destroy: "
-                     "mutex is still in use");
-       return EBUSY;
-   }
-   mutex->__m_count = 0;
-   mutex->__m_owner = (_pthread_descr)VG_INVALID_THREADID;
-   mutex->__m_kind  = PTHREAD_MUTEX_ERRORCHECK_NP;
-   return 0;
-}
-
-
-/* ---------------------------------------------------
-   CONDITION VARIABLES
-   ------------------------------------------------ */
-
-/* LinuxThreads supports no attributes for conditions.  Hence ... */
-
-/* No condition attributes are supported, so there is nothing to
-   initialise: *attr is left untouched and 0 is returned. */
-int pthread_condattr_init(pthread_condattr_t *attr)
-{
-   (void)attr;
-   return 0;
-}
-
-/* Counterpart of pthread_condattr_init: nothing to release, so *attr
-   is left untouched and 0 is returned. */
-int pthread_condattr_destroy(pthread_condattr_t *attr)
-{
-   (void)attr;
-   return 0;
-}
-
-/* Initialise *cond by setting its __c_waiting field to
-   VG_INVALID_THREADID.  cond_attr is ignored.  Always returns 0. */
-int pthread_cond_init( pthread_cond_t *cond,
-                       const pthread_condattr_t *cond_attr)
-{
-   (void)cond_attr;
-   cond->__c_waiting = (_pthread_descr)VG_INVALID_THREADID;
-   return 0;
-}
-
-/* Destroy *cond.  Nothing is actually done to the condition variable
-   (a proper implementation should check that no threads are waiting
-   on it); kludged() is called while a static counter starting at
-   N_MOANS is still positive.  Always returns 0. */
-int pthread_cond_destroy(pthread_cond_t *cond)
-{
-   static int complaints_left = N_MOANS;
-
-   if (complaints_left-- > 0) {
-      kludged("pthread_cond_destroy");
-   }
-   return 0;
-}
-
-/* ---------------------------------------------------
-   SCHEDULING
-   ------------------------------------------------ */
-
-/* Bogus implementation: target_thread is never consulted.  If policy
-   is non-NULL it receives SCHED_OTHER; if param is non-NULL its
-   priority field (sched_priority or __sched_priority, depending on
-   HAVE_SCHED_PRIORITY) is set to 0.  kludged() is called while a
-   static counter starting at N_MOANS is still positive.  Always
-   returns 0. */
-int   pthread_getschedparam(pthread_t  target_thread,  
-                            int  *policy,
-                            struct sched_param *param)
-{
-   static int complaints_left = N_MOANS;
-
-   if (complaints_left-- > 0)
-      kludged("pthread_getschedparam");
-
-   if (policy != NULL)
-      *policy = SCHED_OTHER;
-
-   if (param != NULL) {
-#     ifdef HAVE_SCHED_PRIORITY
-      param->sched_priority = 0;   /* who knows */
-#     else
-      param->__sched_priority = 0; /* who knows */
-#     endif
-   }
-   return 0;
-}
-
-/* Scheduling parameters are not implemented: every argument is
-   ignored, ignored() is called while a static counter starting at
-   N_MOANS is still positive, and 0 is returned. */
-int pthread_setschedparam(pthread_t target_thread, 
-                          int policy, 
-                          const struct sched_param *param)
-{
-   static int complaints_left = N_MOANS;
-
-   if (complaints_left-- > 0) {
-      ignored("pthread_setschedparam");
-   }
-   return 0;
-}
-
-/* Wait on cond with mutex: after ensure_valgrind(), the pair is
-   forwarded as a VG_USERREQ__PTHREAD_COND_WAIT request and the
-   request's result is returned. */
-int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
-{
-   int rc;
-
-   ensure_valgrind("pthread_cond_wait");
-   VALGRIND_MAGIC_SEQUENCE(rc, 0 /* default */,
-                           VG_USERREQ__PTHREAD_COND_WAIT,
-                           cond, mutex, 0, 0);
-   return rc;
-}
-
-/* Wait on cond with mutex, giving up at the absolute wallclock time
-   *abstime.
-
-   The deadline is translated into the millisecond timer read via
-   VG_USERREQ__READ_MILLISECOND_TIMER: the number of milliseconds
-   between "now" (from gettimeofday) and *abstime is added to the
-   current timer value, and the sum is passed as the third argument
-   of the VG_USERREQ__PTHREAD_COND_TIMEDWAIT request.  A deadline that
-   already lies in the past is clamped to "now".
-
-   Returns the result of the COND_TIMEDWAIT request. */
-int pthread_cond_timedwait ( pthread_cond_t *cond, 
-                             pthread_mutex_t *mutex, 
-                             const struct  timespec *abstime )
-{
-   int res;
-   unsigned int ms_now, ms_end;
-   struct  timeval timeval_now;
-   unsigned long long int ull_ms_now_after_1970;
-   unsigned long long int ull_ms_end_after_1970;
-
-   ensure_valgrind("pthread_cond_timedwait");
-   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__READ_MILLISECOND_TIMER,
-                           0, 0, 0, 0);
-   my_assert(ms_now != 0xFFFFFFFF);
-   res = gettimeofday(&timeval_now, NULL);
-   my_assert(res == 0);
-
-   /* tv_usec is in microseconds, so divide by 1000 (not 1000000) to
-      get milliseconds.  Dividing by 1000000 always yielded 0 and made
-      the computed timeout up to a second too long. */
-   ull_ms_now_after_1970 
-      = 1000ULL * ((unsigned long long int)(timeval_now.tv_sec))
-        + ((unsigned long long int)(timeval_now.tv_usec / 1000));
-   /* tv_nsec is in nanoseconds: 1000000 ns per millisecond. */
-   ull_ms_end_after_1970
-      = 1000ULL * ((unsigned long long int)(abstime->tv_sec))
-        + ((unsigned long long int)(abstime->tv_nsec / 1000000));
-   /* Deadline already passed: do not let the subtraction below wrap. */
-   if (ull_ms_end_after_1970 < ull_ms_now_after_1970)
-      ull_ms_end_after_1970 = ull_ms_now_after_1970;
-   ms_end 
-      = ms_now + (unsigned int)(ull_ms_end_after_1970 - ull_ms_now_after_1970);
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_COND_TIMEDWAIT,
-                           cond, mutex, ms_end, 0);
-   return res;
-}
-
-
-/* After ensure_valgrind(), forward cond as a
-   VG_USERREQ__PTHREAD_COND_SIGNAL request and return the request's
-   result. */
-int pthread_cond_signal(pthread_cond_t *cond)
-{
-   int rc;
-
-   ensure_valgrind("pthread_cond_signal");
-   VALGRIND_MAGIC_SEQUENCE(rc, 0 /* default */,
-                           VG_USERREQ__PTHREAD_COND_SIGNAL,
-                           cond, 0, 0, 0);
-   return rc;
-}
-
-/* After ensure_valgrind(), forward cond as a
-   VG_USERREQ__PTHREAD_COND_BROADCAST request and return the request's
-   result. */
-int pthread_cond_broadcast(pthread_cond_t *cond)
-{
-   int rc;
-
-   ensure_valgrind("pthread_cond_broadcast");
-   VALGRIND_MAGIC_SEQUENCE(rc, 0 /* default */,
-                           VG_USERREQ__PTHREAD_COND_BROADCAST,
-                           cond, 0, 0, 0);
-   return rc;
-}
-
-
-/* ---------------------------------------------------
-   CANCELLATION
-   ------------------------------------------------ */
-
-/* Set the cancel state via a VG_USERREQ__SET_CANCELSTATE request.
-   state must be PTHREAD_CANCEL_ENABLE or PTHREAD_CANCEL_DISABLE;
-   anything else is reported through pthread_error() and EINVAL is
-   returned.  On success the request's result is stored in *oldstate
-   (when oldstate is non-NULL) and 0 is returned. */
-int pthread_setcancelstate(int state, int *oldstate)
-{
-   int previous;
-
-   ensure_valgrind("pthread_setcancelstate");
-
-   if (!(state == PTHREAD_CANCEL_ENABLE
-         || state == PTHREAD_CANCEL_DISABLE)) {
-      pthread_error("pthread_setcancelstate: "
-                    "invalid state");
-      return EINVAL;
-   }
-
-   /* -1 is the request's "not handled" default, so it must not
-      collide with a legitimate state value. */
-   my_assert(-1 != PTHREAD_CANCEL_ENABLE);
-   my_assert(-1 != PTHREAD_CANCEL_DISABLE);
-   VALGRIND_MAGIC_SEQUENCE(previous, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELSTATE,
-                           state, 0, 0, 0);
-   my_assert(previous != -1);
-
-   if (oldstate != NULL)
-      *oldstate = previous;
-   return 0;
-}
-
-/* Set the cancel type via a VG_USERREQ__SET_CANCELTYPE request.
-   type must be PTHREAD_CANCEL_DEFERRED or
-   PTHREAD_CANCEL_ASYNCHRONOUS; anything else is reported through
-   pthread_error() and EINVAL is returned.  On success the request's
-   result is stored in *oldtype (when oldtype is non-NULL) and 0 is
-   returned. */
-int pthread_setcanceltype(int type, int *oldtype)
-{
-   int previous;
-
-   ensure_valgrind("pthread_setcanceltype");
-
-   if (!(type == PTHREAD_CANCEL_DEFERRED
-         || type == PTHREAD_CANCEL_ASYNCHRONOUS)) {
-      pthread_error("pthread_setcanceltype: "
-                    "invalid type");
-      return EINVAL;
-   }
-
-   /* -1 is the request's "not handled" default, so it must not
-      collide with a legitimate type value. */
-   my_assert(-1 != PTHREAD_CANCEL_DEFERRED);
-   my_assert(-1 != PTHREAD_CANCEL_ASYNCHRONOUS);
-   VALGRIND_MAGIC_SEQUENCE(previous, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELTYPE,
-                           type, 0, 0, 0);
-   my_assert(previous != -1);
-
-   if (oldtype != NULL)
-      *oldtype = previous;
-   return 0;
-}
-
-/* After ensure_valgrind(), issue a VG_USERREQ__SET_CANCELPEND request
-   carrying the target thread and the address of thread_exit_wrapper.
-   The request's result (asserted not to be the -1 default) is
-   returned. */
-int pthread_cancel(pthread_t thread)
-{
-   int rc;
-
-   ensure_valgrind("pthread_cancel");
-   VALGRIND_MAGIC_SEQUENCE(rc, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELPEND,
-                           thread, &thread_exit_wrapper, 0, 0);
-   my_assert(rc != -1);
-   return rc;
-}
-
-/* Internal cancellation check shared by the wrappers in this file:
-   issues a VG_USERREQ__TESTCANCEL request and asserts that its
-   result is 0. */
-static __inline__
-void __my_pthread_testcancel(void)
-{
-   int rc;
-
-   VALGRIND_MAGIC_SEQUENCE(rc, (-1) /* default */,
-                           VG_USERREQ__TESTCANCEL,
-                           0, 0, 0, 0);
-   my_assert(rc == 0);
-}
-
-/* Public entry point; simply delegates to the internal
-   __my_pthread_testcancel(). */
-void pthread_testcancel ( void )
-{
-   __my_pthread_testcancel();
-   return;
-}
-
-
-/* Purpose uncertain -- probably exists to satisfy the POSIX
-   requirements for fork() and exec().  The same work is already done
-   internally whenever those syscalls are observed, so this may well
-   be redundant.
-
-   After ensure_valgrind(), issues a VG_USERREQ__NUKE_OTHER_THREADS
-   request and asserts that its result is 0. */
-void __pthread_kill_other_threads_np ( void )
-{
-   int rc;
-
-   ensure_valgrind("__pthread_kill_other_threads_np");
-   VALGRIND_MAGIC_SEQUENCE(rc, (-1) /* default */,
-                           VG_USERREQ__NUKE_OTHER_THREADS,
-                           0, 0, 0, 0);
-   my_assert(rc == 0);
-}
-
-
-/* ---------------------------------------------------
-   SIGNALS
-   ------------------------------------------------ */
-
-#include <signal.h>
-
-/* pthread_sigmask(): validate the arguments and forward them as a
-   VG_USERREQ__PTHREAD_SIGMASK request.
-
-   how     - SIG_SETMASK, SIG_BLOCK or SIG_UNBLOCK; any other value is
-             reported through pthread_error() and EINVAL is returned.
-   newmask - must be non-NULL, otherwise EFAULT is returned.
-   oldmask - handed to the request unchanged.
-
-   Returns 0 when the request's result is 0, EFAULT otherwise.
-
-   NOTE(review): POSIX permits a NULL new mask (query-only call, with
-   `how` ignored); this implementation rejects it and validates `how`
-   first.  Confirm how the scheduler treats NULL before relaxing. */
-int pthread_sigmask(int how, const sigset_t *newmask, 
-                             sigset_t *oldmask)
-{
-   int res;
-
-   /* A bit subtle, because the scheduler expects newmask and oldmask
-      to be vki_sigset_t* rather than sigset_t*, and the two are
-      different.  Fortunately the first 64 bits of a sigset_t are
-      exactly a vki_sigset_t, so we just pass the pointers through
-      unmodified.  Haaaack! 
-
-      Also map the how value from glibc's SIG_ constants to the
-      corresponding VKI_ constants, so that the former do not have to
-      be included into vg_scheduler.c. */
-
-   ensure_valgrind("pthread_sigmask");
-
-   switch (how) {
-      case SIG_SETMASK: how = VKI_SIG_SETMASK; break;
-      case SIG_BLOCK:   how = VKI_SIG_BLOCK; break;
-      case SIG_UNBLOCK: how = VKI_SIG_UNBLOCK; break;
-      default: pthread_error("pthread_sigmask: invalid how");
-               return EINVAL;
-   }
-
-   /* Crude check */
-   if (newmask == NULL)
-      return EFAULT;
-
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_SIGMASK,
-                           how, newmask, oldmask, 0);
-
-   /* The scheduler tells us of any memory violations. */
-   return res == 0 ? 0 : EFAULT;
-}
-
-
-/* After ensure_valgrind(), forward set and sig as a
-   VG_USERREQ__SIGWAIT request and return the request's result.  As in
-   pthread_sigmask, the sigset_t pointer is deliberately passed where
-   the scheduler expects its own kernel-style signal-set type. */
-int sigwait ( const sigset_t* set, int* sig )
-{
-   int rc;
-
-   ensure_valgrind("sigwait");
-   VALGRIND_MAGIC_SEQUENCE(rc, 0 /* default */,
-                           VG_USERREQ__SIGWAIT,
-                           set, sig, 0, 0);
-   return rc;
-}
-
-
-/* After ensure_valgrind(), forward thread and signo as a
-   VG_USERREQ__PTHREAD_KILL request and return the request's
-   result. */
-int pthread_kill(pthread_t thread, int signo)
-{
-   int rc;
-
-   ensure_valgrind("pthread_kill");
-   VALGRIND_MAGIC_SEQUENCE(rc, 0 /* default */,
-                           VG_USERREQ__PTHREAD_KILL, 
-                           thread, signo, 0, 0);
-   return rc;
-}
-
-
-/* Based on the LinuxThreads version. */
-/* raise() is redefined so that the signal goes to the calling thread
-   only, as per POSIX 1003.1c.  Returns 0 when pthread_kill() succeeds;
-   otherwise stores pthread_kill()'s return value in errno and
-   returns -1. */
-int raise (int sig)
-{
-  int err = pthread_kill(pthread_self(), sig);
-
-  if (err != 0) {
-    errno = err;
-    return -1;
-  }
-  return 0;
-}
-
-
-/* pause(): return once the value reported by
-   VG_USERREQ__GET_N_SIGS_RETURNED differs from the value seen on
-   entry.  Implemented by polling that counter, issuing a nanosleep
-   syscall with a 52 millisecond interval between polls.  Always
-   returns -1 with the errno slot from __errno_location() set to
-   EINTR. */
-int pause ( void )
-{
-   unsigned int n_orig, n_now;
-   struct vki_timespec nanosleep_interval;
-   ensure_valgrind("pause");
-
-   /* This is surely a cancellation point. */
-   __my_pthread_testcancel();
-
-   /* Baseline counter value; 0xFFFFFFFF is the request's default and
-      would mean the request was not serviced. */
-   VALGRIND_MAGIC_SEQUENCE(n_orig, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__GET_N_SIGS_RETURNED, 
-                           0, 0, 0, 0);
-   my_assert(n_orig != 0xFFFFFFFF);
-
-   while (1) {
-      VALGRIND_MAGIC_SEQUENCE(n_now, 0xFFFFFFFF /* default */,
-                              VG_USERREQ__GET_N_SIGS_RETURNED, 
-                              0, 0, 0, 0);
-      my_assert(n_now != 0xFFFFFFFF);
-      my_assert(n_now >= n_orig);
-      /* Counter moved since entry: stop waiting. */
-      if (n_now != n_orig) break;
-
-      nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 52 * 1000 * 1000; /* 52 milliseconds */
-      /* It's critical here that valgrind's nanosleep implementation
-         is nonblocking. */
-      (void)my_do_syscall2(__NR_nanosleep, 
-                           (int)(&nanosleep_interval), (int)NULL);
-   }
-
-   * (__errno_location()) = EINTR;
-   return -1;
-}
-
-
-/* ---------------------------------------------------
-   THREAD-SPECIFICs
-   ------------------------------------------------ */
-
-/* After ensure_valgrind(), forward key and destr_function as a
-   VG_USERREQ__PTHREAD_KEY_CREATE request and return the request's
-   result. */
-int __pthread_key_create(pthread_key_t *key,  
-                         void  (*destr_function)  (void *))
-{
-   int rc;
-
-   ensure_valgrind("pthread_key_create");
-   VALGRIND_MAGIC_SEQUENCE(rc, 0 /* default */,
-                           VG_USERREQ__PTHREAD_KEY_CREATE,
-                           key, destr_function, 0, 0);
-   return rc;
-}
-
-/* Key deletion is not implemented: key is ignored, ignored() is
-   called while a static counter starting at N_MOANS is still
-   positive, and 0 is returned. */
-int pthread_key_delete(pthread_key_t key)
-{
-   static int complaints_left = N_MOANS;
-
-   if (complaints_left-- > 0) {
-      ignored("pthread_key_delete");
-   }
-   return 0;
-}
-
-/* After ensure_valgrind(), forward key and pointer as a
-   VG_USERREQ__PTHREAD_SETSPECIFIC request and return the request's
-   result. */
-int __pthread_setspecific(pthread_key_t key, const void *pointer)
-{
-   int rc;
-
-   ensure_valgrind("pthread_setspecific");
-   VALGRIND_MAGIC_SEQUENCE(rc, 0 /* default */,
-                           VG_USERREQ__PTHREAD_SETSPECIFIC,
-                           key, pointer, 0, 0);
-   return rc;
-}
-
-/* After ensure_valgrind(), issue a VG_USERREQ__PTHREAD_GETSPECIFIC
-   request for key and return its int result cast to void*. */
-void * __pthread_getspecific(pthread_key_t key)
-{
-   int raw_value;
-
-   ensure_valgrind("pthread_getspecific");
-   VALGRIND_MAGIC_SEQUENCE(raw_value, 0 /* default */,
-                           VG_USERREQ__PTHREAD_GETSPECIFIC,
-                           key, 0 , 0, 0);
-   return (void*)raw_value;
-}
-
-
-/* ---------------------------------------------------
-   ONCEry
-   ------------------------------------------------ */
-
-static pthread_mutex_t once_masterlock = PTHREAD_MUTEX_INITIALIZER;
-
-
-/* Run init_routine if *once_control is still 0, marking it as 1
-   before the call.  The whole check-and-call is done while holding
-   the file-wide once_masterlock.  If taking that lock fails, barf()
-   is called on the assumption that the init routine re-entered
-   pthread_once().  Always returns 0. */
-int __pthread_once ( pthread_once_t *once_control, 
-                     void (*init_routine) (void) )
-{
-   int lock_rc;
-
-   ensure_valgrind("pthread_once");
-
-   lock_rc = __pthread_mutex_lock(&once_masterlock);
-   if (lock_rc != 0)
-      barf("pthread_once: Looks like your program's "
-           "init routine calls back to pthread_once() ?!");
-
-   if (*once_control == 0) {
-      /* Mark as done first, then run the initialiser. */
-      *once_control = 1;
-      init_routine();
-   }
-
-   __pthread_mutex_unlock(&once_masterlock);
-   return 0;
-}
-
-
-/* ---------------------------------------------------
-   MISC
-   ------------------------------------------------ */
-
-static pthread_mutex_t pthread_atfork_lock 
-   = PTHREAD_MUTEX_INITIALIZER;
-
-/* Register a (prepare, parent, child) triple of fork handlers.
-
-   Under pthread_atfork_lock: read the number of used slots
-   (VG_USERREQ__GET_FHSTACK_USED), store the new entry in the next
-   slot (VG_USERREQ__SET_FHSTACK_ENTRY), then bump the count
-   (VG_USERREQ__SET_FHSTACK_USED).  barf()s when the stack already
-   holds VG_N_FORKHANDLERSTACK-1 entries.  Always returns 0. */
-int __pthread_atfork ( void (*prepare)(void),
-                       void (*parent)(void),
-                       void (*child)(void) )
-{
-   ForkHandlerEntry new_entry;
-   int n_used, rc;
-
-   ensure_valgrind("pthread_atfork");
-   __pthread_mutex_lock(&pthread_atfork_lock);
-
-   /* How many handler slots are in use right now? */
-   VALGRIND_MAGIC_SEQUENCE(n_used, -2 /* default */,
-                           VG_USERREQ__GET_FHSTACK_USED,
-                           0, 0, 0, 0);
-   my_assert(n_used >= 0 && n_used < VG_N_FORKHANDLERSTACK);
-   if (n_used == VG_N_FORKHANDLERSTACK-1) {
-      barf("pthread_atfork: VG_N_FORKHANDLERSTACK is too low; "
-           "increase and recompile");
-   }
-
-   /* Fill in the new entry and store it in slot n_used. */
-   new_entry.prepare = *prepare;
-   new_entry.parent  = *parent;
-   new_entry.child   = *child;
-   VALGRIND_MAGIC_SEQUENCE(rc, -2 /* default */,
-                           VG_USERREQ__SET_FHSTACK_ENTRY,
-                           n_used, &new_entry, 0, 0);
-   my_assert(rc == 0);
-
-   /* Publish the new slot count. */
-   VALGRIND_MAGIC_SEQUENCE(rc, -2 /* default */,
-                           VG_USERREQ__SET_FHSTACK_USED,
-                           n_used+1, 0, 0, 0);
-   my_assert(rc == 0);
-
-   __pthread_mutex_unlock(&pthread_atfork_lock);
-   return 0;
-}
-
-
-/* Weak definition whose only action is the ensure_valgrind()
-   check. */
-__attribute__((weak)) 
-void __pthread_initialize ( void )
-{
-   ensure_valgrind("__pthread_initialize");
-   return;
-}
-
-
-/* ---------------------------------------------------
-   LIBRARY-PRIVATE THREAD SPECIFIC STATE
-   ------------------------------------------------ */
-
-#include <resolv.h>
-static int thread_specific_errno[VG_N_THREADS];
-static int thread_specific_h_errno[VG_N_THREADS];
-static struct __res_state
-           thread_specific_res_state[VG_N_THREADS];
-
-/* Return the address of the calling thread's errno slot: the thread
-   id is obtained with VG_USERREQ__PTHREAD_GET_THREADID (default 1)
-   and used as an index into thread_specific_errno[].  barf()s if the
-   id is outside 1 .. VG_N_THREADS-1.  Deliberately does not call
-   ensure_valgrind(). */
-int* __errno_location ( void )
-{
-   int my_tid;
-
-   VALGRIND_MAGIC_SEQUENCE(my_tid, 1 /* default */,
-                           VG_USERREQ__PTHREAD_GET_THREADID,
-                           0, 0, 0, 0);
-   /* Paranoia: never index outside the table. */
-   if (!(my_tid >= 1 && my_tid < VG_N_THREADS))
-      barf("__errno_location: invalid ThreadId");
-   return &thread_specific_errno[my_tid];
-}
-
-/* Return the address of the calling thread's h_errno slot: the
-   thread id is obtained with VG_USERREQ__PTHREAD_GET_THREADID
-   (default 1) and used as an index into thread_specific_h_errno[].
-   barf()s if the id is outside 1 .. VG_N_THREADS-1.  Deliberately
-   does not call ensure_valgrind(). */
-int* __h_errno_location ( void )
-{
-   int my_tid;
-
-   VALGRIND_MAGIC_SEQUENCE(my_tid, 1 /* default */,
-                           VG_USERREQ__PTHREAD_GET_THREADID,
-                           0, 0, 0, 0);
-   /* Paranoia: never index outside the table. */
-   if (!(my_tid >= 1 && my_tid < VG_N_THREADS))
-      barf("__h_errno_location: invalid ThreadId");
-   return &thread_specific_h_errno[my_tid];
-}
-
-/* Return the address of the calling thread's resolver state: the
-   thread id is obtained with VG_USERREQ__PTHREAD_GET_THREADID
-   (default 1) and used as an index into thread_specific_res_state[].
-   barf()s if the id is outside 1 .. VG_N_THREADS-1.  Deliberately
-   does not call ensure_valgrind(). */
-struct __res_state* __res_state ( void )
-{
-   int my_tid;
-
-   VALGRIND_MAGIC_SEQUENCE(my_tid, 1 /* default */,
-                           VG_USERREQ__PTHREAD_GET_THREADID,
-                           0, 0, 0, 0);
-   /* Paranoia: never index outside the table. */
-   if (!(my_tid >= 1 && my_tid < VG_N_THREADS))
-      barf("__res_state: invalid ThreadId");
-   return &thread_specific_res_state[my_tid];
-}
-
-
-/* ---------------------------------------------------
-   LIBC-PRIVATE SPECIFIC DATA
-   ------------------------------------------------ */
-
-/* Relies on assumption that initial private data is NULL.  This
-   should be fixed somehow. */
-
-/* The allowable keys (indices) (all 2 of them). 
-   From sysdeps/pthread/bits/libc-tsd.h
-*/
-#define N_LIBC_TSD_EXTRA_KEYS 1
-
-enum __libc_tsd_key_t { _LIBC_TSD_KEY_MALLOC = 0,
-                        _LIBC_TSD_KEY_DL_ERROR,
-                        _LIBC_TSD_KEY_N };
-
-/* Auto-initialising subsystem.  libc_specifics_inited is set 
-   after initialisation.  libc_specifics_inited_mx guards it. */
-static int             libc_specifics_inited    = 0;
-static pthread_mutex_t libc_specifics_inited_mx = PTHREAD_MUTEX_INITIALIZER;
-
-/* These are the keys we must initialise the first time. */
-static pthread_key_t libc_specifics_keys[_LIBC_TSD_KEY_N
-                                         + N_LIBC_TSD_EXTRA_KEYS];
-
-/* Initialise the keys, if they are not already initialised.
-
-   Under libc_specifics_inited_mx: on the first call, set
-   libc_specifics_inited and create one pthread key (no destructor)
-   for each of the _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS slots of
-   libc_specifics_keys[].  Later calls only take and release the
-   lock.  Any lock, unlock or key-creation failure goes to barf(). */
-static
-void init_libc_tsd_keys ( void )
-{
-   int rc, slot;
-   pthread_key_t new_key;
-
-   rc = pthread_mutex_lock(&libc_specifics_inited_mx);
-   if (rc != 0)
-      barf("init_libc_tsd_keys: lock");
-
-   if (!libc_specifics_inited) {
-      libc_specifics_inited = 1;
-      slot = 0;
-      while (slot < _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS) {
-         rc = pthread_key_create(&new_key, NULL);
-         if (rc != 0)
-            barf("init_libc_tsd_keys: create");
-         libc_specifics_keys[slot] = new_key;
-         slot++;
-      }
-   }
-
-   rc = pthread_mutex_unlock(&libc_specifics_inited_mx);
-   if (rc != 0)
-      barf("init_libc_tsd_keys: unlock");
-}
-
-
-/* Store pointer as the calling thread's value for libc-private key
-   `key`.  barf()s on a key outside
-   [_LIBC_TSD_KEY_MALLOC, _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS).
-   Keys at or beyond _LIBC_TSD_KEY_N are accepted but produce a
-   "dubious key" message on stderr while a static counter starting at
-   N_MOANS is still positive.  Makes sure the underlying pthread keys
-   exist, then calls pthread_setspecific(); a failure there goes to
-   barf().  Returns 0. */
-static int
-libc_internal_tsd_set ( enum __libc_tsd_key_t key, 
-                        const void * pointer )
-{
-   static int complaints_left = N_MOANS;
-   int        rc;
-
-   if (!(key >= _LIBC_TSD_KEY_MALLOC 
-         && key < _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS))
-      barf("libc_internal_tsd_set: invalid key");
-
-   /* The counter is only consumed for dubious keys. */
-   if (key >= _LIBC_TSD_KEY_N && complaints_left-- > 0) {
-      fprintf(stderr, 
-         "valgrind's libpthread.so: libc_internal_tsd_set: "
-         "dubious key %d\n", key);
-   }
-
-   init_libc_tsd_keys();
-   rc = pthread_setspecific(libc_specifics_keys[key], pointer);
-   if (rc != 0)
-      barf("libc_internal_tsd_set: setspecific failed");
-   return 0;
-}
-
-/* Fetch the calling thread's value for libc-private key `key`.
-   barf()s on a key outside
-   [_LIBC_TSD_KEY_MALLOC, _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS).
-   Keys at or beyond _LIBC_TSD_KEY_N are accepted but produce a
-   "dubious key" message on stderr while a static counter starting at
-   N_MOANS is still positive.  Makes sure the underlying pthread keys
-   exist, then returns whatever pthread_getspecific() yields; a NULL
-   result is not treated as an error. */
-static void *
-libc_internal_tsd_get ( enum __libc_tsd_key_t key )
-{
-   static int complaints_left = N_MOANS;
-
-   if (!(key >= _LIBC_TSD_KEY_MALLOC 
-         && key < _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS))
-      barf("libc_internal_tsd_get: invalid key");
-
-   /* The counter is only consumed for dubious keys. */
-   if (key >= _LIBC_TSD_KEY_N && complaints_left-- > 0) {
-      fprintf(stderr, 
-         "valgrind's libpthread.so: libc_internal_tsd_get: "
-         "dubious key %d\n", key);
-   }
-
-   init_libc_tsd_keys();
-   return pthread_getspecific(libc_specifics_keys[key]);
-}
-
-
-
-
-int (*__libc_internal_tsd_set)
-    (enum __libc_tsd_key_t key, const void * pointer)
-   = libc_internal_tsd_set;
-
-void* (*__libc_internal_tsd_get)
-      (enum __libc_tsd_key_t key)
-   = libc_internal_tsd_get;
-
-
-/* ---------------------------------------------------------------------
-   These are here (I think) because they are deemed cancellation
-   points by POSIX.  For the moment we'll simply pass the call along
-   to the corresponding thread-unaware (?) libc routine.
-   ------------------------------------------------------------------ */
-
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-
-/* sigaction(): run the cancellation check, then forward unchanged to
-   glibc's own entry point, which is named __sigaction when GLIBC_2_1
-   is defined and __libc_sigaction otherwise. */
-#ifdef GLIBC_2_1
-extern int __sigaction(int signum, 
-                       const struct sigaction *act,  
-                       struct  sigaction *oldact);
-#else
-extern int __libc_sigaction(int signum, 
-                            const struct sigaction *act,  
-                            struct  sigaction *oldact);
-#endif
-int sigaction(int signum, 
-              const struct sigaction *act,  
-              struct  sigaction *oldact)
-{
-   int rc;
-   __my_pthread_testcancel();
-#  ifdef GLIBC_2_1
-   rc = __sigaction(signum, act, oldact);
-#  else
-   rc = __libc_sigaction(signum, act, oldact);
-#  endif
-   return rc;
-}
-
-
-/* connect(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_connect. */
-extern int __libc_connect(int sockfd,
-                          const struct sockaddr *serv_addr,
-                          socklen_t addrlen);
-__attribute__((weak))
-int  connect(int  sockfd,  
-             const  struct  sockaddr  *serv_addr, 
-             socklen_t addrlen)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_connect(sockfd, serv_addr, addrlen);
-   return rc;
-}
-
-
-/* fcntl(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_fcntl. */
-extern int __libc_fcntl(int fd, int cmd, long arg);
-__attribute__((weak))
-int fcntl(int fd, int cmd, long arg)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_fcntl(fd, cmd, arg);
-   return rc;
-}
-
-
-/* write(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_write. */
-extern ssize_t __libc_write(int fd, const void *buf, size_t count);
-__attribute__((weak))
-ssize_t write(int fd, const void *buf, size_t count)
-{
-   ssize_t n_done;
-   __my_pthread_testcancel();
-   n_done = __libc_write(fd, buf, count);
-   return n_done;
-}
-
-
-/* read(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_read. */
-extern ssize_t __libc_read(int fd, void *buf, size_t count);
-__attribute__((weak))
-ssize_t read(int fd, void *buf, size_t count)
-{
-   ssize_t n_done;
-   __my_pthread_testcancel();
-   n_done = __libc_read(fd, buf, count);
-   return n_done;
-}
-
- 
-/* open64(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_open64. */
-extern int __libc_open64(const char *pathname, int flags, mode_t mode);
-__attribute__((weak))
-int open64(const char *pathname, int flags, mode_t mode)
-{
-   int fd;
-   __my_pthread_testcancel();
-   fd = __libc_open64(pathname, flags, mode);
-   return fd;
-}
-
-
-/* open(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_open. */
-extern int __libc_open(const char *pathname, int flags, mode_t mode);
-__attribute__((weak))
-int open(const char *pathname, int flags, mode_t mode)
-{
-   int fd;
-   __my_pthread_testcancel();
-   fd = __libc_open(pathname, flags, mode);
-   return fd;
-}
-
-
-/* close(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_close. */
-extern int __libc_close(int fd);
-__attribute__((weak))
-int close(int fd)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_close(fd);
-   return rc;
-}
-
-
-/* accept(): weak wrapper.  Runs the cancellation check, calls
-   wait_for_fd_to_be_readable_or_erring(s), runs the cancellation
-   check again, and only then forwards unchanged to __libc_accept. */
-extern int __libc_accept(int s, struct sockaddr *addr, socklen_t *addrlen);
-__attribute__((weak))
-int accept(int s, struct sockaddr *addr, socklen_t *addrlen)
-{
-   int rc;
-   __my_pthread_testcancel();
-   wait_for_fd_to_be_readable_or_erring(s);
-   __my_pthread_testcancel();
-   rc = __libc_accept(s, addr, addrlen);
-   return rc;
-}
-
-
-/* waitpid(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_waitpid. */
-extern pid_t __libc_waitpid(pid_t pid, int *status, int options);
-__attribute__((weak))
-pid_t waitpid(pid_t pid, int *status, int options)
-{
-   pid_t reaped;
-   __my_pthread_testcancel();
-   reaped = __libc_waitpid(pid, status, options);
-   return reaped;
-}
-
-
-/* nanosleep(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_nanosleep. */
-extern int __libc_nanosleep(const struct timespec *req, struct timespec *rem);
-__attribute__((weak))
-int nanosleep(const struct timespec *req, struct timespec *rem)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_nanosleep(req, rem);
-   return rc;
-}
-
-
-/* fsync(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_fsync. */
-extern int __libc_fsync(int fd);
-__attribute__((weak))
-int fsync(int fd)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_fsync(fd);
-   return rc;
-}
-
-
-/* lseek(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_lseek. */
-extern off_t __libc_lseek(int fildes, off_t offset, int whence);
-__attribute__((weak))
-off_t lseek(int fildes, off_t offset, int whence)
-{
-   off_t new_pos;
-   __my_pthread_testcancel();
-   new_pos = __libc_lseek(fildes, offset, whence);
-   return new_pos;
-}
-
-
-/* lseek64(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_lseek64. */
-extern __off64_t __libc_lseek64(int fildes, __off64_t offset, int whence);
-__attribute__((weak))
-__off64_t lseek64(int fildes, __off64_t offset, int whence)
-{
-   __off64_t new_pos;
-   __my_pthread_testcancel();
-   new_pos = __libc_lseek64(fildes, offset, whence);
-   return new_pos;
-}
-
-
-/* __pread64(): runs the cancellation check and then forwards
-   unchanged to __libc_pread64.  Unlike most wrappers here it is not
-   declared weak. */
-extern ssize_t __libc_pread64 (int __fd, void *__buf, size_t __nbytes,
-                               __off64_t __offset);
-ssize_t __pread64 (int __fd, void *__buf, size_t __nbytes,
-                   __off64_t __offset)
-{
-   ssize_t n_done;
-   __my_pthread_testcancel();
-   n_done = __libc_pread64(__fd, __buf, __nbytes, __offset);
-   return n_done;
-}
-
-
-/* __pwrite64(): runs the cancellation check and then forwards
-   unchanged to __libc_pwrite64.  Unlike most wrappers here it is not
-   declared weak. */
-extern ssize_t __libc_pwrite64 (int __fd, const void *__buf,
-                                size_t __nbytes, __off64_t __offset);
-ssize_t __pwrite64 (int __fd, const void *__buf, size_t __nbytes,
-                   __off64_t __offset)
-{
-   ssize_t n_done;
-   __my_pthread_testcancel();
-   n_done = __libc_pwrite64(__fd, __buf, __nbytes, __offset);
-   return n_done;
-}
-
-
-/* pwrite(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_pwrite. */
-extern ssize_t __libc_pwrite(int fd, const void *buf, size_t count, off_t offset);
-__attribute__((weak))
-ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)
-{
-   ssize_t n_done;
-   __my_pthread_testcancel();
-   n_done = __libc_pwrite(fd, buf, count, offset);
-   return n_done;
-}
-
-
-/* pread(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_pread. */
-extern ssize_t __libc_pread(int fd, void *buf, size_t count, off_t offset);
-__attribute__((weak))
-ssize_t pread(int fd, void *buf, size_t count, off_t offset)
-{
-   ssize_t n_done;
-   __my_pthread_testcancel();
-   n_done = __libc_pread(fd, buf, count, offset);
-   return n_done;
-}
-
-
-/* longjmp(): strong (deliberately not weak) definition that hands
-   straight over to __libc_longjmp, which does not return.  No
-   cancellation check is made. */
-extern void __libc_longjmp(jmp_buf env, int val)
-                           __attribute((noreturn));
-void longjmp(jmp_buf env, int val)
-{
-   __libc_longjmp(env, val);
-}
-
-
-/* siglongjmp(): reports via kludged() on every call that cleanup
-   handlers are ignored, then hands over to __libc_siglongjmp, which
-   does not return. */
-extern void __libc_siglongjmp (sigjmp_buf env, int val)
-                               __attribute__ ((noreturn));
-void siglongjmp(sigjmp_buf env, int val)
-{
-   kludged("siglongjmp (cleanup handlers are ignored)");
-   __libc_siglongjmp(env, val);
-}
-
-
-/* send(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_send. */
-extern int __libc_send(int s, const void *msg, size_t len, int flags);
-__attribute__((weak))
-int send(int s, const void *msg, size_t len, int flags)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_send(s, msg, len, flags);
-   return rc;
-}
-
-
-/* recv(): weak wrapper.  Runs the cancellation check, calls
-   wait_for_fd_to_be_readable_or_erring(s), runs the cancellation
-   check again, and only then forwards unchanged to __libc_recv. */
-extern int __libc_recv(int s, void *buf, size_t len, int flags);
-__attribute__((weak))
-int recv(int s, void *buf, size_t len, int flags)
-{
-   int rc;
-   __my_pthread_testcancel();
-   wait_for_fd_to_be_readable_or_erring(s);
-   __my_pthread_testcancel();
-   rc = __libc_recv(s, buf, len, flags);
-   return rc;
-}
-
-
-/* sendmsg(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_sendmsg. */
-extern int __libc_sendmsg(int s, const struct msghdr *msg, int flags);
-__attribute__((weak))
-int sendmsg(int s, const struct msghdr *msg, int flags)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_sendmsg(s, msg, flags);
-   return rc;
-}
-
-
-/* recvmsg(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_recvmsg. */
-extern int __libc_recvmsg(int s, struct msghdr *msg, int flags);
-__attribute__((weak))
-int recvmsg(int s, struct msghdr *msg, int flags)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_recvmsg(s, msg, flags);
-   return rc;
-}
-
-
-/* recvfrom(): weak wrapper.  Runs the cancellation check, calls
-   wait_for_fd_to_be_readable_or_erring(s), runs the cancellation
-   check again, and only then forwards unchanged to
-   __libc_recvfrom. */
-extern int __libc_recvfrom(int s, void *buf, size_t len, int flags,
-                           struct sockaddr *from, socklen_t *fromlen);
-__attribute__((weak))
-int recvfrom(int s, void *buf, size_t len, int flags,
-             struct sockaddr *from, socklen_t *fromlen)
-{
-   int rc;
-   __my_pthread_testcancel();
-   wait_for_fd_to_be_readable_or_erring(s);
-   __my_pthread_testcancel();
-   rc = __libc_recvfrom(s, buf, len, flags, from, fromlen);
-   return rc;
-}
-
-
-/* sendto(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_sendto. */
-extern int __libc_sendto(int s, const void *msg, size_t len, int flags,
-                         const struct sockaddr *to, socklen_t tolen);
-__attribute__((weak))
-int sendto(int s, const void *msg, size_t len, int flags, 
-           const struct sockaddr *to, socklen_t tolen)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_sendto(s, msg, len, flags, to, tolen);
-   return rc;
-}
-
-
-/* system(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_system. */
-extern int __libc_system(const char* str);
-__attribute__((weak))
-int system(const char* str)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_system(str);
-   return rc;
-}
-
-
-/* wait(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_wait. */
-extern pid_t __libc_wait(int *status);
-__attribute__((weak))
-pid_t wait(int *status)
-{
-   pid_t reaped;
-   __my_pthread_testcancel();
-   reaped = __libc_wait(status);
-   return reaped;
-}
-
-
-/* msync(): weak wrapper that runs the cancellation check and then
-   forwards unchanged to __libc_msync. */
-extern int __libc_msync(const void *start, size_t length, int flags);
-__attribute__((weak))
-int msync(const void *start, size_t length, int flags)
-{
-   int rc;
-   __my_pthread_testcancel();
-   rc = __libc_msync(start, length, flags);
-   return rc;
-}
-
-
-/*--- fork and its helper ---*/
-
-/* Run the registered fork handlers of one kind.
-
-   what == 0: prepare handlers, walked from the most recently
-              registered entry down to the first.
-   what == 1: parent handlers, walked in registration order.
-   what == 2: child handlers, walked in registration order.
-
-   Entries are fetched one at a time with
-   VG_USERREQ__GET_FHSTACK_ENTRY; NULL handlers are skipped.  After a
-   parent or child pass (but not after a prepare pass) the handler
-   stack is emptied by setting its used-count to 0. */
-static
-void run_fork_handlers ( int what )
-{
-   ForkHandlerEntry slot;
-   int n_registered, idx, pass, rc;
-
-   my_assert(what == 0 || what == 1 || what == 2);
-
-   /* How many handlers are registered? */
-   VALGRIND_MAGIC_SEQUENCE(n_registered, -2 /* default */,
-                           VG_USERREQ__GET_FHSTACK_USED,
-                           0, 0, 0, 0);
-   my_assert(n_registered >= 0 && n_registered < VG_N_FORKHANDLERSTACK);
-
-   /* Prepare handlers start at the top of the stack; the others
-      start at the bottom. */
-   idx = (what == 0) ? n_registered - 1 : 0;
-
-   for (pass = 0; pass < n_registered; pass++) {
-      VALGRIND_MAGIC_SEQUENCE(rc, -2 /* default */,
-                              VG_USERREQ__GET_FHSTACK_ENTRY,
-                              idx, &slot, 0, 0);
-      my_assert(rc == 0);
-      switch (what) {
-         case 0:
-            if (slot.prepare) slot.prepare();
-            idx--;
-            break;
-         case 1:
-            if (slot.parent) slot.parent();
-            idx++;
-            break;
-         case 2:
-            if (slot.child) slot.child();
-            idx++;
-            break;
-         default:
-            barf("run_fork_handlers: invalid what");
-      }
-   }
-
-   if (what != 0 /* prepare */) {
-      /* Parent or child pass finished: empty out the stack. */
-      VALGRIND_MAGIC_SEQUENCE(rc, -2 /* default */,
-                              VG_USERREQ__SET_FHSTACK_USED,
-                              0, 0, 0, 0);
-      my_assert(rc == 0);
-   }
-}
-
-/* __fork(): fork with atfork-handler support.
-
-   After the cancellation check, pthread_atfork_lock is taken and the
-   prepare handlers are run, then __libc_fork() is called.  When it
-   returns 0 (child) the child handlers run and the lock is
-   re-initialised; for any other return value (parent, including a
-   failed fork) the parent handlers run and the lock is released.
-   Returns whatever __libc_fork() returned. */
-extern pid_t __libc_fork(void);
-pid_t __fork(void)
-{
-   pid_t fork_result;
-
-   __my_pthread_testcancel();
-   __pthread_mutex_lock(&pthread_atfork_lock);
-   run_fork_handlers(0 /* prepare */);
-
-   fork_result = __libc_fork();
-
-   if (fork_result != 0) {
-      /* Parent side. */
-      run_fork_handlers(1 /* parent */);
-      __pthread_mutex_unlock(&pthread_atfork_lock);
-   } else {
-      /* Child side. */
-      run_fork_handlers(2 /* child */);
-      __pthread_mutex_init(&pthread_atfork_lock, NULL);
-   }
-   return fork_result;
-}
-
-
-
-
-/* ---------------------------------------------------------------------
-   Nonblocking implementations of select() and poll().  This stuff will
-   surely rot your mind.
-   ------------------------------------------------------------------ */
-
-/*--------------------------------------------------*/
-
-#include "vg_kerneliface.h"
-
-/* Return 1 if res lies in the range -4095 .. -1 inclusive (treated
-   by callers as a kernel error code), and 0 otherwise. */
-static
-__inline__
-int is_kerror ( int res )
-{
-   return (res < 0 && res >= -4095) ? 1 : 0;
-}
-
-
-/* Issue a one-argument system call through `int $0x80` and return
-   the raw value left in %eax.
-
-   syscallno enters in %eax (constraint "0", tied to the "=a"
-   output); arg1 enters in %edx and is copied into %ebx by the asm
-   itself, with the caller's %ebx saved and restored around the
-   trap. */
-static
-int my_do_syscall1 ( int syscallno, int arg1 )
-{ 
-   int __res;
-   __asm__ volatile ("pushl %%ebx; movl %%edx,%%ebx ; int $0x80 ; popl %%ebx"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "d" (arg1) );
-   return __res;
-}
-
-/* Issue a two-argument system call through `int $0x80` and return
-   the raw value left in %eax.
-
-   syscallno enters in %eax (constraint "0", tied to the "=a"
-   output); arg1 enters in %edx and is copied into %ebx by the asm
-   itself, with the caller's %ebx saved and restored around the
-   trap; arg2 enters in %ecx. */
-static
-int my_do_syscall2 ( int syscallno, 
-                     int arg1, int arg2 )
-{ 
-   int __res;
-   __asm__ volatile ("pushl %%ebx; movl %%edx,%%ebx ; int $0x80 ; popl %%ebx"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "d" (arg1),
-                       "c" (arg2) );
-   return __res;
-}
-
-/* Issue a three-argument system call through `int $0x80` and return
-   the raw value left in %eax.
-
-   syscallno enters in %eax (constraint "0", tied to the "=a"
-   output); arg1 enters in %esi and is copied into %ebx by the asm
-   itself, with the caller's %ebx saved and restored around the
-   trap; arg2 enters in %ecx and arg3 in %edx. */
-static
-int my_do_syscall3 ( int syscallno, 
-                     int arg1, int arg2, int arg3 )
-{ 
-   int __res;
-   __asm__ volatile ("pushl %%ebx; movl %%esi,%%ebx ; int $0x80 ; popl %%ebx"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "S" (arg1),
-                       "c" (arg2),
-                       "d" (arg3) );
-   return __res;
-}
-
-/* Invoke the select system call directly.  The five arguments are
-   packed, in order, into an int array whose address is passed as the
-   single argument of __NR_select via my_do_syscall1().  Returns the
-   raw syscall result. */
-static
-int do_syscall_select( int n, 
-                       vki_fd_set* readfds, 
-                       vki_fd_set* writefds, 
-                       vki_fd_set* exceptfds, 
-                       struct vki_timeval * timeout )
-{
-   int packed[5];
-
-   packed[0] = n;
-   packed[1] = (int)readfds;
-   packed[2] = (int)writefds;
-   packed[3] = (int)exceptfds;
-   packed[4] = (int)timeout;
-
-   return my_do_syscall1(__NR_select, (int)(&(packed[0])) );
-}
-
-
-/* This is a wrapper round select(), which makes it thread-safe,
-   meaning that only this thread will block, rather than the entire
-   process.  This wrapper in turn depends on nanosleep() not to block
-   the entire process, but I think (hope? suspect?) that POSIX
-   pthreads guarantees that to be the case.
-
-   Basic idea is: modify the timeout parameter to select so that it
-   returns immediately.  Poll like this until select returns non-zero,
-   indicating something interesting happened, or until our time is up.
-   Space out the polls with nanosleeps of 50 milliseconds (see
-   nanosleep_interval below), which is required to be nonblocking;
-   this allows other threads to run.
-
-   Returns:
-   * the (positive) result of the underlying select, with the
-     caller's sets overwritten by the kernel's answer;
-   * 0 if the deadline passes first, with the caller's non-NULL sets
-     cleared by FD_ZERO;
-   * -1 with errno set if the select syscall reports an error, or if
-     the nanosleep between polls is interrupted (errno = EINTR).
-
-   Assumes:
-   * (checked crudely, by comparing sizeof and calling barf on a
-     mismatch) types fd_set and vki_fd_set are identical.
-   * (checked the same way) types timeval and vki_timeval are
-     identical.
-   * (unchecked) libc error numbers (EINTR etc) are the negation of the
-     kernel's error numbers (VKI_EINTR etc).
-*/
-
-/* __attribute__((weak)) */
-int select ( int n, 
-             fd_set *rfds, 
-             fd_set *wfds, 
-             fd_set *xfds, 
-             struct timeval *timeout )
-{
-   unsigned int ms_now, ms_end;
-   int    res;
-   fd_set rfds_copy;
-   fd_set wfds_copy;
-   fd_set xfds_copy;
-   struct vki_timeval  t_now;
-   struct vki_timeval  zero_timeout;
-   struct vki_timespec nanosleep_interval;
-
-   __my_pthread_testcancel();
-
-   /* gcc's complains about ms_end being used uninitialised -- classic
-      case it can't understand, where ms_end is both defined and used
-      only if timeout != NULL.  Hence ... */
-   ms_end = 0;
-
-   /* We assume that the kernel and libc data layouts are identical
-      for the following types.  These size comparisons provide a
-      crude check. */
-   if (sizeof(fd_set) != sizeof(vki_fd_set)
-       || sizeof(struct timeval) != sizeof(struct vki_timeval))
-      barf("valgrind's hacky non-blocking select(): data sizes error");
-
-   /* Detect the current time and simultaneously find out if we are
-      running on Valgrind.  ms_now keeps the default 0xFFFFFFFF when
-      the request is not serviced. */
-   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__READ_MILLISECOND_TIMER,
-                           0, 0, 0, 0);
-
-   /* If a zero timeout specified, this call is harmless.  Also go
-      this route if we're not running on Valgrind, for whatever
-      reason.  In both cases the caller's own sets and timeout are
-      passed straight to the kernel. */
-   if ( (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0)
-        || (ms_now == 0xFFFFFFFF) ) {
-      res = do_syscall_select( n, (vki_fd_set*)rfds, 
-                                   (vki_fd_set*)wfds, 
-                                   (vki_fd_set*)xfds, 
-                                   (struct vki_timeval*)timeout);
-      if (is_kerror(res)) {
-         * (__errno_location()) = -res;
-         return -1;
-      } else {
-         return res;
-      }
-   }
-
-   /* If a timeout was specified, set ms_end to be the end millisecond
-      counter [wallclock] time. */
-   if (timeout) {
-      /* NOTE(review): t_now is filled in here but is not read again
-         in this function; only the syscall's result is checked. */
-      res = my_do_syscall2(__NR_gettimeofday, (int)&t_now, (int)NULL);
-      my_assert(res == 0);
-      ms_end = ms_now;
-      /* Integer division: any sub-millisecond part of tv_usec is
-         dropped. */
-      ms_end += (timeout->tv_usec / 1000);
-      ms_end += (timeout->tv_sec * 1000);
-      /* Stay sane ... this trips if the unsigned sum above wrapped
-         round to below ms_now. */
-      my_assert (ms_end >= ms_now);
-   }
-
-   /* fprintf(stderr, "MY_SELECT: before loop\n"); */
-
-   /* Either timeout == NULL, meaning wait indefinitely, or timeout !=
-      NULL, in which case ms_end holds the end time. */
-
-   while (1) {
-
-      /* First, do a return-immediately select(). */
-
-      /* These could be trashed each time round the loop, so restore
-         them each time.  The syscall works on the copies so that the
-         caller's sets stay intact until there is a result. */
-      if (rfds) rfds_copy = *rfds;
-      if (wfds) wfds_copy = *wfds;
-      if (xfds) xfds_copy = *xfds;
-
-      zero_timeout.tv_sec = zero_timeout.tv_usec = 0;
-
-      res = do_syscall_select( n, 
-                               rfds ? (vki_fd_set*)(&rfds_copy) : NULL,
-                               wfds ? (vki_fd_set*)(&wfds_copy) : NULL,
-                               xfds ? (vki_fd_set*)(&xfds_copy) : NULL,
-                               & zero_timeout );
-      if (is_kerror(res)) {
-         /* Some kind of error (including EINTR).  Set errno and
-            return.  The sets are unspecified in this case. */
-         * (__errno_location()) = -res;
-         return -1;
-      }
-      if (res > 0) {
-         /* one or more fds is ready.  Copy out resulting sets and
-            return. */
-         if (rfds) *rfds = rfds_copy;
-         if (wfds) *wfds = wfds_copy;
-         if (xfds) *xfds = xfds_copy;
-         return res;
-      }
-
-      /* Nothing interesting happened, so we go to sleep for a
-         while. */
-
-      /* fprintf(stderr, "MY_SELECT: nanosleep\n"); */
-      /* nanosleep and go round again */
-      nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 50 * 1000 * 1000; /* 50 milliseconds */
-      /* It's critical here that valgrind's nanosleep implementation
-         is nonblocking. */
-      res = my_do_syscall2(__NR_nanosleep, 
-                           (int)(&nanosleep_interval), (int)NULL);
-      if (res == -VKI_EINTR) {
-         /* The nanosleep was interrupted by a signal.  So we do the
-            same. */
-         * (__errno_location()) = EINTR;
-         return -1;
-      }
-
-      /* Sleeping finished.  If a finite timeout, check to see if it
-         has expired yet. */
-      if (timeout) {
-         VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                                 VG_USERREQ__READ_MILLISECOND_TIMER,
-                                 0, 0, 0, 0);
-         my_assert(ms_now != 0xFFFFFFFF);
-         if (ms_now >= ms_end) {
-            /* timeout; nothing interesting happened. */
-            if (rfds) FD_ZERO(rfds);
-            if (wfds) FD_ZERO(wfds);
-            if (xfds) FD_ZERO(xfds);
-            return 0;
-         }
-      }
-
-   }
-}
-
-
-
-
-#include <sys/poll.h>
-
-#ifndef HAVE_NFDS_T
-typedef unsigned long int nfds_t;
-#endif
-
-
-/* Replacement for poll() built the same way as the select() wrapper:
-   the kernel is only ever asked to do a return-immediately poll, and
-   between attempts this thread sleeps for 51 milliseconds in
-   nanosleep.
-
-   Returns the positive result of the underlying poll syscall, 0 if a
-   finite __timeout (in milliseconds) runs out first (every revents
-   field is then zeroed), or -1 with errno set if the poll syscall
-   reports an error.  A negative __timeout means wait indefinitely.
-   Unlike the select() wrapper, the result of the nanosleep between
-   attempts is discarded, so an interrupted sleep does not end the
-   call. */
-/* __attribute__((weak)) */
-int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout)
-{
-   unsigned int        ms_now, ms_end;
-   int                 res, i;
-   struct vki_timespec nanosleep_interval;
-
-   __my_pthread_testcancel();
-   ensure_valgrind("poll");
-
-   /* Detect the current time and simultaneously find out if we are
-      running on Valgrind.  ms_now keeps the default 0xFFFFFFFF when
-      the request is not serviced. */
-   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__READ_MILLISECOND_TIMER,
-                           0, 0, 0, 0);
-
-   /* NOTE(review): only the size of struct timeval is compared here;
-      nothing about struct pollfd is actually checked yet. */
-   if (/* CHECK SIZES FOR struct pollfd */
-       sizeof(struct timeval) != sizeof(struct vki_timeval))
-      barf("valgrind's hacky non-blocking poll(): data sizes error");
-
-   /* dummy initialisation to keep gcc -Wall happy */
-   ms_end = 0;
-
-   /* If a zero timeout specified, this call is harmless.  Also do
-      this if not running on Valgrind. */
-   if (__timeout == 0 || ms_now == 0xFFFFFFFF) {
-      res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, __timeout);
-      if (is_kerror(res)) {
-         * (__errno_location()) = -res;
-         return -1;
-      } else {
-         return res;
-      }
-   }
-
-   /* If a timeout was specified, set ms_end to be the end wallclock
-      time.  Easy considering that __timeout is in milliseconds. */
-   if (__timeout > 0) {
-      ms_end = ms_now + (unsigned int)__timeout;
-   }
-
-   /* fprintf(stderr, "MY_POLL: before loop\n"); */
-
-   /* Either timeout < 0, meaning wait indefinitely, or timeout > 0,
-      in which case ms_end holds the end time. */
-
-   my_assert(__timeout != 0);
-
-   while (1) {
-
-      /* Do a return-immediately poll. */
-
-      res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, 0 );
-      if (is_kerror(res)) {
-         /* Some kind of error.  Set errno and return.  */
-         * (__errno_location()) = -res;
-         return -1;
-      }
-      if (res > 0) {
-         /* One or more fds is ready.  Return now. */
-         return res;
-      }
-
-      /* Nothing interesting happened, so we go to sleep for a
-         while. */
-
-      /* fprintf(stderr, "MY_POLL: nanosleep\n"); */
-      /* nanosleep and go round again */
-      nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 51 * 1000 * 1000; /* 51 milliseconds */
-      /* It's critical here that valgrind's nanosleep implementation
-         is nonblocking. */
-      (void)my_do_syscall2(__NR_nanosleep, 
-                           (int)(&nanosleep_interval), (int)NULL);
-
-      /* Sleeping finished.  If a finite timeout, check to see if it
-         has expired yet. */
-      if (__timeout > 0) {
-         VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                                 VG_USERREQ__READ_MILLISECOND_TIMER,
-                                 0, 0, 0, 0);
-         my_assert(ms_now != 0xFFFFFFFF);
-         if (ms_now >= ms_end) {
-            /* timeout; nothing interesting happened. */
-            /* NOTE(review): i is a signed int compared against the
-               unsigned nfds_t count. */
-            for (i = 0; i < __nfds; i++) 
-               __fds[i].revents = 0;
-            return 0;
-         }
-      }
-
-   }
-}
-
-
-/* Helper function used to make accept() non-blocking.  Idea is to use
-   the above nonblocking poll() to make this thread ONLY wait for the
-   specified fd to become ready, and then return. */
-
-/* Sigh -- a hack.  We're not supposed to include this file directly;
-   should do it via /usr/include/fcntl.h, but that introduces a
-   varargs prototype for fcntl itself, which we can't mimic. */
-#define _FCNTL_H
-#include <bits/fcntl.h>
-
-/* Park the calling thread in this file's poll(), with an infinite
-   timeout, on the single descriptor fd.  The events asked for are
-   POLLIN, POLLPRI, POLLERR, POLLHUP and POLLNVAL -- deliberately not
-   POLLOUT.  No waiting is done at all when __libc_fcntl(F_GETFL)
-   fails for fd or reports that O_NONBLOCK is set.  The outcome of
-   the poll is ignored. */
-static void wait_for_fd_to_be_readable_or_erring ( int fd )
-{
-   struct pollfd waiter;
-   int           flags;
-
-   /* fprintf(stderr, "wait_for_fd_to_be_readable_or_erring %d\n", fd); */
-
-   /* An invalid fd (-1 from fcntl) or a nonblocking one means there
-      is nothing to wait for. */
-   flags = __libc_fcntl(fd, F_GETFL, 0);
-   if (flags == -1 || (flags & O_NONBLOCK))
-      return;
-
-   /* Otherwise wait with poll, forever if need be. */
-   waiter.revents = 0;
-   waiter.events  = POLLIN | POLLPRI | POLLERR | POLLHUP | POLLNVAL;
-   waiter.fd      = fd;
-   (void)poll(&waiter, 1, -1);
-}
-
-
-/* ---------------------------------------------------------------------
-   Hacky implementation of semaphores.
-   ------------------------------------------------------------------ */
-
-#include <semaphore.h>
-
-/* This is a terrible way to do the remapping.  Plan is to import an
-   AVL tree at some point. */
-
-/* Shadow state kept for each client sem_t that has been seen. */
-typedef
-   struct {
-      pthread_mutex_t se_mx; /* held around accesses to count in
-                                sem_wait/sem_post/sem_trywait */
-      pthread_cond_t se_cv;  /* sem_wait blocks on this while count is
-                                zero; sem_post broadcasts on it */
-      int count;             /* current value of the semaphore */
-   }
-   vg_sem_t;
-
-/* Guards se_remap_used and se_remap_orig[] inside se_remap(). */
-static pthread_mutex_t se_remap_mx = PTHREAD_MUTEX_INITIALIZER;
-
-/* The remap table: se_remap_orig[i] is the address of a client sem_t
-   and se_remap_new[i] is its shadow.  Only the first se_remap_used
-   entries are in use. */
-static int      se_remap_used = 0;
-static sem_t*   se_remap_orig[VG_N_SEMAPHORES];
-static vg_sem_t se_remap_new[VG_N_SEMAPHORES];
-
-/* Translate the address of a client sem_t into the address of its
-   shadow vg_sem_t.  se_remap_orig[] is searched linearly for orig; if
-   it is not there, the next unused slot is claimed for it.  The
-   search and the claim are done with se_remap_mx held.  barf() is
-   called if all VG_N_SEMAPHORES slots are already taken.  The shadow
-   itself is not initialised here; sem_init does that. */
-static vg_sem_t* se_remap ( sem_t* orig )
-{
-   int res, i;
-   res = __pthread_mutex_lock(&se_remap_mx);
-   my_assert(res == 0);
-
-   for (i = 0; i < se_remap_used; i++) {
-      if (se_remap_orig[i] == orig)
-         break;
-   }
-   if (i == se_remap_used) {
-      /* First sighting of this semaphore: claim slot i. */
-      if (se_remap_used == VG_N_SEMAPHORES) {
-         /* Release the table lock before giving up.  Use the
-            __pthread_ entry point, as the rest of this function and
-            rw_remap() do, rather than the public alias. */
-         res = __pthread_mutex_unlock(&se_remap_mx);
-         my_assert(res == 0);
-         barf("VG_N_SEMAPHORES is too low.  Increase and recompile.");
-      }
-      se_remap_used++;
-      se_remap_orig[i] = orig;
-      /* printf("allocated semaphore %d\n", i); */
-   }
-   res = __pthread_mutex_unlock(&se_remap_mx);
-   my_assert(res == 0);
-   return &se_remap_new[i];
-}
-
-
-/* Set up the shadow of *sem with an initial count of value.  Only
-   pshared == 0 is supported: any other value is reported through
-   pthread_error and fails with -1 / errno = ENOSYS.  Returns 0 on
-   success. */
-int sem_init(sem_t *sem, int pshared, unsigned int value)
-{
-   int       rc;
-   vg_sem_t* shadow;
-
-   ensure_valgrind("sem_init");
-
-   if (pshared == 0) {
-      shadow = se_remap(sem);
-
-      rc = pthread_mutex_init(&shadow->se_mx, NULL);
-      my_assert(rc == 0);
-
-      rc = pthread_cond_init(&shadow->se_cv, NULL);
-      my_assert(rc == 0);
-
-      shadow->count = value;
-      return 0;
-   }
-
-   pthread_error("sem_init: unsupported pshared value");
-   errno = ENOSYS;
-   return -1;
-}
-
-
-/* Take one unit from the semaphore.  While the shadow's count is
-   zero the caller waits on the shadow's condition variable; once the
-   count is non-zero it is decremented.  Always returns 0. */
-int sem_wait ( sem_t* sem )
-{
-   int       rc;
-   vg_sem_t* shadow;
-
-   ensure_valgrind("sem_wait");
-   shadow = se_remap(sem);
-
-   rc = __pthread_mutex_lock(&shadow->se_mx);
-   my_assert(rc == 0);
-
-   for (;;) {
-      if (shadow->count != 0)
-         break;
-      rc = pthread_cond_wait(&shadow->se_cv, &shadow->se_mx);
-      my_assert(rc == 0);
-   }
-   shadow->count -= 1;
-
-   rc = __pthread_mutex_unlock(&shadow->se_mx);
-   my_assert(rc == 0);
-   return 0;
-}
-
-/* Give one unit back to the semaphore.  The shadow's count is
-   incremented; if it was zero beforehand, every thread waiting on the
-   shadow's condition variable is woken with a broadcast.  Always
-   returns 0. */
-int sem_post ( sem_t* sem )
-{
-   int       rc;
-   int       was_empty;
-   vg_sem_t* shadow;
-
-   ensure_valgrind("sem_post");
-   shadow = se_remap(sem);
-
-   rc = __pthread_mutex_lock(&shadow->se_mx);
-   my_assert(rc == 0);
-
-   was_empty = (shadow->count == 0);
-   shadow->count += 1;
-   if (was_empty) {
-      /* Only a 0 -> 1 transition can have sleepers to wake. */
-      rc = pthread_cond_broadcast(&shadow->se_cv);
-      my_assert(rc == 0);
-   }
-
-   rc = __pthread_mutex_unlock(&shadow->se_mx);
-   my_assert(rc == 0);
-   return 0;
-}
-
-
-/* Non-blocking variant of sem_wait.  If the shadow's count is
-   positive it is decremented and 0 is returned; otherwise nothing is
-   changed and the call fails with -1 / errno = EAGAIN. */
-int sem_trywait ( sem_t* sem )
-{
-   int       outcome = 0;
-   int       rc;
-   vg_sem_t* shadow;
-
-   ensure_valgrind("sem_trywait");
-   shadow = se_remap(sem);
-
-   rc = __pthread_mutex_lock(&shadow->se_mx);
-   my_assert(rc == 0);
-
-   if (shadow->count <= 0) {
-      outcome = -1;
-      errno = EAGAIN;
-   } else {
-      shadow->count -= 1;
-   }
-
-   rc = __pthread_mutex_unlock(&shadow->se_mx);
-   my_assert(rc == 0);
-   return outcome;
-}
-
-
-/* Store the shadow semaphore's current count in *sval.  Always
-   returns 0.
-   NOTE(review): count is read here without taking se_mx, unlike the
-   other sem_* functions -- confirm that is intended. */
-int sem_getvalue(sem_t* sem, int * sval)
-{
-   vg_sem_t* vg_sem; 
-   /* Identify ourselves by our own name (this used to say
-      "sem_trywait", a copy-and-paste slip). */
-   ensure_valgrind("sem_getvalue");
-   vg_sem = se_remap(sem);
-   *sval = vg_sem->count;
-   return 0;
-}
-
-
-/* Placeholder: nothing is torn down.  The call is merely reported
-   through kludged() and 0 is returned.  A full version would fail
-   with -1 / errno = EBUSY while threads are still waiting on the
-   semaphore. */
-int sem_destroy(sem_t * sem)
-{
-   (void)sem;   /* not looked at */
-   kludged("sem_destroy");
-   return 0;
-}
-
-
-/* ---------------------------------------------------------------------
-   Reader-writer locks.
-   ------------------------------------------------------------------ */
-
-/* Shadow state kept for each client pthread_rwlock_t.  All fields
-   other than mx itself are read and written with mx held by the
-   pthread_rwlock_* functions. */
-typedef 
-   struct {
-      int             initted;  /* != 0 --> in use; sanity check only */
-      int             prefer_w; /* != 0 --> prefer writer */
-      int             nwait_r;  /* # of waiting readers */
-      int             nwait_w;  /* # of waiting writers */
-      pthread_cond_t  cv_r;     /* for signalling readers */
-      pthread_cond_t  cv_w;     /* for signalling writers */
-      pthread_mutex_t mx;       /* protects the other fields */
-      int             status;
-      /* allowed range for status: >= -1.  -1 means 1 writer currently
-         active, >= 0 means N readers currently active. */
-   } 
-   vg_rwlock_t;
-
-
-/* Guards rw_remap_used, rw_remap_orig[] and the claiming of a new
-   slot inside rw_remap(). */
-static pthread_mutex_t rw_remap_mx = PTHREAD_MUTEX_INITIALIZER;
-
-/* The remap table: rw_remap_orig[i] is the address of a client
-   rwlock and rw_remap_new[i] is its shadow.  Only the first
-   rw_remap_used entries are in use. */
-static int                 rw_remap_used = 0;
-static pthread_rwlock_t*   rw_remap_orig[VG_N_RWLOCKS];
-static vg_rwlock_t         rw_remap_new[VG_N_RWLOCKS];
-
-
-/* Put a shadow rwlock into its freshly-initialised state: marked as
-   initted, writer-preferring, nobody waiting, nobody active (status
-   0), with its mutex and both condition variables initialised using
-   default attributes.  Asserts that all three initialisations
-   succeeded. */
-static
-void init_vg_rwlock ( vg_rwlock_t* vg_rwl )
-{
-   int rc_mx, rc_rd, rc_wr;
-
-   vg_rwl->status   = 0;
-   vg_rwl->nwait_w  = 0;
-   vg_rwl->nwait_r  = 0;
-   vg_rwl->prefer_w = 1;
-   vg_rwl->initted  = 1;
-
-   rc_mx = pthread_mutex_init(&vg_rwl->mx, NULL);
-   rc_rd = pthread_cond_init(&vg_rwl->cv_r, NULL);
-   rc_wr = pthread_cond_init(&vg_rwl->cv_w, NULL);
-   my_assert((rc_mx | rc_rd | rc_wr) == 0);
-}
-
-
-/* Given the address of a LinuxThreads rwlock, hand back the address
-   of the shadow lock used in its place, claiming a table slot for it
-   on first sight (barf() if all VG_N_RWLOCKS slots are gone).
-
-   PTHREAD_RWLOCK_INITIALIZER leaves pthread_rwlock_t.__rw_readers at
-   zero, so that field doubles as a "shadow not yet set up" flag: when
-   it is zero it is set to 1 and the shadow is initialised, with
-   writer preference switched off if the original asks for
-   PTHREAD_RWLOCK_PREFER_READER_NP.  Note that this last step happens
-   after rw_remap_mx has been released. */
-static vg_rwlock_t* rw_remap ( pthread_rwlock_t* orig )
-{
-   int          rc, slot;
-   vg_rwlock_t* shadow;
-
-   rc = __pthread_mutex_lock(&rw_remap_mx);
-   my_assert(rc == 0);
-
-   /* Look for an existing entry. */
-   slot = 0;
-   while (slot < rw_remap_used && rw_remap_orig[slot] != orig)
-      slot++;
-
-   if (slot == rw_remap_used) {
-      /* None found, so take the next free slot. */
-      if (rw_remap_used == VG_N_RWLOCKS) {
-         rc = __pthread_mutex_unlock(&rw_remap_mx);
-         my_assert(rc == 0);
-         barf("VG_N_RWLOCKS is too low.  Increase and recompile.");
-      }
-      rw_remap_orig[slot]        = orig;
-      rw_remap_new[slot].initted = 0;
-      rw_remap_used++;
-      if (0) printf("allocated rwlock %d\n", slot);
-   }
-
-   rc = __pthread_mutex_unlock(&rw_remap_mx);
-   my_assert(rc == 0);
-
-   shadow = &rw_remap_new[slot];
-
-   /* Set up the shadow the first time round. */
-   if (orig->__rw_readers == 0) {
-      orig->__rw_readers = 1;
-      init_vg_rwlock(shadow);
-      if (orig->__rw_kind == PTHREAD_RWLOCK_PREFER_READER_NP)
-         shadow->prefer_w = 0;
-   }
-
-   return shadow;
-}
-
-
-/* Initialise the rwlock at orig.  The lock kind is taken from
-   attr->__lockkind when attr is non-NULL, and is
-   PTHREAD_RWLOCK_DEFAULT_NP otherwise.  The shadow lock is
-   (re)initialised as a side effect of calling rw_remap with
-   __rw_readers zeroed.  Always returns 0. */
-int pthread_rwlock_init ( pthread_rwlock_t* orig,
-                          const pthread_rwlockattr_t* attr )
-{
-   if (0) printf ("pthread_rwlock_init\n");
-   /* Force the remapper to initialise the shadow. */
-   orig->__rw_readers = 0;
-   /* Install the lock preference; the remapper needs to know it. */
-   orig->__rw_kind = PTHREAD_RWLOCK_DEFAULT_NP;
-   if (attr)
-      orig->__rw_kind = attr->__lockkind;
-   /* Called purely for its side effect; the shadow's address is not
-      needed here. */
-   (void)rw_remap ( orig );
-   return 0;
-}
-
-
-/* Cleanup handler installed by pthread_rwlock_rdlock around its
-   wait.  rwl_v is the shadow lock: the handler takes the caller back
-   out of the waiting-reader count and releases the shadow's mutex. */
-static
-void pthread_rwlock_rdlock_CANCEL_HDLR ( void* rwl_v )
-{
-   vg_rwlock_t* shadow = (vg_rwlock_t*)rwl_v;
-
-   shadow->nwait_r -= 1;
-   pthread_mutex_unlock (&shadow->mx);
-}
-
-
-/* Acquire the rwlock at orig for reading, waiting for as long as a
-   writer is active.  Returns 0 on success, or EINVAL if the shadow
-   lock is not marked as initialised. */
-int pthread_rwlock_rdlock ( pthread_rwlock_t* orig )
-{
-   int res;
-   vg_rwlock_t* rwl;
-   if (0) printf ("pthread_rwlock_rdlock\n");
-   rwl = rw_remap ( orig );
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-   if (!rwl->initted) {
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EINVAL;
-   }
-   if (rwl->status < 0) {
-      /* A writer is active (status == -1), so we must wait. */
-      my_assert(rwl->status == -1);
-      rwl->nwait_r++;
-      /* If we are cancelled while waiting, the handler undoes the
-         nwait_r increment and drops rwl->mx. */
-      pthread_cleanup_push( pthread_rwlock_rdlock_CANCEL_HDLR, rwl );
-      /* Wait only for as long as a writer holds the lock.  Insisting
-         on status == 0 here would keep this reader blocked whenever
-         other readers had got in first, although a read lock can be
-         shared with them. */
-      while (rwl->status < 0) {
-         res = pthread_cond_wait(&rwl->cv_r, &rwl->mx);
-         my_assert(res == 0);
-      }
-      pthread_cleanup_pop(0);
-      rwl->nwait_r--;
-   }
-   /* No writer active: become one more active reader. */
-   my_assert(rwl->status >= 0);
-   rwl->status++;
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return 0;
-}
-
-
-/* Try to acquire the rwlock at orig for reading without waiting.
-   Returns 0 on success, EBUSY if a writer is active, or EINVAL if
-   the shadow lock is not marked as initialised. */
-int pthread_rwlock_tryrdlock ( pthread_rwlock_t* orig )
-{
-   int          rc;
-   int          outcome;
-   vg_rwlock_t* shadow;
-
-   if (0) printf ("pthread_rwlock_tryrdlock\n");
-
-   shadow = rw_remap ( orig );
-   rc = __pthread_mutex_lock(&shadow->mx);
-   my_assert(rc == 0);
-
-   if (!shadow->initted) {
-      outcome = EINVAL;
-   } else if (shadow->status == -1) {
-      /* A writer holds the lock, so back off. */
-      outcome = EBUSY;
-   } else {
-      /* Join the active readers. */
-      my_assert(shadow->status >= 0);
-      shadow->status++;
-      outcome = 0;
-   }
-
-   rc = __pthread_mutex_unlock(&shadow->mx);
-   my_assert(rc == 0);
-   return outcome;
-}
-
-
-/* Cleanup handler installed by pthread_rwlock_wrlock around its
-   wait.  rwl_v is the shadow lock: the handler takes the caller back
-   out of the waiting-writer count and releases the shadow's mutex. */
-static
-void pthread_rwlock_wrlock_CANCEL_HDLR ( void* rwl_v )
-{
-   vg_rwlock_t* shadow = (vg_rwlock_t*)rwl_v;
-
-   shadow->nwait_w -= 1;
-   pthread_mutex_unlock (&shadow->mx);
-}
-
-
-/* Acquire the rwlock at orig for writing, waiting until there is
-   neither an active writer nor any active reader.  Returns 0 on
-   success, or EINVAL if the shadow lock is not marked as
-   initialised. */
-int pthread_rwlock_wrlock ( pthread_rwlock_t* orig )
-{
-   int          rc;
-   vg_rwlock_t* shadow;
-
-   if (0) printf ("pthread_rwlock_wrlock\n");
-
-   shadow = rw_remap ( orig );
-   rc = __pthread_mutex_lock(&shadow->mx);
-   my_assert(rc == 0);
-
-   if (!shadow->initted) {
-      rc = __pthread_mutex_unlock(&shadow->mx);
-      my_assert(rc == 0);
-      return EINVAL;
-   }
-
-   if (shadow->status != 0) {
-      /* Somebody is in there; queue up as a waiting writer.  The
-         cleanup handler backs us out if we get cancelled during the
-         wait. */
-      shadow->nwait_w++;
-      pthread_cleanup_push( pthread_rwlock_wrlock_CANCEL_HDLR, shadow );
-      while (shadow->status != 0) {
-         rc = pthread_cond_wait(&shadow->cv_w, &shadow->mx);
-         my_assert(rc == 0);
-      }
-      pthread_cleanup_pop(0);
-      shadow->nwait_w--;
-   }
-
-   /* The lock is idle: mark it as held by one writer. */
-   my_assert(shadow->status == 0);
-   shadow->status = -1;
-
-   rc = __pthread_mutex_unlock(&shadow->mx);
-   my_assert(rc == 0);
-   return 0;
-}
-
-
-/* Try to acquire the rwlock at orig for writing without waiting.
-   Returns 0 on success, EBUSY if any reader or a writer is active,
-   or EINVAL if the shadow lock is not marked as initialised. */
-int pthread_rwlock_trywrlock ( pthread_rwlock_t* orig )
-{
-   int res;
-   vg_rwlock_t* rwl;
-   /* Trace label corrected: it used to read
-      "pthread_wrlock_trywrlock". */
-   if (0) printf ("pthread_rwlock_trywrlock\n");
-   rwl = rw_remap ( orig );
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-   if (!rwl->initted) {
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EINVAL;
-   }
-   if (rwl->status != 0) {
-      /* Reader(s) or a writer active; we have to give up. */
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EBUSY;
-   }
-   /* Success: the lock is idle, so mark it as held by one writer. */
-   my_assert(rwl->status == 0);
-   rwl->status = -1;
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return 0;
-}
-
-
-/* Release the rwlock at orig, whether it is held for reading or for
-   writing, and then wake at most one waiter according to the lock's
-   preference.  Returns 0 on success, EINVAL if the shadow lock is
-   not marked as initialised, or EPERM if nobody holds the lock
-   (status == 0). */
-int pthread_rwlock_unlock ( pthread_rwlock_t* orig )
-{
-   int res;
-   vg_rwlock_t* rwl;
-   if (0) printf ("pthread_rwlock_unlock\n");
-   /* One lookup is enough; this call used to be made twice in a
-      row. */
-   rwl = rw_remap ( orig );
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-   if (!rwl->initted) {
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EINVAL;
-   }
-   if (rwl->status == 0) {
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EPERM;
-   }
-   my_assert(rwl->status != 0);
-   if (rwl->status == -1) {
-     /* The writer is leaving. */
-     rwl->status = 0;
-   } else {
-     /* One reader is leaving. */
-     my_assert(rwl->status > 0);
-     rwl->status--;
-   }
-
-   my_assert(rwl->status >= 0);
-
-   if (rwl->prefer_w) {
-
-      /* Favour waiting writers, if any. */
-      if (rwl->nwait_w > 0) {
-         /* Writer(s) are waiting. */
-         if (rwl->status == 0) {
-            /* We can let a writer in. */
-            res = pthread_cond_signal(&rwl->cv_w);
-            my_assert(res == 0);
-         } else {
-            /* There are still readers active.  Do nothing; eventually
-               they will disappear, at which point a writer will be
-               admitted. */
-         }
-      } 
-      else
-      /* No waiting writers. */
-      if (rwl->nwait_r > 0) {
-         /* Let in a waiting reader. */
-         res = pthread_cond_signal(&rwl->cv_r);
-         my_assert(res == 0);
-      }
-
-   } else {
-
-      /* Favour waiting readers, if any. */
-      if (rwl->nwait_r > 0) {
-         /* Reader(s) are waiting; let one in. */
-         res = pthread_cond_signal(&rwl->cv_r);
-         my_assert(res == 0);
-      } 
-      else
-      /* No waiting readers. */
-      if (rwl->nwait_w > 0 && rwl->status == 0) {
-         /* We have waiting writers and no active readers; let a
-            writer in. */
-         res = pthread_cond_signal(&rwl->cv_w);
-         my_assert(res == 0);
-      }
-   }
-
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return 0;   
-}
-
-
-/* Mark the shadow of the rwlock at orig as no longer initialised.
-   Returns 0 on success, EBUSY if the lock is held or has any
-   waiting readers or writers, or EINVAL if the shadow lock is not
-   marked as initialised in the first place. */
-int pthread_rwlock_destroy ( pthread_rwlock_t *orig )
-{
-   int          rc;
-   int          outcome;
-   vg_rwlock_t* shadow;
-
-   if (0) printf ("pthread_rwlock_destroy\n");
-
-   shadow = rw_remap ( orig );
-   rc = __pthread_mutex_lock(&shadow->mx);
-   my_assert(rc == 0);
-
-   if (!shadow->initted) {
-      outcome = EINVAL;
-   } else if (shadow->status != 0
-              || shadow->nwait_r > 0
-              || shadow->nwait_w > 0) {
-      /* Still held, or somebody is queued up for it. */
-      outcome = EBUSY;
-   } else {
-      shadow->initted = 0;
-      outcome = 0;
-   }
-
-   rc = __pthread_mutex_unlock(&shadow->mx);
-   my_assert(rc == 0);
-   return outcome;
-}
-
-
-/* Taken over from LinuxThreads.  Fills in *attr with the defaults:
-   lock kind 0 and process-private sharing.  Always returns 0. */
-int
-pthread_rwlockattr_init (pthread_rwlockattr_t *attr)
-{
-  attr->__pshared  = PTHREAD_PROCESS_PRIVATE;
-  attr->__lockkind = 0;
-  return 0;
-}
-
-/* Taken over from LinuxThreads.  Records the process-shared setting
-   in *attr.  Only PTHREAD_PROCESS_PRIVATE is accepted (returns 0);
-   PTHREAD_PROCESS_SHARED is recognised but not supported (returns
-   ENOSYS, *attr untouched); anything else is EINVAL. */
-int
-pthread_rwlockattr_setpshared (pthread_rwlockattr_t *attr, int pshared)
-{
-  if (pshared == PTHREAD_PROCESS_PRIVATE)
-    {
-      attr->__pshared = pshared;
-      return 0;
-    }
-
-  /* Sharing between processes is not possible for now.  */
-  if (pshared == PTHREAD_PROCESS_SHARED)
-    return ENOSYS;
-
-  return EINVAL;
-}
-
-
-/* ---------------------------------------------------------------------
-   B'stard.
-   ------------------------------------------------------------------ */
-
-/* strong_alias(name, aliasname): declare aliasname, with the same
-   type as name, as an alias for name (GCC "alias" attribute).  Note
-   that the expansion already ends in a semicolon. */
-# define strong_alias(name, aliasname) \
-  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
-
-/* weak_alias(name, aliasname): as strong_alias, except that the
-   alias is additionally marked weak. */
-# define weak_alias(name, aliasname) \
-  extern __typeof (name) aliasname __attribute__ ((weak, alias (#name)));
-
-/* Export the __pthread_* implementations under their public
-   pthread_* names. */
-strong_alias(__pthread_mutex_lock, pthread_mutex_lock)
-strong_alias(__pthread_mutex_trylock, pthread_mutex_trylock)
-strong_alias(__pthread_mutex_unlock, pthread_mutex_unlock)
-strong_alias(__pthread_mutexattr_init, pthread_mutexattr_init)
-  weak_alias(__pthread_mutexattr_settype, pthread_mutexattr_settype)
-strong_alias(__pthread_mutex_init, pthread_mutex_init)
-strong_alias(__pthread_mutexattr_destroy, pthread_mutexattr_destroy)
-strong_alias(__pthread_mutex_destroy, pthread_mutex_destroy)
-strong_alias(__pthread_once, pthread_once)
-strong_alias(__pthread_atfork, pthread_atfork)
-strong_alias(__pthread_key_create, pthread_key_create)
-strong_alias(__pthread_getspecific, pthread_getspecific)
-strong_alias(__pthread_setspecific, pthread_setspecific)
-
-/* __sigaction is only aliased when GLIBC_2_1 is not defined. */
-#ifndef GLIBC_2_1
-strong_alias(sigaction, __sigaction)
-#endif
-     
-/* The other direction: give this file's I/O wrappers their
-   double-underscore names as well. */
-strong_alias(close, __close)
-strong_alias(fcntl, __fcntl)
-strong_alias(lseek, __lseek)
-strong_alias(open, __open)
-strong_alias(open64, __open64)
-strong_alias(read, __read)
-strong_alias(wait, __wait)
-strong_alias(write, __write)
-strong_alias(connect, __connect)
-strong_alias(send, __send)
-
-weak_alias (__pread64, pread64)
-weak_alias (__pwrite64, pwrite64)
-weak_alias(__fork, fork)
-
-weak_alias (__pthread_kill_other_threads_np, pthread_kill_other_threads_np)
-
-/*--------------------------------------------------*/
-
-/* Weak __pthread_rwlock_* names for the rwlock functions defined
-   above. */
-weak_alias(pthread_rwlock_rdlock, __pthread_rwlock_rdlock)
-weak_alias(pthread_rwlock_unlock, __pthread_rwlock_unlock)
-weak_alias(pthread_rwlock_wrlock, __pthread_rwlock_wrlock)
-
-weak_alias(pthread_rwlock_destroy, __pthread_rwlock_destroy)
-weak_alias(pthread_rwlock_init, __pthread_rwlock_init)
-weak_alias(pthread_rwlock_tryrdlock, __pthread_rwlock_tryrdlock)
-weak_alias(pthread_rwlock_trywrlock, __pthread_rwlock_trywrlock)
-
-
-/* Lock the mutex that the stdio stream carries in file->_lock, using
-   this library's pthread_mutex_lock; the result of the lock call is
-   discarded.  Also exported, weakly, as flockfile.  These get called
-   quite a lot.  Any macro definition of the name is removed first so
-   that a real function can be defined. */
-
-#undef _IO_flockfile
-void _IO_flockfile ( _IO_FILE * file )
-{
-   pthread_mutex_lock(file->_lock);
-}
-weak_alias(_IO_flockfile, flockfile);
-
-
-/* Unlock the mutex that the stdio stream carries in file->_lock,
-   using this library's pthread_mutex_unlock; the result of the
-   unlock call is discarded.  Also exported, weakly, as funlockfile.
-   Any macro definition of the name is removed first so that a real
-   function can be defined. */
-#undef _IO_funlockfile
-void _IO_funlockfile ( _IO_FILE * file )
-{
-   pthread_mutex_unlock(file->_lock);
-}
-weak_alias(_IO_funlockfile, funlockfile);
-
-
-/* This doesn't seem to be needed to simulate libpthread.so's external
-   interface, but many people complain about its absence.
-   __pthread_mutexattr_setkind_np is simply another name for
-   __pthread_mutexattr_settype, and pthread_mutexattr_setkind_np is a
-   weak alias for that in turn. */
-
-strong_alias(__pthread_mutexattr_settype, __pthread_mutexattr_setkind_np)
-weak_alias(__pthread_mutexattr_setkind_np, pthread_mutexattr_setkind_np)
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                          vg_libpthread.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/arch/x86-linux/vg_libpthread_unimp.c b/coregrind/arch/x86-linux/vg_libpthread_unimp.c
deleted file mode 100644
index f413887f27..0000000000
--- a/coregrind/arch/x86-linux/vg_libpthread_unimp.c
+++ /dev/null
@@ -1,262 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Give dummy bindings for everything the real libpthread.so    ---*/
-/*--- binds.                                 vg_libpthread_unimp.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-/* ---------------------------------------------------------------------
-   ALL THIS CODE RUNS ON THE SIMULATED CPU.
-   Give a binding for everything the real libpthread.so binds.
-   ------------------------------------------------------------------ */
-
-extern void vgPlain_unimp ( char* );
-#define unimp(str) vgPlain_unimp(str)
-
-//void _IO_flockfile ( void )  { unimp("_IO_flockfile"); }
-void _IO_ftrylockfile ( void )  { unimp("_IO_ftrylockfile"); }
-//void _IO_funlockfile ( void )  { unimp("_IO_funlockfile"); }
-//void __close ( void )  { unimp("__close"); }
-//void __connect ( void )  { unimp("__connect"); }
-//void __errno_location ( void )  { unimp("__errno_location"); }
-//void __fcntl ( void )  { unimp("__fcntl"); }
-//void __fork ( void )  { unimp("__fork"); }
-//void __h_errno_location ( void )  { unimp("__h_errno_location"); }
-void __libc_allocate_rtsig ( void )  { unimp("__libc_allocate_rtsig"); }
-void __libc_current_sigrtmax ( void )  { unimp("__libc_current_sigrtmax"); }
-void __libc_current_sigrtmin ( void )  { unimp("__libc_current_sigrtmin"); }
-//void __lseek ( void )  { unimp("__lseek"); }
-//void __open ( void )  { unimp("__open"); }
-//void __open64 ( void )  { unimp("__open64"); }
-//void __pread64 ( void )  { unimp("__pread64"); }
-//void __pthread_atfork ( void )  { unimp("__pthread_atfork"); }
-//void __pthread_getspecific ( void )  { unimp("__pthread_getspecific"); }
-//void __pthread_key_create ( void )  { unimp("__pthread_key_create"); }
-//void __pthread_kill_other_threads_np ( void )  { unimp("__pthread_kill_other_threads_np"); }
-//void __pthread_mutex_destroy ( void )  { unimp("__pthread_mutex_destroy"); }
-//void __pthread_mutex_init ( void )  { unimp("__pthread_mutex_init"); }
-//void __pthread_mutex_lock ( void )  { unimp("__pthread_mutex_lock"); }
-//void __pthread_mutex_trylock ( void )  { unimp("__pthread_mutex_trylock"); }
-//void __pthread_mutex_unlock ( void )  { unimp("__pthread_mutex_unlock"); }
-//void __pthread_mutexattr_destroy ( void )  { unimp("__pthread_mutexattr_destroy"); }
-//void __pthread_mutexattr_init ( void )  { unimp("__pthread_mutexattr_init"); }
-//void __pthread_mutexattr_settype ( void )  { unimp("__pthread_mutexattr_settype"); }
-//void __pthread_once ( void )  { unimp("__pthread_once"); }
-//void __pthread_setspecific ( void )  { unimp("__pthread_setspecific"); }
-//void __pwrite64 ( void )  { unimp("__pwrite64"); }
-//void __read ( void )  { unimp("__read"); }
-//void __res_state ( void )  { unimp("__res_state"); }
-//void __send ( void )  { unimp("__send"); }
-//void __sigaction ( void )  { unimp("__sigaction"); }
-//--//void __vfork ( void )  { unimp("__vfork"); }
-//void __wait ( void )  { unimp("__wait"); }
-//void __write ( void )  { unimp("__write"); }
-//void _pthread_cleanup_pop ( void )  { unimp("_pthread_cleanup_pop"); }
-//void _pthread_cleanup_pop_restore ( void )  { unimp("_pthread_cleanup_pop_restore"); }
-//void _pthread_cleanup_push ( void )  { unimp("_pthread_cleanup_push"); }
-//void _pthread_cleanup_push_defer ( void )  { unimp("_pthread_cleanup_push_defer"); }
-//void longjmp ( void )  { unimp("longjmp"); }
-//void pthread_atfork ( void )  { unimp("pthread_atfork"); }
-//void pthread_attr_destroy ( void )  { unimp("pthread_attr_destroy"); }
-void pthread_attr_getdetachstate ( void )  { unimp("pthread_attr_getdetachstate"); }
-void pthread_attr_getinheritsched ( void )  { unimp("pthread_attr_getinheritsched"); }
-//void pthread_attr_getschedparam ( void )  { unimp("pthread_attr_getschedparam"); }
-//void pthread_attr_getschedpolicy ( void )  { unimp("pthread_attr_getschedpolicy"); }
-//void pthread_attr_getscope ( void )  { unimp("pthread_attr_getscope"); }
-
-//void pthread_attr_setdetachstate ( void )  { unimp("pthread_attr_setdetachstate"); }
-//void pthread_attr_setinheritsched ( void )  { unimp("pthread_attr_setinheritsched"); }
-//void pthread_attr_setschedparam ( void )  { unimp("pthread_attr_setschedparam"); }
-//void pthread_attr_setschedpolicy ( void )  { unimp("pthread_attr_setschedpolicy"); }
-//void pthread_attr_setscope ( void )  { unimp("pthread_attr_setscope"); }
-void pthread_barrier_destroy ( void )  { unimp("pthread_barrier_destroy"); }
-void pthread_barrier_init ( void )  { unimp("pthread_barrier_init"); }
-void pthread_barrier_wait ( void )  { unimp("pthread_barrier_wait"); }
-void pthread_barrierattr_destroy ( void )  { unimp("pthread_barrierattr_destroy"); }
-void pthread_barrierattr_init ( void )  { unimp("pthread_barrierattr_init"); }
-void pthread_barrierattr_setpshared ( void )  { unimp("pthread_barrierattr_setpshared"); }
-//void pthread_cancel ( void )  { unimp("pthread_cancel"); }
-//void pthread_cond_broadcast ( void )  { unimp("pthread_cond_broadcast"); }
-//void pthread_cond_destroy ( void )  { unimp("pthread_cond_destroy"); }
-//void pthread_cond_init ( void )  { unimp("pthread_cond_init"); }
-//void pthread_cond_signal ( void )  { unimp("pthread_cond_signal"); }
-//void pthread_cond_timedwait ( void )  { unimp("pthread_cond_timedwait"); }
-//void pthread_cond_wait ( void )  { unimp("pthread_cond_wait"); }
-//void pthread_condattr_destroy ( void )  { unimp("pthread_condattr_destroy"); }
-void pthread_condattr_getpshared ( void )  { unimp("pthread_condattr_getpshared"); }
-//void pthread_condattr_init ( void )  { unimp("pthread_condattr_init"); }
-void pthread_condattr_setpshared ( void )  { unimp("pthread_condattr_setpshared"); }
-//void pthread_detach ( void )  { unimp("pthread_detach"); }
-//void pthread_equal ( void )  { unimp("pthread_equal"); }
-//void pthread_exit ( void )  { unimp("pthread_exit"); }
-//void pthread_getattr_np ( void )  { unimp("pthread_getattr_np"); }
-void pthread_getcpuclockid ( void )  { unimp("pthread_getcpuclockid"); }
-//void pthread_getschedparam ( void )  { unimp("pthread_getschedparam"); }
-//void pthread_getspecific ( void )  { unimp("pthread_getspecific"); }
-//void pthread_join ( void )  { unimp("pthread_join"); }
-//void pthread_key_create ( void )  { unimp("pthread_key_create"); }
-//void pthread_key_delete ( void )  { unimp("pthread_key_delete"); }
-//void pthread_kill ( void )  { unimp("pthread_kill"); }
-//void pthread_mutex_destroy ( void )  { unimp("pthread_mutex_destroy"); }
-//void pthread_mutex_init ( void )  { unimp("pthread_mutex_init"); }
-//void pthread_mutex_lock ( void )  { unimp("pthread_mutex_lock"); }
-void pthread_mutex_timedlock ( void )  { unimp("pthread_mutex_timedlock"); }
-//void pthread_mutex_trylock ( void )  { unimp("pthread_mutex_trylock"); }
-//void pthread_mutex_unlock ( void )  { unimp("pthread_mutex_unlock"); }
-//void pthread_mutexattr_destroy ( void )  { unimp("pthread_mutexattr_destroy"); }
-//void pthread_mutexattr_init ( void )  { unimp("pthread_mutexattr_init"); }
-//void pthread_once ( void )  { unimp("pthread_once"); }
-//void pthread_rwlock_destroy ( void )  { unimp("pthread_rwlock_destroy"); }
-//void pthread_rwlock_init ( void )  { unimp("pthread_rwlock_init"); }
-//void pthread_rwlock_rdlock ( void )  { unimp("pthread_rwlock_rdlock"); }
-void pthread_rwlock_timedrdlock ( void )  { unimp("pthread_rwlock_timedrdlock"); }
-void pthread_rwlock_timedwrlock ( void )  { unimp("pthread_rwlock_timedwrlock"); }
-//void pthread_rwlock_tryrdlock ( void )  { unimp("pthread_rwlock_tryrdlock"); }
-//void pthread_rwlock_trywrlock ( void )  { unimp("pthread_rwlock_trywrlock"); }
-//void pthread_rwlock_unlock ( void )  { unimp("pthread_rwlock_unlock"); }
-//void pthread_rwlock_wrlock ( void )  { unimp("pthread_rwlock_wrlock"); }
-void pthread_rwlockattr_destroy ( void )  { unimp("pthread_rwlockattr_destroy"); }
-void pthread_rwlockattr_getkind_np ( void )  { unimp("pthread_rwlockattr_getkind_np"); }
-void pthread_rwlockattr_getpshared ( void )  { unimp("pthread_rwlockattr_getpshared"); }
-//void pthread_rwlockattr_init ( void )  { unimp("pthread_rwlockattr_init"); }
-void pthread_rwlockattr_setkind_np ( void )  { unimp("pthread_rwlockattr_setkind_np"); }
-//void pthread_rwlockattr_setpshared ( void )  { unimp("pthread_rwlockattr_setpshared"); }
-//void pthread_self ( void )  { unimp("pthread_self"); }
-//void pthread_setcancelstate ( void )  { unimp("pthread_setcancelstate"); }
-//void pthread_setcanceltype ( void )  { unimp("pthread_setcanceltype"); }
-//void pthread_setschedparam ( void )  { unimp("pthread_setschedparam"); }
-//void pthread_setspecific ( void )  { unimp("pthread_setspecific"); }
-//void pthread_sigmask ( void )  { unimp("pthread_sigmask"); }
-//void pthread_testcancel ( void )  { unimp("pthread_testcancel"); }
-//void raise ( void )  { unimp("raise"); }
-void sem_close ( void )  { unimp("sem_close"); }
-void sem_open ( void )  { unimp("sem_open"); }
-void sem_timedwait ( void )  { unimp("sem_timedwait"); }
-void sem_unlink ( void )  { unimp("sem_unlink"); }
-//void sigaction ( void )  { unimp("sigaction"); }
-//void siglongjmp ( void )  { unimp("siglongjmp"); }
-//void sigwait ( void )  { unimp("sigwait"); }
-
-void __pthread_clock_gettime ( void ) { unimp("__pthread_clock_gettime"); }
-void __pthread_clock_settime ( void ) { unimp("__pthread_clock_settime"); }
-
-#if 0
-void pthread_create@@GLIBC_2.1 ( void )  { unimp("pthread_create@@GLIBC_2.1"); }
-void pthread_create@GLIBC_2.0 ( void )  { unimp("pthread_create@GLIBC_2.0"); }
-
-void sem_wait@@GLIBC_2.1 ( void )  { unimp("sem_wait@@GLIBC_2.1"); }
-void sem_wait@GLIBC_2.0 ( void )  { unimp("sem_wait@GLIBC_2.0"); }
-
-void sem_trywait@@GLIBC_2.1 ( void )  { unimp("sem_trywait@@GLIBC_2.1"); }
-void sem_trywait@GLIBC_2.0 ( void )  { unimp("sem_trywait@GLIBC_2.0"); }
-
-void sem_post@@GLIBC_2.1 ( void )  { unimp("sem_post@@GLIBC_2.1"); }
-void sem_post@GLIBC_2.0 ( void )  { unimp("sem_post@GLIBC_2.0"); }
-
-void sem_destroy@@GLIBC_2.1 ( void )  { unimp("sem_destroy@@GLIBC_2.1"); }
-void sem_destroy@GLIBC_2.0 ( void )  { unimp("sem_destroy@GLIBC_2.0"); }
-void sem_getvalue@@GLIBC_2.1 ( void )  { unimp("sem_getvalue@@GLIBC_2.1"); }
-void sem_getvalue@GLIBC_2.0 ( void )  { unimp("sem_getvalue@GLIBC_2.0"); }
-void sem_init@@GLIBC_2.1 ( void )  { unimp("sem_init@@GLIBC_2.1"); }
-void sem_init@GLIBC_2.0 ( void )  { unimp("sem_init@GLIBC_2.0"); }
-
-void pthread_attr_init@@GLIBC_2.1 ( void )  { unimp("pthread_attr_init@@GLIBC_2.1"); }
-void pthread_attr_init@GLIBC_2.0 ( void )  { unimp("pthread_attr_init@GLIBC_2.0"); }
-#endif
-
-
-
-# define strong_alias(name, aliasname) \
-  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
-
-# define weak_alias(name, aliasname) \
-  extern __typeof (name) aliasname __attribute__ ((weak, alias (#name)));
-
-//weak_alias(pthread_rwlock_destroy, __pthread_rwlock_destroy)
-//weak_alias(pthread_rwlock_init, __pthread_rwlock_init)
-//weak_alias(pthread_rwlock_tryrdlock, __pthread_rwlock_tryrdlock)
-//weak_alias(pthread_rwlock_trywrlock, __pthread_rwlock_trywrlock)
-//weak_alias(pthread_rwlock_wrlock, __pthread_rwlock_wrlock)
-weak_alias(_IO_ftrylockfile, ftrylockfile)
-
-//__attribute__((weak)) void pread ( void ) { vgPlain_unimp("pread"); }
-//__attribute__((weak)) void pwrite ( void ) { vgPlain_unimp("pwrite"); }
-//__attribute__((weak)) void msync ( void ) { vgPlain_unimp("msync"); }
-//__attribute__((weak)) void pause ( void ) { vgPlain_unimp("pause"); }
-//__attribute__((weak)) void recvfrom ( void ) { vgPlain_unimp("recvfrom"); }
-//__attribute__((weak)) void recvmsg ( void ) { vgPlain_unimp("recvmsg"); }
-//__attribute__((weak)) void sendmsg ( void ) { vgPlain_unimp("sendmsg"); }
-__attribute__((weak)) void tcdrain ( void ) { vgPlain_unimp("tcdrain"); }
-//--//__attribute__((weak)) void vfork ( void ) { vgPlain_unimp("vfork"); }
-
-__attribute__((weak)) void pthread_attr_getguardsize ( void )
-                      { vgPlain_unimp("pthread_attr_getguardsize"); }
-__attribute__((weak)) void pthread_attr_getstack ( void )
-                      { vgPlain_unimp("pthread_attr_getstack"); }
-__attribute__((weak)) void pthread_attr_getstackaddr ( void )
-                      { vgPlain_unimp("pthread_attr_getstackaddr"); }
-__attribute__((weak)) void pthread_attr_getstacksize ( void )
-                      { vgPlain_unimp("pthread_attr_getstacksize"); }
-__attribute__((weak)) void pthread_attr_setguardsize ( void )
-                      { vgPlain_unimp("pthread_attr_setguardsize"); }
-__attribute__((weak)) void pthread_attr_setstack ( void )
-                      { vgPlain_unimp("pthread_attr_setstack"); }
-__attribute__((weak)) void pthread_attr_setstackaddr ( void )
-                      { vgPlain_unimp("pthread_attr_setstackaddr"); }
-//__attribute__((weak)) void pthread_attr_setstacksize ( void )
-//                      { vgPlain_unimp("pthread_attr_setstacksize"); }
-__attribute__((weak)) void pthread_getconcurrency ( void )
-                      { vgPlain_unimp("pthread_getconcurrency"); }
-//__attribute__((weak)) void pthread_kill_other_threads_np ( void )
-//                      { vgPlain_unimp("pthread_kill_other_threads_np"); }
-__attribute__((weak)) void pthread_mutexattr_getkind_np ( void )
-                      { vgPlain_unimp("pthread_mutexattr_getkind_np"); }
-__attribute__((weak)) void pthread_mutexattr_getpshared ( void )
-                      { vgPlain_unimp("pthread_mutexattr_getpshared"); }
-__attribute__((weak)) void pthread_mutexattr_gettype ( void )
-                      { vgPlain_unimp("pthread_mutexattr_gettype"); }
-__attribute__((weak)) void pthread_mutexattr_setkind_np ( void )
-                      { vgPlain_unimp("pthread_mutexattr_setkind_np"); }
-__attribute__((weak)) void pthread_mutexattr_setpshared ( void )
-                      { vgPlain_unimp("pthread_mutexattr_setpshared"); }
-__attribute__((weak)) void pthread_setconcurrency ( void )
-                      { vgPlain_unimp("pthread_setconcurrency"); }
-__attribute__((weak)) void pthread_spin_destroy ( void )
-                      { vgPlain_unimp("pthread_spin_destroy"); }
-__attribute__((weak)) void pthread_spin_init ( void )
-                      { vgPlain_unimp("pthread_spin_init"); }
-__attribute__((weak)) void pthread_spin_lock ( void )
-                      { vgPlain_unimp("pthread_spin_lock"); }
-__attribute__((weak)) void pthread_spin_trylock ( void )
-                      { vgPlain_unimp("pthread_spin_trylock"); }
-__attribute__((weak)) void pthread_spin_unlock ( void )
-                      { vgPlain_unimp("pthread_spin_unlock"); }
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                    vg_libpthread_unimp.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/arch/x86-linux/vg_syscall.S b/coregrind/arch/x86-linux/vg_syscall.S
deleted file mode 100644
index adabbedbbe..0000000000
--- a/coregrind/arch/x86-linux/vg_syscall.S
+++ /dev/null
@@ -1,104 +0,0 @@
-
-##--------------------------------------------------------------------##
-##--- Support for doing system calls.                              ---##
-##---                                                 vg_syscall.S ---##
-##--------------------------------------------------------------------##
-
-/*
-  This file is part of Valgrind, an x86 protected-mode emulator 
-  designed for debugging and profiling binaries on x86-Unixes.
-
-  Copyright (C) 2000-2002 Julian Seward 
-     jseward@acm.org
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of the
-  License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-  02111-1307, USA.
-
-  The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_constants.h"
-
-
-.globl	VG_(do_syscall)
-
-# NOTE that this routine expects the simulated machines state
-# to be in m_state_static.  Therefore it needs to be wrapped by
-# code which copies from baseBlock before the call, into
-# m_state_static, and back afterwards.
-	
-VG_(do_syscall):
-	# Save all the int registers of the real machines state on the
-	# simulators stack.
-	pushal
-
-	# and save the real FPU state too
-	fwait
-	fnsave	VG_(real_fpu_state_saved_over_syscall)
-	frstor	VG_(real_fpu_state_saved_over_syscall)
-
-	# remember what the simulators stack pointer is
-	movl	%esp, VG_(esp_saved_over_syscall)
-	
-	# Now copy the simulated machines state into the real one
-	# esp still refers to the simulators stack
-	frstor	VG_(m_state_static)+40
-	movl	VG_(m_state_static)+32, %eax
-	pushl	%eax
-	popfl
-	movl	VG_(m_state_static)+0, %eax
-	movl	VG_(m_state_static)+4, %ecx
-	movl	VG_(m_state_static)+8, %edx
-	movl	VG_(m_state_static)+12, %ebx
-	movl	VG_(m_state_static)+16, %esp
-	movl	VG_(m_state_static)+20, %ebp
-	movl	VG_(m_state_static)+24, %esi
-	movl	VG_(m_state_static)+28, %edi
-
-	# esp now refers to the simulatees stack
-	# Do the actual system call
-	int	$0x80
-
-	# restore stack as soon as possible
-	# esp refers to simulatees stack
-	movl	%esp, VG_(m_state_static)+16
-	movl	VG_(esp_saved_over_syscall), %esp
-	# esp refers to simulators stack
-
-	# ... and undo everything else.  
-	# Copy real state back to simulated state.	
-	movl	%eax, VG_(m_state_static)+0
-	movl	%ecx, VG_(m_state_static)+4
-	movl	%edx, VG_(m_state_static)+8
-	movl	%ebx, VG_(m_state_static)+12
-	movl	%ebp, VG_(m_state_static)+20
-	movl	%esi, VG_(m_state_static)+24
-	movl	%edi, VG_(m_state_static)+28
-	pushfl
-	popl	%eax
-	movl	%eax, VG_(m_state_static)+32
-	fwait
-	fnsave	VG_(m_state_static)+40
-	frstor	VG_(m_state_static)+40
-
-	# Restore the state of the simulator
-	frstor	VG_(real_fpu_state_saved_over_syscall)
-	popal
-
-	ret
-
-##--------------------------------------------------------------------##
-##--- end                                             vg_syscall.S ---##
-##--------------------------------------------------------------------##
diff --git a/coregrind/demangle/Makefile.am b/coregrind/demangle/Makefile.am
deleted file mode 100644
index 554c75bdbc..0000000000
--- a/coregrind/demangle/Makefile.am
+++ /dev/null
@@ -1,25 +0,0 @@
-INCLUDES = -I$(top_srcdir)
-
-CFLAGS = $(WERROR) -Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-
-noinst_HEADERS = \
-	ansidecl.h     \
-	dyn-string.h   \
-	demangle.h     \
-	safe-ctype.h 
-
-noinst_LIBRARIES = libdemangle.a
-
-libdemangle_a_SOURCES = \
-	cp-demangle.c cplus-dem.c dyn-string.c safe-ctype.c
-
-# some files don't like my config.h, so just pretend it does not exist...
-
-cp-demangle.o: cp-demangle.c
-	$(COMPILE) -Wno-unused -Wno-shadow -c $< -UHAVE_CONFIG_H
-
-dyn-string.o: dyn-string.c
-	$(COMPILE) -c $< -UHAVE_CONFIG_H
-
-cplus-dem.o: cplus-dem.c
-	$(COMPILE) -Wno-unused -c $<
diff --git a/coregrind/demangle/ansidecl.h b/coregrind/demangle/ansidecl.h
deleted file mode 100644
index 9a7c5777ff..0000000000
--- a/coregrind/demangle/ansidecl.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/* ANSI and traditional C compatability macros
-   Copyright 1991, 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001
-   Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
-
-/* ANSI and traditional C compatibility macros
-
-   ANSI C is assumed if __STDC__ is #defined.
-
-   Macro		ANSI C definition	Traditional C definition
-   -----		---- - ----------	----------- - ----------
-   ANSI_PROTOTYPES	1			not defined
-   PTR			`void *'		`char *'
-   PTRCONST		`void *const'		`char *'
-   LONG_DOUBLE		`long double'		`double'
-   const		not defined		`'
-   volatile		not defined		`'
-   signed		not defined		`'
-   VA_START(ap, var)	va_start(ap, var)	va_start(ap)
-
-   Note that it is safe to write "void foo();" indicating a function
-   with no return value, in all K+R compilers we have been able to test.
-
-   For declaring functions with prototypes, we also provide these:
-
-   PARAMS ((prototype))
-   -- for functions which take a fixed number of arguments.  Use this
-   when declaring the function.  When defining the function, write a
-   K+R style argument list.  For example:
-
-	char *strcpy PARAMS ((char *dest, char *source));
-	...
-	char *
-	strcpy (dest, source)
-	     char *dest;
-	     char *source;
-	{ ... }
-
-
-   VPARAMS ((prototype, ...))
-   -- for functions which take a variable number of arguments.  Use
-   PARAMS to declare the function, VPARAMS to define it.  For example:
-
-	int printf PARAMS ((const char *format, ...));
-	...
-	int
-	printf VPARAMS ((const char *format, ...))
-	{
-	   ...
-	}
-
-   For writing functions which take variable numbers of arguments, we
-   also provide the VA_OPEN, VA_CLOSE, and VA_FIXEDARG macros.  These
-   hide the differences between K+R <varargs.h> and C89 <stdarg.h> more
-   thoroughly than the simple VA_START() macro mentioned above.
-
-   VA_OPEN and VA_CLOSE are used *instead of* va_start and va_end.
-   Immediately after VA_OPEN, put a sequence of VA_FIXEDARG calls
-   corresponding to the list of fixed arguments.  Then use va_arg
-   normally to get the variable arguments, or pass your va_list object
-   around.  You do not declare the va_list yourself; VA_OPEN does it
-   for you.
-
-   Here is a complete example:
-
-	int
-	printf VPARAMS ((const char *format, ...))
-	{
-	   int result;
-
-	   VA_OPEN (ap, format);
-	   VA_FIXEDARG (ap, const char *, format);
-
-	   result = vfprintf (stdout, format, ap);
-	   VA_CLOSE (ap);
-
-	   return result;
-	}
-
-
-   You can declare variables either before or after the VA_OPEN,
-   VA_FIXEDARG sequence.  Also, VA_OPEN and VA_CLOSE are the beginning
-   and end of a block.  They must appear at the same nesting level,
-   and any variables declared after VA_OPEN go out of scope at
-   VA_CLOSE.  Unfortunately, with a K+R compiler, that includes the
-   argument list.  You can have multiple instances of VA_OPEN/VA_CLOSE
-   pairs in a single function in case you need to traverse the
-   argument list more than once.
-
-   For ease of writing code which uses GCC extensions but needs to be
-   portable to other compilers, we provide the GCC_VERSION macro that
-   simplifies testing __GNUC__ and __GNUC_MINOR__ together, and various
-   wrappers around __attribute__.  Also, __extension__ will be #defined
-   to nothing if it doesn't work.  See below.
-
-   This header also defines a lot of obsolete macros:
-   CONST, VOLATILE, SIGNED, PROTO, EXFUN, DEFUN, DEFUN_VOID,
-   AND, DOTS, NOARGS.  Don't use them.  */
-
-#ifndef	_ANSIDECL_H
-#define _ANSIDECL_H	1
-
-/* Every source file includes this file,
-   so they will all get the switch for lint.  */
-/* LINTLIBRARY */
-
-/* Using MACRO(x,y) in cpp #if conditionals does not work with some
-   older preprocessors.  Thus we can't define something like this:
-
-#define HAVE_GCC_VERSION(MAJOR, MINOR) \
-  (__GNUC__ > (MAJOR) || (__GNUC__ == (MAJOR) && __GNUC_MINOR__ >= (MINOR)))
-
-and then test "#if HAVE_GCC_VERSION(2,7)".
-
-So instead we use the macro below and test it against specific values.  */
-
-/* This macro simplifies testing whether we are using gcc, and if it
-   is of a particular minimum version. (Both major & minor numbers are
-   significant.)  This macro will evaluate to 0 if we are not using
-   gcc at all.  */
-#ifndef GCC_VERSION
-#define GCC_VERSION (__GNUC__ * 1000 + __GNUC_MINOR__)
-#endif /* GCC_VERSION */
-
-#if defined (__STDC__) || defined (_AIX) || (defined (__mips) && defined (_SYSTYPE_SVR4)) || defined(_WIN32)
-/* All known AIX compilers implement these things (but don't always
-   define __STDC__).  The RISC/OS MIPS compiler defines these things
-   in SVR4 mode, but does not define __STDC__.  */
-
-#define ANSI_PROTOTYPES	1
-#define PTR		void *
-#define PTRCONST	void *const
-#define LONG_DOUBLE	long double
-
-#define PARAMS(ARGS)		ARGS
-#define VPARAMS(ARGS)		ARGS
-#define VA_START(VA_LIST, VAR)	va_start(VA_LIST, VAR)
-
-/* variadic function helper macros */
-/* "struct Qdmy" swallows the semicolon after VA_OPEN/VA_FIXEDARG's
-   use without inhibiting further decls and without declaring an
-   actual variable.  */
-#define VA_OPEN(AP, VAR)	{ va_list AP; va_start(AP, VAR); { struct Qdmy
-#define VA_CLOSE(AP)		} va_end(AP); }
-#define VA_FIXEDARG(AP, T, N)	struct Qdmy
- 
-#undef const
-#undef volatile
-#undef signed
-
-/* inline requires special treatment; it's in C99, and GCC >=2.7 supports
-   it too, but it's not in C89.  */
-#undef inline
-#if __STDC_VERSION__ > 199901L
-/* it's a keyword */
-#else
-# if GCC_VERSION >= 2007
-#  define inline __inline__   /* __inline__ prevents -pedantic warnings */
-# else
-#  define inline  /* nothing */
-# endif
-#endif
-
-/* These are obsolete.  Do not use.  */
-#ifndef IN_GCC
-#define CONST		const
-#define VOLATILE	volatile
-#define SIGNED		signed
-
-#define PROTO(type, name, arglist)	type name arglist
-#define EXFUN(name, proto)		name proto
-#define DEFUN(name, arglist, args)	name(args)
-#define DEFUN_VOID(name)		name(void)
-#define AND		,
-#define DOTS		, ...
-#define NOARGS		void
-#endif /* ! IN_GCC */
-
-#else	/* Not ANSI C.  */
-
-#undef  ANSI_PROTOTYPES
-#define PTR		char *
-#define PTRCONST	PTR
-#define LONG_DOUBLE	double
-
-#define PARAMS(args)		()
-#define VPARAMS(args)		(va_alist) va_dcl
-#define VA_START(va_list, var)	va_start(va_list)
-
-#define VA_OPEN(AP, VAR)		{ va_list AP; va_start(AP); { struct Qdmy
-#define VA_CLOSE(AP)			} va_end(AP); }
-#define VA_FIXEDARG(AP, TYPE, NAME)	TYPE NAME = va_arg(AP, TYPE)
-
-/* some systems define these in header files for non-ansi mode */
-#undef const
-#undef volatile
-#undef signed
-#undef inline
-#define const
-#define volatile
-#define signed
-#define inline
-
-#ifndef IN_GCC
-#define CONST
-#define VOLATILE
-#define SIGNED
-
-#define PROTO(type, name, arglist)	type name ()
-#define EXFUN(name, proto)		name()
-#define DEFUN(name, arglist, args)	name arglist args;
-#define DEFUN_VOID(name)		name()
-#define AND		;
-#define DOTS
-#define NOARGS
-#endif /* ! IN_GCC */
-
-#endif	/* ANSI C.  */
-
-/* Define macros for some gcc attributes.  This permits us to use the
-   macros freely, and know that they will come into play for the
-   version of gcc in which they are supported.  */
-
-#if (GCC_VERSION < 2007)
-# define __attribute__(x)
-#endif
-
-/* Attribute __malloc__ on functions was valid as of gcc 2.96. */
-#ifndef ATTRIBUTE_MALLOC
-# if (GCC_VERSION >= 2096)
-#  define ATTRIBUTE_MALLOC __attribute__ ((__malloc__))
-# else
-#  define ATTRIBUTE_MALLOC
-# endif /* GNUC >= 2.96 */
-#endif /* ATTRIBUTE_MALLOC */
-
-/* Attributes on labels were valid as of gcc 2.93. */
-#ifndef ATTRIBUTE_UNUSED_LABEL
-# if (GCC_VERSION >= 2093)
-#  define ATTRIBUTE_UNUSED_LABEL ATTRIBUTE_UNUSED
-# else
-#  define ATTRIBUTE_UNUSED_LABEL
-# endif /* GNUC >= 2.93 */
-#endif /* ATTRIBUTE_UNUSED_LABEL */
-
-#ifndef ATTRIBUTE_UNUSED
-#define ATTRIBUTE_UNUSED __attribute__ ((__unused__))
-#endif /* ATTRIBUTE_UNUSED */
-
-#ifndef ATTRIBUTE_NORETURN
-#define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
-#endif /* ATTRIBUTE_NORETURN */
-
-#ifndef ATTRIBUTE_PRINTF
-#define ATTRIBUTE_PRINTF(m, n) __attribute__ ((__format__ (__printf__, m, n)))
-#define ATTRIBUTE_PRINTF_1 ATTRIBUTE_PRINTF(1, 2)
-#define ATTRIBUTE_PRINTF_2 ATTRIBUTE_PRINTF(2, 3)
-#define ATTRIBUTE_PRINTF_3 ATTRIBUTE_PRINTF(3, 4)
-#define ATTRIBUTE_PRINTF_4 ATTRIBUTE_PRINTF(4, 5)
-#define ATTRIBUTE_PRINTF_5 ATTRIBUTE_PRINTF(5, 6)
-#endif /* ATTRIBUTE_PRINTF */
-
-/* We use __extension__ in some places to suppress -pedantic warnings
-   about GCC extensions.  This feature didn't work properly before
-   gcc 2.8.  */
-#if GCC_VERSION < 2008
-#define __extension__
-#endif
-
-/* Bootstrap support:  Adjust certain macros defined by Autoconf,
-   which are only valid for the stage1 compiler.  If we detect
-   a modern version of GCC, we are probably in stage2 or beyond,
-   so unconditionally reset the values.  Note that const, inline,
-   etc. have been dealt with above.  */
-#if (GCC_VERSION >= 2007)
-# ifndef HAVE_LONG_DOUBLE
-#  define HAVE_LONG_DOUBLE 1
-# endif
-#endif /* GCC >= 2.7 */
-
-#endif	/* ansidecl.h	*/
diff --git a/coregrind/demangle/cp-demangle.c b/coregrind/demangle/cp-demangle.c
deleted file mode 100644
index 5cf99c8c89..0000000000
--- a/coregrind/demangle/cp-demangle.c
+++ /dev/null
@@ -1,4174 +0,0 @@
-/* Demangler for IA64 / g++ V3 ABI.
-   Copyright (C) 2000, 2001 Free Software Foundation, Inc.
-   Written by Alex Samuel <samuel@codesourcery.com>. 
-
-   This file is part of GNU CC.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
-*/
-
-/* This file implements demangling of C++ names mangled according to
-   the IA64 / g++ V3 ABI.  Use the cp_demangle function to
-   demangle a mangled name, or compile with the preprocessor macro
-   STANDALONE_DEMANGLER defined to create a demangling filter
-   executable (functionally similar to c++filt, but includes this
-   demangler only).  */
-
-#include <sys/types.h>
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#endif
-
-#ifdef HAVE_STRING_H
-#include <string.h>
-#endif
-
-#include "vg_include.h"
-#include "ansidecl.h"
-#include "dyn-string.h"
-#include "demangle.h"
-
-/* Unless STANDALONE is defined, redirect the malloc, free and realloc
-   calls made in this file to VG_(malloc), VG_(free) and VG_(realloc),
-   passing VG_AR_DEMANGLE as their first argument.
-   NOTE(review): the file header comment names the standalone macro
-   STANDALONE_DEMANGLER, whereas this guard tests STANDALONE -- confirm
-   which spelling is intended.  */
-#ifndef STANDALONE
-#define malloc(s) VG_(malloc)(VG_AR_DEMANGLE, s)
-#define free(p) VG_(free)(VG_AR_DEMANGLE, p)
-#define realloc(p,s) VG_(realloc)(VG_AR_DEMANGLE, p, s)
-#endif
-
-/* If CP_DEMANGLE_DEBUG is defined, a trace of the grammar evaluation,
-   and other debugging output, will be generated.  DEMANGLE_TRACE
-   prints PRODUCTION (a NUL-terminated string) and the current position
-   of DM to stderr; without CP_DEMANGLE_DEBUG it expands to nothing.
-   The position is cast to int because current_position yields a
-   pointer difference, which need not have type int, while the format
-   used is %3d.  */
-#ifdef CP_DEMANGLE_DEBUG
-#define DEMANGLE_TRACE(PRODUCTION, DM)                                  \
-  fprintf (stderr, " -> %-24s at position %3d\n",                       \
-           (PRODUCTION), (int) current_position (DM));
-#else
-#define DEMANGLE_TRACE(PRODUCTION, DM)
-#endif
-
-/* Don't include <ctype.h>, to prevent additional unresolved symbols
-   from being dragged into the C++ runtime library.
-
-   IS_DIGIT is non-zero iff CHAR lies in the range '0'..'9'; IS_ALPHA
-   is non-zero iff CHAR lies in 'a'..'z' or 'A'..'Z'.  Both are plain
-   range comparisons and mention CHAR more than once, so the argument
-   must not have side effects.  */
-#define IS_DIGIT(CHAR) ((CHAR) >= '0' && (CHAR) <= '9')
-#define IS_ALPHA(CHAR)                                                  \
-  (((CHAR) >= 'a' && (CHAR) <= 'z')                                     \
-   || ((CHAR) >= 'A' && (CHAR) <= 'Z'))
-
-/* The prefix prepended by GCC to an identifier representing the
-   anonymous namespace.  */
-#define ANONYMOUS_NAMESPACE_PREFIX "_GLOBAL_"
-
-/* Character(s) to use for namespace separation in demangled output:
-   "." when the style is DMGL_JAVA, "::" otherwise.  The macro takes
-   no argument and reads `dm->style', so it can only be used where a
-   variable named `dm' is in scope.  */
-#define NAMESPACE_SEPARATOR (dm->style == DMGL_JAVA ? "." : "::")
-
-/* If flag_verbose is zero, some simplifications will be made to the
-   output to make it easier to read and suppress details that are
-   generally not of interest to the average C++ programmer.
-   Otherwise, the demangled representation will attempt to convey as
-   much information as the mangled form.  */
-static int flag_verbose;
-
-/* If flag_strict is non-zero, demangle strictly according to the
-   specification -- don't demangle special g++ manglings.  */
-static int flag_strict;
-
-/* String_list_t is an extended form of dyn_string_t which provides a
-   link field and a caret position for additions to the string.  A
-   string_list_t may safely be cast to and used as a dyn_string_t,
-   because the embedded dyn_string is the first member.  */
-
-struct string_list_def
-{
-  /* The dyn_string; must be first.  */
-  struct dyn_string string;
-
-  /* The position at which additional text is added to this string
-     (using the result_add* macros).  This value is an offset from the
-     end of the string, not the beginning (and should be
-     non-positive).  */
-  int caret_position;
-
-  /* The next string in the list.  */
-  struct string_list_def *next;
-};
-
-/* Pointer to a string list node.  */
-typedef struct string_list_def *string_list_t;
-
-/* Data structure representing a potential substitution.  */
-
-struct substitution_def
-{
-  /* The demangled text of the substitution.  */
-  dyn_string_t text;
-
-  /* Whether this substitution represents a template item.  Declared
-     unsigned so that the single bit holds exactly 0 or 1; a signed
-     one-bit field cannot portably represent the value 1.  */
-  unsigned int template_p : 1;
-};
-
-/* Data structure representing a template argument list.  */
-
-struct template_arg_list_def
-{
-  /* The next (lower) template argument list in the stack of currently
-     active template arguments.  This link is written by
-     push_template_arg_list; template_arg_list_new does not set it.  */
-  struct template_arg_list_def *next;
-
-  /* The first element in the list of template arguments in
-     left-to-right order.  NULL when the list has no arguments.  */
-  string_list_t first_argument;
-
-  /* The last element in the arguments lists.  */
-  string_list_t last_argument;
-};
-
-/* Pointer to a template argument list.  */
-typedef struct template_arg_list_def *template_arg_list_t;
-
-/* Data structure to maintain the state of the current demangling.  */
-
-struct demangling_def
-{
-  /* The full mangled name being demangled.  */
-  const char *name;
-
-  /* Pointer into name at the current position.  */
-  const char *next;
-
-  /* Stack for strings containing demangled result generated so far.
-     Text is emitted to the topmost (first) string.  */
-  string_list_t result;
-
-  /* The number of presently available substitutions.  */
-  int num_substitutions;
-
-  /* The allocated size of the substitutions array.  */
-  int substitutions_allocated;
-
-  /* An array of available substitutions.  The number of elements in
-     the array is given by num_substitutions, and the allocated array
-     size in substitutions_allocated.
-
-     The most recent substitution is at the end, so
-
-       - `S_'  corresponds to substitutions[num_substitutions - 1]
-       - `S0_' corresponds to substitutions[num_substitutions - 2]
-
-     etc.
-     NOTE(review): substitution_get indexes this array directly from
-     its start (substitutions[n]); confirm the correspondence stated
-     above against the code that decodes `S' sequences.  */
-  struct substitution_def *substitutions;
-
-  /* The stack of template argument lists.  */
-  template_arg_list_t template_arg_lists;
-
-  /* The most recently demangled source-name.  */
-  dyn_string_t last_source_name;
-  
-  /* Language style to use for demangled output. */
-  int style;
-
-  /* Set to non-zero iff this name is a constructor.  The actual value
-     indicates what sort of constructor this is; see demangle.h.  */
-  enum gnu_v3_ctor_kinds is_constructor;
-
-  /* Set to non-zero iff this name is a destructor.  The actual value
-     indicates what sort of destructor this is; see demangle.h.  */
-  enum gnu_v3_dtor_kinds is_destructor;
-
-};
-
-/* Pointer to a demangling context.  */
-typedef struct demangling_def *demangling_t;
-
-/* This type is the standard return code from most functions.  Values
-   other than STATUS_OK contain descriptive messages.  */
-typedef const char *status_t;
-
-/* Special values that can be used as a status_t.  STATUS_OK is the
-   null pointer; the others are string literals.  */
-#define STATUS_OK                       NULL
-#define STATUS_ERROR                    "Error."
-#define STATUS_UNIMPLEMENTED            "Unimplemented."
-#define STATUS_INTERNAL_ERROR           "Internal error."
-
-/* This status code indicates a failure in malloc or realloc.  It is a
-   named object rather than a bare literal, so every use of
-   STATUS_ALLOCATION_FAILED refers to the same pointer.  */
-static const char *const status_allocation_failed = "Allocation failed.";
-#define STATUS_ALLOCATION_FAILED        status_allocation_failed
-
-/* Non-zero if STATUS indicates that no error has occurred.  */
-#define STATUS_NO_ERROR(STATUS)         ((STATUS) == STATUS_OK)
-
-/* Evaluate EXPR, which must produce a status_t.  If the status code
-   indicates an error, return from the current function with that
-   status code.  EXPR is evaluated exactly once.  The expansion
-   declares a local variable named `s' inside its own block and is
-   wrapped in do ... while (0), so it behaves as a single statement.  */
-#define RETURN_IF_ERROR(EXPR)                                           \
-  do                                                                    \
-    {                                                                   \
-      status_t s = EXPR;                                                \
-      if (!STATUS_NO_ERROR (s))                                         \
-	return s;                                                       \
-    }                                                                   \
-  while (0)
-
-static status_t int_to_dyn_string 
-  PARAMS ((int, dyn_string_t));
-static string_list_t string_list_new
-  PARAMS ((int));
-static void string_list_delete
-  PARAMS ((string_list_t));
-static status_t result_add_separated_char
-  PARAMS ((demangling_t, int));
-static status_t result_push
-  PARAMS ((demangling_t));
-static string_list_t result_pop
-  PARAMS ((demangling_t));
-static int substitution_start
-  PARAMS ((demangling_t));
-static status_t substitution_add
-  PARAMS ((demangling_t, int, int));
-static dyn_string_t substitution_get
-  PARAMS ((demangling_t, int, int *));
-#ifdef CP_DEMANGLE_DEBUG
-static void substitutions_print 
-  PARAMS ((demangling_t, FILE *));
-#endif
-static template_arg_list_t template_arg_list_new
-  PARAMS ((void));
-static void template_arg_list_delete
-  PARAMS ((template_arg_list_t));
-static void template_arg_list_add_arg 
-  PARAMS ((template_arg_list_t, string_list_t));
-static string_list_t template_arg_list_get_arg
-  PARAMS ((template_arg_list_t, int));
-static void push_template_arg_list
-  PARAMS ((demangling_t, template_arg_list_t));
-static void pop_to_template_arg_list
-  PARAMS ((demangling_t, template_arg_list_t));
-#ifdef CP_DEMANGLE_DEBUG
-static void template_arg_list_print
-  PARAMS ((template_arg_list_t, FILE *));
-#endif
-static template_arg_list_t current_template_arg_list
-  PARAMS ((demangling_t));
-static demangling_t demangling_new
-  PARAMS ((const char *, int));
-static void demangling_delete 
-  PARAMS ((demangling_t));
-
-/* The last character of DS.  Warning: DS is evaluated twice.  The
-   macro indexes position length - 1 without checking that DS is
-   non-empty.  */
-#define dyn_string_last_char(DS)                                        \
-  (dyn_string_buf (DS)[dyn_string_length (DS) - 1])
-
-/* Append a space character (` ') to DS if DS is non-empty and does
-   not already end with one; an empty DS is left unchanged.  Evaluates
-   to the result of dyn_string_append_char when a space is appended,
-   and to 1 otherwise.  DS is evaluated more than once.  */
-#define dyn_string_append_space(DS)                                     \
-      ((dyn_string_length (DS) > 0                                      \
-        && dyn_string_last_char (DS) != ' ')                            \
-       ? dyn_string_append_char ((DS), ' ')                             \
-       : 1)
-
-/* Returns the index of the current position in the mangled name.  */
-#define current_position(DM)    ((DM)->next - (DM)->name)
-
-/* Returns the character at the current position of the mangled name.  */
-#define peek_char(DM)           (*((DM)->next))
-
-/* Returns the character one past the current position of the mangled
-   name, or '\0' if the current character is already the terminating
-   NUL.  */
-#define peek_char_next(DM)                                              \
-  (peek_char (DM) == '\0' ? '\0' : (*((DM)->next + 1)))
-
-/* Returns the character at the current position, and advances the
-   current position to the next character.  */
-#define next_char(DM)           (*((DM)->next)++)
-
-/* Returns non-zero if the current position is the end of the mangled
-   name, i.e. one past the last character.  */
-#define end_of_name_p(DM)       (peek_char (DM) == '\0')
-
-/* Advances the current position by one character.  */
-#define advance_char(DM)        (++(DM)->next)
-
-/* Returns the string containing the current demangled result.  */
-#define result_string(DM)       (&(DM)->result->string)
-
-/* Returns the position at which new text is inserted into the
-   demangled result: the result length plus the caret offset of the
-   topmost result string.  The cast back to string_list_t relies on
-   the dyn_string being the first member of struct string_list_def.
-   DM is evaluated more than once.  */
-#define result_caret_pos(DM)                                            \
-  (result_length (DM) +                                                 \
-   ((string_list_t) result_string (DM))->caret_position)
-
-/* Adds a dyn_string_t to the demangled result, at the caret
-   position.  Evaluates to STATUS_OK, or to STATUS_ALLOCATION_FAILED
-   if dyn_string_insert reports failure.  DM is evaluated more than
-   once.  */
-#define result_add_string(DM, STRING)                                   \
-  (dyn_string_insert (&(DM)->result->string,                            \
-		      result_caret_pos (DM), (STRING))                  \
-   ? STATUS_OK : STATUS_ALLOCATION_FAILED)
-
-/* Adds NUL-terminated string CSTR to the demangled result, at the
-   caret position.  Same status values as result_add_string.  */
-#define result_add(DM, CSTR)                                            \
-  (dyn_string_insert_cstr (&(DM)->result->string,                       \
-			   result_caret_pos (DM), (CSTR))               \
-   ? STATUS_OK : STATUS_ALLOCATION_FAILED)
-
-/* Adds character CHAR to the demangled result, at the caret
-   position.  Same status values as result_add_string.  */
-#define result_add_char(DM, CHAR)                                       \
-  (dyn_string_insert_char (&(DM)->result->string,                       \
-			   result_caret_pos (DM), (CHAR))               \
-   ? STATUS_OK : STATUS_ALLOCATION_FAILED)
-
-/* Inserts a dyn_string_t to the demangled result at position POS.  */
-#define result_insert_string(DM, POS, STRING)                           \
-  (dyn_string_insert (&(DM)->result->string, (POS), (STRING))           \
-   ? STATUS_OK : STATUS_ALLOCATION_FAILED)
-
-/* Inserts NUL-terminated string CSTR to the demangled result at
-   position POS.  */
-#define result_insert(DM, POS, CSTR)                                    \
-  (dyn_string_insert_cstr (&(DM)->result->string, (POS), (CSTR))        \
-   ? STATUS_OK : STATUS_ALLOCATION_FAILED)
-
-/* Inserts character CHAR to the demangled result at position POS.  */
-#define result_insert_char(DM, POS, CHAR)                               \
-  (dyn_string_insert_char (&(DM)->result->string, (POS), (CHAR))        \
-   ? STATUS_OK : STATUS_ALLOCATION_FAILED)
-
-/* The length of the current demangled result.  */
-#define result_length(DM)                                               \
-  dyn_string_length (&(DM)->result->string)
-
-/* Adds a (less-than, greater-than) character to the result in DM
-   to (open, close) a template argument or parameter list.  Both
-   expand to result_add_separated_char, which first adds a space when
-   the character before the caret is the same character, to prevent
-   spurious elision of angle brackets with the previous character.  */
-#define result_open_template_list(DM) result_add_separated_char(DM, '<')
-#define result_close_template_list(DM) result_add_separated_char(DM, '>')
-
-/* Appends a base 10 representation of VALUE to DS.  Returns STATUS_OK
-   on success, or STATUS_ALLOCATION_FAILED if an append to DS fails.
-   DS is not deleted on failure; it stays owned by the caller and may
-   then hold a partially written number.  */
-
-static status_t
-int_to_dyn_string (value, ds)
-     int value;
-     dyn_string_t ds;
-{
-  unsigned int magnitude;
-  unsigned int remaining;
-  unsigned int mask = 1;
-
-  /* Handle zero up front.  */
-  if (value == 0)
-    {
-      if (!dyn_string_append_char (ds, '0'))
-	return STATUS_ALLOCATION_FAILED;
-      return STATUS_OK;
-    }
-
-  /* For negative numbers, emit a minus sign.  The magnitude is
-     computed in unsigned arithmetic because negating the most
-     negative int would overflow.  */
-  if (value < 0)
-    {
-      if (!dyn_string_append_char (ds, '-'))
-	return STATUS_ALLOCATION_FAILED;
-      magnitude = 0U - (unsigned int) value;
-    }
-  else
-    magnitude = (unsigned int) value;
-
-  /* Find the power of 10 of the first digit.  */
-  for (remaining = magnitude; remaining > 9; remaining /= 10)
-    mask *= 10;
-
-  /* Write the digits, most significant first.  */
-  while (mask > 0)
-    {
-      unsigned int digit = magnitude / mask;
-
-      if (!dyn_string_append_char (ds, (int) ('0' + digit)))
-	return STATUS_ALLOCATION_FAILED;
-
-      magnitude -= digit * mask;
-      mask /= 10;
-    }
-
-  return STATUS_OK;
-}
-
-/* Creates a new string list node.  The contents of the string are
-   empty, but the initial buffer allocation is LENGTH.  The caret
-   position starts at zero and the link field at NULL.  The string
-   list node should be deleted with string_list_delete.  Returns NULL
-   if allocation fails; nothing is leaked in that case.  */
-
-static string_list_t 
-string_list_new (length)
-     int length;
-{
-  string_list_t s = (string_list_t) malloc (sizeof (struct string_list_def));
-
-  /* Check the allocation before touching any field of the node.  */
-  if (s == NULL)
-    return NULL;
-
-  s->caret_position = 0;
-  s->next = NULL;
-
-  if (!dyn_string_init ((dyn_string_t) s, length))
-    {
-      /* The string buffer could not be set up; release the node
-	 itself so that it is not leaked.  */
-      free (s);
-      return NULL;
-    }
-  return s;
-}  
-
-/* Walks the list that begins at NODE and hands every node to
-   dyn_string_delete.  A NULL NODE is accepted and does nothing.  */
-
-static void
-string_list_delete (node)
-     string_list_t node;
-{
-  string_list_t following;
-
-  for (; node != NULL; node = following)
-    {
-      /* Read the link before NODE is passed to dyn_string_delete.  */
-      following = node->next;
-      dyn_string_delete ((dyn_string_t) node);
-    }
-}
-
-/* Adds CHARACTER to the demangled result at the caret.  When the
-   character just before the caret is already CHARACTER, a space is
-   added first.  Returns the first non-OK status produced by
-   result_add_char, or STATUS_OK.  */
-
-static status_t
-result_add_separated_char (dm, character)
-     demangling_t dm;
-     int character;
-{
-  int insert_at = result_caret_pos (dm);
-  char *text = dyn_string_buf (result_string (dm));
-  status_t status = STATUS_OK;
-
-  /* Separate two identical adjacent characters with a space.  */
-  if (insert_at > 0 && text[insert_at - 1] == character)
-    status = result_add_char (dm, ' ');
-
-  /* Then add the character itself, unless the space failed.  */
-  if (STATUS_NO_ERROR (status))
-    status = result_add_char (dm, character);
-
-  return status;
-}
-
-/* Creates an empty string with string_list_new and makes it the top
-   of DM's result stack, so that later output goes to it.  Returns
-   STATUS_OK, or STATUS_ALLOCATION_FAILED when string_list_new
-   returns NULL (DM is then left unchanged).  */
-
-static status_t
-result_push (dm)
-     demangling_t dm;
-{
-  string_list_t fresh = string_list_new (0);
-
-  if (fresh != NULL)
-    {
-      /* The previous top becomes the second element.  */
-      fresh->next = (string_list_t) dm->result;
-      dm->result = fresh;
-      return STATUS_OK;
-    }
-
-  return STATUS_ALLOCATION_FAILED;
-}
-
-/* Unlinks the top element of DM's result stack and returns it; the
-   element below becomes the new top.  Ownership of the returned
-   string passes to the caller.  */
-
-static string_list_t
-result_pop (dm)
-     demangling_t dm;
-{
-  string_list_t popped = dm->result;
-
-  dm->result = popped->next;
-  return popped;
-}
-
-/* Returns the caret of the topmost result string of DM.  The value
-   is an offset from the end of that string.  */
-
-static int
-result_get_caret (dm)
-     demangling_t dm;
-{
-  /* The dyn_string is the first member of a string list node, so the
-     top node itself carries the caret.  */
-  return dm->result->caret_position;
-}
-
-/* Stores POSITION as the caret of the topmost result string of DM.
-   POSITION is counted as an offset from the end of that string.  */
-
-static void
-result_set_caret (dm, position)
-     demangling_t dm;
-     int position;
-{
-  /* The dyn_string is the first member of a string list node, so the
-     top node itself carries the caret.  */
-  dm->result->caret_position = position;
-}
-
-/* Moves the caret of the topmost result string of DM by
-   POSITION_OFFSET.  Negative offsets move the caret to the left.  */
-
-static void
-result_shift_caret (dm, position_offset)
-     demangling_t dm;
-     int position_offset;
-{
-  string_list_t top = dm->result;
-
-  top->caret_position = top->caret_position + position_offset;
-}
-
-/* Tells whether the character immediately before the caret of the
-   current result is a space.  Returns zero when the caret is at the
-   very start of the result.  Callers can use this to avoid emitting
-   two spaces in a row.  */
-
-static int
-result_previous_char_is_space (dm)
-     demangling_t dm;
-{
-  int caret = result_caret_pos (dm);
-
-  if (caret <= 0)
-    return 0;
-
-  return dyn_string_buf (result_string (dm))[caret - 1] == ' ';
-}
-
-/* Reports where a substitution candidate would begin: the current
-   caret position of DM's result.  Productions that may add a
-   substitution call this before emitting any text.  */
-
-static int
-substitution_start (dm)
-     demangling_t dm;
-{
-  int start = result_caret_pos (dm);
-
-  return start;
-}
-
-/* Adds the text of the current demangled result of DM between
-   START_POSITION and the caret as a potential substitution.  If
-   TEMPLATE_P is non-zero, this potential substitution is a
-   template-id.  Returns STATUS_OK on success.  On allocation failure
-   returns STATUS_ALLOCATION_FAILED and leaves DM's substitution array,
-   count and capacity exactly as they were.  */
-
-static status_t
-substitution_add (dm, start_position, template_p)
-     demangling_t dm;
-     int start_position;
-     int template_p;
-{
-  dyn_string_t result = result_string (dm);
-  dyn_string_t substitution = dyn_string_new (0);
-  int i;
-
-  if (substitution == NULL)
-    return STATUS_ALLOCATION_FAILED;
-
-  /* Extract the substring of the current demangling result that
-     represents the substitution candidate.  */
-  if (!dyn_string_substring (substitution, 
-			     result, start_position, result_caret_pos (dm)))
-    {
-      dyn_string_delete (substitution);
-      return STATUS_ALLOCATION_FAILED;
-    }
-
-  /* If there's no room for the new entry, grow the array.  */
-  if (dm->substitutions_allocated == dm->num_substitutions)
-    {
-      struct substitution_def *grown;
-      int new_allocated;
-
-      if (dm->substitutions_allocated > 0)
-	new_allocated = dm->substitutions_allocated * 2;
-      else
-	new_allocated = 2;
-
-      /* Keep the old pointer and capacity until realloc has
-	 succeeded.  Overwriting dm->substitutions with a NULL result
-	 would lose the existing entries and make demangling_delete
-	 index through a null pointer.  */
-      grown = (struct substitution_def *)
-	realloc (dm->substitutions,
-		 sizeof (struct substitution_def) * new_allocated);
-      if (grown == NULL)
-	/* Realloc failed.  */
-	{
-	  dyn_string_delete (substitution);
-	  return STATUS_ALLOCATION_FAILED;
-	}
-
-      dm->substitutions = grown;
-      dm->substitutions_allocated = new_allocated;
-    }
-
-  /* Add the substitution to the array.  The flag is reduced to 0 or 1
-     first, because template_p is a one-bit field and would otherwise
-     keep only the low bit of the argument.  */
-  i = dm->num_substitutions++;
-  dm->substitutions[i].text = substitution;
-  dm->substitutions[i].template_p = (template_p != 0);
-
-#ifdef CP_DEMANGLE_DEBUG
-  substitutions_print (dm, stderr);
-#endif
-
-  return STATUS_OK;
-}
-
-/* Looks up entry N (counted from zero) of DM's substitutions array.
-   On success stores that entry's template flag in *TEMPLATE_P and
-   returns its text, which remains owned by DM.  If N is negative, or
-   not less than the number of substitutions, returns NULL and leaves
-   *TEMPLATE_P untouched.  */
-
-static dyn_string_t
-substitution_get (dm, n, template_p)
-     demangling_t dm;
-     int n;
-     int *template_p;
-{
-  if (n >= 0 && n < dm->num_substitutions)
-    {
-      *template_p = dm->substitutions[n].template_p;
-      return dm->substitutions[n].text;
-    }
-
-  /* N is outside the valid range.  */
-  return NULL;
-}
-
-#ifdef CP_DEMANGLE_DEBUG
-/* Debugging aid: writes one line per substitution of DM to FP.  The
-   entry at index 0 is labelled S_, the entry at index 1 is labelled
-   S0_, and so on; template-ids are marked with `*'.  */
-
-static void
-substitutions_print (dm, fp)
-     demangling_t dm;
-     FILE *fp;
-{
-  int index;
-  int count = dm->num_substitutions;
-
-  fprintf (fp, "SUBSTITUTIONS:\n");
-  for (index = 0; index < count; ++index)
-    {
-      int template_p;
-      dyn_string_t text = substitution_get (dm, index, &template_p);
-
-      /* The label number runs one behind the array index.  */
-      if (index == 0)
-	fprintf (fp, " S_ ");
-      else
-	fprintf (fp, " S%d_", index - 1);
-      fprintf (fp, " %c: %s\n", template_p ? '*' : ' ', dyn_string_buf (text));
-    }
-}
-
-#endif /* CP_DEMANGLE_DEBUG */
-
-/* Allocates a template argument list that holds no arguments.
-   Returns the new list, or NULL when malloc fails.  */
-
-static template_arg_list_t
-template_arg_list_new ()
-{
-  template_arg_list_t list =
-    (template_arg_list_t) malloc (sizeof (struct template_arg_list_def));
-
-  if (list != NULL)
-    {
-      /* An empty list has neither a first nor a last argument.  */
-      list->first_argument = NULL;
-      list->last_argument = NULL;
-    }
-
-  return list;
-}
-
-/* Releases LIST together with every template argument stored on
-   it.  */
-
-static void
-template_arg_list_delete (list)
-     template_arg_list_t list;
-{
-  /* string_list_delete does nothing for a NULL list, so an empty
-     argument list needs no special case here.  */
-  string_list_delete (list->first_argument);
-
-  /* Finally release the list header itself.  */
-  free (list);
-}
-
-/* Appends ARG as the new last element of ARG_LIST and terminates the
-   chain after it.  */
-
-static void 
-template_arg_list_add_arg (arg_list, arg)
-     template_arg_list_t arg_list;
-     string_list_t arg;
-{
-  if (arg_list->first_argument != NULL)
-    /* Non-empty list: hook ARG behind the current tail.  */
-    arg_list->last_argument->next = arg;
-  else
-    /* Empty list: ARG is also the head.  */
-    arg_list->first_argument = arg;
-
-  /* In both cases ARG is now the tail and ends the chain.  */
-  arg_list->last_argument = arg;
-  arg->next = NULL;
-}
-
-/* Returns the template argument at position INDEX (counted from
-   zero) in template argument list ARG_LIST.  Returns NULL if INDEX is
-   negative or the list holds INDEX or fewer arguments; this includes
-   the empty list.  */
-
-static string_list_t
-template_arg_list_get_arg (arg_list, index)
-     template_arg_list_t arg_list;
-     int index;
-{
-  string_list_t arg;
-
-  /* A negative position can never name an argument.  */
-  if (index < 0)
-    return NULL;
-
-  /* Scan down the list of arguments to find the one at position
-     INDEX.  Testing ARG before following its link keeps an empty or
-     too-short list from being dereferenced through NULL.  */
-  arg = arg_list->first_argument;
-  while (arg != NULL && index > 0)
-    {
-      arg = arg->next;
-      --index;
-    }
-
-  /* Either the argument at position INDEX, or NULL if the list ran
-     out first.  */
-  return arg;
-}
-
-/* Makes ARG_LIST the new top of DM's stack of template argument
-   lists; the former top becomes its successor.  */
-
-static void
-push_template_arg_list (dm, arg_list)
-     demangling_t dm;
-     template_arg_list_t arg_list;
-{
-  template_arg_list_t former_top = dm->template_arg_lists;
-
-  arg_list->next = former_top;
-  dm->template_arg_lists = arg_list;
-#ifdef CP_DEMANGLE_DEBUG
-  fprintf (stderr, " ** pushing template arg list\n");
-  template_arg_list_print (arg_list, stderr);
-#endif 
-}
-
-/* Pops and deletes elements on the template argument list stack until
-   arg_list is the topmost element.  If arg_list is NULL, all elements
-   are popped and deleted.  If arg_list is not on the stack at all,
-   popping stops once the stack is empty.  */
-
-static void
-pop_to_template_arg_list (dm, arg_list)
-     demangling_t dm;
-     template_arg_list_t arg_list;
-{
-  /* The emptiness test guards against an ARG_LIST that is not on the
-     stack; without it the loop would read through a NULL top.  */
-  while (dm->template_arg_lists != arg_list
-	 && dm->template_arg_lists != NULL)
-    {
-      template_arg_list_t top = dm->template_arg_lists;
-      /* Disconnect the topmost element from the list.  */
-      dm->template_arg_lists = top->next;
-      /* Delete the popped element.  */
-      template_arg_list_delete (top);
-#ifdef CP_DEMANGLE_DEBUG
-      fprintf (stderr, " ** removing template arg list\n");
-#endif
-    }
-}
-
-#ifdef CP_DEMANGLE_DEBUG
-
-/* Debugging aid: writes the arguments of ARG_LIST to FP, one per
-   line.  The first argument is labelled T_, the second T0_, the
-   third T1_, and so on.  */
-
-static void
-template_arg_list_print (arg_list, fp)
-  template_arg_list_t arg_list;
-  FILE *fp;
-{
-  string_list_t arg = arg_list->first_argument;
-  int position = 0;
-
-  fprintf (fp, "TEMPLATE ARGUMENT LIST:\n");
-  while (arg != NULL)
-    {
-      /* The label number runs one behind the list position.  */
-      if (position == 0)
-	fprintf (fp, " T_  : ");
-      else
-	fprintf (fp, " T%d_ : ", position - 1);
-      fprintf (fp, "%s\n", dyn_string_buf ((dyn_string_t) arg));
-
-      ++position;
-      arg = arg->next;
-    }
-}
-
-#endif /* CP_DEMANGLE_DEBUG */
-
-/* Yields the template argument list currently on top of DM's stack,
-   or NULL when the stack is empty.  */
-
-static template_arg_list_t
-current_template_arg_list (dm)
-     demangling_t dm;
-{
-  template_arg_list_t top = dm->template_arg_lists;
-
-  return top;
-}
-
-/* Allocates a demangling_t object for demangling mangled NAME, with
-   output style STYLE.  The object starts with an empty result stack,
-   no substitutions (room for ten is reserved) and no template
-   argument lists.  A new result must be pushed before the returned
-   object can be used.  Returns NULL if allocation fails; everything
-   allocated up to that point is released first.  */
-
-static demangling_t
-demangling_new (name, style)
-     const char *name;
-     int style;
-{
-  demangling_t dm;
-  dm = (demangling_t) malloc (sizeof (struct demangling_def));
-  if (dm == NULL)
-    return NULL;
-
-  dm->name = name;
-  dm->next = name;
-  dm->result = NULL;
-  dm->num_substitutions = 0;
-  dm->substitutions_allocated = 10;
-  dm->template_arg_lists = NULL;
-  dm->last_source_name = dyn_string_new (0);
-  if (dm->last_source_name == NULL)
-    {
-      /* Release the context itself so that it is not leaked.  */
-      free (dm);
-      return NULL;
-    }
-  dm->substitutions = (struct substitution_def *)
-    malloc (dm->substitutions_allocated * sizeof (struct substitution_def));
-  if (dm->substitutions == NULL)
-    {
-      /* Undo both earlier allocations.  */
-      dyn_string_delete (dm->last_source_name);
-      free (dm);
-      return NULL;
-    }
-  dm->style = style;
-  dm->is_constructor = 0;
-  dm->is_destructor = 0;
-
-  return dm;
-}
-
-/* Releases DM and everything it owns: the stack of template argument
-   lists, the substitution texts and their array, the result strings,
-   the last source name, and finally DM itself.  */
-
-static void
-demangling_delete (dm)
-     demangling_t dm;
-{
-  template_arg_list_t arg_list;
-  template_arg_list_t below;
-  int i;
-
-  /* Template argument lists, from the top of the stack down.  */
-  for (arg_list = dm->template_arg_lists; arg_list != NULL; arg_list = below)
-    {
-      below = arg_list->next;
-      template_arg_list_delete (arg_list);
-    }
-
-  /* Substitution texts, last entry first, then the array.  */
-  i = dm->num_substitutions;
-  while (i > 0)
-    {
-      --i;
-      dyn_string_delete (dm->substitutions[i].text);
-    }
-  free (dm->substitutions);
-
-  /* The result stack and the remembered source name.  */
-  string_list_delete (dm->result);
-  dyn_string_delete (dm->last_source_name);
-
-  /* The context object itself goes last.  */
-  free (dm);
-}
-
-/* These functions demangle an alternative of the corresponding
-   production in the mangling spec.  The first argument of each is a
-   demangling context structure for the current demangling
-   operation.  Most emit demangled text directly to the topmost result
-   string on the result string stack in the demangling context
-   structure.  */
-
-static status_t demangle_char
-  PARAMS ((demangling_t, int));
-static status_t demangle_mangled_name 
-  PARAMS ((demangling_t));
-static status_t demangle_encoding
-  PARAMS ((demangling_t));
-static status_t demangle_name
-  PARAMS ((demangling_t, int *));
-static status_t demangle_nested_name
-  PARAMS ((demangling_t, int *));
-static status_t demangle_prefix_v3
-  PARAMS ((demangling_t, int *));
-static status_t demangle_unqualified_name
-  PARAMS ((demangling_t, int *));
-static status_t demangle_source_name
-  PARAMS ((demangling_t));
-static status_t demangle_number
-  PARAMS ((demangling_t, int *, int, int));
-static status_t demangle_number_literally
-  PARAMS ((demangling_t, dyn_string_t, int, int));
-static status_t demangle_identifier
-  PARAMS ((demangling_t, int, dyn_string_t));
-static status_t demangle_operator_name
-  PARAMS ((demangling_t, int, int *));
-static status_t demangle_nv_offset
-  PARAMS ((demangling_t));
-static status_t demangle_v_offset
-  PARAMS ((demangling_t));
-static status_t demangle_call_offset
-  PARAMS ((demangling_t));
-static status_t demangle_special_name
-  PARAMS ((demangling_t));
-static status_t demangle_ctor_dtor_name
-  PARAMS ((demangling_t));
-static status_t demangle_type_ptr
-  PARAMS ((demangling_t, int *, int));
-static status_t demangle_type
-  PARAMS ((demangling_t));
-static status_t demangle_CV_qualifiers
-  PARAMS ((demangling_t, dyn_string_t));
-static status_t demangle_builtin_type
-  PARAMS ((demangling_t));
-static status_t demangle_function_type
-  PARAMS ((demangling_t, int *));
-static status_t demangle_bare_function_type
-  PARAMS ((demangling_t, int *));
-static status_t demangle_class_enum_type
-  PARAMS ((demangling_t, int *));
-static status_t demangle_array_type
-  PARAMS ((demangling_t, int *));
-static status_t demangle_template_param
-  PARAMS ((demangling_t));
-static status_t demangle_template_args_1
-  PARAMS ((demangling_t, template_arg_list_t));
-static status_t demangle_template_args
-  PARAMS ((demangling_t));
-static status_t demangle_literal
-  PARAMS ((demangling_t));
-static status_t demangle_template_arg
-  PARAMS ((demangling_t));
-static status_t demangle_expression_v3
-  PARAMS ((demangling_t));
-static status_t demangle_scope_expression
-  PARAMS ((demangling_t));
-static status_t demangle_expr_primary
-  PARAMS ((demangling_t));
-static status_t demangle_substitution
-  PARAMS ((demangling_t, int *));
-static status_t demangle_local_name
-  PARAMS ((demangling_t));
-static status_t demangle_discriminator 
-  PARAMS ((demangling_t, int));
-static status_t cp_demangle
-  PARAMS ((const char *, dyn_string_t, int));
-#ifdef IN_LIBGCC2
-static status_t cp_demangle_type
-  PARAMS ((const char*, dyn_string_t));
-#endif
-
-/* When passed to demangle_bare_function_type, indicates that the
-   function's return type is not encoded before its parameter types.  */
-#define BFT_NO_RETURN_TYPE    NULL
-
-/* Check that the next character is C.  If so, consume it and return
-   STATUS_OK.  If not, vg_assert (0) is triggered; should that call
-   ever return, STATUS_ERROR is returned so that the function never
-   falls off its end without a value.  */
-
-static status_t
-demangle_char (dm, c)
-     demangling_t dm;
-     int c;
-{
-  if (peek_char (dm) == c)
-    {
-      advance_char (dm);
-      return STATUS_OK;
-    }
-
-  /* Unexpected character.  An earlier version built an "Expected ?"
-     message with strdup here; this version asserts instead.  */
-  vg_assert (0);
-  return STATUS_ERROR;
-}
-
-/* Demangles and emits a <mangled-name>: the two characters `_' and
-   `Z' followed by an encoding.  Returns the status of the first step
-   that fails, or STATUS_OK.
-
-    <mangled-name>      ::= _Z <encoding>  */
-
-static status_t
-demangle_mangled_name (dm)
-     demangling_t dm;
-{
-  status_t status;
-
-  DEMANGLE_TRACE ("mangled-name", dm);
-
-  status = demangle_char (dm, '_');
-  if (STATUS_NO_ERROR (status))
-    status = demangle_char (dm, 'Z');
-  if (STATUS_NO_ERROR (status))
-    status = demangle_encoding (dm);
-
-  return status;
-}
-
-/* Demangles and emits an <encoding>.  An encoding that starts with
-   `G' or `T' is handled as a special name.  Otherwise a name is
-   demangled, and if input remains that does not start with `E', a
-   bare function type follows.  On success the template argument
-   lists pushed meanwhile are popped again; on error the function
-   returns at once with the failing status.
-
-    <encoding>		::= <function name> <bare-function-type>
-			::= <data name>
-			::= <special-name>  */
-
-static status_t
-demangle_encoding (dm)
-     demangling_t dm;
-{
-  status_t status;
-  int encode_return_type;
-  int start_position;
-  template_arg_list_t old_arg_list = current_template_arg_list (dm);
-  char peek = peek_char (dm);
-
-  DEMANGLE_TRACE ("encoding", dm);
-  
-  /* A template function has its return type inserted where the name
-     starts, so note that position before emitting anything.  */
-  start_position = result_caret_pos (dm);
-
-  if (peek == 'G' || peek == 'T')
-    status = demangle_special_name (dm);
-  else
-    {
-      /* First the name itself.  */
-      status = demangle_name (dm, &encode_return_type);
-
-      /* Remaining input means this was a function name; its
-	 parameter types, possibly preceded by its return type, come
-	 next.  */
-      if (STATUS_NO_ERROR (status)
-	  && !end_of_name_p (dm)
-	  && peek_char (dm) != 'E')
-	{
-	  if (encode_return_type)
-	    /* The return type is encoded and is inserted at
-	       start_position.  */
-	    status = demangle_bare_function_type (dm, &start_position);
-	  else
-	    /* No return type is encoded.  */
-	    status = demangle_bare_function_type (dm, BFT_NO_RETURN_TYPE);
-	}
-    }
-
-  if (!STATUS_NO_ERROR (status))
-    return status;
-
-  /* Restore the template context that was current on entry.  */
-  pop_to_template_arg_list (dm, old_arg_list);
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <name>.
-
-    <name>              ::= <unscoped-name>
-                        ::= <unscoped-template-name> <template-args>
-			::= <nested-name>
-                        ::= <local-name>
-
-    <unscoped-name>     ::= <unqualified-name>
-			::= St <unqualified-name>   # ::std::
-
-    <unscoped-template-name>    
-                        ::= <unscoped-name>
-                        ::= <substitution>  */
-
-static status_t
-demangle_name (dm, encode_return_type)
-     demangling_t dm;
-     int *encode_return_type;
-{
-  int candidate_start = substitution_start (dm);
-  char first = peek_char (dm);
-  /* Non-zero when the name demangled so far has to be recorded as a
-     substitution candidate if template arguments follow it.  */
-  int register_template_name;
-  /* Set by demangle_unqualified_name for the names that never carry
-     an encoded return type, even when they are templates.  */
-  int no_return_type = 0;
-
-  DEMANGLE_TRACE ("name", dm);
-
-  /* <nested-name>: the callee decides about the return type.  */
-  if (first == 'N')
-    {
-      RETURN_IF_ERROR (demangle_nested_name (dm, encode_return_type));
-      return STATUS_OK;
-    }
-
-  /* <local-name>: never has an encoded return type.  */
-  if (first == 'Z')
-    {
-      RETURN_IF_ERROR (demangle_local_name (dm));
-      *encode_return_type = 0;
-      return STATUS_OK;
-    }
-
-  if (first == 'S' && peek_char_next (dm) != 't')
-    {
-      /* A plain <substitution>; it is already in the substitution
-         table, so it is not registered again below.  */
-      RETURN_IF_ERROR (demangle_substitution (dm, encode_return_type));
-      register_template_name = 0;
-    }
-  else
-    {
-      /* An <unqualified-name>, optionally introduced by `St', which
-         places it in namespace std without a <nested-name>.  */
-      if (first == 'S')
-        {
-          (void) next_char (dm);
-          (void) next_char (dm);
-          RETURN_IF_ERROR (result_add (dm, "std::"));
-        }
-      RETURN_IF_ERROR (demangle_unqualified_name (dm, &no_return_type));
-      register_template_name = 1;
-    }
-
-  /* Without template arguments this is not a template-id, so no
-     return type is encoded.  */
-  if (peek_char (dm) != 'I')
-    {
-      *encode_return_type = 0;
-      return STATUS_OK;
-    }
-
-  /* The template name itself is a substitution candidate; record it
-     before demangling its <template-args>.  */
-  if (register_template_name)
-    RETURN_IF_ERROR (substitution_add (dm, candidate_start, 0));
-  RETURN_IF_ERROR (demangle_template_args (dm));
-  *encode_return_type = !no_return_type;
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <nested-name>. 
-
-    <nested-name>     ::= N [<CV-qualifiers>] <prefix> <unqualified-name> E  */
-
-static status_t
-demangle_nested_name (dm, encode_return_type)
-     demangling_t dm;
-     int *encode_return_type;
-{
-  DEMANGLE_TRACE ("nested-name", dm);
-
-  RETURN_IF_ERROR (demangle_char (dm, 'N'));
-
-  switch (peek_char (dm))
-    {
-    case 'r':
-    case 'V':
-    case 'K':
-      {
-        /* Collect the CV qualifiers in a scratch string first.  */
-        dyn_string_t qualifiers = dyn_string_new (24);
-        status_t emit_status;
-
-        if (qualifiers == NULL)
-          return STATUS_ALLOCATION_FAILED;
-        demangle_CV_qualifiers (dm, qualifiers);
-
-        /* Write them out after a separating space.  */
-        emit_status = result_add_char (dm, ' ');
-        if (STATUS_NO_ERROR (emit_status))
-          emit_status = result_add_string (dm, qualifiers);
-
-        /* Qualifiers found in a <nested-name> belong to a member
-           function and are printed after it.  Moving the caret back
-           over the space and the qualifiers makes everything emitted
-           from here on land in front of them.  */
-        result_shift_caret (dm, -dyn_string_length (qualifiers) - 1);
-
-        /* The scratch string is released before any error is
-           reported.  */
-        dyn_string_delete (qualifiers);
-        RETURN_IF_ERROR (emit_status);
-      }
-      break;
-
-    default:
-      /* No CV qualifiers present.  */
-      break;
-    }
-
-  /* demangle_prefix_v3 also consumes the final <unqualified-name>, so
-     only the closing `E' is left to check afterwards.  */
-  RETURN_IF_ERROR (demangle_prefix_v3 (dm, encode_return_type));
-  RETURN_IF_ERROR (demangle_char (dm, 'E'));
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <prefix>.
-
-    <prefix>            ::= <prefix> <unqualified-name>
-                        ::= <template-prefix> <template-args>
-			::= # empty
-			::= <substitution>
-
-    <template-prefix>   ::= <prefix>
-                        ::= <substitution>  */
-
-static status_t
-demangle_prefix_v3 (dm, encode_return_type)
-     demangling_t dm;
-     int *encode_return_type;
-{
-  int candidate_start = substitution_start (dm);
-  /* Becomes non-zero once the first scope level has been emitted, so
-     that later levels are preceded by the namespace separator.  */
-  int seen_component = 0;
-  /* Set by demangle_unqualified_name for names that never have an
-     encoded return type even when templated.  */
-  int no_return_type = 0;
-
-  /* *ENCODE_RETURN_TYPE is updated as we descend the nesting chain:
-     it is set from NO_RETURN_TYPE after <template-args> and to zero
-     after a plain name; a substitution sets it itself.  */
-
-  DEMANGLE_TRACE ("prefix", dm);
-
-  for (;;)
-    {
-      char next;
-
-      if (end_of_name_p (dm))
-        return "Unexpected end of name in <compound-name>.";
-
-      next = peek_char (dm);
-
-      /* Each component starts with a cleared flag, except for
-         <template-args>: those belong to the component just seen, so
-         the flag it produced must survive.  */
-      if (next != 'I')
-        no_return_type = 0;
-
-      if (next == 'E')
-        /* End of the prefix; the caller consumes the `E'.  */
-        return STATUS_OK;
-      else if (next == 'I')
-        {
-          RETURN_IF_ERROR (demangle_template_args (dm));
-
-          /* Template arguments make the prefix a <template-prefix>,
-             so a function named by it has its return type encoded --
-             unless the preceding component set NO_RETURN_TYPE.  */
-          *encode_return_type = !no_return_type;
-        }
-      else if (next == 'Z')
-        RETURN_IF_ERROR (demangle_local_name (dm));
-      else if (next == 'S' || next == 'C' || next == 'D'
-               || (next >= 'a' && next <= 'z')
-               || IS_DIGIT ((unsigned char) next))
-        {
-          /* One more level of scope qualification.  */
-          if (seen_component)
-            RETURN_IF_ERROR (result_add (dm, NAMESPACE_SEPARATOR));
-          else
-            seen_component = 1;
-
-          if (next == 'S')
-            /* The substitution reports whether it is a template-id.  */
-            RETURN_IF_ERROR
-              (demangle_substitution (dm, encode_return_type));
-          else
-            {
-              /* A plain name component.  */
-              RETURN_IF_ERROR
-                (demangle_unqualified_name (dm, &no_return_type));
-              *encode_return_type = 0;
-            }
-        }
-      else
-        return "Unexpected character in <compound-name>.";
-
-      /* The prefix demangled so far becomes a substitution candidate,
-         unless the component was a substitution or the closing `E'
-         comes next.  */
-      if (next != 'S' && peek_char (dm) != 'E')
-        RETURN_IF_ERROR
-          (substitution_add (dm, candidate_start, *encode_return_type));
-    }
-}
-
-/* Demangles and emits an <unqualified-name>.  If this
-   <unqualified-name> is for a special function type that should never
-   have its return type encoded (particularly, a constructor or
-   conversion operator), *SUPPRESS_RETURN_TYPE is set to 1; otherwise,
-   it is set to zero.
-
-    <unqualified-name>  ::= <operator-name>
-			::= <special-name>  
-			::= <source-name>  */
-
-static status_t
-demangle_unqualified_name (dm, suppress_return_type)
-     demangling_t dm;
-     int *suppress_return_type;
-{
-  char first = peek_char (dm);
-
-  DEMANGLE_TRACE ("unqualified-name", dm);
-
-  /* Only constructors and conversion operators force the return type
-     to be suppressed; start from "not suppressed".  */
-  *suppress_return_type = 0;
-
-  /* A leading digit starts a <source-name>.  */
-  if (IS_DIGIT ((unsigned char) first))
-    {
-      RETURN_IF_ERROR (demangle_source_name (dm));
-      return STATUS_OK;
-    }
-
-  /* A lower-case letter starts an <operator-name>.  */
-  if (first >= 'a' && first <= 'z')
-    {
-      int operand_count;
-
-      /* `cv' is a conversion operator, which never has a return type
-         encoded.  */
-      if (first == 'c' && peek_char_next (dm) == 'v')
-        *suppress_return_type = 1;
-
-      RETURN_IF_ERROR (demangle_operator_name (dm, 0, &operand_count));
-      return STATUS_OK;
-    }
-
-  /* `C' and `D' start constructor and destructor names; of the two,
-     only constructors suppress the return type.  */
-  if (first == 'C' || first == 'D')
-    {
-      *suppress_return_type = (first == 'C');
-      RETURN_IF_ERROR (demangle_ctor_dtor_name (dm));
-      return STATUS_OK;
-    }
-
-  return "Unexpected character in <unqualified-name>.";
-}
-
-/* Demangles and emits <source-name>.  
-
-    <source-name> ::= <length number> <identifier>  */
-
-static status_t
-demangle_source_name (dm)
-     demangling_t dm;
-{
-  int name_length;
-  status_t status;
-
-  DEMANGLE_TRACE ("source-name", dm);
-
-  /* The identifier is preceded by its length in decimal.  */
-  RETURN_IF_ERROR (demangle_number (dm, &name_length, 10, 0));
-  if (name_length == 0)
-    return "Zero length in <source-name>.";
-
-  /* Read the identifier into last_source_name, where a following
-     constructor or destructor name can pick it up, then emit it.  */
-  status = demangle_identifier (dm, name_length, dm->last_source_name);
-  if (STATUS_NO_ERROR (status))
-    status = result_add_string (dm, dm->last_source_name);
-
-  RETURN_IF_ERROR (status);
-  return STATUS_OK;
-}
-
-/* Demangles a number, either a <number> or a <positive-number> at the
-   current position, consuming all consecutive digit characters.  Sets
-   *VALUE to the resulting number and returns STATUS_OK.  The number is
-   interpreted as BASE, which must be either 10 or 36; any other base
-   yields STATUS_UNIMPLEMENTED.  If IS_SIGNED is non-zero, negative
-   numbers -- prefixed with `n' -- are accepted.
-
-   If collecting the digits fails, that error is returned and *VALUE
-   is left unmodified.
-
-    <number> ::= [n] <positive-number>
-
-    <positive-number> ::= <decimal integer>  */
-
-static status_t
-demangle_number (dm, value, base, is_signed)
-     demangling_t dm;
-     int *value;
-     int base;
-     int is_signed;
-{
-  dyn_string_t number;
-  status_t status;
-
-  DEMANGLE_TRACE ("number", dm);
-
-  /* Only bases 10 and 36 can be converted below; reject anything else
-     before allocating the scratch string.  */
-  if (base != 10 && base != 36)
-    return STATUS_UNIMPLEMENTED;
-
-  number = dyn_string_new (10);
-  if (number == NULL)
-    return STATUS_ALLOCATION_FAILED;
-
-  /* Collect the digits as text.  A failure here (for instance while
-     growing the string) must not be turned into a bogus value, so the
-     conversion only happens on success.  */
-  status = demangle_number_literally (dm, number, base, is_signed);
-  if (STATUS_NO_ERROR (status))
-    {
-      if (base == 36)
-        *value = VG_(atoll36) (dyn_string_buf (number));
-      else
-        *value = VG_(atoll) (dyn_string_buf (number));
-    }
-
-  /* The scratch string is released on every path.  */
-  dyn_string_delete (number);
-  RETURN_IF_ERROR (status);
-
-  return STATUS_OK;
-}
-
-/* Demangles a number at the current position.  The digits (and minus
-   sign, if present) that make up the number are appended to STR.
-   Only base-BASE digits are accepted; BASE must be either 10 or 36.
-   If IS_SIGNED, negative numbers -- prefixed with `n' -- are
-   accepted.  Does not consume a trailing underscore or other
-   terminating character.  */
-
-static status_t
-demangle_number_literally (dm, str, base, is_signed)
-     demangling_t dm;
-     dyn_string_t str;
-     int base;
-     int is_signed;
-{
-  DEMANGLE_TRACE ("number*", dm);
-
-  if (base != 10 && base != 36)
-    return STATUS_INTERNAL_ERROR;
-
-  /* A leading `n' marks a negative number; it is consumed and written
-     to STR as a conventional minus sign.  */
-  if (is_signed && peek_char (dm) == 'n')
-    {
-      advance_char (dm);
-      if (!dyn_string_append_char (str, '-'))
-        return STATUS_ALLOCATION_FAILED;
-    }
-
-  /* Copy digits to STR until something that is not a digit in BASE
-     turns up; that character is left unconsumed.  */
-  for (;;)
-    {
-      char c = peek_char (dm);
-      int is_digit = (IS_DIGIT ((unsigned char) c)
-                      || (base == 36 && c >= 'A' && c <= 'Z'));
-
-      if (!is_digit)
-        break;
-
-      if (!dyn_string_append_char (str, next_char (dm)))
-        return STATUS_ALLOCATION_FAILED;
-    }
-
-  return STATUS_OK;
-}
-
-/* Demangles an identifier at the current position of LENGTH
-   characters and places it in IDENTIFIER.  */
-
-static status_t
-demangle_identifier (dm, length, identifier)
-     demangling_t dm;
-     int length;
-     dyn_string_t identifier;
-{
-  int remaining;
-
-  DEMANGLE_TRACE ("identifier", dm);
-
-  /* Start from an empty string with room for LENGTH characters.  */
-  dyn_string_clear (identifier);
-  if (!dyn_string_resize (identifier, length))
-    return STATUS_ALLOCATION_FAILED;
-
-  /* Copy exactly LENGTH characters, failing if the mangled name ends
-     before that.  */
-  for (remaining = length; remaining > 0; --remaining)
-    {
-      if (end_of_name_p (dm))
-        return "Unexpected end of name in <identifier>.";
-      if (!dyn_string_append_char (identifier, next_char (dm)))
-        return STATUS_ALLOCATION_FAILED;
-    }
-
-  /* In strict mode the identifier is reported exactly as written.  */
-  if (flag_strict)
-    return STATUS_OK;
-
-  /* Otherwise recognise GCC's encoding of anonymous namespaces: the
-     fixed prefix, then one of `.', `_' or `$' (which one depends on
-     the target assembler), then `N', followed by the source file
-     name and some random characters.  */
-  {
-    char *text = dyn_string_buf (identifier);
-    int prefix_length = VG_(strlen) (ANONYMOUS_NAMESPACE_PREFIX);
-
-    if (VG_(strncmp) (text, ANONYMOUS_NAMESPACE_PREFIX, prefix_length) == 0)
-      {
-        char separator = text[prefix_length];
-
-        if ((separator == '.' || separator == '_' || separator == '$')
-            && text[prefix_length + 1] == 'N')
-          /* Substitute something a human can read.  */
-          dyn_string_copy_cstr (identifier, "(anonymous namespace)");
-      }
-  }
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits an <operator-name>.  If SHORT_NAME is non-zero,
-   the short form is emitted; otherwise the full source form
-   (`operator +' etc.) is emitted.  *NUM_ARGS is set to the number of
-   operands that the operator takes.  
-
-    <operator-name>
-                  ::= nw        # new           
-                  ::= na        # new[]
-                  ::= dl        # delete        
-                  ::= da        # delete[]      
-		  ::= ps        # + (unary)
-                  ::= ng        # - (unary)     
-                  ::= ad        # & (unary)     
-                  ::= de        # * (unary)     
-                  ::= co        # ~             
-                  ::= pl        # +             
-                  ::= mi        # -             
-                  ::= ml        # *             
-                  ::= dv        # /             
-                  ::= rm        # %             
-                  ::= an        # &             
-                  ::= or        # |             
-                  ::= eo        # ^             
-                  ::= aS        # =             
-                  ::= pL        # +=            
-                  ::= mI        # -=            
-                  ::= mL        # *=            
-                  ::= dV        # /=            
-                  ::= rM        # %=            
-                  ::= aN        # &=            
-                  ::= oR        # |=            
-                  ::= eO        # ^=            
-                  ::= ls        # <<            
-                  ::= rs        # >>            
-                  ::= lS        # <<=           
-                  ::= rS        # >>=           
-                  ::= eq        # ==            
-                  ::= ne        # !=            
-                  ::= lt        # <             
-                  ::= gt        # >             
-                  ::= le        # <=            
-                  ::= ge        # >=            
-                  ::= nt        # !             
-                  ::= aa        # &&            
-                  ::= oo        # ||            
-                  ::= pp        # ++            
-                  ::= mm        # --            
-                  ::= cm        # ,             
-                  ::= pm        # ->*           
-                  ::= pt        # ->            
-                  ::= cl        # ()            
-                  ::= ix        # []            
-                  ::= qu        # ?
-                  ::= sz        # sizeof 
-                  ::= cv <type> # cast        
-		  ::= v [0-9] <source-name>  # vendor extended operator  */
-
-static status_t
-demangle_operator_name (dm, short_name, num_args)
-     demangling_t dm;
-     int short_name;
-     int *num_args;
-{
-  /* One row of the operator table: the two-letter mangled code, the
-     text emitted for it, and the operand count reported to the
-     caller.  */
-  struct operator_code
-  {
-    const char *const code;
-    const char *const name;
-    const int num_args;
-  };
-
-  /* The lookup below is a binary search, so the rows must stay sorted
-     by code in character-code order (upper case before lower case).  */
-  static const struct operator_code operators[] =
-  {
-    { "aN", "&=", 2 },
-    { "aS", "=", 2 },
-    { "aa", "&&", 2 },
-    { "ad", "&", 1 },
-    { "an", "&", 2 },
-    { "cl", "()", 0 },
-    { "cm", ",", 2 },
-    { "co", "~", 1 },
-    { "dV", "/=", 2 },
-    { "da", " delete[]", 1 },
-    { "de", "*", 1 },
-    { "dl", " delete", 1 },
-    { "dv", "/", 2 },
-    { "eO", "^=", 2 },
-    { "eo", "^", 2 },
-    { "eq", "==", 2 },
-    { "ge", ">=", 2 },
-    { "gt", ">", 2 },
-    { "ix", "[]", 2 },
-    { "lS", "<<=", 2 },
-    { "le", "<=", 2 },
-    { "ls", "<<", 2 },
-    { "lt", "<", 2 },
-    { "mI", "-=", 2 },
-    { "mL", "*=", 2 },
-    { "mi", "-", 2 },
-    { "ml", "*", 2 },
-    { "mm", "--", 1 },
-    { "na", " new[]", 1 },
-    { "ne", "!=", 2 },
-    { "ng", "-", 1 },
-    { "nt", "!", 1 },
-    { "nw", " new", 1 },
-    { "oR", "|=", 2 },
-    { "oo", "||", 2 },
-    { "or", "|", 2 },
-    { "pL", "+=", 2 },
-    { "pl", "+", 2 },
-    { "pm", "->*", 2 },
-    { "pp", "++", 1 },
-    { "ps", "+", 1 },
-    { "pt", "->", 2 },
-    { "qu", "?", 3 },
-    { "rM", "%=", 2 },
-    { "rS", ">>=", 2 },
-    { "rm", "%", 2 },
-    { "rs", ">>", 2 },
-    { "sz", " sizeof", 1 }
-  };
-
-  const int num_operators =
-    sizeof (operators) / sizeof (struct operator_code);
-
-  /* The two characters of the operator code are consumed up front.  */
-  int c0 = next_char (dm);
-  int c1 = next_char (dm);
-  /* Search window [lo, hi) into the table.  */
-  int lo = 0;
-  int hi = num_operators;
-
-  DEMANGLE_TRACE ("operator-name", dm);
-
-  /* `v' followed by a digit: vendor-extended operator, named by the
-     <source-name> that follows.  */
-  if (c0 == 'v' && IS_DIGIT (c1))
-    {
-      RETURN_IF_ERROR (result_add (dm, "operator "));
-      RETURN_IF_ERROR (demangle_source_name (dm));
-      *num_args = 0;
-      return STATUS_OK;
-    }
-
-  /* `cv': conversion operator, followed by the converted-to type.  */
-  if (c0 == 'c' && c1 == 'v')
-    {
-      RETURN_IF_ERROR (result_add (dm, "operator "));
-      RETURN_IF_ERROR (demangle_type (dm));
-      *num_args = 0;
-      return STATUS_OK;
-    }
-
-  /* Everything else is looked up in the table.  */
-  for (;;)
-    {
-      int mid = lo + (hi - lo) / 2;
-      const struct operator_code *entry = &operators[mid];
-      int sorts_before;
-
-      if (c0 == entry->code[0] && c1 == entry->code[1])
-        {
-          /* Match: emit it, with the `operator' keyword unless the
-             short form was requested.  */
-          if (!short_name)
-            RETURN_IF_ERROR (result_add (dm, "operator"));
-          RETURN_IF_ERROR (result_add (dm, entry->name));
-          *num_args = entry->num_args;
-
-          return STATUS_OK;
-        }
-
-      /* The window cannot shrink any further: no such code.  */
-      if (mid == lo)
-        return "Unknown code in <operator-name>.";
-
-      /* Continue in the half that can still hold the code.  */
-      sorts_before = (c0 < entry->code[0]
-                      || (c0 == entry->code[0] && c1 < entry->code[1]));
-      if (sorts_before)
-        hi = mid;
-      else
-        lo = mid;
-    }
-}
-
-/* Demangles an <nv-offset>.  The offset is always consumed from the
-   input, but it is only emitted (as ` [nv:<offset>]') in verbose
-   mode.  Returns STATUS_OK on success, or the first error met while
-   reading or emitting the offset.
-
-    <nv-offset> ::= <offset number>   # non-virtual base override  */
-
-static status_t
-demangle_nv_offset (dm)
-     demangling_t dm;
-{
-  dyn_string_t number;
-  status_t status;
-
-  DEMANGLE_TRACE ("h-offset", dm);
-
-  /* Read the (possibly negative) offset into a scratch string.  */
-  number = dyn_string_new (4);
-  if (number == NULL)
-    return STATUS_ALLOCATION_FAILED;
-  status = demangle_number_literally (dm, number, 10, 1);
-
-  /* Don't display the offset unless in verbose mode, and never after
-     a failed read.  */
-  if (flag_verbose)
-    {
-      if (STATUS_NO_ERROR (status))
-        status = result_add (dm, " [nv:");
-      if (STATUS_NO_ERROR (status))
-        status = result_add_string (dm, number);
-      if (STATUS_NO_ERROR (status))
-        status = result_add_char (dm, ']');
-    }
-
-  /* Release the scratch string before reporting any error.  */
-  dyn_string_delete (number);
-  RETURN_IF_ERROR (status);
-  return STATUS_OK;
-}
-
-/* Demangles a <v-offset>.  Both numbers and the `_' between them are
-   always consumed; they are only emitted (as ` [v:<offset>,<vcall>]')
-   in verbose mode.  Returns STATUS_OK on success, or the first error
-   met while reading or emitting.
-
-    <v-offset>  ::= <offset number> _ <virtual offset number>
-			# virtual base override, with vcall offset  */
-
-static status_t
-demangle_v_offset (dm)
-     demangling_t dm;
-{
-  dyn_string_t number;
-  status_t status;
-
-  DEMANGLE_TRACE ("v-offset", dm);
-
-  /* Read the offset into a scratch string.  */
-  number = dyn_string_new (4);
-  if (number == NULL)
-    return STATUS_ALLOCATION_FAILED;
-  status = demangle_number_literally (dm, number, 10, 1);
-
-  /* Don't display the offset unless in verbose mode.  Every emit
-     step is checked, including the trailing comma.  */
-  if (flag_verbose)
-    {
-      if (STATUS_NO_ERROR (status))
-        status = result_add (dm, " [v:");
-      if (STATUS_NO_ERROR (status))
-        status = result_add_string (dm, number);
-      if (STATUS_NO_ERROR (status))
-        status = result_add_char (dm, ',');
-    }
-  dyn_string_delete (number);
-  RETURN_IF_ERROR (status);
-
-  /* Demangle the separator.  */
-  RETURN_IF_ERROR (demangle_char (dm, '_'));
-
-  /* Read the vcall offset into a fresh scratch string.  */
-  number = dyn_string_new (4);
-  if (number == NULL)
-    return STATUS_ALLOCATION_FAILED;
-  status = demangle_number_literally (dm, number, 10, 1);
-
-  /* Don't display the vcall offset unless in verbose mode.  */
-  if (flag_verbose)
-    {
-      if (STATUS_NO_ERROR (status))
-        status = result_add_string (dm, number);
-      if (STATUS_NO_ERROR (status))
-        status = result_add_char (dm, ']');
-    }
-  dyn_string_delete (number);
-  RETURN_IF_ERROR (status);
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <call-offset>.
-
-    <call-offset> ::= h <nv-offset> _
-		  ::= v <v-offset> _  */
-
-static status_t
-demangle_call_offset (dm)
-     demangling_t dm;
-{
-  char kind;
-
-  DEMANGLE_TRACE ("call-offset", dm);
-
-  /* Only `h' (non-virtual) and `v' (virtual) offsets exist.  Nothing
-     is consumed when neither is present.  */
-  kind = peek_char (dm);
-  if (kind != 'h' && kind != 'v')
-    return "Unrecognized <call-offset>.";
-
-  /* Consume the kind letter, then the offset that goes with it.  */
-  advance_char (dm);
-  if (kind == 'h')
-    {
-      RETURN_IF_ERROR (demangle_nv_offset (dm));
-    }
-  else
-    {
-      RETURN_IF_ERROR (demangle_v_offset (dm));
-    }
-
-  /* Either form is terminated by the `_' separator.  */
-  RETURN_IF_ERROR (demangle_char (dm, '_'));
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <special-name>.  
-
-    <special-name> ::= GV <object name>   # Guard variable
-                   ::= TV <type>          # virtual table
-                   ::= TT <type>          # VTT
-                   ::= TI <type>          # typeinfo structure
-		   ::= TS <type>          # typeinfo name  
-
-   Other relevant productions include thunks:
-
-    <special-name> ::= T <call-offset> <base encoding>
- 			 # base is the nominal target function of thunk
-
-    <special-name> ::= Tc <call-offset> <call-offset> <base encoding>
-			 # base is the nominal target function of thunk
-			 # first call-offset is 'this' adjustment
-			 # second call-offset is result adjustment
-
-   where
-
-    <call-offset>  ::= h <nv-offset> _
-		   ::= v <v-offset> _
-
-   Also demangles the special g++ manglings,
-
-    <special-name> ::= TC <type> <offset number> _ <base type>
-                                          # construction vtable
-		   ::= TF <type>	  # typeinfo function (old ABI only)
-		   ::= TJ <type>	  # java Class structure  */
-
-static status_t
-demangle_special_name (dm)
-     demangling_t dm;
-{
-  /* Receives the encode-return-type flag from demangle_name, which is
-     of no interest for special names.  */
-  int unused;
-  char peek = peek_char (dm);
-
-  DEMANGLE_TRACE ("special-name", dm);
-
-  if (peek == 'G')
-    {
-      /* `G' names: a fixed description followed by an object name.  */
-      const char *name_prefix;
-
-      /* Consume the G.  */
-      advance_char (dm);
-      switch (peek_char (dm))
-        {
-        case 'V':
-          /* A guard variable name.  */
-          name_prefix = "guard variable for ";
-          break;
-
-        case 'R':
-          /* A reference temporary.  */
-          name_prefix = "reference temporary for ";
-          break;
-
-        default:
-          return "Unrecognized <special-name>.";
-        }
-
-      /* Consume the second letter, then emit description and name.  */
-      advance_char (dm);
-      RETURN_IF_ERROR (result_add (dm, name_prefix));
-      RETURN_IF_ERROR (demangle_name (dm, &unused));
-    }
-  else if (peek == 'T')
-    {
-      /* Non-NULL for the `T' names that consist of nothing but a
-         fixed description followed by a <type>; those share the
-         emission code after the switch.  */
-      const char *type_prefix = NULL;
-
-      /* Other C++ implementation miscellanea.  Consume the T.  */
-      advance_char (dm);
-
-      switch (peek_char (dm))
-        {
-        case 'V':
-          /* Virtual table.  */
-          type_prefix = "vtable for ";
-          break;
-
-        case 'T':
-          /* VTT structure.  */
-          type_prefix = "VTT for ";
-          break;
-
-        case 'I':
-          /* Typeinfo structure.  */
-          type_prefix = "typeinfo for ";
-          break;
-
-        case 'F':
-          /* Typeinfo function.  Used only in old ABI with new
-             mangling.  */
-          type_prefix = "typeinfo fn for ";
-          break;
-
-        case 'S':
-          /* Character string containing type name, used in
-             typeinfo.  */
-          type_prefix = "typeinfo name for ";
-          break;
-
-        case 'J':
-          /* The java Class variable corresponding to a C++ class.  */
-          type_prefix = "java Class for ";
-          break;
-
-        case 'h':
-          /* Non-virtual thunk.  */
-          advance_char (dm);
-          RETURN_IF_ERROR (result_add (dm, "non-virtual thunk"));
-          RETURN_IF_ERROR (demangle_nv_offset (dm));
-          /* Demangle the separator.  */
-          RETURN_IF_ERROR (demangle_char (dm, '_'));
-          /* Demangle and emit the target name and function type.  */
-          RETURN_IF_ERROR (result_add (dm, " to "));
-          RETURN_IF_ERROR (demangle_encoding (dm));
-          break;
-
-        case 'v':
-          /* Virtual thunk.  */
-          advance_char (dm);
-          RETURN_IF_ERROR (result_add (dm, "virtual thunk"));
-          RETURN_IF_ERROR (demangle_v_offset (dm));
-          /* Demangle the separator.  */
-          RETURN_IF_ERROR (demangle_char (dm, '_'));
-          /* Demangle and emit the target function.  */
-          RETURN_IF_ERROR (result_add (dm, " to "));
-          RETURN_IF_ERROR (demangle_encoding (dm));
-          break;
-
-        case 'c':
-          /* Covariant return thunk: two call offsets, then the
-             target function.  */
-          advance_char (dm);
-          RETURN_IF_ERROR (result_add (dm, "covariant return thunk"));
-          RETURN_IF_ERROR (demangle_call_offset (dm));
-          RETURN_IF_ERROR (demangle_call_offset (dm));
-          RETURN_IF_ERROR (result_add (dm, " to "));
-          RETURN_IF_ERROR (demangle_encoding (dm));
-          break;
-
-        case 'C':
-          /* TC is a special g++ mangling for a construction vtable;
-             strict mode does not recognise it.  */
-          if (flag_strict)
-            return "Unrecognized <special-name>.";
-          {
-            dyn_string_t derived_type;
-            dyn_string_t number;
-            status_t status;
-
-            advance_char (dm);
-            RETURN_IF_ERROR (result_add (dm, "construction vtable for "));
-
-            /* Demangle the derived type off to the side.  */
-            RETURN_IF_ERROR (result_push (dm));
-            RETURN_IF_ERROR (demangle_type (dm));
-            derived_type = (dyn_string_t) result_pop (dm);
-
-            /* Demangle the offset.  */
-            number = dyn_string_new (4);
-            if (number == NULL)
-              {
-                dyn_string_delete (derived_type);
-                return STATUS_ALLOCATION_FAILED;
-              }
-            status = demangle_number_literally (dm, number, 10, 1);
-
-            /* Demangle the underscore separator.  */
-            if (STATUS_NO_ERROR (status))
-              status = demangle_char (dm, '_');
-
-            /* Demangle the base type.  */
-            if (STATUS_NO_ERROR (status))
-              status = demangle_type (dm);
-
-            /* Emit the derived type.  */
-            if (STATUS_NO_ERROR (status))
-              status = result_add (dm, "-in-");
-            if (STATUS_NO_ERROR (status))
-              status = result_add_string (dm, derived_type);
-
-            /* Don't display the offset unless in verbose mode.  An
-               error from an earlier step is kept rather than being
-               overwritten here, and both emit steps are checked.  */
-            if (flag_verbose)
-              {
-                if (STATUS_NO_ERROR (status))
-                  status = result_add_char (dm, ' ');
-                if (STATUS_NO_ERROR (status))
-                  status = result_add_string (dm, number);
-              }
-
-            /* Both scratch strings are released before any error is
-               reported.  */
-            dyn_string_delete (derived_type);
-            dyn_string_delete (number);
-            RETURN_IF_ERROR (status);
-          }
-          break;
-
-        default:
-          return "Unrecognized <special-name>.";
-        }
-
-      /* Shared tail of the "description + <type>" names: consume the
-         second letter, then emit the description and the type.  */
-      if (type_prefix != NULL)
-        {
-          advance_char (dm);
-          RETURN_IF_ERROR (result_add (dm, type_prefix));
-          RETURN_IF_ERROR (demangle_type (dm));
-        }
-    }
-  else
-    return STATUS_ERROR;
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <ctor-dtor-name>.  
-   
-    <ctor-dtor-name>
-                   ::= C1  # complete object (in-charge) ctor
-                   ::= C2  # base object (not-in-charge) ctor
-                   ::= C3  # complete object (in-charge) allocating ctor
-                   ::= D0  # deleting (in-charge) dtor
-                   ::= D1  # complete object (in-charge) dtor
-                   ::= D2  # base object (not-in-charge) dtor  */
-
-static status_t
-demangle_ctor_dtor_name (dm)
-     demangling_t dm;
-{
-  /* Verbose-mode descriptions, indexed by flavor digit: C1..C3 for
-     constructors and D0..D2 for destructors.  */
-  static const char *const ctor_flavors[] =
-  {
-    "in-charge",
-    "not-in-charge",
-    "allocating"
-  };
-  static const char *const dtor_flavors[] =
-  {
-    "in-charge deleting",
-    "in-charge",
-    "not-in-charge"
-  };
-
-  int flavor;
-  char kind = peek_char (dm);
-
-  DEMANGLE_TRACE ("ctor-dtor-name", dm);
-
-  /* Anything but `C' or `D' is not a <ctor-dtor-name>; nothing is
-     consumed in that case.  */
-  if (kind != 'C' && kind != 'D')
-    return STATUS_ERROR;
-
-  /* Consume the kind letter and read the flavor digit.  */
-  advance_char (dm);
-  flavor = next_char (dm);
-
-  if (kind == 'C')
-    {
-      if (flavor < '1' || flavor > '3')
-        return "Unrecognized constructor.";
-
-      /* A constructor is named after the most recent source name.  */
-      RETURN_IF_ERROR (result_add_string (dm, dm->last_source_name));
-
-      /* Record which kind of constructor this is.  */
-      if (flavor == '1')
-        dm->is_constructor = gnu_v3_complete_object_ctor;
-      else if (flavor == '2')
-        dm->is_constructor = gnu_v3_base_object_ctor;
-      else
-        dm->is_constructor = gnu_v3_complete_object_allocating_ctor;
-
-      /* Print the flavor of the constructor if in verbose mode.  */
-      if (flag_verbose)
-        {
-          RETURN_IF_ERROR (result_add (dm, "["));
-          RETURN_IF_ERROR (result_add (dm, ctor_flavors[flavor - '1']));
-          RETURN_IF_ERROR (result_add_char (dm, ']'));
-        }
-    }
-  else
-    {
-      if (flavor < '0' || flavor > '2')
-        return "Unrecognized destructor.";
-
-      /* A destructor is `~' plus the most recent source name.  */
-      RETURN_IF_ERROR (result_add_char (dm, '~'));
-      RETURN_IF_ERROR (result_add_string (dm, dm->last_source_name));
-
-      /* Record which kind of destructor this is.  */
-      if (flavor == '0')
-        dm->is_destructor = gnu_v3_deleting_dtor;
-      else if (flavor == '1')
-        dm->is_destructor = gnu_v3_complete_object_dtor;
-      else
-        dm->is_destructor = gnu_v3_base_object_dtor;
-
-      /* Print the flavor of the destructor if in verbose mode.  */
-      if (flag_verbose)
-        {
-          RETURN_IF_ERROR (result_add (dm, " ["));
-          RETURN_IF_ERROR (result_add (dm, dtor_flavors[flavor - '0']));
-          RETURN_IF_ERROR (result_add_char (dm, ']'));
-        }
-    }
-
-  return STATUS_OK;
-}
-
-/* Handle pointer, reference, and pointer-to-member cases for
-   demangle_type.  All consecutive `P's, `R's, and 'M's are joined to
-   build a pointer/reference type.  We snarf all these, plus the
-   following <type>, all at once since we need to know whether we have
-   a pointer to data or pointer to function to construct the right
-   output syntax.  C++'s pointer syntax is hairy.  
-
-   This function adds substitution candidates for every nested
-   pointer/reference type it processes, including the outermost, final
-   type, assuming the substitution starts at SUBSTITUTION_START in the
-   demangling result.  For example, if this function demangles
-   `PP3Foo', it will add a substitution for `Foo', `Foo*', and
-   `Foo**', in that order.
-
-   *INSERT_POS is a quantity used internally, when this function calls
-   itself recursively, to figure out where to insert pointer
-   punctuation on the way up.  On entry to this function, INSERT_POS
-   should point to a temporary value, but that value need not be
-   initialized.
-
-     <type> ::= P <type>
-            ::= R <type>
-            ::= <pointer-to-member-type>
-
-     <pointer-to-member-type> ::= M </class/ type> </member/ type>  */
-
-static status_t
-demangle_type_ptr (dm, insert_pos, substitution_start)
-     demangling_t dm;
-     int *insert_pos;
-     int substitution_start;
-{
-  status_t status;
-  int is_substitution_candidate = 1;  /* Cleared only in the default case below.  */
-
-  DEMANGLE_TRACE ("type*", dm);
-
-  /* Scan forward, collecting pointers and references into symbols,
-     until we hit something else.  Then emit the type.  The `P' and
-     `R' cases recurse first, so the innermost type is emitted before
-     any punctuation is inserted on the way back up.  */
-  switch (peek_char (dm))
-    {
-    case 'P':
-      /* A pointer.  Snarf the `P'.  */
-      advance_char (dm);
-      /* Demangle the underlying type.  */
-      RETURN_IF_ERROR (demangle_type_ptr (dm, insert_pos, 
-					  substitution_start));
-      /* Insert an asterisk where we're told to; it doesn't
-	 necessarily go at the end.  If we're doing Java style output, 
-	 there is no pointer symbol.  */
-      if (dm->style != DMGL_JAVA)
-	RETURN_IF_ERROR (result_insert_char (dm, *insert_pos, '*'));
-      /* The next (outermost) pointer or reference character should go
-	 after this one.  Note that *insert_pos is advanced even in
-	 Java style, where no asterisk was inserted.  */
-      ++(*insert_pos);
-      break;
-
-    case 'R':
-      /* A reference.  Snarf the `R'.  */
-      advance_char (dm);
-      /* Demangle the underlying type.  */
-      RETURN_IF_ERROR (demangle_type_ptr (dm, insert_pos, 
-					  substitution_start));
-      /* Insert an ampersand where we're told to; it doesn't
-	 necessarily go at the end.  */
-      RETURN_IF_ERROR (result_insert_char (dm, *insert_pos, '&'));
-      /* The next (outermost) pointer or reference character should go
-	 after this one.  */
-      ++(*insert_pos);
-      break;
-
-    case 'M':
-    {
-      /* A pointer-to-member.  */
-      dyn_string_t class_type;
-      
-      /* Eat the 'M'.  */
-      advance_char (dm);
-      
-      /* Capture the type of which this is a pointer-to-member.  The
-	 class type is demangled into a pushed (temporary) result so
-	 that it can be spliced in at *insert_pos later.  */
-      RETURN_IF_ERROR (result_push (dm));
-      RETURN_IF_ERROR (demangle_type (dm));
-      class_type = (dyn_string_t) result_pop (dm);
-      
-      if (peek_char (dm) == 'F')
-	/* A pointer-to-member function.  We want output along the
-	   lines of `void (C::*) (int, int)'.  Demangle the function
-	   type, which would in this case give `void () (int, int)'
-	   and set *insert_pos to the spot between the first
-	   parentheses.  */
-	status = demangle_type_ptr (dm, insert_pos, substitution_start);
-      else if (peek_char (dm) == 'A')
-	/* A pointer-to-member array variable.  We want output that
-	   looks like `int (Klass::*) [10]'.  Demangle the array type
-	   as `int () [10]', and set *insert_pos to the spot between
-	   the parentheses.  */
-	status = demangle_array_type (dm, insert_pos);
-      else
-        {
-	  /* A pointer-to-member variable.  Demangle the type of the
-             pointed-to member.  */
-	  status = demangle_type (dm);
-	  /* Make it pretty.  */
-	  if (STATUS_NO_ERROR (status)
-	      && !result_previous_char_is_space (dm))
-	    status = result_add_char (dm, ' ');
-	  /* The pointer-to-member notation (e.g. `C::*') follows the
-             member's type.  */
-	  *insert_pos = result_caret_pos (dm);
-	}
-
-      /* Build the pointer-to-member notation.  Both pieces are
-	 inserted at the same position, so the class name, inserted
-	 second, ends up in front of the `::*'.  */
-      if (STATUS_NO_ERROR (status))
-	status = result_insert (dm, *insert_pos, "::*");
-      if (STATUS_NO_ERROR (status))
-	status = result_insert_string (dm, *insert_pos, class_type);
-      /* There may be additional levels of (pointer or reference)
-	 indirection in this type.  If so, the `*' and `&' should be
-	 added after the pointer-to-member notation (e.g. `C::*&' for
-	 a reference to a pointer-to-member of class C).  */
-      *insert_pos += dyn_string_length (class_type) + 3;  /* 3 is the length of "::*".  */
-
-      /* Clean up.  CLASS_TYPE must be released before any error is
-	 propagated, which is why STATUS is checked only afterwards.  */
-      dyn_string_delete (class_type);
-
-      RETURN_IF_ERROR (status);
-    }
-    break;
-
-    case 'F':
-      /* Ooh, tricky, a pointer-to-function.  When we demangle the
-	 function type, the return type should go at the very
-	 beginning.  */
-      *insert_pos = result_caret_pos (dm);
-      /* The parentheses indicate this is a function pointer or
-	 reference type.  */
-      RETURN_IF_ERROR (result_add (dm, "()"));
-      /* Now demangle the function type.  The return type will be
-	 inserted before the `()', and the argument list will go after
-	 it.  */
-      RETURN_IF_ERROR (demangle_function_type (dm, insert_pos));
-      /* We should now have something along the lines of 
-	 `void () (int, int)'.  The pointer or reference characters
-	 have to go inside the first set of parentheses.  *insert_pos
-	 has already been updated to point past the end of the return
-	 type.  Move it one character over so it points inside the
-	 `()'.  */
-      ++(*insert_pos);
-      break;
-
-    case 'A':
-      /* An array pointer or reference.  demangle_array_type will figure
-	 out where the asterisks and ampersands go.  */
-      RETURN_IF_ERROR (demangle_array_type (dm, insert_pos));
-      break;
-
-    default:
-      /* No more pointer or reference tokens; this is therefore a
-	 pointer to data.  Finish up by demangling the underlying
-	 type.  */
-      RETURN_IF_ERROR (demangle_type (dm));
-      /* The pointer or reference characters follow the underlying
-	 type, as in `int*&'.  */
-      *insert_pos = result_caret_pos (dm);
-      /* Because of the production <type> ::= <substitution>,
-	 demangle_type will already have added the underlying type as
-	 a substitution candidate.  Don't do it again.  */
-      is_substitution_candidate = 0;
-      break;
-    }
-  
-  if (is_substitution_candidate)
-    RETURN_IF_ERROR (substitution_add (dm, substitution_start, 0));
-  
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <type>.  
-
-    <type> ::= <builtin-type>
-	   ::= <function-type>
-	   ::= <class-enum-type>
-	   ::= <array-type>
-	   ::= <pointer-to-member-type>
-	   ::= <template-param>
-	   ::= <template-template-param> <template-args>
-           ::= <CV-qualifiers> <type>
-	   ::= P <type>   # pointer-to
-	   ::= R <type>   # reference-to
-	   ::= C <type>   # complex pair (C 2000)
-	   ::= G <type>   # imaginary (C 2000)
-	   ::= U <source-name> <type>     # vendor extended type qualifier
-	   ::= <substitution>  */
-
-static status_t
-demangle_type (dm)
-     demangling_t dm;
-{
-  /* Returns STATUS_OK on success.  On failure the first error status
-     encountered is returned unchanged; this is either a status
-     produced by a helper or one of the message strings below.  */
-
-  /* START is where this type begins for substitution purposes; it
-     is handed to substitution_add and demangle_type_ptr.  */
-  int start = substitution_start (dm);
-  char peek = peek_char (dm);
-  char peek_next;
-  int encode_return_type = 0;
-  /* Remember the current template argument list so that lists pushed
-     while demangling this type can be popped again at the end.  */
-  template_arg_list_t old_arg_list = current_template_arg_list (dm);
-  int insert_pos;
-
-  /* A <type> can be a <substitution>; therefore, this <type> is a
-     substitution candidate unless a special condition holds (see
-     below).  */
-  int is_substitution_candidate = 1;
-
-  DEMANGLE_TRACE ("type", dm);
-
-  /* A <class-enum-type> can start with a digit (a <source-name>), an
-     N (a <nested-name>), or a Z (a <local-name>).  */
-  if (IS_DIGIT ((unsigned char) peek) || peek == 'N' || peek == 'Z')
-    RETURN_IF_ERROR (demangle_class_enum_type (dm, &encode_return_type));
-  /* Lower-case letters begin <builtin-type>s, except for `r', which
-     denotes restrict.  */
-  else if (peek >= 'a' && peek <= 'z' && peek != 'r')
-    {
-      RETURN_IF_ERROR (demangle_builtin_type (dm));
-      /* Built-in types are not substitution candidates.  */
-      is_substitution_candidate = 0;
-    }
-  else
-    switch (peek)
-      {
-      case 'r':
-      case 'V':
-      case 'K':
-	/* CV-qualifiers (including restrict).  We have to demangle
-	   them off to the side, since C++ syntax puts them in a funny
-	   place for qualified pointer and reference types.  */
-	{
-	  status_t status;
-	  dyn_string_t cv_qualifiers = dyn_string_new (24);
-	  int old_caret_position = result_get_caret (dm);
-
-	  if (cv_qualifiers == NULL)
-	    return STATUS_ALLOCATION_FAILED;
-
-	  /* Decode all adjacent CV qualifiers.  Collecting them can
-	     fail when CV_QUALIFIERS cannot grow, so the status must
-	     not be dropped.  */
-	  status = demangle_CV_qualifiers (dm, cv_qualifiers);
-	  /* Emit them, and shift the caret left so that the
-	     underlying type will be emitted before the qualifiers.
-	     The caret is moved only if the qualifiers really were
-	     added to the result.  */
-	  if (STATUS_NO_ERROR (status))
-	    status = result_add_string (dm, cv_qualifiers);
-	  if (STATUS_NO_ERROR (status))
-	    result_shift_caret (dm, -dyn_string_length (cv_qualifiers));
-	  /* Clean up before propagating any error.  */
-	  dyn_string_delete (cv_qualifiers);
-	  RETURN_IF_ERROR (status);
-	  /* Also prepend a blank, if needed.  */
-	  RETURN_IF_ERROR (result_add_char (dm, ' '));
-	  result_shift_caret (dm, -1);
-
-	  /* Demangle the underlying type.  It will be emitted before
-	     the CV qualifiers, since we moved the caret.  */
-	  RETURN_IF_ERROR (demangle_type (dm));
-
-	  /* Put the caret back where it was previously.  */
-	  result_set_caret (dm, old_caret_position);
-	}
-	break;
-
-      case 'F':
-	return "Non-pointer or -reference function type.";
-
-      case 'A':
-	/* A plain array, not reached through a pointer or reference;
-	   no punctuation insertion point is requested.  */
-	RETURN_IF_ERROR (demangle_array_type (dm, NULL));
-	break;
-
-      case 'T':
-	/* It's either a <template-param> or a
-	   <template-template-param>.  In either case, demangle the
-	   `T' token first.  */
-	RETURN_IF_ERROR (demangle_template_param (dm));
-
-	/* Check for a template argument list; if one is found, it's a
-	     <template-template-param> ::= <template-param>
-                                       ::= <substitution>  */
-	if (peek_char (dm) == 'I')
-	  {
-	    /* Add a substitution candidate.  The template parameter
-	       `T' token is a substitution candidate by itself,
-	       without the template argument list.  */
-	    RETURN_IF_ERROR (substitution_add (dm, start, encode_return_type));
-
-	    /* Now demangle the template argument list.  */
-	    RETURN_IF_ERROR (demangle_template_args (dm));
-	    /* The entire type, including the template template
-	       parameter and its argument list, will be added as a
-	       substitution candidate below.  */
-	  }
-
-	break;
-
-      case 'S':
-	/* First check if this is a special substitution.  If it is,
-	   this is a <class-enum-type>.  Special substitutions have a
-	   letter following the `S'; other substitutions have a digit
-	   or underscore.  */
-	peek_next = peek_char_next (dm);
-	/* Cast as in the digit test at the top of this function.  */
-	if (IS_DIGIT ((unsigned char) peek_next) || peek_next == '_')
-	  {
-	    RETURN_IF_ERROR (demangle_substitution (dm, &encode_return_type));
-	    
-	    /* The substituted name may have been a template name.
-	       Check if template arguments follow, and if so, demangle
-	       them.  */
-	    if (peek_char (dm) == 'I')
-	      RETURN_IF_ERROR (demangle_template_args (dm));
-	    else
-	      /* A substitution token is not itself a substitution
-		 candidate.  (However, if the substituted template is
-		 instantiated, the resulting type is.)  */
-	      is_substitution_candidate = 0;
-	  }
-	else
-	  {
-	    /* Now some trickiness.  We have a special substitution
-	       here.  Often, the special substitution provides the
-	       name of a template that's subsequently instantiated,
-	       for instance `SaIcE' => std::allocator<char>.  In these
-	       cases we need to add a substitution candidate for the
-	       entire <class-enum-type> and thus don't want to clear
-	       the is_substitution_candidate flag.
-
-	       However, it's possible that what we have here is a
-	       substitution token representing an entire type, such as
-	       `Ss' => std::string.  In this case, we mustn't add a
-	       new substitution candidate for this substitution token.
-	       To detect this case, remember where the start of the
-	       substitution token is.  */
-	    const char *next = dm->next;
-	    /* Now demangle the <class-enum-type>.  */
-	    RETURN_IF_ERROR 
-	      (demangle_class_enum_type (dm, &encode_return_type));
-	    /* If all that was just demangled is the two-character
-	       special substitution token, suppress the addition of a
-	       new candidate for it.  */
-	    if (dm->next == next + 2)
-	      is_substitution_candidate = 0;
-	  }
-
-	break;
-
-      case 'P':
-      case 'R':
-      case 'M':
-	RETURN_IF_ERROR (demangle_type_ptr (dm, &insert_pos, start));
-	/* demangle_type_ptr adds all applicable substitution
-	   candidates.  */
-	is_substitution_candidate = 0;
-	break;
-
-      case 'C':
-	/* A C99 complex type.  */
-	RETURN_IF_ERROR (result_add (dm, "complex "));
-	advance_char (dm);
-	RETURN_IF_ERROR (demangle_type (dm));
-	break;
-
-      case 'G':
-	/* A C99 imaginary type.  */
-	RETURN_IF_ERROR (result_add (dm, "imaginary "));
-	advance_char (dm);
-	RETURN_IF_ERROR (demangle_type (dm));
-	break;
-
-      case 'U':
-	/* Vendor-extended type qualifier.  */
-	advance_char (dm);
-	RETURN_IF_ERROR (demangle_source_name (dm));
-	RETURN_IF_ERROR (result_add_char (dm, ' '));
-	RETURN_IF_ERROR (demangle_type (dm));
-	break;
-
-      default:
-	return "Unexpected character in <type>.";
-      }
-
-  if (is_substitution_candidate)
-    /* Add a new substitution for the type. If this type was a
-       <template-param>, pass its index since from the point of
-       substitutions; a <template-param> token is a substitution
-       candidate distinct from the type that is substituted for it.  */
-    RETURN_IF_ERROR (substitution_add (dm, start, encode_return_type));
-
-  /* Pop off template argument lists added during mangling of this
-     type.  */
-  pop_to_template_arg_list (dm, old_arg_list);
-
-  return STATUS_OK;
-}
-
-/* C++ source names of builtin types, indexed by the mangled code
-   letter's position in the alphabet ('a' -> 0, 'b' -> 1, etc).  A
-   NULL entry marks a letter that has no name in this table;
-   demangle_builtin_type rejects such a code as unrecognized.  */
-static const char *const builtin_type_names[26] = 
-{
-  "signed char",              /* a */
-  "bool",                     /* b */
-  "char",                     /* c */
-  "double",                   /* d */
-  "long double",              /* e */
-  "float",                    /* f */
-  "__float128",               /* g */
-  "unsigned char",            /* h */
-  "int",                      /* i */
-  "unsigned",                 /* j */
-  NULL,                       /* k: not a <builtin-type> code */
-  "long",                     /* l */
-  "unsigned long",            /* m */
-  "__int128",                 /* n */
-  "unsigned __int128",        /* o */
-  NULL,                       /* p: not a <builtin-type> code */
-  NULL,                       /* q: not a <builtin-type> code */
-  NULL,                       /* r: restrict qualifier, handled by demangle_type */
-  "short",                    /* s */
-  "unsigned short",           /* t */
-  NULL,                       /* u: vendor extended type, handled before table lookup */
-  "void",                     /* v */
-  "wchar_t",                  /* w */
-  "long long",                /* x */
-  "unsigned long long",       /* y */
-  "..."                       /* z: ellipsis */
-};
-
-/* Java source names of builtin types.  Types that aren't valid in Java
-   are also included here - we don't fail if someone attempts to demangle a 
-   C++ symbol in Java style.  The table is indexed like
-   builtin_type_names ('a' -> 0, 'b' -> 1, etc), and a NULL entry
-   likewise marks a letter that has no name in this table.  */
-static const char *const java_builtin_type_names[26] = 
-{
-  "signed char",                /* a */
-  "boolean", /* C++ "bool" */   /* b */
-  "byte", /* C++ "char" */      /* c */
-  "double",                     /* d */
-  "long double",                /* e */
-  "float",                      /* f */
-  "__float128",                 /* g */
-  "unsigned char",              /* h */
-  "int",                        /* i */
-  "unsigned",                   /* j */
-  NULL,                         /* k: not a <builtin-type> code */
-  "long",                       /* l */
-  "unsigned long",              /* m */
-  "__int128",                   /* n */
-  "unsigned __int128",          /* o */
-  NULL,                         /* p: not a <builtin-type> code */
-  NULL,                         /* q: not a <builtin-type> code */
-  NULL,                         /* r: restrict qualifier, handled by demangle_type */
-  "short",                      /* s */
-  "unsigned short",             /* t */
-  NULL,                         /* u: vendor extended type, handled before table lookup */
-  "void",                       /* v */
-  "char", /* C++ "wchar_t" */   /* w */
-  "long", /* C++ "long long" */ /* x */
-  "unsigned long long",         /* y */
-  "..."                         /* z: ellipsis */
-};
-
-/* Demangles and emits a <builtin-type>.  
-
-    <builtin-type> ::= v  # void
-		   ::= w  # wchar_t
-		   ::= b  # bool
-		   ::= c  # char
-		   ::= a  # signed char
-		   ::= h  # unsigned char
-		   ::= s  # short
-		   ::= t  # unsigned short
-		   ::= i  # int
-		   ::= j  # unsigned int
-		   ::= l  # long
-		   ::= m  # unsigned long
-		   ::= x  # long long, __int64
-		   ::= y  # unsigned long long, __int64
-		   ::= n  # __int128
-		   ::= o  # unsigned __int128
-		   ::= f  # float
-		   ::= d  # double
-		   ::= e  # long double, __float80
-		   ::= g  # __float128
-		   ::= z  # ellipsis
-		   ::= u <source-name>    # vendor extended type  */
-
-static status_t
-demangle_builtin_type (dm)
-     demangling_t dm;
-{
-  /* Look at the type code without consuming it yet.  */
-  char letter = peek_char (dm);
-  const char *const *names;
-  const char *spelling;
-
-  DEMANGLE_TRACE ("builtin-type", dm);
-
-  /* A vendor extended type: `u' followed by a <source-name>.  */
-  if (letter == 'u')
-    {
-      advance_char (dm);
-      RETURN_IF_ERROR (demangle_source_name (dm));
-      return STATUS_OK;
-    }
-
-  /* Every other builtin code is a lower-case letter.  */
-  if (letter < 'a' || letter > 'z')
-    return "Non-alphabetic <builtin-type> code.";
-
-  /* Java uses different names for some built-in types.  */
-  if (dm->style == DMGL_JAVA)
-    names = java_builtin_type_names;
-  else
-    names = builtin_type_names;
-
-  /* A NULL table entry means the letter has no builtin type.  */
-  spelling = names[letter - 'a'];
-  if (spelling == NULL)
-    return "Unrecognized <builtin-type> code.";
-
-  /* Emit the name first; the code letter is consumed only after the
-     name was added successfully.  */
-  RETURN_IF_ERROR (result_add (dm, spelling));
-  advance_char (dm);
-  return STATUS_OK;
-}
-
-/* Demangles all consecutive CV-qualifiers (const, volatile, and
-   restrict) at the current position.  The qualifiers are appended to
-   QUALIFIERS.  Returns STATUS_OK on success, or
-   STATUS_ALLOCATION_FAILED if QUALIFIERS cannot be extended.  */
-
-static status_t
-demangle_CV_qualifiers (dm, qualifiers)
-     demangling_t dm;
-     dyn_string_t qualifiers;
-{
-  DEMANGLE_TRACE ("CV-qualifiers", dm);
-
-  for (;;)
-    {
-      /* Spelling of the qualifier at the current position.  */
-      const char *keyword;
-      char code = peek_char (dm);
-
-      if (code == 'r')
-	keyword = "restrict";
-      else if (code == 'V')
-	keyword = "volatile";
-      else if (code == 'K')
-	keyword = "const";
-      else
-	/* Not a qualifier code: the run has ended.  The character is
-	   left unconsumed.  */
-	return STATUS_OK;
-
-      /* Append a separating space and then the keyword; either step
-	 can fail on allocation.  */
-      if (!dyn_string_append_space (qualifiers))
-	return STATUS_ALLOCATION_FAILED;
-      if (!dyn_string_append_cstr (qualifiers, keyword))
-	return STATUS_ALLOCATION_FAILED;
-
-      /* Consume the qualifier code and look at the next one.  */
-      advance_char (dm);
-    }
-}
-
-/* Demangles and emits a <function-type>.  *FUNCTION_NAME_POS is the
-   position in the result string of the start of the function
-   identifier, at which the function's return type will be inserted;
-   *FUNCTION_NAME_POS is updated to position past the end of the
-   function's return type.
-
-    <function-type> ::= F [Y] <bare-function-type> E  */
-
-static status_t
-demangle_function_type (dm, function_name_pos)
-     demangling_t dm;
-     int *function_name_pos;
-{
-  status_t status;
-
-  DEMANGLE_TRACE ("function-type", dm);
-
-  /* The production starts with a mandatory `F'.  */
-  status = demangle_char (dm, 'F');
-
-  /* An optional `Y' follows.  */
-  if (STATUS_NO_ERROR (status) && peek_char (dm) == 'Y')
-    {
-      /* Indicate this function has C linkage if in verbose mode.  */
-      if (flag_verbose)
-	status = result_add (dm, " [extern \"C\"] ");
-      /* Consume the `Y' unless emitting the annotation failed.  */
-      if (STATUS_NO_ERROR (status))
-	advance_char (dm);
-    }
-
-  /* Then the signature itself, closed by an `E'.  */
-  if (STATUS_NO_ERROR (status))
-    status = demangle_bare_function_type (dm, function_name_pos);
-  if (STATUS_NO_ERROR (status))
-    status = demangle_char (dm, 'E');
-
-  RETURN_IF_ERROR (status);
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <bare-function-type>.  RETURN_TYPE_POS is the
-   position in the result string at which the function return type
-   should be inserted.  If RETURN_TYPE_POS is BFT_NO_RETURN_TYPE, the
-   function's return type is assumed not to be encoded.  
-
-    <bare-function-type> ::= <signature type>+  */
-
-static status_t
-demangle_bare_function_type (dm, return_type_pos)
-     demangling_t dm;
-     int *return_type_pos;
-{
-  /* Index of the signature type being decoded: -1 stands for the
-     return type, 0 and up for the parameters.  When no return type
-     is encoded, counting starts directly at the first parameter.  */
-  int sequence = -1;
-
-  if (return_type_pos == BFT_NO_RETURN_TYPE)
-    sequence = 0;
-
-  DEMANGLE_TRACE ("bare-function-type", dm);
-
-  RETURN_IF_ERROR (result_add_char (dm, '('));
-
-  /* Decode signature types until the name ends or an `E' is seen.  */
-  for (; !end_of_name_p (dm) && peek_char (dm) != 'E'; ++sequence)
-    {
-      dyn_string_t ret;
-      status_t status;
-
-      if (sequence != -1)
-	{
-	  /* A parameter type.  Skip `void' parameter types.  One
-	     should only occur as the only type in a parameter list;
-	     in that case, we want to print `foo ()' instead of
-	     `foo (void)'.  */
-	  if (peek_char (dm) == 'v')
-	    {
-	      /* Consume the v.  */
-	      advance_char (dm);
-	      continue;
-	    }
-
-	  /* Separate parameter types by commas.  */
-	  if (sequence > 0)
-	    RETURN_IF_ERROR (result_add (dm, ", "));
-	  RETURN_IF_ERROR (demangle_type (dm));
-	  continue;
-	}
-
-      /* Otherwise this is the return type.  Decode it off to the
-	 side so that it can be spliced in at *RETURN_TYPE_POS.  */
-      RETURN_IF_ERROR (result_push (dm));
-      RETURN_IF_ERROR (demangle_type (dm));
-      ret = (dyn_string_t) result_pop (dm);
-
-      /* Add a space to the end of the type, insert it where we've
-	 been asked to, and move the position past it.  */
-      status = STATUS_OK;
-      if (!dyn_string_append_space (ret))
-	status = STATUS_ALLOCATION_FAILED;
-      else if (!dyn_string_insert (result_string (dm), *return_type_pos,
-				   ret))
-	status = STATUS_ALLOCATION_FAILED;
-      else
-	*return_type_pos += dyn_string_length (ret);
-
-      /* Release the temporary before propagating any failure.  */
-      dyn_string_delete (ret);
-      RETURN_IF_ERROR (status);
-    }
-
-  RETURN_IF_ERROR (result_add_char (dm, ')'));
-
-  /* We should have demangled at least one parameter type (which would
-     be void, for a function that takes no parameters), plus the
-     return type, if we were supposed to demangle that.  */
-  if (sequence == -1)
-    return "Missing function return type.";
-  if (sequence == 0)
-    return "Missing function parameter.";
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <class-enum-type>.  *ENCODE_RETURN_TYPE is set to
-   non-zero if the type is a template-id, zero otherwise.  
-
-    <class-enum-type> ::= <name>  */
-
-static status_t
-demangle_class_enum_type (dm, encode_return_type)
-     demangling_t dm;
-     int *encode_return_type;
-{
-  status_t status;
-
-  DEMANGLE_TRACE ("class-enum-type", dm);
-
-  /* A <class-enum-type> is just a <name>; demangle_name does all of
-     the work, including setting *ENCODE_RETURN_TYPE.  */
-  status = demangle_name (dm, encode_return_type);
-  RETURN_IF_ERROR (status);
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits an <array-type>.  
-
-   If PTR_INSERT_POS is not NULL, the array type is formatted as a
-   pointer or reference to an array, except that asterisk and
-   ampersand punctuation is omitted (since it's not known at this
-   point).  *PTR_INSERT_POS is set to the position in the demangled
-   name at which this punctuation should be inserted.  For example,
-   `A10_i' is demangled to `int () [10]' and *PTR_INSERT_POS points
-   between the parentheses.
-
-   If PTR_INSERT_POS is NULL, the array type is assumed not to be
-   pointer- or reference-qualified.  Then, for example, `A10_i' is
-   demangled simply as `int[10]'.  
-
-    <array-type> ::= A [<dimension number>] _ <element type>  
-                 ::= A <dimension expression> _ <element type>  */
-
-static status_t
-demangle_array_type (dm, ptr_insert_pos)
-     demangling_t dm;
-     int *ptr_insert_pos;
-{
-  status_t status = STATUS_OK;
-  dyn_string_t array_size = NULL;  /* Stays NULL when the bound is omitted.  */
-  char peek;
-
-  DEMANGLE_TRACE ("array-type", dm);
-
-  RETURN_IF_ERROR (demangle_char (dm, 'A'));
-
-  /* Demangle the array size into array_size.  */
-  peek = peek_char (dm);
-  if (peek == '_')
-    /* Array bound is omitted.  This is a C99-style VLA.  */
-    ;
-  else if (IS_DIGIT (peek_char (dm))) 
-    {
-      /* It looks like a constant array bound.  */
-      array_size = dyn_string_new (10);
-      if (array_size == NULL)
-	return STATUS_ALLOCATION_FAILED;
-      status = demangle_number_literally (dm, array_size, 10, 0);
-    }
-  else
-    {
-      /* Anything else must be an expression for a non-constant array
-	 bound.  This happens if the array type occurs in a template
-	 and the array bound references a template parameter.  The
-	 expression is demangled into a pushed (temporary) result,
-	 which is then popped into array_size.  */
-      RETURN_IF_ERROR (result_push (dm));
-      RETURN_IF_ERROR (demangle_expression_v3 (dm));
-      array_size = (dyn_string_t) result_pop (dm);
-    }
-  /* array_size may have been allocated by now, so we can't use
-     RETURN_IF_ERROR until it's been deallocated.  */
-
-  /* Demangle the base type of the array.  */
-  if (STATUS_NO_ERROR (status))
-    status = demangle_char (dm, '_');
-  if (STATUS_NO_ERROR (status))
-    status = demangle_type (dm);
-
-  if (ptr_insert_pos != NULL)
-    {
-      /* This array is actually part of a pointer- or
-	 reference-to-array type.  Format appropriately, except we
-	 don't know which and how much punctuation to use.  */
-      if (STATUS_NO_ERROR (status))
-	status = result_add (dm, " () ");
-      /* Let the caller know where to insert the punctuation.  When
-	 the " () " above was added, two characters back from the
-	 caret is the spot between its parentheses.  */
-      *ptr_insert_pos = result_caret_pos (dm) - 2;
-    }
-
-  /* Emit the array dimension syntax.  With no array_size this is
-     just `[]'.  */
-  if (STATUS_NO_ERROR (status))
-    status = result_add_char (dm, '[');
-  if (STATUS_NO_ERROR (status) && array_size != NULL)
-    status = result_add_string (dm, array_size);
-  if (STATUS_NO_ERROR (status))
-    status = result_add_char (dm, ']');
-  if (array_size != NULL)
-    dyn_string_delete (array_size);
-  
-  RETURN_IF_ERROR (status);
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <template-param>.  
-
-    <template-param> ::= T_       # first template parameter
-                     ::= T <parameter-2 number> _  */
-
-static status_t
-demangle_template_param (dm)
-     demangling_t dm;
-{
-  /* Index into the argument list; `T_' denotes index zero.  */
-  int parm_number = 0;
-  template_arg_list_t args_in_scope = current_template_arg_list (dm);
-  string_list_t replacement;
-
-  DEMANGLE_TRACE ("template-param", dm);
-
-  /* Make sure there is a template argument list in which to look up
-     this parameter reference.  */
-  if (args_in_scope == NULL)
-    return "Template parameter outside of template.";
-
-  RETURN_IF_ERROR (demangle_char (dm, 'T'));
-
-  /* `T <number> _' refers to the parameter at index number + 1.  */
-  if (peek_char (dm) != '_')
-    {
-      RETURN_IF_ERROR (demangle_number (dm, &parm_number, 10, 0));
-      ++parm_number;
-    }
-  RETURN_IF_ERROR (demangle_char (dm, '_'));
-
-  /* Look the argument up; a NULL result means that parm_number
-     exceeded the number of arguments in the current template
-     argument list.  */
-  replacement = template_arg_list_get_arg (args_in_scope, parm_number);
-  if (replacement == NULL)
-    return "Template parameter number out of bounds.";
-
-  /* Emit the argument's text in place of the parameter.  */
-  RETURN_IF_ERROR (result_add_string (dm, (dyn_string_t) replacement));
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <template-args>.  
-
-    <template-args> ::= I <template-arg>+ E  */
-
-static status_t
-demangle_template_args_1 (dm, arg_list)
-     demangling_t dm;
-     template_arg_list_t arg_list;
-{
-  /* Each demangled argument is both emitted into the result and
-     stored in ARG_LIST.  Returns STATUS_OK on success, otherwise the
-     first error status encountered.  */
-
-  /* Nonzero until the first argument has been handled; used to put
-     ", " between arguments.  */
-  int first = 1;
-
-  DEMANGLE_TRACE ("template-args", dm);
-
-  RETURN_IF_ERROR (demangle_char (dm, 'I'));
-  RETURN_IF_ERROR (result_open_template_list (dm));
-  do
-    {
-      string_list_t arg;
-      status_t status;
-
-      if (first)
-	first = 0;
-      else
-	RETURN_IF_ERROR (result_add (dm, ", "));
-
-      /* Capture the template arg.  */
-      RETURN_IF_ERROR (result_push (dm));
-      RETURN_IF_ERROR (demangle_template_arg (dm));
-      arg = result_pop (dm);
-
-      /* Emit it in the demangled name.  At this point ARG belongs to
-	 nobody but us, so it has to be released here if emitting
-	 fails; otherwise it would leak.  */
-      status = result_add_string (dm, (dyn_string_t) arg);
-      if (!STATUS_NO_ERROR (status))
-	{
-	  dyn_string_delete ((dyn_string_t) arg);
-	  return status;
-	}
-
-      /* Save it for use in expanding <template-param>s.  */
-      template_arg_list_add_arg (arg_list, arg);
-    }
-  while (peek_char (dm) != 'E');
-  /* Append the '>'.  */
-  RETURN_IF_ERROR (result_close_template_list (dm));
-
-  /* Consume the 'E'.  */
-  advance_char (dm);
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <template-args>, and makes the demangled
-   arguments current for later <template-param> expansion.
-
-   A new template argument list is allocated and filled in by
-   demangle_template_args_1.  While the arguments are being demangled,
-   DM's last_source_name is replaced by a fresh string; the previous
-   value is restored afterwards, whether or not demangling succeeded.
-
-   On success the new list is pushed onto DM's stack of template
-   argument lists and STATUS_OK is returned.  On failure the list is
-   deleted and the error status is returned;
-   STATUS_ALLOCATION_FAILED is returned if either allocation
-   fails.  */
-
-static status_t
-demangle_template_args (dm)
-     demangling_t dm;
-{
-  dyn_string_t old_last_source_name;
-  dyn_string_t new_name;
-  template_arg_list_t arg_list = template_arg_list_new ();
-  status_t status;
-
-  if (arg_list == NULL)
-    return STATUS_ALLOCATION_FAILED;
-
-  /* Preserve the most recently demangled source name.  */
-  old_last_source_name = dm->last_source_name;
-  new_name = dyn_string_new (0);
-
-  if (new_name == NULL)
-    {
-      /* Nothing has been changed in DM yet; just drop the list.  */
-      template_arg_list_delete (arg_list);
-      return STATUS_ALLOCATION_FAILED;
-    }
-
-  dm->last_source_name = new_name;
-  
-  status = demangle_template_args_1 (dm, arg_list);
-  /* Restore the most recent demangled source name.  The string
-     currently installed in DM is the one that gets deleted.  */
-  dyn_string_delete (dm->last_source_name);
-  dm->last_source_name = old_last_source_name;
-
-  if (!STATUS_NO_ERROR (status))
-    {
-      template_arg_list_delete (arg_list);
-      return status;
-    }
-
-  /* Push the list onto the top of the stack of template argument
-     lists, so that arguments from it are used from now on when
-     expanding <template-param>s.  */
-  push_template_arg_list (dm, arg_list);
-
-  return STATUS_OK;
-}
-
-/* This function, which does not correspond to a production in the
-   mangling spec, handles the `literal' production for both
-   <template-arg> and <expr-primary>.  It does not expect or consume
-   the initial `L' or final `E'.  The demangling is given by:
-
-     <literal> ::= <type> </value/ number>
-
-   and the emitted output is `(type)number'.  */
-
-static status_t
-demangle_literal (dm)
-     demangling_t dm;
-{
-  char peek = peek_char (dm);
-  dyn_string_t value_string;
-  status_t status;
-
-  DEMANGLE_TRACE ("literal", dm);
-
-  if (!flag_verbose && peek >= 'a' && peek <= 'z')
-    {
-      /* If not in verbose mode and this is a builtin type, see if we
-         can produce simpler numerical output.  In particular, for
-         integer types shorter than `long', just write the number
-         without type information; for bools, write `true' or `false'.
-         Other refinements could be made here too.  */
-
-      /* This constant string is used to map from <builtin-type> codes
-         (26 letters of the alphabet) to codes that determine how the
-         value will be displayed.  The codes are:
-           b: display as bool
-           i: display as int
-           l: display as long
-         A space means the value will be represented using cast
-         notation.  The literal holds 25 characters, so `z' indexes
-         the terminating NUL, which matches none of the codes and is
-         therefore handled like a space.  */
-      static const char *const code_map = "ibi    iii ll     ii  i  ";
-
-      char code = code_map[peek - 'a'];
-      /* FIXME: Implement demangling of floats and doubles.  No entry
-         in code_map is currently `u', so this test never fires.  */
-      if (code == 'u')
-        return STATUS_UNIMPLEMENTED;
-      if (code == 'b')
-        {
-          /* It's a boolean.  */
-          char value;
-
-          /* Consume the b.  */
-          advance_char (dm);
-          /* Look at the next character.  It should be 0 or 1,
-             corresponding to false or true, respectively.  */
-          value = peek_char (dm);
-          if (value == '0')
-            RETURN_IF_ERROR (result_add (dm, "false"));
-          else if (value == '1')
-            RETURN_IF_ERROR (result_add (dm, "true"));
-          else
-            return "Unrecognized bool constant.";
-          /* Consume the 0 or 1.  */
-          advance_char (dm);
-          return STATUS_OK;
-        }
-      else if (code == 'i' || code == 'l')
-        {
-          /* It's an integer or long.  */
-
-          /* Consume the type character.  */
-          advance_char (dm);
-
-          /* Demangle the number and write it out.  Check the
-             allocation, as the cast-notation path below does.  */
-          value_string = dyn_string_new (0);
-          if (value_string == NULL)
-            return STATUS_ALLOCATION_FAILED;
-
-          status = demangle_number_literally (dm, value_string, 10, 1);
-          if (STATUS_NO_ERROR (status))
-            status = result_add_string (dm, value_string);
-          /* For long integers, append an l.  */
-          if (code == 'l' && STATUS_NO_ERROR (status))
-            status = result_add_char (dm, code);
-          dyn_string_delete (value_string);
-
-          RETURN_IF_ERROR (status);
-          return STATUS_OK;
-        }
-      /* ...else code == ' ', so fall through to represent this
-         literal's type explicitly using cast syntax.  */
-    }
-
-  /* Cast notation: `(type)number'.  */
-  RETURN_IF_ERROR (result_add_char (dm, '('));
-  RETURN_IF_ERROR (demangle_type (dm));
-  RETURN_IF_ERROR (result_add_char (dm, ')'));
-
-  value_string = dyn_string_new (0);
-  if (value_string == NULL)
-    return STATUS_ALLOCATION_FAILED;
-
-  status = demangle_number_literally (dm, value_string, 10, 1);
-  if (STATUS_NO_ERROR (status))
-    status = result_add_string (dm, value_string);
-  dyn_string_delete (value_string);
-  RETURN_IF_ERROR (status);
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <template-arg>.  
-
-    <template-arg> ::= <type>                     # type
-                   ::= L <type> <value number> E  # literal
-                   ::= LZ <encoding> E            # external name
-                   ::= X <expression> E           # expression  */
-
-static status_t
-demangle_template_arg (dm)
-     demangling_t dm;
-{
-  char lead;
-
-  DEMANGLE_TRACE ("template-arg", dm);
-
-  lead = peek_char (dm);
-
-  if (lead == 'L')
-    {
-      /* Either a literal or an external name; both start with `L'
-         and finish with `E'.  */
-      advance_char (dm);
-
-      if (peek_char (dm) != 'Z')
-        RETURN_IF_ERROR (demangle_literal (dm));
-      else
-        {
-          /* External name: skip the `Z' and demangle the encoding.  */
-          advance_char (dm);
-          /* FIXME: Standard is contradictory here.  */
-          RETURN_IF_ERROR (demangle_encoding (dm));
-        }
-
-      RETURN_IF_ERROR (demangle_char (dm, 'E'));
-    }
-  else if (lead == 'X')
-    {
-      /* An expression enclosed in `X' ... `E'.  */
-      advance_char (dm);
-      RETURN_IF_ERROR (demangle_expression_v3 (dm));
-      RETURN_IF_ERROR (demangle_char (dm, 'E'));
-    }
-  else
-    {
-      /* Anything else is a plain type.  */
-      RETURN_IF_ERROR (demangle_type (dm));
-    }
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits an <expression>.
-
-    <expression> ::= <unary operator-name> <expression>
-		 ::= <binary operator-name> <expression> <expression>
-		 ::= <expr-primary>  
-                 ::= <scope-expression>  */
-
-static status_t
-demangle_expression_v3 (dm)
-     demangling_t dm;
-{
-  char lead = peek_char (dm);
-  int operand_count;
-  status_t status = STATUS_OK;
-  dyn_string_t op_text;
-
-  DEMANGLE_TRACE ("expression", dm);
-
-  /* <expr-primary>: a literal or a template parameter.  */
-  if (lead == 'L' || lead == 'T')
-    {
-      RETURN_IF_ERROR (demangle_expr_primary (dm));
-      return STATUS_OK;
-    }
-
-  /* <scope-expression>: introduced by `sr'.  */
-  if (lead == 's' && peek_char_next (dm) == 'r')
-    {
-      RETURN_IF_ERROR (demangle_scope_expression (dm));
-      return STATUS_OK;
-    }
-
-  /* Otherwise this is an operator expression.  Binary operations are
-     written in infix form, so demangle the operator name into a
-     separate result string first and hold on to it.  */
-  RETURN_IF_ERROR (result_push (dm));
-  RETURN_IF_ERROR (demangle_operator_name (dm, 1, &operand_count));
-  op_text = (dyn_string_t) result_pop (dm);
-
-  /* With more than one operand, the first one precedes the
-     operator.  */
-  if (operand_count > 1)
-    {
-      status = result_add_char (dm, '(');
-      if (STATUS_NO_ERROR (status))
-        status = demangle_expression_v3 (dm);
-      if (STATUS_NO_ERROR (status))
-        status = result_add_char (dm, ')');
-    }
-
-  /* Write the operator itself, then release the captured name
-     whether or not anything failed.  */
-  if (STATUS_NO_ERROR (status))
-    status = result_add_string (dm, op_text);
-  dyn_string_delete (op_text);
-  RETURN_IF_ERROR (status);
-
-  /* The second operand of a binary operator, or the sole operand of
-     a unary one.  */
-  RETURN_IF_ERROR (result_add_char (dm, '('));
-  RETURN_IF_ERROR (demangle_expression_v3 (dm));
-  RETURN_IF_ERROR (result_add_char (dm, ')'));
-
-  /* A ternary operator has one more operand.  */
-  if (operand_count == 3)
-    {
-      RETURN_IF_ERROR (result_add (dm, ":("));
-      RETURN_IF_ERROR (demangle_expression_v3 (dm));
-      RETURN_IF_ERROR (result_add_char (dm, ')'));
-    }
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <scope-expression>.  
-
-    <scope-expression> ::= sr <qualifying type> <source-name>
-                       ::= sr <qualifying type> <encoding>  */
-
-static status_t
-demangle_scope_expression (dm)
-     demangling_t dm;
-{
-  status_t status;
-
-  /* Each step runs only if everything before it succeeded: the `sr'
-     prefix, the qualifying type, the `::' separator and finally the
-     encoding.  */
-  status = demangle_char (dm, 's');
-  if (STATUS_NO_ERROR (status))
-    status = demangle_char (dm, 'r');
-  if (STATUS_NO_ERROR (status))
-    status = demangle_type (dm);
-  if (STATUS_NO_ERROR (status))
-    status = result_add (dm, "::");
-  if (STATUS_NO_ERROR (status))
-    status = demangle_encoding (dm);
-
-  RETURN_IF_ERROR (status);
-  return STATUS_OK;
-}
-
-/* Demangles and emits an <expr-primary>.  
-
-    <expr-primary> ::= <template-param>
-		   ::= L <type> <value number> E  # literal
-		   ::= L <mangled-name> E         # external name  */
-
-static status_t
-demangle_expr_primary (dm)
-     demangling_t dm;
-{
-  char lead = peek_char (dm);
-
-  DEMANGLE_TRACE ("expr-primary", dm);
-
-  /* A template parameter stands on its own.  */
-  if (lead == 'T')
-    {
-      RETURN_IF_ERROR (demangle_template_param (dm));
-      return STATUS_OK;
-    }
-
-  /* The only other valid introducer is `L'.  */
-  if (lead != 'L')
-    return STATUS_ERROR;
-
-  /* Skip the `L', then choose between an external name (which starts
-     with an underscore) and a literal.  */
-  advance_char (dm);
-
-  if (peek_char (dm) == '_')
-    RETURN_IF_ERROR (demangle_mangled_name (dm));
-  else
-    RETURN_IF_ERROR (demangle_literal (dm));
-
-  /* Both forms are closed by `E'.  */
-  RETURN_IF_ERROR (demangle_char (dm, 'E'));
-
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <substitution>.  Sets *TEMPLATE_P to non-zero
-   if the substitution is the name of a template, zero otherwise. 
-
-     <substitution> ::= S <seq-id> _
-                    ::= S_
-
-                    ::= St   # ::std::
-                    ::= Sa   # ::std::allocator
-                    ::= Sb   # ::std::basic_string
-                    ::= Ss   # ::std::basic_string<char,
-				    		   ::std::char_traits<char>,
-						   ::std::allocator<char> >
-                    ::= Si   # ::std::basic_istream<char,  
-                                                    std::char_traits<char> >
-                    ::= So   # ::std::basic_ostream<char,  
-                                                    std::char_traits<char> >
-                    ::= Sd   # ::std::basic_iostream<char, 
-                                                    std::char_traits<char> >
-*/
-
-static status_t
-demangle_substitution (dm, template_p)
-     demangling_t dm;
-     int *template_p;
-{
-  int seq_id;
-  int peek;
-  dyn_string_t text;
-
-  DEMANGLE_TRACE ("substitution", dm);
-
-  RETURN_IF_ERROR (demangle_char (dm, 'S'));
-
-  /* Scan the substitution sequence index.  A missing number denotes
-     the first index.  */
-  peek = peek_char (dm);
-  if (peek == '_')
-    seq_id = -1;
-  /* If the following character is 0-9 or a capital letter, interpret
-     the sequence up to the next underscore as a base-36 substitution
-     index.  */
-  else if (IS_DIGIT ((unsigned char) peek)
-           || (peek >= 'A' && peek <= 'Z'))
-    RETURN_IF_ERROR (demangle_number (dm, &seq_id, 36, 0));
-  else
-    {
-      /* One of the predefined `S<lowercase letter>' abbreviations.
-         Each case emits the expansion; those naming an entity also
-         record the unqualified name that is copied into
-         DM->last_source_name below.  */
-      const char *new_last_source_name = NULL;
-
-      switch (peek)
-        {
-        case 't':
-          /* Note: *TEMPLATE_P is left untouched for `St'.  */
-          RETURN_IF_ERROR (result_add (dm, "std"));
-          break;
-
-        case 'a':
-          RETURN_IF_ERROR (result_add (dm, "std::allocator"));
-          new_last_source_name = "allocator";
-          *template_p = 1;
-          break;
-
-        case 'b':
-          RETURN_IF_ERROR (result_add (dm, "std::basic_string"));
-          new_last_source_name = "basic_string";
-          *template_p = 1;
-          break;
-
-        case 's':
-          if (!flag_verbose)
-            {
-              RETURN_IF_ERROR (result_add (dm, "std::string"));
-              new_last_source_name = "string";
-            }
-          else
-            {
-              RETURN_IF_ERROR (result_add (dm, "std::basic_string<char, std::char_traits<char>, std::allocator<char> >"));
-              new_last_source_name = "basic_string";
-            }
-          *template_p = 0;
-          break;
-
-        case 'i':
-          if (!flag_verbose)
-            {
-              RETURN_IF_ERROR (result_add (dm, "std::istream"));
-              new_last_source_name = "istream";
-            }
-          else
-            {
-              /* The traits class is `char_traits', spelt the same as
-                 in the `So' and `Sd' expansions below.  */
-              RETURN_IF_ERROR (result_add (dm, "std::basic_istream<char, std::char_traits<char> >"));
-              new_last_source_name = "basic_istream";
-            }
-          *template_p = 0;
-          break;
-
-        case 'o':
-          if (!flag_verbose)
-            {
-              RETURN_IF_ERROR (result_add (dm, "std::ostream"));
-              new_last_source_name = "ostream";
-            }
-          else
-            {
-              RETURN_IF_ERROR (result_add (dm, "std::basic_ostream<char, std::char_traits<char> >"));
-              new_last_source_name = "basic_ostream";
-            }
-          *template_p = 0;
-          break;
-
-        case 'd':
-          if (!flag_verbose)
-            {
-              RETURN_IF_ERROR (result_add (dm, "std::iostream"));
-              new_last_source_name = "iostream";
-            }
-          else
-            {
-              RETURN_IF_ERROR (result_add (dm, "std::basic_iostream<char, std::char_traits<char> >"));
-              new_last_source_name = "basic_iostream";
-            }
-          *template_p = 0;
-          break;
-
-        default:
-          return "Unrecognized <substitution>.";
-        }
-
-      /* Consume the character we just processed.  */
-      advance_char (dm);
-
-      if (new_last_source_name != NULL)
-        {
-          if (!dyn_string_copy_cstr (dm->last_source_name,
-                                     new_last_source_name))
-            return STATUS_ALLOCATION_FAILED;
-        }
-
-      return STATUS_OK;
-    }
-
-  /* Look up the substitution text.  Since `S_' is the most recent
-     substitution, `S0_' is the second-most-recent, etc., shift the
-     numbering by one.  */
-  text = substitution_get (dm, seq_id + 1, template_p);
-  if (text == NULL)
-    return "Substitution number out of range.";
-
-  /* Emit the substitution text.  */
-  RETURN_IF_ERROR (result_add_string (dm, text));
-
-  /* Numbered substitutions are terminated by an underscore.  */
-  RETURN_IF_ERROR (demangle_char (dm, '_'));
-  return STATUS_OK;
-}
-
-/* Demangles and emits a <local-name>.  
-
-    <local-name> := Z <function encoding> E <entity name> [<discriminator>]
-                 := Z <function encoding> E s [<discriminator>]  */
-
-static status_t
-demangle_local_name (dm)
-     demangling_t dm;
-{
-  int ignored_template_p;
-
-  DEMANGLE_TRACE ("local-name", dm);
-
-  /* The enclosing function: `Z' <encoding> `E', followed by a scope
-     separator in the output.  */
-  RETURN_IF_ERROR (demangle_char (dm, 'Z'));
-  RETURN_IF_ERROR (demangle_encoding (dm));
-  RETURN_IF_ERROR (demangle_char (dm, 'E'));
-  RETURN_IF_ERROR (result_add (dm, "::"));
-
-  if (peek_char (dm) != 's')
-    {
-      /* A local entity other than a string literal: demangle its
-         name, then an optional discriminator.  */
-      RETURN_IF_ERROR (demangle_name (dm, &ignored_template_p));
-      RETURN_IF_ERROR (demangle_discriminator (dm, 1));
-      return STATUS_OK;
-    }
-
-  /* A local character string literal, marked by `s'.  */
-  RETURN_IF_ERROR (result_add (dm, "string literal"));
-  advance_char (dm);
-  RETURN_IF_ERROR (demangle_discriminator (dm, 0));
-
-  return STATUS_OK;
-}
-
-/* Optionally demangles and emits a <discriminator>.  If there is no
-   <discriminator> at the current position in the mangled string, the
-   discriminator is assumed to be zero.  In verbose mode, emit the
-   discriminator number as ` [#N]', unless SUPPRESS_FIRST is non-zero
-   and the discriminator is zero; outside verbose mode nothing is
-   emitted.
-
-     <discriminator> ::= _ <number>  */
-
-static status_t
-demangle_discriminator (dm, suppress_first)
-     demangling_t dm;
-     int suppress_first;
-{
-  int ordinal;
-
-  /* Nothing is ever written to the demangled name unless verbose
-     mode is on; the mangled text is consumed either way.  */
-
-  if (peek_char (dm) != '_')
-    {
-      /* No explicit discriminator present.  Report the implicit
-         zero unless the caller asked for it to be left out.  */
-      if (!suppress_first && flag_verbose)
-        RETURN_IF_ERROR (result_add (dm, " [#0]"));
-      return STATUS_OK;
-    }
-
-  /* Skip the underscore.  */
-  advance_char (dm);
-  if (flag_verbose)
-    RETURN_IF_ERROR (result_add (dm, " [#"));
-
-  /* A number has to follow the underscore.  */
-  if (!IS_DIGIT ((unsigned char) peek_char (dm)))
-    return STATUS_ERROR;
-
-  RETURN_IF_ERROR (demangle_number (dm, &ordinal, 10, 0));
-
-  if (flag_verbose)
-    {
-      /* The value written is the mangled number plus one.  */
-      RETURN_IF_ERROR (int_to_dyn_string (ordinal + 1,
-                                          (dyn_string_t) dm->result));
-      RETURN_IF_ERROR (result_add_char (dm, ']'));
-    }
-
-  return STATUS_OK;
-}
-
-/* Demangle NAME into RESULT, which must be an initialized
-   dyn_string_t.  On success, returns STATUS_OK.  On failure, returns
-   an error message, and the contents of RESULT are unchanged.  */
-
-static status_t
-cp_demangle (name, result, style)
-     const char *name;
-     dyn_string_t result;
-     int style;
-{
-  status_t status;
-  int length = VG_(strlen) (name);
-
-  /* Only names longer than the bare `_Z' prefix are treated as
-     mangled.  */
-  if (length > 2 && name[0] == '_' && name[1] == 'Z')
-    {
-      demangling_t dm = demangling_new (name, style);
-      if (dm == NULL)
-        return STATUS_ALLOCATION_FAILED;
-
-      status = result_push (dm);
-      if (status != STATUS_OK)
-        {
-          demangling_delete (dm);
-          return status;
-        }
-
-      status = demangle_mangled_name (dm);
-      if (STATUS_NO_ERROR (status))
-        {
-          /* Popping hands ownership of the string to us, so it must
-             be released on every path, including when the copy into
-             RESULT fails.  */
-          dyn_string_t demangled = (dyn_string_t) result_pop (dm);
-          if (!dyn_string_copy (result, demangled))
-            status = STATUS_ALLOCATION_FAILED;
-          dyn_string_delete (demangled);
-        }
-
-      demangling_delete (dm);
-    }
-  else
-    {
-      /* It's evidently not a mangled C++ name.  It could be the name
-         of something with C linkage, though, so just copy NAME into
-         RESULT.  */
-      if (!dyn_string_copy_cstr (result, name))
-        return STATUS_ALLOCATION_FAILED;
-      status = STATUS_OK;
-    }
-
-  return status;
-}
-
-/* Demangle TYPE_NAME into RESULT, which must be an initialized
-   dyn_string_t.  On success, returns STATUS_OK.  On failure, returns
-   an error message, and the contents of RESULT are unchanged.  */
-
-#ifdef IN_LIBGCC2
-static status_t
-cp_demangle_type (type_name, result)
-     const char* type_name;
-     dyn_string_t result;
-{
-  status_t status;
-  /* demangling_new takes the demangling style as a second argument
-     (see its other callers in this file).  */
-  demangling_t dm = demangling_new (type_name, DMGL_GNU_V3);
-
-  if (dm == NULL)
-    return STATUS_ALLOCATION_FAILED;
-
-  /* Demangle the type name.  The demangled name is stored in dm.  */
-  status = result_push (dm);
-  if (status != STATUS_OK)
-    {
-      demangling_delete (dm);
-      return status;
-    }
-
-  status = demangle_type (dm);
-
-  if (STATUS_NO_ERROR (status))
-    {
-      /* The demangling succeeded.  Pop the result out of dm and copy
-         it into RESULT.  The popped string belongs to us and is
-         released whether or not the copy succeeds.  */
-      dyn_string_t demangled = (dyn_string_t) result_pop (dm);
-      if (!dyn_string_copy (result, demangled))
-        status = STATUS_ALLOCATION_FAILED;
-      dyn_string_delete (demangled);
-    }
-
-  /* Clean up on every path, including a failed copy.  */
-  demangling_delete (dm);
-
-  return status;
-}
-
-extern char *__cxa_demangle PARAMS ((const char *, char *, size_t *, int *));
-
-/* ia64 ABI-mandated entry point in the C++ runtime library for performing
-   demangling.  MANGLED_NAME is a NUL-terminated character string
-   containing the name to be demangled.  
-
-   OUTPUT_BUFFER is a region of memory, allocated with malloc, of
-   *LENGTH bytes, into which the demangled name is stored.  If
-   OUTPUT_BUFFER is not long enough, it is expanded using realloc.
-   OUTPUT_BUFFER may instead be NULL; in that case, the demangled name
-   is placed in a region of memory allocated with malloc.  
-
-   If LENGTH is non-NULL, the length of the buffer containing the
-   demangled name is placed in *LENGTH.  
-
-   The return value is a pointer to the start of the NUL-terminated
-   demangled name, or NULL if the demangling fails.  The caller is
-   responsible for deallocating this memory using free.  
-
-   *STATUS is set to one of the following values:
-      0: The demangling operation succeeded.
-     -1: A memory allocation failure occurred.
-     -2: MANGLED_NAME is not a valid name under the C++ ABI mangling rules.
-     -3: One of the arguments is invalid.
-
-   The demangling is performed using the C++ ABI mangling rules, with
-   GNU extensions.  */
-
-char *
-__cxa_demangle (mangled_name, output_buffer, length, status)
-     const char *mangled_name;
-     char *output_buffer;
-     size_t *length;
-     int *status;
-{
-  struct dyn_string demangled_name;
-  status_t result;
-
-  /* Without STATUS there is nowhere to report an outcome.  */
-  if (status == NULL)
-    return NULL;
-
-  if (mangled_name == NULL)
-    {
-      *status = -3;
-      return NULL;
-    }
-
-  if (output_buffer != NULL)
-    {
-      /* The caller supplied a buffer, so its size must be supplied
-         too.  */
-      if (length == NULL)
-        {
-          *status = -3;
-          return NULL;
-        }
-      /* Wrap the caller's buffer in the dyn_string.  */
-      demangled_name.s = output_buffer;
-      demangled_name.length = 0;
-      demangled_name.allocated = *length;
-    }
-  else if (!dyn_string_init (&demangled_name, 0))
-    {
-      /* No buffer was supplied and dyn_string could not allocate
-         one.  */
-      *status = -1;
-      return NULL;
-    }
-
-  /* A leading `_Z' marks a function or variable name; anything else
-     is tried as the name of a type.  */
-  if (mangled_name[0] == '_' && mangled_name[1] == 'Z')
-    result = cp_demangle (mangled_name, &demangled_name, 0);
-  else
-    result = cp_demangle_type (mangled_name, &demangled_name);
-
-  if (result == STATUS_OK)
-    {
-      /* Success.  Report the buffer size if asked to, since the
-         dyn_string functions may have realloced the buffer.  */
-      if (length != NULL)
-        *length = demangled_name.allocated;
-      *status = 0;
-      return dyn_string_buf (&demangled_name);
-    }
-
-  if (result == STATUS_ALLOCATION_FAILED)
-    {
-      /* malloc or realloc failed somewhere during demangling.  */
-      *status = -1;
-      return NULL;
-    }
-
-  /* Any other failure, most probably because MANGLED_NAME is not a
-     valid mangled name.  Release the buffer only if it was not the
-     caller's.  */
-  if (output_buffer == NULL)
-    free (dyn_string_buf (&demangled_name));
-  *status = -2;
-  return NULL;
-}
-
-#else /* !IN_LIBGCC2 */
-
-/* Variant entry point for integration with the existing cplus-dem
-   demangler.  Attempts to demangle MANGLED.  If the demangling
-   succeeds, returns a buffer, allocated with malloc, containing the
-   demangled name.  The caller must deallocate the buffer using free.
-   If the demangling fails, returns NULL.  */
-
-char *
-VG_(cplus_demangle_v3) (mangled)
-     const char* mangled;
-{
-  dyn_string_t demangled;
-  status_t status;
-
-  /* If this isn't a mangled name, don't pretend to demangle it.  */
-  if (VG_(strncmp) (mangled, "_Z", 2) != 0)
-    return NULL;
-
-  /* Create a dyn_string to hold the demangled name.  A failed
-     allocation is handled like the allocation failure below.  */
-  demangled = dyn_string_new (0);
-  if (demangled == NULL)
-    {
-      vg_assert (0);
-      return NULL;
-    }
-
-  /* Attempt the demangling.  */
-  status = cp_demangle ((char *) mangled, demangled, 0);
-
-  if (STATUS_NO_ERROR (status))
-    /* Demangling succeeded.  */
-    {
-      /* Grab the demangled result from the dyn_string.  It was
-         allocated with malloc, so we can return it directly.  */
-      char *return_value = dyn_string_release (demangled);
-      /* Hand back the demangled name.  */
-      return return_value;
-    }
-  else if (status == STATUS_ALLOCATION_FAILED)
-    {
-        vg_assert (0);
-        /*
-      fprintf (stderr, "Memory allocation failed.\n");
-      abort ();
-      */
-      /* Every path must return a value; without this the function
-         would fall off its end if the assertion ever returned.  */
-      return NULL;
-    }
-  else
-    /* Demangling failed.  */
-    {
-      dyn_string_delete (demangled);
-      return NULL;
-    }
-}
-
-/* Demangle a Java symbol.  Java uses a subset of the V3 ABI C++ mangling 
-   conventions, but the output formatting is a little different.
-   This instructs the C++ demangler not to emit pointer characters ("*"), and 
-   to use Java's namespace separator symbol ("." instead of "::").  It then 
-   does an additional pass over the demangled output to replace instances 
-   of JArray<TYPE> with TYPE[].  */
-
-char *
-VG_(java_demangle_v3) (mangled)
-     const char* mangled;
-{
-  dyn_string_t demangled;
-  char *next;
-  char *end;
-  int len;
-  status_t status;
-  int nesting = 0;
-  char *cplus_demangled;
-  char *return_value;
-
-  /* Create a dyn_string to hold the demangled name.  A failed
-     allocation is handled like the allocation failure below.  */
-  demangled = dyn_string_new (0);
-  if (demangled == NULL)
-    {
-      vg_assert (0);
-      return NULL;
-    }
-
-  /* Attempt the demangling.  */
-  status = cp_demangle ((char *) mangled, demangled, DMGL_JAVA);
-
-  if (STATUS_NO_ERROR (status))
-    /* Demangling succeeded.  */
-    {
-      /* Grab the demangled result from the dyn_string. */
-      cplus_demangled = dyn_string_release (demangled);
-    }
-  else if (status == STATUS_ALLOCATION_FAILED)
-    {
-        vg_assert (0);
-        /*
-      fprintf (stderr, "Memory allocation failed.\n");
-      abort ();
-      */
-      /* Never continue past this point: CPLUS_DEMANGLED has not been
-         set on this path.  */
-      return NULL;
-    }
-  else
-    /* Demangling failed.  */
-    {
-      dyn_string_delete (demangled);
-      return NULL;
-    }
-
-  len = VG_(strlen) (cplus_demangled);
-  next = cplus_demangled;
-  end = next + len;
-  /* From here on DEMANGLED is the rewritten output; it is created
-     lazily, the first time a `JArray<' is found.  */
-  demangled = NULL;
-
-  /* Replace occurrences of JArray<TYPE> with TYPE[]. */
-  while (next < end)
-    {
-      char *open_str = VG_(strstr) (next, "JArray<");
-      char *close_str = NULL;
-      /* A closing `>' is only of interest inside a JArray<...>.  */
-      if (nesting > 0)
-        close_str = VG_(strchr) (next, '>');
-
-      if (open_str != NULL && (close_str == NULL || close_str > open_str))
-        {
-          ++nesting;
-
-          if (!demangled)
-            demangled = dyn_string_new(len);
-
-          /* Copy prepending symbols, if any. */
-          if (open_str > next)
-            {
-              /* Terminate the prefix in place so that it can be
-                 appended as a C string.  */
-              open_str[0] = 0;
-              dyn_string_append_cstr (demangled, next);
-            }
-          /* Skip past "JArray<", which is 7 characters long.  */
-          next = open_str + 7;
-        }
-      else if (close_str != NULL)
-        {
-          --nesting;
-
-          /* Copy prepending type symbol, if any. Squash any spurious
-             whitespace. */
-          if (close_str > next && next[0] != ' ')
-            {
-              close_str[0] = 0;
-              dyn_string_append_cstr (demangled, next);
-            }
-          dyn_string_append_cstr (demangled, "[]");
-          next = close_str + 1;
-        }
-      else
-        {
-          /* There are no more arrays. Copy the rest of the symbol, or
-             simply return the original symbol if no changes were made. */
-          if (next == cplus_demangled)
-            return cplus_demangled;
-
-          dyn_string_append_cstr (demangled, next);
-          next = end;
-        }
-    }
-
-  /* If the loop never ran (an empty demangled string), no output
-     string was created; hand back the original buffer rather than
-     releasing a NULL dyn_string.  */
-  if (demangled == NULL)
-    return cplus_demangled;
-
-  free (cplus_demangled);
-
-  return_value = dyn_string_release (demangled);
-  return return_value;
-}
-
-#endif /* IN_LIBGCC2 */
-
-
-/* Demangle NAME in the G++ V3 ABI demangling style, and return either
-   zero, indicating that some error occurred, or a demangling_t
-   holding the results.  */
-static demangling_t
-demangle_v3_with_details (name)
-     const char *name;
-{
-  demangling_t dm;
-  status_t status;
-
-  /* Names that do not begin with `_Z' are rejected outright.  */
-  if (VG_(strncmp) (name, "_Z", 2) != 0)
-    return 0;
-
-  dm = demangling_new (name, DMGL_GNU_V3);
-  if (dm == NULL)
-    {
-      /* Allocation failure is reported through an assertion.  */
-      vg_assert (0);
-    }
-
-  status = result_push (dm);
-  if (!STATUS_NO_ERROR (status))
-    {
-      /* Release DM, then report the failure through an assertion.  */
-      demangling_delete (dm);
-      vg_assert (0);
-    }
-
-  status = demangle_mangled_name (dm);
-  if (!STATUS_NO_ERROR (status))
-    {
-      /* The name could not be demangled; release DM.  */
-      demangling_delete (dm);
-      return 0;
-    }
-
-  /* Success: DM is handed to the caller.  */
-  return dm;
-}
-
-
-/* Return non-zero iff NAME is the mangled form of a constructor name
-   in the G++ V3 ABI demangling style.  Specifically, return:
-   - '1' if NAME is a complete object constructor,
-   - '2' if NAME is a base object constructor, or
-   - '3' if NAME is a complete object allocating constructor.  */
-/*
-enum gnu_v3_ctor_kinds
-is_gnu_v3_mangled_ctor (name)
-     const char *name;
-{
-  demangling_t dm = demangle_v3_with_details (name);
-
-  if (dm)
-    {
-      enum gnu_v3_ctor_kinds result = dm->is_constructor;
-      demangling_delete (dm);
-      return result;
-    }
-  else
-    return 0;
-}
-*/
-
-
-/* Return non-zero iff NAME is the mangled form of a destructor name
-   in the G++ V3 ABI demangling style.  Specifically, return:
-   - '0' if NAME is a deleting destructor,
-   - '1' if NAME is a complete object destructor, or
-   - '2' if NAME is a base object destructor.  */
-/*
-enum gnu_v3_dtor_kinds
-is_gnu_v3_mangled_dtor (name)
-     const char *name;
-{
-  demangling_t dm = demangle_v3_with_details (name);
-
-  if (dm)
-    {
-      enum gnu_v3_dtor_kinds result = dm->is_destructor;
-      demangling_delete (dm);
-      return result;
-    }
-  else
-    return 0;
-}
-*/
-
-#ifdef STANDALONE_DEMANGLER
-
-#include "getopt.h"
-
-static void print_usage
-  PARAMS ((FILE* fp, int exit_value));
-
-/* Non-zero if CHAR is a character that can occur in a mangled name:
-   a letter, a digit, or one of `_', `.' and `$'.  CHAR is expanded
-   several times, so the argument must not have side effects.  */
-#define is_mangled_char(CHAR)                                           \
-  (IS_ALPHA (CHAR) || IS_DIGIT (CHAR)                                   \
-   || (CHAR) == '_' || (CHAR) == '.' || (CHAR) == '$')
-
-/* The name of this program, as invoked.  */
-const char* program_name;
-
-/* Prints usage summary to FP and then exits with EXIT_VALUE.  */
-
-static void
-print_usage (fp, exit_value)
-     FILE* fp;
-     int exit_value;
-{
-  /* Only the first line needs formatting; the remaining lines are
-     fixed text.  */
-  fprintf (fp, "Usage: %s [options] [names ...]\n", program_name);
-  fputs ("Options:\n", fp);
-  fputs ("  -h,--help       Display this message.\n", fp);
-  fputs ("  -s,--strict     Demangle standard names only.\n", fp);
-  fputs ("  -v,--verbose    Produce verbose demanglings.\n", fp);
-  fputs ("If names are provided, they are demangled.  Otherwise filters standard input.\n", fp);
-
-  /* This function does not return.  */
-  exit (exit_value);
-}
-
-/* Option specification for getopt_long.  Each long option takes no
-   argument and maps to the short option letter that main handles
-   (`h', `s' or `v').  The final all-zero entry terminates the
-   table.  */
-static const struct option long_options[] = 
-{
-  { "help",    no_argument, NULL, 'h' },
-  { "strict",  no_argument, NULL, 's' },
-  { "verbose", no_argument, NULL, 'v' },
-  { NULL,      no_argument, NULL, 0   },
-};
-
-/* Main entry for a demangling filter executable.  It will demangle
-   its command line arguments, if any.  If none are provided, it will
-   filter stdin to stdout, replacing any recognized mangled C++ names
-   with their demangled equivalents.  */
-
-int
-main (argc, argv)
-     int argc;
-     char *argv[];
-{
-  status_t status;
-  int i;
-  int opt_char;
-
-  /* Use the program name of this program, as invoked.  */
-  program_name = argv[0];
-
-  /* Parse options.  */
-  do
-    {
-      opt_char = getopt_long (argc, argv, "hsv", long_options, NULL);
-      switch (opt_char)
-        {
-        case '?':  /* Unrecognized option.  */
-          print_usage (stderr, 1);
-          break;
-
-        case 'h':
-          print_usage (stdout, 0);
-          break;
-
-        case 's':
-          flag_strict = 1;
-          break;
-
-        case 'v':
-          flag_verbose = 1;
-          break;
-        }
-    }
-  while (opt_char != -1);
-
-  if (optind == argc)
-    /* No command line arguments were provided.  Filter stdin.  */
-    {
-      dyn_string_t mangled = dyn_string_new (3);
-      dyn_string_t demangled = dyn_string_new (0);
-
-      /* Abort on allocation failures, as is done below.  */
-      if (mangled == NULL || demangled == NULL)
-        {
-          fprintf (stderr, "Memory allocation failed.\n");
-          abort ();
-        }
-
-      /* Read all of input.  Characters are held in an int and
-         compared with EOF, so that both end of file and a read error
-         end the loop.  */
-      for (;;)
-        {
-          int c = getchar ();
-
-          if (c == EOF)
-            break;
-
-          /* The first character of a mangled name is an underscore.  */
-          if (c != '_')
-            {
-              /* It's not a mangled name.  Print the character and go
-                 on.  */
-              putchar (c);
-              continue;
-            }
-          c = getchar ();
-
-          if (c == EOF)
-            {
-              /* Input ended right after an underscore; don't lose
-                 it.  */
-              putchar ('_');
-              break;
-            }
-
-          /* The second character of a mangled name is a capital `Z'.  */
-          if (c != 'Z')
-            {
-              /* It's not a mangled name.  Print the previous
-                 underscore, the character just read, and go on.  */
-              putchar ('_');
-              putchar (c);
-              continue;
-            }
-
-          /* Start keeping track of the candidate mangled name.  */
-          dyn_string_append_char (mangled, '_');
-          dyn_string_append_char (mangled, 'Z');
-
-          /* Pile characters into mangled until we hit one that can't
-             occur in a mangled name.  */
-          c = getchar ();
-          while (c != EOF && is_mangled_char (c))
-            {
-              dyn_string_append_char (mangled, c);
-              c = getchar ();
-            }
-
-          /* Attempt to demangle the name.  */
-          status = cp_demangle (dyn_string_buf (mangled), demangled, 0);
-
-          /* If the demangling succeeded, great!  Print out the
-             demangled version.  */
-          if (STATUS_NO_ERROR (status))
-            fputs (dyn_string_buf (demangled), stdout);
-          /* Abort on allocation failures.  */
-          else if (status == STATUS_ALLOCATION_FAILED)
-            {
-              fprintf (stderr, "Memory allocation failed.\n");
-              abort ();
-            }
-          /* Otherwise, it might not have been a mangled name.  Just
-             print out the original text.  */
-          else
-            fputs (dyn_string_buf (mangled), stdout);
-
-          /* If we haven't hit EOF yet, we've read one character that
-             can't occur in a mangled name, so print it out.  */
-          if (c != EOF)
-            putchar (c);
-
-          /* Clear the candidate mangled name, to start afresh next
-             time we hit a `_Z'.  */
-          dyn_string_clear (mangled);
-
-          if (c == EOF)
-            break;
-        }
-
-      dyn_string_delete (mangled);
-      dyn_string_delete (demangled);
-    }
-  else
-    /* Demangle command line arguments.  */
-    {
-      dyn_string_t result = dyn_string_new (0);
-
-      /* Abort on allocation failures, as is done below.  */
-      if (result == NULL)
-        {
-          fprintf (stderr, "Memory allocation failed.\n");
-          abort ();
-        }
-
-      /* Loop over command line arguments.  */
-      for (i = optind; i < argc; ++i)
-        {
-          /* Attempt to demangle.  */
-          status = cp_demangle (argv[i], result, 0);
-
-          /* If it worked, print the demangled name.  */
-          if (STATUS_NO_ERROR (status))
-            printf ("%s\n", dyn_string_buf (result));
-          /* Abort on allocation failures.  */
-          else if (status == STATUS_ALLOCATION_FAILED)
-            {
-              fprintf (stderr, "Memory allocation failed.\n");
-              abort ();
-            }
-          /* If not, print the error message to stderr instead.  */
-          else
-            fprintf (stderr, "%s\n", status);
-        }
-      dyn_string_delete (result);
-    }
-
-  return 0;
-}
-
-#endif /* STANDALONE_DEMANGLER */
diff --git a/coregrind/demangle/cplus-dem.c b/coregrind/demangle/cplus-dem.c
deleted file mode 100644
index 56c3261391..0000000000
--- a/coregrind/demangle/cplus-dem.c
+++ /dev/null
@@ -1,5264 +0,0 @@
-/* Demangler for GNU C++
-   Copyright 1989, 1991, 1994, 1995, 1996, 1997, 1998, 1999,
-   2000, 2001 Free Software Foundation, Inc.
-   Written by James Clark (jjc@jclark.uucp)
-   Rewritten by Fred Fish (fnf@cygnus.com) for ARM and Lucid demangling
-   Modified by Satish Pai (pai@apollo.hp.com) for HP demangling
-
-This file is part of the libiberty library.
-Libiberty is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public
-License as published by the Free Software Foundation; either
-version 2 of the License, or (at your option) any later version.
-
-Libiberty is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
-
-You should have received a copy of the GNU Library General Public
-License along with libiberty; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* This file exports two functions; cplus_mangle_opname and cplus_demangle.
-
-   This file imports xmalloc and xrealloc, which are like malloc and
-   realloc except that they generate a fatal error if there is no
-   available memory.  */
-
-/* This file lives in both GCC and libiberty.  When making changes, please
-   try not to break either.  */
-
-#define __NO_STRING_INLINES
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "safe-ctype.h"
-#include "vg_include.h"
-
-#include <sys/types.h>
-#include <limits.h>
-#include <string.h>
-#include <stdio.h>
-
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#else
-char * malloc ();
-char * realloc ();
-#endif
-
-#include <demangle.h>
-#include "dyn-string.h"
-#undef CURRENT_DEMANGLING_STYLE
-#define CURRENT_DEMANGLING_STYLE work->options
-
-/*#include "libiberty.h"*/
-
-static char *ada_demangle  PARAMS ((const char *, int));
-
-#define min(X,Y) (((X) < (Y)) ? (X) : (Y))
-
-/* A value at least one greater than the maximum number of characters
-   that will be output when using the `%d' format with `printf'.  */
-#define INTBUF_SIZE 32
-
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
-#endif
-
-#ifndef STANDALONE
-#define xstrdup(ptr) VG_(strdup)(VG_AR_DEMANGLE, ptr)
-#define free(ptr) VG_(free)(VG_AR_DEMANGLE, ptr)
-#define xmalloc(size) VG_(malloc)(VG_AR_DEMANGLE, size)
-#define xrealloc(ptr, size) VG_(realloc)(VG_AR_DEMANGLE, ptr, size)
-#define abort() vg_assert(0)
-#undef strstr
-#define strstr VG_(strstr)
-#define sprintf VG_(sprintf)
-#define strncpy VG_(strncpy)
-#define strncat VG_(strncat)
-#define strchr VG_(strchr)
-#define strpbrk VG_(strpbrk)
-#endif
-
-extern void fancy_abort PARAMS ((void)) ATTRIBUTE_NORETURN;
-
-/* In order to allow a single demangler executable to demangle strings
-   using various common values of CPLUS_MARKER, as well as any specific
-   one set at compile time, we maintain a string containing all the
-   commonly used ones, and check to see if the marker we are looking for
-   is in that string.  CPLUS_MARKER is usually '$' on systems where the
-   assembler can deal with that.  Where the assembler can't, it's usually
-   '.' (but on many systems '.' is used for other things).  We put the
-   current defined CPLUS_MARKER first (which defaults to '$'), followed
-   by the next most common value, followed by an explicit '$' in case
-   the value of CPLUS_MARKER is not '$'.
-
-   We could avoid this if we could just get g++ to tell us what the actual
-   cplus marker character is as part of the debug information, perhaps by
-   ensuring that it is the character that terminates the gcc<n>_compiled
-   marker symbol (FIXME).  */
-
-#if !defined (CPLUS_MARKER)
-#define CPLUS_MARKER '$'
-#endif
-
-enum demangling_styles current_demangling_style = auto_demangling;
-
-static char cplus_markers[] = { CPLUS_MARKER, '.', '$', '\0' };
-
-static char char_str[2] = { '\000', '\000' };
-
-/*
-void
-set_cplus_marker_for_demangling (ch)
-     int ch;
-{
-  cplus_markers[0] = ch;
-}
-*/
-
-typedef struct string		/* Beware: these aren't required to be */
-{				/*  '\0' terminated.  */
-  char *b;			/* pointer to start of string */
-  char *p;			/* pointer after last character */
-  char *e;			/* pointer after end of allocated space */
-} string;
-
-/* Stuff that is shared between sub-routines.
-   Using a shared structure allows cplus_demangle to be reentrant.  */
-
-struct work_stuff
-{
-  int options;
-  char **typevec;
-  char **ktypevec;
-  char **btypevec;
-  int numk;
-  int numb;
-  int ksize;
-  int bsize;
-  int ntypes;
-  int typevec_size;
-  int constructor;
-  int destructor;
-  int static_type;	/* A static member function */
-  int temp_start;       /* index in demangled to start of template args */
-  int type_quals;       /* The type qualifiers.  */
-  int dllimported;	/* Symbol imported from a PE DLL */
-  char **tmpl_argvec;   /* Template function arguments. */
-  int ntmpl_args;       /* The number of template function arguments. */
-  int forgetting_types; /* Nonzero if we are not remembering the types
-			   we see.  */
-  string* previous_argument; /* The last function argument demangled.  */
-  int nrepeats;         /* The number of times to repeat the previous
-			   argument.  */
-};
-
-#define PRINT_ANSI_QUALIFIERS (work -> options & DMGL_ANSI)
-#define PRINT_ARG_TYPES       (work -> options & DMGL_PARAMS)
-
-static const struct optable
-{
-  const char *const in;
-  const char *const out;
-  const int flags;
-} optable[] = {
-  {"nw",	  " new",	DMGL_ANSI},	/* new (1.92,	 ansi) */
-  {"dl",	  " delete",	DMGL_ANSI},	/* new (1.92,	 ansi) */
-  {"new",	  " new",	0},		/* old (1.91,	 and 1.x) */
-  {"delete",	  " delete",	0},		/* old (1.91,	 and 1.x) */
-  {"vn",	  " new []",	DMGL_ANSI},	/* GNU, pending ansi */
-  {"vd",	  " delete []",	DMGL_ANSI},	/* GNU, pending ansi */
-  {"as",	  "=",		DMGL_ANSI},	/* ansi */
-  {"ne",	  "!=",		DMGL_ANSI},	/* old, ansi */
-  {"eq",	  "==",		DMGL_ANSI},	/* old,	ansi */
-  {"ge",	  ">=",		DMGL_ANSI},	/* old,	ansi */
-  {"gt",	  ">",		DMGL_ANSI},	/* old,	ansi */
-  {"le",	  "<=",		DMGL_ANSI},	/* old,	ansi */
-  {"lt",	  "<",		DMGL_ANSI},	/* old,	ansi */
-  {"plus",	  "+",		0},		/* old */
-  {"pl",	  "+",		DMGL_ANSI},	/* ansi */
-  {"apl",	  "+=",		DMGL_ANSI},	/* ansi */
-  {"minus",	  "-",		0},		/* old */
-  {"mi",	  "-",		DMGL_ANSI},	/* ansi */
-  {"ami",	  "-=",		DMGL_ANSI},	/* ansi */
-  {"mult",	  "*",		0},		/* old */
-  {"ml",	  "*",		DMGL_ANSI},	/* ansi */
-  {"amu",	  "*=",		DMGL_ANSI},	/* ansi (ARM/Lucid) */
-  {"aml",	  "*=",		DMGL_ANSI},	/* ansi (GNU/g++) */
-  {"convert",	  "+",		0},		/* old (unary +) */
-  {"negate",	  "-",		0},		/* old (unary -) */
-  {"trunc_mod",	  "%",		0},		/* old */
-  {"md",	  "%",		DMGL_ANSI},	/* ansi */
-  {"amd",	  "%=",		DMGL_ANSI},	/* ansi */
-  {"trunc_div",	  "/",		0},		/* old */
-  {"dv",	  "/",		DMGL_ANSI},	/* ansi */
-  {"adv",	  "/=",		DMGL_ANSI},	/* ansi */
-  {"truth_andif", "&&",		0},		/* old */
-  {"aa",	  "&&",		DMGL_ANSI},	/* ansi */
-  {"truth_orif",  "||",		0},		/* old */
-  {"oo",	  "||",		DMGL_ANSI},	/* ansi */
-  {"truth_not",	  "!",		0},		/* old */
-  {"nt",	  "!",		DMGL_ANSI},	/* ansi */
-  {"postincrement","++",	0},		/* old */
-  {"pp",	  "++",		DMGL_ANSI},	/* ansi */
-  {"postdecrement","--",	0},		/* old */
-  {"mm",	  "--",		DMGL_ANSI},	/* ansi */
-  {"bit_ior",	  "|",		0},		/* old */
-  {"or",	  "|",		DMGL_ANSI},	/* ansi */
-  {"aor",	  "|=",		DMGL_ANSI},	/* ansi */
-  {"bit_xor",	  "^",		0},		/* old */
-  {"er",	  "^",		DMGL_ANSI},	/* ansi */
-  {"aer",	  "^=",		DMGL_ANSI},	/* ansi */
-  {"bit_and",	  "&",		0},		/* old */
-  {"ad",	  "&",		DMGL_ANSI},	/* ansi */
-  {"aad",	  "&=",		DMGL_ANSI},	/* ansi */
-  {"bit_not",	  "~",		0},		/* old */
-  {"co",	  "~",		DMGL_ANSI},	/* ansi */
-  {"call",	  "()",		0},		/* old */
-  {"cl",	  "()",		DMGL_ANSI},	/* ansi */
-  {"alshift",	  "<<",		0},		/* old */
-  {"ls",	  "<<",		DMGL_ANSI},	/* ansi */
-  {"als",	  "<<=",	DMGL_ANSI},	/* ansi */
-  {"arshift",	  ">>",		0},		/* old */
-  {"rs",	  ">>",		DMGL_ANSI},	/* ansi */
-  {"ars",	  ">>=",	DMGL_ANSI},	/* ansi */
-  {"component",	  "->",		0},		/* old */
-  {"pt",	  "->",		DMGL_ANSI},	/* ansi; Lucid C++ form */
-  {"rf",	  "->",		DMGL_ANSI},	/* ansi; ARM/GNU form */
-  {"indirect",	  "*",		0},		/* old */
-  {"method_call",  "->()",	0},		/* old */
-  {"addr",	  "&",		0},		/* old (unary &) */
-  {"array",	  "[]",		0},		/* old */
-  {"vc",	  "[]",		DMGL_ANSI},	/* ansi */
-  {"compound",	  ", ",		0},		/* old */
-  {"cm",	  ", ",		DMGL_ANSI},	/* ansi */
-  {"cond",	  "?:",		0},		/* old */
-  {"cn",	  "?:",		DMGL_ANSI},	/* pseudo-ansi */
-  {"max",	  ">?",		0},		/* old */
-  {"mx",	  ">?",		DMGL_ANSI},	/* pseudo-ansi */
-  {"min",	  "<?",		0},		/* old */
-  {"mn",	  "<?",		DMGL_ANSI},	/* pseudo-ansi */
-  {"nop",	  "",		0},		/* old (for operator=) */
-  {"rm",	  "->*",	DMGL_ANSI},	/* ansi */
-  {"sz",          "sizeof ",    DMGL_ANSI}      /* pseudo-ansi */
-};
-
-/* These values are used to indicate the various type varieties.
-   Every value other than tk_none is non-zero, so that they can be
-   used as `success' values.  */
-typedef enum type_kind_t
-{
-  tk_none,
-  tk_pointer,
-  tk_reference,
-  tk_integral,
-  tk_bool,
-  tk_char,
-  tk_real
-} type_kind_t;
-
-const struct demangler_engine libiberty_demanglers[] =
-{
-  {
-    NO_DEMANGLING_STYLE_STRING,
-    no_demangling,
-    "Demangling disabled"
-  }
-  ,
-  {
-    AUTO_DEMANGLING_STYLE_STRING,
-      auto_demangling,
-      "Automatic selection based on executable"
-  }
-  ,
-  {
-    GNU_DEMANGLING_STYLE_STRING,
-      gnu_demangling,
-      "GNU (g++) style demangling"
-  }
-  ,
-  {
-    LUCID_DEMANGLING_STYLE_STRING,
-      lucid_demangling,
-      "Lucid (lcc) style demangling"
-  }
-  ,
-  {
-    ARM_DEMANGLING_STYLE_STRING,
-      arm_demangling,
-      "ARM style demangling"
-  }
-  ,
-  {
-    HP_DEMANGLING_STYLE_STRING,
-      hp_demangling,
-      "HP (aCC) style demangling"
-  }
-  ,
-  {
-    EDG_DEMANGLING_STYLE_STRING,
-      edg_demangling,
-      "EDG style demangling"
-  }
-  ,
-  {
-    GNU_V3_DEMANGLING_STYLE_STRING,
-    gnu_v3_demangling,
-    "GNU (g++) V3 ABI-style demangling"
-  }
-  ,
-  {
-    JAVA_DEMANGLING_STYLE_STRING,
-    java_demangling,
-    "Java style demangling"
-  }
-  ,
-  {
-    GNAT_DEMANGLING_STYLE_STRING,
-    gnat_demangling,
-    "GNAT style demangling"
-  }
-  ,
-  {
-    NULL, unknown_demangling, NULL
-  }
-};
-
-#define STRING_EMPTY(str)	((str) -> b == (str) -> p)
-#define PREPEND_BLANK(str)	{if (!STRING_EMPTY(str)) \
-    string_prepend(str, " ");}
-#define APPEND_BLANK(str)	{if (!STRING_EMPTY(str)) \
-    string_append(str, " ");}
-#define LEN_STRING(str)         ( (STRING_EMPTY(str))?0:((str)->p - (str)->b))
-
-/* The scope separator appropriate for the language being demangled.  */
-
-#define SCOPE_STRING(work) ((work->options & DMGL_JAVA) ? "." : "::")
-
-#define ARM_VTABLE_STRING "__vtbl__"	/* Lucid/ARM virtual table prefix */
-#define ARM_VTABLE_STRLEN 8		/* strlen (ARM_VTABLE_STRING) */
-
-/* Prototypes for local functions */
-
-static void
-delete_work_stuff PARAMS ((struct work_stuff *));
-
-static void
-delete_non_B_K_work_stuff PARAMS ((struct work_stuff *));
-
-static char *
-mop_up PARAMS ((struct work_stuff *, string *, int));
-
-static void
-squangle_mop_up PARAMS ((struct work_stuff *));
-
-static void
-work_stuff_copy_to_from PARAMS ((struct work_stuff *, struct work_stuff *));
-
-#if 0
-static int
-demangle_method_args PARAMS ((struct work_stuff *, const char **, string *));
-#endif
-
-static char *
-internal_cplus_demangle PARAMS ((struct work_stuff *, const char *));
-
-static int
-demangle_template_template_parm PARAMS ((struct work_stuff *work,
-					 const char **, string *));
-
-static int
-demangle_template PARAMS ((struct work_stuff *work, const char **, string *,
-			   string *, int, int));
-
-static int
-arm_pt PARAMS ((struct work_stuff *, const char *, int, const char **,
-		const char **));
-
-static int
-demangle_class_name PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-demangle_qualified PARAMS ((struct work_stuff *, const char **, string *,
-			    int, int));
-
-static int
-demangle_class PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-demangle_fund_type PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-demangle_signature PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-demangle_prefix PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-gnu_special PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-arm_special PARAMS ((const char **, string *));
-
-static void
-string_need PARAMS ((string *, int));
-
-static void
-string_delete PARAMS ((string *));
-
-static void
-string_init PARAMS ((string *));
-
-static void
-string_clear PARAMS ((string *));
-
-#if 0
-static int
-string_empty PARAMS ((string *));
-#endif
-
-static void
-string_append PARAMS ((string *, const char *));
-
-static void
-string_appends PARAMS ((string *, string *));
-
-static void
-string_appendn PARAMS ((string *, const char *, int));
-
-static void
-string_prepend PARAMS ((string *, const char *));
-
-static void
-string_prependn PARAMS ((string *, const char *, int));
-
-static void
-string_append_template_idx PARAMS ((string *, int));
-
-static int
-get_count PARAMS ((const char **, int *));
-
-static int
-consume_count PARAMS ((const char **));
-
-static int
-consume_count_with_underscores PARAMS ((const char**));
-
-static int
-demangle_args PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-demangle_nested_args PARAMS ((struct work_stuff*, const char**, string*));
-
-static int
-do_type PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-do_arg PARAMS ((struct work_stuff *, const char **, string *));
-
-static void
-demangle_function_name PARAMS ((struct work_stuff *, const char **, string *,
-				const char *));
-
-static int
-iterate_demangle_function PARAMS ((struct work_stuff *,
-				   const char **, string *, const char *));
-
-static void
-remember_type PARAMS ((struct work_stuff *, const char *, int));
-
-static void
-remember_Btype PARAMS ((struct work_stuff *, const char *, int, int));
-
-static int
-register_Btype PARAMS ((struct work_stuff *));
-
-static void
-remember_Ktype PARAMS ((struct work_stuff *, const char *, int));
-
-static void
-forget_types PARAMS ((struct work_stuff *));
-
-static void
-forget_B_and_K_types PARAMS ((struct work_stuff *));
-
-static void
-string_prepends PARAMS ((string *, string *));
-
-static int
-demangle_template_value_parm PARAMS ((struct work_stuff*, const char**,
-				      string*, type_kind_t));
-
-static int
-do_hpacc_template_const_value PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-do_hpacc_template_literal PARAMS ((struct work_stuff *, const char **, string *));
-
-static int
-snarf_numeric_literal PARAMS ((const char **, string *));
-
-/* There is a TYPE_QUAL value for each type qualifier.  They can be
-   combined by bitwise-or to form the complete set of qualifiers for a
-   type.  */
-
-#define TYPE_UNQUALIFIED   0x0
-#define TYPE_QUAL_CONST    0x1
-#define TYPE_QUAL_VOLATILE 0x2
-#define TYPE_QUAL_RESTRICT 0x4
-
-static int
-code_for_qualifier PARAMS ((int));
-
-static const char*
-qualifier_string PARAMS ((int));
-
-static const char*
-demangle_qualifier PARAMS ((int));
-
-static int
-demangle_expression PARAMS ((struct work_stuff *, const char **, string *, 
-			     type_kind_t));
-
-static int
-demangle_integral_value PARAMS ((struct work_stuff *, const char **,
-				 string *));
-
-static int
-demangle_real_value PARAMS ((struct work_stuff *, const char **, string *));
-
-static void
-demangle_arm_hp_template PARAMS ((struct work_stuff *, const char **, int,
-				  string *));
-
-static void
-recursively_demangle PARAMS ((struct work_stuff *, const char **, string *,
-			      int));
-
-static void
-grow_vect PARAMS ((void **, size_t *, size_t, int));
-
-/* Translate count to integer, consuming tokens in the process.
-   Conversion terminates on the first non-digit character.
-
-   TYPE points at the cursor into the mangled name; the cursor is
-   advanced past every digit that is consumed.
-
-   Trying to consume something that isn't a count results in no
-   consumption of input and a return of -1.
-
-   Overflow consumes the rest of the digits, and returns -1.  */
-
-static int
-consume_count (type)
-     const char **type;
-{
-  int count = 0;
-
-  if (! ISDIGIT ((unsigned char)**type))
-    return -1;
-
-  while (ISDIGIT ((unsigned char)**type))
-    {
-      const int digit = **type - '0';
-
-      /* Check for overflow before it can happen.  Signed overflow is
-	 undefined behaviour, so the product cannot be inspected after
-	 the fact.  COUNT * 10 + DIGIT fits in an int exactly when
-	 COUNT <= (INT_MAX - DIGIT) / 10.  */
-      if (count > (INT_MAX - digit) / 10)
-	{
-	  /* Swallow the remaining digits so the caller's cursor ends
-	     up after the whole (unrepresentable) number.  */
-	  while (ISDIGIT ((unsigned char) **type))
-	    (*type)++;
-	  return -1;
-	}
-
-      count = count * 10 + digit;
-      (*type)++;
-    }
-
-  return (count);
-}
-
-
-/* Variant of consume_count for counts that are written either as a
-   single bare digit, or as a digit string with a '_' immediately
-   before and after it.  Returns the count, or -1 on failure (0 is a
-   valid count, so it cannot double as the failure value).  */
-
-static int
-consume_count_with_underscores (mangled)
-     const char **mangled;
-{
-  int value;
-
-  if (**mangled != '_')
-    {
-      /* Bare form: exactly one decimal digit.  */
-      if (!(**mangled >= '0' && **mangled <= '9'))
-	return -1;
-
-      value = **mangled - '0';
-      (*mangled)++;
-      return value;
-    }
-
-  /* Bracketed form: '_' DIGITS '_'.  Step over the leading '_'.  */
-  (*mangled)++;
-  if (!ISDIGIT ((unsigned char)**mangled))
-    return -1;
-
-  value = consume_count (mangled);
-  if (**mangled != '_')
-    /* The trailing underscore was missing. */
-    return -1;
-
-  (*mangled)++;
-  return value;
-}
-
-/* Map the type-qualifier code C onto its TYPE_QUAL value: 'C' gives
-   TYPE_QUAL_CONST, 'V' gives TYPE_QUAL_VOLATILE and 'u' gives
-   TYPE_QUAL_RESTRICT.  Any other code aborts.  */
-
-static int
-code_for_qualifier (c)
-  int c;
-{
-  if (c == 'C')
-    return TYPE_QUAL_CONST;
-
-  if (c == 'V')
-    return TYPE_QUAL_VOLATILE;
-
-  if (c == 'u')
-    return TYPE_QUAL_RESTRICT;
-
-  /* C was an invalid qualifier.  */
-  abort ();
-}
-
-/* Return the printable spelling of the qualifier set TYPE_QUALS, a
-   bitwise-or of TYPE_QUAL_* values (the empty string for
-   TYPE_UNQUALIFIED).  Aborts on any other value.  */
-
-static const char*
-qualifier_string (type_quals)
-     int type_quals;
-{
-  if (type_quals == TYPE_UNQUALIFIED)
-    return "";
-
-  if (type_quals == TYPE_QUAL_CONST)
-    return "const";
-
-  if (type_quals == TYPE_QUAL_VOLATILE)
-    return "volatile";
-
-  if (type_quals == TYPE_QUAL_RESTRICT)
-    return "__restrict";
-
-  if (type_quals == (TYPE_QUAL_CONST | TYPE_QUAL_VOLATILE))
-    return "const volatile";
-
-  if (type_quals == (TYPE_QUAL_CONST | TYPE_QUAL_RESTRICT))
-    return "const __restrict";
-
-  if (type_quals == (TYPE_QUAL_VOLATILE | TYPE_QUAL_RESTRICT))
-    return "volatile __restrict";
-
-  if (type_quals
-      == (TYPE_QUAL_CONST | TYPE_QUAL_VOLATILE | TYPE_QUAL_RESTRICT))
-    return "const volatile __restrict";
-
-  /* TYPE_QUALS was an invalid qualifier set.  */
-  abort ();
-}
-
-/* Return the printable spelling of the single type-qualifier whose
-   code is C.  C must be a valid qualifier code; the lookup goes
-   through code_for_qualifier and then qualifier_string.  */
-
-static const char*
-demangle_qualifier (c)
-  int c;
-{
-  const int quals = code_for_qualifier (c);
-
-  return qualifier_string (quals);
-}
-
-#if 0
-int
-cplus_demangle_opname (opname, result, options)
-     const char *opname;
-     char *result;
-     int options;
-{
-  int len, len1, ret;
-  string type;
-  struct work_stuff work[1];
-  const char *tem;
-
-  len = strlen(opname);
-  result[0] = '\0';
-  ret = 0;
-  memset ((char *) work, 0, sizeof (work));
-  work->options = options;
-
-  if (opname[0] == '_' && opname[1] == '_'
-      && opname[2] == 'o' && opname[3] == 'p')
-    {
-      /* ANSI.  */
-      /* type conversion operator.  */
-      tem = opname + 4;
-      if (do_type (work, &tem, &type))
-	{
-	  strcat (result, "operator ");
-	  strncat (result, type.b, type.p - type.b);
-	  string_delete (&type);
-	  ret = 1;
-	}
-    }
-  else if (opname[0] == '_' && opname[1] == '_'
-	   && ISLOWER((unsigned char)opname[2])
-	   && ISLOWER((unsigned char)opname[3]))
-    {
-      if (opname[4] == '\0')
-	{
-	  /* Operator.  */
-	  size_t i;
-	  for (i = 0; i < ARRAY_SIZE (optable); i++)
-	    {
-	      if (strlen (optable[i].in) == 2
-		  && memcmp (optable[i].in, opname + 2, 2) == 0)
-		{
-		  strcat (result, "operator");
-		  strcat (result, optable[i].out);
-		  ret = 1;
-		  break;
-		}
-	    }
-	}
-      else
-	{
-	  if (opname[2] == 'a' && opname[5] == '\0')
-	    {
-	      /* Assignment.  */
-	      size_t i;
-	      for (i = 0; i < ARRAY_SIZE (optable); i++)
-		{
-		  if (strlen (optable[i].in) == 3
-		      && memcmp (optable[i].in, opname + 2, 3) == 0)
-		    {
-		      strcat (result, "operator");
-		      strcat (result, optable[i].out);
-		      ret = 1;
-		      break;
-		    }
-		}
-	    }
-	}
-    }
-  else if (len >= 3
-	   && opname[0] == 'o'
-	   && opname[1] == 'p'
-	   && strchr (cplus_markers, opname[2]) != NULL)
-    {
-      /* see if it's an assignment expression */
-      if (len >= 10 /* op$assign_ */
-	  && memcmp (opname + 3, "assign_", 7) == 0)
-	{
-	  size_t i;
-	  for (i = 0; i < ARRAY_SIZE (optable); i++)
-	    {
-	      len1 = len - 10;
-	      if ((int) strlen (optable[i].in) == len1
-		  && memcmp (optable[i].in, opname + 10, len1) == 0)
-		{
-		  strcat (result, "operator");
-		  strcat (result, optable[i].out);
-		  strcat (result, "=");
-		  ret = 1;
-		  break;
-		}
-	    }
-	}
-      else
-	{
-	  size_t i;
-	  for (i = 0; i < ARRAY_SIZE (optable); i++)
-	    {
-	      len1 = len - 3;
-	      if ((int) strlen (optable[i].in) == len1
-		  && memcmp (optable[i].in, opname + 3, len1) == 0)
-		{
-		  strcat (result, "operator");
-		  strcat (result, optable[i].out);
-		  ret = 1;
-		  break;
-		}
-	    }
-	}
-    }
-  else if (len >= 5 && memcmp (opname, "type", 4) == 0
-	   && strchr (cplus_markers, opname[4]) != NULL)
-    {
-      /* type conversion operator */
-      tem = opname + 5;
-      if (do_type (work, &tem, &type))
-	{
-	  strcat (result, "operator ");
-	  strncat (result, type.b, type.p - type.b);
-	  string_delete (&type);
-	  ret = 1;
-	}
-    }
-  squangle_mop_up (work);
-  return ret;
-
-}
-#endif /* 0 */
-
-/* Takes operator name as e.g. "++" and returns mangled
-   operator name (e.g. "postincrement_expr"), or NULL if not found.
-
-   If OPTIONS & DMGL_ANSI == 1, return the ANSI name;
-   if OPTIONS & DMGL_ANSI == 0, return the old GNU name.  */
-
-/*
-const char *
-cplus_mangle_opname (opname, options)
-     const char *opname;
-     int options;
-{
-  size_t i;
-  int len;
-
-  len = strlen (opname);
-  for (i = 0; i < ARRAY_SIZE (optable); i++)
-    {
-      if ((int) strlen (optable[i].out) == len
-	  && (options & DMGL_ANSI) == (optable[i].flags & DMGL_ANSI)
-	  && memcmp (optable[i].out, opname, len) == 0)
-	return optable[i].in;
-    }
-  return (0);
-}
-*/
-
-/* Add a routine to set the demangling style to be sure it is valid and
-   allow for any demangler initialization that may be necessary. */
-
-/*
-enum demangling_styles
-cplus_demangle_set_style (style)
-     enum demangling_styles style;
-{
-  const struct demangler_engine *demangler = libiberty_demanglers; 
-
-  for (; demangler->demangling_style != unknown_demangling; ++demangler)
-    if (style == demangler->demangling_style)
-      {
-	current_demangling_style = style;
-	return current_demangling_style;
-      }
-
-  return unknown_demangling;
-}
-*/
-
-/* Do string name to style translation */
-
-/*
-enum demangling_styles
-cplus_demangle_name_to_style (name)
-     const char *name;
-{
-  const struct demangler_engine *demangler = libiberty_demanglers; 
-
-  for (; demangler->demangling_style != unknown_demangling; ++demangler)
-    if (strcmp (name, demangler->demangling_style_name) == 0)
-      return demangler->demangling_style;
-
-  return unknown_demangling;
-}
-*/
-
-/* char *VG_(cplus_demangle) (const char *mangled, int options)
-
-   If MANGLED is a mangled function name produced by GNU C++, then
-   a pointer to a @code{malloc}ed string giving a C++ representation
-   of the name will be returned; otherwise NULL will be returned.
-   It is the caller's responsibility to free the string which
-   is returned.
-
-   If the global current_demangling_style is no_demangling, a copy of
-   MANGLED made with xstrdup is returned instead.
-
-   The OPTIONS arg may contain one or more of the following bits:
-
-   	DMGL_ANSI	ANSI qualifiers such as `const' and `void' are
-			included.
-	DMGL_PARAMS	Function parameters are included.
-
-   If OPTIONS carries no bits of DMGL_STYLE_MASK, the style bits of
-   current_demangling_style are or-ed in.
-
-   For example,
-
-   cplus_demangle ("foo__1Ai", DMGL_PARAMS)		=> "A::foo(int)"
-   cplus_demangle ("foo__1Ai", DMGL_PARAMS | DMGL_ANSI)	=> "A::foo(int)"
-   cplus_demangle ("foo__1Ai", 0)			=> "A::foo"
-
-   cplus_demangle ("foo__1Afe", DMGL_PARAMS)		=> "A::foo(float,...)"
-   cplus_demangle ("foo__1Afe", DMGL_PARAMS | DMGL_ANSI)=> "A::foo(float,...)"
-   cplus_demangle ("foo__1Afe", 0)			=> "A::foo"
-
-   Note that any leading underscores, or other such characters prepended by
-   the compilation system, are presumed to have already been stripped from
-   MANGLED.  */
-
-char *
-VG_(cplus_demangle) (mangled, options)
-     const char *mangled;
-     int options;
-{
-  char *ret;
-  struct work_stuff work[1];
-
-  if (current_demangling_style == no_demangling)
-    return xstrdup (mangled);
-
-  memset ((char *) work, 0, sizeof (work));
-  work->options = options;
-  if ((work->options & DMGL_STYLE_MASK) == 0)
-    work->options |= (int) current_demangling_style & DMGL_STYLE_MASK;
-
-  /* The V3 ABI demangling is implemented elsewhere.  A non-NULL V3
-     result is returned as is.  A NULL result is also returned when
-     GNU_V3_DEMANGLING holds; otherwise control falls through to the
-     schemes below.  NOTE(review): the *_DEMANGLING macros are defined
-     outside this file and presumably test the style bits of
-     work->options -- confirm.  */
-  if (GNU_V3_DEMANGLING || AUTO_DEMANGLING)
-    {
-      ret = VG_(cplus_demangle_v3) (mangled/*, work->options*/);
-      if (ret || GNU_V3_DEMANGLING)
-	return ret;
-    }
-
-  if (JAVA_DEMANGLING)
-    {
-      ret = VG_(java_demangle_v3) (mangled);
-      if (ret)
-        return ret;
-    }
-
-  if (GNAT_DEMANGLING)
-    return ada_demangle(mangled,options);
-
-  ret = internal_cplus_demangle (work, mangled);
-  squangle_mop_up (work);
-  return (ret);
-}
-
-
-/* *OLD_VECT is an array with room for *SIZE objects of ELEMENT_SIZE
-   bytes each.  Make sure it has room for at least MIN_SIZE objects,
-   reallocating with xrealloc and updating *OLD_VECT and *SIZE when it
-   has to grow.  Nothing is touched when it is already big enough.  */
-
-static void
-grow_vect (old_vect, size, min_size, element_size)
-     void **old_vect;
-     size_t *size;
-     size_t min_size;
-     int element_size;
-{
-  size_t capacity;
-
-  /* Already big enough: nothing to do.  */
-  if (*size >= min_size)
-    return;
-
-  /* Double the capacity, but never end up below MIN_SIZE.  */
-  capacity = *size * 2;
-  if (capacity < min_size)
-    capacity = min_size;
-
-  *size = capacity;
-  *old_vect = xrealloc (*old_vect, *size * element_size);
-}
-
-/* Demangle ada names:
-   1. Discard final __{DIGIT}+ or ${DIGIT}+
-   2. Convert other instances of embedded "__" to `.'.
-   3. Discard leading _ada_.
-   4. Remove everything after first ___ if it is followed by 'X'.
-   5. Put symbols that should be suppressed in <...> brackets.
-
-   Returns NULL when none of the rewrites above applied to MANGLED.
-   Otherwise returns a NUL-terminated string in a buffer that is
-   freshly allocated (through grow_vect, i.e. xrealloc) on every call;
-   this function keeps no reference to it, so the caller owns it.
-   OPTION is unused.  */
-
-static char *
-ada_demangle (mangled, option)
-     const char *mangled;
-     int option ATTRIBUTE_UNUSED;
-{
-  int i, j;
-  int len0;
-  const char* p;
-  char *demangled = NULL;
-  int changed;
-  char *demangling_buffer = NULL;
-  size_t demangling_buffer_size = 0;
-
-  changed = 0;
-
-  if (strncmp (mangled, "_ada_", 5) == 0)
-    {
-      mangled += 5;
-      changed = 1;
-    }
-
-  if (mangled[0] == '_' || mangled[0] == '<')
-    goto Suppress;
-
-  /* LEN0 is the number of leading characters of MANGLED that are
-     still part of the name after the trimming steps.  */
-  p = strstr (mangled, "___");
-  if (p == NULL)
-    len0 = strlen (mangled);
-  else
-    {
-      if (p[3] == 'X')
-	{
-	  len0 = p - mangled;
-	  changed = 1;
-	}
-      else
-	goto Suppress;
-    }
-
-  /* Make demangled big enough for possible expansion by operator name.  */
-  grow_vect ((void **) &(demangling_buffer),
-	     &demangling_buffer_size,  2 * len0 + 1,
-	     sizeof (char));
-  demangled = demangling_buffer;
-
-  /* Strip a trailing __{DIGIT}+ or ${DIGIT}+.  LEN0 may be zero (empty
-     name), and the backwards scan may run off the front of the string
-     (name made only of digits), so both indices are range-checked
-     before MANGLED is read.  */
-  if (len0 > 0 && ISDIGIT ((unsigned char) mangled[len0 - 1]))
-    {
-      for (i = len0 - 2; i >= 0 && ISDIGIT ((unsigned char) mangled[i]); i -= 1)
-	;
-      if (i > 1 && mangled[i] == '_' && mangled[i - 1] == '_')
-	{
-	  len0 = i - 1;
-	  changed = 1;
-	}
-      else if (i >= 0 && mangled[i] == '$')
-	{
-	  len0 = i;
-	  changed = 1;
-	}
-    }
-
-  /* Copy any leading non-alphabetic characters unchanged.  */
-  for (i = 0, j = 0; i < len0 && ! ISALPHA ((unsigned char)mangled[i]);
-       i += 1, j += 1)
-    demangled[j] = mangled[i];
-
-  /* Copy the rest, turning each embedded "__" into a single '.'.  */
-  while (i < len0)
-    {
-      if (i < len0 - 2 && mangled[i] == '_' && mangled[i + 1] == '_')
-	{
-	  demangled[j] = '.';
-	  changed = 1;
-	  i += 2; j += 1;
-	}
-      else
-	{
-	  demangled[j] = mangled[i];
-	  i += 1;  j += 1;
-	}
-    }
-  demangled[j] = '\000';
-
-  /* An upper-case letter or a blank in the result means the symbol is
-     to be suppressed.  */
-  for (i = 0; demangled[i] != '\0'; i += 1)
-    if (ISUPPER ((unsigned char)demangled[i]) || demangled[i] == ' ')
-      goto Suppress;
-
-  if (! changed)
-    {
-      /* Nothing was rewritten, so the caller gets NULL; release the
-	 scratch buffer instead of leaking it.  */
-      free (demangling_buffer);
-      return NULL;
-    }
-
-  return demangled;
-
- Suppress:
-  /* Room for MANGLED plus '<', '>' and the terminating NUL.  */
-  grow_vect ((void **) &(demangling_buffer),
-	     &demangling_buffer_size,  strlen (mangled) + 3,
-	     sizeof (char));
-  demangled = demangling_buffer;
-  if (mangled[0] == '<')
-     strcpy (demangled, mangled);
-  else
-    sprintf (demangled, "<%s>", mangled);
-
-  return demangled;
-}
-
-/* This function performs most of what cplus_demangle used to do, but
-   to be able to demangle a name with a B, K or n code, we need to
-   have a longer term memory of what types have been seen. The original
-   now initializes and cleans up the squangle code info, while internal
-   calls go directly to this routine to avoid resetting that info.
-
-   The constructor, destructor, static_type and type_quals fields of
-   WORK are saved on entry and restored before returning.  Returns
-   whatever mop_up produces, or NULL when MANGLED is NULL or empty.  */
-
-static char *
-internal_cplus_demangle (work, mangled)
-     struct work_stuff *work;
-     const char *mangled;
-{
-
-  string decl;
-  int success = 0;
-  char *demangled = NULL;
-  int s1, s2, s3, s4;
-  s1 = work->constructor;
-  s2 = work->destructor;
-  s3 = work->static_type;
-  s4 = work->type_quals;
-  work->constructor = work->destructor = 0;
-  work->type_quals = TYPE_UNQUALIFIED;
-  work->dllimported = 0;
-
-  if ((mangled != NULL) && (*mangled != '\0'))
-    {
-      string_init (&decl);
-
-      /* First check to see if gnu style demangling is active and if the
-	 string to be demangled contains a CPLUS_MARKER.  If so, attempt to
-	 recognize one of the gnu special forms rather than looking for a
-	 standard prefix.  In particular, don't worry about whether there
-	 is a "__" string in the mangled string.  Consider "_$_5__foo" for
-	 example.  If gnu_special does not succeed, demangle_prefix is
-	 tried; demangle_signature then runs only when one of them
-	 succeeded and input remains.  */
-
-      if ((AUTO_DEMANGLING || GNU_DEMANGLING))
-	{
-	  success = gnu_special (work, &mangled, &decl);
-	}
-      if (!success)
-	{
-	  success = demangle_prefix (work, &mangled, &decl);
-	}
-      if (success && (*mangled != '\0'))
-	{
-	  success = demangle_signature (work, &mangled, &decl);
-	}
-      if (work->constructor == 2)
-        {
-          string_prepend (&decl, "global constructors keyed to ");
-          work->constructor = 0;
-        }
-      else if (work->destructor == 2)
-        {
-          string_prepend (&decl, "global destructors keyed to ");
-          work->destructor = 0;
-        }
-      else if (work->dllimported == 1)
-        {
-          string_prepend (&decl, "import stub for ");
-          work->dllimported = 0;
-        }
-      demangled = mop_up (work, &decl, success);
-    }
-  work->constructor = s1;
-  work->destructor = s2;
-  work->static_type = s3;
-  work->type_quals = s4;
-  return demangled;
-}
-
-
-/* Clear out any squangling related storage held by WORK: forget the
-   remembered B and K types via forget_B_and_K_types, then free the
-   btypevec and ktypevec vectors if they were allocated.  */
-static void
-squangle_mop_up (work)
-     struct work_stuff *work;
-{
-  forget_B_and_K_types (work);
-
-  /* Only non-NULL vectors are handed to free.  */
-  if (work->btypevec)
-    free ((char *) work->btypevec);
-
-  if (work->ktypevec)
-    free ((char *) work->ktypevec);
-}
-
-
-/* Copy the work state and storage.
-
-   TO is first emptied with delete_work_stuff and then overwritten
-   with a byte copy of FROM.  After that the typevec, ktypevec,
-   btypevec and tmpl_argvec vectors, the strings they hold, and
-   previous_argument are re-allocated so that TO gets its own copies.
-   NOTE(review): a vector is only re-allocated when its size field in
-   FROM (typevec_size, ksize, bsize, ntmpl_args) is non-zero; otherwise
-   TO keeps the pointer value copied from FROM.  */
-
-static void
-work_stuff_copy_to_from (to, from)
-     struct work_stuff *to;
-     struct work_stuff *from;
-{
-  int i;
-
-  delete_work_stuff (to);
-
-  /* Shallow-copy scalars.  This also copies FROM's pointer fields;
-     those are replaced below.  */
-  memcpy (to, from, sizeof (*to));
-
-  /* Deep-copy dynamic storage.  Each string is duplicated together
-     with its terminating NUL (hence the + 1 on every length).  */
-  if (from->typevec_size)
-    to->typevec
-      = (char **) xmalloc (from->typevec_size * sizeof (to->typevec[0]));
-
-  for (i = 0; i < from->ntypes; i++)
-    {
-      int len = strlen (from->typevec[i]) + 1;
-
-      to->typevec[i] = xmalloc (len);
-      memcpy (to->typevec[i], from->typevec[i], len);
-    }
-
-  if (from->ksize)
-    to->ktypevec
-      = (char **) xmalloc (from->ksize * sizeof (to->ktypevec[0]));
-
-  for (i = 0; i < from->numk; i++)
-    {
-      int len = strlen (from->ktypevec[i]) + 1;
-
-      to->ktypevec[i] = xmalloc (len);
-      memcpy (to->ktypevec[i], from->ktypevec[i], len);
-    }
-
-  if (from->bsize)
-    to->btypevec
-      = (char **) xmalloc (from->bsize * sizeof (to->btypevec[0]));
-
-  for (i = 0; i < from->numb; i++)
-    {
-      int len = strlen (from->btypevec[i]) + 1;
-
-      to->btypevec[i] = xmalloc (len);
-      memcpy (to->btypevec[i], from->btypevec[i], len);
-    }
-
-  if (from->ntmpl_args)
-    to->tmpl_argvec
-      = xmalloc (from->ntmpl_args * sizeof (to->tmpl_argvec[0]));
-
-  for (i = 0; i < from->ntmpl_args; i++)
-    {
-      int len = strlen (from->tmpl_argvec[i]) + 1;
-
-      to->tmpl_argvec[i] = xmalloc (len);
-      memcpy (to->tmpl_argvec[i], from->tmpl_argvec[i], len);
-    }
-
-  if (from->previous_argument)
-    {
-      to->previous_argument = (string*) xmalloc (sizeof (string));
-      string_init (to->previous_argument);
-      string_appends (to->previous_argument, from->previous_argument);
-    }
-}
-
-
-/* Delete dynamic stuff in work_stuff that is not to be re-used.
-
-   This releases the remembered types and their vector, the saved
-   template arguments and the previous_argument string.  The B and K
-   type vectors are left alone; squangle_mop_up handles those.  Every
-   pointer that is freed is reset to NULL, and the matching size or
-   count is reset with it, so WORK stays consistent afterwards.  */
-
-static void
-delete_non_B_K_work_stuff (work)
-     struct work_stuff *work;
-{
-  /* Discard the remembered types, if any.  */
-
-  forget_types (work);
-  if (work -> typevec != NULL)
-    {
-      free ((char *) work -> typevec);
-      work -> typevec = NULL;
-      work -> typevec_size = 0;
-    }
-  if (work->tmpl_argvec)
-    {
-      int i;
-
-      /* Slots may still be NULL if the vector was only partly filled.  */
-      for (i = 0; i < work->ntmpl_args; i++)
-	if (work->tmpl_argvec[i])
-	  free ((char*) work->tmpl_argvec[i]);
-
-      free ((char*) work->tmpl_argvec);
-      work->tmpl_argvec = NULL;
-      /* Keep the count in step with the vector; code that iterates up
-	 to ntmpl_args must not find a stale count with no vector.  */
-      work->ntmpl_args = 0;
-    }
-  if (work->previous_argument)
-    {
-      string_delete (work->previous_argument);
-      free ((char*) work->previous_argument);
-      work->previous_argument = NULL;
-    }
-}
-
-
-/* Delete all dynamic storage in work_stuff: first everything handled
-   by delete_non_B_K_work_stuff, then the B and K type vectors via
-   squangle_mop_up.  */
-static void
-delete_work_stuff (work)
-     struct work_stuff *work;
-{
-  delete_non_B_K_work_stuff (work);
-  squangle_mop_up (work);
-}
-
-
-/* Finish a demangling attempt.
-
-   The non-B/K dynamic storage in WORK is always released.  When
-   SUCCESS is nonzero, DECLP is null terminated and its buffer is
-   returned to the caller; otherwise DECLP is deleted and NULL is
-   returned.  */
-
-static char *
-mop_up (work, declp, success)
-     struct work_stuff *work;
-     string *declp;
-     int success;
-{
-  char *result;
-
-  delete_non_B_K_work_stuff (work);
-
-  if (success)
-    {
-      /* Append the terminating null so the buffer is a C string.  */
-      string_appendn (declp, "", 1);
-      result = declp->b;
-    }
-  else
-    {
-      /* Nothing usable was produced; drop the partial result.  */
-      string_delete (declp);
-      result = NULL;
-    }
-
-  return result;
-}
-
-/*
-
-LOCAL FUNCTION
-
-	demangle_signature -- demangle the signature part of a mangled name
-
-SYNOPSIS
-
-	static int
-	demangle_signature (struct work_stuff *work, const char **mangled,
-			    string *declp);
-
-DESCRIPTION
-
-	Consume and demangle the signature portion of the mangled name.
-
-	DECLP is the string where demangled output is being built.  At
-	entry it contains the demangled root name from the mangled name
-	prefix.  I.E. either a demangled operator name or the root function
-	name.  In some special cases, it may contain nothing.
-
-	*MANGLED points to the current unconsumed location in the mangled
-	name.  As tokens are consumed and demangling is performed, the
-	pointer is updated to continuously point at the next token to
-	be consumed.
-
-	Demangling GNU style mangled names is nasty because there is no
-	explicit token that marks the start of the outermost function
-	argument list.
-
-RETURNS
-
-	Nonzero if the whole signature was demangled, zero otherwise.  */
-
-static int
-demangle_signature (work, mangled, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-{
-  int success = 1;
-  int func_done = 0;
-  int expect_func = 0;
-  int expect_return_type = 0;
-  const char *oldmangled = NULL;
-  string trawname;
-  string tname;
-
-  while (success && (**mangled != '\0'))
-    {
-      switch (**mangled)
-	{
-	case 'Q':
-	  oldmangled = *mangled;
-	  success = demangle_qualified (work, mangled, declp, 1, 0);
-	  if (success)
-	    remember_type (work, oldmangled, *mangled - oldmangled);
-	  if (AUTO_DEMANGLING || GNU_DEMANGLING)
-	    expect_func = 1;
-	  oldmangled = NULL;
-	  break;
-
-        case 'K':
-	  oldmangled = *mangled;
-	  success = demangle_qualified (work, mangled, declp, 1, 0);
-	  if (AUTO_DEMANGLING || GNU_DEMANGLING)
-	    {
-	      expect_func = 1;
-	    }
-	  oldmangled = NULL;
-	  break;
-
-	case 'S':
-	  /* Static member function */
-	  if (oldmangled == NULL)
-	    {
-	      oldmangled = *mangled;
-	    }
-	  (*mangled)++;
-	  work -> static_type = 1;
-	  break;
-
-	case 'C':
-	case 'V':
-	case 'u':
-	  work->type_quals |= code_for_qualifier (**mangled);
-
-	  /* a qualified member function */
-	  if (oldmangled == NULL)
-	    oldmangled = *mangled;
-	  (*mangled)++;
-	  break;
-
-	case 'L':
-	  /* Local class name follows after "Lnnn_" */
-	  if (HP_DEMANGLING)
-	    {
-	      while (**mangled && (**mangled != '_'))
-		(*mangled)++;
-	      if (!**mangled)
-		success = 0;
-	      else
-		(*mangled)++;
-	    }
-	  else
-	    success = 0;
-	  break;
-
-	case '0': case '1': case '2': case '3': case '4':
-	case '5': case '6': case '7': case '8': case '9':
-	  if (oldmangled == NULL)
-	    {
-	      oldmangled = *mangled;
-	    }
-          work->temp_start = -1; /* uppermost call to demangle_class */
-	  success = demangle_class (work, mangled, declp);
-	  if (success)
-	    {
-	      remember_type (work, oldmangled, *mangled - oldmangled);
-	    }
-	  if (AUTO_DEMANGLING || GNU_DEMANGLING || EDG_DEMANGLING)
-	    {
-              /* EDG and others will have the "F", so we let the loop cycle
-                 if we are looking at one. */
-              if (**mangled != 'F')
-                 expect_func = 1;
-	    }
-	  oldmangled = NULL;
-	  break;
-
-	case 'B':
-	  {
-	    string s;
-	    success = do_type (work, mangled, &s);
-	    if (success)
-	      {
-		string_append (&s, SCOPE_STRING (work));
-		string_prepends (declp, &s);
-	      }
-	    /* Release the temporary once it has been prepended, just as
-	       the other do_type results in this function are released;
-	       otherwise its buffer is leaked.  */
-	    string_delete (&s);
-	    oldmangled = NULL;
-	    expect_func = 1;
-	  }
-	  break;
-
-	case 'F':
-	  /* Function */
-	  /* ARM/HP style demangling includes a specific 'F' character after
-	     the class name.  For GNU style, it is just implied.  So we can
-	     safely just consume any 'F' at this point and be compatible
-	     with either style.  */
-
-	  oldmangled = NULL;
-	  func_done = 1;
-	  (*mangled)++;
-
-	  /* For lucid/ARM/HP style we have to forget any types we might
-	     have remembered up to this point, since they were not argument
-	     types.  GNU style considers all types seen as available for
-	     back references.  See comment in demangle_args() */
-
-	  if (LUCID_DEMANGLING || ARM_DEMANGLING || HP_DEMANGLING || EDG_DEMANGLING)
-	    {
-	      forget_types (work);
-	    }
-	  success = demangle_args (work, mangled, declp);
-	  /* After picking off the function args, we expect to either
-	     find the function return type (preceded by an '_') or the
-	     end of the string. */
-	  if (success && (AUTO_DEMANGLING || EDG_DEMANGLING) && **mangled == '_')
-	    {
-	      ++(*mangled);
-              /* At this level, we do not care about the return type. */
-              success = do_type (work, mangled, &tname);
-              string_delete (&tname);
-            }
-
-	  break;
-
-	case 't':
-	  /* G++ Template */
-	  string_init(&trawname);
-	  string_init(&tname);
-	  if (oldmangled == NULL)
-	    {
-	      oldmangled = *mangled;
-	    }
-	  success = demangle_template (work, mangled, &tname,
-				       &trawname, 1, 1);
-	  if (success)
-	    {
-	      remember_type (work, oldmangled, *mangled - oldmangled);
-	    }
-	  string_append (&tname, SCOPE_STRING (work));
-
-	  string_prepends(declp, &tname);
-	  if (work -> destructor & 1)
-	    {
-	      string_prepend (&trawname, "~");
-	      string_appends (declp, &trawname);
-	      work->destructor -= 1;
-	    }
-	  if ((work->constructor & 1) || (work->destructor & 1))
-	    {
-	      string_appends (declp, &trawname);
-	      work->constructor -= 1;
-	    }
-	  string_delete(&trawname);
-	  string_delete(&tname);
-	  oldmangled = NULL;
-	  expect_func = 1;
-	  break;
-
-	case '_':
-	  if ((AUTO_DEMANGLING || GNU_DEMANGLING) && expect_return_type)
-	    {
-	      /* Read the return type. */
-	      string return_type;
-	      string_init (&return_type);
-
-	      (*mangled)++;
-	      success = do_type (work, mangled, &return_type);
-	      APPEND_BLANK (&return_type);
-
-	      string_prepends (declp, &return_type);
-	      string_delete (&return_type);
-	      break;
-	    }
-	  else
-	    /* At the outermost level, we cannot have a return type specified,
-	       so if we run into another '_' at this point we are dealing with
-	       a mangled name that is either bogus, or has been mangled by
-	       some algorithm we don't know how to deal with.  So just
-	       reject the entire demangling.  */
-            /* However, "_nnn" is an expected suffix for alternate entry point
-               numbered nnn for a function, with HP aCC, so skip over that
-               without reporting failure. pai/1997-09-04 */
-            if (HP_DEMANGLING)
-              {
-                (*mangled)++;
-                while (**mangled && ISDIGIT ((unsigned char)**mangled))
-                  (*mangled)++;
-              }
-            else
-	      success = 0;
-	  break;
-
-	case 'H':
-	  if (AUTO_DEMANGLING || GNU_DEMANGLING)
-	    {
-	      /* A G++ template function.  Read the template arguments. */
-	      success = demangle_template (work, mangled, declp, 0, 0,
-					   0);
-	      if (!(work->constructor & 1))
-		expect_return_type = 1;
-	      (*mangled)++;
-	      break;
-	    }
-	  else
-	    /* fall through */
-	    {;}
-
-	default:
-	  if (AUTO_DEMANGLING || GNU_DEMANGLING)
-	    {
-	      /* Assume we have stumbled onto the first outermost function
-		 argument token, and start processing args.  */
-	      func_done = 1;
-	      success = demangle_args (work, mangled, declp);
-	    }
-	  else
-	    {
-	      /* Non-GNU demanglers use a specific token to mark the start
-		 of the outermost function argument tokens.  Typically 'F',
-		 for ARM/HP-demangling, for example.  So if we find something
-		 we are not prepared for, it must be an error.  */
-	      success = 0;
-	    }
-	  break;
-	}
-      /*
-	if (AUTO_DEMANGLING || GNU_DEMANGLING)
-	*/
-      {
-	if (success && expect_func)
-	  {
-	    func_done = 1;
-              if (LUCID_DEMANGLING || ARM_DEMANGLING || EDG_DEMANGLING)
-                {
-                  forget_types (work);
-                }
-	    success = demangle_args (work, mangled, declp);
-	    /* Since template include the mangling of their return types,
-	       we must set expect_func to 0 so that we don't try to
-	       demangle more arguments the next time we get here.  */
-	    expect_func = 0;
-	  }
-      }
-    }
-  if (success && !func_done)
-    {
-      if (AUTO_DEMANGLING || GNU_DEMANGLING)
-	{
-	  /* With GNU style demangling, bar__3foo is 'foo::bar(void)', and
-	     bar__3fooi is 'foo::bar(int)'.  We get here when we find the
-	     first case, and need to ensure that the '(void)' gets added to
-	     the current declp.  Note that with ARM/HP, the first case
-	     represents the name of a static data member 'foo::bar',
-	     which is in the current declp, so we leave it alone.  */
-	  success = demangle_args (work, mangled, declp);
-	}
-    }
-  if (success && PRINT_ARG_TYPES)
-    {
-      if (work->static_type)
-	string_append (declp, " static");
-      if (work->type_quals != TYPE_UNQUALIFIED)
-	{
-	  APPEND_BLANK (declp);
-	  string_append (declp, qualifier_string (work->type_quals));
-	}
-    }
-
-  return (success);
-}
-
-#if 0
-
-/* Compiled out.  For a static member the rest of the mangled string
-   (past its first character) is appended verbatim to DECLP and
-   *MANGLED is moved to the end of the string; otherwise the work is
-   handed to demangle_args.  */
-
-static int
-demangle_method_args (work, mangled, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-{
-  if (!work -> static_type)
-    return demangle_args (work, mangled, declp);
-
-  string_append (declp, *mangled + 1);
-  *mangled += strlen (*mangled);
-  return 1;
-}
-
-#endif
-
-/* Demangle a template template parameter and append its text, of the
-   form "template <...> class", to TNAME.  Each entry of the parameter
-   list is a type parameter ('Z', printed as "class"), a nested
-   template parameter ('z', handled recursively) or anything else,
-   which is passed to do_type.  Returns zero as soon as a nested call
-   or do_type fails, nonzero otherwise; the closing "> class" is
-   appended in either case.  */
-
-static int
-demangle_template_template_parm (work, mangled, tname)
-     struct work_stuff *work;
-     const char **mangled;
-     string *tname;
-{
-  int count;
-  int idx;
-  int ok = 1;
-  string piece;
-
-  string_append (tname, "template <");
-
-  /* get size of template parameter list */
-  if (get_count (mangled, &count))
-    {
-      for (idx = 0; ok && idx < count; idx++)
-	{
-	  /* Every entry after the first is preceded by a separator.  */
-	  if (idx > 0)
-	    string_append (tname, ", ");
-
-	  switch (**mangled)
-	    {
-	    case 'Z':
-	      /* Z for type parameters */
-	      (*mangled)++;
-	      string_append (tname, "class");
-	      break;
-
-	    case 'z':
-	      /* z for template parameters */
-	      (*mangled)++;
-	      ok = demangle_template_template_parm (work, mangled, tname);
-	      break;
-
-	    default:
-	      /* piece is initialized in do_type */
-	      ok = do_type (work, mangled, &piece);
-	      if (ok)
-		string_appends (tname, &piece);
-	      string_delete (&piece);
-	      break;
-	    }
-	}
-    }
-
-  /* Avoid emitting ">>" when the list ended with a nested template.  */
-  if (tname->p[-1] == '>')
-    string_append (tname, " ");
-  string_append (tname, "> class");
-  return (ok);
-}
-
-/* Demangle an expression used as a template value argument.  The
-   first character at *MANGLED is skipped and the expression runs up
-   to a closing 'W'.  Operands are demangled as values of kind TK by
-   demangle_template_value_parm; between two operands an operator from
-   optable is expected.  The result is appended to S inside
-   parentheses.  Returns zero if an operator is not recognized or the
-   closing 'W' is missing; otherwise returns the status of the last
-   operand (1 if there were none).  */
-
-static int
-demangle_expression (work, mangled, s, tk)
-     struct work_stuff *work;
-     const char** mangled;
-     string* s;
-     type_kind_t tk;
-{
-  int ok = 1;
-  int operand_seen = 0;
-
-  string_appendn (s, "(", 1);
-  (*mangled)++;
-
-  for (;;)
-    {
-      if (!ok || **mangled == 'W' || **mangled == '\0')
-	break;
-
-      if (operand_seen)
-	{
-	  /* An operator must separate this operand from the last one.  */
-	  size_t remaining = strlen (*mangled);
-	  size_t op;
-
-	  for (op = 0; op < ARRAY_SIZE (optable); ++op)
-	    {
-	      size_t oplen = strlen (optable[op].in);
-
-	      if (oplen > remaining
-		  || memcmp (optable[op].in, *mangled, oplen) != 0)
-		continue;
-
-	      string_appendn (s, " ", 1);
-	      string_append (s, optable[op].out);
-	      string_appendn (s, " ", 1);
-	      (*mangled) += oplen;
-	      break;
-	    }
-
-	  /* Ran off the end of the table: no operator matched.  */
-	  if (op == ARRAY_SIZE (optable))
-	    {
-	      ok = 0;
-	      break;
-	    }
-	}
-
-      operand_seen = 1;
-      ok = demangle_template_value_parm (work, mangled, s, tk);
-    }
-
-  if (**mangled == 'W')
-    {
-      string_appendn (s, ")", 1);
-      (*mangled)++;
-    }
-  else
-    ok = 0;
-
-  return ok;
-}
-
-/* Demangle an integral template value at *MANGLED and append it to S.
-   An 'E' introduces an expression (demangle_expression), 'Q' or 'K' a
-   qualified name (demangle_qualified); anything else is read as a
-   decimal count, optionally negated by a leading `m' or `_m'.
-   Returns 1 on success for the numeric form, 0 if no number could be
-   read, and the callee's status for the other two forms.  */
-
-static int
-demangle_integral_value (work, mangled, s)
-     struct work_stuff *work;
-     const char** mangled;
-     string* s;
-{
-  int success;
-
-  if (**mangled == 'E')
-    success = demangle_expression (work, mangled, s, tk_integral);
-  else if (**mangled == 'Q' || **mangled == 'K')
-    success = demangle_qualified (work, mangled, s, 0, 1);
-  else
-    {
-      int value;
-
-      /* By default, we let the number decide whether we shall consume an
-	 underscore.  */
-      int consume_following_underscore = 0;
-      int leave_following_underscore = 0;
-
-      success = 0;
-
-      /* Negative numbers are indicated with a leading `m'.  */
-      if (**mangled == 'm')
-	{
-	  string_appendn (s, "-", 1);
-	  (*mangled)++;
-	}
-      else if (mangled[0][0] == '_' && mangled[0][1] == 'm')
-	{
-	  /* Since consume_count_with_underscores does not handle the
-	     `m'-prefix we must do it here, using consume_count and
-	     adjusting underscores: we have to consume the underscore
-	     matching the prepended one.  */
-	  consume_following_underscore = 1;
-	  string_appendn (s, "-", 1);
-	  (*mangled) += 2;
-	}
-      else if (**mangled == '_')
-	{
-	  /* Do not consume a following underscore here; the call to
-	     consume_count_with_underscores below is left to consume
-	     what should be consumed.  */
-	  leave_following_underscore = 1;
-	}
-
-      /* We must call consume_count if we expect to remove a trailing
-	 underscore, since consume_count_with_underscores expects
-	 the leading underscore (that we consumed) if it is to handle
-	 multi-digit numbers.  */
-      if (consume_following_underscore)
-	value = consume_count (mangled);
-      else
-	value = consume_count_with_underscores (mangled);
-
-      /* -1 is the "no valid number" result of both count readers.  */
-      if (value != -1)
-	{
-	  char buf[INTBUF_SIZE];
-	  sprintf (buf, "%d", value);
-	  string_append (s, buf);
-
-	  /* Numbers not otherwise delimited, might have an underscore
-	     appended as a delimiter, which we should skip.
-
-	     ??? This used to always remove a following underscore, which
-	     is wrong.  If other (arbitrary) cases are followed by an
-	     underscore, we need to do something more radical.  */
-
-	  if ((value > 9 || consume_following_underscore)
-	      && ! leave_following_underscore
-	      && **mangled == '_')
-	    (*mangled)++;
-
-	  /* All is well.  */
-	  success = 1;
-	}
-    }
-
-  return success;
-}
-
-/* Demangle the real value in MANGLED and append its text to S.  An
-   'E' introduces an expression, which is handed to
-   demangle_expression.  Otherwise an optional `m' (printed as '-'),
-   a run of digits, an optional '.' with more digits and an optional
-   'e' with more digits are copied across.  This form always
-   returns 1.  */
-
-static int
-demangle_real_value (work, mangled, s)
-     struct work_stuff *work;
-     const char **mangled;
-     string* s;
-{
-  if (**mangled == 'E')
-    return demangle_expression (work, mangled, s, tk_real);
-
-  /* sign */
-  if (**mangled == 'm')
-    {
-      string_appendn (s, "-", 1);
-      (*mangled)++;
-    }
-
-  /* integer part */
-  for (; ISDIGIT ((unsigned char)**mangled); (*mangled)++)
-    string_appendn (s, *mangled, 1);
-
-  /* fraction */
-  if (**mangled == '.')
-    {
-      string_appendn (s, ".", 1);
-      (*mangled)++;
-      for (; ISDIGIT ((unsigned char)**mangled); (*mangled)++)
-	string_appendn (s, *mangled, 1);
-    }
-
-  /* exponent */
-  if (**mangled == 'e')
-    {
-      string_appendn (s, "e", 1);
-      (*mangled)++;
-      for (; ISDIGIT ((unsigned char)**mangled); (*mangled)++)
-	string_appendn (s, *mangled, 1);
-    }
-
-  return 1;
-}
-
-/* Demangle one template value argument of kind TK at *MANGLED and
-   append its text to S.
-
-   A leading 'Y' refers back to a template parameter by index.
-   Otherwise the form depends on TK: integral, char, bool and real
-   values have their own encodings, and pointer/reference values are
-   either a qualified name ('Q') or a length-prefixed symbol name that
-   is demangled on its own.
-
-   Returns nonzero on success and zero on failure.  Callers test the
-   result for truth, so every failure path returns 0.  */
-
-static int
-demangle_template_value_parm (work, mangled, s, tk)
-     struct work_stuff *work;
-     const char **mangled;
-     string* s;
-     type_kind_t tk;
-{
-  int success = 1;
-
-  if (**mangled == 'Y')
-    {
-      /* The next argument is a template parameter. */
-      int idx;
-
-      (*mangled)++;
-      idx = consume_count_with_underscores (mangled);
-      if (idx == -1
-	  || (work->tmpl_argvec && idx >= work->ntmpl_args)
-	  || consume_count_with_underscores (mangled) == -1)
-	return 0;
-      if (work->tmpl_argvec)
-	string_append (s, work->tmpl_argvec[idx]);
-      else
-	string_append_template_idx (s, idx);
-    }
-  else if (tk == tk_integral)
-    success = demangle_integral_value (work, mangled, s);
-  else if (tk == tk_char)
-    {
-      char tmp[2];
-      int val;
-      if (**mangled == 'm')
-	{
-	  string_appendn (s, "-", 1);
-	  (*mangled)++;
-	}
-      string_appendn (s, "'", 1);
-      val = consume_count(mangled);
-      if (val <= 0)
-	success = 0;
-      else
-	{
-	  tmp[0] = (char)val;
-	  tmp[1] = '\0';
-	  string_appendn (s, &tmp[0], 1);
-	  string_appendn (s, "'", 1);
-	}
-    }
-  else if (tk == tk_bool)
-    {
-      int val = consume_count (mangled);
-      if (val == 0)
-	string_appendn (s, "false", 5);
-      else if (val == 1)
-	string_appendn (s, "true", 4);
-      else
-	success = 0;
-    }
-  else if (tk == tk_real)
-    success = demangle_real_value (work, mangled, s);
-  else if (tk == tk_pointer || tk == tk_reference)
-    {
-      if (**mangled == 'Q')
-	success = demangle_qualified (work, mangled, s,
-				      /*isfuncname=*/0, 
-				      /*append=*/1);
-      else
-	{
-	  int symbol_len  = consume_count (mangled);
-	  /* Reject a missing count, and a count that claims more
-	     characters than remain: the copy and the pointer advance
-	     below would otherwise run past the end of the string.  */
-	  if (symbol_len == -1
-	      || (int) strlen (*mangled) < symbol_len)
-	    return 0;
-	  if (symbol_len == 0)
-	    string_appendn (s, "0", 1);
-	  else
-	    {
-	      char *p = xmalloc (symbol_len + 1), *q;
-	      strncpy (p, *mangled, symbol_len);
-	      p [symbol_len] = '\0';
-	      /* We use cplus_demangle here, rather than
-		 internal_cplus_demangle, because the name of the entity
-		 mangled here does not make use of any of the squangling
-		 or type-code information we have built up thus far; it is
-		 mangled independently.  */
-	      q = VG_(cplus_demangle) (p, work->options);
-	      if (tk == tk_pointer)
-		string_appendn (s, "&", 1);
-	      /* FIXME: Pointer-to-member constants should get a
-		 qualifying class name here.  */
-	      if (q)
-		{
-		  string_append (s, q);
-		  free (q);
-		}
-	      else
-		string_append (s, p);
-	      free (p);
-	    }
-	  *mangled += symbol_len;
-	}
-    }
-
-  return success;
-}
-
-/* Demangle the template name in MANGLED.  The full name of the
-   template (e.g., S<int>) is placed in TNAME.  The name without the
-   template parameters (e.g. S) is placed in TRAWNAME if TRAWNAME is
-   non-NULL.  If IS_TYPE is nonzero, this template is a type template,
-   not a function template.  If both IS_TYPE and REMEMBER are nonzero,
-   the template is remembered in the list of back-referenceable
-   types.
-
-   For a function template (IS_TYPE zero) the demangled arguments are
-   also saved in WORK->tmpl_argvec.
-
-   Returns zero on failure.  On success the value is the nonzero
-   status of the last argument that was demangled.  */
-
-static int
-demangle_template (work, mangled, tname, trawname, is_type, remember)
-     struct work_stuff *work;
-     const char **mangled;
-     string *tname;
-     string *trawname;
-     int is_type;
-     int remember;
-{
-  int i;
-  int r;
-  int need_comma = 0;
-  int success = 0;
-  const char *start;
-  int is_java_array = 0;
-  string temp;
-  int bindex = 0;
-
-  (*mangled)++;
-  if (is_type)
-    {
-      if (remember)
-	bindex = register_Btype (work);
-      start = *mangled;
-      /* get template name */
-      if (**mangled == 'z')
-	{
-	  int idx;
-	  (*mangled)++;
-	  /* A second character is skipped next; make sure it is not
-	     the terminating null, or the parse would continue past
-	     the end of the string.  */
-	  if (**mangled == '\0')
-	    return (0);
-	  (*mangled)++;
-
-	  idx = consume_count_with_underscores (mangled);
-	  if (idx == -1
-	      || (work->tmpl_argvec && idx >= work->ntmpl_args)
-	      || consume_count_with_underscores (mangled) == -1)
-	    return (0);
-
-	  if (work->tmpl_argvec)
-	    {
-	      string_append (tname, work->tmpl_argvec[idx]);
-	      if (trawname)
-		string_append (trawname, work->tmpl_argvec[idx]);
-	    }
-	  else
-	    {
-	      string_append_template_idx (tname, idx);
-	      if (trawname)
-		string_append_template_idx (trawname, idx);
-	    }
-	}
-      else
-	{
-	  if ((r = consume_count (mangled)) <= 0
-	      || (int) strlen (*mangled) < r)
-	    {
-	      return (0);
-	    }
-	  is_java_array = (work -> options & DMGL_JAVA)
-	    && strncmp (*mangled, "JArray1Z", 8) == 0;
-	  if (! is_java_array)
-	    {
-	      string_appendn (tname, *mangled, r);
-	    }
-	  if (trawname)
-	    string_appendn (trawname, *mangled, r);
-	  *mangled += r;
-	}
-    }
-  if (!is_java_array)
-    string_append (tname, "<");
-  /* get size of template parameter list */
-  if (!get_count (mangled, &r))
-    {
-      return (0);
-    }
-  if (!is_type)
-    {
-      /* Create an array for saving the template argument values. */
-      work->tmpl_argvec = (char**) xmalloc (r * sizeof (char *));
-      work->ntmpl_args = r;
-      for (i = 0; i < r; i++)
-	work->tmpl_argvec[i] = 0;
-    }
-  for (i = 0; i < r; i++)
-    {
-      if (need_comma)
-	{
-	  string_append (tname, ", ");
-	}
-      /* Z for type parameters */
-      if (**mangled == 'Z')
-	{
-	  (*mangled)++;
-	  /* temp is initialized in do_type */
-	  success = do_type (work, mangled, &temp);
-	  if (success)
-	    {
-	      string_appends (tname, &temp);
-
-	      if (!is_type)
-		{
-		  /* Save the template argument. */
-		  int len = temp.p - temp.b;
-		  work->tmpl_argvec[i] = xmalloc (len + 1);
-		  memcpy (work->tmpl_argvec[i], temp.b, len);
-		  work->tmpl_argvec[i][len] = '\0';
-		}
-	    }
-	  string_delete(&temp);
-	  if (!success)
-	    {
-	      break;
-	    }
-	}
-      /* z for template parameters */
-      else if (**mangled == 'z')
-	{
-	  int r2;
-	  (*mangled)++;
-	  success = demangle_template_template_parm (work, mangled, tname);
-
-	  if (success
-	      && (r2 = consume_count (mangled)) > 0
-	      && (int) strlen (*mangled) >= r2)
-	    {
-	      string_append (tname, " ");
-	      string_appendn (tname, *mangled, r2);
-	      if (!is_type)
-		{
-		  /* Save the template argument. */
-		  int len = r2;
-		  work->tmpl_argvec[i] = xmalloc (len + 1);
-		  memcpy (work->tmpl_argvec[i], *mangled, len);
-		  work->tmpl_argvec[i][len] = '\0';
-		}
-	      *mangled += r2;
-	    }
-	  if (!success)
-	    {
-	      break;
-	    }
-	}
-      else
-	{
-	  string  param;
-	  string* s;
-
-	  /* otherwise, value parameter */
-
-	  /* temp is initialized in do_type */
-	  success = do_type (work, mangled, &temp);
-	  string_delete(&temp);
-	  if (!success)
-	    break;
-
-	  if (!is_type)
-	    {
-	      s = &param;
-	      string_init (s);
-	    }
-	  else
-	    s = tname;
-
-	  /* The status returned by do_type is passed on as the kind of
-	     value to demangle.  */
-	  success = demangle_template_value_parm (work, mangled, s,
-						  (type_kind_t) success);
-
-	  if (!success)
-	    {
-	      if (!is_type)
-		string_delete (s);
-	      success = 0;
-	      break;
-	    }
-
-	  if (!is_type)
-	    {
-	      int len = s->p - s->b;
-	      work->tmpl_argvec[i] = xmalloc (len + 1);
-	      memcpy (work->tmpl_argvec[i], s->b, len);
-	      work->tmpl_argvec[i][len] = '\0';
-
-	      string_appends (tname, s);
-	      string_delete (s);
-	    }
-	}
-      need_comma = 1;
-    }
-  if (is_java_array)
-    {
-      string_append (tname, "[]");
-    }
-  else
-    {
-      /* Keep a space between consecutive closing brackets.  */
-      if (tname->p[-1] == '>')
-	string_append (tname, " ");
-      string_append (tname, ">");
-    }
-
-  if (is_type && remember)
-    remember_Btype (work, tname->b, LEN_STRING (tname), bindex);
-
-  /*
-    if (work -> static_type)
-    {
-    string_append (declp, *mangled + 1);
-    *mangled += strlen (*mangled);
-    success = 1;
-    }
-    else
-    {
-    success = demangle_args (work, mangled, declp);
-    }
-    }
-    */
-  return (success);
-}
-
-/* Look for an ARM/HP/EDG style template marker in the first N
-   characters' worth of name at MANGLED.  The marker is "__pt__" for
-   ARM and HP, and "__tm__", "__ps__", "__pt__" or "__S" for auto and
-   EDG.  It must be followed by a count and a '_', with the counted
-   text ending exactly at MANGLED + N.  On a match, *ANCHOR is left at
-   the marker, *ARGS just past the '_', and 1 is returned; otherwise
-   0 is returned.  */
-
-static int
-arm_pt (work, mangled, n, anchor, args)
-     struct work_stuff *work;
-     const char *mangled;
-     int n;
-     const char **anchor, **args;
-{
-  const char *name_end = mangled + n;
-  int count;
-
-  /* Check if ARM template with "__pt__" in it ("parameterized type") */
-  /* Allow HP also here, because HP's cfront compiler follows ARM to some extent */
-  if (ARM_DEMANGLING || HP_DEMANGLING)
-    {
-      *anchor = strstr (mangled, "__pt__");
-      if (*anchor != NULL)
-	{
-	  *args = *anchor + 6;
-	  count = consume_count (args);
-	  if (count == -1)
-	    return 0;
-	  if (**args == '_' && *args + count == name_end)
-	    {
-	      ++*args;
-	      return 1;
-	    }
-	}
-    }
-
-  if (AUTO_DEMANGLING || EDG_DEMANGLING)
-    {
-      /* Number of marker characters to step over; every marker is six
-	 characters long except "__S".  */
-      int marker_len = 6;
-
-      *anchor = strstr (mangled, "__tm__");
-      if (*anchor == NULL)
-	*anchor = strstr (mangled, "__ps__");
-      if (*anchor == NULL)
-	*anchor = strstr (mangled, "__pt__");
-      if (*anchor == NULL)
-	{
-	  *anchor = strstr (mangled, "__S");
-	  marker_len = 3;
-	}
-
-      if (*anchor != NULL)
-	{
-	  *args = *anchor + marker_len;
-	  count = consume_count (args);
-	  if (count == -1)
-	    return 0;
-	  if (**args == '_' && *args + count == name_end)
-	    {
-	      ++*args;
-	      return 1;
-	    }
-	}
-    }
-
-  return 0;
-}
-
-/* Append to DECLP the class name formed by the N characters at
-   *MANGLED, expanding template arguments where the name carries
-   them.  Four forms are handled, in this order:
-
-     - an HP aCC template spec, where the name is followed by 'X' and
-       the template arguments;
-     - an ARM/cfront/EDG template recognized by arm_pt;
-     - a "_GLOBAL_" name denoting the anonymous namespace, printed as
-       "{anonymous}";
-     - any other name, which is copied as is.
-
-   *MANGLED is advanced past what was consumed.  No status is
-   returned; when an argument cannot be parsed the argument list is
-   closed with what has been collected so far.  */
-
-static void
-demangle_arm_hp_template (work, mangled, n, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     int n;
-     string *declp;
-{
-  const char *p;
-  const char *args;
-  const char *e = *mangled + n;
-  string arg;
-
-  /* Check for HP aCC template spec: classXt1t2 where t1, t2 are
-     template args */
-  if (HP_DEMANGLING && ((*mangled)[n] == 'X'))
-    {
-      char *start_spec_args = NULL;
-
-      /* First check for and omit template specialization pseudo-arguments,
-         such as in "Spec<#1,#1.*>" */
-      start_spec_args = strchr (*mangled, '<');
-      if (start_spec_args && (start_spec_args - *mangled < n))
-        string_appendn (declp, *mangled, start_spec_args - *mangled);
-      else
-        string_appendn (declp, *mangled, n);
-      (*mangled) += n + 1;
-      string_init (&arg);
-      if (work->temp_start == -1) /* non-recursive call */
-        work->temp_start = declp->p - declp->b;
-      string_append (declp, "<");
-      while (1)
-        {
-          string_clear (&arg);
-          switch (**mangled)
-            {
-              case 'T':
-                /* 'T' signals a type parameter */
-                (*mangled)++;
-                if (!do_type (work, mangled, &arg))
-                  goto hpacc_template_args_done;
-                break;
-
-              case 'U':
-              case 'S':
-                /* 'U' or 'S' signals an integral value */
-                if (!do_hpacc_template_const_value (work, mangled, &arg))
-                  goto hpacc_template_args_done;
-                break;
-
-              case 'A':
-                /* 'A' signals a named constant expression (literal) */
-                if (!do_hpacc_template_literal (work, mangled, &arg))
-                  goto hpacc_template_args_done;
-                break;
-
-              default:
-                /* Today, 1997-09-03, we have only the above types
-                   of template parameters */
-                /* FIXME: maybe this should fail and return null */
-                goto hpacc_template_args_done;
-            }
-          string_appends (declp, &arg);
-         /* Check if we're at the end of template args.
-             0 if at end of static member of template class,
-             _ if done with template args for a function */
-          if ((**mangled == '\000') || (**mangled == '_'))
-            break;
-          else
-            string_append (declp, ",");
-        }
-    hpacc_template_args_done:
-      string_append (declp, ">");
-      string_delete (&arg);
-      if (**mangled == '_')
-        (*mangled)++;
-      return;
-    }
-  /* ARM template? (Also handles HP cfront extensions) */
-  else if (arm_pt (work, *mangled, n, &p, &args))
-    {
-      string type_str;
-
-      string_init (&arg);
-      string_appendn (declp, *mangled, p - *mangled);
-      if (work->temp_start == -1)  /* non-recursive call */
-	work->temp_start = declp->p - declp->b;
-      string_append (declp, "<");
-      /* should do error checking here */
-      while (args < e) {
-	string_clear (&arg);
-
-	/* Check for type or literal here */
-	switch (*args)
-	  {
-	    /* HP cfront extensions to ARM for template args */
-	    /* spec: Xt1Lv1 where t1 is a type, v1 is a literal value */
-	    /* FIXME: We handle only numeric literals for HP cfront */
-          case 'X':
-            /* A typed constant value follows */
-            args++;
-            if (!do_type (work, &args, &type_str))
-	      goto cfront_template_args_done;
-            string_append (&arg, "(");
-            string_appends (&arg, &type_str);
-            /* The type text has been copied into arg; release the
-               temporary here so that none of the exits below leak it.  */
-            string_delete (&type_str);
-            string_append (&arg, ")");
-            if (*args != 'L')
-              goto cfront_template_args_done;
-            args++;
-            /* Now snarf a literal value following 'L' */
-            if (!snarf_numeric_literal (&args, &arg))
-	      goto cfront_template_args_done;
-            break;
-
-          case 'L':
-            /* Snarf a literal following 'L' */
-            args++;
-            if (!snarf_numeric_literal (&args, &arg))
-	      goto cfront_template_args_done;
-            break;
-          default:
-            /* Not handling other HP cfront stuff */
-            if (!do_type (work, &args, &arg))
-              goto cfront_template_args_done;
-	  }
-	string_appends (declp, &arg);
-	string_append (declp, ",");
-      }
-    cfront_template_args_done:
-      string_delete (&arg);
-      if (args >= e)
-	--declp->p; /* remove extra comma */
-      string_append (declp, ">");
-    }
-  else if (n>10 && strncmp (*mangled, "_GLOBAL_", 8) == 0
-	   && (*mangled)[9] == 'N'
-	   && (*mangled)[8] == (*mangled)[10]
-	   && strchr (cplus_markers, (*mangled)[8]))
-    {
-      /* A member of the anonymous namespace.  */
-      string_append (declp, "{anonymous}");
-    }
-  else
-    {
-      if (work->temp_start == -1) /* non-recursive call only */
-	work->temp_start = 0;     /* disable in recursive calls */
-      string_appendn (declp, *mangled, n);
-    }
-  *mangled += n;
-}
-
-/* Extract a class name, possibly a template with arguments, from the
-   mangled string; qualifiers, local class indicators, etc. have
-   already been dealt with */
-
-static int
-demangle_class_name (work, mangled, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-{
-  /* The class name is introduced by a decimal count giving its length.  */
-  int name_len = consume_count (mangled);
-
-  /* Give up on an unparsable count, or on one that claims more
-     characters than the mangled string still holds.  */
-  if (name_len == -1 || (int) strlen (*mangled) < name_len)
-    return 0;
-
-  /* Emit the name (expanding template arguments if present) and
-     advance *MANGLED past it.  */
-  demangle_arm_hp_template (work, mangled, name_len, declp);
-  return 1;
-}
-
-/*
-
-LOCAL FUNCTION
-
-	demangle_class -- demangle a mangled class sequence
-
-SYNOPSIS
-
-	static int
-	demangle_class (struct work_stuff *work, const char **mangled,
-			string *declp)
-
-DESCRIPTION
-
-	DECLP points to the buffer into which demangling is being done.
-
-	*MANGLED points to the current token to be demangled.  On input,
-	it points to a mangled class (I.E. "3foo", "13verylongclass", etc.)
-	On exit, it points to the next token after the mangled class on
-	success, or the first unconsumed token on failure.
-
-	If the CONSTRUCTOR or DESTRUCTOR flags are set in WORK, then
-	we are demangling a constructor or destructor.  In this case
-	we prepend "class::class" or "class::~class" to DECLP.
-
-	Otherwise, we prepend "class::" to the current DECLP.
-
-	Reset the constructor/destructor flags once they have been
-	"consumed".  This allows demangle_class to be called later during
-	the same demangling, to do normal class demangling.
-
-	Returns 1 if demangling is successful, 0 otherwise.
-
-*/
-
-static int
-demangle_class (work, mangled, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-{
-  string class_name;
-  char *full_name_end;
-  int btype;
-  int is_ctor_or_dtor;
-
-  string_init (&class_name);
-  btype = register_Btype (work);
-
-  if (!demangle_class_name (work, mangled, &class_name))
-    {
-      /* Nothing usable was parsed; DECLP is left untouched.  */
-      string_delete (&class_name);
-      return 0;
-    }
-
-  /* Remember where the complete name (template arguments included)
-     ends, because the constructor/destructor path below shortens the
-     string temporarily.  */
-  full_name_end = class_name.p;
-
-  is_ctor_or_dtor = (work->constructor & 1) || (work->destructor & 1);
-  if (is_ctor_or_dtor)
-    {
-      /* The "class" in "class::class" / "class::~class" must not carry
-	 the template arguments, so cut the name at temp_start.  */
-      if (work->temp_start != 0 && work->temp_start != -1)
-	class_name.p = class_name.b + work->temp_start;
-
-      string_prepends (declp, &class_name);
-
-      /* Consume the flag so a later call does plain class demangling.  */
-      if (work->destructor & 1)
-	{
-	  string_prepend (declp, "~");
-	  work->destructor -= 1;
-	}
-      else
-	work->constructor -= 1;
-    }
-
-  /* Back to the full name for the type tables and the scope prefix.  */
-  class_name.p = full_name_end;
-  remember_Ktype (work, class_name.b, LEN_STRING(&class_name));
-  remember_Btype (work, class_name.b, LEN_STRING(&class_name), btype);
-  string_prepend (declp, SCOPE_STRING (work));
-  string_prepends (declp, &class_name);
-
-  string_delete (&class_name);
-  return 1;
-}
-
-
-/* Called when there's a "__" in the mangled name, with `scan' pointing to
-   the rightmost guess.
-
-   Find the correct "__"-sequence where the function name ends and the
-   signature starts, which is ambiguous with GNU mangling.
-   Call demangle_signature here, so we can make sure we found the right
-   one; *mangled will be consumed so caller will not make further calls to
-   demangle_signature.
-
-   Two paths exist.  For the ARM, Lucid, HP and EDG styles, or when no
-   further "__" follows SCAN, only demangle_function_name is called and
-   1 is returned; demangle_signature is NOT called on that path.
-   Otherwise each candidate "__" is tried in turn from left to right:
-   the function name is demangled, then the signature; on failure
-   *MANGLED, DECLP and WORK are restored from copies taken on entry and
-   SCAN moves to the next underscore sequence.
-
-   Returns 0 if nothing follows the "__" at SCAN or if no candidate
-   produced a signature that demangles; returns nonzero otherwise.  */
-
-static int
-iterate_demangle_function (work, mangled, declp, scan)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-     const char *scan;
-{
-  const char *mangle_init = *mangled;	/* input position to rewind to after a failed attempt */
-  int success = 0;
-  string decl_init;			/* copy of DECLP as it was on entry */
-  struct work_stuff work_init;		/* copy of WORK as it was on entry */
-
-  if (*(scan + 2) == '\0')
-    return 0;
-
-  /* Do not iterate for some demangling modes, or if there's only one
-     "__"-sequence.  This is the normal case.  */
-  if (ARM_DEMANGLING || LUCID_DEMANGLING || HP_DEMANGLING || EDG_DEMANGLING
-      || strstr (scan + 2, "__") == NULL)
-    {
-      demangle_function_name (work, mangled, declp, scan);
-      return 1;
-    }
-
-  /* Save state so we can restart if the guess at the correct "__" was
-     wrong.  */
-  string_init (&decl_init);
-  string_appends (&decl_init, declp);
-  memset (&work_init, 0, sizeof work_init);
-  work_stuff_copy_to_from (&work_init, work);
-
-  /* Iterate over occurrences of __, allowing names and types to have a
-     "__" sequence in them.  We must start with the first (not the last)
-     occurrence, since "__" most often occur between independent mangled
-     parts, hence starting at the last occurrence inside a signature
-     might get us a "successful" demangling of the signature.  */
-
-  while (scan[2])
-    {
-      demangle_function_name (work, mangled, declp, scan);
-      success = demangle_signature (work, mangled, declp);
-      if (success)
-	break;
-
-      /* Reset demangle state for the next round.  */
-      *mangled = mangle_init;
-      string_clear (declp);
-      string_appends (declp, &decl_init);
-      work_stuff_copy_to_from (work, &work_init);
-
-      /* Leave this underscore-sequence.  */
-      scan += 2;
-
-      /* Scan for the next "__" sequence.  */
-      while (*scan && (scan[0] != '_' || scan[1] != '_'))
-	scan++;
-
-      /* Move to last "__" in this sequence.  If no further "__" exists,
-	 SCAN is at the terminating NUL here, and after the adjustment
-	 below scan[2] is that NUL, which ends the outer loop.  */
-      while (*scan && *scan == '_')
-	scan++;
-      scan -= 2;
-    }
-
-  /* Delete saved state.  */
-  delete_work_stuff (&work_init);
-  string_delete (&decl_init);
-
-  return success;
-}
-
-/*
-
-LOCAL FUNCTION
-
-	demangle_prefix -- consume the mangled name prefix and find signature
-
-SYNOPSIS
-
-	static int
-	demangle_prefix (struct work_stuff *work, const char **mangled,
-			 string *declp);
-
-DESCRIPTION
-
-	Consume and demangle the prefix of the mangled name.
-	While processing the function name root, arrange to call
-	demangle_signature if the root is ambiguous.
-
-	DECLP points to the string buffer into which demangled output is
-	placed.  On entry, the buffer is empty.  On exit it contains
-	the root function name, the demangled operator name, or in some
-	special cases either nothing or the completely demangled result.
-
-	MANGLED points to the current pointer into the mangled name.  As each
-	token of the mangled name is consumed, it is updated.  Upon entry
-	the current mangled name pointer points to the first character of
-	the mangled name.  Upon exit, it should point to the first character
-	of the signature if demangling was successful, or to the first
-	unconsumed character if demangling of the prefix was unsuccessful.
-
-	Before the root is examined, these leading markers are recognised
-	and stepped over:
-
-		_imp__ / __imp_		sets work->dllimported
-		_GLOBAL_<m>D<m>		sets work->destructor = 2
-		_GLOBAL_<m>I<m>		sets work->constructor = 2
-		__std__			sets work->destructor = 2
-		__sti__			sets work->constructor = 2
-
-	where <m> is one of the cplus_markers characters, and the last two
-	forms apply only to the ARM, HP and EDG styles.
-
-	If the rest of the prefix cannot be demangled but one of the
-	"= 2" flags above was set, the remaining mangled text is copied
-	to DECLP verbatim and success is reported.
-
-	Returns 1 on success, 0 otherwise.
- */
-
-static int
-demangle_prefix (work, mangled, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-{
-  int success = 1;
-  const char *scan;	/* first "__" found in *mangled, or NULL if there is none */
-  int i;		/* length of the run of underscores starting at scan */
-
-  if (strlen(*mangled) > 6
-      && (strncmp(*mangled, "_imp__", 6) == 0
-          || strncmp(*mangled, "__imp_", 6) == 0))
-    {
-      /* it's a symbol imported from a PE dynamic library. Check for both
-         new style prefix _imp__ and legacy __imp_ used by older versions
-	 of dlltool. */
-      (*mangled) += 6;
-      work->dllimported = 1;
-    }
-  else if (strlen(*mangled) >= 11 && strncmp(*mangled, "_GLOBAL_", 8) == 0)
-    {
-      char *marker = strchr (cplus_markers, (*mangled)[8]);
-      if (marker != NULL && *marker == (*mangled)[10])
-	{
-	  if ((*mangled)[9] == 'D')
-	    {
-	      /* it's a GNU global destructor to be executed at program exit */
-	      (*mangled) += 11;
-	      work->destructor = 2;
-	      if (gnu_special (work, mangled, declp))
-		return success;
-	    }
-	  else if ((*mangled)[9] == 'I')
-	    {
-	      /* it's a GNU global constructor to be executed at program init */
-	      (*mangled) += 11;
-	      work->constructor = 2;
-	      if (gnu_special (work, mangled, declp))
-		return success;
-	    }
-	}
-    }
-  else if ((ARM_DEMANGLING || HP_DEMANGLING || EDG_DEMANGLING) && strncmp(*mangled, "__std__", 7) == 0)
-    {
-      /* it's a ARM global destructor to be executed at program exit */
-      (*mangled) += 7;
-      work->destructor = 2;
-    }
-  else if ((ARM_DEMANGLING || HP_DEMANGLING || EDG_DEMANGLING) && strncmp(*mangled, "__sti__", 7) == 0)
-    {
-      /* it's a ARM global constructor to be executed at program initial */
-      (*mangled) += 7;
-      work->constructor = 2;
-    }
-
-  /*  This block of code is a reduction in strength time optimization
-      of:
-      scan = strstr (*mangled, "__");
-
-      The loop stops with scan on the second underscore of the first
-      pair, hence the decrement afterwards.  */
-
-  {
-    scan = *mangled;
-
-    do {
-      scan = strchr (scan, '_');
-    } while (scan != NULL && *++scan != '_');
-
-    if (scan != NULL) --scan;
-  }
-
-  if (scan != NULL)
-    {
-      /* We found a sequence of two or more '_', ensure that we start at
-	 the last pair in the sequence.  */
-      /* i = strspn (scan, "_"); */
-      i = 0;
-      while (scan[i] == '_') i++;
-      if (i > 2)
-	{
-	  scan += (i - 2);
-	}
-    }
-
-  if (scan == NULL)
-    {
-      success = 0;
-    }
-  else if (work -> static_type)
-    {	/* NOTE(review): scan points at an underscore here (see the search
-	   above), so unless ISDIGIT accepts '_' the test below always
-	   clears success -- confirm whether scan[2] was intended.  */
-      if (!ISDIGIT ((unsigned char)scan[0]) && (scan[0] != 't'))
-	{
-	  success = 0;
-	}
-    }
-  else if ((scan == *mangled)
-	   && (ISDIGIT ((unsigned char)scan[2]) || (scan[2] == 'Q')
-	       || (scan[2] == 't') || (scan[2] == 'K') || (scan[2] == 'H')))
-    {
-      /* The ARM says nothing about the mangling of local variables.
-	 But cfront mangles local variables by prepending __<nesting_level>
-	 to them. As an extension to ARM demangling we handle this case.  */
-      if ((LUCID_DEMANGLING || ARM_DEMANGLING || HP_DEMANGLING)
-	  && ISDIGIT ((unsigned char)scan[2]))
-	{
-	  *mangled = scan + 2;
-	  consume_count (mangled);	/* value unused: the call only steps over the digits */
-	  string_append (declp, *mangled);
-	  *mangled += strlen (*mangled);
-	  success = 1;
-	}
-      else
-	{
-	  /* A GNU style constructor starts with __[0-9Qt].  But cfront uses
-	     names like __Q2_3foo3bar for nested type names.  So don't accept
-	     this style of constructor for cfront demangling.  A GNU
-	     style member-template constructor starts with 'H'. */
-	  if (!(LUCID_DEMANGLING || ARM_DEMANGLING || HP_DEMANGLING || EDG_DEMANGLING))
-	    work -> constructor += 1;
-	  *mangled = scan + 2;
-	}
-    }
-  else if (ARM_DEMANGLING && scan[2] == 'p' && scan[3] == 't')
-    {
-      /* Cfront-style parameterized type.  Handled later as a signature. */
-      success = 1;
-
-      /* ARM template? */
-      demangle_arm_hp_template (work, mangled, strlen (*mangled), declp);
-    }
-  else if (EDG_DEMANGLING && ((scan[2] == 't' && scan[3] == 'm')
-                              || (scan[2] == 'p' && scan[3] == 's')
-                              || (scan[2] == 'p' && scan[3] == 't')))
-    {
-      /* EDG-style parameterized type.  Handled later as a signature. */
-      success = 1;
-
-      /* EDG template? */
-      demangle_arm_hp_template (work, mangled, strlen (*mangled), declp);
-    }
-  else if ((scan == *mangled) && !ISDIGIT ((unsigned char)scan[2])
-	   && (scan[2] != 't'))
-    {
-      /* Mangled name starts with "__".  Skip over any leading '_' characters,
-	 then find the next "__" that separates the prefix from the signature.
-	 For the ARM, Lucid, HP and EDG styles arm_special gets the first
-	 try; if it succeeds nothing more is done here.  */
-      if (!(ARM_DEMANGLING || LUCID_DEMANGLING || HP_DEMANGLING || EDG_DEMANGLING)
-	  || (arm_special (mangled, declp) == 0))
-	{
-	  while (*scan == '_')
-	    {
-	      scan++;
-	    }
-	  if ((scan = strstr (scan, "__")) == NULL || (*(scan + 2) == '\0'))
-	    {
-	      /* No separator (I.E. "__not_mangled"), or empty signature
-		 (I.E. "__not_mangled_either__") */
-	      success = 0;
-	    }
-	  else
-	    return iterate_demangle_function (work, mangled, declp, scan);
-	}
-    }
-  else if (*(scan + 2) != '\0')
-    {
-      /* Mangled name does not start with "__" but does have one somewhere
-	 in there with non empty stuff after it.  Looks like a global
-	 function name.  Iterate over all "__":s until the right
-	 one is found.  */
-      return iterate_demangle_function (work, mangled, declp, scan);
-    }
-  else
-    {
-      /* Doesn't look like a mangled name */
-      success = 0;
-    }
-
-  if (!success && (work->constructor == 2 || work->destructor == 2))
-    {	/* Global constructor/destructor marker was seen: emit the rest
-	   of the name undemangled rather than failing.  */
-      string_append (declp, *mangled);
-      *mangled += strlen (*mangled);
-      success = 1;
-    }
-  return (success);
-}
-
-/*
-
-LOCAL FUNCTION
-
-	gnu_special -- special handling of gnu mangled strings
-
-SYNOPSIS
-
-	static int
-	gnu_special (struct work_stuff *work, const char **mangled,
-		     string *declp);
-
-
-DESCRIPTION
-
-	Process some special GNU style mangling forms that don't fit
-	the normal pattern.  For example:
-
-		_$_3foo		(destructor for class foo)
-		_vt$foo		(foo virtual table)
-		_vt$foo$bar	(foo::bar virtual table)
-		__vt_foo	(foo virtual table, new style with thunks)
-		_3foo$varname	(static data member)
-		_Q22rs2tu$vw	(static data member)
-		__t6vector1Zii	(constructor with template)
-		__thunk_4__$_7ostream (virtual function thunk)
-
-	WORK is the demangler state, *MANGLED the current input position
-	(advanced past whatever is consumed) and DECLP the output buffer
-	that the demangled text is appended to.
-
-	Returns 1 if one of the special forms was recognised and
-	demangled, 0 otherwise.  Malformed input (an unparsable count, or
-	a thunk with nothing after its delta) is reported as failure and
-	never causes a read outside the mangled string.
- */
-
-static int
-gnu_special (work, mangled, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-{
-  int n;
-  int success = 1;
-  const char *p;
-
-  if ((*mangled)[0] == '_'
-      && strchr (cplus_markers, (*mangled)[1]) != NULL
-      && (*mangled)[2] == '_')
-    {
-      /* Found a GNU style destructor, get past "_<CPLUS_MARKER>_" */
-      (*mangled) += 3;
-      work -> destructor += 1;
-    }
-  else if ((*mangled)[0] == '_'
-	   && (((*mangled)[1] == '_'
-		&& (*mangled)[2] == 'v'
-		&& (*mangled)[3] == 't'
-		&& (*mangled)[4] == '_')
-	       || ((*mangled)[1] == 'v'
-		   && (*mangled)[2] == 't'
-		   && strchr (cplus_markers, (*mangled)[3]) != NULL)))
-    {
-      /* Found a GNU style virtual table, get past "_vt<CPLUS_MARKER>"
-         and create the decl.  Note that we consume the entire mangled
-	 input string, which means that demangle_signature has no work
-	 to do.  */
-      if ((*mangled)[2] == 'v')
-	(*mangled) += 5; /* New style, with thunks: "__vt_" */
-      else
-	(*mangled) += 4; /* Old style, no thunks: "_vt<CPLUS_MARKER>" */
-      while (**mangled != '\0')
-	{
-	  switch (**mangled)
-	    {
-	    case 'Q':
-	    case 'K':
-	      success = demangle_qualified (work, mangled, declp, 0, 1);
-	      break;
-	    case 't':
-	      success = demangle_template (work, mangled, declp, 0, 1,
-					   1);
-	      break;
-	    default:
-	      if (ISDIGIT((unsigned char)*mangled[0]))
-		{
-		  n = consume_count(mangled);
-		  /* -1 is consume_count's failure value (it is tested
-		     that way throughout this file).  It must not reach
-		     string_appendn as a length, so fail here; the
-		     check after the switch then leaves the loop.  */
-		  if (n == -1)
-		    {
-		      success = 0;
-		      break;
-		    }
-		  /* We may be seeing a too-large size, or else a
-		     ".<digits>" indicating a static local symbol.  In
-		     any case, declare victory and move on; *don't* try
-		     to use n to allocate.  */
-		  if (n > (int) strlen (*mangled))
-		    {
-		      success = 1;
-		      break;
-		    }
-		}
-	      else
-		{
-		  /*n = strcspn (*mangled, cplus_markers);*/
-		  const char *check = *mangled;
-		  n = 0;
-		  while (*check)
-		    if (strchr (cplus_markers, *check++) == NULL)
-		      n++;
-		    else
-		      break;
-		}
-	      string_appendn (declp, *mangled, n);
-	      (*mangled) += n;
-	    }
-
-	  /* Each component must be followed either by the end of the
-	     string or directly by a marker, which becomes "::".  */
-	  p = strpbrk (*mangled, cplus_markers);
-	  if (success && ((p == NULL) || (p == *mangled)))
-	    {
-	      if (p != NULL)
-		{
-		  string_append (declp, SCOPE_STRING (work));
-		  (*mangled)++;
-		}
-	    }
-	  else
-	    {
-	      success = 0;
-	      break;
-	    }
-	}
-      if (success)
-	string_append (declp, " virtual table");
-    }
-  else if ((*mangled)[0] == '_'
-	   && (strchr("0123456789Qt", (*mangled)[1]) != NULL)
-	   && (p = strpbrk (*mangled, cplus_markers)) != NULL)
-    {
-      /* static data member, "_3foo$varname" for example */
-      (*mangled)++;
-      switch (**mangled)
-	{
-	case 'Q':
-	case 'K':
-	  success = demangle_qualified (work, mangled, declp, 0, 1);
-	  break;
-	case 't':
-	  success = demangle_template (work, mangled, declp, 0, 1, 1);
-	  break;
-	default:
-	  n = consume_count (mangled);
-	  if (n < 0 || n > (long) strlen (*mangled))
-	    {
-	      success = 0;
-	      break;
-	    }
-
-	  if (n > 10 && strncmp (*mangled, "_GLOBAL_", 8) == 0
-	      && (*mangled)[9] == 'N'
-	      && (*mangled)[8] == (*mangled)[10]
-	      && strchr (cplus_markers, (*mangled)[8]))
-	    {
-	      /* A member of the anonymous namespace.  There's information
-		 about what identifier or filename it was keyed to, but
-		 it's just there to make the mangled name unique; we just
-		 step over it.  */
-	      string_append (declp, "{anonymous}");
-	      (*mangled) += n;
-
-	      /* Now p points to the marker before the N, so we need to
-		 update it to the first marker after what we consumed.  */
-	      p = strpbrk (*mangled, cplus_markers);
-	      break;
-	    }
-
-	  string_appendn (declp, *mangled, n);
-	  (*mangled) += n;
-	}
-      if (success && (p == *mangled))
-	{
-	  /* Consumed everything up to the cplus_marker, append the
-	     variable name.  */
-	  (*mangled)++;
-	  string_append (declp, SCOPE_STRING (work));
-	  n = strlen (*mangled);
-	  string_appendn (declp, *mangled, n);
-	  (*mangled) += n;
-	}
-      else
-	{
-	  success = 0;
-	}
-    }
-  else if (strncmp (*mangled, "__thunk_", 8) == 0)
-    {
-      int delta;
-
-      (*mangled) += 8;
-      delta = consume_count (mangled);
-      if (delta == -1)
-	success = 0;
-      else if (**mangled == '\0')
-	{
-	  /* Nothing follows the delta.  Skipping the separator below
-	     would step past the terminating NUL, so reject the name.  */
-	  success = 0;
-	}
-      else
-	{
-	  /* Skip the one-character separator and demangle the target
-	     method as a name of its own.  */
-	  char *method = internal_cplus_demangle (work, ++*mangled);
-
-	  if (method)
-	    {
-	      char buf[50];
-	      sprintf (buf, "virtual function thunk (delta:%d) for ", -delta);
-	      string_append (declp, buf);
-	      string_append (declp, method);
-	      free (method);
-	      n = strlen (*mangled);
-	      (*mangled) += n;
-	    }
-	  else
-	    {
-	      success = 0;
-	    }
-	}
-    }
-  else if (strncmp (*mangled, "__t", 3) == 0
-	   && ((*mangled)[3] == 'i' || (*mangled)[3] == 'f'))
-    {
-      /* "__ti" / "__tf": P holds the suffix to append once the type
-	 itself has been demangled.  */
-      p = (*mangled)[3] == 'i' ? " type_info node" : " type_info function";
-      (*mangled) += 4;
-      switch (**mangled)
-	{
-	case 'Q':
-	case 'K':
-	  success = demangle_qualified (work, mangled, declp, 0, 1);
-	  break;
-	case 't':
-	  success = demangle_template (work, mangled, declp, 0, 1, 1);
-	  break;
-	default:
-	  success = do_type (work, mangled, declp);
-	  break;
-	}
-      /* The type must account for the whole remaining input.  */
-      if (success && **mangled != '\0')
-	success = 0;
-      if (success)
-	string_append (declp, p);
-    }
-  else
-    {
-      success = 0;
-    }
-  return (success);
-}
-
-/* Demangle the next NAMELENGTH characters of *MANGLED as a complete
-   mangled name of their own and append the outcome to RESULT.  If that
-   piece does not demangle, its raw text is appended instead.  *MANGLED
-   is advanced by NAMELENGTH in either case.  */
-
-static void
-recursively_demangle(work, mangled, result, namelength)
-     struct work_stuff *work;
-     const char **mangled;
-     string *result;
-     int namelength;
-{
-  char *piece;
-  char *demangled;
-
-  /* The demangler wants a NUL-terminated string, so work on a private
-     copy of the piece.  */
-  piece = (char *) xmalloc (namelength + 1);
-  memcpy (piece, *mangled, namelength);
-  piece[namelength] = '\000';
-
-  demangled = VG_(cplus_demangle) (piece, work->options);
-  if (demangled == NULL)
-    string_appendn (result, *mangled, namelength);
-  else
-    {
-      string_append (result, demangled);
-      free (demangled);
-    }
-
-  free (piece);
-  *mangled += namelength;
-}
-
-/*
-
-LOCAL FUNCTION
-
-	arm_special -- special handling of ARM/lucid mangled strings
-
-SYNOPSIS
-
-	static int
-	arm_special (const char **mangled,
-		     string *declp);
-
-
-DESCRIPTION
-
-	Process some special ARM style mangling forms that don't fit
-	the normal pattern.  For example:
-
-		__vtbl__3foo		(foo virtual table)
-		__vtbl__3foo__3bar	(bar::foo virtual table)
-
-	The name is validated in a first pass before anything is
-	written, so on the usual failure paths *MANGLED and DECLP are
-	left untouched.  Every length prefix must be parsable and must
-	not exceed the text that remains.
-
-	Returns 1 if a virtual table name was demangled into DECLP (with
-	*MANGLED advanced to the end of the input), 0 otherwise.
-
- */
-
-static int
-arm_special (mangled, declp)
-     const char **mangled;
-     string *declp;
-{
-  int n;
-  int success = 1;
-  const char *scan;
-
-  if (strncmp (*mangled, ARM_VTABLE_STRING, ARM_VTABLE_STRLEN) == 0)
-    {
-      /* Found a ARM style virtual table, get past ARM_VTABLE_STRING
-         and create the decl.  Note that we consume the entire mangled
-	 input string, which means that demangle_signature has no work
-	 to do.  */
-      scan = *mangled + ARM_VTABLE_STRLEN;
-      while (*scan != '\0')        /* first check it can be demangled */
-        {
-          n = consume_count (&scan);
-	  /* A count larger than what is left would move SCAN beyond
-	     the terminating NUL and make the tests below read outside
-	     the string, so treat it like an unparsable count.  */
-          if (n == -1
-	      || n > (long) strlen (scan))
-	    {
-	      return (0);           /* no good */
-	    }
-          scan += n;
-          if (scan[0] == '_' && scan[1] == '_')
-	    {
-	      scan += 2;
-	    }
-        }
-      (*mangled) += ARM_VTABLE_STRLEN;
-      while (**mangled != '\0')
-	{
-	  n = consume_count (mangled);
-          if (n == -1
-	      || n > (long) strlen (*mangled))
-	    return 0;
-	  /* Components are listed innermost first, so each one is
-	     prepended to build "outer::inner".  */
-	  string_prependn (declp, *mangled, n);
-	  (*mangled) += n;
-	  if ((*mangled)[0] == '_' && (*mangled)[1] == '_')
-	    {
-	      string_prepend (declp, "::");
-	      (*mangled) += 2;
-	    }
-	}
-      string_append (declp, " virtual table");
-    }
-  else
-    {
-      success = 0;
-    }
-  return (success);
-}
-
-/*
-
-LOCAL FUNCTION
-
-	demangle_qualified -- demangle 'Q' qualified name strings
-
-SYNOPSIS
-
-	static int
-	demangle_qualified (struct work_stuff *, const char **mangled,
-			    string *result, int isfuncname, int append);
-
-DESCRIPTION
-
-	Demangle a qualified name, such as "Q25Outer5Inner" which is
-	the mangled form of "Outer::Inner".  The demangled output is
-	prepended or appended to the result string according to the
-	state of the append flag.
-
-	If isfuncname is nonzero, then the qualified name we are building
-	is going to be used as a member function name, so if it is a
-	constructor or destructor function, append an appropriate
-	constructor or destructor name.  I.E. for the above example,
-	the result for use as a constructor is "Outer::Inner::Inner"
-	and the result for use as a destructor is "Outer::Inner::~Inner".
-
-	A leading 'K' instead of 'Q' reuses a qualified name remembered
-	earlier in work->ktypevec.
-
-	Returns 1 on success, 0 otherwise.  If the qualifier count (or
-	the 'K' index) is bad, RESULT is not modified.  If a later
-	qualifier fails, whatever was collected up to that point is
-	still added to RESULT before 0 is returned.
-
-BUGS
-
-	Numeric conversion is ASCII dependent (FIXME).
-
- */
-
-static int
-demangle_qualified (work, mangled, result, isfuncname, append)
-     struct work_stuff *work;
-     const char **mangled;
-     string *result;
-     int isfuncname;
-     int append;
-{
-  int qualifiers = 0;
-  int success = 1;
-  string temp;
-  string last_name;
-  int bindex = register_Btype (work);
-
-  /* We only make use of ISFUNCNAME if the entity is a constructor or
-     destructor.  */
-  isfuncname = (isfuncname
-		&& ((work->constructor & 1) || (work->destructor & 1)));
-
-  string_init (&temp);
-  string_init (&last_name);
-
-  if ((*mangled)[0] == 'K')
-    {
-    /* Squangling qualified name reuse */
-      int idx;
-      (*mangled)++;
-      idx = consume_count_with_underscores (mangled);
-      if (idx == -1 || idx >= work -> numk)
-        success = 0;
-      else
-        string_append (&temp, work -> ktypevec[idx]);
-    }
-  else
-    switch ((*mangled)[1])
-    {
-    case '_':
-      /* GNU mangled name with more than 9 classes.  The count is preceded
-	 by an underscore (to distinguish it from the <= 9 case) and followed
-	 by an underscore.  */
-      (*mangled)++;
-      qualifiers = consume_count_with_underscores (mangled);
-      if (qualifiers == -1)
-	success = 0;
-      break;
-
-    case '1':
-    case '2':
-    case '3':
-    case '4':
-    case '5':
-    case '6':
-    case '7':
-    case '8':
-    case '9':
-      /* The count is in a single digit.  */
-      qualifiers = (*mangled)[1] - '0';
-
-      /* If there is an underscore after the digit, skip it.  This is
-	 said to be for ARM-qualified names, but the ARM makes no
-	 mention of such an underscore.  Perhaps cfront uses one.  */
-      if ((*mangled)[2] == '_')
-	{
-	  (*mangled)++;
-	}
-      (*mangled) += 2;
-      break;
-
-    case '0':
-    default:
-      success = 0;
-    }
-
-  if (!success)
-    {
-      string_delete (&last_name);
-      string_delete (&temp);
-      return success;
-    }
-
-  /* Pick off the names and collect them in the temp buffer in the order
-     in which they are found, separated by '::'.  */
-
-  while (qualifiers-- > 0)
-    {
-      int remember_K = 1;
-      string_clear (&last_name);
-
-      if (*mangled[0] == '_')
-	(*mangled)++;
-
-      if (*mangled[0] == 't')
-	{
-	  /* Here we always append to TEMP since we will want to use
-	     the template name without the template parameters as a
-	     constructor or destructor name.  The appropriate
-	     (parameter-less) value is returned by demangle_template
-	     in LAST_NAME.  We do not remember the template type here,
-	     in order to match the G++ mangling algorithm.  */
-	  success = demangle_template(work, mangled, &temp,
-				      &last_name, 1, 0);
-	  if (!success)
-	    break;
-	}
-      else if (*mangled[0] == 'K')
-	{
-          int idx;
-          (*mangled)++;
-          idx = consume_count_with_underscores (mangled);
-          if (idx == -1 || idx >= work->numk)
-            success = 0;
-          else
-            string_append (&temp, work->ktypevec[idx]);
-          remember_K = 0;
-
-	  if (!success) break;
-	}
-      else
-	{
-	  if (EDG_DEMANGLING)
-            {
-	      int namelength;
- 	      /* Now recursively demangle the qualifier
- 	       * This is necessary to deal with templates in
- 	       * mangling styles like EDG */
-	      namelength = consume_count (mangled);
-	      /* recursively_demangle copies NAMELENGTH characters
-		 starting at *MANGLED without looking for the
-		 terminating NUL, so a count that exceeds the
-		 remaining input has to be rejected here.  */
-	      if (namelength == -1
-		  || namelength > (int) strlen (*mangled))
-		{
-		  success = 0;
-		  break;
-		}
- 	      recursively_demangle(work, mangled, &temp, namelength);
-            }
-          else
-            {
-	      string temp_last_name;
-	      string_init (&temp_last_name);
-              success = do_type (work, mangled, &temp_last_name);
-              if (!success)
-	        {
-		  string_delete (&temp_last_name);
-                  break;
-		}
-              string_appends (&temp, &temp_last_name);
-	      string_appends (&last_name, &temp_last_name);
-	      string_delete (&temp_last_name);
-            }
-	}
-
-      if (remember_K)
-	remember_Ktype (work, temp.b, LEN_STRING (&temp));
-
-      if (qualifiers > 0)
-	string_append (&temp, SCOPE_STRING (work));
-    }
-
-  remember_Btype (work, temp.b, LEN_STRING (&temp), bindex);
-
-  /* If we are using the result as a function name, we need to append
-     the appropriate '::' separated constructor or destructor name.
-     We do this here because this is the most convenient place, where
-     we already have a pointer to the name and the length of the name.  */
-
-  if (isfuncname)
-    {
-      string_append (&temp, SCOPE_STRING (work));
-      if (work -> destructor & 1)
-	string_append (&temp, "~");
-      string_appends (&temp, &last_name);
-    }
-
-  /* Now either prepend the temp buffer to the result, or append it,
-     depending upon the state of the append flag.  */
-
-  if (append)
-    string_appends (result, &temp);
-  else
-    {
-      if (!STRING_EMPTY (result))
-	string_append (&temp, SCOPE_STRING (work));
-      string_prepends (result, &temp);
-    }
-
-  string_delete (&last_name);
-  string_delete (&temp);
-  return (success);
-}
-
-/*
-
-LOCAL FUNCTION
-
-	get_count -- convert an ascii count to integer, consuming tokens
-
-SYNOPSIS
-
-	static int
-	get_count (const char **type, int *count)
-
-DESCRIPTION
-
-	Assume that *type points at a count in a mangled name; set
-	*count to its value, and set *type to the next character after
-	the count.  There are some weird rules in effect here.
-
-	If *type does not point at a string of digits, return zero.
-
-	If *type points at a string of digits followed by an
-	underscore, set *count to their value as an integer, advance
-	*type to point *after the underscore, and return 1.
-
-	If *type points at a string of digits not followed by an
-	underscore, consume only the first digit.  Set *count to its
-	value as an integer, leave *type pointing after that digit,
-	and return 1.
-
-        The excuse for this odd behavior: in the ARM and HP demangling
-        styles, a type can be followed by a repeat count of the form
-        `Nxy', where:
-
-        `x' is a single digit specifying how many additional copies
-            of the type to append to the argument list, and
-
-        `y' is one or more digits, specifying the zero-based index of
-            the first repeated argument in the list.  Yes, as you're
-            unmangling the name you can figure this out yourself, but
-            it's there anyway.
-
-        So, for example, in `bar__3fooFPiN51', the first argument is a
-        pointer to an integer (`Pi'), and then the next five arguments
-        are the same (`N5'), and the first repeat is the function's
-        second argument (`1').
-*/
-
-/* Read a count at *TYPE into *COUNT.  Returns 0, changing nothing, when
-   *TYPE does not start with a digit; returns 1 otherwise.  A run of
-   several digits is taken as one number only when an underscore ends
-   it (the underscore is consumed too); in every other case just the
-   first digit is consumed and stored.  */
-
-static int
-get_count (type, count)
-     const char **type;
-     int *count;
-{
-  const char *cursor;
-  int value;
-
-  if (!ISDIGIT ((unsigned char)**type))
-    return (0);
-
-  /* Provisionally accept the first digit on its own.  */
-  value = **type - '0';
-  *count = value;
-  (*type)++;
-
-  if (!ISDIGIT ((unsigned char)**type))
-    return (1);
-
-  /* More digits follow: accumulate them all, but commit the result only
-     if the run is closed by an underscore.  */
-  for (cursor = *type; ISDIGIT ((unsigned char)*cursor); cursor++)
-    {
-      value *= 10;
-      value += *cursor - '0';
-    }
-
-  if (*cursor == '_')
-    {
-      *type = cursor + 1;
-      *count = value;
-    }
-  return (1);
-}
-
-/* RESULT will be initialised here; it will be freed on failure.  The
-   value returned is really a type_kind_t.  */
-
-static int
-do_type (work, mangled, result)
-     struct work_stuff *work;
-     const char **mangled;
-     string *result;
-{
-  int n;
-  int done;
-  int success;
-  string decl;
-  const char *remembered_type;
-  int type_quals;
-  string btype;
-  type_kind_t tk = tk_none;
-
-  string_init (&btype);
-  string_init (&decl);
-  string_init (result);
-
-  done = 0;
-  success = 1;
-  while (success && !done)
-    {
-      int member;
-      switch (**mangled)
-	{
-
-	  /* A pointer type */
-	case 'P':
-	case 'p':
-	  (*mangled)++;
-	  if (! (work -> options & DMGL_JAVA))
-	    string_prepend (&decl, "*");
-	  if (tk == tk_none)
-	    tk = tk_pointer;
-	  break;
-
-	  /* A reference type */
-	case 'R':
-	  (*mangled)++;
-	  string_prepend (&decl, "&");
-	  if (tk == tk_none)
-	    tk = tk_reference;
-	  break;
-
-	  /* An array */
-	case 'A':
-	  {
-	    ++(*mangled);
-	    if (!STRING_EMPTY (&decl)
-		&& (decl.b[0] == '*' || decl.b[0] == '&'))
-	      {
-		string_prepend (&decl, "(");
-		string_append (&decl, ")");
-	      }
-	    string_append (&decl, "[");
-	    if (**mangled != '_')
-	      success = demangle_template_value_parm (work, mangled, &decl,
-						      tk_integral);
-	    if (**mangled == '_')
-	      ++(*mangled);
-	    string_append (&decl, "]");
-	    break;
-	  }
-
-	/* A back reference to a previously seen type */
-	case 'T':
-	  (*mangled)++;
-	  if (!get_count (mangled, &n) || n >= work -> ntypes)
-	    {
-	      success = 0;
-	    }
-	  else
-	    {
-	      remembered_type = work -> typevec[n];
-	      mangled = &remembered_type;
-	    }
-	  break;
-
-	  /* A function */
-	case 'F':
-	  (*mangled)++;
-	    if (!STRING_EMPTY (&decl)
-		&& (decl.b[0] == '*' || decl.b[0] == '&'))
-	    {
-	      string_prepend (&decl, "(");
-	      string_append (&decl, ")");
-	    }
-	  /* After picking off the function args, we expect to either find the
-	     function return type (preceded by an '_') or the end of the
-	     string.  */
-	  if (!demangle_nested_args (work, mangled, &decl)
-	      || (**mangled != '_' && **mangled != '\0'))
-	    {
-	      success = 0;
-	      break;
-	    }
-	  if (success && (**mangled == '_'))
-	    (*mangled)++;
-	  break;
-
-	case 'M':
-	case 'O':
-	  {
-	    type_quals = TYPE_UNQUALIFIED;
-
-	    member = **mangled == 'M';
-	    (*mangled)++;
-
-	    string_append (&decl, ")");
-
-	    /* We don't need to prepend `::' for a qualified name;
-	       demangle_qualified will do that for us.  */
-	    if (**mangled != 'Q')
-	      string_prepend (&decl, SCOPE_STRING (work));
-
-	    if (ISDIGIT ((unsigned char)**mangled))
-	      {
-		n = consume_count (mangled);
-		if (n == -1
-		    || (int) strlen (*mangled) < n)
-		  {
-		    success = 0;
-		    break;
-		  }
-		string_prependn (&decl, *mangled, n);
-		*mangled += n;
-	      }
-	    else if (**mangled == 'X' || **mangled == 'Y')
-	      {
-		string temp;
-		do_type (work, mangled, &temp);
-		string_prepends (&decl, &temp);
-	      }
-	    else if (**mangled == 't')
-	      {
-		string temp;
-		string_init (&temp);
-		success = demangle_template (work, mangled, &temp,
-					     NULL, 1, 1);
-		if (success)
-		  {
-		    string_prependn (&decl, temp.b, temp.p - temp.b);
-		    string_clear (&temp);
-		  }
-		else
-		  break;
-	      }
-	    else if (**mangled == 'Q')
-	      {
-		success = demangle_qualified (work, mangled, &decl,
-					      /*isfuncnam=*/0, 
-					      /*append=*/0);
-		if (!success)
-		  break;
-	      }
-	    else
-	      {
-		success = 0;
-		break;
-	      }
-
-	    string_prepend (&decl, "(");
-	    if (member)
-	      {
-		switch (**mangled)
-		  {
-		  case 'C':
-		  case 'V':
-		  case 'u':
-		    type_quals |= code_for_qualifier (**mangled);
-		    (*mangled)++;
-		    break;
-
-		  default:
-		    break;
-		  }
-
-		if (*(*mangled)++ != 'F')
-		  {
-		    success = 0;
-		    break;
-		  }
-	      }
-	    if ((member && !demangle_nested_args (work, mangled, &decl))
-		|| **mangled != '_')
-	      {
-		success = 0;
-		break;
-	      }
-	    (*mangled)++;
-	    if (! PRINT_ANSI_QUALIFIERS)
-	      {
-		break;
-	      }
-	    if (type_quals != TYPE_UNQUALIFIED)
-	      {
-		APPEND_BLANK (&decl);
-		string_append (&decl, qualifier_string (type_quals));
-	      }
-	    break;
-	  }
-        case 'G':
-	  (*mangled)++;
-	  break;
-
-	case 'C':
-	case 'V':
-	case 'u':
-	  if (PRINT_ANSI_QUALIFIERS)
-	    {
-	      if (!STRING_EMPTY (&decl))
-		string_prepend (&decl, " ");
-
-	      string_prepend (&decl, demangle_qualifier (**mangled));
-	    }
-	  (*mangled)++;
-	  break;
-	  /*
-	    }
-	    */
-
-	  /* fall through */
-	default:
-	  done = 1;
-	  break;
-	}
-    }
-
-  if (success) switch (**mangled)
-    {
-      /* A qualified name, such as "Outer::Inner".  */
-    case 'Q':
-    case 'K':
-      {
-        success = demangle_qualified (work, mangled, result, 0, 1);
-        break;
-      }
-
-    /* A back reference to a previously seen squangled type */
-    case 'B':
-      (*mangled)++;
-      if (!get_count (mangled, &n) || n >= work -> numb)
-	success = 0;
-      else
-	string_append (result, work->btypevec[n]);
-      break;
-
-    case 'X':
-    case 'Y':
-      /* A template parm.  We substitute the corresponding argument. */
-      {
-	int idx;
-
-	(*mangled)++;
-	idx = consume_count_with_underscores (mangled);
-
-	if (idx == -1
-	    || (work->tmpl_argvec && idx >= work->ntmpl_args)
-	    || consume_count_with_underscores (mangled) == -1)
-	  {
-	    success = 0;
-	    break;
-	  }
-
-	if (work->tmpl_argvec)
-	  string_append (result, work->tmpl_argvec[idx]);
-	else
-	  string_append_template_idx (result, idx);
-
-	success = 1;
-      }
-    break;
-
-    default:
-      success = demangle_fund_type (work, mangled, result);
-      if (tk == tk_none)
-	tk = (type_kind_t) success;
-      break;
-    }
-
-  if (success)
-    {
-      if (!STRING_EMPTY (&decl))
-	{
-	  string_append (result, " ");
-	  string_appends (result, &decl);
-	}
-    }
-  else
-    string_delete (result);
-  string_delete (&decl);
-
-  if (success)
-    /* Assume an integral type, if we're not sure.  */
-    return (int) ((tk == tk_none) ? tk_integral : tk);
-  else
-    return 0;
-}
-
-/* Given a pointer to a type string that represents a fundamental type
-   argument (int, long, unsigned int, etc) in TYPE, a pointer to the
-   string in which the demangled output is being built in RESULT, and
-   the WORK structure, decode the types and add them to the result.
-
-   For example:
-
-   	"Ci"	=>	"const int"
-	"Sl"	=>	"signed long"
-	"CUs"	=>	"const unsigned short"
-
-   The value returned is really a type_kind_t.  */
-
-static int
-demangle_fund_type (work, mangled, result)
-     struct work_stuff *work;
-     const char **mangled;
-     string *result;
-{
-  int done = 0;
-  int success = 1;
-  char buf[10];
-  /* DEC is only referenced by the commented-out sscanf/sprintf pair in
-     the 'I' case below.  */
-  unsigned int dec = 0;
-  string btype;
-  /* Kind reported to the caller on success; the cases below override it
-     for bool, char/wchar_t and the floating-point types.  */
-  type_kind_t tk = tk_integral;
-
-  string_init (&btype);
-
-  /* First pick off any type qualifiers.  There can be more than one.
-     The C/V/u qualifiers are prepended to RESULT, whereas "unsigned",
-     "signed" and "__complex" are appended.  */
-
-  while (!done)
-    {
-      switch (**mangled)
-	{
-	case 'C':
-	case 'V':
-	case 'u':
-	  if (PRINT_ANSI_QUALIFIERS)
-	    {
-              if (!STRING_EMPTY (result))
-                string_prepend (result, " ");
-	      string_prepend (result, demangle_qualifier (**mangled));
-	    }
-	  (*mangled)++;
-	  break;
-	case 'U':
-	  (*mangled)++;
-	  APPEND_BLANK (result);
-	  string_append (result, "unsigned");
-	  break;
-	case 'S': /* signed char only */
-	  (*mangled)++;
-	  APPEND_BLANK (result);
-	  string_append (result, "signed");
-	  break;
-	case 'J':
-	  (*mangled)++;
-	  APPEND_BLANK (result);
-	  string_append (result, "__complex");
-	  break;
-	default:
-	  done = 1;
-	  break;
-	}
-    }
-
-  /* Now pick off the fundamental type.  There can be only one.  */
-
-  switch (**mangled)
-    {
-      /* End of input or an '_': nothing is consumed or appended, and the
-	 call still counts as a success.  */
-    case '\0':
-    case '_':
-      break;
-    case 'v':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "void");
-      break;
-    case 'x':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "long long");
-      break;
-    case 'l':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "long");
-      break;
-    case 'i':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "int");
-      break;
-    case 's':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "short");
-      break;
-    case 'b':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "bool");
-      tk = tk_bool;
-      break;
-    case 'c':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "char");
-      tk = tk_char;
-      break;
-    case 'w':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "wchar_t");
-      tk = tk_char;
-      break;
-    case 'r':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "long double");
-      tk = tk_real;
-      break;
-    case 'd':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "double");
-      tk = tk_real;
-      break;
-    case 'f':
-      (*mangled)++;
-      APPEND_BLANK (result);
-      string_append (result, "float");
-      tk = tk_real;
-      break;
-    case 'G':
-      (*mangled)++;
-      if (!ISDIGIT ((unsigned char)**mangled))
-	{
-	  success = 0;
-	  break;
-	}
-      /* There is no break here: when a digit follows the 'G', control
-	 continues into case 'I', whose first statement steps over the
-	 character that was just tested.  */
-    case 'I':
-      (*mangled)++;
-      if (**mangled == '_')
-	{
-	  int i;
-	  (*mangled)++;
-	  /* Copy the text between the two underscores, stopping after
-	     sizeof (buf) - 1 characters so the terminator always fits.  */
-	  for (i = 0;
-	       i < (long) sizeof (buf) - 1 && **mangled && **mangled != '_';
-	       (*mangled)++, i++)
-	    buf[i] = **mangled;
-	  if (**mangled != '_')
-	    {
-	      success = 0;
-	      break;
-	    }
-	  buf[i] = '\0';
-	  (*mangled)++;
-	}
-      else
-	{
-	  /* Without the underscores the text is at most two characters.  */
-	  strncpy (buf, *mangled, 2);
-	  buf[2] = '\0';
-	  *mangled += min (strlen (*mangled), 2);
-	}
-      /* The text gathered in BUF above is not interpreted: the
-	 conversion below is disabled and BUF is overwritten with a fixed
-	 placeholder name before being appended.  */
-      /*sscanf (buf, "%x", &dec);
-      sprintf (buf, "int%u_t", dec);*/
-      sprintf (buf, "i_xx_t");
-      APPEND_BLANK (result);
-      string_append (result, buf);
-      break;
-
-      /* Note: the case above ends in a break, so the digit cases below
-	 are only reached directly from the switch.  */
-      /* An explicit type, such as "6mytype" or "7integer" */
-    case '0':
-    case '1':
-    case '2':
-    case '3':
-    case '4':
-    case '5':
-    case '6':
-    case '7':
-    case '8':
-    case '9':
-      {
-	/* The B-code slot is reserved before the name is demangled, and
-	   filled in only if demangling succeeds.  */
-        int bindex = register_Btype (work);
-        string loc_btype;
-        string_init (&loc_btype);
-        if (demangle_class_name (work, mangled, &loc_btype)) {
-          remember_Btype (work, loc_btype.b, LEN_STRING (&loc_btype), bindex);
-          APPEND_BLANK (result);
-          string_appends (result, &loc_btype);
-        }
-        else
-          success = 0;
-        string_delete (&loc_btype);
-        break;
-      }
-    case 't':
-      {
-	/* Whatever demangle_template left in BTYPE is appended to RESULT
-	   regardless of its return value; SUCCESS carries the status.  */
-        success = demangle_template (work, mangled, &btype, 0, 1, 1);
-        string_appends (result, &btype);
-        break;
-      }
-    default:
-      success = 0;
-      break;
-    }
-
-  string_delete (&btype);
-
-  /* A non-zero return is the type kind; zero means failure.  */
-  return success ? ((int) tk) : 0;
-}
-
-
-/* Demangle an integral template value parameter in the HP aCC encoding
-   (an extension of the ARM scheme).  **MANGLED must point at 'U'
-   (unsigned) or 'S' (signed).  That letter is followed by 'N' (negative)
-   or 'P' (positive) and a run of decimal digits, or by 'M', which stands
-   for -2^31.  The decoded text is appended to RESULT, with a trailing
-   "U" for unsigned values.  Returns 1 on success, 0 on malformed
-   input.  */
-
-static int
-do_hpacc_template_const_value (work, mangled, result)
-     struct work_stuff *work ATTRIBUTE_UNUSED;
-     const char **mangled;
-     string *result;
-{
-  int is_unsigned;
-  char sign_code;
-
-  /* The first letter selects signedness; anything else is not ours.  */
-  if (**mangled == 'U')
-    is_unsigned = 1;
-  else if (**mangled == 'S')
-    is_unsigned = 0;
-  else
-    return 0;
-  (*mangled)++;
-
-  sign_code = **mangled;
-
-  /* 'M' is a complete value on its own: the special case for -2^31.  */
-  if (sign_code == 'M')
-    {
-      string_append (result, "-2147483648");
-      (*mangled)++;
-      return 1;
-    }
-
-  if (sign_code != 'N' && sign_code != 'P')
-    return 0;
-  if (sign_code == 'N')
-    string_append (result, "-");
-  (*mangled)++;
-
-  /* At least one digit is required from here on.  */
-  if (!(ISDIGIT ((unsigned char)**mangled)))
-    return 0;
-
-  /* Only integral values occur as template parameters, so copying the
-     digits one at a time through CHAR_STR is sufficient.  */
-  do
-    {
-      char_str[0] = **mangled;
-      string_append (result, char_str);
-      (*mangled)++;
-    }
-  while (ISDIGIT ((unsigned char)**mangled));
-
-  if (is_unsigned)
-    string_append (result, "U");
-
-  /* FIXME? Some day we may have 64-bit (or larger :-) ) constants
-     with L or LL suffixes. pai/1997-09-03 */
-
-  return 1;
-}
-
-/* Handle a template's literal parameter for HP aCC (extension from ARM).
-   **MANGLED must point at the 'A' that introduces the literal; it is
-   followed by a length count and then that many characters naming the
-   entity.  On success "&" plus the name (demangled when the recursive
-   call succeeds, verbatim otherwise) is appended to RESULT, *MANGLED is
-   advanced past the literal and 1 is returned.  Returns 0 without
-   touching RESULT when the 'A' is missing, the count is absent or not
-   positive, or the count claims more characters than remain in the
-   mangled string.  */
-
-static int
-do_hpacc_template_literal (work, mangled, result)
-     struct work_stuff *work;
-     const char **mangled;
-     string *result;
-{
-  int literal_len = 0;
-  char * recurse;
-  char * recurse_dem;
-
-  if (**mangled != 'A')
-    return 0;
-
-  (*mangled)++;
-
-  literal_len = consume_count (mangled);
-
-  /* Reject a count that runs past the terminating NUL: the memcpy and
-     the pointer advance below would otherwise read beyond the end of
-     the mangled name.  */
-  if (literal_len <= 0
-      || (int) strlen (*mangled) < literal_len)
-    return 0;
-
-  /* Literal parameters are names of arrays, functions, etc.  and the
-     canonical representation uses the address operator */
-  string_append (result, "&");
-
-  /* Now recursively demangle the literal name.  It is copied out first
-     because the demangler needs a NUL-terminated string of its own.  */
-  recurse = (char *) xmalloc (literal_len + 1);
-  memcpy (recurse, *mangled, literal_len);
-  recurse[literal_len] = '\000';
-
-  recurse_dem = VG_(cplus_demangle) (recurse, work->options);
-
-  if (recurse_dem)
-    {
-      string_append (result, recurse_dem);
-      free (recurse_dem);
-    }
-  else
-    {
-      /* Not demanglable: emit the raw characters instead.  */
-      string_appendn (result, *mangled, literal_len);
-    }
-  (*mangled) += literal_len;
-  free (recurse);
-
-  return 1;
-}
-
-/* Copy an optionally signed run of decimal digits from *ARGS onto the
-   end of ARG, advancing *ARGS past everything consumed.  A leading '-'
-   is copied to ARG, a leading '+' is skipped.  Returns 1 if at least one
-   digit followed the optional sign and 0 otherwise; in the failing case
-   the sign has already been consumed (and a '-' already appended).  */
-
-static int
-snarf_numeric_literal (args, arg)
-     const char ** args;
-     string * arg;
-{
-  switch (**args)
-    {
-    case '-':
-      char_str[0] = '-';
-      string_append (arg, char_str);
-      (*args)++;
-      break;
-
-    case '+':
-      (*args)++;
-      break;
-
-    default:
-      break;
-    }
-
-  if (!ISDIGIT ((unsigned char)**args))
-    return 0;
-
-  /* The first digit is known to be present, so test after copying.  */
-  do
-    {
-      char_str[0] = **args;
-      string_append (arg, char_str);
-      (*args)++;
-    }
-  while (ISDIGIT ((unsigned char)**args));
-
-  return 1;
-}
-
-/* Demangle one argument starting at *MANGLED into RESULT.  RESULT is
-   expected to arrive uninitialized; it is initialized here before
-   anything else happens.  Returns 1 on success and 0 on failure.
-
-   Three situations are handled in turn: a pending repeat (the previous
-   argument is reissued without consuming input), an 'n' repeat code
-   (the count is read and the first repetition is produced by
-   recursing), and an ordinary type (demangled with do_type, saved as
-   the new previous argument and recorded with remember_type).  */
-
-static int
-do_arg (work, mangled, result)
-     struct work_stuff *work;
-     const char **mangled;
-     string *result;
-{
-  /* Start of this argument's encoding, kept for remember_type
-     (non-squangling type remembering).  */
-  const char *arg_begin = *mangled;
-  string decoded;
-
-  string_init (result);
-
-  /* Pending repeat: hand back a copy of the previous argument.  */
-  if (work->nrepeats > 0)
-    {
-      --work->nrepeats;
-
-      if (work->previous_argument != 0)
-        {
-          string_appends (result, work->previous_argument);
-          return 1;
-        }
-      return 0;
-    }
-
-  /* A squangling-style repeat code: 'n' followed by a count.  */
-  if (**mangled == 'n')
-    {
-      (*mangled)++;
-      work->nrepeats = consume_count(mangled);
-
-      /* This was not a repeat count after all.  */
-      if (work->nrepeats <= 0)
-        return 0;
-
-      /* Counts above 9 must be followed by an '_'.  */
-      if (work->nrepeats > 9)
-        {
-          if (**mangled != '_')
-            return 0;
-          (*mangled)++;
-        }
-
-      /* The repeat is set up; the recursive call takes the branch
-         above and emits the first repetition.  */
-      return do_arg (work, mangled, result);
-    }
-
-  /* The demangled text, not the mangled input, is what gets saved in
-     WORK->previous_argument: replaying the input would add further
-     entries to the back-referenceable type vector.  */
-  if (work->previous_argument == 0)
-    {
-      work->previous_argument = (string*) xmalloc (sizeof (string));
-      string_init (work->previous_argument);
-    }
-  else
-    string_clear (work->previous_argument);
-
-  string_init (&decoded);
-  if (!do_type (work, mangled, &decoded))
-    {
-      string_delete (&decoded);
-      return 0;
-    }
-  string_appends (work->previous_argument, &decoded);
-  string_delete (&decoded);
-
-  string_appends (result, work->previous_argument);
-
-  remember_type (work, arg_begin, *mangled - arg_begin);
-  return 1;
-}
-
-/* Append a NUL-terminated copy of the LEN characters at START to
-   WORK's type vector so later back references can find it.  Does
-   nothing while WORK->forgetting_types is non-zero.  The vector starts
-   with room for 3 entries and doubles whenever it is full.  */
-
-static void
-remember_type (work, start, len)
-     struct work_stuff *work;
-     const char *start;
-     int len;
-{
-  char *copy;
-
-  if (work->forgetting_types)
-    return;
-
-  if (work->ntypes >= work->typevec_size)
-    {
-      if (work->typevec_size != 0)
-        {
-          work->typevec_size *= 2;
-          work->typevec
-            = (char **) xrealloc ((char *)work->typevec,
-                                  sizeof (char *) * work->typevec_size);
-        }
-      else
-        {
-          work->typevec_size = 3;
-          work->typevec
-            = (char **) xmalloc (sizeof (char *) * work->typevec_size);
-        }
-    }
-
-  copy = xmalloc (len + 1);
-  memcpy (copy, start, len);
-  copy[len] = '\0';
-
-  work->typevec[work->ntypes] = copy;
-  work->ntypes++;
-}
-
-
-/* Remember a K type class qualifier: append a NUL-terminated copy of
-   the LEN characters at START to WORK's K-type vector.  The vector
-   starts with room for 5 entries and doubles whenever it is full.  */
-static void
-remember_Ktype (work, start, len)
-     struct work_stuff *work;
-     const char *start;
-     int len;
-{
-  char *copy;
-
-  if (work->numk >= work->ksize)
-    {
-      if (work->ksize != 0)
-        {
-          work->ksize *= 2;
-          work->ktypevec
-            = (char **) xrealloc ((char *)work->ktypevec,
-                                  sizeof (char *) * work->ksize);
-        }
-      else
-        {
-          work->ksize = 5;
-          work->ktypevec
-            = (char **) xmalloc (sizeof (char *) * work->ksize);
-        }
-    }
-
-  copy = xmalloc (len + 1);
-  memcpy (copy, start, len);
-  copy[len] = '\0';
-
-  work->ktypevec[work->numk] = copy;
-  work->numk++;
-}
-
-/* Reserve the next B-code slot in WORK and return its index.  The slot
-   is set to NULL here and filled in later by remember_Btype.  B codes
-   are handed out when a type is first seen rather than when it is
-   complete, so map<temp<char> > registers map<temp<char> > as B0 and
-   temp<char> as B1.  The vector starts with room for 5 entries and
-   doubles whenever it is full.  */
-
-static int
-register_Btype (work)
-     struct work_stuff *work;
-{
-  int slot;
-
-  if (work->numb >= work->bsize)
-    {
-      if (work->bsize != 0)
-        {
-          work->bsize *= 2;
-          work->btypevec
-            = (char **) xrealloc ((char *)work->btypevec,
-                                  sizeof (char *) * work->bsize);
-        }
-      else
-        {
-          work->bsize = 5;
-          work->btypevec
-            = (char **) xmalloc (sizeof (char *) * work->bsize);
-        }
-    }
-
-  slot = work->numb;
-  work->numb++;
-  work->btypevec[slot] = NULL;
-  return slot;
-}
-
-/* Fill B-code slot IND of WORK (as handed out by register_Btype) with
-   a freshly allocated, NUL-terminated copy of the LEN characters at
-   START.  Whatever the slot held before is overwritten, not freed.  */
-
-static void
-remember_Btype (work, start, len, ind)
-     struct work_stuff *work;
-     const char *start;
-     int len, ind;
-{
-  char *copy = xmalloc (len + 1);
-
-  memcpy (copy, start, len);
-  copy[len] = '\0';
-  work->btypevec[ind] = copy;
-}
-
-/* Lose all the info related to B and K type codes: free every stored
-   string, newest first, and bring both counts back to zero.  The
-   vectors themselves are kept.  */
-static void
-forget_B_and_K_types (work)
-     struct work_stuff *work;
-{
-  char **slot;
-
-  while (work->numk > 0)
-    {
-      work->numk--;
-      slot = &work->ktypevec[work->numk];
-      if (*slot != NULL)
-        {
-          free (*slot);
-          *slot = NULL;
-        }
-    }
-
-  while (work->numb > 0)
-    {
-      work->numb--;
-      slot = &work->btypevec[work->numb];
-      if (*slot != NULL)
-        {
-          free (*slot);
-          *slot = NULL;
-        }
-    }
-}
-/* Forget the remembered types, but not the type vector itself: each
-   stored string is freed, newest first, and the count drops to zero.  */
-
-static void
-forget_types (work)
-     struct work_stuff *work;
-{
-  char **slot;
-
-  while (work->ntypes > 0)
-    {
-      work->ntypes--;
-      slot = &work->typevec[work->ntypes];
-      if (*slot != NULL)
-        {
-          free (*slot);
-          *slot = NULL;
-        }
-    }
-}
-
-/* Process the argument list part of the signature, after any class spec
-   has been consumed, as well as the first 'F' character (if any).  For
-   example:
-
-   "__als__3fooRT0"		=>	process "RT0"
-   "complexfunc5__FPFPc_PFl_i"	=>	process "PFPc_PFl_i"
-
-   DECLP must be already initialised, usually non-empty.  It won't be freed
-   on failure.
-
-   Note that g++ differs significantly from ARM and lucid style mangling
-   with regards to references to previously seen types.  For example, given
-   the source fragment:
-
-     class foo {
-       public:
-       foo::foo (int, foo &ia, int, foo &ib, int, foo &ic);
-     };
-
-     foo::foo (int, foo &ia, int, foo &ib, int, foo &ic) { ia = ib = ic; }
-     void foo (int, foo &ia, int, foo &ib, int, foo &ic) { ia = ib = ic; }
-
-   g++ produces the names:
-
-     __3fooiRT0iT2iT2
-     foo__FiR3fooiT1iT1
-
-   while lcc (and presumably other ARM style compilers as well) produces:
-
-     foo__FiR3fooT1T2T1T2
-     __ct__3fooFiR3fooT1T2T1T2
-
-   Note that g++ bases its type numbers starting at zero and counts all
-   previously seen types, while lucid/ARM bases its type numbers starting
-   at one and only considers types after it has seen the 'F' character
-   indicating the start of the function args.  For lucid/ARM style, we
-   account for this difference by discarding any previously seen types when
-   we see the 'F' character, and subtracting one from the type number
-   reference.
-
- */
-
-static int
-demangle_args (work, mangled, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-{
-  string arg;		/* Text of the argument currently being demangled.  */
-  int need_comma = 0;	/* Set once the first argument has been emitted.  */
-  int r;		/* Repeat count of an 'N' code; 1 for a 'T' code.  */
-  int t;		/* Index into WORK->typevec named by 'N'/'T'.  */
-  const char *tem;	/* Cursor over a remembered type string.  */
-  char temptype;	/* The 'N' or 'T' that introduced a back reference.  */
-
-  /* Output is only produced when PRINT_ARG_TYPES is set; the input is
-     consumed either way.  */
-  if (PRINT_ARG_TYPES)
-    {
-      string_append (declp, "(");
-      if (**mangled == '\0')
-	{
-	  string_append (declp, "void");
-	}
-    }
-
-  /* Keep going until '_', 'e' or the end of the string is reached, and
-     also for as long as a repeat set up by do_arg is still pending.  */
-  while ((**mangled != '_' && **mangled != '\0' && **mangled != 'e')
-	 || work->nrepeats > 0)
-    {
-      if ((**mangled == 'N') || (**mangled == 'T'))
-	{
-	  temptype = *(*mangled)++;
-
-	  /* 'N' carries an explicit repeat count; 'T' means once.  */
-	  if (temptype == 'N')
-	    {
-	      if (!get_count (mangled, &r))
-		{
-		  return (0);
-		}
-	    }
-	  else
-	    {
-	      r = 1;
-	    }
-          if ((HP_DEMANGLING || ARM_DEMANGLING || EDG_DEMANGLING) && work -> ntypes >= 10)
-            {
-              /* If we have 10 or more types we might have more than a 1 digit
-                 index so we'll have to consume the whole count here. This
-                 will lose if the next thing is a type name preceded by a
-                 count but it's impossible to demangle that case properly
-                 anyway. Eg if we already have 12 types is T12Pc "(..., type1,
-                 Pc, ...)"  or "(..., type12, char *, ...)" */
-              if ((t = consume_count(mangled)) <= 0)
-                {
-                  return (0);
-                }
-            }
-          else
-	    {
-	      if (!get_count (mangled, &t))
-	    	{
-	          return (0);
-	    	}
-	    }
-	  /* These styles number remembered types from one (see the
-	     comment above this function), so convert to a zero-based
-	     index.  */
-	  if (LUCID_DEMANGLING || ARM_DEMANGLING || HP_DEMANGLING || EDG_DEMANGLING)
-	    {
-	      t--;
-	    }
-	  /* Validate the type index.  Protect against illegal indices from
-	     malformed type strings.  */
-	  if ((t < 0) || (t >= work -> ntypes))
-	    {
-	      return (0);
-	    }
-	  /* Emit the remembered type R times.  A pending repeat is tested
-	     first and, while it lasts, R is not decremented.  */
-	  while (work->nrepeats > 0 || --r >= 0)
-	    {
-	      /* Demangle from a private cursor so *MANGLED stays put.  */
-	      tem = work -> typevec[t];
-	      if (need_comma && PRINT_ARG_TYPES)
-		{
-		  string_append (declp, ", ");
-		}
-	      if (!do_arg (work, &tem, &arg))
-		{
-		  return (0);
-		}
-	      if (PRINT_ARG_TYPES)
-		{
-		  string_appends (declp, &arg);
-		}
-	      string_delete (&arg);
-	      need_comma = 1;
-	    }
-	}
-      else
-	{
-	  /* An ordinary argument, demangled straight from the input.  */
-	  if (need_comma && PRINT_ARG_TYPES)
-	    string_append (declp, ", ");
-	  if (!do_arg (work, mangled, &arg))
-	    {
-	      string_delete (&arg);
-	      return (0);
-	    }
-	  if (PRINT_ARG_TYPES)
-	    string_appends (declp, &arg);
-	  string_delete (&arg);
-	  need_comma = 1;
-	}
-    }
-
-  /* A trailing 'e' is consumed and printed as an ellipsis.  */
-  if (**mangled == 'e')
-    {
-      (*mangled)++;
-      if (PRINT_ARG_TYPES)
-	{
-	  if (need_comma)
-	    {
-	      string_append (declp, ",");
-	    }
-	  string_append (declp, "...");
-	}
-    }
-
-  if (PRINT_ARG_TYPES)
-    {
-      string_append (declp, ")");
-    }
-  return (1);
-}
-
-/* Demangle the argument list of a function or method pointer or
-   reference, as opposed to a top-level declaration.  This wraps
-   demangle_args: type remembering is suppressed for the duration of the
-   call and the repeat state (previous argument and repeat count) is
-   saved, cleared and restored around it.  Returns what demangle_args
-   returned.  */
-
-static int
-demangle_nested_args (work, mangled, declp)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-{
-  /* For the repeat codes used with -fsquangling, the enclosing list's
-     last argument and outstanding repeat count must survive the nested
-     list.  */
-  string* outer_previous = work->previous_argument;
-  int outer_nrepeats = work->nrepeats;
-  int ok;
-
-  /* The G++ name-mangling algorithm does not remember types on nested
-     argument lists, unless -fsquangling is used, and in that case the
-     type vector updated by remember_type is not used.  So, we turn
-     off remembering of types here.  */
-  ++work->forgetting_types;
-
-  work->previous_argument = 0;
-  work->nrepeats = 0;
-
-  ok = demangle_args (work, mangled, declp);
-
-  /* Discard whatever previous argument the nested list left behind.  */
-  if (work->previous_argument)
-    {
-      string_delete (work->previous_argument);
-      free ((char*) work->previous_argument);
-    }
-
-  /* Put the enclosing list's state back.  */
-  work->previous_argument = outer_previous;
-  --work->forgetting_types;
-  work->nrepeats = outer_nrepeats;
-
-  return ok;
-}
-
-/* Copy the raw function name (the characters from *MANGLED up to SCAN)
-   onto DECLP, advance *MANGLED past the two-character separator at SCAN,
-   and then, if the name is a recognised operator or conversion-operator
-   encoding, replace DECLP's contents with the "operator..." spelling.
-   For the LUCID/ARM/HP/EDG styles the names "__ct" and "__dt" are
-   instead recorded in WORK->constructor / WORK->destructor and DECLP is
-   cleared.  A name that matches nothing is left as copied.  */
-
-static void
-demangle_function_name (work, mangled, declp, scan)
-     struct work_stuff *work;
-     const char **mangled;
-     string *declp;
-     const char *scan;
-{
-  size_t i;
-  string type;
-  const char *tem;
-
-  string_appendn (declp, (*mangled), scan - (*mangled));
-  /* Store a terminator so the strcmp and b[N] tests below can treat the
-     name as a C string.  P is not advanced, so the NUL does not count
-     towards the string's length.  */
-  string_need (declp, 1);
-  *(declp -> p) = '\0';
-
-  /* Consume the function name, including the "__" separating the name
-     from the signature.  We are guaranteed that SCAN points to the
-     separator.  */
-
-  (*mangled) = scan + 2;
-  /* We may be looking at an instantiation of a template function:
-     foo__Xt1t2_Ft3t4, where t1, t2, ... are template arguments and a
-     following _F marks the start of the function arguments.  Handle
-     the template arguments first. */
-
-  if (HP_DEMANGLING && (**mangled == 'X'))
-    {
-      demangle_arm_hp_template (work, mangled, 0, declp);
-      /* This leaves MANGLED pointing to the 'F' marking func args */
-    }
-
-  if (LUCID_DEMANGLING || ARM_DEMANGLING || HP_DEMANGLING || EDG_DEMANGLING)
-    {
-
-      /* See if we have an ARM style constructor or destructor operator.
-	 If so, then just record it, clear the decl, and return.
-	 We can't build the actual constructor/destructor decl until later,
-	 when we recover the class name from the signature.  */
-
-      if (strcmp (declp -> b, "__ct") == 0)
-	{
-	  work -> constructor += 1;
-	  string_clear (declp);
-	  return;
-	}
-      else if (strcmp (declp -> b, "__dt") == 0)
-	{
-	  work -> destructor += 1;
-	  string_clear (declp);
-	  return;
-	}
-    }
-
-  /* "op" followed by one of the cplus_markers characters: an operator
-     name spelled out after the marker.  */
-  if (declp->p - declp->b >= 3
-      && declp->b[0] == 'o'
-      && declp->b[1] == 'p'
-      && strchr (cplus_markers, declp->b[2]) != NULL)
-    {
-      /* see if it's an assignment expression */
-      if (declp->p - declp->b >= 10 /* op$assign_ */
-	  && memcmp (declp->b + 3, "assign_", 7) == 0)
-	{
-	  /* Look up what follows the 10-character "op$assign_" prefix
-	     and emit it as a compound assignment operator.  */
-	  for (i = 0; i < ARRAY_SIZE (optable); i++)
-	    {
-	      int len = declp->p - declp->b - 10;
-	      if ((int) strlen (optable[i].in) == len
-		  && memcmp (optable[i].in, declp->b + 10, len) == 0)
-		{
-		  string_clear (declp);
-		  string_append (declp, "operator");
-		  string_append (declp, optable[i].out);
-		  string_append (declp, "=");
-		  break;
-		}
-	    }
-	}
-      else
-	{
-	  /* Look up what follows the 3-character "op$" prefix.  */
-	  for (i = 0; i < ARRAY_SIZE (optable); i++)
-	    {
-	      int len = declp->p - declp->b - 3;
-	      if ((int) strlen (optable[i].in) == len
-		  && memcmp (optable[i].in, declp->b + 3, len) == 0)
-		{
-		  string_clear (declp);
-		  string_append (declp, "operator");
-		  string_append (declp, optable[i].out);
-		  break;
-		}
-	    }
-	}
-    }
-  /* "type" followed by a marker character and an encoded type.  */
-  else if (declp->p - declp->b >= 5 && memcmp (declp->b, "type", 4) == 0
-	   && strchr (cplus_markers, declp->b[4]) != NULL)
-    {
-      /* type conversion operator */
-      tem = declp->b + 5;
-      /* If the type does not demangle, DECLP keeps the raw name.  */
-      if (do_type (work, &tem, &type))
-	{
-	  string_clear (declp);
-	  string_append (declp, "operator ");
-	  string_appends (declp, &type);
-	  string_delete (&type);
-	}
-    }
-  /* "__op" followed by an encoded type.  */
-  else if (declp->b[0] == '_' && declp->b[1] == '_'
-	   && declp->b[2] == 'o' && declp->b[3] == 'p')
-    {
-      /* ANSI.  */
-      /* type conversion operator.  */
-      tem = declp->b + 4;
-      if (do_type (work, &tem, &type))
-	{
-	  string_clear (declp);
-	  string_append (declp, "operator ");
-	  string_appends (declp, &type);
-	  string_delete (&type);
-	}
-    }
-  /* "__" followed by at least two lower-case letters.  */
-  else if (declp->b[0] == '_' && declp->b[1] == '_'
-	   && ISLOWER((unsigned char)declp->b[2])
-	   && ISLOWER((unsigned char)declp->b[3]))
-    {
-      if (declp->b[4] == '\0')
-	{
-	  /* Operator.  The name is exactly "__" plus a two-letter code.  */
-	  for (i = 0; i < ARRAY_SIZE (optable); i++)
-	    {
-	      if (strlen (optable[i].in) == 2
-		  && memcmp (optable[i].in, declp->b + 2, 2) == 0)
-		{
-		  string_clear (declp);
-		  string_append (declp, "operator");
-		  string_append (declp, optable[i].out);
-		  break;
-		}
-	    }
-	}
-      else
-	{
-	  /* Only a three-letter code that starts with 'a' is looked up.  */
-	  if (declp->b[2] == 'a' && declp->b[5] == '\0')
-	    {
-	      /* Assignment.  */
-	      for (i = 0; i < ARRAY_SIZE (optable); i++)
-		{
-		  if (strlen (optable[i].in) == 3
-		      && memcmp (optable[i].in, declp->b + 2, 3) == 0)
-		    {
-		      string_clear (declp);
-		      string_append (declp, "operator");
-		      string_append (declp, optable[i].out);
-		      break;
-		    }
-		}
-	    }
-	}
-    }
-}
-
-/* a mini string-handling package */
-
-/* Make sure S has room for at least N more characters after its
-   current end.  An unallocated string gets a buffer of max (N, 32)
-   bytes; an allocated one that is too small is reallocated to twice
-   the sum of its current length and N, keeping its contents.  */
-
-static void
-string_need (s, n)
-     string *s;
-     int n;
-{
-  int used;
-
-  if (s->b == NULL)
-    {
-      /* First allocation.  */
-      if (n < 32)
-        n = 32;
-      s->b = xmalloc (n);
-      s->p = s->b;
-      s->e = s->b + n;
-      return;
-    }
-
-  /* Already enough room.  */
-  if (s->e - s->p >= n)
-    return;
-
-  /* Remember the length as an offset: the buffer may move.  */
-  used = s->p - s->b;
-  n = (n + used) * 2;
-  s->b = xrealloc (s->b, n);
-  s->p = s->b + used;
-  s->e = s->b + n;
-}
-
-/* Free S's buffer, if it has one, and return S to the unallocated
-   state.  Calling this on an unallocated string does nothing.  */
-
-static void
-string_delete (s)
-     string *s;
-{
-  if (s->b == NULL)
-    return;
-
-  free (s->b);
-  s->b = NULL;
-  s->p = NULL;
-  s->e = NULL;
-}
-
-/* Put S into the unallocated state; nothing is freed.  */
-
-static void
-string_init (s)
-     string *s;
-{
-  s->b = NULL;
-  s->p = NULL;
-  s->e = NULL;
-}
-
-/* Make S empty by moving its end pointer back to the start of the
-   buffer.  The buffer itself is neither freed nor changed.  */
-
-static void
-string_clear (s)
-     string *s;
-{
-  s->p = s->b;
-}
-
-/* Compiled out by the #if 0 below.  */
-#if 0
-
-/* Return non-zero if S holds no characters, i.e. its start and end
-   pointers coincide.  */
-
-static int
-string_empty (s)
-     string *s;
-{
-  return (s->b == s->p);
-}
-
-#endif
-
-/* Append the NUL-terminated string S to P, growing P as required.
-   A NULL or empty S leaves P untouched.  */
-
-static void
-string_append (p, s)
-     string *p;
-     const char *s;
-{
-  int len;
-
-  if (s != NULL && *s != '\0')
-    {
-      len = strlen (s);
-      string_need (p, len);
-      memcpy (p->p, s, len);
-      p->p += len;
-    }
-}
-
-/* Append the contents of string S to string P, growing P as required.
-   An empty S leaves P untouched.  */
-
-static void
-string_appends (p, s)
-     string *p, *s;
-{
-  int len;
-
-  if (s->b == s->p)
-    return;
-
-  len = s->p - s->b;
-  string_need (p, len);
-  memcpy (p->p, s->b, len);
-  p->p += len;
-}
-
-/* Append the first N characters at S to P, growing P as required.
-   A zero N leaves P untouched.  */
-
-static void
-string_appendn (p, s, n)
-     string *p;
-     const char *s;
-     int n;
-{
-  if (n == 0)
-    return;
-
-  string_need (p, n);
-  memcpy (p->p, s, n);
-  p->p += n;
-}
-
-/* Insert the NUL-terminated string S at the front of P.  A NULL or
-   empty S leaves P untouched.  */
-
-static void
-string_prepend (p, s)
-     string *p;
-     const char *s;
-{
-  if (s == NULL || *s == '\0')
-    return;
-
-  string_prependn (p, s, strlen (s));
-}
-
-/* Insert the contents of string S at the front of string P.  An empty
-   S leaves P untouched.  */
-
-static void
-string_prepends (p, s)
-     string *p, *s;
-{
-  if (s->b == s->p)
-    return;
-
-  string_prependn (p, s->b, s->p - s->b);
-}
-
-/* Insert the first N characters at S at the front of P, growing P as
-   required.  A zero N leaves P untouched.  */
-
-static void
-string_prependn (p, s, n)
-     string *p;
-     const char *s;
-     int n;
-{
-  int i;
-
-  if (n == 0)
-    return;
-
-  string_need (p, n);
-
-  /* Shift the existing characters up by N, working from the last one
-     down so that nothing is overwritten before it has been moved.  */
-  for (i = (int) (p->p - p->b) - 1; i >= 0; i--)
-    p->b[i + n] = p->b[i];
-
-  memcpy (p->b, s, n);
-  p->p += n;
-}
-
-/* Append "T" followed by the decimal value of IDX to S.  */
-
-static void
-string_append_template_idx (s, idx)
-     string *s;
-     int idx;
-{
-  /* One byte for the 'T' on top of INTBUF_SIZE for the number.  */
-  char label[INTBUF_SIZE + 1];
-
-  sprintf(label, "T%d", idx);
-  string_append (s, label);
-}
-
-/* To generate a standalone demangler program for testing purposes,
-   just compile and link this file with -DMAIN and libiberty.a.  When
-   run, it demangles each command line arg, or each stdin string, and
-   prints the result on stdout.  */
-
-#ifdef MAIN
-
-#include "getopt.h"
-
-/* Name substituted into the usage message printed by usage ().  */
-static const char *program_name;
-static const char *program_version = VERSION;
-/* Demangling options handed to cplus_demangle by demangle_it ().  */
-static int flags = DMGL_PARAMS | DMGL_ANSI | DMGL_VERBOSE;
-
-/* Forward declarations for the helpers of the standalone program.  */
-static void demangle_it PARAMS ((char *));
-static void usage PARAMS ((FILE *, int)) ATTRIBUTE_NORETURN;
-static void fatal PARAMS ((const char *)) ATTRIBUTE_NORETURN;
-static void print_demangler_list PARAMS ((FILE *));
-
-/* Print the demangled form of MANGLED_NAME on stdout followed by a
-   newline, or MANGLED_NAME itself when it cannot be demangled.  */
-
-static void
-demangle_it (mangled_name)
-     char *mangled_name;
-{
-  /* For command line args, also try to demangle type encodings.  */
-  char *demangled = cplus_demangle (mangled_name, flags | DMGL_TYPES);
-
-  if (demangled != NULL)
-    {
-      printf ("%s\n", demangled);
-      free (demangled);
-      return;
-    }
-
-  printf ("%s\n", mangled_name);
-}
-
-/* Write the names of all entries of libiberty_demanglers to STREAM as
-   a brace-enclosed, comma-separated list.  The table is walked until
-   the entry whose style is unknown_demangling; the first entry is
-   printed without being tested against that sentinel.  */
-
-static void
-print_demangler_list (stream)
-     FILE *stream;
-{
-  const struct demangler_engine *entry = libiberty_demanglers;
-
-  fprintf (stream, "{%s", entry->demangling_style_name);
-  ++entry;
-
-  while (entry->demangling_style != unknown_demangling)
-    {
-      fprintf (stream, ",%s", entry->demangling_style_name);
-      ++entry;
-    }
-
-  fprintf (stream, "}");
-}
-
-/* Print the command-line synopsis to STREAM, then exit with STATUS.
-   Does not return.  */
-
-static void
-usage (stream, status)
-     FILE *stream;
-     int status;
-{
-  fprintf (stream,
-           "Usage: %s [-_] [-n] [--strip-underscores] [--no-strip-underscores] \n",
-           program_name);
-
-  /* The short and the long form of the style option take the same set
-     of values, so the list is printed once for each.  */
-  fprintf (stream, "       [-s ");
-  print_demangler_list (stream);
-  fprintf (stream, "]\n");
-
-  fprintf (stream, "       [--format ");
-  print_demangler_list (stream);
-  fprintf (stream, "]\n");
-
-  fprintf (stream, "       [--help] [--version] [arg...]\n");
-  exit (status);
-}
-
-/* Fixed-size buffer of MBUF_SIZE bytes.  NOTE(review): presumably holds
-   one symbol read from stdin; its use lies outside this part of the
-   file -- confirm.  */
-#define MBUF_SIZE 32767
-char mbuffer[MBUF_SIZE];
-
-/* Defined in the automatically-generated underscore.c.  */
-extern int prepends_underscore;
-
-int strip_underscore = 0;
-
-/* Long option names and the short option character each one maps to;
-   the all-zero entry terminates the table.  */
-static const struct option long_options[] = {
-  {"strip-underscores", no_argument, 0, '_'},
-  {"format", required_argument, 0, 's'},
-  {"help", no_argument, 0, 'h'},
-  {"no-strip-underscores", no_argument, 0, 'n'},
-  {"version", no_argument, 0, 'v'},
-  {0, no_argument, 0, 0}
-};
-
-/* More 'friendly' abort: reports "Internal gcc abort." through fatal ().
-   config.h can #define abort fancy_abort if you like that sort of thing.  */
-
-void
-fancy_abort ()
-{
-  fatal ("Internal gcc abort.");
-}
-
-
-static const char *
-standard_symbol_characters PARAMS ((void));
-
-static const char *
-hp_symbol_characters PARAMS ((void));
-
-static const char *
-gnu_v3_symbol_characters PARAMS ((void));
-
-/* Return the string of non-alnum characters that may occur 
-   as a valid symbol component, in the standard assembler symbol
-   syntax.  */
-
-static const char *
-standard_symbol_characters ()
-{
-  return "_$.";
-}
-
-
-/* Return the string of non-alnum characters that may occur
-   as a valid symbol name component in an HP object file.
-
-   Note that, since HP's compiler generates object code straight from
-   C++ source, without going through an assembler, its mangled
-   identifiers can use all sorts of characters that no assembler would
-   tolerate, so the alphabet this function creates is a little odd.
-   Here are some sample mangled identifiers offered by HP:
-
-	typeid*__XT24AddressIndExpClassMember_
-	[Vftptr]key:__dt__32OrdinaryCompareIndExpClassMemberFv
-	__ct__Q2_9Elf64_Dyn18{unnamed.union.#1}Fv
-
-   This still seems really weird to me, since nowhere else in this
-   file is there anything to recognize curly brackets, parens, etc.
-   I've talked with Srikanth <srikanth@cup.hp.com>, and he assures me
-   this is right, but I still strongly suspect that there's a
-   misunderstanding here.
-
-   If we decide it's better for c++filt to use HP's assembler syntax
-   to scrape identifiers out of its input, here's the definition of
-   the symbol name syntax from the HP assembler manual:
-
-       Symbols are composed of uppercase and lowercase letters, decimal
-       digits, dollar symbol, period (.), ampersand (&), pound sign(#) and
-       underscore (_). A symbol can begin with a letter, digit underscore or
-       dollar sign. If a symbol begins with a digit, it must contain a
-       non-digit character.
-
-   So have fun.  */
-static const char *
-hp_symbol_characters ()
-{
-  return "_$.<>#,*&[]:(){}";
-}
-
-
-/* Return the string of non-alnum characters that may occur 
-   as a valid symbol component in the GNU C++ V3 ABI mangling
-   scheme.  */
-
-static const char *
-gnu_v3_symbol_characters ()
-{
-  return "_$.";
-}
-
-
-extern int main PARAMS ((int, char **));
-/* Demangle every non-option argument, or filter stdin when there is none.  */
-int
-main (argc, argv)
-     int argc;
-     char **argv;
-{
-  char *result;
-  int c;
-  const char *valid_symbols;
-  enum demangling_styles style = auto_demangling;
-
-  program_name = argv[0];
-
-  strip_underscore = prepends_underscore;
-
-  while ((c = getopt_long (argc, argv, "_ns:", long_options, (int *) 0)) != EOF)
-    {
-      switch (c)
-	{
-	case '?':
-	  usage (stderr, 1);
-	  break;
-	case 'h':
-	  usage (stdout, 0);	/* usage exits, so 'n' below is not reached.  */
-	case 'n':
-	  strip_underscore = 0;
-	  break;
-	case 'v':
-	  printf ("GNU %s (C++ demangler), version %s\n", program_name, program_version);
-	  return (0);
-	case '_':
-	  strip_underscore = 1;
-	  break;
-	case 's':
-	  {
-	    style = cplus_demangle_name_to_style (optarg);
-	    if (style == unknown_demangling)
-	      {
-		fprintf (stderr, "%s: unknown demangling style `%s'\n",
-			 program_name, optarg);
-		return (1);
-	      }
-	    else
-	      cplus_demangle_set_style (style);
-	  }
-	  break;
-	}
-    }
-
-  if (optind < argc)
-    {
-      for ( ; optind < argc; optind++)
-	{
-	  demangle_it (argv[optind]);
-	}
-    }
-  else
-    {
-      switch (current_demangling_style)
-	{
-	case gnu_demangling:
-	case lucid_demangling:
-	case arm_demangling:
-	case java_demangling:
-	case edg_demangling:
-	case gnat_demangling:
-	case auto_demangling:
-	  valid_symbols = standard_symbol_characters ();
-	  break;
-	case hp_demangling:
-	  valid_symbols = hp_symbol_characters ();
-	  break;
-	case gnu_v3_demangling:
-	  valid_symbols = gnu_v3_symbol_characters ();
-	  break;
-	default:
-	  /* Folks should explicitly indicate the appropriate alphabet for
-	     each demangling.  Providing a default would allow the
-	     question to go unconsidered.  */
-	  abort ();
-	}
-      /* Copy stdin to stdout, replacing every label that demangles.  */
-      for (;;)
-	{
-	  int i = 0;
-	  c = getchar ();
-	  /* Try to read a label.  strchr also matches a NUL; exclude it.  */
-	  while (c != EOF && c != 0 && (ISALNUM (c) || strchr (valid_symbols, c)))
-	    {
-	      if (i >= MBUF_SIZE-1)
-		break;
-	      mbuffer[i++] = c;
-	      c = getchar ();
-	    }
-	  if (i > 0)
-	    {
-	      int skip_first = 0;
-	      /* Step over one leading '.' or '$', then '_' when stripping.  */
-	      if (mbuffer[0] == '.' || mbuffer[0] == '$')
-		++skip_first;
-	      if (strip_underscore && mbuffer[skip_first] == '_')
-		++skip_first;
-	      /* Never start beyond the end of the label just read.  */
-	      if (skip_first > i)
-		skip_first = i;
-
-	      mbuffer[i] = 0;
-	      flags |= (int) style;
-	      result = cplus_demangle (mbuffer + skip_first, flags);
-	      if (result)
-		{
-		  if (mbuffer[0] == '.')
-		    putc ('.', stdout);
-		  fputs (result, stdout);
-		  free (result);
-		}
-	      else
-		fputs (mbuffer, stdout);	/* Not demangled: echo it as read.  */
-
-	      fflush (stdout);
-	    }
-	  if (c == EOF)
-	    break;
-	  putchar (c);
-	  fflush (stdout);
-	}
-    }
-
-  return (0);
-}
-
-static void
-fatal (str)
-     const char *str;
-{
-  fprintf (stderr, "%s: %s\n", program_name, str);  /* "<program>: <str>" */
-  exit (1);  /* Does not return; the exit status is always 1.  */
-}
-
-PTR
-xmalloc (size)
-  size_t size;
-{
-  PTR value = (PTR) malloc (size != 0 ? size : 1);  /* malloc (0) may be NULL */
-  if (value == 0)
-    fatal ("virtual memory exhausted");  /* Reports the message and exits.  */
-  return value;
-}
-
-PTR
-xrealloc (ptr, size)
-  PTR ptr;
-  size_t size;
-{
-  PTR value = (PTR) realloc (ptr, size != 0 ? size : 1);  /* 0 may give NULL */
-  if (value == 0)
-    fatal ("virtual memory exhausted");  /* Reports the message and exits.  */
-  return value;
-}
-#endif	/* main */
diff --git a/coregrind/demangle/demangle.h b/coregrind/demangle/demangle.h
deleted file mode 100644
index 238ae3398a..0000000000
--- a/coregrind/demangle/demangle.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Defs for interface to demanglers.
-   Copyright 1992, 1993, 1994, 1995, 1996, 1997, 1998, 2000, 2001
-   Free Software Foundation, Inc.
-   
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2, or (at your option)
-   any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA.  */
-
-
-#if !defined (DEMANGLE_H)
-#define DEMANGLE_H
-
-#include <ansidecl.h>
-
-#define current_demangling_style VG_(current_demangling_style)
-
-/* Options passed to cplus_demangle (in 2nd parameter). */
-
-#define DMGL_NO_OPTS	 0		/* For readability... */
-#define DMGL_PARAMS	 (1 << 0)	/* Include function args */
-#define DMGL_ANSI	 (1 << 1)	/* Include const, volatile, etc */
-#define DMGL_JAVA	 (1 << 2)	/* Demangle as Java rather than C++. */
-
-#define DMGL_AUTO	 (1 << 8)
-#define DMGL_GNU	 (1 << 9)
-#define DMGL_LUCID	 (1 << 10)
-#define DMGL_ARM	 (1 << 11)
-#define DMGL_HP 	 (1 << 12)       /* For the HP aCC compiler;
-                                            same as ARM except for
-                                            template arguments, etc. */
-#define DMGL_EDG	 (1 << 13)
-#define DMGL_GNU_V3	 (1 << 14)
-#define DMGL_GNAT	 (1 << 15)
-
-/* If none of these are set, use 'current_demangling_style' as the default. */
-#define DMGL_STYLE_MASK (DMGL_AUTO|DMGL_GNU|DMGL_LUCID|DMGL_ARM|DMGL_HP|DMGL_EDG|DMGL_GNU_V3|DMGL_JAVA|DMGL_GNAT)
-
-/* Enumeration of possible demangling styles.
-
-   Lucid and ARM styles are still kept logically distinct, even though
-   they now both behave identically.  The resulting style is actually the
-   union of both.  I.e. either style recognizes both "__pt__" and "__rf__"
-   for operator "->", even though the first is lucid style and the second
-   is ARM style. (FIXME?) */
-
-extern enum demangling_styles
-{
-  no_demangling = -1,
-  unknown_demangling = 0,
-  auto_demangling = DMGL_AUTO,
-  gnu_demangling = DMGL_GNU,
-  lucid_demangling = DMGL_LUCID,
-  arm_demangling = DMGL_ARM,
-  hp_demangling = DMGL_HP,
-  edg_demangling = DMGL_EDG,
-  gnu_v3_demangling = DMGL_GNU_V3,
-  java_demangling = DMGL_JAVA,
-  gnat_demangling = DMGL_GNAT
-} current_demangling_style;
-
-/* Define string names for the various demangling styles. */
-
-#define NO_DEMANGLING_STYLE_STRING            "none"
-#define AUTO_DEMANGLING_STYLE_STRING	      "auto"
-#define GNU_DEMANGLING_STYLE_STRING    	      "gnu"
-#define LUCID_DEMANGLING_STYLE_STRING	      "lucid"
-#define ARM_DEMANGLING_STYLE_STRING	      "arm"
-#define HP_DEMANGLING_STYLE_STRING	      "hp"
-#define EDG_DEMANGLING_STYLE_STRING	      "edg"
-#define GNU_V3_DEMANGLING_STYLE_STRING        "gnu-v3"
-#define JAVA_DEMANGLING_STYLE_STRING          "java"
-#define GNAT_DEMANGLING_STYLE_STRING          "gnat"
-
-/* Some macros to test what demangling style is active. */
-
-#define CURRENT_DEMANGLING_STYLE current_demangling_style
-#define AUTO_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_AUTO)
-#define GNU_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_GNU)
-#define LUCID_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_LUCID)
-#define ARM_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_ARM)
-#define HP_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_HP)
-#define EDG_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_EDG)
-#define GNU_V3_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_GNU_V3)
-#define JAVA_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_JAVA)
-#define GNAT_DEMANGLING (((int) CURRENT_DEMANGLING_STYLE) & DMGL_GNAT)
-
-/* Provide information about the available demangle styles. This code is
-   pulled from gdb into libiberty because it is useful to binutils also.  */
-
-extern const struct demangler_engine
-{
-  const char *const demangling_style_name;
-  const enum demangling_styles demangling_style;
-  const char *const demangling_style_doc;
-} libiberty_demanglers[];
-
-extern char *
-VG_(cplus_demangle) PARAMS ((const char *mangled, int options));
-
-/*
-extern int
-cplus_demangle_opname PARAMS ((const char *opname, char *result, int options));
-*/
-
-/*
-extern const char *
-cplus_mangle_opname PARAMS ((const char *opname, int options));
-*/
-
-/* Note: This sets global state.  FIXME if you care about multi-threading. */
-
-/*
-extern void
-set_cplus_marker_for_demangling PARAMS ((int ch));
-*/
-
-/*
-extern enum demangling_styles 
-cplus_demangle_set_style PARAMS ((enum demangling_styles style));
-*/
-
-/*
-extern enum demangling_styles 
-cplus_demangle_name_to_style PARAMS ((const char *name));
-*/
-
-/* V3 ABI demangling entry points, defined in cp-demangle.c.  */
-extern char*
-VG_(cplus_demangle_v3) PARAMS ((const char* mangled));
-
-extern char*
-VG_(java_demangle_v3) PARAMS ((const char* mangled));
-
-
-enum gnu_v3_ctor_kinds {
-  gnu_v3_complete_object_ctor = 1,
-  gnu_v3_base_object_ctor,
-  gnu_v3_complete_object_allocating_ctor
-};
-
-/* Return non-zero iff NAME is the mangled form of a constructor name
-   in the G++ V3 ABI demangling style.  Specifically, return an `enum
-   gnu_v3_ctor_kinds' value indicating what kind of constructor
-   it is.  */
-/*
-extern enum gnu_v3_ctor_kinds
-	is_gnu_v3_mangled_ctor PARAMS ((const char *name));
-*/
-
-
-enum gnu_v3_dtor_kinds {
-  gnu_v3_deleting_dtor = 1,
-  gnu_v3_complete_object_dtor,
-  gnu_v3_base_object_dtor
-};
-
-/* Return non-zero iff NAME is the mangled form of a destructor name
-   in the G++ V3 ABI demangling style.  Specifically, return an `enum
-   gnu_v3_dtor_kinds' value, indicating what kind of destructor
-   it is.  */
-/*
-extern enum gnu_v3_dtor_kinds
-	is_gnu_v3_mangled_dtor PARAMS ((const char *name));
-*/
-
-#endif	/* DEMANGLE_H */
diff --git a/coregrind/demangle/dyn-string.c b/coregrind/demangle/dyn-string.c
deleted file mode 100644
index aaa7e36319..0000000000
--- a/coregrind/demangle/dyn-string.c
+++ /dev/null
@@ -1,439 +0,0 @@
-/* An abstract string datatype.
-   Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
-   Contributed by Mark Mitchell (mark@markmitchell.com).
-
-This file is part of GNU CC.
-   
-GNU CC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GNU CC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GNU CC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#ifdef HAVE_STRING_H
-#include <string.h>
-#endif
-
-#ifdef HAVE_STDLIB_H
-#include <stdlib.h>
-#endif
-
-#include "vg_include.h"
-#include "ansidecl.h"
-#include "dyn-string.h"
-
-#ifndef STANDALONE
-#define malloc(s) VG_(malloc)(VG_AR_DEMANGLE, s)
-#define free(p) VG_(free)(VG_AR_DEMANGLE, p)
-#define realloc(p,s) VG_(realloc)(VG_AR_DEMANGLE, p, s)
-#endif
-
-/* If this file is being compiled for inclusion in the C++ runtime
-   library, as part of the demangler implementation, we don't want to
-   abort if an allocation fails.  Instead, percolate an error code up
-   through the call chain.  */
-
-#ifdef IN_LIBGCC2
-#define RETURN_ON_ALLOCATION_FAILURE
-#endif
-
-/* Initializes the dyn_string struct at DS_STRUCT_PTR in place.  The
-   struct itself may live on the stack or be embedded in another
-   object; only its character buffer is allocated here.  The buffer
-   can hold at least SPACE characters, including the terminating
-   NUL.  A SPACE of 0 is silently increased to 1.
-
-   Returns 1 on success.  If RETURN_ON_ALLOCATION_FAILURE is defined
-   and the buffer cannot be allocated, returns 0 instead; without
-   that macro the result of the allocation is not checked.  */
-
-int
-dyn_string_init (ds_struct_ptr, space)
-     struct dyn_string *ds_struct_ptr;
-     int space;
-{
-  /* We need at least one byte in which to store the terminating NUL.  */
-  if (!space)
-    space = 1;
-
-  /* The allocation is the same in both builds; only the check differs.  */
-  ds_struct_ptr->s = (char *) malloc (space);
-#ifdef RETURN_ON_ALLOCATION_FAILURE
-  if (!ds_struct_ptr->s)
-    return 0;
-#endif
-  /* Start out as the empty string.  */
-  ds_struct_ptr->allocated = space;
-  ds_struct_ptr->length = 0;
-  ds_struct_ptr->s[0] = '\0';
-
-  return 1;
-}
-
-/* Allocates and initializes a new dynamic string with room for at
-   least SPACE characters, including the terminating NUL.  A SPACE of
-   0 is silently increased to 1.  Returns the new string, or NULL if
-   RETURN_ON_ALLOCATION_FAILURE is defined and an allocation
-   fails.  */
-
-dyn_string_t 
-dyn_string_new (space)
-     int space;
-{
-  /* The header allocation is the same in both build flavours.  */
-  dyn_string_t ds = (dyn_string_t) malloc (sizeof (struct dyn_string));
-
-#ifdef RETURN_ON_ALLOCATION_FAILURE
-  if (ds == NULL)
-    return NULL;
-  if (dyn_string_init (ds, space) == 0)
-    {
-      free (ds);
-      return NULL;
-    }
-#else
-  dyn_string_init (ds, space);
-#endif
-  return ds;
-}
-
-/* Free the memory used by DS: its character buffer and DS itself.  */
-
-void 
-dyn_string_delete (ds)
-     dyn_string_t ds;
-{
-  free (ds->s);  /* DS is dereferenced here, so it must not be NULL.  */
-  free (ds);
-}
-
-/* Returns the contents of DS in a buffer allocated with malloc.  It
-   is the caller's responsibility to deallocate the buffer using free.
-   The buffer is detached from DS, and DS itself is then deleted.  */
-
-char*
-dyn_string_release (ds)
-     dyn_string_t ds;
-{
-  /* Store the old buffer.  */
-  char* result = ds->s;
-  /* The buffer is no longer owned by DS.  */
-  ds->s = NULL;
-  /* Delete DS.  */
-  free (ds);
-  /* Return the old buffer.  */
-  return result;
-}
-
-/* Increase the capacity of DS so it can hold at least SPACE
-   characters, plus the terminating NUL.  This function will not (at
-   present) reduce the capacity of DS.  Returns DS on success.
-
-   If RETURN_ON_ALLOCATION_FAILURE is defined and a memory allocation
-   operation fails, deletes DS (buffer included) and returns NULL.  */
-
-dyn_string_t 
-dyn_string_resize (ds, space)
-     dyn_string_t ds;
-     int space;
-{
-  int new_allocated = ds->allocated;
-
-  /* Increase SPACE to hold the NUL termination.  */
-  ++space;
-
-  /* Increase allocation by factors of two.  */
-  while (space > new_allocated)
-    new_allocated *= 2;
-
-  if (new_allocated != ds->allocated)
-    {
-      /* We actually need more space.  DS is only updated on success.  */
-      char *new_s = (char *) realloc (ds->s, new_allocated);
-#ifdef RETURN_ON_ALLOCATION_FAILURE
-      if (new_s == NULL)
-	{
-	  free (ds->s);	/* Standard realloc keeps the old block on failure.  */
-	  free (ds);
-	  return NULL;
-	}
-#endif
-      ds->s = new_s;
-      ds->allocated = new_allocated;
-    }
-
-  return ds;
-}
-
-/* Resets DS to the empty string; its allocation is left as it is.  */
-
-void
-dyn_string_clear (ds)
-     dyn_string_t ds;
-{
-  ds->length = 0;
-  /* A dyn_string always has room for at least the NUL terminator.  */
-  ds->s[0] = '\0';
-}
-
-/* Makes the contents of DEST the same as the contents of SRC.  DEST
-   and SRC must be distinct.  Returns 1 on success.  On failure, if
-   RETURN_ON_ALLOCATION_FAILURE, deletes DEST and returns 0.  */
-
-int
-dyn_string_copy (dest, src)
-     dyn_string_t dest;
-     dyn_string_t src;
-{
-  if (dest == src)
-      VG_(panic) ("dyn_string_copy: src==dest");
-
-  /* Make room in DEST.  */
-  if (dyn_string_resize (dest, src->length) == NULL)
-    return 0;
-  /* Copy SRC into DEST.  */
-  VG_(strcpy) (dest->s, src->s);
-  /* Update the size of DEST.  */
-  dest->length = src->length;
-  return 1;
-}
-
-/* Copies SRC, a NUL-terminated string, into DEST.  Returns 1 on
-   success.  On failure, if RETURN_ON_ALLOCATION_FAILURE, deletes DEST
-   and returns 0.  */
-
-int
-dyn_string_copy_cstr (dest, src)
-     dyn_string_t dest;
-     const char *src;
-{
-  int length = VG_(strlen) (src);
-  /* Make room in DEST.  */
-  if (dyn_string_resize (dest, length) == NULL)
-    return 0;
-  /* Copy SRC into DEST.  */
-  VG_(strcpy) (dest->s, src);
-  /* Update the size of DEST.  */
-  dest->length = length;
-  return 1;
-}
-
-/* Inserts SRC at the beginning of DEST.  DEST is expanded as
-   necessary.  SRC and DEST must be distinct.  Returns 1 on success.
-   On failure, if RETURN_ON_ALLOCATION_FAILURE, deletes DEST and
-   returns 0.  */
-
-int
-dyn_string_prepend (dest, src)
-     dyn_string_t dest;
-     dyn_string_t src;
-{
-  return dyn_string_insert (dest, 0, src);
-}
-
-/* Inserts SRC, a NUL-terminated string, at the beginning of DEST.
-   DEST is expanded as necessary.  Returns 1 on success.  On failure,
-   if RETURN_ON_ALLOCATION_FAILURE, deletes DEST and returns 0. */
-
-int
-dyn_string_prepend_cstr (dest, src)
-     dyn_string_t dest;
-     const char *src;
-{
-  return dyn_string_insert_cstr (dest, 0, src);
-}
-
-/* Inserts the contents of SRC into DEST at offset POS, growing DEST
-   as needed.  SRC and DEST must be distinct objects.  Returns 1 on
-   success.  On failure, if RETURN_ON_ALLOCATION_FAILURE, deletes DEST
-   and returns 0.  */
-
-int
-dyn_string_insert (dest, pos, src)
-     dyn_string_t dest;
-     int pos;
-     dyn_string_t src;
-{
-  int gap = src->length;
-  int idx;
-
-  if (src == dest)
-    VG_(panic)( "dyn_string_insert: src==dest" );
-
-  if (dyn_string_resize (dest, dest->length + gap) == NULL)
-    return 0;
-  /* Shift the tail of DEST, NUL included, GAP places to the right.  */
-  for (idx = dest->length; idx >= pos; --idx)
-    dest->s[idx + gap] = dest->s[idx];
-  /* Fill the gap with the GAP characters of SRC.  */
-  VG_(strncpy) (dest->s + pos, src->s, gap);
-  dest->length += gap;
-  return 1;
-}
-
-/* Inserts the NUL-terminated string SRC into DEST at offset POS,
-   growing DEST as needed.  Returns 1 on success.  On failure, if
-   RETURN_ON_ALLOCATION_FAILURE, deletes DEST and returns
-   0.  */
-
-int
-dyn_string_insert_cstr (dest, pos, src)
-     dyn_string_t dest;
-     int pos;
-     const char *src;
-{
-  int src_len = VG_(strlen) (src);
-  int idx;
-
-  if (dyn_string_resize (dest, dest->length + src_len) == NULL)
-    return 0;
-  /* Shift the tail of DEST, NUL included, SRC_LEN places to the
-     right, working backwards from the end.  */
-  for (idx = dest->length; idx >= pos; --idx)
-    dest->s[idx + src_len] = dest->s[idx];
-  /* Fill the gap with the SRC_LEN characters of SRC.  */
-  VG_(strncpy) (dest->s + pos, src, src_len);
-  dest->length += src_len;
-  return 1;
-}
-
-/* Inserts the character C into DEST at offset POS, growing DEST as
-   needed.  Returns 1 on success.  On failure, if
-   RETURN_ON_ALLOCATION_FAILURE, deletes DEST and returns 0.  */
-
-int
-dyn_string_insert_char (dest, pos, c)
-     dyn_string_t dest;
-     int pos;
-     int c;
-{
-  int idx;
-
-  if (dyn_string_resize (dest, dest->length + 1) == NULL)
-    return 0;
-
-  /* Open a one-character gap at POS by moving the tail, NUL
-     included, one place to the right, last character first.  */
-  for (idx = dest->length + 1; idx > pos; --idx)
-    dest->s[idx] = dest->s[idx - 1];
-  dest->s[pos] = c;
-  dest->length = dest->length + 1;
-  return 1;
-}
-     
-/* Append S to DEST, resizing DEST if necessary.  Returns 1 on success.
-   On failure, if RETURN_ON_ALLOCATION_FAILURE, deletes DEST and returns 0.
-   NOTE(review): with S == DEST the strcpy ranges overlap; confirm unused.  */
-
-int
-dyn_string_append (dest, s)
-     dyn_string_t dest;
-     dyn_string_t s;
-{
-  if (dyn_string_resize (dest, dest->length + s->length) == 0)
-    return 0;
-  VG_(strcpy) (dest->s + dest->length, s->s);
-  dest->length += s->length;
-  return 1;
-}
-
-/* Append the NUL-terminated string S to DEST, resizing DEST if necessary.
-   Returns 1 on success.  On failure, if RETURN_ON_ALLOCATION_FAILURE,
-   deletes DEST and returns 0.  */
-
-int
-dyn_string_append_cstr (dest, s)
-     dyn_string_t dest;
-     const char *s;
-{
-  int len = VG_(strlen) (s);
-
-  /* DEST must hold its old length plus LEN characters; the byte for
-     the terminating NUL is added by dyn_string_resize itself.  */
-  if (dyn_string_resize (dest, dest->length + len) == NULL)
-    return 0;
-  VG_(strcpy) (dest->s + dest->length, s);
-  dest->length += len;
-  return 1;
-}
-
-/* Appends C to the end of DEST.  Returns 1 on success.  On failure,
-   if RETURN_ON_ALLOCATION_FAILURE, deletes DEST and returns 0.  */
-
-int
-dyn_string_append_char (dest, c)
-     dyn_string_t dest;
-     int c;
-{
-  /* Make room for the extra character.  */
-  if (dyn_string_resize (dest, dest->length + 1) == NULL)
-    return 0;
-  /* Append the character; it will overwrite the old NUL.  */
-  dest->s[dest->length] = c;
-  /* Add a new NUL at the end.  */
-  dest->s[dest->length + 1] = '\0';
-  /* Update the length.  */
-  ++(dest->length);
-  return 1;
-}
-
-/* Sets the contents of DEST to the substring of SRC starting at START
-   and ending before END.  START must be less than or equal to END,
-   and both must be between zero and the length of SRC, inclusive;
-   any other range panics.  Returns 1 on success.  On failure, if
-   RETURN_ON_ALLOCATION_FAILURE, deletes DEST and returns 0.  */
-
-int
-dyn_string_substring (dest, src, start, end)
-     dyn_string_t dest;
-     dyn_string_t src;
-     int start;
-     int end;
-{
-  int i;
-  int length = end - start;
-
-  /* Enforce the documented range before anything is read from SRC.  */
-  if (start < 0 || start > end || end > src->length)
-    VG_(panic) ("dyn_string_substring: bad range");
-
-  /* Make room for the substring.  */
-  if (dyn_string_resize (dest, length) == NULL)
-    return 0;
-  /* Copy the characters in the substring, last one first.  */
-  for (i = length; --i >= 0; )
-    dest->s[i] = src->s[start + i];
-  /* NUL-terminate the result.  */
-  dest->s[length] = '\0';
-  /* Record the length of the substring.  */
-  dest->length = length;
-
-  return 1;
-}
-
-/* Returns non-zero if DS1 and DS2 hold the same characters.  */
-
-int
-dyn_string_eq (ds1, ds2)
-     dyn_string_t ds1;
-     dyn_string_t ds2;
-{
-  /* Strings of different lengths cannot be equal, so the string
-     comparison is only attempted when the lengths agree.  */
-  int same_length = (ds1->length == ds2->length);
-
-  return same_length && VG_(strcmp) (ds1->s, ds2->s) == 0;
-}
diff --git a/coregrind/demangle/dyn-string.h b/coregrind/demangle/dyn-string.h
deleted file mode 100644
index 9615cd64ee..0000000000
--- a/coregrind/demangle/dyn-string.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* An abstract string datatype.
-   Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
-   Contributed by Mark Mitchell (mark@markmitchell.com).
-
-This file is part of GCC.
-   
-GCC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GCC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GCC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-#ifndef __DYN_STRING_H
-#define __DYN_STRING_H
-
-
-typedef struct dyn_string
-{
-  int allocated;	/* The amount of space allocated for the string.  */
-  int length;		/* The actual length of the string.  */
-  char *s;		/* The string itself, NUL-terminated.  */
-}* dyn_string_t;
-
-/* The length of STR, in bytes, not including the terminating NUL.  */
-#define dyn_string_length(STR)                                          \
-  ((STR)->length)
-
-/* The NTBS in which the contents of STR are stored.  */
-#define dyn_string_buf(STR)                                             \
-  ((STR)->s)
-
-/* Compare DS1 to DS2 with strcmp.  */
-#define dyn_string_compare(DS1, DS2)                                    \
-  (VG_(strcmp) ((DS1)->s, (DS2)->s))
-
-
-/* dyn_string functions are used in the demangling implementation
-   included in the G++ runtime library.  To prevent collisions with
-   names in user programs, the functions that are used in the
-   demangler are given implementation-reserved names.  */
-
-#if 1 /* def IN_LIBGCC2 */
-
-#define dyn_string_init                 VG_(__cxa_dyn_string_init)
-#define dyn_string_new                  VG_(__cxa_dyn_string_new)
-#define dyn_string_delete               VG_(__cxa_dyn_string_delete)
-#define dyn_string_release              VG_(__cxa_dyn_string_release)
-#define dyn_string_resize               VG_(__cxa_dyn_string_resize)
-#define dyn_string_clear                VG_(__cxa_dyn_string_clear)
-#define dyn_string_copy                 VG_(__cxa_dyn_string_copy)
-#define dyn_string_copy_cstr            VG_(__cxa_dyn_string_copy_cstr)
-#define dyn_string_prepend              VG_(__cxa_dyn_string_prepend)
-#define dyn_string_prepend_cstr         VG_(__cxa_dyn_string_prepend_cstr)
-#define dyn_string_insert               VG_(__cxa_dyn_string_insert)
-#define dyn_string_insert_cstr          VG_(__cxa_dyn_string_insert_cstr)
-#define dyn_string_insert_char          VG_(__cxa_dyn_string_insert_char)
-#define dyn_string_append               VG_(__cxa_dyn_string_append)
-#define dyn_string_append_cstr          VG_(__cxa_dyn_string_append_cstr)
-#define dyn_string_append_char          VG_(__cxa_dyn_string_append_char)
-#define dyn_string_substring            VG_(__cxa_dyn_string_substring)
-#define dyn_string_eq                   VG_(__cxa_dyn_string_eq)
-
-#endif /* IN_LIBGCC2 */
-
-
-extern int dyn_string_init              PARAMS ((struct dyn_string *, int));
-extern dyn_string_t dyn_string_new      PARAMS ((int));
-extern void dyn_string_delete           PARAMS ((dyn_string_t));
-extern char *dyn_string_release         PARAMS ((dyn_string_t));
-extern dyn_string_t dyn_string_resize   PARAMS ((dyn_string_t, int));
-extern void dyn_string_clear            PARAMS ((dyn_string_t));
-extern int dyn_string_copy              PARAMS ((dyn_string_t, dyn_string_t));
-extern int dyn_string_copy_cstr         PARAMS ((dyn_string_t, const char *));
-extern int dyn_string_prepend           PARAMS ((dyn_string_t, dyn_string_t));
-extern int dyn_string_prepend_cstr      PARAMS ((dyn_string_t, const char *));
-extern int dyn_string_insert            PARAMS ((dyn_string_t, int,
-						 dyn_string_t));
-extern int dyn_string_insert_cstr       PARAMS ((dyn_string_t, int,
-						 const char *));
-extern int dyn_string_insert_char       PARAMS ((dyn_string_t, int, int));
-extern int dyn_string_append            PARAMS ((dyn_string_t, dyn_string_t));
-extern int dyn_string_append_cstr       PARAMS ((dyn_string_t, const char *));
-extern int dyn_string_append_char       PARAMS ((dyn_string_t, int));
-extern int dyn_string_substring         PARAMS ((dyn_string_t, 
-						 dyn_string_t, int, int));
-extern int dyn_string_eq                PARAMS ((dyn_string_t, dyn_string_t));
-
-#endif
diff --git a/coregrind/demangle/safe-ctype.c b/coregrind/demangle/safe-ctype.c
deleted file mode 100644
index 0c2be3ed79..0000000000
--- a/coregrind/demangle/safe-ctype.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/* <ctype.h> replacement macros.
-
-   Copyright (C) 2000 Free Software Foundation, Inc.
-   Contributed by Zack Weinberg <zackw@stanford.edu>.
-
-This file is part of the libiberty library.
-Libiberty is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public
-License as published by the Free Software Foundation; either
-version 2 of the License, or (at your option) any later version.
-
-Libiberty is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
-
-You should have received a copy of the GNU Library General Public
-License along with libiberty; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* This is a compatible replacement of the standard C library's <ctype.h>
-   with the following properties:
-
-   - Implements all isxxx() macros required by C99.
-   - Also implements some character classes useful when
-     parsing C-like languages.
-   - Does not change behavior depending on the current locale.
-   - Behaves properly for all values in the range of a signed or
-     unsigned char.  */
-
-#include "ansidecl.h"
-#include <safe-ctype.h>
-#include <stdio.h>  /* for EOF */
-
-/* Shorthand: two-letter aliases for the _sch_is* category bits, used to build the masks below.  */
-#define bl _sch_isblank
-#define cn _sch_iscntrl
-#define di _sch_isdigit
-#define is _sch_isidst
-#define lo _sch_islower
-#define nv _sch_isnvsp
-#define pn _sch_ispunct
-#define pr _sch_isprint
-#define sp _sch_isspace
-#define up _sch_isupper
-#define vs _sch_isvsp
-#define xd _sch_isxdigit
-
-/* Masks: the category bits assigned to each kind of character.  They are unparenthesized, so use each only as a whole table initializer.  */
-#define L  lo|is   |pr	/* lower case letter */
-#define XL lo|is|xd|pr	/* lowercase hex digit */
-#define U  up|is   |pr	/* upper case letter */
-#define XU up|is|xd|pr	/* uppercase hex digit */
-#define D  di   |xd|pr	/* decimal digit */
-#define P  pn      |pr	/* punctuation */
-#define _  pn|is   |pr	/* underscore */
-
-#define C           cn	/* control character */
-#define Z  nv      |cn	/* NUL */
-#define M  nv|sp   |cn	/* cursor movement: \f \v */
-#define V  vs|sp   |cn	/* vertical space: \r \n */
-#define T  nv|sp|bl|cn	/* tab */
-#define S  nv|sp|bl|pr	/* space */
-
-/* Are we ASCII? */
-#if '\n' == 0x0A && ' ' == 0x20 && '0' == 0x30 \
-  && 'A' == 0x41 && 'a' == 0x61 && '!' == 0x21 \
-  && EOF == -1
-
-const unsigned short _sch_istable[256] =  /* category bitmask for each character value 0-255 */
-{
-  Z,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
-  C,  T,  V,  M,   M,  V,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
-  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
-  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
-  S,  P,  P,  P,   P,  P,  P,  P,   /* SP  !   "   #    $   %   &   '   */
-  P,  P,  P,  P,   P,  P,  P,  P,   /* (   )   *   +    ,   -   .   /   */
-  D,  D,  D,  D,   D,  D,  D,  D,   /* 0   1   2   3    4   5   6   7   */
-  D,  D,  P,  P,   P,  P,  P,  P,   /* 8   9   :   ;    <   =   >   ?   */
-  P, XU, XU, XU,  XU, XU, XU,  U,   /* @   A   B   C    D   E   F   G   */
-  U,  U,  U,  U,   U,  U,  U,  U,   /* H   I   J   K    L   M   N   O   */
-  U,  U,  U,  U,   U,  U,  U,  U,   /* P   Q   R   S    T   U   V   W   */
-  U,  U,  U,  P,   P,  P,  P,  _,   /* X   Y   Z   [    \   ]   ^   _   */
-  P, XL, XL, XL,  XL, XL, XL,  L,   /* `   a   b   c    d   e   f   g   */
-  L,  L,  L,  L,   L,  L,  L,  L,   /* h   i   j   k    l   m   n   o   */
-  L,  L,  L,  L,   L,  L,  L,  L,   /* p   q   r   s    t   u   v   w   */
-  L,  L,  L,  P,   P,  P,  P,  C,   /* x   y   z   {    |   }   ~   DEL */
-
-  /* high half of unsigned char is locale-specific, so all tests are
-     false in "C" locale */
-  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
-  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
-  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
-  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
-
-  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
-  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
-  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
-  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
-};
-
-const unsigned char _sch_tolower[256] =  /* identity map, except 'A'-'Z' (65-90) yield 'a'-'z' */
-{
-   0,  1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15,
-  16, 17, 18, 19,  20, 21, 22, 23,  24, 25, 26, 27,  28, 29, 30, 31,
-  32, 33, 34, 35,  36, 37, 38, 39,  40, 41, 42, 43,  44, 45, 46, 47,
-  48, 49, 50, 51,  52, 53, 54, 55,  56, 57, 58, 59,  60, 61, 62, 63,
-  64,
-
-  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
-  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
-
-  91, 92, 93, 94, 95, 96,
-
-  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
-  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
-
- 123,124,125,126,127,
-
- 128,129,130,131, 132,133,134,135, 136,137,138,139, 140,141,142,143,
- 144,145,146,147, 148,149,150,151, 152,153,154,155, 156,157,158,159,
- 160,161,162,163, 164,165,166,167, 168,169,170,171, 172,173,174,175,
- 176,177,178,179, 180,181,182,183, 184,185,186,187, 188,189,190,191,
-
- 192,193,194,195, 196,197,198,199, 200,201,202,203, 204,205,206,207,
- 208,209,210,211, 212,213,214,215, 216,217,218,219, 220,221,222,223,
- 224,225,226,227, 228,229,230,231, 232,233,234,235, 236,237,238,239,
- 240,241,242,243, 244,245,246,247, 248,249,250,251, 252,253,254,255,
-};
-
-const unsigned char _sch_toupper[256] =  /* identity map, except 'a'-'z' (97-122) yield 'A'-'Z' */
-{
-   0,  1,  2,  3,   4,  5,  6,  7,   8,  9, 10, 11,  12, 13, 14, 15,
-  16, 17, 18, 19,  20, 21, 22, 23,  24, 25, 26, 27,  28, 29, 30, 31,
-  32, 33, 34, 35,  36, 37, 38, 39,  40, 41, 42, 43,  44, 45, 46, 47,
-  48, 49, 50, 51,  52, 53, 54, 55,  56, 57, 58, 59,  60, 61, 62, 63,
-  64,
-
-  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
-  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
-
-  91, 92, 93, 94, 95, 96,
-
-  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
-  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
-
- 123,124,125,126,127,
-
- 128,129,130,131, 132,133,134,135, 136,137,138,139, 140,141,142,143,
- 144,145,146,147, 148,149,150,151, 152,153,154,155, 156,157,158,159,
- 160,161,162,163, 164,165,166,167, 168,169,170,171, 172,173,174,175,
- 176,177,178,179, 180,181,182,183, 184,185,186,187, 188,189,190,191,
-
- 192,193,194,195, 196,197,198,199, 200,201,202,203, 204,205,206,207,
- 208,209,210,211, 212,213,214,215, 216,217,218,219, 220,221,222,223,
- 224,225,226,227, 228,229,230,231, 232,233,234,235, 236,237,238,239,
- 240,241,242,243, 244,245,246,247, 248,249,250,251, 252,253,254,255,
-};
-
-#else
- #error "Unsupported host character set"
-#endif /* not ASCII */
diff --git a/coregrind/demangle/safe-ctype.h b/coregrind/demangle/safe-ctype.h
deleted file mode 100644
index b2ad8490bd..0000000000
--- a/coregrind/demangle/safe-ctype.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* <ctype.h> replacement macros.
-
-   Copyright (C) 2000, 2001 Free Software Foundation, Inc.
-   Contributed by Zack Weinberg <zackw@stanford.edu>.
-
-This file is part of the libiberty library.
-Libiberty is free software; you can redistribute it and/or
-modify it under the terms of the GNU Library General Public
-License as published by the Free Software Foundation; either
-version 2 of the License, or (at your option) any later version.
-
-Libiberty is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Library General Public License for more details.
-
-You should have received a copy of the GNU Library General Public
-License along with libiberty; see the file COPYING.LIB.  If
-not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* This is a compatible replacement of the standard C library's <ctype.h>
-   with the following properties:
-
-   - Implements all isxxx() macros required by C99.
-   - Also implements some character classes useful when
-     parsing C-like languages.
-   - Does not change behavior depending on the current locale.
-   - Behaves properly for all values in the range of a signed or
-     unsigned char.
-
-   To avoid conflicts, this header defines the isxxx functions in upper
-   case, e.g. ISALPHA not isalpha.  */
-
-#ifndef SAFE_CTYPE_H
-#define SAFE_CTYPE_H
-
-#ifdef isalpha
- #error "safe-ctype.h and ctype.h may not be used simultaneously"
-#else
-
-/* Categories.  Each basic category is a single distinct bit, so classes can be OR-ed together and tested with one mask.  */
-
-enum {
-  /* In C99 */
-  _sch_isblank  = 0x0001,	/* space \t */
-  _sch_iscntrl  = 0x0002,	/* nonprinting characters */
-  _sch_isdigit  = 0x0004,	/* 0-9 */
-  _sch_islower  = 0x0008,	/* a-z */
-  _sch_isprint  = 0x0010,	/* any printing character including ' ' */
-  _sch_ispunct  = 0x0020,	/* all punctuation */
-  _sch_isspace  = 0x0040,	/* space \t \n \r \f \v */
-  _sch_isupper  = 0x0080,	/* A-Z */
-  _sch_isxdigit = 0x0100,	/* 0-9A-Fa-f */
-
-  /* Extra categories useful to cpplib.  */
-  _sch_isidst	= 0x0200,	/* A-Za-z_ */
-  _sch_isvsp    = 0x0400,	/* \n \r */
-  _sch_isnvsp   = 0x0800,	/* space \t \f \v \0 */
-
-  /* Combinations of the above.  */
-  _sch_isalpha  = _sch_isupper|_sch_islower,	/* A-Za-z */
-  _sch_isalnum  = _sch_isalpha|_sch_isdigit,	/* A-Za-z0-9 */
-  _sch_isidnum  = _sch_isidst|_sch_isdigit,	/* A-Za-z0-9_ */
-  _sch_isgraph  = _sch_isalnum|_sch_ispunct,	/* isprint and not space */
-  _sch_iscppsp  = _sch_isvsp|_sch_isnvsp,	/* isspace + \0 */
-  _sch_isbasic  = _sch_isprint|_sch_iscppsp     /* basic charset of ISO C
-						   (plus ` and @)  */
-};
-
-/* Character classification.  Each macro yields a nonzero mask (not necessarily 1) if c has any bit of the class, else 0; c is reduced to its low 8 bits.  */
-extern const unsigned short _sch_istable[256];
-
-#define _sch_test(c, bit) (_sch_istable[(c) & 0xff] & (unsigned short)(bit))
-
-#define ISALPHA(c)  _sch_test(c, _sch_isalpha)
-#define ISALNUM(c)  _sch_test(c, _sch_isalnum)
-#define ISBLANK(c)  _sch_test(c, _sch_isblank)
-#define ISCNTRL(c)  _sch_test(c, _sch_iscntrl)
-#define ISDIGIT(c)  _sch_test(c, _sch_isdigit)
-#define ISGRAPH(c)  _sch_test(c, _sch_isgraph)
-#define ISLOWER(c)  _sch_test(c, _sch_islower)
-#define ISPRINT(c)  _sch_test(c, _sch_isprint)
-#define ISPUNCT(c)  _sch_test(c, _sch_ispunct)
-#define ISSPACE(c)  _sch_test(c, _sch_isspace)
-#define ISUPPER(c)  _sch_test(c, _sch_isupper)
-#define ISXDIGIT(c) _sch_test(c, _sch_isxdigit)
-
-#define ISIDNUM(c)	_sch_test(c, _sch_isidnum)
-#define ISIDST(c)	_sch_test(c, _sch_isidst)
-#define IS_ISOBASIC(c)	_sch_test(c, _sch_isbasic)
-#define IS_VSPACE(c)	_sch_test(c, _sch_isvsp)
-#define IS_NVSPACE(c)	_sch_test(c, _sch_isnvsp)
-#define IS_SPACE_OR_NUL(c)	_sch_test(c, _sch_iscppsp)
-
-/* Character transformation: table lookups indexed by the low 8 bits of c.  */
-extern const unsigned char  _sch_toupper[256];
-extern const unsigned char  _sch_tolower[256];
-#define TOUPPER(c) _sch_toupper[(c) & 0xff]
-#define TOLOWER(c) _sch_tolower[(c) & 0xff]
-
-#endif /* no ctype.h */
-#endif /* SAFE_CTYPE_H */
diff --git a/coregrind/docs/Makefile.am b/coregrind/docs/Makefile.am
deleted file mode 100644
index e8a58fa18e..0000000000
--- a/coregrind/docs/Makefile.am
+++ /dev/null
@@ -1,5 +0,0 @@
-docdir = $(datadir)/doc/valgrind
-
-doc_DATA = index.html manual.html nav.html techdocs.html
-
-EXTRA_DIST = $(doc_DATA)
diff --git a/coregrind/docs/index.html b/coregrind/docs/index.html
deleted file mode 100644
index 1111702565..0000000000
--- a/coregrind/docs/index.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<html>
-
-<head>
-  <meta http-equiv="Content-Type"     
-        content="text/html; charset=iso-8859-1">
-  <meta http-equiv="Content-Language" content="en-gb">
-  <meta name="generator" 
-        content="Mozilla/4.76 (X11; U; Linux 2.4.1-0.1.9 i586) [Netscape]">
-  <meta name="author" content="Julian Seward <jseward@acm.org>">
-  <meta name="description" content="say what this prog does">
-  <meta name="keywords" content="Valgrind, memory checker, x86, GPL">
-  <title>Valgrind's user manual</title>
-</head>
-
-<frameset cols="150,*">
-  <frame name="nav" target="main" src="nav.html">
-  <frame name="main" src="manual.html" scrolling="auto">
-  <noframes>
-    <body>
-     <p>This page uses frames, but your browser doesn't support them.</p>
-    </body>
-  </noframes>
-</frameset>
-
-</html>
diff --git a/coregrind/docs/manual.html b/coregrind/docs/manual.html
deleted file mode 100644
index b715ee3dfe..0000000000
--- a/coregrind/docs/manual.html
+++ /dev/null
@@ -1,2702 +0,0 @@
-<html>
-  <head>
-    <style type="text/css">
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-    <title>Valgrind</title>
-  </head>
-
-<body bgcolor="#ffffff">
-
-<a name="title">&nbsp;</a>
-<h1 align=center>Valgrind, version 1.0.0</h1>
-<center>This manual was last updated on 20020726</center>
-<p>
-
-<center>
-<a href="mailto:jseward@acm.org">jseward@acm.org</a><br>
-Copyright &copy; 2000-2002 Julian Seward
-<p>
-Valgrind is licensed under the GNU General Public License, 
-version 2<br>
-An open-source tool for finding memory-management problems in
-Linux-x86 executables.
-</center>
-
-<p>
-
-<hr width="100%">
-<a name="contents"></a>
-<h2>Contents of this manual</h2>
-
-<h4>1&nbsp; <a href="#intro">Introduction</a></h4>
-    1.1&nbsp; <a href="#whatfor">What Valgrind is for</a><br>
-    1.2&nbsp; <a href="#whatdoes">What it does with your program</a>
-
-<h4>2&nbsp; <a href="#howtouse">How to use it, and how to make sense 
-    of the results</a></h4>
-    2.1&nbsp; <a href="#starta">Getting started</a><br>
-    2.2&nbsp; <a href="#comment">The commentary</a><br>
-    2.3&nbsp; <a href="#report">Reporting of errors</a><br>
-    2.4&nbsp; <a href="#suppress">Suppressing errors</a><br>
-    2.5&nbsp; <a href="#flags">Command-line flags</a><br>
-    2.6&nbsp; <a href="#errormsgs">Explanation of error messages</a><br>
-    2.7&nbsp; <a href="#suppfiles">Writing suppressions files</a><br>
-    2.8&nbsp; <a href="#clientreq">The Client Request mechanism</a><br>
-    2.9&nbsp; <a href="#pthreads">Support for POSIX pthreads</a><br>
-    2.10&nbsp; <a href="#install">Building and installing</a><br>
-    2.11&nbsp; <a href="#problems">If you have problems</a><br>
-
-<h4>3&nbsp; <a href="#machine">Details of the checking machinery</a></h4>
-    3.1&nbsp; <a href="#vvalue">Valid-value (V) bits</a><br>
-    3.2&nbsp; <a href="#vaddress">Valid-address (A)&nbsp;bits</a><br>
-    3.3&nbsp; <a href="#together">Putting it all together</a><br>
-    3.4&nbsp; <a href="#signals">Signals</a><br>
-    3.5&nbsp; <a href="#leaks">Memory leak detection</a><br>
-
-<h4>4&nbsp; <a href="#limits">Limitations</a></h4>
-
-<h4>5&nbsp; <a href="#howitworks">How it works -- a rough overview</a></h4>
-    5.1&nbsp; <a href="#startb">Getting started</a><br>
-    5.2&nbsp; <a href="#engine">The translation/instrumentation engine</a><br>
-    5.3&nbsp; <a href="#track">Tracking the status of memory</a><br>
-    5.4&nbsp; <a href="#sys_calls">System calls</a><br>
-    5.5&nbsp; <a href="#sys_signals">Signals</a><br>
-
-<h4>6&nbsp; <a href="#example">An example</a></h4>
-
-<h4>7&nbsp; <a href="#cache">Cache profiling</a></h4>
-
-<h4>8&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
-
-<hr width="100%">
-
-<a name="intro"></a>
-<h2>1&nbsp; Introduction</h2>
-
-<a name="whatfor"></a>
-<h3>1.1&nbsp; What Valgrind is for</h3>
-
-Valgrind is a tool to help you find memory-management problems in your
-programs. When a program is run under Valgrind's supervision, all
-reads and writes of memory are checked, and calls to
-malloc/new/free/delete are intercepted. As a result, Valgrind can
-detect problems such as:
-<ul>
-  <li>Use of uninitialised memory</li>
-  <li>Reading/writing memory after it has been free'd</li>
-  <li>Reading/writing off the end of malloc'd blocks</li>
-  <li>Reading/writing inappropriate areas on the stack</li>
-  <li>Memory leaks -- where pointers to malloc'd blocks are lost
-  forever</li>
-  <li>Mismatched use of malloc/new/new [] vs free/delete/delete
-  []</li>
-  <li>Some misuses of the POSIX pthreads API</li>
-</ul>
-
-Problems like these can be difficult to find by other means, often
-lying undetected for long periods, then causing occasional,
-difficult-to-diagnose crashes.
-
-<p>
-Valgrind is closely tied to details of the CPU, operating system and
-to a lesser extent, compiler and basic C libraries. This makes it
-difficult to make it portable, so I have chosen at the outset to
-concentrate on what I believe to be a widely used platform: Linux on
-x86s.  Valgrind uses the standard Unix <code>./configure</code>,
-<code>make</code>, <code>make install</code> mechanism, and I have
-attempted to ensure that it works on machines with kernel 2.2 or 2.4
-and glibc 2.1.X or 2.2.X.  This should cover the vast majority of
-modern Linux installations.
-
-
-<p>
-Valgrind is licensed under the GNU General Public License, version
-2. Read the file LICENSE in the source distribution for details.  Some
-of the PThreads test cases, <code>test/pth_*.c</code>, are taken from
-"Pthreads Programming" by Bradford Nichols, Dick Buttlar &amp; Jacqueline
-Proulx Farrell, ISBN 1-56592-115-1, published by O'Reilly &amp;
-Associates, Inc.
-
-
-<a name="whatdoes"></a>
-<h3>1.2&nbsp; What it does with your program</h3>
-
-Valgrind is designed to be as non-intrusive as possible. It works
-directly with existing executables. You don't need to recompile,
-relink, or otherwise modify, the program to be checked. Simply place
-the word <code>valgrind</code> at the start of the command line
-normally used to run the program. So, for example, if you want to run
-the command <code>ls -l</code> on Valgrind, simply issue the
-command: <code>valgrind ls -l</code>.
-
-<p>Valgrind takes control of your program before it starts. Debugging
-information is read from the executable and associated libraries, so
-that error messages can be phrased in terms of source code
-locations. Your program is then run on a synthetic x86 CPU which
-checks every memory access. All detected errors are written to a
-log. When the program finishes, Valgrind searches for and reports on
-leaked memory.
-
-<p>You can run pretty much any dynamically linked ELF x86 executable
-using Valgrind. Programs run 25 to 50 times slower, and take a lot
-more memory, than they usually would. It works well enough to run
-large programs. For example, the Konqueror web browser from the KDE
-Desktop Environment, version 3.0, runs slowly but usably on Valgrind.
-
-<p>Valgrind simulates every single instruction your program executes.
-Because of this, it finds errors not only in your application but also
-in all supporting dynamically-linked (<code>.so</code>-format)
-libraries, including the GNU C library, the X client libraries, Qt, if
-you work with KDE, and so on. That often includes libraries, for
-example the GNU C library, which contain memory access violations, but
-which you cannot or do not want to fix.
-
-<p>Rather than swamping you with errors in which you are not
-interested, Valgrind allows you to selectively suppress errors, by
-recording them in a suppressions file which is read when Valgrind
-starts up.  The build mechanism attempts to select suppressions which
-give reasonable behaviour for the libc and XFree86 versions detected
-on your machine.
-
-
-<p><a href="#example">Section 6</a> shows an example of use.
-<p>
-<hr width="100%">
-
-<a name="howtouse"></a>
-<h2>2&nbsp; How to use it, and how to make sense of the results</h2>
-
-<a name="starta"></a>
-<h3>2.1&nbsp; Getting started</h3>
-
-First off, consider whether it might be beneficial to recompile your
-application and supporting libraries with optimisation disabled and
-debugging info enabled (the <code>-g</code> flag).  You don't have to
-do this, but doing so helps Valgrind produce more accurate and less
-confusing error reports.  Chances are you're set up like this already,
-if you intended to debug your program with GNU gdb, or some other
-debugger.  
-
-<p>
-A plausible compromise is to use <code>-g -O</code>.
-Optimisation levels above <code>-O</code> have been observed, on very
-rare occasions, to cause gcc to generate code which fools Valgrind's
-error tracking machinery into wrongly reporting uninitialised value
-errors.  <code>-O</code> gets you the vast majority of the benefits of
-higher optimisation levels anyway, so you don't lose much there.
-
-<p>
-Valgrind understands both the older "stabs" debugging format, used by
-gcc versions prior to 3.1, and the newer DWARF2 format used by gcc 3.1
-and later.
-
-<p>
-Then just run your application, but place the word
-<code>valgrind</code> in front of your usual command-line invocation.
-Note that you should run the real (machine-code) executable here.  If
-your application is started by, for example, a shell or perl script,
-you'll need to modify it to invoke Valgrind on the real executables.
-Running such scripts directly under Valgrind will result in you
-getting error reports pertaining to <code>/bin/sh</code>,
-<code>/usr/bin/perl</code>, or whatever interpreter you're using.
-This almost certainly isn't what you want and can be confusing.
-
-<a name="comment"></a>
-<h3>2.2&nbsp; The commentary</h3>
-
-Valgrind writes a commentary, detailing error reports and other
-significant events.  The commentary goes to standard output by
-default.  This may interfere with your program, so you can ask for it
-to be directed elsewhere.
-
-<p>All lines in the commentary are of the following form:<br>
-<pre>
-  ==12345== some-message-from-Valgrind
-</pre>
-<p>The <code>12345</code>  is the process ID.  This scheme makes it easy
-to distinguish program output from Valgrind commentary, and also easy
-to differentiate commentaries from different processes which have
-become merged together, for whatever reason.
-
-<p>By default, Valgrind writes only essential messages to the commentary,
-so as to avoid flooding you with information of secondary importance.
-If you want more information about what is happening, re-run, passing
-the <code>-v</code> flag to Valgrind.
-
-
-<a name="report"></a>
-<h3>2.3&nbsp; Reporting of errors</h3>
-
-When Valgrind detects something bad happening in the program, an error
-message is written to the commentary.  For example:<br>
-<pre>
-  ==25832== Invalid read of size 4
-  ==25832==    at 0x8048724: BandMatrix::ReSize(int, int, int) (bogon.cpp:45)
-  ==25832==    by 0x80487AF: main (bogon.cpp:66)
-  ==25832==    by 0x40371E5E: __libc_start_main (libc-start.c:129)
-  ==25832==    by 0x80485D1: (within /home/sewardj/newmat10/bogon)
-  ==25832==    Address 0xBFFFF74C is not stack'd, malloc'd or free'd
-</pre>
-
-<p>This message says that the program did an illegal 4-byte read of
-address 0xBFFFF74C, which, as far as it can tell, is not a valid stack
-address, nor corresponds to any currently malloc'd or free'd blocks.
-The read is happening at line 45 of <code>bogon.cpp</code>, called
-from line 66 of the same file, etc.  For errors associated with an
-identified malloc'd/free'd block, for example reading free'd memory,
-Valgrind reports not only the location where the error happened, but
-also where the associated block was malloc'd/free'd.
-
-<p>Valgrind remembers all error reports.  When an error is detected,
-it is compared against old reports, to see if it is a duplicate.  If
-so, the error is noted, but no further commentary is emitted.  This
-avoids you being swamped with bazillions of duplicate error reports.
-
-<p>If you want to know how many times each error occurred, run with
-the <code>-v</code> option.  When execution finishes, all the reports
-are printed out, along with, and sorted by, their occurrence counts.
-This makes it easy to see which errors have occurred most frequently.
-
-<p>Errors are reported before the associated operation actually
-happens.  For example, if your program decides to read from address
-zero, Valgrind will emit a message to this effect, and the program
-will then duly die with a segmentation fault.
-
-<p>In general, you should try and fix errors in the order that they
-are reported.  Not doing so can be confusing.  For example, a program
-which copies uninitialised values to several memory locations, and
-later uses them, will generate several error messages.  The first such
-error message may well give the most direct clue to the root cause of
-the problem.
-
-<p>The process of detecting duplicate errors is quite an expensive
-one and can become a significant performance overhead if your program
-generates huge quantities of errors.  To avoid serious problems here,
-Valgrind will simply stop collecting errors after 300 different errors
-have been seen, or 30000 errors in total have been seen.  In this
-situation you might as well stop your program and fix it, because
-Valgrind won't tell you anything else useful after this.  Note that
-the 300/30000 limits apply after suppressed errors are removed.  These
-limits are defined in <code>vg_include.h</code> and can be increased
-if necessary.
-
-<p>To avoid this cutoff you can use the
-<code>--error-limit=no</code> flag.  Then valgrind will always show
-errors, regardless of how many there are.  Use this flag carefully,
-since it may have a dire effect on performance.
-
-
-<a name="suppress"></a>
-<h3>2.4&nbsp; Suppressing errors</h3>
-
-Valgrind detects numerous problems in the base libraries, such as the
-GNU C library, and the XFree86 client libraries, which come
-pre-installed on your GNU/Linux system.  You can't easily fix these,
-but you don't want to see these errors (and yes, there are many!)  So
-Valgrind reads a list of errors to suppress at startup.  
-A default suppression file is cooked up by the
-<code>./configure</code> script.
-
-<p>You can modify and add to the suppressions file at your leisure,
-or, better, write your own.  Multiple suppression files are allowed.
-This is useful if part of your project contains errors you can't or
-don't want to fix, yet you don't want to continuously be reminded of
-them.
-
-<p>Each error to be suppressed is described very specifically, to
-minimise the possibility that a suppression-directive inadvertently
-suppresses a bunch of similar errors which you did want to see.  The
-suppression mechanism is designed to allow precise yet flexible
-specification of errors to suppress.
-
-<p>If you use the <code>-v</code> flag, at the end of execution, Valgrind
-prints out one line for each used suppression, giving its name and the
-number of times it got used.  Here's the suppressions used by a run of
-<code>ls -l</code>:
-<pre>
-  --27579-- supp: 1 socketcall.connect(serv_addr)/__libc_connect/__nscd_getgrgid_r
-  --27579-- supp: 1 socketcall.connect(serv_addr)/__libc_connect/__nscd_getpwuid_r
-  --27579-- supp: 6 strrchr/_dl_map_object_from_fd/_dl_map_object
-</pre>
-
-<a name="flags"></a>
-<h3>2.5&nbsp; Command-line flags</h3>
-
-You invoke Valgrind like this:
-<pre>
-  valgrind [options-for-Valgrind] your-prog [options for your-prog]
-</pre>
-
-<p>Note that Valgrind also reads options from the environment variable
-<code>$VALGRIND</code>, and processes them before the command-line
-options.
-
-<p>Valgrind's default settings succeed in giving reasonable behaviour
-in most cases.  Available options, in no particular order, are as
-follows:
-<ul>
-  <li><code>--help</code></li><br>
-
-  <li><code>--version</code><br>
-      <p>The usual deal.</li><br><p>
-
-  <li><code>-v --verbose</code><br>
-      <p>Be more verbose.  Gives extra information on various aspects
-      of your program, such as: the shared objects loaded, the
-      suppressions used, the progress of the instrumentation engine,
-      and warnings about unusual behaviour.
-      </li><br><p>
-
-  <li><code>-q --quiet</code><br>
-      <p>Run silently, and only print error messages.  Useful if you
-      are running regression tests or have some other automated test
-      machinery.
-      </li><br><p>
-
-  <li><code>--demangle=no</code><br>
-      <code>--demangle=yes</code> [the default]
-      <p>Disable/enable automatic demangling (decoding) of C++ names.
-      Enabled by default.  When enabled, Valgrind will attempt to
-      translate encoded C++ procedure names back to something
-      approaching the original.  The demangler handles symbols mangled
-      by g++ versions 2.X and 3.X.
-
-      <p>An important fact about demangling is that function
-      names mentioned in suppressions files should be in their mangled
-      form.  Valgrind does not demangle function names when searching
-      for applicable suppressions, because to do otherwise would make
-      suppressions file contents dependent on the state of Valgrind's
-      demangling machinery, and would also be slow and pointless.
-      </li><br><p>
-
-  <li><code>--num-callers=&lt;number&gt;</code> [default=4]<br>
-      <p>By default, Valgrind shows four levels of function call names
-      to help you identify program locations.  You can change that
-      number with this option.  This can help in determining the
-      program's location in deeply-nested call chains.  Note that errors
-      are commoned up using only the top three function locations (the
-      place in the current function, and that of its two immediate
-      callers).  So this doesn't affect the total number of errors
-      reported.  
-      <p>
-      The maximum value for this is 50.  Note that higher settings
-      will make Valgrind run a bit more slowly and take a bit more
-      memory, but can be useful when working with programs with
-      deeply-nested call chains.  
-      </li><br><p>
-
-  <li><code>--gdb-attach=no</code> [the default]<br>
-      <code>--gdb-attach=yes</code>
-      <p>When enabled, Valgrind will pause after every error shown,
-      and print the line
-      <br>
-      <code>---- Attach to GDB ? --- [Return/N/n/Y/y/C/c] ----</code>
-      <p>
-      Pressing <code>Ret</code>, or <code>N</code> <code>Ret</code>
-      or <code>n</code> <code>Ret</code>, causes Valgrind not to
-      start GDB for this error.
-      <p>
-      <code>Y</code> <code>Ret</code>
-      or <code>y</code> <code>Ret</code> causes Valgrind to
-      start GDB, for the program at this point.  When you have
-      finished with GDB, quit from it, and the program will continue.
-      Trying to continue from inside GDB doesn't work.
-      <p>
-      <code>C</code> <code>Ret</code>
-      or <code>c</code> <code>Ret</code> causes Valgrind not to
-      start GDB, and not to ask again.
-      <p>
-      <code>--gdb-attach=yes</code> conflicts with
-      <code>--trace-children=yes</code>.  You can't use them together.
-      Valgrind refuses to start up in this situation.  1 May 2002:
-      this is a historical relic which could be easily fixed if it
-      gets in your way.  Mail me and complain if this is a problem for
-      you.  </li><br><p>
-     
-  <li><code>--partial-loads-ok=yes</code> [the default]<br>
-      <code>--partial-loads-ok=no</code>
-      <p>Controls how Valgrind handles word (4-byte) loads from
-      addresses for which some bytes are addressable and others
-      are not.  When <code>yes</code> (the default), such loads
-      do not elicit an address error.  Instead, the loaded V bytes
-      corresponding to the illegal addresses indicate undefined, and
-      those corresponding to legal addresses are loaded from shadow 
-      memory, as usual.
-      <p>
-      When <code>no</code>, loads from partially
-      invalid addresses are treated the same as loads from completely
-      invalid addresses: an illegal-address error is issued,
-      and the resulting V bytes indicate valid data.
-      </li><br><p>
-
-  <li><code>--sloppy-malloc=no</code> [the default]<br>
-      <code>--sloppy-malloc=yes</code>
-      <p>When enabled, all requests for malloc/calloc are rounded up
-      to a whole number of machine words -- in other words, made
-      divisible by 4.  For example, a request for 17 bytes of space
-      would result in a 20-byte area being made available.  This works
-      around bugs in sloppy libraries which assume that they can
-      safely rely on malloc/calloc requests being rounded up in this
-      fashion.  Without the workaround, these libraries tend to
-      generate large numbers of errors when they access the ends of
-      these areas.  
-      <p>
-      Valgrind snapshots dated 17 Feb 2002 and later are
-      cleverer about this problem, and you should no longer need to 
-      use this flag.  To put it bluntly, if you do need to use this
-      flag, your program violates the ANSI C semantics defined for
-      <code>malloc</code> and <code>free</code>, even if it appears to
-      work correctly, and you should fix it, at least if you hope for
-      maximum portability.
-      </li><br><p>
-
-  <li><code>--alignment=&lt;number></code> [default: 4]<br> <p>By
-      default valgrind's <code>malloc</code>, <code>realloc</code>,
-      etc, return 4-byte aligned addresses.  These are suitable for
-      any accesses on x86 processors. 
-      Some programs might however assume that <code>malloc</code> et
-      al return memory aligned to 8 or more bytes.
-      These programs are broken and should be fixed, but
-      if this is impossible for whatever reason the alignment can be
-      increased using this parameter.  The supplied value must be
-      between 4 and 4096 inclusive, and must be a power of two.</li><br><p>
-
-  <li><code>--trace-children=no</code> [the default]<br>
-      <code>--trace-children=yes</code>
-      <p>When enabled, Valgrind will trace into child processes.  This
-      is confusing and usually not what you want, so is disabled by
-      default.  As of 1 May 2002, tracing into a child process from a
-      parent which uses <code>libpthread.so</code> is probably broken
-      and is likely to cause breakage.  Please report any such
-      problems to me.  </li><br><p>
-
-  <li><code>--freelist-vol=&lt;number></code> [default: 1000000]
-      <p>When the client program releases memory using free (in C) or
-      delete (C++), that memory is not immediately made available for
-      re-allocation.  Instead it is marked inaccessible and placed in
-      a queue of freed blocks.  The purpose is to delay the point at
-      which freed-up memory comes back into circulation.  This
-      increases the chance that Valgrind will be able to detect
-      invalid accesses to blocks for some significant period of time
-      after they have been freed.  
-      <p>
-      This flag specifies the maximum total size, in bytes, of the
-      blocks in the queue.  The default value is one million bytes.
-      Increasing this increases the total amount of memory used by
-      Valgrind but may detect invalid uses of freed blocks which would
-      otherwise go undetected.</li><br><p>
-
-  <li><code>--logfile-fd=&lt;number></code> [default: 2, stderr]
-      <p>Specifies the file descriptor on which Valgrind communicates
-      all of its messages.  The default, 2, is the standard error
-      channel.  This may interfere with the client's own use of
-      stderr.  To dump Valgrind's commentary in a file without using
-      stderr, something like the following works well (sh/bash
-      syntax):<br>
-      <code>&nbsp;&nbsp;
-            valgrind --logfile-fd=9 my_prog 9> logfile</code><br>
-      That is: tell Valgrind to send all output to file descriptor 9,
-      and ask the shell to route file descriptor 9 to "logfile".
-      </li><br><p>
-
-  <li><code>--suppressions=&lt;filename></code> 
-      [default: $PREFIX/lib/valgrind/default.supp]
-      <p>Specifies an extra
-      file from which to read descriptions of errors to suppress.  You
-      may use as many extra suppressions files as you
-      like.</li><br><p>
-
-  <li><code>--leak-check=no</code> [default]<br>
-      <code>--leak-check=yes</code> 
-      <p>When enabled, search for memory leaks when the client program
-      finishes.  A memory leak means a malloc'd block, which has not
-      yet been free'd, but to which no pointer can be found.  Such a
-      block can never be free'd by the program, since no pointer to it
-      exists.  Leak checking is disabled by default because it tends
-      to generate dozens of error messages.  </li><br><p>
-
-  <li><code>--show-reachable=no</code> [default]<br>
-      <code>--show-reachable=yes</code> 
-      <p>When disabled, the memory leak detector only shows blocks for
-      which it cannot find a pointer to at all, or it can only find a
-      pointer to the middle of.  These blocks are prime candidates for
-      memory leaks.  When enabled, the leak detector also reports on
-      blocks which it could find a pointer to.  Your program could, at
-      least in principle, have freed such blocks before exit.
-      Contrast this to blocks for which no pointer, or only an
-      interior pointer could be found: they are more likely to
-      indicate memory leaks, because you do not actually have a
-      pointer to the start of the block which you can hand to
-      <code>free</code>, even if you wanted to.  </li><br><p>
-
-  <li><code>--leak-resolution=low</code> [default]<br>
-      <code>--leak-resolution=med</code> <br>
-      <code>--leak-resolution=high</code>
-      <p>When doing leak checking, determines how willing Valgrind is
-      to consider different backtraces to be the same.  When set to
-      <code>low</code>, the default, only the first two entries need
-      match.  When <code>med</code>, four entries have to match.  When
-      <code>high</code>, all entries need to match.  
-      <p>
-      For hardcore leak debugging, you probably want to use
-      <code>--leak-resolution=high</code> together with 
-      <code>--num-callers=40</code> or some such large number.  Note
-      however that this can give an overwhelming amount of
-      information, which is why the defaults are 4 callers and
-      low-resolution matching.
-      <p>
-      Note that the <code>--leak-resolution=</code> setting does not
-      affect Valgrind's ability to find leaks.  It only changes how
-      the results are presented.
-      </li><br><p>
-
-  <li><code>--workaround-gcc296-bugs=no</code> [default]<br>
-      <code>--workaround-gcc296-bugs=yes</code> <p>When enabled,
-      assume that reads and writes some small distance below the stack
-      pointer <code>%esp</code> are due to bugs in gcc 2.96, and do
-      not report them.  The "small distance" is 256 bytes by default.
-      Note that gcc 2.96 is the default compiler on some popular Linux
-      distributions (RedHat 7.X, Mandrake) and so you may well need to
-      use this flag.  Do not use it if you do not have to, as it can
-      cause real errors to be overlooked.  Another option is to use a
-      gcc/g++ which does not generate accesses below the stack
-      pointer.  2.95.3 seems to be a good choice in this respect.
-      <p>
-      Unfortunately (27 Feb 02) it looks like g++ 3.0.4 has a similar
-      bug, so you may need to issue this flag if you use 3.0.4.  A
-      while later (early Apr 02) this is confirmed as a scheduling bug
-      in g++-3.0.4.
-      </li><br><p>
-
-  <li><code>--error-limit=yes</code> [default]<br>
-      <code>--error-limit=no</code> <p>When enabled, valgrind stops
-      reporting errors after 30000 in total, or 300 different ones,
-      have been seen.  This is to stop the error tracking machinery
-      from becoming a huge performance overhead in programs with many
-      errors.  </li><br><p>
-
-  <li><code>--cachesim=no</code> [default]<br>
-      <code>--cachesim=yes</code> <p>When enabled, turns off memory
-      checking, and turns on cache profiling.  Cache profiling is
-      described in detail in <a href="#cache">Section 7</a>.
-      </li><br><p>
-
-  <li><code>--weird-hacks=hack1,hack2,...</code>
-      Pass miscellaneous hints to Valgrind which slightly modify the
-      simulated behaviour in nonstandard or dangerous ways, possibly
-      to help the simulation of strange features.  By default no hacks
-      are enabled.  Use with caution!  Currently known hacks are:
-      <p>
-      <ul>
-      <li><code>ioctl-VTIME</code> Use this if you have a program
-          which sets readable file descriptors to have a timeout by
-          doing <code>ioctl</code> on them with a
-          <code>TCSETA</code>-style command <b>and</b> a non-zero
-          <code>VTIME</code> timeout value.  This is considered
-          potentially dangerous and therefore is not engaged by
-          default, because it is (remotely) conceivable that it could
-          cause threads doing <code>read</code> to incorrectly block
-          the entire process.
-          <p>
-          You probably want to try this one if you have a program
-          which unexpectedly blocks in a <code>read</code> from a file
-          descriptor which you know to have been messed with by
-          <code>ioctl</code>.  This could happen, for example, if the
-          descriptor is used to read input from some kind of screen
-          handling library.
-          <p>
-          To find out if your program is blocking unexpectedly in the
-          <code>read</code> system call, run with
-          the <code>--trace-syscalls=yes</code> flag.
-      <p>
-      <li><code>truncate-writes</code> Use this if you have a threaded
-          program which appears to unexpectedly block whilst writing
-          into a pipe.  The effect is to modify all calls to
-          <code>write()</code> so that requests to write more than
-          4096 bytes are treated as if they only requested a write of
-          4096 bytes.  Valgrind does this by changing the
-          <code>count</code> argument of <code>write()</code>, as
-          passed to the kernel, so that it is at most 4096.  The
-          amount of data written will then be less than the client
-          program asked for, but the client should have a loop around
-          its <code>write()</code> call to check whether the requested
-          number of bytes have been written.  If not, it should issue
-          further <code>write()</code> calls until all the data is
-          written.
-          <p>
-          This all sounds pretty dodgy to me, which is why I've made
-          this behaviour only happen on request.  It is not the
-          default behaviour.  At the time of writing this (30 June
-          2002) I have only seen one example where this is necessary,
-          so either the problem is extremely rare or nobody is using
-          Valgrind :-)
-          <p>
-          On experimentation I see that <code>truncate-writes</code>
-          doesn't interact well with <code>ioctl-VTIME</code>, so you
-          probably don't want to try both at once.
-          <p>
-          As above, to find out if your program is blocking
-          unexpectedly in the <code>write()</code> system call, you
-          may find the <code>--trace-syscalls=yes
-          --trace-sched=yes</code> flags useful.
-      </ul>
-
-      </li><p>
-</ul>
-
-There are also some options for debugging Valgrind itself.  You
-shouldn't need to use them in the normal run of things.  Nevertheless:
-
-<ul>
-
-  <li><code>--single-step=no</code> [default]<br>
-      <code>--single-step=yes</code>
-      <p>When enabled, each x86 insn is translated separately into
-      instrumented code.  When disabled, translation is done on a
-      per-basic-block basis, giving much better translations.</li><br>
-      <p>
-
-  <li><code>--optimise=no</code><br>
-      <code>--optimise=yes</code> [default]
-      <p>When enabled, various improvements are applied to the
-      intermediate code, mainly aimed at allowing the simulated CPU's
-      registers to be cached in the real CPU's registers over several
-      simulated instructions.</li><br>
-      <p>
-
-  <li><code>--instrument=no</code><br>
-      <code>--instrument=yes</code> [default]
-      <p>When disabled, the translations don't actually contain any
-      instrumentation.</li><br>
-      <p>
-
-  <li><code>--cleanup=no</code><br>
-      <code>--cleanup=yes</code> [default]
-      <p>When enabled, various improvements are applied to the
-      post-instrumented intermediate code, aimed at removing redundant
-      value checks.</li><br>
-      <p>
-
-  <li><code>--trace-syscalls=no</code> [default]<br>
-      <code>--trace-syscalls=yes</code>
-      <p>Enable/disable tracing of system call intercepts.</li><br>
-      <p>
-
-  <li><code>--trace-signals=no</code> [default]<br>
-      <code>--trace-signals=yes</code>
-      <p>Enable/disable tracing of signal handling.</li><br>
-      <p>
-
-  <li><code>--trace-sched=no</code> [default]<br>
-      <code>--trace-sched=yes</code>
-      <p>Enable/disable tracing of thread scheduling events.</li><br>
-      <p>
-
-  <li><code>--trace-pthread=none</code> [default]<br>
-      <code>--trace-pthread=some</code> <br>
-      <code>--trace-pthread=all</code>
-      <p>Specifies amount of trace detail for pthread-related events.</li><br>
-      <p>
-
-  <li><code>--trace-symtab=no</code> [default]<br>
-      <code>--trace-symtab=yes</code>
-      <p>Enable/disable tracing of symbol table reading.</li><br>
-      <p>
-
-  <li><code>--trace-malloc=no</code> [default]<br>
-      <code>--trace-malloc=yes</code>
-      <p>Enable/disable tracing of malloc/free (et al) intercepts.
-      </li><br>
-      <p>
-
-  <li><code>--stop-after=&lt;number></code> 
-      [default: infinity, more or less]
-      <p>After &lt;number> basic blocks have been executed, shut down
-      Valgrind and switch back to running the client on the real CPU.
-      </li><br>
-      <p>
-
-  <li><code>--dump-error=&lt;number></code> [default: inactive]
-      <p>After the program has exited, show gory details of the
-      translation of the basic block containing the &lt;number>'th
-      error context.  When used with <code>--single-step=yes</code>,
-      can show the exact x86 instruction causing an error.  This is
-      all fairly dodgy and doesn't work at all if threads are
-      involved.</li><br>
-      <p>
-</ul>
-
-
-<a name="errormsgs"></a>
-<h3>2.6&nbsp; Explanation of error messages</h3>
-
-Despite considerable sophistication under the hood, Valgrind can only
-really detect two kinds of errors, use of illegal addresses, and use
-of undefined values.  Nevertheless, this is enough to help you
-discover all sorts of memory-management nasties in your code.  This
-section presents a quick summary of what error messages mean.  The
-precise behaviour of the error-checking machinery is described in
-<a href="#machine">Section 4</a>.
-
-
-<h4>2.6.1&nbsp; Illegal read / Illegal write errors</h4>
-For example:
-<pre>
-  Invalid read of size 4
-     at 0x40F6BBCC: (within /usr/lib/libpng.so.2.1.0.9)
-     by 0x40F6B804: (within /usr/lib/libpng.so.2.1.0.9)
-     by 0x40B07FF4: read_png_image__FP8QImageIO (kernel/qpngio.cpp:326)
-     by 0x40AC751B: QImageIO::read() (kernel/qimage.cpp:3621)
-     Address 0xBFFFF0E0 is not stack'd, malloc'd or free'd
-</pre>
-
-<p>This happens when your program reads or writes memory at a place
-which Valgrind reckons it shouldn't.  In this example, the program did
-a 4-byte read at address 0xBFFFF0E0, somewhere within the
-system-supplied library libpng.so.2.1.0.9, which was called from
-somewhere else in the same library, called from line 326 of
-qpngio.cpp, and so on.
-
-<p>Valgrind tries to establish what the illegal address might relate
-to, since that's often useful.  So, if it points into a block of
-memory which has already been freed, you'll be informed of this, and
-also where the block was free'd at.  Likewise, if it should turn out
-to be just off the end of a malloc'd block, a common result of
-off-by-one-errors in array subscripting, you'll be informed of this
-fact, and also where the block was malloc'd.
-
-<p>In this example, Valgrind can't identify the address.  Actually the
-address is on the stack, but, for some reason, this is not a valid
-stack address -- it is below the stack pointer, %esp, and that isn't
-allowed.  In this particular case it's probably caused by gcc
-generating invalid code, a known bug in various flavours of gcc.
-
-<p>Note that Valgrind only tells you that your program is about to
-access memory at an illegal address.  It can't stop the access from
-happening.  So, if your program makes an access which normally would
-result in a segmentation fault, your program will still suffer the same
-fate -- but you will get a message from Valgrind immediately prior to
-this.  In this particular example, reading junk on the stack is
-non-fatal, and the program stays alive.
-
-
-<h4>2.6.2&nbsp; Use of uninitialised values</h4>
-For example:
-<pre>
-  Conditional jump or move depends on uninitialised value(s)
-     at 0x402DFA94: _IO_vfprintf (_itoa.h:49)
-     by 0x402E8476: _IO_printf (printf.c:36)
-     by 0x8048472: main (tests/manuel1.c:8)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-</pre>
-
-<p>An uninitialised-value use error is reported when your program uses
-a value which hasn't been initialised -- in other words, is undefined.
-Here, the undefined value is used somewhere inside the printf()
-machinery of the C library.  This error was reported when running the
-following small program:
-<pre>
-  int main()
-  {
-    int x;
-    printf ("x = %d\n", x);
-  }
-</pre>
-
-<p>It is important to understand that your program can copy around
-junk (uninitialised) data to its heart's content.  Valgrind observes
-this and keeps track of the data, but does not complain.  A complaint
-is issued only when your program attempts to make use of uninitialised
-data.  In this example, x is uninitialised.  Valgrind observes the
-value being passed to _IO_printf and thence to _IO_vfprintf, but makes
-no comment.  However, _IO_vfprintf has to examine the value of x so it
-can turn it into the corresponding ASCII string, and it is at this
-point that Valgrind complains.
-
-<p>Sources of uninitialised data tend to be:
-<ul>
-  <li>Local variables in procedures which have not been initialised,
-      as in the example above.</li><br><p>
-
-  <li>The contents of malloc'd blocks, before you write something
-      there.  In C++, the new operator is a wrapper round malloc, so
-      if you create an object with new, its fields will be
-      uninitialised until you fill them in, which is only Right and
-      Proper.</li>
-</ul>
-
-
-
-<h4>2.6.3&nbsp; Illegal frees</h4>
-For example:
-<pre>
-  Invalid free()
-     at 0x4004FFDF: free (ut_clientmalloc.c:577)
-     by 0x80484C7: main (tests/doublefree.c:10)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/doublefree)
-     Address 0x3807F7B4 is 0 bytes inside a block of size 177 free'd
-     at 0x4004FFDF: free (ut_clientmalloc.c:577)
-     by 0x80484C7: main (tests/doublefree.c:10)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/doublefree)
-</pre>
-<p>Valgrind keeps track of the blocks allocated by your program with
-malloc/new, so it can know exactly whether or not the argument to
-free/delete is legitimate.  Here, this test program has
-freed the same block twice.  As with the illegal read/write errors,
-Valgrind attempts to make sense of the address free'd.  If, as
-here, the address is one which has previously been freed, you will
-be told that -- making duplicate frees of the same block easy to spot.
-
-
-<h4>2.6.4&nbsp; When a block is freed with an inappropriate
-deallocation function</h4>
-In the following example, a block allocated with <code>new[]</code>
-has wrongly been deallocated with <code>free</code>:
-<pre>
-  Mismatched free() / delete / delete []
-     at 0x40043249: free (vg_clientfuncs.c:171)
-     by 0x4102BB4E: QGArray::~QGArray(void) (tools/qgarray.cpp:149)
-     by 0x4C261C41: PptDoc::~PptDoc(void) (include/qmemarray.h:60)
-     by 0x4C261F0E: PptXml::~PptXml(void) (pptxml.cc:44)
-     Address 0x4BB292A8 is 0 bytes inside a block of size 64 alloc'd
-     at 0x4004318C: __builtin_vec_new (vg_clientfuncs.c:152)
-     by 0x4C21BC15: KLaola::readSBStream(int) const (klaola.cc:314)
-     by 0x4C21C155: KLaola::stream(KLaola::OLENode const *) (klaola.cc:416)
-     by 0x4C21788F: OLEFilter::convert(QCString const &amp;) (olefilter.cc:272)
-</pre>
-The following was told to me by the KDE 3 developers.  I didn't know
-any of it myself.  They also implemented the check itself.
-<p>
-In C++ it's important to deallocate memory in a way compatible with
-how it was allocated.  The deal is:
-<ul>
-<li>If allocated with <code>malloc</code>, <code>calloc</code>,
-    <code>realloc</code>, <code>valloc</code> or
-    <code>memalign</code>, you must deallocate with <code>free</code>.
-<li>If allocated with <code>new[]</code>, you must deallocate with
-    <code>delete[]</code>.
-<li>If allocated with <code>new</code>, you must deallocate with
-    <code>delete</code>.
-</ul>
-The worst thing is that on Linux apparently it doesn't matter if you
-do muddle these up, and it all seems to work ok, but the same program
-may then crash on a different platform, Solaris for example.  So it's
-best to fix it properly.  According to the KDE folks "it's amazing how
-many C++ programmers don't know this".  
-<p>
-Pascal Massimino adds the following clarification:
-<code>delete[]</code> must be called associated with a
-<code>new[]</code> because the compiler stores the size of the array
-and the pointer-to-member to the destructor of the array's content
-just before the pointer actually returned.  This implies a
-variable-sized overhead in what's returned by <code>new</code> or
-<code>new[]</code>.  It is rather surprising how compilers [Ed:
-runtime-support libraries?] are robust to mismatch in
-<code>new</code>/<code>delete</code>
-<code>new[]</code>/<code>delete[]</code>.
-
-
-<h4>2.6.5&nbsp; Passing system call parameters with inadequate
-read/write permissions</h4>
-
-Valgrind checks all parameters to system calls.  If a system call
-needs to read from a buffer provided by your program, Valgrind checks
-that the entire buffer is addressable and has valid data, ie, it is
-readable.  And if the system call needs to write to a user-supplied
-buffer, Valgrind checks that the buffer is addressable.  After the
-system call, Valgrind updates its administrative information to
-precisely reflect any changes in memory permissions caused by the
-system call.
-
-<p>Here's an example of a system call with an invalid parameter:
-<pre>
-  #include &lt;stdlib.h>
-  #include &lt;unistd.h>
-  int main( void )
-  {
-    char* arr = malloc(10);
-    (void) write( 1 /* stdout */, arr, 10 );
-    return 0;
-  }
-</pre>
-
-<p>You get this complaint ...
-<pre>
-  Syscall param write(buf) contains uninitialised or unaddressable byte(s)
-     at 0x4035E072: __libc_write
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/badwrite)
-     by &lt;bogus frame pointer> ???
-     Address 0x3807E6D0 is 0 bytes inside a block of size 10 alloc'd
-     at 0x4004FEE6: malloc (ut_clientmalloc.c:539)
-     by 0x80484A0: main (tests/badwrite.c:6)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/badwrite)
-</pre>
-
-<p>... because the program has tried to write uninitialised junk from
-the malloc'd block to the standard output.
-
-
-<h4>2.6.6&nbsp; Warning messages you might see</h4>
-
-Most of these only appear if you run in verbose mode (enabled by
-<code>-v</code>):
-<ul>
-<li> <code>More than 50 errors detected.  Subsequent errors
-     will still be recorded, but in less detail than before.</code>
-     <br>
-     After 50 different errors have been shown, Valgrind becomes 
-     more conservative about collecting them.  It then requires only 
-     the program counters in the top two stack frames to match when
-     deciding whether or not two errors are really the same one.
-     Prior to this point, the PCs in the top four frames are required
-     to match.  This hack has the effect of slowing down the
-     appearance of new errors after the first 50.  The 50 constant can
-     be changed by recompiling Valgrind.
-<p>
-<li> <code>More than 300 errors detected.  I'm not reporting any more.
-     Final error counts may be inaccurate.  Go fix your
-     program!</code>
-     <br>
-     After 300 different errors have been detected, Valgrind ignores
-     any more.  It seems unlikely that collecting even more different
-     ones would be of practical help to anybody, and it avoids the
-     danger that Valgrind spends more and more of its time comparing
-     new errors against an ever-growing collection.  As above, the 300
-     number is a compile-time constant.
-<p>
-<li> <code>Warning: client switching stacks?</code>
-     <br>
-     Valgrind spotted such a large change in the stack pointer, %esp,
-     that it guesses the client is switching to a different stack.
-     At this point it makes a kludgey guess where the base of the new
-     stack is, and sets memory permissions accordingly.  You may get
-     many bogus error messages following this, if Valgrind guesses
-     wrong.  At the moment "large change" is defined as a change of
-     more than 2000000 in the value of the %esp (stack pointer)
-     register.
-<p>
-<li> <code>Warning: client attempted to close Valgrind's logfile fd &lt;number>
-     </code>
-     <br>
-     Valgrind doesn't allow the client
-     to close the logfile, because you'd never see any diagnostic
-     information after that point.  If you see this message,
-     you may want to use the <code>--logfile-fd=&lt;number></code> 
-     option to specify a different logfile file-descriptor number.
-<p>
-<li> <code>Warning: noted but unhandled ioctl &lt;number></code>
-     <br>
-     Valgrind observed a call to one of the vast family of
-     <code>ioctl</code> system calls, but did not modify its
-     memory status info (because I have not yet got round to it).
-     The call will still have gone through, but you may get spurious
-     errors after this as a result of the non-update of the memory info.
-<p>
-<li> <code>Warning: set address range perms: large range &lt;number></code>
-     <br> 
-     Diagnostic message, mostly for my benefit, to do with memory 
-     permissions.
-</ul>
-
-
-<a name="suppfiles"></a>
-<h3>2.7&nbsp; Writing suppressions files</h3>
-
-A suppression file describes a bunch of errors which, for one reason
-or another, you don't want Valgrind to tell you about.  Usually the
-reason is that the system libraries are buggy but unfixable, at least
-within the scope of the current debugging session.  Multiple
-suppressions files are allowed.  By default, Valgrind uses
-<code>$PREFIX/lib/valgrind/default.supp</code>.
-
-<p>
-You can ask to add suppressions from another file, by specifying
-<code>--suppressions=/path/to/file.supp</code>.
-
-<p>Each suppression has the following components:<br>
-<ul>
-
-  <li>Its name.  This merely gives a handy name to the suppression, by
-      which it is referred to in the summary of used suppressions
-      printed out when a program finishes.  It's not important what
-      the name is; any identifying string will do.
-      <p>
-
-  <li>The nature of the error to suppress.  Either: 
-      <code>Value1</code>, 
-      <code>Value2</code>,
-      <code>Value4</code> or
-      <code>Value8</code>,
-      meaning an uninitialised-value error when
-      using a value of 1, 2, 4 or 8 bytes.
-      Or
-      <code>Cond</code> (or its old name, <code>Value0</code>),
-      meaning use of an uninitialised CPU condition code.  Or: 
-      <code>Addr1</code>,
-      <code>Addr2</code>, 
-      <code>Addr4</code> or 
-      <code>Addr8</code>, meaning an invalid address during a
-      memory access of 1, 2, 4 or 8 bytes respectively.  Or 
-      <code>Param</code>,
-      meaning an invalid system call parameter error.  Or
-      <code>Free</code>, meaning an invalid or mismatching free.
-      Or <code>PThread</code>, meaning any kind of complaint to do
-      with the PThreads API.</li><br>
-      <p>
-
-  <li>The "immediate location" specification.  For Value and Addr
-      errors, is either the name of the function in which the error
-      occurred, or, failing that, the full path to the .so file
-      containing the error location.  For Param errors, is the name of
-      the offending system call parameter.  For Free errors, is the
-      name of the function doing the freeing (eg, <code>free</code>,
-      <code>__builtin_vec_delete</code>, etc)</li><br>
-      <p>
-
-  <li>The caller of the above "immediate location".  Again, either a
-      function or shared-object name.</li><br>
-      <p>
-
-  <li>Optionally, one or two extra calling-function or object names,
-      for greater precision.</li>
-</ul>
-
-<p>
-Locations may be either names of shared objects or wildcards matching
-function names.  They begin <code>obj:</code> and <code>fun:</code>
-respectively.  Function and object names to match against may use the 
-wildcard characters <code>*</code> and <code>?</code>.
-
-A suppression only suppresses an error when the error matches all the
-details in the suppression.  Here's an example:
-<pre>
-  {
-    __gconv_transform_ascii_internal/__mbrtowc/mbtowc
-    Value4
-    fun:__gconv_transform_ascii_internal
-    fun:__mbr*towc
-    fun:mbtowc
-  }
-</pre>
-
-<p>What it means is: suppress a use-of-uninitialised-value error, when
-the data size is 4, when it occurs in the function
-<code>__gconv_transform_ascii_internal</code>, when that is called
-from any function of name matching <code>__mbr*towc</code>, 
-when that is called from
-<code>mbtowc</code>.  It doesn't apply under any other circumstances.
-The string by which this suppression is identified to the user is
-__gconv_transform_ascii_internal/__mbrtowc/mbtowc.
-
-<p>Another example:
-<pre>
-  {
-    libX11.so.6.2/libX11.so.6.2/libXaw.so.7.0
-    Value4
-    obj:/usr/X11R6/lib/libX11.so.6.2
-    obj:/usr/X11R6/lib/libX11.so.6.2
-    obj:/usr/X11R6/lib/libXaw.so.7.0
-  }
-</pre>
-
-<p>Suppress any size 4 uninitialised-value error which occurs anywhere
-in <code>libX11.so.6.2</code>, when called from anywhere in the same
-library, when called from anywhere in <code>libXaw.so.7.0</code>.  The
-inexact specification of locations is regrettable, but is about all
-you can hope for, given that the X11 libraries shipped with Red Hat
-7.2 have had their symbol tables removed.
-
-<p>Note -- since the above two examples did not make it clear -- that
-you can freely mix the <code>obj:</code> and <code>fun:</code>
-styles of description within a single suppression record.
-
-
-<a name="clientreq"></a>
-<h3>2.8&nbsp; The Client Request mechanism</h3>
-
-Valgrind has a trapdoor mechanism via which the client program can
-pass all manner of requests and queries to Valgrind.  Internally, this
-is used extensively to make malloc, free, signals, threads, etc, work,
-although you don't see that.
-<p>
-For your convenience, a subset of these so-called client requests is
-provided to allow you to tell Valgrind facts about the behaviour of
-your program, and conversely to make queries.  In particular, your
-program can tell Valgrind about changes in memory range permissions
-that Valgrind would not otherwise know about, and so allows clients to
-get Valgrind to do arbitrary custom checks.
-<p>
-Clients need to include the header file <code>valgrind.h</code> to
-make this work.  The macros therein have the magical property that
-they generate code in-line which Valgrind can spot.  However, the code
-does nothing when not run on Valgrind, so you are not forced to run
-your program on Valgrind just because you use the macros in this file.
-Also, you are not required to link your program with any extra
-supporting libraries.
-<p>
-A brief description of the available macros:
-<ul>
-<li><code>VALGRIND_MAKE_NOACCESS</code>,
-    <code>VALGRIND_MAKE_WRITABLE</code> and
-    <code>VALGRIND_MAKE_READABLE</code>.  These mark address
-    ranges as completely inaccessible, accessible but containing
-    undefined data, and accessible and containing defined data,
-    respectively.  Subsequent errors may have their faulting
-    addresses described in terms of these blocks.  Returns a
-    "block handle".  Returns zero when not run on Valgrind.
-<p>
-<li><code>VALGRIND_DISCARD</code>: At some point you may want
-    Valgrind to stop reporting errors in terms of the blocks
-    defined by the previous three macros.  To do this, the above
-    macros return a small-integer "block handle".  You can pass
-    this block handle to <code>VALGRIND_DISCARD</code>.  After
-    doing so, Valgrind will no longer be able to relate
-    addressing errors to the user-defined block associated with
-    the handle.  The permissions settings associated with the
-    handle remain in place; this just affects how errors are
-    reported, not whether they are reported.  Returns 1 for an
-    invalid handle and 0 for a valid handle (although passing
-    invalid handles is harmless).  Always returns 0 when not run
-    on Valgrind.
-<p>
-<li><code>VALGRIND_CHECK_NOACCESS</code>,
-    <code>VALGRIND_CHECK_WRITABLE</code> and
-    <code>VALGRIND_CHECK_READABLE</code>: check immediately
-    whether or not the given address range has the relevant
-    property, and if not, print an error message.  Also, for the
-    convenience of the client, returns zero if the relevant
-    property holds; otherwise, the returned value is the address
-    of the first byte for which the property is not true.
-    Always returns 0 when not run on Valgrind.
-<p>
-<li><code>VALGRIND_CHECK_DEFINED</code>: a quick and easy way
-    to find out whether Valgrind thinks a particular variable
-    (lvalue, to be precise) is addressible and defined.  Prints
-    an error message if not.  Returns no value.
-<p>
-<li><code>VALGRIND_MAKE_NOACCESS_STACK</code>: a highly
-    experimental feature.  Similarly to
-    <code>VALGRIND_MAKE_NOACCESS</code>, this marks an address
-    range as inaccessible, so that subsequent accesses to an
-    address in the range give an error.  However, this macro
-    does not return a block handle.  Instead, all annotations
-    created like this are reviewed at each client
-    <code>ret</code> (subroutine return) instruction, and those
-    which now define an address range below the client's stack
-    pointer register (<code>%esp</code>) are automatically
-    deleted.
-    <p>
-    In other words, this macro allows the client to tell
-    Valgrind about red-zones on its own stack.  Valgrind
-    automatically discards this information when the stack
-    retreats past such blocks.  Beware: hacky and flaky, and
-    probably interacts badly with the new pthread support.
-<p>
-<li><code>RUNNING_ON_VALGRIND</code>: returns 1 if running on
-    Valgrind, 0 if running on the real CPU.
-<p>
-<li><code>VALGRIND_DO_LEAK_CHECK</code>: run the memory leak detector
-    right now.  Returns no value.  I guess this could be used to
-    incrementally check for leaks between arbitrary places in the
-    program's execution.  Warning: not properly tested!
-<p>
-<li><code>VALGRIND_DISCARD_TRANSLATIONS</code>: discard translations
-    of code in the specified address range.  Useful if you are
-    debugging a JITter or some other dynamic code generation system.
-    After this call, attempts to execute code in the invalidated
-    address range will cause valgrind to make new translations of that
-    code, which is probably the semantics you want.  Note that this is
-    implemented naively, and involves checking all 200191 entries in
-    the translation table to see if any of them overlap the specified
-    address range.  So try not to call it often, or performance will
-    nosedive.  Note that you can be clever about this: you only need
-    to call it when an area which previously contained code is
-    overwritten with new code.  You can choose to write code into
-    fresh memory, and just call this occasionally to discard large
-    chunks of old code all at once.
-    <p>
-    Warning: minimally tested, especially for the cache simulator.
-</ul>
-<p>
-
-
-<a name="pthreads"></a>
-<h3>2.9&nbsp; Support for POSIX Pthreads</h3>
-
-As of late April 02, Valgrind supports programs which use POSIX
-pthreads.  Doing this has proved technically challenging but is now
-mostly complete.  It works well enough for significant threaded
-applications to work.
-<p>
-It works as follows: threaded apps are (dynamically) linked against
-<code>libpthread.so</code>.  Usually this is the one installed with
-your Linux distribution.  Valgrind, however, supplies its own
-<code>libpthread.so</code> and automatically connects your program to
-it instead.
-<p>
-The fake <code>libpthread.so</code> and Valgrind cooperate to
-implement a user-space pthreads package.  This approach avoids the 
-horrible implementation problems of implementing a truly
-multiprocessor version of Valgrind, but it does mean that threaded
-apps run only on one CPU, even if you have a multiprocessor machine.
-<p>
-Valgrind schedules your threads in a round-robin fashion, with all
-threads having equal priority.  It switches threads every 50000 basic
-blocks (typically around 300000 x86 instructions), which means you'll
-get a much finer interleaving of thread executions than when run
-natively.  This in itself may cause your program to behave differently
-if you have some kind of concurrency, critical race, locking, or
-similar, bugs.
-<p>
-The current (valgrind-1.0 release) state of pthread support is as
-follows:
-<ul>
-<li>Mutexes, condition variables, thread-specific data,
-    <code>pthread_once</code>, reader-writer locks, semaphores,
-    cleanup stacks, cancellation and thread detaching currently work.
-    Various attribute-like calls are handled but ignored; you get a
-    warning message.
-<p>
-<li>Currently the following syscalls are thread-safe (nonblocking):
-    <code>write</code> <code>read</code> <code>nanosleep</code>
-    <code>sleep</code> <code>select</code> <code>poll</code> 
-    <code>recvmsg</code> and
-    <code>accept</code>.
-<p>
-<li>Signals in pthreads are now handled properly(ish): 
-    <code>pthread_sigmask</code>, <code>pthread_kill</code>,
-    <code>sigwait</code> and <code>raise</code> are now implemented.
-    Each thread has its own signal mask, as POSIX requires.
-    It's a bit kludgey -- there's a system-wide pending signal set,
-    rather than one for each thread.  But hey.
-</ul>
-
-
-As of 18 May 02, the following threaded programs now work fine on my
-RedHat 7.2 box: Opera 6.0Beta2, KNode in KDE 3.0, Mozilla-0.9.2.1 and
-Galeon-0.11.3, both as supplied with RedHat 7.2.  Also Mozilla 1.0RC2.
-OpenOffice 1.0.  MySQL 3.something (the current stable release).
-
-<a name="install"></a>
-<h3>2.10&nbsp; Building and installing</h3>
-
-We now use the standard Unix <code>./configure</code>,
-<code>make</code>, <code>make install</code> mechanism, and I have
-attempted to ensure that it works on machines with kernel 2.2 or 2.4
-and glibc 2.1.X or 2.2.X.  I don't think there is much else to say.
-There are no options apart from the usual <code>--prefix</code> that
-you should give to <code>./configure</code>.
-
-<p>
-The <code>configure</code> script tests the version of the X server
-currently indicated by the current <code>$DISPLAY</code>.  This is a
-known bug.  The intention was to detect the version of the current
-XFree86 client libraries, so that correct suppressions could be
-selected for them, but instead the test checks the server version.
-This is just plain wrong.
-
-<p>
-If you are building a binary package of Valgrind for distribution,
-please read <code>README_PACKAGERS</code>.  It contains some important
-information.
-
-<p>
-Apart from that there is no excitement here.  Let me know if you have
-build problems.
-
-
-
-<a name="problems"></a>
-<h3>2.11&nbsp; If you have problems</h3>
-Mail me (<a href="mailto:jseward@acm.org">jseward@acm.org</a>).
-
-<p>See <a href="#limits">Section 4</a> for the known limitations of
-Valgrind, and for a list of programs which are known not to work on
-it.
-
-<p>The translator/instrumentor has a lot of assertions in it.  They
-are permanently enabled, and I have no plans to disable them.  If one
-of these breaks, please mail me!
-
-<p>If you get an assertion failure on the expression
-<code>chunkSane(ch)</code> in <code>vg_free()</code> in
-<code>vg_malloc.c</code>, this may have happened because your program
-wrote off the end of a malloc'd block, or before its beginning.
-Valgrind should have emitted a proper message to that effect before
-dying in this way.  This is a known problem which I should fix.
-<p>
-
-<hr width="100%">
-
-<a name="machine"></a>
-<h2>3&nbsp; Details of the checking machinery</h2>
-
-Read this section if you want to know, in detail, exactly what and how
-Valgrind is checking.
-
-<a name="vvalue"></a>
-<h3>3.1&nbsp; Valid-value (V) bits</h3>
-
-It is simplest to think of Valgrind implementing a synthetic Intel x86
-CPU which is identical to a real CPU, except for one crucial detail.
-Every bit (literally) of data processed, stored and handled by the
-real CPU has, in the synthetic CPU, an associated "valid-value" bit,
-which says whether or not the accompanying bit has a legitimate value.
-In the discussions which follow, this bit is referred to as the V
-(valid-value) bit.
-
-<p>Each byte in the system therefore has 8 V bits which follow
-it wherever it goes.  For example, when the CPU loads a word-size item
-(4 bytes) from memory, it also loads the corresponding 32 V bits from
-a bitmap which stores the V bits for the process' entire address
-space.  If the CPU should later write the whole or some part of that
-value to memory at a different address, the relevant V bits will be
-stored back in the V-bit bitmap.
-
-<p>In short, each bit in the system has an associated V bit, which
-follows it around everywhere, even inside the CPU.  Yes, the CPU's
-(integer and <code>%eflags</code>) registers have their own V bit
-vectors.
-
-<p>Copying values around does not cause Valgrind to check for, or
-report on, errors.  However, when a value is used in a way which might
-conceivably affect the outcome of your program's computation, the
-associated V bits are immediately checked.  If any of these indicate
-that the value is undefined, an error is reported.
-
-<p>Here's an (admittedly nonsensical) example:
-<pre>
-  int i, j;
-  int a[10], b[10];
-  for (i = 0; i &lt; 10; i++) {
-    j = a[i];
-    b[i] = j;
-  }
-</pre>
-
-<p>Valgrind emits no complaints about this, since it merely copies
-uninitialised values from <code>a[]</code> into <code>b[]</code>, and
-doesn't use them in any way.  However, if the loop is changed to
-<pre>
-  for (i = 0; i &lt; 10; i++) {
-    j += a[i];
-  }
-  if (j == 77) 
-     printf("hello there\n");
-</pre>
-then Valgrind will complain, at the <code>if</code>, that the
-condition depends on uninitialised values.
-
-<p>Most low level operations, such as adds, cause Valgrind to 
-use the V bits for the operands to calculate the V bits for the
-result.  Even if the result is partially or wholly undefined,
-it does not complain.
-
-<p>Checks on definedness only occur in two places: when a value is
-used to generate a memory address, and where a control flow decision
-needs to be made.  Also, when a system call is detected, valgrind
-checks definedness of parameters as required.
-
-<p>If a check should detect undefinedness, an error message is
-issued.  The resulting value is subsequently regarded as well-defined.
-To do otherwise would give long chains of error messages.  In effect,
-we say that undefined values are non-infectious.
-
-<p>This sounds overcomplicated.  Why not just check all reads from
-memory, and complain if an undefined value is loaded into a CPU register? 
-Well, that doesn't work well, because perfectly legitimate C programs routinely
-copy uninitialised values around in memory, and we don't want endless complaints
-about that.  Here's the canonical example.  Consider a struct
-like this:
-<pre>
-  struct S { int x; char c; };
-  struct S s1, s2;
-  s1.x = 42;
-  s1.c = 'z';
-  s2 = s1;
-</pre>
-
-<p>The question to ask is: how large is <code>struct S</code>, in
-bytes?  An int is 4 bytes and a char one byte, so perhaps a struct S
-occupies 5 bytes?  Wrong.  All (non-toy) compilers I know of will
-round the size of <code>struct S</code> up to a whole number of words,
-in this case 8 bytes.  Not doing this forces compilers to generate
-truly appalling code for subscripting arrays of <code>struct
-S</code>'s.
-
-<p>So s1 occupies 8 bytes, yet only 5 of them will be initialised.
-For the assignment <code>s2 = s1</code>, gcc generates code to copy
-all 8 bytes wholesale into <code>s2</code> without regard for their
-meaning.  If Valgrind simply checked values as they came out of
-memory, it would yelp every time a structure assignment like this
-happened.  So the more complicated semantics described above is
-necessary.  This allows gcc to copy <code>s1</code> into
-<code>s2</code> any way it likes, and a warning will only be emitted
-if the uninitialised values are later used.
-
-<p>One final twist to this story.  The above scheme allows garbage to
-pass through the CPU's integer registers without complaint.  It does
-this by giving the integer registers V tags, passing these around in
-the expected way.  This is complicated and computationally expensive to
-do, but is necessary.  Valgrind is more simplistic about
-floating-point loads and stores.  In particular, V bits for data read
-as a result of floating-point loads are checked at the load
-instruction.  So if your program uses the floating-point registers to
-do memory-to-memory copies, you will get complaints about
-uninitialised values.  Fortunately, I have not yet encountered a
-program which (ab)uses the floating-point registers in this way.
-
-<a name="vaddress"></a>
-<h3>3.2&nbsp; Valid-address (A) bits</h3>
-
-Notice that the previous section describes how the validity of values
-is established and maintained without having to say whether the
-program does or does not have the right to access any particular
-memory location.  We now consider the latter issue.
-
-<p>As described above, every bit in memory or in the CPU has an
-associated valid-value (V) bit.  In addition, all bytes in memory, but
-not in the CPU, have an associated valid-address (A) bit.  This
-indicates whether or not the program can legitimately read or write
-that location.  It does not give any indication of the validity of the
-data at that location -- that's the job of the V bits -- only whether
-or not the location may be accessed.
-
-<p>Every time your program reads or writes memory, Valgrind checks the
-A bits associated with the address.  If any of them indicate an
-invalid address, an error is emitted.  Note that the reads and writes
-themselves do not change the A bits, only consult them.
-
-<p>So how do the A bits get set/cleared?  Like this:
-
-<ul>
-  <li>When the program starts, all the global data areas are marked as
-      accessible.</li><br>
-      <p>
-
-  <li>When the program does malloc/new, the A bits for exactly the
-      area allocated, and not a byte more, are marked as accessible.
-      Upon freeing the area the A bits are changed to indicate
-      inaccessibility.</li><br>
-      <p>
-
-  <li>When the stack pointer register (%esp) moves up or down, A bits
-      are set.  The rule is that the area from %esp up to the base of
-      the stack is marked as accessible, and below %esp is
-      inaccessible.  (If that sounds illogical, bear in mind that the
-      stack grows down, not up, on almost all Unix systems, including
-      GNU/Linux.)  Tracking %esp like this has the useful side-effect
-      that the section of stack used by a function for local variables
-      etc is automatically marked accessible on function entry and
-      inaccessible on exit.</li><br>
-      <p>
-
-  <li>When doing system calls, A bits are changed appropriately.  For
-      example, mmap() magically makes files appear in the process's
-      address space, so the A bits must be updated if mmap()
-      succeeds.</li><br>
-      <p>
-
-  <li>Optionally, your program can tell Valgrind about such changes
-      explicitly, using the client request mechanism described above.
-</ul>
-
-
-<a name="together"></a>
-<h3>3.3&nbsp; Putting it all together</h3>
-Valgrind's checking machinery can be summarised as follows:
-
-<ul>
-  <li>Each byte in memory has 8 associated V (valid-value) bits,
-      saying whether or not the byte has a defined value, and a single
-      A (valid-address) bit, saying whether or not the program
-      currently has the right to read/write that address.</li><br>
-      <p>
-
-  <li>When memory is read or written, the relevant A bits are
-      consulted.  If they indicate an invalid address, Valgrind emits
-      an Invalid read or Invalid write error.</li><br>
-      <p>
-
-  <li>When memory is read into the CPU's integer registers, the
-      relevant V bits are fetched from memory and stored in the
-      simulated CPU.  They are not consulted.</li><br>
-      <p>
-
-  <li>When an integer register is written out to memory, the V bits
-      for that register are written back to memory too.</li><br>
-      <p>
-
-  <li>When memory is read into the CPU's floating point registers, the
-      relevant V bits are read from memory and they are immediately
-      checked.  If any are invalid, an uninitialised value error is
-      emitted.  This precludes using the floating-point registers to
-      copy possibly-uninitialised memory, but simplifies Valgrind in
-      that it does not have to track the validity status of the
-      floating-point registers.</li><br>
-      <p>
-
-  <li>As a result, when a floating-point register is written to
-      memory, the associated V bits are set to indicate a valid
-      value.</li><br>
-      <p>
-
-  <li>When values in integer CPU registers are used to generate a
-      memory address, or to determine the outcome of a conditional
-      branch, the V bits for those values are checked, and an error
-      emitted if any of them are undefined.</li><br>
-      <p>
-
-  <li>When values in integer CPU registers are used for any other
-      purpose, Valgrind computes the V bits for the result, but does
-      not check them.</li><br>
-      <p>
-
-  <li>Once the V bits for a value in the CPU have been checked, they
-      are then set to indicate validity.  This avoids long chains of
-      errors.</li><br>
-      <p>
-
-  <li>When values are loaded from memory, valgrind checks the A bits
-      for that location and issues an illegal-address warning if
-      needed.  In that case, the V bits loaded are forced to indicate
-      Valid, despite the location being invalid.
-      <p>
-      This apparently strange choice reduces the amount of confusing
-      information presented to the user.  It avoids the
-      unpleasant phenomenon in which memory is read from a place which
-      is both unaddressible and contains invalid values, and, as a
-      result, you get not only an invalid-address (read/write) error,
-      but also a potentially large set of uninitialised-value errors,
-      one for every time the value is used.
-      <p>
-      There is a hazy boundary case to do with multi-byte loads from
-      addresses which are partially valid and partially invalid.  See
-      the description of the flag <code>--partial-loads-ok</code> for details.
-      </li><br>
-</ul>
-
-Valgrind intercepts calls to malloc, calloc, realloc, valloc,
-memalign, free, new and delete.  The behaviour you get is:
-
-<ul>
-
-  <li>malloc/new: the returned memory is marked as addressible but not
-      having valid values.  This means you have to write on it before
-      you can read it.</li><br>
-      <p>
-
-  <li>calloc: returned memory is marked both addressible and valid,
-      since calloc() clears the area to zero.</li><br>
-      <p>
-
-  <li>realloc: if the new size is larger than the old, the new section
-      is addressible but invalid, as with malloc.</li><br>
-      <p>
-
-  <li>If the new size is smaller, the dropped-off section is marked as
-      unaddressible.  You may only pass to realloc a pointer
-      previously issued to you by malloc/calloc/new/realloc.</li><br>
-      <p>
-
-  <li>free/delete: you may only pass to free a pointer previously
-      issued to you by malloc/calloc/new/realloc, or the value
-      NULL. Otherwise, Valgrind complains.  If the pointer is indeed
-      valid, Valgrind marks the entire area it points at as
-      unaddressible, and places the block in the freed-blocks-queue.
-      The aim is to defer as long as possible reallocation of this
-      block.  Until that happens, all attempts to access it will
-      elicit an invalid-address error, as you would hope.</li><br>
-</ul>
-
-
-
-<a name="signals"></a>
-<h3>3.4&nbsp; Signals</h3>
-
-Valgrind provides suitable handling of signals, so, provided you stick
-to POSIX stuff, you should be ok.  Basic sigaction() and sigprocmask()
-are handled.  Signal handlers may return in the normal way or do
-longjmp(); both should work ok.  As specified by POSIX, a signal is
-blocked in its own handler.  Default actions for signals should work
-as before.  Etc, etc.
-
-<p>Under the hood, dealing with signals is a real pain, and Valgrind's
-simulation leaves much to be desired.  If your program does
-way-strange stuff with signals, bad things may happen.  If so, let me
-know.  I don't promise to fix it, but I'd at least like to be aware of
-it.
-
-
-<a name="leaks"></a>
-<h3>3.5&nbsp; Memory leak detection</h3>
-
-Valgrind keeps track of all memory blocks issued in response to calls
-to malloc/calloc/realloc/new.  So when the program exits, it knows
-which blocks are still outstanding -- have not been returned, in other
-words.  Ideally, you want your program to have no blocks still in use
-at exit.  But many programs do.
-
-<p>For each such block, Valgrind scans the entire address space of the
-process, looking for pointers to the block.  One of three situations
-may result:
-
-<ul>
-  <li>A pointer to the start of the block is found.  This usually
-      indicates programming sloppiness; since the block is still
-      pointed at, the programmer could, at least in principle, have free'd
-      it before program exit.</li><br>
-      <p>
-
-  <li>A pointer to the interior of the block is found.  The pointer
-      might originally have pointed to the start and have been moved
-      along, or it might be entirely unrelated.  Valgrind deems such a
-      block as "dubious", that is, possibly leaked,
-      because it's unclear whether or
-      not a pointer to it still exists.</li><br>
-      <p>
-
-  <li>The worst outcome is that no pointer to the block can be found.
-      The block is classified as "leaked", because the
-      programmer could not possibly have free'd it at program exit,
-      since no pointer to it exists.  This might be a symptom of
-      having lost the pointer at some earlier point in the
-      program.</li>
-</ul>
-
-Valgrind reports summaries about leaked and dubious blocks.
-For each such block, it will also tell you where the block was
-allocated.  This should help you figure out why the pointer to it has
-been lost.  In general, you should attempt to ensure your programs do
-not have any leaked or dubious blocks at exit.
-
-<p>The precise area of memory in which Valgrind searches for pointers
-is: all naturally-aligned 4-byte words for which all A bits indicate
-addressibility and all V bits indicate that the stored value is
-actually valid.
-
-<p><hr width="100%">
-
-
-<a name="limits"></a>
-<h2>4&nbsp; Limitations</h2>
-
-The following list of limitations seems depressingly long.  However,
-most programs actually work fine.
-
-<p>Valgrind will run x86-GNU/Linux ELF dynamically linked binaries, on
-a kernel 2.2.X or 2.4.X system, subject to the following constraints:
-
-<ul>
-  <li>No MMX, SSE, SSE2, 3DNow instructions.  If the translator
-      encounters these, Valgrind will simply give up.  It may be
-      possible to add support for them at a later time. Intel added a
-      few instructions such as "cmov" to the integer instruction set
-      on Pentium and later processors, and these are supported.
-      Nevertheless it's safest to think of Valgrind as implementing
-      the 486 instruction set.</li><br>
-      <p>
-
-  <li>Pthreads support is improving, but there are still significant
-      limitations in that department.  See the section above on
-      Pthreads.  Note that your program must be dynamically linked
-      against <code>libpthread.so</code>, so that Valgrind can
-      substitute its own implementation at program startup time.  If
-      you're statically linked against it, things will fail
-      badly.</li><br>
-      <p>
-
-  <li>Valgrind assumes that the floating point registers are not used
-      as intermediaries in memory-to-memory copies, so it immediately
-      checks V bits in floating-point loads/stores.  If you want to
-      write code which copies around possibly-uninitialised values,
-      you must ensure these travel through the integer registers, not
-      the FPU.</li><br>
-      <p>
-
-  <li>If your program does its own memory management, rather than
-      using malloc/new/free/delete, it should still work, but
-      Valgrind's error checking won't be so effective.</li><br>
-      <p>
-
-  <li>Valgrind's signal simulation is not as robust as it could be.
-      Basic POSIX-compliant sigaction and sigprocmask functionality is
-      supplied, but it's conceivable that things could go badly awry
-      if you do weird things with signals.  Workaround: don't.
-      Programs that do non-POSIX signal tricks are in any case
-      inherently unportable, so should be avoided if
-      possible.</li><br>
-      <p>
-
-  <li>Programs which switch stacks are not well handled.  Valgrind
-      does have support for this, but I don't have great faith in it.
-      It's difficult -- there's no cast-iron way to decide whether a
-      large change in %esp is as a result of the program switching
-      stacks, or merely allocating a large object temporarily on the
-      current stack -- yet Valgrind needs to handle the two situations
-      differently.  1 May 02: this probably interacts badly with the
-      new pthread support.  I haven't checked properly.</li><br>
-      <p>
-
-  <li>x86 instructions, and system calls, have been implemented on
-      demand.  So it's possible, although unlikely, that a program
-      will fall over with a message to that effect.  If this happens,
-      please mail me ALL the details printed out, so I can try and
-      implement the missing feature.</li><br>
-      <p>
-
-  <li>x86 floating point works correctly, but floating-point code may
-      run even more slowly than integer code, due to my simplistic
-      approach to FPU emulation.</li><br>
-      <p>
-
-  <li>You can't Valgrind-ize statically linked binaries.  Valgrind
-      relies on the dynamic-link mechanism to gain control at
-      startup.</li><br>
-      <p>
-
-  <li>Memory consumption of your program is majorly increased whilst
-      running under Valgrind.  This is due to the large amount of
-      administrative information maintained behind the scenes.  Another
-      cause is that Valgrind dynamically translates the original
-      executable.  Translated, instrumented code is 14-16 times larger
-      than the original (!) so you can easily end up with 30+ MB of
-      translations when running (eg) a web browser.
-      </li>
-</ul>
-
-Programs which are known not to work are:
-
-<ul>
-  <li>emacs starts up but immediately concludes it is out of memory
-      and aborts.  Emacs has its own memory-management scheme, but I
-      don't understand why this should interact so badly with
-      Valgrind.  Emacs works fine if you build it to use the standard
-      malloc/free routines.</li><br>
-      <p>
-</ul>
-
-Known platform-specific limitations, as of release 1.0.0:
-
-<ul>
-  <li>On Red Hat 7.3, there have been reports of link errors (at
-      program start time) for threaded programs using
-      <code>__pthread_clock_gettime</code> and
-      <code>__pthread_clock_settime</code>.  This appears to be due to
-      <code>/lib/librt-2.2.5.so</code> needing them.  Unfortunately I
-      do not understand enough about this problem to fix it properly,
-      and I can't reproduce it on my test RedHat 7.3 system.  Please
-      mail me if you have more information / understanding.  </li><br>
-      <p>
-  <li>
-      1.0.0 now partially works on Red Hat 7.3.92 ("Limbo"
-      public beta).  However, don't expect a smooth ride.
-      Basically valgrind won't work as-is with any 
-      glibc-2.3 based system.  Limbo is just a little pre glibc-2.3 
-      and it just about works.  Limbo is also gcc-3.1 based and so
-      suffers from the problems in the following point.</li><br>
-      <p>
-  <li>
-      Inlining of string functions with gcc-3.1 or above causes a
-      large number of false reports of uninitialised value uses.  I
-      know what the problem is and roughly how to fix it, but I need
-      to devise a reasonably efficient fix.  Try to reduce the
-      optimisation level, or use <code>-fno-builtin-strlen</code> in
-      the meantime.  Or use an earlier gcc.</li><br>
-      <p>
-</ul>
-
-
-<p><hr width="100%">
-
-
-<a name="howitworks"></a>
-<h2>5&nbsp; How it works -- a rough overview</h2>
-Some gory details, for those with a passion for gory details.  You
-don't need to read this section if all you want to do is use Valgrind.
-
-<a name="startb"></a>
-<h3>5.1&nbsp; Getting started</h3>
-
-Valgrind is compiled into a shared object, valgrind.so.  The shell
-script valgrind sets the LD_PRELOAD environment variable to point to
-valgrind.so.  This causes the .so to be loaded as an extra library to
-any subsequently executed dynamically-linked ELF binary, viz, the
-program you want to debug.
-
-<p>The dynamic linker allows each .so in the process image to have an
-initialisation function which is run before main().  It also allows
-each .so to have a finalisation function run after main() exits.
-
-<p>When valgrind.so's initialisation function is called by the dynamic
-linker, the synthetic CPU starts up.  The real CPU remains locked
-in valgrind.so for the entire rest of the program, but the synthetic
-CPU returns from the initialisation function.  Startup of the program
-now continues as usual -- the dynamic linker calls all the other .so's
-initialisation routines, and eventually runs main().  This all runs on
-the synthetic CPU, not the real one, but the client program cannot
-tell the difference.
-
-<p>Eventually main() exits, so the synthetic CPU calls valgrind.so's
-finalisation function.  Valgrind detects this, and uses it as its cue
-to exit.  It prints summaries of all errors detected, possibly checks
-for memory leaks, and then exits the finalisation routine, but now on
-the real CPU.  The synthetic CPU has now lost control -- permanently
--- so the program exits back to the OS on the real CPU, just as it
-would have done anyway.
-
-<p>On entry, Valgrind switches stacks, so it runs on its own stack.
-On exit, it switches back.  This means that the client program
-continues to run on its own stack, so we can switch back and forth
-between running it on the simulated and real CPUs without difficulty.
-This was an important design decision, because it makes it easy (well,
-significantly less difficult) to debug the synthetic CPU.
-
-
-<a name="engine"></a>
-<h3>5.2&nbsp; The translation/instrumentation engine</h3>
-
-Valgrind does not directly run any of the original program's code.  Only
-instrumented translations are run.  Valgrind maintains a translation
-table, which allows it to find the translation quickly for any branch
-target (code address).  If no translation has yet been made, the
-translator - a just-in-time translator - is summoned.  This makes an
-instrumented translation, which is added to the collection of
-translations.  Subsequent jumps to that address will use this
-translation.
-
-<p>Valgrind no longer directly supports detection of self-modifying
-code.  Such checking is expensive, and in practice (fortunately)
-almost no applications need it.  However, to help people who are
-debugging dynamic code generation systems, there is a Client Request 
-(basically a macro you can put in your program) which directs Valgrind
-to discard translations in a given address range.  So Valgrind can
-still work in this situation provided the client tells it when
-code has become out-of-date and needs to be retranslated.
-
-<p>The JITter translates basic blocks -- blocks of straight-line-code
--- as single entities.  To minimise the considerable difficulties of
-dealing with the x86 instruction set, x86 instructions are first
-translated to a RISC-like intermediate code, similar to sparc code,
-but with an infinite number of virtual integer registers.  Initially
-each insn is translated separately, and there is no attempt at
-instrumentation.
-
-<p>The intermediate code is improved, mostly so as to try and cache
-the simulated machine's registers in the real machine's registers over
-several simulated instructions.  This is often very effective.  Also,
-we try to remove redundant updates of the simulated machine's
-condition-code register.
-
-<p>The intermediate code is then instrumented, giving more
-intermediate code.  There are a few extra intermediate-code operations
-to support instrumentation; it is all refreshingly simple.  After
-instrumentation there is a cleanup pass to remove redundant value
-checks.
-
-<p>This gives instrumented intermediate code which mentions arbitrary
-numbers of virtual registers.  A linear-scan register allocator is
-used to assign real registers and possibly generate spill code.  All
-of this is still phrased in terms of the intermediate code.  This
-machinery is inspired by the work of Reuben Thomas (MITE).
-
-<p>Then, and only then, is the final x86 code emitted.  The
-intermediate code is carefully designed so that x86 code can be
-generated from it without need for spare registers or other
-inconveniences.
-
-<p>The translations are managed using a traditional LRU-based caching
-scheme.  The translation cache has a default size of about 14MB.
-
-<a name="track"></a>
-
-<h3>5.3&nbsp; Tracking the status of memory</h3> Each byte in the
-process' address space has nine bits associated with it: one A bit and
-eight V bits.  The A and V bits for each byte are stored using a
-sparse array, which flexibly and efficiently covers arbitrary parts of
-the 32-bit address space without imposing significant space or
-performance overheads for the parts of the address space never
-visited.  The scheme used, and speedup hacks, are described in detail
-at the top of the source file vg_memory.c, so you should read that for
-the gory details.
-
-<a name="sys_calls"></a>
-
-<h3>5.4&nbsp; System calls</h3>
-All system calls are intercepted.  The memory status map is consulted
-before and updated after each call.  It's all rather tiresome.  See
-vg_syscall_mem.c for details.
-
-<a name="sys_signals"></a>
-
-<h3>5.5&nbsp; Signals</h3>
-All system calls to sigaction() and sigprocmask() are intercepted.  If
-the client program is trying to set a signal handler, Valgrind makes a
-note of the handler address and which signal it is for.  Valgrind then
-arranges for the same signal to be delivered to its own handler.
-
-<p>When such a signal arrives, Valgrind's own handler catches it, and
-notes the fact.  At a convenient safe point in execution, Valgrind
-builds a signal delivery frame on the client's stack and runs its
-handler.  If the handler longjmp()s, there is nothing more to be said.
-If the handler returns, Valgrind notices this, zaps the delivery
-frame, and carries on where it left off before delivering the signal.
-
-<p>The purpose of this nonsense is that setting signal handlers
-essentially amounts to giving callback addresses to the Linux kernel.
-We can't allow this to happen, because if it did, signal handlers
-would run on the real CPU, not the simulated one.  This means the
-checking machinery would not operate during the handler run, and,
-worse, memory permissions maps would not be updated, which could cause
-spurious error reports once the handler had returned.
-
-<p>An even worse thing would happen if the signal handler longjmp'd
-rather than returned: Valgrind would completely lose control of the
-client program.
-
-<p>Upshot: we can't allow the client to install signal handlers
-directly.  Instead, Valgrind must catch, on behalf of the client, any
-signal the client asks to catch, and must deliver it to the client on
-the simulated CPU, not the real one.  This involves considerable
-gruesome fakery; see vg_signals.c for details.
-<p>
-
-<hr width="100%">
-
-<a name="example"></a>
-<h2>6&nbsp; Example</h2>
-This is the log for a run of a small program. The program is in fact
-correct, and the reported error is the result of a potentially serious
-code generation bug in GNU g++ (snapshot 20010527).
-<pre>
-sewardj@phoenix:~/newmat10$
-~/Valgrind-6/valgrind -v ./bogon 
-==25832== Valgrind 0.10, a memory error detector for x86 RedHat 7.1.
-==25832== Copyright (C) 2000-2001, and GNU GPL'd, by Julian Seward.
-==25832== Startup, with flags:
-==25832== --suppressions=/home/sewardj/Valgrind/redhat71.supp
-==25832== reading syms from /lib/ld-linux.so.2
-==25832== reading syms from /lib/libc.so.6
-==25832== reading syms from /mnt/pima/jrs/Inst/lib/libgcc_s.so.0
-==25832== reading syms from /lib/libm.so.6
-==25832== reading syms from /mnt/pima/jrs/Inst/lib/libstdc++.so.3
-==25832== reading syms from /home/sewardj/Valgrind/valgrind.so
-==25832== reading syms from /proc/self/exe
-==25832== loaded 5950 symbols, 142333 line number locations
-==25832== 
-==25832== Invalid read of size 4
-==25832==    at 0x8048724: _ZN10BandMatrix6ReSizeEiii (bogon.cpp:45)
-==25832==    by 0x80487AF: main (bogon.cpp:66)
-==25832==    by 0x40371E5E: __libc_start_main (libc-start.c:129)
-==25832==    by 0x80485D1: (within /home/sewardj/newmat10/bogon)
-==25832==    Address 0xBFFFF74C is not stack'd, malloc'd or free'd
-==25832==
-==25832== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
-==25832== malloc/free: in use at exit: 0 bytes in 0 blocks.
-==25832== malloc/free: 0 allocs, 0 frees, 0 bytes allocated.
-==25832== For a detailed leak analysis, rerun with: --leak-check=yes
-==25832==
-==25832== exiting, did 1881 basic blocks, 0 misses.
-==25832== 223 translations, 3626 bytes in, 56801 bytes out.
-</pre>
-<p>The GCC folks fixed this about a week before gcc-3.0 shipped.
-<hr width="100%">
-<p>
-
-
-
-<a name="cache"></a>
-<h2>7&nbsp; Cache profiling</h2>
-As well as memory debugging, Valgrind also allows you to do cache simulations
-and annotate your source line-by-line with the number of cache misses.  In
-particular, it records:
-<ul>
-  <li>L1 instruction cache reads and misses;
-  <li>L1 data cache reads and read misses, writes and write misses;
-  <li>L2 unified cache reads and read misses, writes and write misses.
-</ul>
-On a modern x86 machine, an L1 miss will typically cost around 10 cycles,
-and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be
-very useful for improving the performance of your program.<p>
-
-Also, since one instruction cache read is performed per instruction executed,
-you can find out how many instructions are executed per line, which can be
-useful for traditional profiling and test coverage.<p>
-
-Any feedback, bug-fixes, suggestions, etc, welcome.
-
-
-<h3>7.1&nbsp; Overview</h3>
-First off, as for normal Valgrind use, you probably want to turn on debugging
-info (the <code>-g</code> flag).  But by contrast with normal Valgrind use, you
-probably <b>do</b> want to turn optimisation on, since you should profile your
-program as it will be normally run.
-
-The two steps are:
-<ol>
-  <li>Run your program with <code>cachegrind</code> in front of the
-      normal command line invocation.  When the program finishes,
-      Valgrind will print summary cache statistics. It also collects
-      line-by-line information in a file <code>cachegrind.out</code>.
-      <p>
-      This step should be done every time you want to collect
-      information about a new program, a changed program, or about the
-      same program with different input.
-  </li>
-  <p>
-  <li>Generate a function-by-function summary, and possibly annotate
-      source files with 'vg_annotate'. Source files to annotate can be
-      specified manually on the command line, or
-      "interesting" source files can be annotated automatically with
-      the <code>--auto=yes</code> option.  You can annotate C/C++
-      files or assembly language files equally easily.
-      <p>
-      This step can be performed as many times as you like for each
-      Step 1.  You may want to do multiple annotations showing
-      different information each time.<p>
-  </li>
-</ol>
-
-The steps are described in detail in the following sections.<p>
-
-
-<h3>7.2&nbsp; Cache simulation specifics</h3>
-
-Cachegrind uses a simulation for a machine with a split L1 cache and a unified
-L2 cache.  This configuration is used for all (modern) x86-based machines we
-are aware of.  Old Cyrix CPUs had a unified I and D L1 cache, but they are
-ancient history now.<p>
-
-The more specific characteristics of the simulation are as follows.
-
-<ul>
-  <li>Write-allocate: when a write miss occurs, the block written to
-      is brought into the D1 cache.  Most modern caches have this
-      property.</li><p>
-
-  <li>Bit-selection hash function: the line(s) in the cache to which a
-      memory block maps is chosen by the middle bits M--(M+N-1) of the
-      byte address, where:
-      <ul>
-        <li>&nbsp;line size = 2^M bytes&nbsp;</li>
-        <li>(cache size / line size) = 2^N bytes</li>
-      </ul> </li><p>
-
-  <li>Inclusive L2 cache: the L2 cache replicates all the entries of
-      the L1 cache.  This is standard on Pentium chips, but AMD
-      Athlons use an exclusive L2 cache that only holds blocks evicted
-      from L1.  Ditto AMD Durons and most modern VIAs.</li><p>
-</ul>
-
-The cache configuration simulated (cache size, associativity and line size) is
-determined automagically using the CPUID instruction.  If you have an old
-machine that (a) doesn't support the CPUID instruction, or (b) supports it in
-an early incarnation that doesn't give any cache information, then Cachegrind
-will fall back to using a default configuration (that of a model 3/4 Athlon).
-Cachegrind will tell you if this happens.  You can manually specify one, two or
-all three levels (I1/D1/L2) of the cache from the command line using the
-<code>--I1</code>, <code>--D1</code> and <code>--L2</code> options.<p>
-
-Other noteworthy behaviour:
-
-<ul>
-  <li>References that straddle two cache lines are treated as follows:
-  <ul>
-    <li>If both blocks hit --&gt; counted as one hit</li>
-    <li>If one block hits, the other misses --&gt; counted as one miss</li>
-    <li>If both blocks miss --&gt; counted as one miss (not two)</li>
-  </ul><p></li>
-
-  <li>Instructions that modify a memory location (eg. <code>inc</code> and
-      <code>dec</code>) are counted as doing just a read, ie. a single data
-      reference.  This may seem strange, but since the write can never cause a
-      miss (the read guarantees the block is in the cache) it's not very
-      interesting.<p>
-
-      Thus it measures not the number of times the data cache is accessed, but
-      the number of times a data cache miss could occur.<p>
-      </li>
-</ul>
-
-If you are interested in simulating a cache with different properties, it is
-not particularly hard to write your own cache simulator, or to modify the
-existing ones in <code>vg_cachesim_I1.c</code>, <code>vg_cachesim_D1.c</code>,
-<code>vg_cachesim_L2.c</code> and <code>vg_cachesim_gen.c</code>.  We'd be
-interested to hear from anyone who does.
-
-<a name="profile"></a>
-<h3>7.3&nbsp; Profiling programs</h3>
-
-Cache profiling is enabled by using the <code>--cachesim=yes</code>
-option to the <code>valgrind</code> shell script.  Alternatively, it
-is probably more convenient to use the <code>cachegrind</code> script.
-Either way automatically turns off Valgrind's memory checking functions,
-since the cache simulation is slow enough already, and you probably
-don't want to do both at once.
-<p>
-To gather cache profiling information about the program <code>ls
--l</code>, type:
-
-<blockquote><code>cachegrind ls -l</code></blockquote>
-
-The program will execute (slowly).  Upon completion, summary statistics
-that look like this will be printed:
-
-<pre>
-==31751== I   refs:      27,742,716
-==31751== I1  misses:           276
-==31751== L2  misses:           275
-==31751== I1  miss rate:        0.0%
-==31751== L2i miss rate:        0.0%
-==31751== 
-==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
-==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
-==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
-==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
-==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
-==31751== 
-==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
-==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
-</pre>
-
-Cache accesses for instruction fetches are summarised first, giving the
-number of fetches made (this is the number of instructions executed, which
-can be useful to know in its own right), the number of I1 misses, and the
-number of L2 instruction (<code>L2i</code>) misses.<p>
-
-Cache accesses for data follow. The information is similar to that of the
-instruction fetches, except that the values are also shown split between reads
-and writes (note each row's <code>rd</code> and <code>wr</code> values add up
-to the row's total).<p>
-
-Combined instruction and data figures for the L2 cache follow that.<p>
-
-
-<h3>7.4&nbsp; Output file</h3>
-
-As well as printing summary information, Cachegrind also writes
-line-by-line cache profiling information to a file named
-<code>cachegrind.out</code>.  This file is human-readable, but is best
-interpreted by the accompanying program <code>vg_annotate</code>,
-described in the next section.
-<p>
-Things to note about the <code>cachegrind.out</code> file:
-<ul>
-  <li>It is written every time <code>valgrind --cachesim=yes</code> or
-      <code>cachegrind</code> is run, and will overwrite any existing
-      <code>cachegrind.out</code> in the current directory.</li>
-  <p>
-  <li>It can be huge: <code>ls -l</code> generates a file of about
-      350KB.  Browsing a few files and web pages with a Konqueror
-      built with full debugging information generates a file
-      of around 15 MB.</li>
-</ul>
-
-<a name="profileflags"></a>
-<h3>7.5&nbsp; Cachegrind options</h3>
-Cachegrind accepts all the options that Valgrind does, although some of them
-(ones related to memory checking) don't do anything when cache profiling.<p>
-
-The interesting cache-simulation specific options are:
-
-<ul>
-  <li><code>--I1=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><br>
-      <code>--D1=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><br> 
-      <code>--L2=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><p> 
-      [default: uses CPUID for automagic cache configuration]<p>
-
-      Manually specifies the I1/D1/L2 cache configuration, where
-      <code>size</code> and <code>line_size</code> are measured in bytes.  The
-      three items must be comma-separated, but with no spaces, eg:
-
-      <blockquote><code>cachegrind --I1=65536,2,64</code></blockquote>
-
-      You can specify one, two or three of the I1/D1/L2 caches.  Any level not
-      manually specified will be simulated using the configuration found in the
-      normal way (via the CPUID instruction, or failing that, via defaults).
-</ul>
-
-  
-<a name="annotate"></a>
-<h3>7.6&nbsp; Annotating C/C++ programs</h3>
-
-Before using <code>vg_annotate</code>, it is worth widening your
-window to be at least 120-characters wide if possible, as the output
-lines can be quite long.
-<p>
-To get a function-by-function summary, run <code>vg_annotate</code> in
-a directory containing a <code>cachegrind.out</code> file.  The output
-looks like this:
-
-<pre>
---------------------------------------------------------------------------------
-I1 cache:              65536 B, 64 B, 2-way associative
-D1 cache:              65536 B, 64 B, 2-way associative
-L2 cache:              262144 B, 64 B, 8-way associative
-Command:               concord vg_to_ucode.c
-Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Threshold:             99%
-Chosen for annotation:
-Auto-annotation:       on
-
---------------------------------------------------------------------------------
-Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
---------------------------------------------------------------------------------
-27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
-
---------------------------------------------------------------------------------
-Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
---------------------------------------------------------------------------------
-8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
-5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
-2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
-2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
-2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
-1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
-  897,991   51   51   897,831    95    30        62      1      1  ???:???
-  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
-  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
-  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
-  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
-  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
-  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
-  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
-  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
-  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
-   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
-   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
-</pre>
-
-First up is a summary of the annotation options:
-                    
-<ul>
-  <li>I1 cache, D1 cache, L2 cache: cache configuration.  So you know the
-      configuration with which these results were obtained.</li><p>
-
-  <li>Command: the command line invocation of the program under
-      examination.</li><p>
-
-  <li>Events recorded: event abbreviations are:<p>
-  <ul>
-    <li><code>Ir  </code>:  I cache reads (ie. instructions executed)</li>
-    <li><code>I1mr</code>: I1 cache read misses</li>
-    <li><code>I2mr</code>: L2 cache instruction read misses</li>
-    <li><code>Dr  </code>:  D cache reads (ie. memory reads)</li>
-    <li><code>D1mr</code>: D1 cache read misses</li>
-    <li><code>D2mr</code>: L2 cache data read misses</li>
-    <li><code>Dw  </code>:  D cache writes (ie. memory writes)</li>
-    <li><code>D1mw</code>: D1 cache write misses</li>
-    <li><code>D2mw</code>: L2 cache data write misses</li>
-  </ul><p>
-      Note that D1 total misses is given by <code>D1mr</code> +
-      <code>D1mw</code>, and that L2 total misses is given by
-      <code>I2mr</code> + <code>D2mr</code> + <code>D2mw</code>.</li><p>
-
-  <li>Events shown: the events shown (a subset of events gathered).  This can
-      be adjusted with the <code>--show</code> option.</li><p>
-
-  <li>Event sort order: the sort order in which functions are shown.  For
-      example, in this case the functions are sorted from highest
-      <code>Ir</code> counts to lowest.  If two functions have identical
-      <code>Ir</code> counts, they will then be sorted by <code>I1mr</code>
-      counts, and so on.  This order can be adjusted with the
-      <code>--sort</code> option.<p>
-
-      Note that this dictates the order the functions appear.  It is <b>not</b>
-      the order in which the columns appear;  that is dictated by the "events
-      shown" line (and can be changed with the <code>--show</code> option).
-      </li><p>
-
-  <li>Threshold: <code>vg_annotate</code> by default omits functions
-      that cause very low numbers of misses to avoid drowning you in
-      information.  In this case, vg_annotate shows summaries of the
-      functions that account for 99% of the <code>Ir</code> counts;
-      <code>Ir</code> is chosen as the threshold event since it is the
-      primary sort event.  The threshold can be adjusted with the
-      <code>--threshold</code> option.</li><p>
-
-  <li>Chosen for annotation: names of files specified manually for annotation; 
-      in this case none.</li><p>
-
-  <li>Auto-annotation: whether auto-annotation was requested via the 
-      <code>--auto=yes</code> option. In this case no.</li><p>
-</ul>
-
-Then follow summary statistics for the whole program. These are similar
-to the summary provided when running <code>cachegrind</code>.<p>
-  
-Then follow function-by-function statistics. Each function is
-identified by a <code>file_name:function_name</code> pair. If a column
-contains only a dot it means the function never performs
-that event (eg. the third row shows that <code>strcmp()</code>
-contains no instructions that write to memory). The name
-<code>???</code> is used if the file name and/or function name
-could not be determined from debugging information. If most of the
-entries have the form <code>???:???</code> the program probably wasn't
-compiled with <code>-g</code>.  If any code was invalidated (either due to
-self-modifying code or unloading of shared objects) its counts are aggregated
-into a single cost centre written as <code>(discarded):(discarded)</code>.<p>
-
-It is worth noting that functions will come from three types of source files:
-<ol>
-  <li> From the profiled program (<code>concord.c</code> in this example).</li>
-  <li>From libraries (eg. <code>getc.c</code>)</li>
-  <li>From Valgrind's implementation of some libc functions (eg.
-      <code>vg_clientmalloc.c:malloc</code>).  These are recognisable because
-      the filename begins with <code>vg_</code>, and is probably one of
-      <code>vg_main.c</code>, <code>vg_clientmalloc.c</code> or
-      <code>vg_mylibc.c</code>.
-  </li>
-</ol>
-
-There are two ways to annotate source files -- by choosing them
-manually, or with the <code>--auto=yes</code> option. To do it
-manually, just specify the filenames as arguments to
-<code>vg_annotate</code>. For example, the output from running
-<code>vg_annotate concord.c</code> for our example produces the same
-output as above followed by an annotated version of
-<code>concord.c</code>, a section of which looks like:
-
-<pre>
---------------------------------------------------------------------------------
--- User-annotated source: concord.c
---------------------------------------------------------------------------------
-Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
-
-[snip]
-
-        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
-        3    1    1       .     .     .       1      0      0  {
-        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
-        .    .    .       .     .     .       .      .      .      Word_Info *data;
-        1    0    0       .     .     .       1      1      1      int line = 1, i;
-        .    .    .       .     .     .       .      .      .
-        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
-        .    .    .       .     .     .       .      .      .
-    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
-    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
-        .    .    .       .     .     .       .      .      .
-        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
-        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
-        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
-        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
-        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
-        .    .    .       .     .     .       .      .      .      }
-        .    .    .       .     .     .       .      .      .
-  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
-  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->word, data->line, table);
-        .    .    .       .     .     .       .      .      .
-        4    0    0       1     0     0       2      0      0      free(data);
-        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
-        3    0    0       2     0     0       .      .      .  }
-</pre>
-
-(Although column widths are automatically minimised, a wide terminal is clearly
-useful.)<p>
-  
-Each source file is clearly marked (<code>User-annotated source</code>) as
-having been chosen manually for annotation.  If the file was found in one of
-the directories specified with the <code>-I</code>/<code>--include</code>
-option, the directory and file are both given.<p>
-
-Each line is annotated with its event counts.  Events not applicable for a line
-are represented by a `.';  this is useful for distinguishing between an event
-which cannot happen, and one which can but did not.<p> 
-
-Sometimes only a small section of a source file is executed.  To minimise
-uninteresting output, Valgrind only shows annotated lines and lines within a
-small distance of annotated lines.  Gaps are marked with the line numbers so
-you know which part of a file the shown code comes from, eg:
-
-<pre>
-(figures and code for line 704)
--- line 704 ----------------------------------------
--- line 878 ----------------------------------------
-(figures and code for line 878)
-</pre>
-
-The amount of context to show around annotated lines is controlled by the
-<code>--context</code> option.<p>
-
-To get automatic annotation, run <code>vg_annotate --auto=yes</code>.
-vg_annotate will automatically annotate every source file it can find that is
-mentioned in the function-by-function summary.  Therefore, the files chosen for
-auto-annotation  are affected by the <code>--sort</code> and
-<code>--threshold</code> options.  Each source file is clearly marked
-(<code>Auto-annotated source</code>) as being chosen automatically.  Any files
-that could not be found are mentioned at the end of the output, eg:    
-
-<pre>
---------------------------------------------------------------------------------
-The following files chosen for auto-annotation could not be found:
---------------------------------------------------------------------------------
-  getc.c
-  ctype.c
-  ../sysdeps/generic/lockfile.c
-</pre>
-
-This is quite common for library files, since libraries are usually compiled
-with debugging information, but the source files are often not present on a
-system.  If a file is chosen for annotation <b>both</b> manually and
-automatically, it is marked as <code>User-annotated source</code>.
-
-Use the <code>-I/--include</code> option to tell Valgrind where to look for
-source files if the filenames found from the debugging information aren't
-specific enough.
-
-Beware that vg_annotate can take some time to digest large
-<code>cachegrind.out</code> files, eg. 30 seconds or more.  Also beware that
-auto-annotation can produce a lot of output if your program is large!
-
-
-<h3>7.7&nbsp; Annotating assembler programs</h3>
-
-Valgrind can annotate assembler programs too, or annotate the
-assembler generated for your C program.  Sometimes this is useful for
-understanding what is really happening when an interesting line of C
-code is translated into multiple instructions.<p>
-
-To do this, you just need to assemble your <code>.s</code> files with
-assembler-level debug information.  gcc doesn't do this, but you can
-use the GNU assembler with the <code>--gstabs</code> option to
-generate object files with this information, eg:
-
-<blockquote><code>as --gstabs foo.s</code></blockquote>
-
-You can then profile and annotate source files in the same way as for C/C++
-programs.
-
-
-<h3>7.8&nbsp; <code>vg_annotate</code> options</h3>
-<ul>
-  <li><code>-h, --help</code></li><p>
-  <li><code>-v, --version</code><p>
-
-      Help and version, as usual.</li>
-
-  <li><code>--sort=A,B,C</code> [default: order in 
-      <code>cachegrind.out</code>]<p>
-      Specifies the events upon which the sorting of the function-by-function
-      entries will be based.  Useful if you want to concentrate on eg. I cache
-      misses (<code>--sort=I1mr,I2mr</code>), or D cache misses
-      (<code>--sort=D1mr,D2mr</code>), or L2 misses
-      (<code>--sort=D2mr,I2mr</code>).</li><p>
-
-  <li><code>--show=A,B,C</code> [default: all, using order in
-      <code>cachegrind.out</code>]<p>
-      Specifies which events to show (and the column order). Default is to use
-      all present in the <code>cachegrind.out</code> file (and use the order in
-      the file).</li><p>
-
-  <li><code>--threshold=X</code> [default: 99%] <p>
-      Sets the threshold for the function-by-function summary.  Functions are
-      shown that account for more than X% of the primary sort event.  If
-      auto-annotating, also affects which files are annotated.
-      
-      Note: thresholds can be set for more than one of the events by appending
-      any events for the <code>--sort</code> option with a colon and a number
-      (no spaces, though).  E.g. if you want to see the functions that cover
-      99% of L2 read misses and 99% of L2 write misses, use this option:
-      
-      <blockquote><code>--sort=D2mr:99,D2mw:99</code></blockquote>
-      </li><p>
-
-  <li><code>--auto=no</code> [default]<br>
-      <code>--auto=yes</code> <p>
-      When enabled, automatically annotates every file that is mentioned in the
-      function-by-function summary that can be found.  Also gives a list of
-      those that couldn't be found.
-
-  <li><code>--context=N</code> [default: 8]<p>
-      Print N lines of context before and after each annotated line.  Avoids
-      printing large sections of source files that were not executed.  Use a 
-      large number (eg. 10,000) to show all source lines.
-      </li><p>
-
-  <li><code>-I=&lt;dir&gt;, --include=&lt;dir&gt;</code> 
-      [default: empty string]<p>
-      Adds a directory to the list in which to search for files.  Multiple
-      -I/--include options can be given to add multiple directories.
-</ul>
-  
-
-<h3>7.9&nbsp; Warnings</h3>
-There are a couple of situations in which vg_annotate issues warnings.
-
-<ul>
-  <li>If a source file is more recent than the <code>cachegrind.out</code>
-      file.  This is because the information in <code>cachegrind.out</code> is
-      only recorded with line numbers, so if the line numbers change at all in
-      the source (eg. lines added, deleted, swapped), any annotations will be 
-      incorrect.<p>
-
-  <li>If information is recorded about line numbers past the end of a file.
-      This can be caused by the above problem, ie. shortening the source file
-      while using an old <code>cachegrind.out</code> file.  If this happens,
-      the figures for the bogus lines are printed anyway (clearly marked as
-      bogus) in case they are important.</li><p>
-</ul>
-
-
-<h3>7.10&nbsp; Things to watch out for</h3>
-Some odd things that can occur during annotation:
-
-<ul>
-  <li>If annotating at the assembler level, you might see something like this:
-
-      <pre>
-      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
-      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
-      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
-      .    .    .  .    .    .  .    .    .          .align 4,0x90
-      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
-      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
-      </pre>
-
-      How can the third instruction be executed twice when the others are
-      executed only once?  As it turns out, it isn't.  Here's a dump of the
-      executable, using <code>objdump -d</code>:
-
-      <pre>
-      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
-      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
-      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
-      8048f32:       89 f6                   mov    %esi,%esi
-      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
-      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
-      </pre>
-
-      Notice the extra <code>mov %esi,%esi</code> instruction.  Where did this
-      come from?  The GNU assembler inserted it to serve as the two bytes of
-      padding needed to align the <code>movl $.LnrB,%eax</code> instruction on
-      a four-byte boundary, but pretended it didn't exist when adding debug
-      information.  Thus when Valgrind reads the debug info it thinks that the
-      <code>movl $0x1,0xffffffec(%ebp)</code> instruction covers the address
-      range 0x8048f2b--0x8048f33 by itself, and attributes the counts for the
-      <code>mov %esi,%esi</code> to it.<p>
-  </li>
-
-  <li>Inlined functions can cause strange results in the function-by-function
-      summary.  If a function <code>inline_me()</code> is defined in
-      <code>foo.h</code> and inlined in the functions <code>f1()</code>,
-      <code>f2()</code> and <code>f3()</code> in <code>bar.c</code>, there will
-      not be a <code>foo.h:inline_me()</code> function entry.  Instead, there
-      will be separate function entries for each inlining site, ie.
-      <code>foo.h:f1()</code>, <code>foo.h:f2()</code> and
-      <code>foo.h:f3()</code>.  To find the total counts for
-      <code>foo.h:inline_me()</code>, add up the counts from each entry.<p>
-
-      The reason for this is that although the debug info output by gcc
-      indicates the switch from <code>bar.c</code> to <code>foo.h</code>, it
-      doesn't indicate the name of the function in <code>foo.h</code>, so
-      Valgrind keeps using the old one.<p>
-
-  <li>Sometimes, the same filename might be represented with a relative name
-      and with an absolute name in different parts of the debug info, eg:
-      <code>/home/user/proj/proj.h</code> and <code>../proj.h</code>.  In this
-      case, if you use auto-annotation, the file will be annotated twice with
-      the counts split between the two.<p>
-  </li>
-
-  <li>Files with more than 65,535 lines cause difficulties for the stabs debug
-      info reader.  This is because the line number in the <code>struct
-      nlist</code> defined in <code>a.out.h</code> under Linux is only a 16-bit
-      value.  Valgrind can handle some files with more than 65,535 lines
-      correctly by making some guesses to identify line number overflows.  But
-      some cases are beyond it, in which case you'll get a warning message
-      explaining that annotations for the file might be incorrect.<p>
-  </li>
-
-  <li>If you compile some files with <code>-g</code> and some without, some
-      events that take place in a file without debug info could be attributed
-      to the last line of a file with debug info (whichever one gets placed
-      before the non-debug-info file in the executable).<p>
-  </li>
-</ul>
-
-This list looks long, but these cases should be fairly rare.<p>
-
-Note: stabs is not an easy format to read.  If you come across bizarre
-annotations that look like they might be caused by a bug in the stabs reader,
-please let us know.<p>
-
-
-<h3>7.11&nbsp; Accuracy</h3>
-Valgrind's cache profiling has a number of shortcomings:
-
-<ul>
-  <li>It doesn't account for kernel activity -- the effect of system calls on
-      the cache contents is ignored.</li><p>
-
-  <li>It doesn't account for other process activity (although this is probably
-      desirable when considering a single program).</li><p>
-
-  <li>It doesn't account for virtual-to-physical address mappings;  hence the
-      entire simulation is not a true representation of what's happening in the
-      cache.</li><p>
-
-  <li>It doesn't account for cache misses not visible at the instruction level,
-      eg. those arising from TLB misses, or speculative execution.</li><p>
-
-  <li>Valgrind's custom <code>malloc()</code> will allocate memory in different
-      ways to the standard <code>malloc()</code>, which could warp the results.
-      </li><p>
-
-  <li>Valgrind's custom threads implementation will schedule threads
-      differently to the standard one.  This too could warp the results for
-      threaded programs.
-      </li><p>
-
-  <li>The instructions <code>bts</code>, <code>btr</code> and <code>btc</code>
-      will incorrectly be counted as doing a data read if both the arguments
-      are registers, eg:
-
-      <blockquote><code>btsl %eax, %edx</code></blockquote>
-
-      This should only happen rarely.
-</ul>
-
-Another thing worth noting is that results are very sensitive.  Changing the
-size of the <code>valgrind.so</code> file, the size of the program being
-profiled, or even the length of its name can perturb the results.  Variations
-will be small, but don't expect perfectly repeatable results if your program
-changes at all.<p>
-
-While these factors mean you shouldn't trust the results to be super-accurate,
-hopefully they should be close enough to be useful.<p>
-
-
-<h3>7.12&nbsp; Todo</h3>
-<ul>
-  <li>Program start-up/shut-down calls a lot of functions that aren't
-      interesting and just complicate the output.  Would be nice to exclude
-      these somehow.</li>
-  <p>
-</ul> 
-<hr width="100%">
-</body>
-</html>
-
diff --git a/coregrind/docs/nav.html b/coregrind/docs/nav.html
deleted file mode 100644
index ad920ad443..0000000000
--- a/coregrind/docs/nav.html
+++ /dev/null
@@ -1,72 +0,0 @@
-<html>
-  <head>
-    <title>Valgrind</title>
-    <base target="main">
-    <style type="text/css">
-
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-  </head>
-
-  <body>
-    <br>
-    <a href="manual.html#contents"><b>Contents of this manual</b></a><br>
-    <a href="manual.html#intro">1 Introduction</a><br>
-    <a href="manual.html#whatfor">1.1 What Valgrind is for</a><br>
-    <a href="manual.html#whatdoes">1.2 What it does with
-       your program</a>
-    <p>
-    <a href="manual.html#howtouse">2 <b>How to use it, and how to
-       make sense of the results</b></a><br>
-    <a href="manual.html#starta">2.1 Getting started</a><br>
-    <a href="manual.html#comment">2.2 The commentary</a><br>
-    <a href="manual.html#report">2.3 Reporting of errors</a><br>
-    <a href="manual.html#suppress">2.4 Suppressing errors</a><br>
-    <a href="manual.html#flags">2.5 Command-line flags</a><br>
-    <a href="manual.html#errormsgs">2.6 Explanation of error messages</a><br>
-    <a href="manual.html#suppfiles">2.7 Writing suppressions files</a><br>
-    <a href="manual.html#clientreq">2.8 The Client Request mechanism</a><br>
-    <a href="manual.html#pthreads">2.9 Support for POSIX pthreads</a><br>
-    <a href="manual.html#install">2.10 Building and installing</a><br>
-    <a href="manual.html#problems">2.11 If you have problems</a>
-    <p>
-    <a href="manual.html#machine">3 <b>Details of the checking machinery</b></a><br>
-    <a href="manual.html#vvalue">3.1 Valid-value (V) bits</a><br>
-    <a href="manual.html#vaddress">3.2 Valid-address (A) bits</a><br>
-    <a href="manual.html#together">3.3 Putting it all together</a><br>
-    <a href="manual.html#signals">3.4 Signals</a><br>
-    <a href="manual.html#leaks">3.5 Memory leak detection</a>
-    <p>
-    <a href="manual.html#limits">4 <b>Limitations</b></a><br>
-    <p>
-    <a href="manual.html#howitworks">5 <b>How it works -- a rough overview</b></a><br>
-    <a href="manual.html#startb">5.1 Getting started</a><br>
-    <a href="manual.html#engine">5.2 The translation/instrumentation engine</a><br>
-    <a href="manual.html#track">5.3 Tracking the status of memory</a><br>
-    <a href="manual.html#sys_calls">5.4 System calls</a><br>
-    <a href="manual.html#sys_signals">5.5 Signals</a>
-    <p>
-    <a href="manual.html#example">6 <b>An example</b></a><br>
-    <p>
-    <a href="manual.html#cache">7 <b>Cache profiling</b></a><br>
-    <p>
-    <a href="techdocs.html">8 <b>The design and implementation of Valgrind</b></a><br>
-
-</body>
-</html>
diff --git a/coregrind/docs/techdocs.html b/coregrind/docs/techdocs.html
deleted file mode 100644
index 2e1cc8b7e9..0000000000
--- a/coregrind/docs/techdocs.html
+++ /dev/null
@@ -1,2524 +0,0 @@
-<html>
-  <head>
-    <style type="text/css">
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-    <title>The design and implementation of Valgrind</title>
-  </head>
-
-<body bgcolor="#ffffff">
-
-<a name="title">&nbsp;</a>
-<h1 align=center>The design and implementation of Valgrind</h1>
-
-<center>
-Detailed technical notes for hackers, maintainers and the
-overly-curious<br>
-These notes pertain to snapshot 20020306<br>
-<p>
-<a href="mailto:jseward@acm.org">jseward@acm.org</a><br>
-<a href="http://developer.kde.org/~sewardj">http://developer.kde.org/~sewardj</a><br>
-Copyright &copy; 2000-2002 Julian Seward
-<p>
-Valgrind is licensed under the GNU General Public License, 
-version 2<br>
-An open-source tool for finding memory-management problems in
-x86 GNU/Linux executables.
-</center>
-
-<p>
-
-
-
-
-<hr width="100%">
-
-<h2>Introduction</h2>
-
-This document contains a detailed, highly-technical description of the
-internals of Valgrind.  This is not the user manual; if you are an
-end-user of Valgrind, you do not want to read this.  Conversely, if
-you really are a hacker-type and want to know how it works, I assume
-that you have read the user manual thoroughly.
-<p>
-You may need to read this document several times, and carefully.  Some
-important things, I only say once.
-
-
-<h3>History</h3>
-
-Valgrind came into public view in late Feb 2002.  However, it has been
-under contemplation for a very long time, perhaps seriously for about
-five years.  Somewhat over two years ago, I started working on the x86
-code generator for the Glasgow Haskell Compiler
-(http://www.haskell.org/ghc), gaining familiarity with x86 internals
-on the way.  I then did Cacheprof (http://www.cacheprof.org), gaining
-further x86 experience.  Some time around Feb 2000 I started
-experimenting with a user-space x86 interpreter for x86-Linux.  This
-worked, but it was clear that a JIT-based scheme would be necessary to
-give reasonable performance for Valgrind.  Design work for the JITter
-started in earnest in Oct 2000, and by early 2001 I had an x86-to-x86
-dynamic translator which could run quite large programs.  This
-translator was in a sense pointless, since it did not do any
-instrumentation or checking.
-
-<p>
-Most of the rest of 2001 was taken up designing and implementing the
-instrumentation scheme.  The main difficulty, which consumed a lot
-of effort, was to design a scheme which did not generate large numbers
-of false uninitialised-value warnings.  By late 2001 a satisfactory
-scheme had been arrived at, and I started to test it on ever-larger
-programs, with an eventual eye to making it work well enough so that
-it was helpful to folks debugging the upcoming version 3 of KDE.  I've
-used KDE since before version 1.0, and wanted Valgrind to be an
-indirect contribution to the KDE 3 development effort.  At the start of
-Feb 02 the kde-core-devel crew started using it, and gave a huge
-amount of helpful feedback and patches in the space of three weeks.
-Snapshot 20020306 is the result.
-
-<p>
-In the best Unix tradition, or perhaps in the spirit of Fred Brooks'
-depressing-but-completely-accurate epitaph "build one to throw away;
-you will anyway", much of Valgrind is a second or third rendition of
-the initial idea.  The instrumentation machinery
-(<code>vg_translate.c</code>, <code>vg_memory.c</code>) and core CPU
-simulation (<code>vg_to_ucode.c</code>, <code>vg_from_ucode.c</code>)
-have had three redesigns and rewrites; the register allocator,
-low-level memory manager (<code>vg_malloc2.c</code>) and symbol table
-reader (<code>vg_symtab2.c</code>) are on the second rewrite.  In a
-sense, this document serves to record some of the knowledge gained as
-a result.
-
-
-<h3>Design overview</h3>
-
-Valgrind is compiled into a Linux shared object,
-<code>valgrind.so</code>, and also a dummy one,
-<code>valgrinq.so</code>, of which more later.  The
-<code>valgrind</code> shell script adds <code>valgrind.so</code> to
-the <code>LD_PRELOAD</code> list of extra libraries to be
-loaded with any dynamically linked program.  This is a standard trick,
-one which I assume the <code>LD_PRELOAD</code> mechanism was developed
-to support.
-
-<p>
-<code>valgrind.so</code>
-is linked with the <code>-z initfirst</code> flag, which requests that
-its initialisation code is run before that of any other object in the
-executable image.  When this happens, valgrind gains control.  The
-real CPU becomes "trapped" in <code>valgrind.so</code> and the 
-translations it generates.  The synthetic CPU provided by Valgrind
-does, however, return from this initialisation function.  So the 
-normal startup actions, orchestrated by the dynamic linker
-<code>ld.so</code>, continue as usual, except on the synthetic CPU,
-not the real one.  Eventually <code>main</code> is run and returns,
-and then the finalisation code of the shared objects is run,
-presumably in inverse order to which they were initialised.  Remember,
-this is still all happening on the simulated CPU.  Eventually
-<code>valgrind.so</code>'s own finalisation code is called.  It spots
-this event, shuts down the simulated CPU, prints any error summaries
-and/or does leak detection, and returns from the initialisation code
-on the real CPU.  At this point, in effect the real and synthetic CPUs
-have merged back into one, Valgrind has lost control of the program,
-and the program finally <code>exit()s</code> back to the kernel in the
-usual way.
-
-<p>
-The normal course of activity, once Valgrind has started up, is as
-follows.  Valgrind never runs any part of your program (usually
-referred to as the "client"), not a single byte of it, directly.
-Instead it uses function <code>VG_(translate)</code> to translate
-basic blocks (BBs, straight-line sequences of code) into instrumented
-translations, and those are run instead.  The translations are stored
-in the translation cache (TC), <code>vg_tc</code>, with the
-translation table (TT), <code>vg_tt</code> supplying the
-original-to-translation code address mapping.  Auxiliary array
-<code>VG_(tt_fast)</code> is used as a direct-map cache for fast
-lookups in TT; it usually achieves a hit rate of around 98% and
-facilitates an orig-to-trans lookup in 4 x86 insns, which is not bad.
-
-<p>
-Function <code>VG_(dispatch)</code> in <code>vg_dispatch.S</code> is
-the heart of the JIT dispatcher.  Once a translated code address has
-been found, it is executed simply by an x86 <code>call</code>
-to the translation.  At the end of the translation, the next 
-original code addr is loaded into <code>%eax</code>, and the 
-translation then does a <code>ret</code>, taking it back to the
-dispatch loop, with, interestingly, zero branch mispredictions.  
-The address requested in <code>%eax</code> is looked up first in
-<code>VG_(tt_fast)</code>, and, if not found, by calling C helper
-<code>VG_(search_transtab)</code>.  If there is still no translation 
-available, <code>VG_(dispatch)</code> exits back to the top-level
-C dispatcher <code>VG_(toploop)</code>, which arranges for 
-<code>VG_(translate)</code> to make a new translation.  All fairly
-unsurprising, really.  There are various complexities described below.
-
-<p>
-The translator, orchestrated by <code>VG_(translate)</code>, is
-complicated but entirely self-contained.  It is described in great
-detail in subsequent sections.  Translations are stored in TC, with TT
-tracking administrative information.  The translations are subject to
-an approximate LRU-based management scheme.  With the current
-settings, the TC can hold at most about 15MB of translations, and LRU
-passes prune it to about 13.5MB.  Given that the
-orig-to-translation expansion ratio is about 13:1 to 14:1, this means
-TC holds translations for more or less a megabyte of original code,
-which generally comes to about 70000 basic blocks for C++ compiled
-with optimisation on.  Generating new translations is expensive, so it
-is worth having a large TC to minimise the (capacity) miss rate.
-
-<p>
-The dispatcher, <code>VG_(dispatch)</code>, receives hints from
-the translations which allow it to cheaply spot all control 
-transfers corresponding to x86 <code>call</code> and <code>ret</code>
-instructions.  It has to do this in order to spot some special events:
-<ul>
-<li>Calls to <code>VG_(shutdown)</code>.  This is Valgrind's cue to
-    exit.  NOTE: actually this is done a different way; it should be
-    cleaned up.
-<p>
-<li>Returns of signal handlers, to the return address
-    <code>VG_(signalreturn_bogusRA)</code>.  The signal simulator
-    needs to know when a signal handler is returning, so we spot
-    jumps (returns) to this address.
-<p>
-<li>Calls to <code>vg_trap_here</code>.  All <code>malloc</code>,
-    <code>free</code>, etc calls that the client program makes are
-    eventually routed to a call to <code>vg_trap_here</code>,
-    and Valgrind does its own special thing with these calls.
-    In effect this provides a trapdoor, by which Valgrind can
-    intercept certain calls on the simulated CPU, run the call as it
-    sees fit itself (on the real CPU), and return the result to
-    the simulated CPU, quite transparently to the client program.
-</ul>
-Valgrind intercepts the client's <code>malloc</code>,
-<code>free</code>, etc,
-calls, so that it can store additional information.  Each block 
-<code>malloc</code>'d by the client gives rise to a shadow block
-in which Valgrind stores the call stack at the time of the
-<code>malloc</code>
-call.  When the client calls <code>free</code>, Valgrind tries to
-find the shadow block corresponding to the address passed to
-<code>free</code>, and emits an error message if none can be found.
-If it is found, the block is placed on the freed blocks queue 
-<code>vg_freed_list</code>, it is marked as inaccessible, and
-its shadow block now records the call stack at the time of the
-<code>free</code> call.  Keeping <code>free</code>'d blocks in
-this queue allows Valgrind to spot all (presumably invalid) accesses
-to them.  However, once the volume of blocks in the free queue 
-exceeds <code>VG_(clo_freelist_vol)</code>, blocks are finally
-removed from the queue.
-
-<p>
-Keeping track of A and V bits (note: if you don't know what these are,
-you haven't read the user guide carefully enough) for memory is done
-in <code>vg_memory.c</code>.  This implements a sparse array structure
-which covers the entire 4G address space in a way which is reasonably
-fast and reasonably space efficient.  The 4G address space is divided
-up into 64K sections, each covering 64KB of address space.  Given a
-32-bit address, the top 16 bits are used to select one of the 65536
-entries in <code>VG_(primary_map)</code>.  The resulting "secondary"
-(<code>SecMap</code>) holds A and V bits for the 64KB chunk of address
-space corresponding to the lower 16 bits of the address.
-
-
-<h3>Design decisions</h3>
-
-Some design decisions were motivated by the need to make Valgrind
-debuggable.  Imagine you are writing a CPU simulator.  It works fairly
-well.  However, you run some large program, like Netscape, and after
-tens of millions of instructions, it crashes.  How can you figure out
-where in your simulator the bug is?
-
-<p>
-Valgrind's answer is: cheat.  Valgrind is designed so that it is
-possible to switch back to running the client program on the real
-CPU at any point.  Using the <code>--stop-after= </code> flag, you can 
-ask Valgrind to run just some number of basic blocks, and then 
-run the rest of the way on the real CPU.  If you are searching for
-a bug in the simulated CPU, you can use this to do a binary search,
-which quickly leads you to the specific basic block which is
-causing the problem.  
-
-<p>
-This is all very handy.  It does constrain the design in certain
-unimportant ways.  Firstly, the layout of memory, when viewed from the
-client's point of view, must be identical regardless of whether it is
-running on the real or simulated CPU.  This means that Valgrind can't
-do pointer swizzling -- well, no great loss -- and it can't run on 
-the same stack as the client -- again, no great loss.  
-Valgrind operates on its own stack, <code>VG_(stack)</code>, which
-it switches to at startup, temporarily switching back to the client's
-stack when doing system calls for the client.
-
-<p>
-Valgrind also receives signals on its own stack,
-<code>VG_(sigstack)</code>, but for different gruesome reasons
-discussed below.
-
-<p>
-This nice clean switch-back-to-the-real-CPU-whenever-you-like story
-is muddied by signals.  Problem is that signals arrive at arbitrary
-times and tend to slightly perturb the basic block count, with the
-result that you can get close to the basic block causing a problem but
-can't home in on it exactly.  My kludgey hack is to define
-<code>SIGNAL_SIMULATION</code> to 1 towards the bottom of 
-<code>vg_syscall_mem.c</code>, so that signal handlers are run on the
-real CPU and don't change the BB counts.
-
-<p>
-A second hole in the switch-back-to-real-CPU story is that Valgrind's
-way of delivering signals to the client is different from that of the
-kernel.  Specifically, the layout of the signal delivery frame, and
-the mechanism used to detect a sighandler returning, are different.
-So you can't expect to make the transition inside a sighandler and
-still have things working, but in practice that's not much of a
-restriction.
-
-<p>
-Valgrind's implementation of <code>malloc</code>, <code>free</code>,
-etc, (in <code>vg_clientmalloc.c</code>, not the low-level stuff in
-<code>vg_malloc2.c</code>) is somewhat complicated by the need to 
-handle switching back at arbitrary points.  It does work tho.
-
-
-
-<h3>Correctness</h3>
-
-There's only one of me, and I have a Real Life (tm) as well as hacking
-Valgrind [allegedly :-].  That means I don't have time to waste
-chasing endless bugs in Valgrind.  My emphasis is therefore on doing
-everything as simply as possible, with correctness, stability and
-robustness being the number one priority, more important than
-performance or functionality.  As a result:
-<ul>
-<li>The code is absolutely loaded with assertions, and these are
-    <b>permanently enabled.</b>  I have no plan to remove or disable
-    them later.  Over the past couple of months, as valgrind has
-    become more widely used, they have shown their worth, pulling
-    up various bugs which would otherwise have appeared as
-    hard-to-find segmentation faults.
-    <p>
-    I am of the view that it's acceptable to spend 5% of the total
-    running time of your valgrindified program doing assertion checks
-    and other internal sanity checks.
-<p>
-<li>Aside from the assertions, valgrind contains various sets of
-    internal sanity checks, which get run at varying frequencies
-    during normal operation.  <code>VG_(do_sanity_checks)</code>
-    runs every 1000 basic blocks, which means 500 to 2000 times/second 
-    for typical machines at present.  It checks that Valgrind hasn't
-    overrun its private stack, and does some simple checks on the
-    memory permissions maps.  Once every 25 calls it does some more
-    extensive checks on those maps.  Etc, etc.
-    <p>
-    The following components also have sanity check code, which can
-    be enabled to aid debugging:
-    <ul>
-    <li>The low-level memory-manager
-        (<code>VG_(mallocSanityCheckArena)</code>).  This does a 
-        complete check of all blocks and chains in an arena, which
-        is very slow.  Is not engaged by default.
-    <p>
-    <li>The symbol table reader(s): various checks to ensure
-        uniqueness of mappings; see <code>VG_(read_symbols)</code>
-        for a start.  Is permanently engaged.
-    <p>
-    <li>The A and V bit tracking stuff in <code>vg_memory.c</code>.
-        This can be compiled with cpp symbol
-        <code>VG_DEBUG_MEMORY</code> defined, which removes all the
-        fast, optimised cases, and uses simple-but-slow fallbacks
-        instead.  Not engaged by default.
-    <p>
-    <li>Ditto <code>VG_DEBUG_LEAKCHECK</code>.
-    <p>
-    <li>The JITter parses x86 basic blocks into sequences of 
-        UCode instructions.  It then sanity checks each one with
-        <code>VG_(saneUInstr)</code> and sanity checks the sequence
-        as a whole with <code>VG_(saneUCodeBlock)</code>.  This stuff
-        is engaged by default, and has caught some way-obscure bugs
-        in the simulated CPU machinery in its time.
-    <p>
-    <li>The system call wrapper does
-        <code>VG_(first_and_last_secondaries_look_plausible)</code> after
-        every syscall; this is known to pick up bugs in the syscall
-        wrappers.  Engaged by default.
-    <p>
-    <li>The main dispatch loop, in <code>VG_(dispatch)</code>, checks
-        that translations do not set <code>%ebp</code> to any value
-        different from <code>VG_EBP_DISPATCH_CHECKED</code> or
-        <code>&amp; VG_(baseBlock)</code>.  In effect this test is free,
-        and is permanently engaged.
-    <p>
-    <li>There are a couple of ifdefed-out consistency checks I
-        inserted whilst debugging the new register allocator,
-        <code>vg_do_register_allocation</code>.
-    </ul>
-<p>
-<li>I try to avoid techniques, algorithms, mechanisms, etc, for which
-    I can supply neither a convincing argument that they are correct,
-    nor sanity-check code which might pick up bugs in my
-    implementation.  I don't always succeed in this, but I try.
-    Basically the idea is: avoid techniques which are, in practice,
-    unverifiable, in some sense.   When doing anything, always have in
-    mind: "how can I verify that this is correct?"
-</ul>
-
-<p>
-Some more specific things are:
-
-<ul>
-<li>Valgrind runs in the same namespace as the client, at least from
-    <code>ld.so</code>'s point of view, and it therefore absolutely
-    had better not export any symbol with a name which could clash
-    with that of the client or any of its libraries.  Therefore, all
-    globally visible symbols exported from <code>valgrind.so</code>
-    are defined using the <code>VG_</code> CPP macro.  As you'll see
-    from <code>vg_constants.h</code>, this prepends some arbitrary
-    prefix to the symbol, in order that it be, we hope, globally
-    unique.  Currently the prefix is <code>vgPlain_</code>.  For
-    convenience there are also <code>VGM_</code>, <code>VGP_</code>
-    and <code>VGOFF_</code>.  All locally defined symbols are declared
-    <code>static</code> and do not appear in the final shared object.
-    <p>
-    To check this, I periodically do 
-    <code>nm valgrind.so | grep " T "</code>, 
-    which shows you all the globally exported text symbols.
-    They should all have an approved prefix, except for those like
-    <code>malloc</code>, <code>free</code>, etc, which we deliberately
-    want to shadow and take precedence over the same names exported
-    from <code>glibc.so</code>, so that valgrind can intercept those
-    calls easily.  Similarly, <code>nm valgrind.so | grep " D "</code>
-    allows you to find any rogue data-segment symbol names.
-<p>
-<li>Valgrind tries, and almost succeeds, in being completely
-    independent of all other shared objects, in particular of
-    <code>glibc.so</code>.  For example, we have our own low-level
-    memory manager in <code>vg_malloc2.c</code>, which is a fairly
-    standard malloc/free scheme augmented with arenas, and
-    <code>vg_mylibc.c</code> exports reimplementations of various bits
-    and pieces you'd normally get from the C library.
-    <p>
-    Why all the hassle?  Because imagine the potential chaos of both
-    the simulated and real CPUs executing in <code>glibc.so</code>.
-    It just seems simpler and cleaner to be completely self-contained,
-    so that only the simulated CPU visits <code>glibc.so</code>.  In
-    practice it's not much hassle anyway.  Also, valgrind starts up
-    before glibc has a chance to initialise itself, and who knows what
-    difficulties that could lead to.  Finally, glibc has definitions
-    for some types, specifically <code>sigset_t</code>, which conflict
-    with (are different from) the Linux kernel's idea of same.  When 
-    Valgrind wants to fiddle around with signal stuff, it wants to
-    use the kernel's definitions, not glibc's definitions.  So it's 
-    simplest just to keep glibc out of the picture entirely.
-    <p>
-    To find out which glibc symbols are used by Valgrind, reinstate
-    the link flags <code>-nostdlib -Wl,-no-undefined</code>.  This
-    causes linking to fail, but will tell you what you depend on.
-    I have mostly, but not entirely, got rid of the glibc
-    dependencies; what remains is, IMO, fairly harmless.  AFAIK the
-    current dependencies are: <code>memset</code>,
-    <code>memcmp</code>, <code>stat</code>, <code>system</code>,
-    <code>sbrk</code>, <code>setjmp</code> and <code>longjmp</code>.
-
-<p>
-<li>Similarly, valgrind should not really import any headers other
-    than the Linux kernel headers, since it knows of no API other than
-    the kernel interface to talk to.  At the moment this is really not
-    in a good state, and <code>vg_syscall_mem</code> imports, via
-    <code>vg_unsafe.h</code>, a significant number of C-library
-    headers so as to know the sizes of various structs passed across
-    the kernel boundary.  This is of course completely bogus, since
-    there is no guarantee that the C library's definitions of these
-    structs match those of the kernel.  I have started to sort this
-    out using <code>vg_kerneliface.h</code>, into which I had intended
-    to copy all kernel definitions which valgrind could need, but this
-    has not gotten very far.  At the moment it mostly contains
-    definitions for <code>sigset_t</code> and <code>struct
-    sigaction</code>, since the kernel's definition for these really
-    does clash with glibc's.  I plan to use a <code>vki_</code> prefix
-    on all these types and constants, to denote the fact that they
-    pertain to <b>V</b>algrind's <b>K</b>ernel <b>I</b>nterface.
-    <p>
-    Another advantage of having a <code>vg_kerneliface.h</code> file
-    is that it makes it simpler to interface to a different kernel.
-    One can, for example, easily imagine writing a new
-    <code>vg_kerneliface.h</code> for FreeBSD, or x86 NetBSD.
-
-</ul>
-
-<h3>Current limitations</h3>
-
-No threads.  I think fixing this is close to a research-grade problem.
-<p>
-No MMX.  Fixing this should be relatively easy, using the same giant
-trick used for x86 FPU instructions.  See below.
-<p>
-Support for weird (non-POSIX) signal stuff is patchy.  Does anybody
-care?
-<p>
-
-
-
-
-<hr width="100%">
-
-<h2>The instrumenting JITter</h2>
-
-This really is the heart of the matter.  We begin with various side
-issues.
-
-<h3>Run-time storage, and the use of host registers</h3>
-
-Valgrind translates client (original) basic blocks into instrumented
-basic blocks, which live in the translation cache TC, until either the
-client finishes or the translations are ejected from TC to make room
-for newer ones.
-<p>
-Since it generates x86 code in memory, Valgrind has complete control
-of the use of registers in the translations.  Now pay attention.  I
-shall say this only once, and it is important you understand this.  In
-what follows I will refer to registers in the host (real) cpu using
-their standard names, <code>%eax</code>, <code>%edi</code>, etc.  I
-refer to registers in the simulated CPU by capitalising them:
-<code>%EAX</code>, <code>%EDI</code>, etc.  These two sets of
-registers usually bear no direct relationship to each other; there is
-no fixed mapping between them.  This naming scheme is used fairly
-consistently in the comments in the sources.
-<p>
-Host registers, once things are up and running, are used as follows:
-<ul>
-<li><code>%esp</code>, the real stack pointer, points
-    somewhere in Valgrind's private stack area,
-    <code>VG_(stack)</code> or, transiently, into its signal delivery
-    stack, <code>VG_(sigstack)</code>.
-<p>
-<li><code>%edi</code> is used as a temporary in code generation; it
-    is almost always dead, except when used for the <code>Left</code>
-    value-tag operations.
-<p>
-<li><code>%eax</code>, <code>%ebx</code>, <code>%ecx</code>,
-    <code>%edx</code> and <code>%esi</code> are available to
-    Valgrind's register allocator.  They are dead (carry unimportant
-    values) in between translations, and are live only in
-    translations.  The one exception to this is <code>%eax</code>,
-    which, as mentioned far above, has a special significance to the
-    dispatch loop <code>VG_(dispatch)</code>: when a translation
-    returns to the dispatch loop, <code>%eax</code> is expected to
-    contain the original-code-address of the next translation to run.
-    The register allocator is so good at minimising spill code that
-    using five regs and not having to save/restore <code>%edi</code>
-    actually gives better code than allocating to <code>%edi</code>
-    as well, but then having to push/pop it around special uses.
-<p>
-<li><code>%ebp</code> points permanently at
-    <code>VG_(baseBlock)</code>.  Valgrind's translations are
-    position-independent, partly because this is convenient, but also
-    because translations get moved around in TC as part of the LRUing
-    activity.  <b>All</b> static entities which need to be referred to
-    from generated code, whether data or helper functions, are stored
-    starting at <code>VG_(baseBlock)</code> and are therefore reached
-    by indexing from <code>%ebp</code>.  There is but one exception, 
-    which is that by placing the value
-    <code>VG_EBP_DISPATCH_CHECKED</code>
-    in <code>%ebp</code> just before a return to the dispatcher, 
-    the dispatcher is informed that the next address to run, 
-    in <code>%eax</code>, requires special treatment.
-<p>
-<li>The real machine's FPU state is pretty much unimportant, for
-    reasons which will become obvious.  Ditto its <code>%eflags</code>
-    register.
-</ul>
-
-<p>
-The state of the simulated CPU is stored in memory, in
-<code>VG_(baseBlock)</code>, which is a block of 200 words IIRC.
-Recall that <code>%ebp</code> points permanently at the start of this
-block.  Function <code>vg_init_baseBlock</code> decides what the
-offsets of various entities in <code>VG_(baseBlock)</code> are to be,
-and allocates word offsets for them.  The code generator then emits
-<code>%ebp</code> relative addresses to get at those things.  The
-sequence in which entities are allocated has been carefully chosen so
-that the 32 most popular entities come first, because this means 8-bit
-offsets can be used in the generated code.
-
-<p>
-If I was clever, I could make <code>%ebp</code> point 32 words along 
-<code>VG_(baseBlock)</code>, so that I'd have another 32 words of
-short-form offsets available, but that's just complicated, and it's
-not important -- the first 32 words take 99% (or whatever) of the
-traffic.
-
-<p>
-Currently, the sequence of stuff in <code>VG_(baseBlock)</code> is as
-follows:
-<ul>
-<li>9 words, holding the simulated integer registers,
-    <code>%EAX</code> .. <code>%EDI</code>, and the simulated flags,
-    <code>%EFLAGS</code>.
-<p>
-<li>Another 9 words, holding the V bit "shadows" for the above 9 regs.
-<p>
-<li>The <b>addresses</b> of various helper routines called from
-    generated code: 
-    <code>VG_(helper_value_check4_fail)</code>,
-    <code>VG_(helper_value_check0_fail)</code>,
-    which register V-check failures,
-    <code>VG_(helperc_STOREV4)</code>,
-    <code>VG_(helperc_STOREV1)</code>,
-    <code>VG_(helperc_LOADV4)</code>,
-    <code>VG_(helperc_LOADV1)</code>,
-    which do stores and loads of V bits to/from the 
-    sparse array which keeps track of V bits in memory,
-    and
-    <code>VGM_(handle_esp_assignment)</code>, which messes with
-    memory addressability resulting from changes in <code>%ESP</code>.
-<p>
-<li>The simulated <code>%EIP</code>.
-<p>
-<li>24 spill words, for when the register allocator can't make it work
-    with 5 measly registers.
-<p>
-<li>Addresses of helpers <code>VG_(helperc_STOREV2)</code>,
-    <code>VG_(helperc_LOADV2)</code>.  These are here because 2-byte
-    loads and stores are relatively rare, so are placed above the
-    magic 32-word offset boundary.
-<p>
-<li>For similar reasons, addresses of helper functions 
-    <code>VGM_(fpu_write_check)</code> and
-    <code>VGM_(fpu_read_check)</code>, which handle the A/V maps
-    testing and changes required by FPU writes/reads.  
-<p>
-<li>Some other boring helper addresses:
-    <code>VG_(helper_value_check2_fail)</code> and
-    <code>VG_(helper_value_check1_fail)</code>.  These are probably
-    never emitted now, and should be removed.
-<p>
-<li>The entire state of the simulated FPU, which I believe to be
-    108 bytes long.
-<p>
-<li>Finally, the addresses of various other helper functions in
-    <code>vg_helpers.S</code>, which deal with rare situations which
-    are tedious or difficult to generate code in-line for.
-</ul>
-
-<p>
-As a general rule, the simulated machine's state lives permanently in
-memory at <code>VG_(baseBlock)</code>.  However, the JITter does some
-optimisations which allow the simulated integer registers to be
-cached in real registers over multiple simulated instructions within
-the same basic block.  These are always flushed back into memory at
-the end of every basic block, so that the in-memory state is
-up-to-date between basic blocks.  (This flushing is implied by the
-statement above that the real machine's allocatable registers are
-dead in between simulated blocks).
-
-
-<h3>Startup, shutdown, and system calls</h3>
-
-Getting into Valgrind (<code>VG_(startup)</code>, called from
-<code>valgrind.so</code>'s initialisation section), really means
-copying the real CPU's state into <code>VG_(baseBlock)</code>, and
-then installing our own stack pointer, etc, into the real CPU, and
-then starting up the JITter.  Exiting valgrind involves copying the
-simulated state back to the real state.
-
-<p>
-Unfortunately, there's a complication at startup time.  Problem is
-that at the point where we need to take a snapshot of the real CPU's
-state, the offsets in <code>VG_(baseBlock)</code> are not set up yet,
-because to do so would involve disrupting the real machine's state
-significantly.  The way round this is to dump the real machine's state
-into a temporary, static block of memory,
-<code>VG_(m_state_static)</code>.  We can then set up the
-<code>VG_(baseBlock)</code> offsets at our leisure, and copy into it
-from <code>VG_(m_state_static)</code> at some convenient later time.
-This copying is done by
-<code>VG_(copy_m_state_static_to_baseBlock)</code>.
-
-<p>
-On exit, the inverse transformation is (rather unnecessarily) used:
-stuff in <code>VG_(baseBlock)</code> is copied to
-<code>VG_(m_state_static)</code>, and the assembly stub then copies
-from <code>VG_(m_state_static)</code> into the real machine registers.
-
-<p>
-Doing system calls on behalf of the client (<code>vg_syscall.S</code>)
-is something of a half-way house.  We have to make the world look
-sufficiently like that which the client would normally have to make
-the syscall actually work properly, but we can't afford to lose
-control.  So the trick is to copy all of the client's state, <b>except
-its program counter</b>, into the real CPU, do the system call, and
-copy the state back out.  Note that the client's state includes its
-stack pointer register, so one effect of this partial restoration is
-to cause the system call to be run on the client's stack, as it should
-be.
-
-<p>
-As ever there are complications.  We have to save some of our own state
-somewhere when restoring the client's state into the CPU, so that we
-can keep going sensibly afterwards.  In fact the only thing which is
-important is our own stack pointer, but for paranoia reasons I save 
-and restore our own FPU state as well, even though that's probably
-pointless.
-
-<p>
-The complication on the above complication is, that for horrible
-reasons to do with signals, we may have to handle a second client
-system call whilst the client is blocked inside some other system 
-call (unbelievable!).  That means there's two sets of places to 
-dump Valgrind's stack pointer and FPU state across the syscall,
-and we decide which to use by consulting
-<code>VG_(syscall_depth)</code>, which is in turn maintained by
-<code>VG_(wrap_syscall)</code>.
-
-
-
-<h3>Introduction to UCode</h3>
-
-UCode lies at the heart of the x86-to-x86 JITter.  The basic premise
-is that dealing with the x86 instruction set head-on is just too darn
-complicated, so we do the traditional compiler-writer's trick and
-translate it into a simpler, easier-to-deal-with form.
-
-<p>
-In normal operation, translation proceeds through six stages,
-coordinated by <code>VG_(translate)</code>:
-<ol>
-<li>Parsing of an x86 basic block into a sequence of UCode
-    instructions (<code>VG_(disBB)</code>).
-<p>
-<li>UCode optimisation (<code>vg_improve</code>), with the aim of
-    caching simulated registers in real registers over multiple
-    simulated instructions, and removing redundant simulated
-    <code>%EFLAGS</code> saving/restoring.
-<p>
-<li>UCode instrumentation (<code>vg_instrument</code>), which adds
-    value and address checking code.
-<p>
-<li>Post-instrumentation cleanup (<code>vg_cleanup</code>), removing
-    redundant value-check computations.
-<p>
-<li>Register allocation (<code>vg_do_register_allocation</code>),
-    which, note, is done on UCode.
-<p>
-<li>Emission of final instrumented x86 code
-    (<code>VG_(emit_code)</code>).
-</ol>
-
-<p>
-Notice how steps 2, 3, 4 and 5 are simple UCode-to-UCode
-transformation passes, all on straight-line blocks of UCode (type
-<code>UCodeBlock</code>).  Steps 2 and 4 are optimisation passes and
-can be disabled for debugging purposes, with
-<code>--optimise=no</code> and <code>--cleanup=no</code> respectively.
-
-<p>
-Valgrind can also run in a no-instrumentation mode, given
-<code>--instrument=no</code>.  This is useful for debugging the JITter
-quickly without having to deal with the complexity of the
-instrumentation mechanism too.  In this mode, steps 3 and 4 are
-omitted.
-
-<p>
-These flags combine, so that <code>--instrument=no</code> together with 
-<code>--optimise=no</code> means only steps 1, 5 and 6 are used.
-<code>--single-step=yes</code> causes each x86 instruction to be
-treated as a single basic block.  The translations are terrible but
-this is sometimes instructive.  
-
-<p>
-The <code>--stop-after=N</code> flag switches back to the real CPU
-after <code>N</code> basic blocks.  It also re-JITs the final basic
-block executed and prints the debugging info resulting, so this
-gives you a way to get a quick snapshot of how a basic block looks as
-it passes through the six stages mentioned above.  If you want to 
-see full information for every block translated (probably not, but
-still ...) find, in <code>VG_(translate)</code>, the lines
-<br><code>   dis = True;</code>
-<br><code>   dis = debugging_translation;</code>
-<br>
-and comment out the second line.  This will spew out debugging
-junk faster than you can possibly imagine.
-
-
-
-<h3>UCode operand tags: type <code>Tag</code></h3>
-
-UCode is, more or less, a simple two-address RISC-like code.  In
-keeping with the x86 AT&amp;T assembly syntax, generally speaking the
-first operand is the source operand, and the second is the destination
-operand, which is modified when the uinstr is notionally executed.
-
-<p>
-UCode instructions have up to three operand fields, each of which has
-a corresponding <code>Tag</code> describing it.  Possible values for
-the tag are:
-
-<ul>
-<li><code>NoValue</code>: indicates that the field is not in use.
-<p>
-<li><code>Lit16</code>: the field contains a 16-bit literal.
-<p>
-<li><code>Literal</code>: the field denotes a 32-bit literal, whose
-    value is stored in the <code>lit32</code> field of the uinstr
-    itself.  Since there is only one <code>lit32</code> for the whole
-    uinstr, only one operand field may contain this tag.
-<p>
-<li><code>SpillNo</code>: the field contains a spill slot number, in
-    the range 0 to 23 inclusive, denoting one of the spill slots
-    contained inside <code>VG_(baseBlock)</code>.  Such tags only
-    exist after register allocation.
-<p>
-<li><code>RealReg</code>: the field contains a number in the range 0
-    to 7 denoting an integer x86 ("real") register on the host.  The
-    number is the Intel encoding for integer registers.  Such tags
-    only exist after register allocation.
-<p>
-<li><code>ArchReg</code>: the field contains a number in the range 0
-    to 7 denoting an integer x86 register on the simulated CPU.  In
-    reality this means a reference to one of the first 8 words of
-    <code>VG_(baseBlock)</code>.  Such tags can exist at any point in
-    the translation process.
-<p>
-<li>Last, but not least, <code>TempReg</code>.  The field contains the
-    number of one of an infinite set of virtual (integer)
-    registers. <code>TempReg</code>s are used everywhere throughout
-    the translation process; you can have as many as you want.  The
-    register allocator maps as many as it can into
-    <code>RealReg</code>s and turns the rest into
-    <code>SpillNo</code>s, so <code>TempReg</code>s should not exist
-    after the register allocation phase.
-    <p>
-    <code>TempReg</code>s are always 32 bits long, even if the data
-    they hold is logically shorter.  In that case the upper unused
-    bits are required, and, I think, generally assumed, to be zero.  
-    <code>TempReg</code>s holding V bits for quantities shorter than 
-    32 bits are expected to have ones in the unused places, since a
-    one denotes "undefined".
-</ul>
-
-
-<h3>UCode instructions: type <code>UInstr</code></h3>
-
-<p>
-UCode was carefully designed to make it possible to do register
-allocation on UCode and then translate the result into x86 code
-without needing any extra registers ... well, that was the original
-plan, anyway.  Things have gotten a little more complicated since
-then.  In what follows, UCode instructions are referred to as uinstrs,
-to distinguish them from x86 instructions.  Uinstrs of course have
-uopcodes which are (naturally) different from x86 opcodes.
-
-<p>
-A uinstr (type <code>UInstr</code>) contains
-various fields, not all of which are used by any one uopcode:
-<ul>
-<li>Three 16-bit operand fields, <code>val1</code>, <code>val2</code>
-    and <code>val3</code>.
-<p>
-<li>Three tag fields, <code>tag1</code>, <code>tag2</code>
-    and <code>tag3</code>.  Each of these has a value of type
-    <code>Tag</code>,
-    and they describe what the <code>val1</code>, <code>val2</code>
-    and <code>val3</code> fields contain.
-<p>
-<li>A 32-bit literal field.
-<p>
-<li>Two <code>FlagSet</code>s, specifying which x86 condition codes are
-    read and written by the uinstr.
-<p>
-<li>An opcode byte, containing a value of type <code>Opcode</code>.
-<p>
-<li>A size field, indicating the data transfer size (1/2/4/8/10) in
-    cases where this makes sense, or zero otherwise.
-<p>
-<li>A condition-code field, which, for jumps, holds a
-    value of type <code>Condcode</code>, indicating the condition
-    which applies.  The encoding is as it is in the x86 insn stream,
-    except we add a 17th value <code>CondAlways</code> to indicate
-    an unconditional transfer.
-<p>
-<li>Various 1-bit flags, indicating whether this insn pertains to an
-    x86 CALL or RET instruction, whether a widening is signed or not,
-    etc.
-</ul>
-
-<p>
-UOpcodes (type <code>Opcode</code>) are divided into two groups: those
-necessary merely to express the functionality of the x86 code, and
-extra uopcodes needed to express the instrumentation.  The former
-group contains:
-<ul>
-<li><code>GET</code> and <code>PUT</code>, which move values from the
-    simulated CPU's integer registers (<code>ArchReg</code>s) into
-    <code>TempReg</code>s, and back.  <code>GETF</code> and
-    <code>PUTF</code> do the corresponding thing for the simulated
-    <code>%EFLAGS</code>.  There are no corresponding insns for the
-    FPU register stack, since we don't explicitly simulate its
-    registers.
-<p>
-<li><code>LOAD</code> and <code>STORE</code>, which, in RISC-like
-    fashion, are the only uinstrs able to interact with memory.
-<p>
-<li><code>MOV</code> and <code>CMOV</code> allow unconditional and
-    conditional moves of values between <code>TempReg</code>s.
-<p>
-<li>ALU operations.  Again in RISC-like fashion, these only operate on
-    <code>TempReg</code>s (before reg-alloc) or <code>RealReg</code>s
-    (after reg-alloc).  These are: <code>ADD</code>, <code>ADC</code>,
-    <code>AND</code>, <code>OR</code>, <code>XOR</code>,
-    <code>SUB</code>, <code>SBB</code>, <code>SHL</code>,
-    <code>SHR</code>, <code>SAR</code>, <code>ROL</code>,
-    <code>ROR</code>, <code>RCL</code>, <code>RCR</code>,
-    <code>NOT</code>, <code>NEG</code>, <code>INC</code>,
-    <code>DEC</code>, <code>BSWAP</code>, <code>CC2VAL</code> and
-    <code>WIDEN</code>.  <code>WIDEN</code> does signed or unsigned
-    value widening.  <code>CC2VAL</code> is used to convert condition
-    codes into a value, zero or one.  The rest are obvious.
-    <p>
-    To allow for more efficient code generation, we bend slightly the
-    restriction at the start of the previous para: for
-    <code>ADD</code>, <code>ADC</code>, <code>XOR</code>,
-    <code>SUB</code> and <code>SBB</code>, we allow the first (source)
-    operand to also be an <code>ArchReg</code>, that is, one of the
-    simulated machine's registers.  Also, many of these ALU ops allow
-    the source operand to be a literal.  See
-    <code>VG_(saneUInstr)</code> for the final word on the allowable
-    forms of uinstrs.
-<p>
-<li><code>LEA1</code> and <code>LEA2</code> are not strictly
-    necessary, but facilitate better translations.  They
-    record the fancy x86 addressing modes in a direct way, which
-    allows those amodes to be emitted back into the final
-    instruction stream more or less verbatim.
-<p>
-<li><code>CALLM</code> calls a machine-code helper, one of the methods
-    whose address is stored at some <code>VG_(baseBlock)</code>
-    offset.  <code>PUSH</code> and <code>POP</code> move values
-    to/from <code>TempReg</code> to the real (Valgrind's) stack, and
-    <code>CLEAR</code> removes values from the stack.
-    <code>CALLM_S</code> and <code>CALLM_E</code> delimit the
-    boundaries of call setups and clearings, for the benefit of the
-    instrumentation passes.  Getting this right is critical, and so
-    <code>VG_(saneUCodeBlock)</code> makes various checks on the use
-    of these uopcodes.
-    <p>
-    It is important to understand that these uopcodes have nothing to
-    do with the x86 <code>call</code>, <code>return</code>,
-    <code>push</code> or <code>pop</code> instructions, and are not
-    used to implement them.  Those guys turn into combinations of
-    <code>GET</code>, <code>PUT</code>, <code>LOAD</code>,
-    <code>STORE</code>, <code>ADD</code>, <code>SUB</code>, and
-    <code>JMP</code>.  What these uopcodes support is calling of
-    helper functions such as <code>VG_(helper_imul_32_64)</code>,
-    which do stuff which is too difficult or tedious to emit inline.
-<p>
-<li><code>FPU</code>, <code>FPU_R</code> and <code>FPU_W</code>.
-    Valgrind doesn't attempt to simulate the internal state of the
-    FPU at all.  Consequently it only needs to be able to distinguish
-    FPU ops which read and write memory from those that don't, and
-    for those which do, it needs to know the effective address and
-    data transfer size.  This is made easier because the x86 FP
-    instruction encoding is very regular, basically consisting of
-    16 bits for a non-memory FPU insn and 11 (IIRC) bits + an address mode
-    for a memory FPU insn.  So our <code>FPU</code> uinstr carries
-    the 16 bits in its <code>val1</code> field.  And
-    <code>FPU_R</code> and <code>FPU_W</code> carry 11 bits in that
-    field, together with the identity of a <code>TempReg</code> or
-    (later) <code>RealReg</code> which contains the address.
-<p>
-<li><code>JIFZ</code> is unique, in that it allows a control-flow
-    transfer which is not deemed to end a basic block.  It causes a
-    jump to a literal (original) address if the specified argument
-    is zero.
-<p>
-<li>Finally, <code>INCEIP</code> advances the simulated
-    <code>%EIP</code> by the specified literal amount.  This supports
-    lazy <code>%EIP</code> updating, as described below.
-</ul>
-
-<p>
-Stages 1 and 2 of the 6-stage translation process mentioned above
-deal purely with these uopcodes, and no others.  They are
-sufficient to express pretty much all the x86 32-bit protected-mode 
-instruction set, at
-least everything understood by a pre-MMX original Pentium (P54C). 
-
-<p>
-Stages 3, 4, 5 and 6 also deal with the following extra
-"instrumentation" uopcodes.  They are used to express all the
-definedness-tracking and -checking machinery which valgrind does.  In
-later sections we show how to create checking code for each of the
-uopcodes above.  Note that these instrumentation uopcodes, although
-some appearing complicated, have been carefully chosen so that
-efficient x86 code can be generated for them.  GNU superopt v2.5 did a
-great job helping out here.  Anyways, the uopcodes are as follows:
-
-<ul>
-<li><code>GETV</code> and <code>PUTV</code> are analogues to
-    <code>GET</code> and <code>PUT</code> above.  They are identical
-    except that they move the V bits for the specified values back and
-    forth to <code>TempRegs</code>, rather than moving the values
-    themselves.
-<p>
-<li>Similarly, <code>LOADV</code> and <code>STOREV</code> read and
-    write V bits from the synthesised shadow memory that Valgrind
-    maintains.  In fact they do more than that, since they also do
-    address-validity checks, and emit complaints if the read/written
-    addresses are unaddressable.
-<p>
-<li><code>TESTV</code>, whose parameters are a <code>TempReg</code>
-    and a size, tests the V bits in the <code>TempReg</code>, at the
-    specified operation size (0/1/2/4 byte) and emits an error if any
-    of them indicate undefinedness.  This is the only uopcode capable
-    of doing such tests.
-<p>
-<li><code>SETV</code>, whose parameters are also <code>TempReg</code>
-    and a size, makes the V bits in the <code>TempReg</code> indicate
-    definedness, at the specified operation size.  This is usually
-    used to generate the correct V bits for a literal value, which is
-    of course fully defined.
-<p>
-<li><code>GETVF</code> and <code>PUTVF</code> are analogues to
-    <code>GETF</code> and <code>PUTF</code>.  They move the single V
-    bit used to model definedness of <code>%EFLAGS</code> between its
-    home in <code>VG_(baseBlock)</code> and the specified
-    <code>TempReg</code>.
-<p>
-<li><code>TAG1</code> denotes one of a family of unary operations on
-    <code>TempReg</code>s containing V bits.  Similarly,
-    <code>TAG2</code> denotes one in a family of binary operations on
-    V bits.
-</ul>
-
-<p>
-These 10 uopcodes are sufficient to express Valgrind's entire
-definedness-checking semantics.  In fact most of the interesting magic
-is done by the <code>TAG1</code> and <code>TAG2</code>
-suboperations.
-
-<p>
-First, however, I need to explain about V-vector operation sizes.
-There are 4 sizes.  Sizes 1, 2 and 4 operate on groups of 8, 16 and 32
-V bits at a time, supporting the usual 1, 2 and 4 byte x86 operations.
-However there is also the mysterious size 0, which really means a
-single V bit.  Single V bits are used in various circumstances; in
-particular, the definedness of <code>%EFLAGS</code> is modelled with a
-single V bit.  Now might be a good time to also point out that for
-V bits, 1 means "undefined" and 0 means "defined".  Similarly, for A
-bits, 1 means "invalid address" and 0 means "valid address".  This
-seems counterintuitive (and so it is), but testing against zero on
-x86s saves instructions compared to testing against all 1s, because
-many ALU operations set the Z flag for free, so to speak.
-
-<p>
-With that in mind, the tag ops are:
-
-<ul>
-<li><b>(UNARY) Pessimising casts</b>: <code>VgT_PCast40</code>,
-    <code>VgT_PCast20</code>, <code>VgT_PCast10</code>,
-    <code>VgT_PCast01</code>, <code>VgT_PCast02</code> and
-    <code>VgT_PCast04</code>.  A "pessimising cast" takes a V-bit
-    vector at one size, and creates a new one at another size,
-    pessimised in the sense that if any of the bits in the source
-    vector indicate undefinedness, then all the bits in the result
-    indicate undefinedness.  In this case the casts are all to or from
-    a single V bit, so for example <code>VgT_PCast40</code> is a
-    pessimising cast from 32 bits to 1, whereas
-    <code>VgT_PCast04</code> simply copies the single source V bit
-    into all 32 bit positions in the result.  Surprisingly, these ops
-    can all be implemented very efficiently.
-    <p>
-    There are also the pessimising casts <code>VgT_PCast14</code>,
-    from 8 bits to 32, <code>VgT_PCast12</code>, from 8 bits to 16,
-    and <code>VgT_PCast11</code>, from 8 bits to 8.  This last one
-    seems nonsensical, but in fact it isn't a no-op because, as
-    mentioned above, any undefined (1) bits in the source infect the
-    entire result.
-<p>
-<li><b>(UNARY) Propagating undefinedness upwards in a word</b>:
-    <code>VgT_Left4</code>, <code>VgT_Left2</code> and
-    <code>VgT_Left1</code>.  These are used to simulate the worst-case
-    effects of carry propagation in adds and subtracts.  They return a
-    V vector identical to the original, except that if the original
-    contained any undefined bits, then each such bit and all bits above it are
-    marked as undefined too.  Hence the Left bit in the names.
-<p>
-<li><b>(UNARY) Signed and unsigned value widening</b>:
-     <code>VgT_SWiden14</code>, <code>VgT_SWiden24</code>,
-     <code>VgT_SWiden12</code>, <code>VgT_ZWiden14</code>,
-     <code>VgT_ZWiden24</code> and <code>VgT_ZWiden12</code>.  These
-     mimic the definedness effects of standard signed and unsigned
-     integer widening.  Unsigned widening creates zero bits in the new
-     positions, so <code>VgT_ZWiden*</code> accordingly mark
-     those parts of their argument as defined.  Signed widening copies
-     the sign bit into the new positions, so <code>VgT_SWiden*</code>
-     copies the definedness of the sign bit into the new positions.
-     Because 1 means undefined and 0 means defined, these operations
-     can (fascinatingly) be done by the same operations which they
-     mimic.  Go figure.
-<p>
-<li><b>(BINARY) Undefined-if-either-Undefined,
-     Defined-if-either-Defined</b>: <code>VgT_UifU4</code>,
-     <code>VgT_UifU2</code>, <code>VgT_UifU1</code>,
-     <code>VgT_UifU0</code>, <code>VgT_DifD4</code>,
-     <code>VgT_DifD2</code>, <code>VgT_DifD1</code>.  These do simple
-     bitwise operations on pairs of V-bit vectors, with
-     <code>UifU</code> giving undefined if either arg bit is
-     undefined, and <code>DifD</code> giving defined if either arg bit
-     is defined.  Abstract interpretation junkies, if any make it this
-     far, may like to think of them as meets and joins (or is it joins
-     and meets) in the definedness lattices.  
-<p>
-<li><b>(BINARY; one value, one V bits) Generate argument improvement
-    terms for AND and OR</b>: <code>VgT_ImproveAND4_TQ</code>,
-    <code>VgT_ImproveAND2_TQ</code>, <code>VgT_ImproveAND1_TQ</code>,
-    <code>VgT_ImproveOR4_TQ</code>, <code>VgT_ImproveOR2_TQ</code>,
-    <code>VgT_ImproveOR1_TQ</code>.  These help out with AND and OR
-    operations.  AND and OR have the inconvenient property that the
-    definedness of the result depends on the actual values of the
-    arguments as well as their definedness.  At the bit level:
-    <br><code>1 AND undefined = undefined</code>, but 
-    <br><code>0 AND undefined = 0</code>, and similarly 
-    <br><code>0 OR  undefined = undefined</code>, but 
-    <br><code>1 OR  undefined = 1</code>.
-    <br>
-    <p>
-    It turns out that gcc (quite legitimately) generates code which
-    relies on this fact, so we have to model it properly in order to
-    avoid flooding users with spurious value errors.  The ultimate
-    definedness result of AND and OR is calculated using
-    <code>UifU</code> on the definedness of the arguments, but we
-    also <code>DifD</code> in some "improvement" terms which 
-    take into account the above phenomena.  
-    <p>
-    <code>ImproveAND</code> takes as its first argument the actual
-    value of an argument to AND (the T) and the definedness of that
-    argument (the Q), and returns a V-bit vector which is defined (0)
-    for bits which have value 0 and are defined; this, when
-    <code>DifD</code> into the final result causes those bits to be
-    defined even if the corresponding bit in the other argument is undefined.
-    <p>
-    The <code>ImproveOR</code> ops do the dual thing for OR
-    arguments.  Note that XOR does not have this property that one
-    argument can make the other irrelevant, so there is no need for
-    such complexity for XOR.
-</ul>
-
-<p>
-That's all the tag ops.  If you stare at this long enough, and then
-run Valgrind and stare at the pre- and post-instrumented ucode, it
-should be fairly obvious how the instrumentation machinery hangs
-together.
-
-<p>
-One point, if you do this: in order to make it easy to differentiate
-<code>TempReg</code>s carrying values from <code>TempReg</code>s
-carrying V bit vectors, Valgrind prints the former as (for example)
-<code>t28</code> and the latter as <code>q28</code>; the fact that
-they carry the same number serves to indicate their relationship.
-This is purely for the convenience of the human reader; the register
-allocator and code generator don't regard them as different.
-
-
-<h3>Translation into UCode</h3>
-
-<code>VG_(disBB)</code> allocates a new <code>UCodeBlock</code> and
-then uses <code>disInstr</code> to translate x86 instructions one at a
-time into UCode, dumping the result in the <code>UCodeBlock</code>.
-This goes on until a control-flow transfer instruction is encountered.
-
-<p>
-Despite the large size of <code>vg_to_ucode.c</code>, this translation
-is really very simple.  Each x86 instruction is translated entirely
-independently of its neighbours, merrily allocating new
-<code>TempReg</code>s as it goes.  The idea is to have a simple
-translator -- in reality, no more than a macro-expander -- and the
-resulting bad UCode translation is cleaned up by the UCode
-optimisation phase which follows.  To give you an idea of some x86
-instructions and their translations (this is a complete basic block,
-as Valgrind sees it):
-<pre>
-        0x40435A50:  incl %edx
-
-           0: GETL      %EDX, t0
-           1: INCL      t0  (-wOSZAP)
-           2: PUTL      t0, %EDX
-
-        0x40435A51:  movsbl (%edx),%eax
-
-           3: GETL      %EDX, t2
-           4: LDB       (t2), t2
-           5: WIDENL_Bs t2
-           6: PUTL      t2, %EAX
-
-        0x40435A54:  testb $0x20, 1(%ecx,%eax,2)
-
-           7: GETL      %EAX, t6
-           8: GETL      %ECX, t8
-           9: LEA2L     1(t8,t6,2), t4
-          10: LDB       (t4), t10
-          11: MOVB      $0x20, t12
-          12: ANDB      t12, t10  (-wOSZACP)
-          13: INCEIPo   $9
-
-        0x40435A59:  jnz-8 0x40435A50
-
-          14: Jnzo      $0x40435A50  (-rOSZACP)
-          15: JMPo      $0x40435A5B
-</pre>
-
-<p>
-Notice how the block always ends with an unconditional jump to the
-next block.  This is a bit unnecessary, but makes many things simpler.
-
-<p>
-Most x86 instructions turn into sequences of <code>GET</code>,
-<code>PUT</code>, <code>LEA1</code>, <code>LEA2</code>,
-<code>LOAD</code> and <code>STORE</code>.  Some complicated ones
-however rely on calling helper bits of code in 
-<code>vg_helpers.S</code>.  The ucode instructions <code>PUSH</code>,
-<code>POP</code>, <code>CALL</code>, <code>CALLM_S</code> and
-<code>CALLM_E</code> support this.  The calling convention is somewhat
-ad-hoc and is not the C calling convention.  The helper routines must 
-save all integer registers, and the flags, that they use.  Args are
-passed on the stack underneath the return address, as usual, and if 
-result(s) are to be returned, it (they) are either placed in dummy arg
-slots created by the ucode <code>PUSH</code> sequence, or just
-overwrite the incoming args.
-
-<p>
-In order that the instrumentation mechanism can handle calls to these
-helpers, <code>VG_(saneUCodeBlock)</code> enforces the following
-restrictions on calls to helpers:
-
-<ul>
-<li>Each <code>CALL</code> uinstr must be bracketed by a preceding
-    <code>CALLM_S</code> marker (dummy uinstr) and a trailing
-    <code>CALLM_E</code> marker.  These markers are used by the
-    instrumentation mechanism later to establish the boundaries of the
-    <code>PUSH</code>, <code>POP</code> and <code>CLEAR</code>
-    sequences for the call.
-<p>
-<li><code>PUSH</code>, <code>POP</code> and <code>CLEAR</code>
-    may only appear inside sections bracketed by <code>CALLM_S</code>
-    and <code>CALLM_E</code>, and nowhere else.
-<p>
-<li>In any such bracketed section, no two <code>PUSH</code> insns may
-    push the same <code>TempReg</code>.  Dually, no two
-    <code>POP</code>s may pop the same <code>TempReg</code>.
-<p>
-<li>Finally, although this is not checked, args should be removed from
-    the stack with <code>CLEAR</code>, rather than <code>POP</code>s
-    into a <code>TempReg</code> which is not subsequently used.  This
-    is because the instrumentation mechanism assumes that all values
-    <code>POP</code>ped from the stack are actually used.
-</ul>
-
-Some of the translations may appear to have redundant
-<code>TempReg</code>-to-<code>TempReg</code> moves.  This helps the
-next phase, UCode optimisation, to generate better code.
-
-
-
-<h3>UCode optimisation</h3>
-
-UCode is then subjected to an improvement pass
-(<code>vg_improve()</code>), which blurs the boundaries between the
-translations of the original x86 instructions.  It's pretty
-straightforward.  Three transformations are done:
-
-<ul>
-<li>Redundant <code>GET</code> elimination.  Actually, more general
-    than that -- eliminates redundant fetches of ArchRegs.  In our
-    running example, uinstr 3 <code>GET</code>s <code>%EDX</code> into
-    <code>t2</code> despite the fact that, by looking at the previous
-    uinstr, it is already in <code>t0</code>.  The <code>GET</code> is
-    therefore removed, and <code>t2</code> renamed to <code>t0</code>.
-    Assuming <code>t0</code> is allocated to a host register, it means
-    the simulated <code>%EDX</code> will exist in a host CPU register
-    for more than one simulated x86 instruction, which seems to me to
-    be a highly desirable property.
-    <p>
-    There is some mucking around to do with subregisters;
-    <code>%AL</code> vs <code>%AH</code> <code>%AX</code> vs
-    <code>%EAX</code> etc.  I can't remember how it works, but in
-    general we are very conservative, and these tend to invalidate the
-    caching. 
-<p>
-<li>Redundant <code>PUT</code> elimination.  This annuls
-    <code>PUT</code>s of values back to simulated CPU registers if a
-    later <code>PUT</code> would overwrite the earlier
-    <code>PUT</code> value, and there are no intervening reads of the
-    simulated register (<code>ArchReg</code>).
-    <p>
-    As before, we are paranoid when faced with subregister references.
-    Also, <code>PUT</code>s of <code>%ESP</code> are never annulled,
-    because it is vital the instrumenter always has an up-to-date
-    <code>%ESP</code> value available, since <code>%ESP</code> changes
-    affect addressability of the memory around the simulated stack
-    pointer.
-    <p>
-    The implication of the above paragraph is that the simulated
-    machine's registers are only lazily updated once the above two
-    optimisation phases have run, with the exception of
-    <code>%ESP</code>.  <code>TempReg</code>s go dead at the end of
-    every basic block, from which it is inferable that any
-    <code>TempReg</code> caching a simulated CPU reg is flushed (back
-    into the relevant <code>VG_(baseBlock)</code> slot) at the end of
-    every basic block.  The further implication is that the simulated
-    registers are only up-to-date in between basic blocks, and not
-    at arbitrary points inside basic blocks.  And the consequence of
-    that is that we can only deliver signals to the client in between
-    basic blocks.  None of this seems any problem in practice.
-<p>
-<li>Finally there is a simple def-use thing for condition codes.  If
-    an earlier uinstr writes the condition codes, and the next uinsn
-    along which actually cares about the condition codes writes the
-    same or larger set of them, but does not read any, the earlier
-    uinsn is marked as not writing any condition codes.  This saves 
-    a lot of redundant cond-code saving and restoring.
-</ul>
-
-The effect of these transformations on our short block is rather
-unexciting, and shown below.  On longer basic blocks they can
-dramatically improve code quality.
-
-<pre>
-at 3: delete GET, rename t2 to t0 in (4 .. 6)
-at 7: delete GET, rename t6 to t0 in (8 .. 9)
-at 1: annul flag write OSZAP due to later OSZACP
-
-Improved code:
-           0: GETL      %EDX, t0
-           1: INCL      t0
-           2: PUTL      t0, %EDX
-           4: LDB       (t0), t0
-           5: WIDENL_Bs t0
-           6: PUTL      t0, %EAX
-           8: GETL      %ECX, t8
-           9: LEA2L     1(t8,t0,2), t4
-          10: LDB       (t4), t10
-          11: MOVB      $0x20, t12
-          12: ANDB      t12, t10  (-wOSZACP)
-          13: INCEIPo   $9
-          14: Jnzo      $0x40435A50  (-rOSZACP)
-          15: JMPo      $0x40435A5B
-</pre>
-
-<h3>UCode instrumentation</h3>
-
-Once you understand the meaning of the instrumentation uinstrs,
-discussed in detail above, the instrumentation scheme is fairly
-straightforward.  Each uinstr is instrumented in isolation, and the
-instrumentation uinstrs are placed before the original uinstr.
-Our running example continues below.  I have placed a blank line 
-after every original ucode, to make it easier to see which
-instrumentation uinstrs correspond to which originals.
-
-<p>
-As mentioned somewhere above, <code>TempReg</code>s carrying values 
-have names like <code>t28</code>, and each one has a shadow carrying
-its V bits, with names like <code>q28</code>.  This pairing aids in
-reading instrumented ucode.
-
-<p>
-One decision about all this is where to have "observation points",
-that is, where to check that V bits are valid.  I use a minimalistic
-scheme, only checking where a failure of validity could cause the 
-original program to (seg)fault.  So the use of values as memory
-addresses causes a check, as do conditional jumps (these cause a check
-on the definedness of the condition codes).  And arguments
-<code>PUSH</code>ed for helper calls are checked, hence the weird
-restrictions on helper call preambles described above.
-
-<p>
-Another decision is that once a value is tested, it is thereafter
-regarded as defined, so that we do not emit multiple undefined-value
-errors for the same undefined value.  That means that
-<code>TESTV</code> uinstrs are always followed by <code>SETV</code> 
-on the same (shadow) <code>TempReg</code>s.  Most of these
-<code>SETV</code>s are redundant and are removed by the
-post-instrumentation cleanup phase.
-
-<p>
-The instrumentation for calling helper functions deserves further
-comment.  The definedness of results from a helper is modelled using
-just one V bit.  So, in short, we do pessimising casts of the
-definedness of all the args, down to a single bit, and then
-<code>UifU</code> these bits together.  So this single V bit will say
-"undefined" if any part of any arg is undefined.  This V bit is then
-pessimally cast back up to the result(s) sizes, as needed.  If, by
-seeing that all the args are got rid of with <code>CLEAR</code> and
-none with <code>POP</code>, Valgrind sees that the result of the call
-is not actually used, it immediately examines the result V bit with a
-<code>TESTV</code> -- <code>SETV</code> pair.  If it did not do this,
-there would be no observation point to detect that some of the
-args to the helper were undefined.  Of course, if the helper's results
-are indeed used, we don't do this, since the result usage will
-presumably cause the result definedness to be checked at some suitable
-future point.
-
-<p>
-In general Valgrind tries to track definedness on a bit-for-bit basis,
-but as the above para shows, for calls to helpers we throw in the
-towel and approximate down to a single bit.  This is because it's too
-complex and difficult to track bit-level definedness through complex
-ops such as integer multiply and divide, and in any case there are no
-reasonable code fragments which attempt to (eg) multiply two
-partially-defined values and end up with something meaningful, so
-there seems little point in modelling multiplies, divides, etc, in
-that level of detail.
-
-<p>
-Integer loads and stores are instrumented with firstly a test of the
-definedness of the address, followed by a <code>LOADV</code> or
-<code>STOREV</code> respectively.  These turn into calls to 
-(for example) <code>VG_(helperc_LOADV4)</code>.  These helpers do two
-things: they perform an address-valid check, and they load or store V
-bits from/to the relevant address in the (simulated V-bit) memory.
-
-<p>
-FPU loads and stores are different.  As above the definedness of the
-address is first tested.  However, the helper routine for FPU loads
-(<code>VGM_(fpu_read_check)</code>) emits an error if either the
-address is invalid or the referenced area contains undefined values.
-It has to do this because we do not simulate the FPU at all, and so
-cannot track definedness of values loaded into it from memory, so we
-have to check them as soon as they are loaded into the FPU, ie, at
-this point.  We notionally assume that everything in the FPU is
-defined.
-
-<p>
-It follows therefore that FPU writes first check the definedness of
-the address, then the validity of the address, and finally mark the
-written bytes as well-defined.
-
-<p>
-If anyone is inspired to extend Valgrind to MMX/SSE insns, I suggest
-you use the same trick.  It works provided that the FPU/MMX unit is
-not used merely as a conduit to copy partially undefined data from
-one place in memory to another.  Unfortunately the integer CPU is used
-like that (when copying C structs with holes, for example) and this is
-the cause of much of the elaborateness of the instrumentation here
-described.
-
-<p>
-<code>vg_instrument()</code> in <code>vg_translate.c</code> actually
-does the instrumentation.  There are comments explaining how each
-uinstr is handled, so we do not repeat that here.  As explained
-already, it is bit-accurate, except for calls to helper functions.
-Unfortunately the x86 insns <code>bt/bts/btc/btr</code> are done by
-helper fns, so bit-level accuracy is lost there.  This should be fixed
-by doing them inline; it will probably require adding a couple new
-uinstrs.  Also, left and right rotates through the carry flag (x86
-<code>rcl</code> and <code>rcr</code>) are approximated via a single
-V bit; so far this has not caused anyone to complain.  The
-non-carry rotates, <code>rol</code> and <code>ror</code>, are much
-more common and are done exactly.  Re-visiting the instrumentation for
-AND and OR, they seem rather verbose, and I wonder if it could be done
-more concisely now.
-
-<p>
-The lowercase <code>o</code> on many of the uopcodes in the running
-example indicates that the size field is zero, usually meaning a
-single-bit operation.
-
-<p>
-Anyroads, the post-instrumented version of our running example looks
-like this:
-
-<pre>
-Instrumented code:
-           0: GETVL     %EDX, q0
-           1: GETL      %EDX, t0
-
-           2: TAG1o     q0 = Left4 ( q0 )
-           3: INCL      t0
-
-           4: PUTVL     q0, %EDX
-           5: PUTL      t0, %EDX
-
-           6: TESTVL    q0
-           7: SETVL     q0
-           8: LOADVB    (t0), q0
-           9: LDB       (t0), t0
-
-          10: TAG1o     q0 = SWiden14 ( q0 )
-          11: WIDENL_Bs t0
-
-          12: PUTVL     q0, %EAX
-          13: PUTL      t0, %EAX
-
-          14: GETVL     %ECX, q8
-          15: GETL      %ECX, t8
-
-          16: MOVL      q0, q4
-          17: SHLL      $0x1, q4
-          18: TAG2o     q4 = UifU4 ( q8, q4 )
-          19: TAG1o     q4 = Left4 ( q4 )
-          20: LEA2L     1(t8,t0,2), t4
-
-          21: TESTVL    q4
-          22: SETVL     q4
-          23: LOADVB    (t4), q10
-          24: LDB       (t4), t10
-
-          25: SETVB     q12
-          26: MOVB      $0x20, t12
-
-          27: MOVL      q10, q14
-          28: TAG2o     q14 = ImproveAND1_TQ ( t10, q14 )
-          29: TAG2o     q10 = UifU1 ( q12, q10 )
-          30: TAG2o     q10 = DifD1 ( q14, q10 )
-          31: MOVL      q12, q14
-          32: TAG2o     q14 = ImproveAND1_TQ ( t12, q14 )
-          33: TAG2o     q10 = DifD1 ( q14, q10 )
-          34: MOVL      q10, q16
-          35: TAG1o     q16 = PCast10 ( q16 )
-          36: PUTVFo    q16
-          37: ANDB      t12, t10  (-wOSZACP)
-
-          38: INCEIPo   $9
-
-          39: GETVFo    q18
-          40: TESTVo    q18
-          41: SETVo     q18
-          42: Jnzo      $0x40435A50  (-rOSZACP)
-
-          43: JMPo      $0x40435A5B
-</pre>
-
-
-<h3>UCode post-instrumentation cleanup</h3>
-
-<p>
-This pass, coordinated by <code>vg_cleanup()</code>, removes redundant
-definedness computation created by the simplistic instrumentation
-pass.  It consists of two passes,
-<code>vg_propagate_definedness()</code> followed by
-<code>vg_delete_redundant_SETVs</code>.
-
-<p>
-<code>vg_propagate_definedness()</code> is a simple
-constant-propagation and constant-folding pass.  It tries to determine
-which <code>TempReg</code>s containing V bits will always indicate
-"fully defined", and it propagates this information as far as it can,
-and folds out as many operations as possible.  For example, the
-instrumentation for an ADD of a literal to a variable quantity will be
-reduced down so that the definedness of the result is simply the
-definedness of the variable quantity, since the literal is by
-definition fully defined.
-
-<p>
-<code>vg_delete_redundant_SETVs</code> removes <code>SETV</code>s on
-shadow <code>TempReg</code>s for which the next action is a write.
-I don't think there's anything else worth saying about this; it is
-simple.  Read the sources for details.
-
-<p>
-So the cleaned-up running example looks like this.  As above, I have
-inserted line breaks after every original (non-instrumentation) uinstr
-to aid readability.  As with straightforward ucode optimisation, the
-results in this block are undramatic because it is so short; longer
-blocks benefit more because they have more redundancy which gets
-eliminated.
-
-
-<pre>
-at 29: delete UifU1 due to defd arg1
-at 32: change ImproveAND1_TQ to MOV due to defd arg2
-at 41: delete SETV
-at 31: delete MOV
-at 25: delete SETV
-at 22: delete SETV
-at 7: delete SETV
-
-           0: GETVL     %EDX, q0
-           1: GETL      %EDX, t0
-
-           2: TAG1o     q0 = Left4 ( q0 )
-           3: INCL      t0
-
-           4: PUTVL     q0, %EDX
-           5: PUTL      t0, %EDX
-
-           6: TESTVL    q0
-           8: LOADVB    (t0), q0
-           9: LDB       (t0), t0
-
-          10: TAG1o     q0 = SWiden14 ( q0 )
-          11: WIDENL_Bs t0
-
-          12: PUTVL     q0, %EAX
-          13: PUTL      t0, %EAX
-
-          14: GETVL     %ECX, q8
-          15: GETL      %ECX, t8
-
-          16: MOVL      q0, q4
-          17: SHLL      $0x1, q4
-          18: TAG2o     q4 = UifU4 ( q8, q4 )
-          19: TAG1o     q4 = Left4 ( q4 )
-          20: LEA2L     1(t8,t0,2), t4
-
-          21: TESTVL    q4
-          23: LOADVB    (t4), q10
-          24: LDB       (t4), t10
-
-          26: MOVB      $0x20, t12
-
-          27: MOVL      q10, q14
-          28: TAG2o     q14 = ImproveAND1_TQ ( t10, q14 )
-          30: TAG2o     q10 = DifD1 ( q14, q10 )
-          32: MOVL      t12, q14
-          33: TAG2o     q10 = DifD1 ( q14, q10 )
-          34: MOVL      q10, q16
-          35: TAG1o     q16 = PCast10 ( q16 )
-          36: PUTVFo    q16
-          37: ANDB      t12, t10  (-wOSZACP)
-
-          38: INCEIPo   $9
-          39: GETVFo    q18
-          40: TESTVo    q18
-          42: Jnzo      $0x40435A50  (-rOSZACP)
-
-          43: JMPo      $0x40435A5B
-</pre>
-
-
-<h3>Translation from UCode</h3>
-
-This is all very simple, even though <code>vg_from_ucode.c</code>
-is a big file.  Position-independent x86 code is generated into 
-a dynamically allocated array <code>emitted_code</code>; this is
-doubled in size when it overflows.  Eventually the array is handed
-back to the caller of <code>VG_(translate)</code>, who must copy
-the result into TC and TT, and free the array.
-
-<p>
-This file is structured into four layers of abstraction, which,
-thankfully, are glued back together with extensive
-<code>__inline__</code> directives.  From the bottom upwards:
-
-<ul>
-<li>Address-mode emitters, <code>emit_amode_regmem_reg</code> et al.
-<p>
-<li>Emitters for specific x86 instructions.  There are quite a lot of
-    these, with names such as <code>emit_movv_offregmem_reg</code>.
-    The <code>v</code> suffix is Intel parlance for a 16/32 bit insn;
-    there are also <code>b</code> suffixes for 8 bit insns.
-<p>
-<li>The next level up are the <code>synth_*</code> functions, which
-    synthesise possibly a sequence of raw x86 instructions to do some
-    simple task.  Some of these are quite complex because they have to
-    work around Intel's silly restrictions on subregister naming.  See 
-    <code>synth_nonshiftop_reg_reg</code> for example.
-<p>
-<li>Finally, at the top of the heap, we have
-    <code>emitUInstr()</code>,
-    which emits code for a single uinstr.
-</ul>
-
-<p>
-Some comments:
-<ul>
-<li>The hack for FPU instructions becomes apparent here.  To do a
-    <code>FPU</code> ucode instruction, we load the simulated FPU's
-    state from its <code>VG_(baseBlock)</code> into the real FPU
-    using an x86 <code>frstor</code> insn, do the ucode
-    <code>FPU</code> insn on the real CPU, and write the updated FPU
-    state back into <code>VG_(baseBlock)</code> using an
-    <code>fnsave</code> instruction.  This is pretty brutal, but is
-    simple and it works, and even seems tolerably efficient.  There is
-    no attempt to cache the simulated FPU state in the real FPU over
-    multiple back-to-back ucode FPU instructions.
-    <p>
-    <code>FPU_R</code> and <code>FPU_W</code> are also done this way,
-    with the minor complication that we need to patch in some
-    addressing mode bits so the resulting insn knows the effective
-    address to use.  This is easy because of the regularity of the x86
-    FPU instruction encodings.
-<p>
-<li>An analogous trick is done with ucode insns which claim, in their
-    <code>flags_r</code> and <code>flags_w</code> fields, that they
-    read or write the simulated <code>%EFLAGS</code>.  For such cases
-    we first copy the simulated <code>%EFLAGS</code> into the real
-    <code>%eflags</code>, then do the insn, then, if the insn says it
-    writes the flags, copy back to <code>%EFLAGS</code>.  This is a
-    bit expensive, which is why the ucode optimisation pass goes to
-    some effort to remove redundant flag-update annotations.
-</ul>
-
-<p>
-And so ... that's the end of the documentation for the instrumenting
-translator!  It's really not that complex, because it's composed as a
-sequence of simple(ish) self-contained transformations on
-straight-line blocks of code.
-
-
-<h3>Top-level dispatch loop</h3>
-
-Urk.  In <code>VG_(toploop)</code>.  This is basically boring and
-unsurprising, not to mention fiddly and fragile.  It needs to be
-cleaned up.  
-
-<p>
-Perhaps the only surprise is that the whole thing is run
-on top of a <code>setjmp</code>-installed exception handler, because,
-supposing a translation got a segfault, we have to bail out of the
-Valgrind-supplied exception handler <code>VG_(oursignalhandler)</code>
-and immediately start running the client's segfault handler, if it has
-one.  In particular we can't finish the current basic block and then
-deliver the signal at some convenient future point, because signals
-like SIGILL, SIGSEGV and SIGBUS mean that the faulting insn should not
-simply be re-tried.  (I'm sure there is a clearer way to explain this).
-
-
-<h3>Exceptions, creating new translations</h3>
-<h3>Self-modifying code</h3>
-
-<h3>Lazy updates of the simulated program counter</h3>
-
-Simulated <code>%EIP</code> is not updated after every simulated x86
-insn as this was regarded as too expensive.  Instead ucode
-<code>INCEIP</code> insns move it along as and when necessary.
-Currently we don't allow it to fall more than 4 bytes behind reality
-(see <code>VG_(disBB)</code> for the way this works).
-<p>
-Note that <code>%EIP</code> is always brought up to date by the inner
-dispatch loop in <code>VG_(dispatch)</code>, so that if the client
-takes a fault we know at least which basic block this happened in.
-
-
-<h3>The translation cache and translation table</h3>
-
-<h3>Signals</h3>
-
-Horrible, horrible.  <code>vg_signals.c</code>.
-Basically, since we have to intercept all system
-calls anyway, we can see when the client tries to install a signal
-handler.  If it does so, we make a note of what the client asked to
-happen, and ask the kernel to route the signal to our own signal
-handler, <code>VG_(oursignalhandler)</code>.  This simply notes the
-delivery of signals, and returns.  
-
-<p>
-Every 1000 basic blocks, we see if more signals have arrived.  If so,
-<code>VG_(deliver_signals)</code> builds signal delivery frames on the
-client's stack, and allows their handlers to be run.  Valgrind places
-in these signal delivery frames a bogus return address,
-<code>VG_(signalreturn_bogusRA)</code>, and checks all jumps to see
-if any jump to it.  If so, this is a sign that a signal handler is
-returning, and if so Valgrind removes the relevant signal frame from
-the client's stack, restores from the signal frame the simulated
-state before the signal was delivered, and allows the client to run
-onwards.  We have to do it this way because some signal handlers never
-return, they just <code>longjmp()</code>, which nukes the signal
-delivery frame.
-
-<p>
-The Linux kernel has a different but equally horrible hack for
-detecting signal handler returns.  Discovering it is left as an
-exercise for the reader.
-
-
-
-<h3>Errors, error contexts, error reporting, suppressions</h3>
-<h3>Client malloc/free</h3>
-<h3>Low-level memory management</h3>
-<h3>A and V bitmaps</h3>
-<h3>Symbol table management</h3>
-<h3>Dealing with system calls</h3>
-<h3>Namespace management</h3>
-<h3>GDB attaching</h3>
-<h3>Non-dependence on glibc or anything else</h3>
-<h3>The leak detector</h3>
-<h3>Performance problems</h3>
-<h3>Continuous sanity checking</h3>
-<h3>Tracing, or not tracing, child processes</h3>
-<h3>Assembly glue for syscalls</h3>
-
-
-<hr width="100%">
-
-<h2>Extensions</h2>
-
-Some comments about Stuff To Do.
-
-<h3>Bugs</h3>
-
-Stephan Kulow and Marc Mutz report problems with kmail in KDE 3 CVS
-(RC2 ish) when run on Valgrind.  Stephan has it deadlocking; Marc has
-it looping at startup.  I can't repro either behaviour. Needs
-repro-ing and fixing.
-
-
-<h3>Threads</h3>
-
-Doing a good job of thread support strikes me as almost a
-research-level problem.  The central issues are how to do fast cheap
-locking of the <code>VG_(primary_map)</code> structure, whether or not
-accesses to the individual secondary maps need locking, what
-race-condition issues result, and whether the already-nasty mess that
-is the signal simulator needs further hackery.
-
-<p>
-I realise that threads are the most-frequently-requested feature, and
-I am thinking about it all.  If you have guru-level understanding of 
-fast mutual exclusion mechanisms and race conditions, I would be
-interested in hearing from you.
-
-
-<h3>Verification suite</h3>
-
-Directory <code>tests/</code> contains various ad-hoc tests for
-Valgrind.  However, there is no systematic verification or regression
-suite, that, for example, exercises all the stuff in
-<code>vg_memory.c</code>, to ensure that illegal memory accesses and
-undefined value uses are detected as they should be.  It would be good
-to have such a suite.
-
-
-<h3>Porting to other platforms</h3>
-
-It would be great if Valgrind was ported to FreeBSD and x86 NetBSD,
-and to x86 OpenBSD, if it's possible (doesn't OpenBSD use a.out-style
-executables, not ELF ?)
-
-<p>
-The main difficulties, for an x86-ELF platform, seem to be:
-
-<ul>
-<li>You'd need to rewrite the <code>/proc/self/maps</code> parser
-    (<code>vg_procselfmaps.c</code>).
-    Easy.
-<p>
-<li>You'd need to rewrite <code>vg_syscall_mem.c</code>, or, more
-    specifically, provide one for your OS.  This is tedious, but you
-    can implement syscalls on demand, and the Linux kernel interface
-    is, for the most part, going to look very similar to the *BSD
-    interfaces, so it's really a copy-paste-and-modify-on-demand job.
-    As part of this, you'd need to supply a new
-    <code>vg_kerneliface.h</code> file.
-<p>
-<li>You'd also need to change the syscall wrappers for Valgrind's
-    internal use, in <code>vg_mylibc.c</code>.
-</ul>
-
-All in all, I think a port to x86-ELF *BSDs is not really very
-difficult, and in some ways I would like to see it happen, because
-that would force a more clear factoring of Valgrind into platform
-dependent and independent pieces.  Not to mention, *BSD folks also
-deserve to use Valgrind just as much as the Linux crew do.
-
-
-<p>
-<hr width="100%">
-
-<h2>Easy stuff which ought to be done</h2>
-
-<h3>MMX instructions</h3>
-
-MMX insns should be supported, using the same trick as for FPU insns.
-If the MMX registers are not used to copy uninitialised junk from one
-place to another in memory, this means we don't have to actually
-simulate the internal MMX unit state, so the FPU hack applies.  This
-should be fairly easy.
-
-
-
-<h3>Fix stabs-info reader</h3>
-
-The machinery in <code>vg_symtab2.c</code> which reads "stabs" style
-debugging info is pretty weak.  It usually correctly translates 
-simulated program counter values into line numbers and procedure
-names, but the file name is often completely wrong.  I think the
-logic used to parse "stabs" entries is weak.  It should be fixed.
-The simplest solution, IMO, is to copy either the logic or simply the
-code out of GNU binutils which does this; since GDB can clearly get it
-right, binutils (or GDB?) must have code to do this somewhere.
-
-
-
-
-
-<h3>BT/BTC/BTS/BTR</h3>
-
-These are x86 instructions which test, complement, set, or reset, a
-single bit in a word.  At the moment they are both incorrectly
-implemented and incorrectly instrumented.
-
-<p>
-The incorrect instrumentation is due to use of helper functions.  This
-means we lose bit-level definedness tracking, which could wind up
-giving spurious uninitialised-value use errors.  The Right Thing to do
-is to invent a couple of new UOpcodes, I think <code>GET_BIT</code>
-and <code>SET_BIT</code>, which can be used to implement all 4 x86
-insns, get rid of the helpers, and give bit-accurate instrumentation
-rules for the two new UOpcodes.
-
-<p>
-I realised the other day that they are mis-implemented too.  The x86
-insns take a bit-index and a register or memory location to access.
-For registers the bit index clearly can only be in the range zero to
-register-width minus 1, and I assumed the same applied to memory
-locations too.  But evidently not; for memory locations the index can
-be arbitrary, and the processor will index arbitrarily into memory as
-a result.  This too should be fixed.  Sigh.  Presumably indexing
-outside the immediate word is not actually used by any programs yet
-tested on Valgrind, for otherwise they (presumably) would simply not
-work at all.  If you plan to hack on this, first check the Intel docs
-to make sure my understanding is really correct.
-
-
-
-<h3>Using PREFETCH instructions</h3>
-
-Here's a small but potentially interesting project for performance
-junkies.  Experiments with valgrind's code generator and optimiser(s)
-suggest that reducing the number of instructions executed in the
-translations and mem-check helpers gives disappointingly small
-performance improvements.  Perhaps this is because performance of
-Valgrindified code is limited by cache misses.  After all, each read
-in the original program now gives rise to at least three reads, one
-for the <code>VG_(primary_map)</code>, one of the resulting
-secondary, and the original.  Not to mention, the instrumented
-translations are 13 to 14 times larger than the originals.  All in all
-one would expect the memory system to be hammered to hell and then
-some.
-
-<p>
-So here's an idea.  An x86 insn involving a read from memory, after
-instrumentation, will turn into ucode of the following form:
-<pre>
-    ... calculate effective addr, into ta and qa ...
-    TESTVL qa             -- is the addr defined?
-    LOADV (ta), qloaded   -- fetch V bits for the addr
-    LOAD  (ta), tloaded   -- do the original load
-</pre>
-At the point where the <code>LOADV</code> is done, we know the actual
-address (<code>ta</code>) from which the real <code>LOAD</code> will
-be done.  We also know that the <code>LOADV</code> will take around
-20 x86 insns to do.  So it seems plausible that doing a prefetch of
-<code>ta</code> just before the <code>LOADV</code> might just avoid a
-miss at the <code>LOAD</code> point, and that might be a significant
-performance win.
-
-<p>
-Prefetch insns are notoriously temperamental, more often than not
-making things worse rather than better, so this would require
-considerable fiddling around.  It's complicated because Intels and
-AMDs have different prefetch insns with different semantics, so that
-too needs to be taken into account.  As a general rule, even placing
-the prefetches before the <code>LOADV</code> insn is too near the
-<code>LOAD</code>; the ideal distance is apparently circa 200 CPU
-cycles.  So it might be worth having another analysis/transformation
-pass which pushes prefetches as far back as possible, hopefully 
-immediately after the effective address becomes available.
-
-<p>
-Doing too many prefetches is also bad because they soak up bus
-bandwidth / cpu resources, so some cleverness in deciding which loads
-to prefetch and which to not might be helpful.  One can imagine not
-prefetching client-stack-relative (<code>%EBP</code> or
-<code>%ESP</code>) accesses, since the stack in general tends to show
-good locality anyway.
-
-<p>
-There's quite a lot of experimentation to do here, but I think it
-might make an interesting week's work for someone.
-
-<p>
-As of 15-ish March 2002, I've started to experiment with this, using
-the AMD <code>prefetch/prefetchw</code> insns.
-
-
-
-<h3>User-defined permission ranges</h3>
-
-This is quite a large project -- perhaps a month's hacking for a
-capable hacker to do a good job -- but it's potentially very
-interesting.  The outcome would be that Valgrind could detect a 
-whole class of bugs which it currently cannot.
-
-<p>
-The presentation falls into two pieces.
-
-<p>
-<b>Part 1: user-defined address-range permission setting</b>
-<p>
-
-Valgrind intercepts the client's <code>malloc</code>,
-<code>free</code>, etc calls, watches system calls, and watches the
-stack pointer move.  This is currently the only way it knows about
-which addresses are valid and which not.  Sometimes the client program
-knows extra information about its memory areas.  For example, the
-client could at some point know that all elements of an array are
-out-of-date.  We would like to be able to convey to Valgrind this
-information that the array is now addressable-but-uninitialised, so
-that Valgrind can then warn if elements are used before they get new
-values. 
-
-<p>
-What I would like are some macros like this:
-<pre>
-   VALGRIND_MAKE_NOACCESS(addr, len)
-   VALGRIND_MAKE_WRITABLE(addr, len)
-   VALGRIND_MAKE_READABLE(addr, len)
-</pre>
-and also, to check that memory is addressable/initialised,
-<pre>
-   VALGRIND_CHECK_ADDRESSIBLE(addr, len)
-   VALGRIND_CHECK_INITIALISED(addr, len)
-</pre>
-
-<p>
-I then include in my sources a header defining these macros, rebuild
-my app, run under Valgrind, and get user-defined checks.
-
-<p>
-Now here's a neat trick.  It's a nuisance to have to re-link the app
-with some new library which implements the above macros.  So the idea
-is to define the macros so that the resulting executable is still
-completely stand-alone, and can be run without Valgrind, in which case
-the macros do nothing, but when run on Valgrind, the Right Thing
-happens.  How to do this?  The idea is for these macros to turn into a
-piece of inline assembly code, which (1) has no effect when run on the
-real CPU, (2) is easily spotted by Valgrind's JITter, and (3) no sane
-person would ever write, which is important for avoiding false matches
-in (2).  So here's a suggestion:
-<pre>
-   VALGRIND_MAKE_NOACCESS(addr, len)
-</pre>
-becomes (roughly speaking)
-<pre>
-   movl addr, %eax
-   movl len,  %ebx
-   movl $1,   %ecx   -- 1 describes the action; MAKE_WRITABLE might be
-                     -- 2, etc
-   rorl $13, %ecx
-   rorl $19, %ecx
-   rorl $11, %eax
-   rorl $21, %eax
-</pre>
-The rotate sequences have no effect, and it's unlikely they would
-appear for any other reason, but they define a unique byte-sequence
-which the JITter can easily spot.  Using the operand constraints
-section at the end of a gcc inline-assembly statement, we can tell gcc
-that the assembly fragment kills <code>%eax</code>, <code>%ebx</code>,
-<code>%ecx</code> and the condition codes, so this fragment is made
-harmless when not running on Valgrind, runs quickly when not on
-Valgrind, and does not require any other library support.
-
-
-<p>
-<b>Part 2: using it to detect interference between stack variables</b>
-<p>
-
-Currently Valgrind cannot detect errors of the following form:
-<pre>
-void fooble ( void )
-{
-   int a[10];
-   int b[10];
-   a[10] = 99;
-}
-</pre>
-Now imagine rewriting this as
-<pre>
-void fooble ( void )
-{
-   int spacer0;
-   int a[10];
-   int spacer1;
-   int b[10];
-   int spacer2;
-   VALGRIND_MAKE_NOACCESS(&spacer0, sizeof(int));
-   VALGRIND_MAKE_NOACCESS(&spacer1, sizeof(int));
-   VALGRIND_MAKE_NOACCESS(&spacer2, sizeof(int));
-   a[10] = 99;
-}
-</pre>
-Now the invalid write is certain to hit <code>spacer0</code> or
-<code>spacer1</code>, so Valgrind will spot the error.
-
-<p>
-There are two complications.
-
-<p>
-The first is that we don't want to annotate sources by hand, so the
-Right Thing to do is to write a C/C++ parser, annotator, prettyprinter
-which does this automatically, and run it on post-CPP'd C/C++ source.
-See http://www.cacheprof.org for an example of a system which
-transparently inserts another phase into the gcc/g++ compilation
-route.  The parser/prettyprinter is probably not as hard as it sounds;
-I would write it in Haskell, a powerful functional language well
-suited to doing symbolic computation, with which I am intimately
-familiar.  There is already a C parser written in Haskell by someone in
-the Haskell community, and that would probably be a good starting
-point.
-
-<p>
-The second complication is how to get rid of these
-<code>NOACCESS</code> records inside Valgrind when the instrumented
-function exits; after all, these refer to stack addresses and will
-make no sense whatever when some other function happens to re-use the
-same stack address range, probably shortly afterwards.  I think I
-would be inclined to define a special stack-specific macro
-<pre>
-   VALGRIND_MAKE_NOACCESS_STACK(addr, len)
-</pre>
-which causes Valgrind to record the client's <code>%ESP</code> at the
-time it is executed.  Valgrind will then watch for changes in
-<code>%ESP</code> and discard such records as soon as the protected
-area is uncovered by an increase in <code>%ESP</code>.  I hesitate
-with this scheme only because it is potentially expensive, if there
-are hundreds of such records, and considering that changes in
-<code>%ESP</code> already require expensive messing with stack access
-permissions.
-
-<p>
-This is probably easier and more robust than for the instrumenter 
-program to try and spot all exit points for the procedure and place
-suitable deallocation annotations there.  Plus C++ procedures can 
-bomb out at any point if they get an exception, so spotting return
-points at the source level just won't work at all.
-
-<p>
-Although some work, it's all eminently doable, and it would make
-Valgrind into an even-more-useful tool.
-
-
-<p>
-
-
-<hr width="100%">
-
-<h2>Cache profiling</h2>
-Valgrind is a very nice platform for doing cache profiling and other kinds of
-simulation, because it converts horrible x86 instructions into nice clean
-RISC-like UCode.  For example, for cache profiling we are interested in
-instructions that read and write memory;  in UCode there are only four
-instructions that do this:  <code>LOAD</code>, <code>STORE</code>,
-<code>FPU_R</code> and <code>FPU_W</code>.  By contrast, because of the x86
-addressing modes, almost every instruction can read or write memory.<p>
-
-Most of the cache profiling machinery is in the file
-<code>vg_cachesim.c</code>.<p>
-
-These notes are a somewhat haphazard guide to how Valgrind's cache profiling
-works.<p>
-
-<h3>Cost centres</h3>
-Valgrind gathers cache profiling information about every instruction executed,
-individually.  Each instruction has a <b>cost centre</b> associated with it.
-There are two kinds of cost centre: one for instructions that don't reference
-memory (<code>iCC</code>), and one for instructions that do
-(<code>idCC</code>):
-
-<pre>
-typedef struct _CC {
-   ULong a;
-   ULong m1;
-   ULong m2;
-} CC;
-
-typedef struct _iCC {
-   /* word 1 */
-   UChar tag;
-   UChar instr_size;
-
-   /* words 2+ */
-   Addr instr_addr;
-   CC I;
-} iCC;
-   
-typedef struct _idCC {
-   /* word 1 */
-   UChar tag;
-   UChar instr_size;
-   UChar data_size;
-
-   /* words 2+ */
-   Addr instr_addr;
-   CC I; 
-   CC D; 
-} idCC; 
-</pre>
-
-Each <code>CC</code> has three fields <code>a</code>, <code>m1</code>,
-<code>m2</code> for recording references, level 1 misses and level 2 misses.
-Each of these is a 64-bit <code>ULong</code> -- the numbers can get very large,
-ie. greater than 4.2 billion allowed by a 32-bit unsigned int.<p>
-
-A <code>iCC</code> has one <code>CC</code> for instruction cache accesses.  A
-<code>idCC</code> has two, one for instruction cache accesses, and one for data
-cache accesses.<p>
-
-The <code>iCC</code> and <code>idCC</code> structs also store unchanging
-information about the instruction:
-<ul>
-  <li>An instruction-type identification tag (explained below)</li><p>
-  <li>Instruction size</li><p>
-  <li>Data reference size (<code>idCC</code> only)</li><p>
-  <li>Instruction address</li><p>
-</ul>
-
-Note that data address is not one of the fields for <code>idCC</code>.  This is
-because for many memory-referencing instructions the data address can change
-each time it's executed (eg. if it uses register-offset addressing).  We have
-to give this item to the cache simulation in a different way (see
-Instrumentation section below). Some memory-referencing instructions do always
-reference the same address, but we don't try to treat them specially in order to
-keep things simple.<p>
-
-Also note that there is only room for recording info about one data cache
-access in an <code>idCC</code>.  So what about instructions that do a read then
-a write, such as:
-
-<blockquote><code>inc (%esi)</code></blockquote>
-
-In a write-allocate cache, as simulated by Valgrind, the write cannot miss,
-since it immediately follows the read which will drag the block into the cache
-if it's not already there.  So the write access isn't really interesting, and
-Valgrind doesn't record it.  This means that Valgrind doesn't measure
-memory references, but rather memory references that could miss in the cache.
-This behaviour is the same as that used by the AMD Athlon hardware counters.
-It also has the benefit of simplifying the implementation -- instructions that
-read and write memory can be treated like instructions that read memory.<p>
-
-<h3>Storing cost-centres</h3>
-Cost centres are stored in a way that makes them very cheap to lookup, which is
-important since one is looked up for every original x86 instruction
-executed.<p>
-
-Valgrind does JIT translations at the basic block level, and cost centres are
-also setup and stored at the basic block level.  By doing things carefully, we
-store all the cost centres for a basic block in a contiguous array, and lookup
-comes almost for free.<p>
-
-Consider this part of a basic block (for exposition purposes, pretend it's an
-entire basic block):
-
-<pre>
-movl $0x0,%eax
-movl $0x99, -4(%ebp)
-</pre>
-
-The translation to UCode looks like this:
-                
-<pre>
-MOVL      $0x0, t20
-PUTL      t20, %EAX
-INCEIPo   $5
-
-LEA1L     -4(t4), t14
-MOVL      $0x99, t18
-STL       t18, (t14)
-INCEIPo   $7
-</pre>
-
-The first step is to allocate the cost centres.  This requires a preliminary
-pass to count how many x86 instructions were in the basic block, and their
-types (and thus sizes).  UCode translations for single x86 instructions are
-delimited by the <code>INCEIPo</code> instruction, the argument of which gives
-the byte size of the instruction (note that lazy INCEIP updating is turned off
-to allow this).<p>
-
-We can tell if an x86 instruction references memory by looking for
-<code>LDL</code> and <code>STL</code> UCode instructions, and thus what kind of
-cost centre is required.  From this we can determine how many cost centres we
-need for the basic block, and their sizes.  We can then allocate them in a
-single array.<p>
-
-Consider the example code above.  After the preliminary pass, we know we need
-two cost centres, one <code>iCC</code> and one <code>idCC</code>.  So we
-allocate an array to store these which looks like this:
-
-<pre>
-|(uninit)|      tag         (1 byte)
-|(uninit)|      instr_size  (1 byte)
-|(uninit)|      (padding)   (2 bytes)
-|(uninit)|      instr_addr  (4 bytes)
-|(uninit)|      I.a         (8 bytes)
-|(uninit)|      I.m1        (8 bytes)
-|(uninit)|      I.m2        (8 bytes)
-
-|(uninit)|      tag         (1 byte)
-|(uninit)|      instr_size  (1 byte)
-|(uninit)|      data_size   (1 byte)
-|(uninit)|      (padding)   (1 byte)
-|(uninit)|      instr_addr  (4 bytes)
-|(uninit)|      I.a         (8 bytes)
-|(uninit)|      I.m1        (8 bytes)
-|(uninit)|      I.m2        (8 bytes)
-|(uninit)|      D.a         (8 bytes)
-|(uninit)|      D.m1        (8 bytes)
-|(uninit)|      D.m2        (8 bytes)
-</pre>
-
-(We can see now why we need tags to distinguish between the two types of cost
-centres.)<p>
-
-We also record the size of the array.  We look up the debug info of the first
-instruction in the basic block, and then stick the array into a table indexed
-by filename and function name.  This makes it easy to dump the information
-quickly to file at the end.<p>
-
-<h3>Instrumentation</h3>
-The instrumentation pass has two main jobs:
-
-<ol>
-  <li>Fill in the gaps in the allocated cost centres.</li><p>
-  <li>Add UCode to call the cache simulator for each instruction.</li><p>
-</ol>
-
-The instrumentation pass steps through the UCode and the cost centres in
-tandem.  As each original x86 instruction's UCode is processed, the appropriate
-gaps in the instruction's cost centre are filled in, for example:
-
-<pre>
-|INSTR_CC|      tag         (1 byte)
-|5       |      instr_size  (1 byte)
-|(uninit)|      (padding)   (2 bytes)
-|i_addr1 |      instr_addr  (4 bytes)
-|0       |      I.a         (8 bytes)
-|0       |      I.m1        (8 bytes)
-|0       |      I.m2        (8 bytes)
-
-|WRITE_CC|      tag         (1 byte)
-|7       |      instr_size  (1 byte)
-|4       |      data_size   (1 byte)
-|(uninit)|      (padding)   (1 byte)
-|i_addr2 |      instr_addr  (4 bytes)
-|0       |      I.a         (8 bytes)
-|0       |      I.m1        (8 bytes)
-|0       |      I.m2        (8 bytes)
-|0       |      D.a         (8 bytes)
-|0       |      D.m1        (8 bytes)
-|0       |      D.m2        (8 bytes)
-</pre>
-
-(Note that this step is not performed if a basic block is re-translated;  see
-<a href="#retranslations">here</a> for more information.)<p>
-
-GCC inserts padding before the <code>instr_addr</code> field so that it is word
-aligned.<p>
-
-The instrumentation added to call the cache simulation function looks like this
-(instrumentation is indented to distinguish it from the original UCode):
-
-<pre>
-MOVL      $0x0, t20
-PUTL      t20, %EAX
-  PUSHL     %eax
-  PUSHL     %ecx
-  PUSHL     %edx
-  MOVL      $0x4091F8A4, t46  # address of 1st CC
-  PUSHL     t46
-  CALLMo    $0x12             # second cachesim function
-  CLEARo    $0x4
-  POPL      %edx
-  POPL      %ecx
-  POPL      %eax
-INCEIPo   $5
-
-LEA1L     -4(t4), t14
-MOVL      $0x99, t18
-  MOVL      t14, t42
-STL       t18, (t14)
-  PUSHL     %eax
-  PUSHL     %ecx
-  PUSHL     %edx
-  PUSHL     t42
-  MOVL      $0x4091F8C4, t44  # address of 2nd CC
-  PUSHL     t44
-  CALLMo    $0x13             # second cachesim function
-  CLEARo    $0x8
-  POPL      %edx
-  POPL      %ecx
-  POPL      %eax
-INCEIPo   $7
-</pre>
-
-Consider the first instruction's UCode.  Each call is surrounded by three
-<code>PUSHL</code> and <code>POPL</code> instructions to save and restore the
-caller-save registers.  Then the address of the instruction's cost centre is
-pushed onto the stack, to be the first argument to the cache simulation
-function.  The address is known at this point because we are doing a
-simultaneous pass through the cost centre array.  This means the cost centre
-lookup for each instruction is almost free (just the cost of pushing an
-argument for a function call).  Then the call to the cache simulation function
-for non-memory-reference instructions is made (note that the
-<code>CALLMo</code> UInstruction takes an offset into a table of predefined
-functions;  it is not an absolute address), and the single argument is
-<code>CLEAR</code>ed from the stack.<p>
-
-The second instruction's UCode is similar.  The only difference is that, as
-mentioned before, we have to pass the address of the data item referenced to
-the cache simulation function too.  This explains the <code>MOVL t14,
-t42</code> and <code>PUSHL t42</code> UInstructions.  (Note that the seemingly
-redundant <code>MOV</code>ing will probably be optimised away during register
-allocation.)<p>
-
-Note that instead of storing unchanging information about each instruction
-(instruction size, data size, etc) in its cost centre, we could have passed in
-these arguments to the simulation function.  But this would slow the calls down
-(two or three extra arguments pushed onto the stack).  Also it would bloat the
-UCode instrumentation by amounts similar to the space required for them in the
-cost centre;  bloated UCode would also fill the translation cache more quickly,
-requiring more translations for large programs and slowing them down more.<p>
-
-<a name="retranslations"></a>
-<h3>Handling basic block retranslations</h3>
-The above description ignores one complication.  Valgrind has a limited size
-cache for basic block translations;  if it fills up, old translations are
-discarded.  If a discarded basic block is executed again, it must be
-re-translated.<p>
-
-However, we can't use this approach for profiling -- we can't throw away cost
-centres for instructions in the middle of execution!  So when a basic block is
-translated, we first look for its cost centre array in the hash table.  If
-there is no cost centre array, it must be the first translation, so we proceed
-as described above.  But if there is a cost centre array already, it must be a
-retranslation.  In this case, we skip the cost centre allocation and
-initialisation steps, but still do the UCode instrumentation step.<p>
-
-<h3>The cache simulation</h3>
-The cache simulation is fairly straightforward.  It just tracks which memory
-blocks are in the cache at the moment (it doesn't track the contents, since
-that is irrelevant).<p>
-
-The interface to the simulation is quite clean.  The functions called from the
-UCode contain calls to the simulation functions in the files
-<code>vg_cachesim_{I1,D1,L2}.c</code>;  these calls are inlined so that only
-one function call is done per simulated x86 instruction.  The file
-<code>vg_cachesim.c</code> simply <code>#include</code>s the three files
-containing the simulation, which makes plugging in new cache simulations
-very easy -- you just replace the three files and recompile.<p>
-
-<h3>Output</h3>
-Output is fairly straightforward, basically printing the cost centre for every
-instruction, grouped by files and functions.  Total counts (eg. total cache
-accesses, total L1 misses) are calculated when traversing this structure rather
-than during execution, to save time;  the cache simulation functions are called
-so often that even one or two extra adds can make a sizeable difference.<p>
-
-The output file (which is the input file for vg_annotate) has the following format:
-
-<pre>
-file         ::= desc_line* cmd_line events_line data_line+ summary_line
-desc_line    ::= "desc:" ws? non_nl_string
-cmd_line     ::= "cmd:" ws? cmd
-events_line  ::= "events:" ws? (event ws)+
-data_line    ::= file_line | fn_line | count_line
-file_line    ::= ("fl=" | "fi=" | "fe=") filename
-fn_line      ::= "fn=" fn_name
-count_line   ::= line_num ws? (count ws)+
-summary_line ::= "summary:" ws? (count ws)+
-count        ::= num | "."
-</pre>
-
-Where:
-
-<ul>
-  <li><code>non_nl_string</code> is any string not containing a newline.</li><p>
-  <li><code>cmd</code> is a command line invocation.</li><p>
-  <li><code>filename</code> and <code>fn_name</code> can be anything.</li><p>
-  <li><code>num</code> and <code>line_num</code> are decimal numbers.</li><p>
-  <li><code>ws</code> is whitespace.</li><p>
-  <li><code>nl</code> is a newline.</li><p>
-</ul>
-
-The contents of the "desc:" lines are printed out at the top of the summary.
-This is a generic way of providing simulation specific information, eg. for
-giving the cache configuration for cache simulation.<p>
-
-Counts can be "." to represent "N/A", eg. the number of write misses for an
-instruction that doesn't write to memory.<p>
-
-The number of counts in each <code>count_line</code> and the
-<code>summary_line</code> should not exceed the number of events in the
-<code>events_line</code>.  If the number in each <code>count_line</code> is less,
-vg_annotate treats those missing as though they were a "." entry.  <p>
-
-A <code>file_line</code> changes the current file name.  A <code>fn_line</code>
-changes the current function name.  A <code>count_line</code> contains counts
-that pertain to the current filename/fn_name.  A "fl=" <code>file_line</code>
-and a <code>fn_line</code> must appear before any <code>count_line</code>s to
-give the context of the first <code>count_line</code>s.<p>
-
-Each <code>file_line</code> should be immediately followed by a
-<code>fn_line</code>.  "fi=" <code>file_lines</code> are used to switch
-filenames for inlined functions; "fe=" <code>file_lines</code> are similar, but
-are put at the end of a basic block in which the file name hasn't been switched
-back to the original file name.  (fi and fe lines behave the same, they are
-only distinguished to help debugging.)<p>
-
-
-<h3>Summary of performance features</h3>
-Quite a lot of work has gone into making the profiling as fast as possible.
-This is a summary of the important features:
-
-<ul>
-  <li>The basic block-level cost centre storage allows almost free cost centre
-      lookup.</li><p>
-  
-  <li>Only one function call is made per instruction simulated;  even this
-      accounts for a sizeable percentage of execution time, but it seems
-      unavoidable if we want flexibility in the cache simulator.</li><p>
-
-  <li>Unchanging information about an instruction is stored in its cost centre,
-      avoiding unnecessary argument pushing, and minimising UCode
-      instrumentation bloat.</li><p>
-
-  <li>Summary counts are calculated at the end, rather than during
-      execution.</li><p>
-
-  <li>The <code>cachegrind.out</code> output files can contain huge amounts of
-      information; file format was carefully chosen to minimise file
-      sizes.</li><p>
-</ul>
-
-
-<h3>Annotation</h3>
-Annotation is done by vg_annotate.  It is a fairly straightforward Perl script
-that slurps up all the cost centres, and then runs through all the chosen
-source files, printing out cost centres with them.  It too has been carefully
-optimised.
-
-
-<h3>Similar work, extensions</h3>
-It would be relatively straightforward to do other simulations and obtain
-line-by-line information about interesting events.  A good example would be
-branch prediction -- all branches could be instrumented to interact with a
-branch prediction simulator, using very similar techniques to those described
-above.<p>
-
-In particular, vg_annotate would not need to change -- the file format is such
-that it is not specific to the cache simulation, but could be used for any kind
-of line-by-line information.  The only part of vg_annotate that is specific to
-the cache simulation is the name of the input file
-(<code>cachegrind.out</code>), although it would be very simple to add an
-option to control this.<p>
-
-</body>
-</html>
diff --git a/coregrind/dosyms b/coregrind/dosyms
deleted file mode 100755
index 4a46f01e6b..0000000000
--- a/coregrind/dosyms
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh
-
-# A simple script to help me ensure that my libpthread.so looks
-# from the outside, to the linker, identical to the original.
-
-nm /lib/libpthread.so.0 | grep " T " | cut -c 10- > orig-T
-nm /lib/libpthread.so.0 | grep " D " | cut -c 10- > orig-D
-nm /lib/libpthread.so.0 | grep " W " | cut -c 10- > orig-W
-
-nm ./libpthread.so | grep " T " | cut -c 10- > mine-T
-nm ./libpthread.so | grep " D " | cut -c 10- > mine-D
-nm ./libpthread.so | grep " W " | cut -c 10- > mine-W
-
-echo ========================== TEXT orig vs mine =========================
-sdiff -w 80 orig-T mine-T
-echo
-
-echo ========================== WEAK orig vs mine =========================
-sdiff -w 80 orig-W mine-W
-echo
-
-echo ========================== DATA orig vs mine =========================
-sdiff -w 80 orig-D mine-D
-echo
diff --git a/coregrind/valgrind.in b/coregrind/valgrind.in
deleted file mode 100755
index 7b99277254..0000000000
--- a/coregrind/valgrind.in
+++ /dev/null
@@ -1,194 +0,0 @@
-#!/bin/sh
-
-# Wrapper script that runs a program under Valgrind.  The @...@ tokens
-# below are placeholders -- presumably filled in at configure time;
-# confirm against the build system.
-
-# Should point to the installation directory
-prefix="@prefix@"
-exec_prefix="@exec_prefix@"
-VALGRIND="@libdir@/valgrind"
-
-
-# Other stuff ...
-version="@VERSION@"
-emailto="jseward@acm.org"
-
-# The default name of the suppressions file
-vgsupp="--suppressions=$VALGRIND/default.supp"
-
-# name we were invoked with: $0 with everything up to the last '/'
-# removed.  $0 is quoted so that a path containing whitespace or glob
-# characters reaches sed unchanged.
-vgname=`echo "$0" | sed 's,^.*/,,'`
-
-# Valgrind options
-vgopts=
-
-# Prog and arg to run
-argopts=
-
-# Show usage info?
-dousage=0
-
-# show version info?
-doversion=0
-
-# Collect up args for Valgrind
-while [ $+ != 0 ]
-do
-  arg=$1
-  case "$arg" in
-#   options for the user
-    --help)                 dousage=1; break;;
-    --version)              doversion=1; break;;
-    --logfile-fd=*)         vgopts="$vgopts $arg"; shift;;
-    -v)                     vgopts="$vgopts $arg"; shift;;
-    --verbose)              vgopts="$vgopts -v"; shift;;
-    -q)                     vgopts="$vgopts $arg"; shift;;
-    --quiet)                vgopts="$vgopts $arg"; shift;;
-    --error-limit=no)       vgopts="$vgopts $arg"; shift;;
-    --error-limit=yes)      vgopts="$vgopts $arg"; shift;;
-    --check-addrVs=no)      vgopts="$vgopts $arg"; shift;;
-    --check-addrVs=yes)     vgopts="$vgopts $arg"; shift;;
-    --gdb-attach=no)        vgopts="$vgopts $arg"; shift;;
-    --gdb-attach=yes)       vgopts="$vgopts $arg"; shift;;
-    --demangle=no)          vgopts="$vgopts $arg"; shift;;
-    --demangle=yes)         vgopts="$vgopts $arg"; shift;;
-    --num-callers=*)        vgopts="$vgopts $arg"; shift;;
-    --partial-loads-ok=no)  vgopts="$vgopts $arg"; shift;;
-    --partial-loads-ok=yes) vgopts="$vgopts $arg"; shift;;
-    --leak-check=no)        vgopts="$vgopts $arg"; shift;;
-    --leak-check=yes)       vgopts="$vgopts $arg"; shift;;
-    --show-reachable=no)    vgopts="$vgopts $arg"; shift;;
-    --show-reachable=yes)   vgopts="$vgopts $arg"; shift;;
-    --leak-resolution=low)  vgopts="$vgopts $arg"; shift;;
-    --leak-resolution=med)  vgopts="$vgopts $arg"; shift;;
-    --leak-resolution=high) vgopts="$vgopts $arg"; shift;;
-    --sloppy-malloc=no)     vgopts="$vgopts $arg"; shift;;
-    --sloppy-malloc=yes)    vgopts="$vgopts $arg"; shift;;
-    --alignment=*)          vgopts="$vgopts $arg"; shift;;
-    --trace-children=no)    vgopts="$vgopts $arg"; shift;;
-    --trace-children=yes)   vgopts="$vgopts $arg"; shift;;
-    --workaround-gcc296-bugs=no)    vgopts="$vgopts $arg"; shift;;
-    --workaround-gcc296-bugs=yes)   vgopts="$vgopts $arg"; shift;;
-    --freelist-vol=*)       vgopts="$vgopts $arg"; shift;;
-    --suppressions=*)       vgopts="$vgopts $arg"; shift;;
-    --cachesim=yes)         vgopts="$vgopts $arg"; shift;;
-    --cachesim=no)          vgopts="$vgopts $arg"; shift;;
-    --I1=*,*,*)             vgopts="$vgopts $arg"; shift;;
-    --D1=*,*,*)             vgopts="$vgopts $arg"; shift;;
-    --L2=*,*,*)             vgopts="$vgopts $arg"; shift;;
-    --weird-hacks=*)        vgopts="$vgopts $arg"; shift;;
-#   options for debugging Valgrind
-    --sanity-level=*)       vgopts="$vgopts $arg"; shift;;
-    --single-step=yes)      vgopts="$vgopts $arg"; shift;;
-    --single-step=no)       vgopts="$vgopts $arg"; shift;;
-    --optimise=yes)         vgopts="$vgopts $arg"; shift;;
-    --optimise=no)          vgopts="$vgopts $arg"; shift;;
-    --instrument=yes)       vgopts="$vgopts $arg"; shift;;
-    --instrument=no)        vgopts="$vgopts $arg"; shift;;
-    --cleanup=yes)          vgopts="$vgopts $arg"; shift;;
-    --cleanup=no)           vgopts="$vgopts $arg"; shift;;
-    --smc-check=none)       vgopts="$vgopts $arg"; shift;;
-    --smc-check=some)       vgopts="$vgopts $arg"; shift;;
-    --smc-check=all)        vgopts="$vgopts $arg"; shift;;
-    --trace-syscalls=yes)   vgopts="$vgopts $arg"; shift;;
-    --trace-syscalls=no)    vgopts="$vgopts $arg"; shift;;
-    --trace-signals=yes)    vgopts="$vgopts $arg"; shift;;
-    --trace-signals=no)     vgopts="$vgopts $arg"; shift;;
-    --trace-symtab=yes)     vgopts="$vgopts $arg"; shift;;
-    --trace-symtab=no)      vgopts="$vgopts $arg"; shift;;
-    --trace-malloc=yes)     vgopts="$vgopts $arg"; shift;;
-    --trace-malloc=no)      vgopts="$vgopts $arg"; shift;;
-    --trace-sched=yes)      vgopts="$vgopts $arg"; shift;;
-    --trace-sched=no)       vgopts="$vgopts $arg"; shift;;
-    --trace-pthread=none)   vgopts="$vgopts $arg"; shift;;
-    --trace-pthread=some)   vgopts="$vgopts $arg"; shift;;
-    --trace-pthread=all)    vgopts="$vgopts $arg"; shift;;
-    --stop-after=*)         vgopts="$vgopts $arg"; shift;;
-    --dump-error=*)         vgopts="$vgopts $arg"; shift;;
-    -*)                     dousage=1; break;;
-    *)                      break;;
-  esac
-done
-
-if [ z"$doversion" = z1 ]; then
-   echo "valgrind-$version"
-   exit 1
-fi
-
-# Print the usage text and exit with status 1 when no program was
-# given or when usage was requested.  The text is a single
-# here-document: $vgname and $emailto are expanded, and \$ produces a
-# literal dollar sign.
-if [ $# = 0 ] || [ z"$dousage" = z1 ]; then
-   cat <<EOF
-
-usage: $vgname [options] prog-and-args
-
-  options for the user, with defaults in [ ], are:
-    --help                    show this message
-    --version                 show version
-    -q --quiet                run silently; only print error msgs
-    -v --verbose              be more verbose, incl counts of errors
-    --gdb-attach=no|yes       start GDB when errors detected? [no]
-    --demangle=no|yes         automatically demangle C++ names? [yes]
-    --num-callers=<number>    show <num> callers in stack traces [4]
-    --error-limit=no|yes      stop showing new errors if too many? [yes]
-    --partial-loads-ok=no|yes too hard to explain here; see manual [yes]
-    --leak-check=no|yes       search for memory leaks at exit? [no]
-    --leak-resolution=low|med|high
-                              amount of bt merging in leak check [low]
-    --show-reachable=no|yes   show reachable blocks in leak check? [no]
-    --sloppy-malloc=no|yes    round malloc sizes to next word? [no]
-    --alignment=<number>      set minimum alignment of allocations [4]
-    --trace-children=no|yes   Valgrind-ise child processes? [no]
-    --logfile-fd=<number>     file descriptor for messages [2=stderr]
-    --freelist-vol=<number>   volume of freed blocks queue [1000000]
-    --workaround-gcc296-bugs=no|yes  self explanatory [no]
-    --suppressions=<filename> suppress errors described in
-                              suppressions file <filename>
-    --check-addrVs=no|yes     experimental lighterweight checking? [yes]
-                              yes == Valgrind's original behaviour
-    --cachesim=no|yes         do cache profiling? [no]
-    --I1=<size>,<assoc>,<line_size>  set I1 cache manually
-    --D1=<size>,<assoc>,<line_size>  set D1 cache manually
-    --L2=<size>,<assoc>,<line_size>  set L2 cache manually
-    --weird-hacks=hack1,hack2,...  [no hacks selected]
-         recognised hacks are: ioctl-VTIME truncate-writes
-
-
-  options for debugging Valgrind itself are:
-    --sanity-level=<number>   level of sanity checking to do [1]
-    --single-step=no|yes      translate each instr separately? [no]
-    --optimise=no|yes         improve intermediate code? [yes]
-    --instrument=no|yes       actually do memory checks? [yes]
-    --cleanup=no|yes          improve after instrumentation? [yes]
-    --smc-check=none|some|all check writes for s-m-c? [some]
-    --trace-syscalls=no|yes   show all system calls? [no]
-    --trace-signals=no|yes    show signal handling details? [no]
-    --trace-symtab=no|yes     show symbol table details? [no]
-    --trace-malloc=no|yes     show client malloc details? [no]
-    --trace-sched=no|yes      show thread scheduler details? [no]
-    --trace-pthread=none|some|all  show pthread event details? [no]
-    --stop-after=<number>     switch to real CPU after executing
-                              <number> basic blocks [infinity]
-    --dump-error=<number>     show translation for basic block
-                              associated with <number>'th
-                              error context [0=don't show any]
-
-  Extra options are read from env variable \$VALGRIND_OPTS
-
-  Valgrind is Copyright (C) 2000-2002 Julian Seward
-  and licensed under the GNU General Public License, version 2.
-  Bug reports, feedback, admiration, abuse, etc, to: $emailto.
-
-EOF
-   exit 1
-fi
-
-# A bit subtle.  The LD_PRELOAD added entry must be absolute
-# and not depend on LD_LIBRARY_PATH.  This is so that we can
-# mess with LD_LIBRARY_PATH for child processes, which makes
-# libpthread.so fall out of visibility, independently of
-# whether valgrind.so is visible.
-
-# Options reach Valgrind through VG_ARGS: user defaults from
-# $VALGRIND_OPTS first, then the default suppressions file, then the
-# options collected from the command line.
-VG_ARGS="$VALGRIND_OPTS $vgsupp $vgopts"
-export VG_ARGS
-
-# Prepend our directory / library to any inherited value.  The
-# ${VAR:+:$VAR} form adds the ':' separator only when the inherited
-# value is non-empty; an unconditional "$VALGRIND:" would leave an
-# empty entry in LD_LIBRARY_PATH, which the dynamic linker treats as
-# the current directory.
-LD_LIBRARY_PATH=$VALGRIND${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
-export LD_LIBRARY_PATH
-LD_PRELOAD=$VALGRIND/valgrind.so${LD_PRELOAD:+:$LD_PRELOAD}
-export LD_PRELOAD
-#LD_DEBUG=files
-#LD_DEBUG=symbols
-#export LD_DEBUG
-
-# Replace this shell with the client program and its arguments.
-exec "$@"
diff --git a/coregrind/vg_clientfuncs.c b/coregrind/vg_clientfuncs.c
deleted file mode 100644
index 80bdae6714..0000000000
--- a/coregrind/vg_clientfuncs.c
+++ /dev/null
@@ -1,574 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Code which runs on the simulated CPU.                        ---*/
-/*---                                             vg_clientfuncs.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-#include "vg_constants.h"
-
-#include "valgrind.h"   /* for VALGRIND_MAGIC_SEQUENCE */
-
-
-/* ---------------------------------------------------------------------
-   All the code in this file runs on the SIMULATED CPU.  It is
-   intended for various reasons as drop-in replacements for libc
-   functions.  These functions have global visibility (obviously) and
-   have no prototypes in vg_include.h, since they are not intended to
-   be called from within Valgrind.
-   ------------------------------------------------------------------ */
-
-/* ---------------------------------------------------------------------
-   Intercepts for the GNU malloc interface.
-   ------------------------------------------------------------------ */
-
-/* Issue a client request carrying one argument and evaluate to its
-   result.  Written as a GCC statement expression: the request code
-   and argument are handed to VALGRIND_MAGIC_SEQUENCE with the three
-   remaining argument slots set to 0 and a default result of 0, and
-   the unsigned int result is the value of the expression. */
-#define SIMPLE_REQUEST1(_qyy_request, _qyy_arg1)                 \
-   ({unsigned int _qyy_res;                                      \
-    VALGRIND_MAGIC_SEQUENCE(_qyy_res, 0 /* default return */,    \
-                            _qyy_request,                        \
-                            _qyy_arg1, 0, 0, 0);                 \
-    _qyy_res;                                                    \
-   })
-
-/* As SIMPLE_REQUEST1, but passes two arguments; the two remaining
-   argument slots are set to 0. */
-#define SIMPLE_REQUEST2(_qyy_request, _qyy_arg1, _qyy_arg2)      \
-   ({unsigned int _qyy_res;                                      \
-    VALGRIND_MAGIC_SEQUENCE(_qyy_res, 0 /* default return */,    \
-                            _qyy_request,                        \
-                            _qyy_arg1, _qyy_arg2, 0, 0);         \
-    _qyy_res;                                                    \
-   })
-
-
-/* Below are new versions of malloc, __builtin_new, free, 
-   __builtin_delete, calloc and realloc.
-
-   malloc, __builtin_new, free, __builtin_delete, calloc and realloc
-   can be entered either on the real CPU or the simulated one.  If on
-   the real one, this is because the dynamic linker is running the
-   static initialisers for C++, before starting up Valgrind itself.
-   In this case it is safe to route calls through to
-   VG_(malloc)/VG_(free), since that is self-initialising.
-
-   Once Valgrind is initialised, VG_(running_on_simd_CPU) becomes True.
-   The call needs to be transferred from the simulated CPU back to the
-   real one and routed to the vg_client_* functions.  To do that, the
-   client-request mechanism (in valgrind.h) is used to convey requests
-   to the scheduler.
-*/
-
-/* ALL calls to malloc wind up here.
-
-   A negative size produces a warning and a NULL result.  Otherwise,
-   with VG_(clo_sloppy_malloc) set the size is first rounded up to a
-   multiple of 4, and the allocation is made through a
-   VG_USERREQ__MALLOC client request when VG_(running_on_simd_CPU) is
-   set, or directly with VG_(malloc) when it is not.  Entry and result
-   are traced when VG_(clo_trace_malloc) is set. */
-void* malloc ( Int n )
-{
-   void* result = NULL;
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("malloc[simd=%d](%d)",
-                  (UInt)VG_(running_on_simd_CPU), n );
-   }
-
-   if (n >= 0) {
-      if (VG_(clo_sloppy_malloc)) {
-         while ((n % 4) > 0)
-            n++;
-      }
-      if (VG_(running_on_simd_CPU))
-         result = (void*)SIMPLE_REQUEST1(VG_USERREQ__MALLOC, n);
-      else
-         result = VG_(malloc)(VG_AR_CLIENT, n);
-   } else {
-      VG_(message)(Vg_UserMsg,
-         "Warning: silly arg (%d) to malloc()", n );
-   }
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)(" = %p\n", result );
-   }
-   return result;
-}
-
-
-/* Replacement for __builtin_new.  Same scheme as malloc in this file:
-   a negative size warns and yields NULL; otherwise the size is
-   optionally rounded up to a multiple of 4 and the block is obtained
-   via a VG_USERREQ__BUILTIN_NEW client request on the simulated CPU,
-   or via VG_(malloc) otherwise. */
-void* __builtin_new ( Int n )
-{
-   void* result = NULL;
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("__builtin_new[simd=%d](%d)",
-                  (UInt)VG_(running_on_simd_CPU), n );
-   }
-
-   if (n >= 0) {
-      if (VG_(clo_sloppy_malloc)) {
-         while ((n % 4) > 0)
-            n++;
-      }
-      if (VG_(running_on_simd_CPU))
-         result = (void*)SIMPLE_REQUEST1(VG_USERREQ__BUILTIN_NEW, n);
-      else
-         result = VG_(malloc)(VG_AR_CLIENT, n);
-   } else {
-      VG_(message)(Vg_UserMsg,
-         "Warning: silly arg (%d) to __builtin_new()", n );
-   }
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)(" = %p\n", result );
-   }
-   return result;
-}
-
-
-/* Replacement for __builtin_vec_new.  Identical in structure to
-   __builtin_new, but issues VG_USERREQ__BUILTIN_VEC_NEW when running
-   on the simulated CPU. */
-void* __builtin_vec_new ( Int n )
-{
-   void* result = NULL;
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("__builtin_vec_new[simd=%d](%d)",
-                  (UInt)VG_(running_on_simd_CPU), n );
-   }
-
-   if (n >= 0) {
-      if (VG_(clo_sloppy_malloc)) {
-         while ((n % 4) > 0)
-            n++;
-      }
-      if (VG_(running_on_simd_CPU))
-         result = (void*)SIMPLE_REQUEST1(VG_USERREQ__BUILTIN_VEC_NEW, n);
-      else
-         result = VG_(malloc)(VG_AR_CLIENT, n);
-   } else {
-      VG_(message)(Vg_UserMsg,
-         "Warning: silly arg (%d) to __builtin_vec_new()", n );
-   }
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)(" = %p\n", result );
-   }
-   return result;
-}
-
-
-/* Replacement for free.  A NULL pointer is ignored (after tracing).
-   Otherwise the block is released through a VG_USERREQ__FREE client
-   request when on the simulated CPU, or with VG_(free) when not. */
-void free ( void* p )
-{
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("free[simd=%d](%p)\n",
-                  (UInt)VG_(running_on_simd_CPU), p );
-   }
-
-   if (p != NULL) {
-      if (VG_(running_on_simd_CPU))
-         (void)SIMPLE_REQUEST1(VG_USERREQ__FREE, p);
-      else
-         VG_(free)(VG_AR_CLIENT, p);
-   }
-}
-
-
-/* Replacement for __builtin_delete.  Same as free in this file, but
-   issues VG_USERREQ__BUILTIN_DELETE on the simulated CPU. */
-void __builtin_delete ( void* p )
-{
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("__builtin_delete[simd=%d](%p)\n",
-                  (UInt)VG_(running_on_simd_CPU), p );
-   }
-
-   if (p != NULL) {
-      if (VG_(running_on_simd_CPU))
-         (void)SIMPLE_REQUEST1(VG_USERREQ__BUILTIN_DELETE, p);
-      else
-         VG_(free)(VG_AR_CLIENT, p);
-   }
-}
-
-
-/* Replacement for __builtin_vec_delete.  Same as free in this file,
-   but issues VG_USERREQ__BUILTIN_VEC_DELETE on the simulated CPU. */
-void __builtin_vec_delete ( void* p )
-{
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("__builtin_vec_delete[simd=%d](%p)\n",
-                  (UInt)VG_(running_on_simd_CPU), p );
-   }
-
-   if (p != NULL) {
-      if (VG_(running_on_simd_CPU))
-         (void)SIMPLE_REQUEST1(VG_USERREQ__BUILTIN_VEC_DELETE, p);
-      else
-         VG_(free)(VG_AR_CLIENT, p);
-   }
-}
-
-
-/* Replacement for calloc.  If either argument is negative a warning
-   is issued and NULL returned.  Otherwise both arguments are passed
-   on unchanged, through a VG_USERREQ__CALLOC client request on the
-   simulated CPU or to VG_(calloc) otherwise. */
-void* calloc ( Int nmemb, Int size )
-{
-   void* result = NULL;
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("calloc[simd=%d](%d,%d)",
-                  (UInt)VG_(running_on_simd_CPU), nmemb, size );
-   }
-
-   if (nmemb >= 0 && size >= 0) {
-      if (VG_(running_on_simd_CPU))
-         result = (void*)SIMPLE_REQUEST2(VG_USERREQ__CALLOC, nmemb, size);
-      else
-         result = VG_(calloc)(VG_AR_CLIENT, nmemb, size);
-   } else {
-      VG_(message)(Vg_UserMsg, "Warning: silly args (%d,%d) to calloc()",
-                               nmemb, size );
-   }
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)(" = %p\n", result );
-   }
-   return result;
-}
-
-
-/* Replacement for realloc.
-
-   The new size is first rounded up to a multiple of 4 when
-   VG_(clo_sloppy_malloc) is set.  Then:
-     - a NULL pointer turns the call into malloc(new_size);
-     - a size <= 0 frees the block and returns NULL;
-     - otherwise the block is resized through a VG_USERREQ__REALLOC
-       client request on the simulated CPU, or VG_(realloc) if not. */
-void* realloc ( void* ptrV, Int new_size )
-{
-   void* result;
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("realloc[simd=%d](%p,%d)",
-                  (UInt)VG_(running_on_simd_CPU), ptrV, new_size );
-   }
-
-   if (VG_(clo_sloppy_malloc)) {
-      while ((new_size % 4) > 0)
-         new_size++;
-   }
-
-   if (ptrV == NULL) {
-      return malloc(new_size);
-   }
-
-   if (new_size <= 0) {
-      free(ptrV);
-      if (VG_(clo_trace_malloc)) {
-         VG_(printf)(" = 0\n" );
-      }
-      return NULL;
-   }
-
-   if (VG_(running_on_simd_CPU))
-      result = (void*)SIMPLE_REQUEST2(VG_USERREQ__REALLOC, ptrV, new_size);
-   else
-      result = VG_(realloc)(VG_AR_CLIENT, ptrV, new_size);
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)(" = %p\n", result );
-   }
-   return result;
-}
-
-
-/* Replacement for memalign.  A negative size yields NULL (silently).
-   Otherwise the size is optionally rounded up to a multiple of 4 and
-   the block is obtained through a VG_USERREQ__MEMALIGN client request
-   on the simulated CPU, or VG_(malloc_aligned) otherwise. */
-void* memalign ( Int alignment, Int n )
-{
-   void* result = NULL;
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)("memalign[simd=%d](al %d, size %d)",
-                  (UInt)VG_(running_on_simd_CPU), alignment, n );
-   }
-
-   if (n >= 0) {
-      if (VG_(clo_sloppy_malloc)) {
-         while ((n % 4) > 0)
-            n++;
-      }
-      if (VG_(running_on_simd_CPU))
-         result = (void*)SIMPLE_REQUEST2(VG_USERREQ__MEMALIGN, alignment, n);
-      else
-         result = VG_(malloc_aligned)(VG_AR_CLIENT, alignment, n);
-   }
-
-   if (VG_(clo_trace_malloc)) {
-      VG_(printf)(" = %p\n", result );
-   }
-   return result;
-}
-
-
-/* Replacement for valloc: memalign with an alignment of
-   VKI_BYTES_PER_PAGE. */
-void* valloc ( Int size )
-{
-   void* block = memalign(VKI_BYTES_PER_PAGE, size);
-   return block;
-}
-
-
-/* Various compatibility wrapper functions, for glibc and libstdc++. */
-
-/* cfree simply forwards to free. */
-void cfree ( void* p )
-{
-   free(p);
-   return;
-}
-
-
-/* mallopt stub: both arguments are ignored and success is reported
-   unconditionally. */
-int mallopt ( int cmd, int value )
-{
-   /* In glibc-2.2.4, 1 denotes a successful return value for mallopt */
-   const int success = 1;
-   return success;
-}
-
-
-/* Allocate size bytes aligned to alignment and store the address in
-   *memptr.
-
-   Returns 0 on success, VKI_EINVAL if alignment is not a non-zero
-   power of two that is also a multiple of sizeof(void *), and
-   VKI_ENOMEM if memalign returns NULL.  *memptr is written only on
-   success. */
-int __posix_memalign ( void **memptr, UInt alignment, UInt size )
-{
-    void *mem;
-
-    /* Test whether the ALIGNMENT argument is valid.  It must be a
-       power of two multiple of sizeof (void *).  Zero passes the
-       bit test below, so it is rejected explicitly.  */
-    if (alignment == 0
-        || alignment % sizeof (void *) != 0
-        || (alignment & (alignment - 1)) != 0)
-       return VKI_EINVAL /*22*/ /*EINVAL*/;
-
-    mem = memalign (alignment, size);
-
-    if (mem != NULL) {
-       *memptr = mem;
-       return 0;
-    }
-
-    return VKI_ENOMEM /*12*/ /*ENOMEM*/;
-}
-
-
-/* Unsupported malloc-family entry points: each one just panics with
-   a message naming the function that was called. */
-/* HACK: We shouldn't call VG_(panic) or VG_(message) on the simulated
-   CPU.  Really we should pass the request in the usual way, and
-   Valgrind itself can do the panic.  Too tedious, however.  
-*/
-void pvalloc ( void )
-{
-   VG_(panic)("call to pvalloc\n");
-}
-
-void malloc_stats ( void )
-{
-   VG_(panic)("call to malloc_stats\n");
-}
-
-void malloc_usable_size ( void )
-{
-   VG_(panic)("call to malloc_usable_size\n");
-}
-
-void malloc_trim ( void )
-{
-   VG_(panic)("call to malloc_trim\n");
-}
-
-void malloc_get_state ( void )
-{
-   VG_(panic)("call to malloc_get_state\n");
-}
-
-void malloc_set_state ( void )
-{
-   VG_(panic)("call to malloc_set_state\n");
-}
-
-
-/* Yet another ugly hack.  Cannot include <malloc.h> because we
-   implement functions implemented there with different signatures.
-   This struct definition MUST match the system one. */
-
-/* SVID2/XPG mallinfo structure */
-/* Local copy of the structure returned by mallinfo(): ten int fields
-   in the order below.  Do not reorder, add or remove fields. */
-struct mallinfo {
-   int arena;    /* total space allocated from system */
-   int ordblks;  /* number of non-inuse chunks */
-   int smblks;   /* unused -- always zero */
-   int hblks;    /* number of mmapped regions */
-   int hblkhd;   /* total space in mmapped regions */
-   int usmblks;  /* unused -- always zero */
-   int fsmblks;  /* unused -- always zero */
-   int uordblks; /* total allocated space */
-   int fordblks; /* total non-inuse space */
-   int keepcost; /* top-most, releasable (via malloc_trim) space */
-};
-
-/* mallinfo stub: returns a struct mallinfo with every byte cleared,
-   i.e. all statistics reported as zero. */
-struct mallinfo mallinfo ( void )
-{
-   /* Should really try to return something a bit more meaningful */
-   struct mallinfo info;
-   UChar*          bytes = (UChar*)(&info);
-   Int             k     = 0;
-
-   while (k < sizeof(info)) {
-      bytes[k] = 0;
-      k++;
-   }
-   return info;
-}
-
-
-/* ---------------------------------------------------------------------
-   Replace some C lib things with equivs which don't get
-   spurious value warnings.  THEY RUN ON SIMD CPU!
-   ------------------------------------------------------------------ */
-
-/* Return a pointer to the last byte in s equal to (unsigned char)c,
-   or NULL if there is none.  The terminating zero byte is part of the
-   search, so c == 0 finds the terminator. */
-char* strrchr ( const char* s, int c )
-{
-   UChar  wanted = (UChar)((UInt)c);
-   UChar* cur;
-   UChar* found  = NULL;
-
-   for (cur = (UChar*)s; ; cur++) {
-      if (*cur == wanted)
-         found = cur;
-      if (*cur == 0)
-         break;
-   }
-   return found;
-}
-
-/* Return a pointer to the first byte in s equal to (unsigned char)c,
-   or NULL if the terminator is reached first.  The terminator itself
-   is matched when c == 0. */
-char* strchr ( const char* s, int c )
-{
-   UChar  wanted = (UChar)((UInt)c);
-   UChar* cur;
-
-   for (cur = (UChar*)s; *cur != wanted; cur++) {
-      if (*cur == 0)
-         return NULL;
-   }
-   return cur;
-}
-
-/* Append src (including a terminating zero) to the end of the string
-   at dest, and return dest. */
-char* strcat ( char* dest, const char* src )
-{
-   Char* start = dest;
-
-   /* Find the terminator of the existing string. */
-   while (*dest != 0)
-      dest++;
-
-   for ( ; *src != 0; src++, dest++)
-      *dest = *src;
-   *dest = 0;
-
-   return start;
-}
-
-/* Return the number of bytes in str before the terminating zero. */
-unsigned int strlen ( const char* str )
-{
-   UInt count;
-
-   for (count = 0; str[count] != 0; count++)
-      ;
-   return count;
-}
-
-/* Copy the string at src, including its terminating zero, to dest,
-   and return dest. */
-char* strcpy ( char* dest, const char* src )
-{
-   Char* start = dest;
-
-   for ( ; *src != 0; src++, dest++)
-      *dest = *src;
-   *dest = 0;
-
-   return start;
-}
-
-/* Compare at most nmax bytes of s1 and s2 as unsigned chars.
-   Returns 0 if they agree over that range (or both end together),
-   -1 if s1 ends first or has the smaller byte at the first
-   difference, and 1 in the opposite case. */
-int strncmp ( const char* s1, const char* s2, unsigned int nmax )
-{
-   unsigned int i;
-
-   for (i = 0; i < nmax; i++) {
-      UChar a = ((UChar*)s1)[i];
-      UChar b = ((UChar*)s2)[i];
-
-      if (a == 0 && b == 0) return 0;
-      if (a == 0) return -1;
-      if (b == 0) return 1;
-
-      if (a < b) return -1;
-      if (a > b) return 1;
-   }
-   return 0;
-}
-
-/* Compare the strings s1 and s2.  Returns 0 if they are equal, -1 if
-   s1 has the smaller byte at the first difference, 1 otherwise.
-
-   Bytes are compared as unsigned char, as strncmp in this file does;
-   comparing plain char would order bytes >= 0x80 before ASCII where
-   char is signed. */
-int strcmp ( const char* s1, const char* s2 )
-{
-   register unsigned char c1, c2;
-   while (True) {
-      c1 = *(const unsigned char*)s1;
-      c2 = *(const unsigned char*)s2;
-      if (c1 != c2) break;
-      if (c1 == 0) break;
-      s1++; s2++;
-   }
-   if (c1 < c2) return -1;
-   if (c1 > c2) return 1;
-   return 0;
-}
-
-/* Return a pointer to the first of the n bytes at s that equals
-   (unsigned char)c, or NULL if none does. */
-void* memchr(const void *s, int c, unsigned int n)
-{
-   UChar  wanted = (UChar)c;
-   UChar* cur    = (UChar*)s;
-
-   while (n > 0) {
-      if (*cur == wanted)
-         return (void*)cur;
-      cur++;
-      n--;
-   }
-   return NULL;
-}
-
-/* Copy len bytes from src to dst and return dst.  The direction of
-   the copy follows the relative order of the pointers: from the last
-   byte downwards when dst is above src, from the first byte upwards
-   when dst is below src.  Nothing is copied when they are equal.
-   Bytes are moved four per iteration while at least four remain. */
-void* memcpy( void *dst, const void *src, unsigned int len )
-{
-    register char *to;
-    register char *from;
-
-    if ( dst == src )
-        return dst;
-
-    if ( dst > src ) {
-        /* Start one past the end and pre-decrement. */
-        to   = (char *)dst + len;
-        from = (char *)src + len;
-        for ( ; len >= 4; len -= 4 ) {
-            *--to = *--from;
-            *--to = *--from;
-            *--to = *--from;
-            *--to = *--from;
-        }
-        for ( ; len > 0; len-- )
-            *--to = *--from;
-    } else {
-        to   = (char *)dst;
-        from = (char *)src;
-        for ( ; len >= 4; len -= 4 ) {
-            *to++ = *from++;
-            *to++ = *from++;
-            *to++ = *from++;
-            *to++ = *from++;
-        }
-        for ( ; len > 0; len-- )
-            *to++ = *from++;
-    }
-    return dst;
-}
-
-
-/* ---------------------------------------------------------------------
-   Horrible hack to make sigsuspend() sort-of work OK.  Same trick as
-   for pause() in vg_libpthread.so.
-   ------------------------------------------------------------------ */
-
-/* Horrible because
-
-   -- uses VG_(ksigprocmask), VG_(nanosleep) and vg_assert, which are 
-      valgrind-native (not intended for client use).
-
-   -- This is here so single-threaded progs (not linking libpthread.so)
-      can see it.  But pause() should also be here.  ???
-*/
-
-/* Either libc supplies this (weak) or our libpthread.so supplies it
-   (strong) in a threaded setting. 
-*/
-extern int* __errno_location ( void );
-
-
-/* Replacement for sigsuspend.
-
-   Installs mask as the signal mask, then polls the count obtained
-   from the VG_USERREQ__GET_N_SIGS_RETURNED request, sleeping between
-   polls, until it differs from the value read on entry.  Then sets
-   errno to EINTR and returns -1.  The previous signal mask is not
-   saved or restored here. */
-int sigsuspend ( /* const sigset_t * */ void* mask)
-{
-   unsigned int n_orig, n_now;
-   struct vki_timespec nanosleep_interval;
-
-   /* 0xFFFFFFFF is the default result of the request; the assertions
-      below check that the request actually produced a value. */
-   VALGRIND_MAGIC_SEQUENCE(n_orig, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__GET_N_SIGS_RETURNED, 
-                           0, 0, 0, 0);
-   vg_assert(n_orig != 0xFFFFFFFF);
-
-   VG_(ksigprocmask)(VKI_SIG_SETMASK, mask, NULL);
-
-   while (1) {
-      VALGRIND_MAGIC_SEQUENCE(n_now, 0xFFFFFFFF /* default */,
-                              VG_USERREQ__GET_N_SIGS_RETURNED, 
-                              0, 0, 0, 0);
-      vg_assert(n_now != 0xFFFFFFFF);
-      vg_assert(n_now >= n_orig);
-      if (n_now != n_orig) break;
-
-      nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 53 * 1000 * 1000; /* 53 milliseconds */
-      /* It's critical here that valgrind's nanosleep implementation
-         is nonblocking. */
-      VG_(nanosleep)( &nanosleep_interval, NULL);
-   }
-
-   /* Maybe this is OK both in single and multithreaded setting. */
-   /* errno holds the positive error number; the VKI_E* constants are
-      used as positive values elsewhere in this file (VKI_EINVAL,
-      VKI_ENOMEM), so VKI_EINTR must not be negated here. */
-   * (__errno_location()) = VKI_EINTR; /* == EINTR; */ 
-   return -1;
-}
-
-
-/* ---------------------------------------------------------------------
-   Hook for running __libc_freeres once the program exits.
-   ------------------------------------------------------------------ */
-
-/* Call __libc_freeres() and then issue the
-   VG_USERREQ__LIBC_FREERES_DONE client request.  The code after the
-   request is marked NOTREACHED -- presumably the request never
-   returns control here; confirm against the scheduler. */
-void VG_(__libc_freeres_wrapper)( void )
-{
-   int res;
-   extern void __libc_freeres(void);
-   __libc_freeres();
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__LIBC_FREERES_DONE, 0, 0, 0, 0);
-   /*NOTREACHED*/
-   /* Deliberately false assertion: fails loudly if execution ever
-      continues past the request above. */
-   vg_assert(12345+54321 == 999999);
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                         vg_clientfuncs.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_clientmalloc.c b/coregrind/vg_clientmalloc.c
deleted file mode 100644
index 0292aa404d..0000000000
--- a/coregrind/vg_clientmalloc.c
+++ /dev/null
@@ -1,579 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- An implementation of malloc/free for the client.             ---*/
-/*---                                            vg_clientmalloc.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-
-/*------------------------------------------------------------*/
-/*--- Defns                                                ---*/
-/*------------------------------------------------------------*/
-
-/* #define DEBUG_CLIENTMALLOC */
-
-/* Holds malloc'd but not freed blocks. */
-#define VG_MALLOCLIST_NO(aa) (((UInt)(aa)) % VG_N_MALLOCLISTS)
-static ShadowChunk* vg_malloclist[VG_N_MALLOCLISTS];
-static Bool         vg_client_malloc_init_done = False;
-
-/* Holds blocks after freeing. */
-static ShadowChunk* vg_freed_list_start   = NULL;
-static ShadowChunk* vg_freed_list_end     = NULL;
-static Int          vg_freed_list_volume  = 0;
-
-/* Stats ... */
-static UInt         vg_cmalloc_n_mallocs  = 0;
-static UInt         vg_cmalloc_n_frees    = 0;
-static UInt         vg_cmalloc_bs_mallocd = 0;
-
-static UInt         vg_mlist_frees = 0;
-static UInt         vg_mlist_tries = 0;
-
-
-/*------------------------------------------------------------*/
-/*--- Fns                                                  ---*/
-/*------------------------------------------------------------*/
-
-/* Snapshot every shadow chunk currently on the malloc lists.
-   Returns an array, allocated with VG_(malloc) in VG_AR_PRIVATE,
-   holding one pointer per chunk, and stores the number of entries in
-   *n_shadows.  If the lists are all empty, *n_shadows is set to zero
-   and NULL is returned.  This is used by the memory-leak detector.
-*/
-ShadowChunk** VG_(get_malloc_shadows) ( /*OUT*/ UInt* n_shadows )
-{
-   UInt          bucket;
-   UInt          n_found  = 0;
-   UInt          n_copied = 0;
-   ShadowChunk*  chunk;
-   ShadowChunk** result;
-
-   /* Pass 1: count the chunks so the array can be sized. */
-   for (bucket = 0; bucket < VG_N_MALLOCLISTS; bucket++) {
-      chunk = vg_malloclist[bucket];
-      while (chunk != NULL) {
-         n_found++;
-         chunk = chunk->next;
-      }
-   }
-   *n_shadows = n_found;
-   if (n_found == 0)
-      return NULL;
-
-   result = VG_(malloc)( VG_AR_PRIVATE, 
-                         n_found * sizeof(ShadowChunk*) );
-
-   /* Pass 2: copy the pointers out, in the same traversal order. */
-   for (bucket = 0; bucket < VG_N_MALLOCLISTS; bucket++) {
-      chunk = vg_malloclist[bucket];
-      while (chunk != NULL) {
-         result[n_copied] = chunk;
-         n_copied++;
-         chunk = chunk->next;
-      }
-   }
-   vg_assert(n_copied == *n_shadows);
-   return result;
-}
-
-/* One-time setup: empty every malloc list.  Calls made after the
-   first one do nothing. */
-static void client_malloc_init ( void )
-{
-   UInt idx = 0;
-
-   if (!vg_client_malloc_init_done) {
-      while (idx < VG_N_MALLOCLISTS) {
-         vg_malloclist[idx] = NULL;
-         idx++;
-      }
-      vg_client_malloc_init_done = True;
-   }
-}
-
-
-/* Debugging aid: the number of chunks on the freed-blocks queue. */
-static __attribute__ ((unused))
-       Int count_freelist ( void )
-{
-   Int          n_chunks = 0;
-   ShadowChunk* cur      = vg_freed_list_start;
-
-   while (cur != NULL) {
-      n_chunks++;
-      cur = cur->next;
-   }
-   return n_chunks;
-}
-
-/* Debugging aid: the total number of chunks over all malloc lists. */
-static __attribute__ ((unused))
-       Int count_malloclists ( void )
-{
-   Int          n_chunks = 0;
-   UInt         bucket;
-   ShadowChunk* cur;
-
-   for (bucket = 0; bucket < VG_N_MALLOCLISTS; bucket++) {
-      cur = vg_malloclist[bucket];
-      while (cur != NULL) {
-         n_chunks++;
-         cur = cur->next;
-      }
-   }
-   return n_chunks;
-}
-
-/* Debugging aid: assert that the sizes of the chunks on the
-   freed-blocks queue add up to vg_freed_list_volume. */
-static __attribute__ ((unused))
-       void freelist_sanity ( void )
-{
-   Int          total = 0;
-   ShadowChunk* cur   = vg_freed_list_start;
-
-   while (cur != NULL) {
-      total += cur->size;
-      cur = cur->next;
-   }
-   vg_assert(total == vg_freed_list_volume);
-}
-
-/* Unlink sc from malloc list number ml_no.  sc is required to be on
-   that list: reaching the end of the list without finding it trips
-   an assertion.
-*/
-static void remove_from_malloclist ( UInt ml_no, ShadowChunk* sc )
-{
-   /* `link' always addresses the pointer that leads to the chunk
-      under inspection: first the list head, then each ->next field.
-      That lets the head and interior cases share one code path. */
-   ShadowChunk** link = &vg_malloclist[ml_no];
-
-   while (*link != sc) {
-      vg_assert(*link != NULL);
-      link = &(*link)->next;
-   }
-   *link = sc->next;
-}
-
-
-/* Append sc to the tail of the freed-blocks queue, then release the
-   oldest queued blocks (taken from the head) for as long as the
-   queue volume exceeds VG_(clo_freelist_vol). */
-
-static void add_to_freed_queue ( ShadowChunk* sc )
-{
-   ShadowChunk* oldest;
-
-   /* Link the chunk in at the tail and account for its size. */
-   if (vg_freed_list_end != NULL) {
-      vg_assert(vg_freed_list_end->next == NULL);
-      vg_freed_list_end->next = sc;
-      vg_freed_list_volume   += sc->size;
-   } else {
-      vg_assert(vg_freed_list_start == NULL);
-      vg_freed_list_start  = sc;
-      vg_freed_list_volume = sc->size;
-   }
-   vg_freed_list_end = sc;
-   sc->next          = NULL;
-
-   /* Trim from the head while the queue is over its volume limit. */
-   while (vg_freed_list_volume > VG_(clo_freelist_vol)) {
-      vg_assert(vg_freed_list_start != NULL);
-      vg_assert(vg_freed_list_end != NULL);
-
-      oldest = vg_freed_list_start;
-      vg_freed_list_volume -= oldest->size;
-      vg_assert(vg_freed_list_volume >= 0);
-
-      if (vg_freed_list_start != vg_freed_list_end) {
-         vg_freed_list_start = oldest->next;
-      } else {
-         /* That was the only entry; the queue is now empty. */
-         vg_freed_list_start = vg_freed_list_end = NULL;
-      }
-
-      oldest->next = NULL; /* just paranoia */
-      VG_(free)(VG_AR_CLIENT,  (void*)(oldest->data));
-      VG_(free)(VG_AR_PRIVATE, oldest);
-   }
-}
-
-
-/* Allocate a client block of `size' bytes with the given alignment,
-   together with a ShadowChunk describing it.  The shadow records the
-   allocation context (taken from tst), the size, the allocation kind
-   and the block address, and is pushed onto the front of the malloc
-   list selected by that address.  The block is marked writable and
-   the redzones on both sides of it are marked inaccessible.  Returns
-   the new shadow chunk.  align must be at least 4. */
-
-static ShadowChunk* client_malloc_shadow ( ThreadState* tst,
-                                           UInt align, UInt size, 
-                                           VgAllocKind kind )
-{
-   ShadowChunk* shadow;
-   Addr         payload;
-   UInt         list_no;
-
-#  ifdef DEBUG_CLIENTMALLOC
-   VG_(printf)("[m %d, f %d (%d)] client_malloc_shadow ( al %d, sz %d )\n", 
-               count_malloclists(), 
-               count_freelist(), vg_freed_list_volume,
-               align, size );
-#  endif
-
-   vg_assert(align >= 4);
-
-   /* An alignment of 4 is served by the plain allocator. */
-   payload = (align == 4)
-                ? (Addr)VG_(malloc)(VG_AR_CLIENT, size)
-                : (Addr)VG_(malloc_aligned)(VG_AR_CLIENT, align, size);
-
-   shadow            = VG_(malloc)(VG_AR_PRIVATE, sizeof(ShadowChunk));
-   shadow->where     = VG_(get_ExeContext)(False, tst->m_eip, tst->m_ebp);
-   shadow->size      = size;
-   shadow->allockind = kind;
-   shadow->data      = payload;
-
-   /* Push onto the front of the list chosen by the block address. */
-   list_no                = VG_MALLOCLIST_NO(payload);
-   shadow->next           = vg_malloclist[list_no];
-   vg_malloclist[list_no] = shadow;
-
-   /* Block itself first, then the redzone after it, then the one
-      before it. */
-   VGM_(make_writable)(payload, size);
-   VGM_(make_noaccess)(payload + size, VG_AR_CLIENT_REDZONE_SZB);
-   VGM_(make_noaccess)(payload - VG_AR_CLIENT_REDZONE_SZB, 
-                       VG_AR_CLIENT_REDZONE_SZB);
-
-   return shadow;
-}
-
-
-/* Allocate `size' bytes for the client, noticing whether or not we
-   are doing the full instrumentation thing.
-
-   tst  -- thread state; its m_eip/m_ebp are used for the allocation
-           context when a shadow chunk is made.
-   size -- number of bytes requested.
-   kind -- allocation kind, stored in the shadow chunk.
-
-   With VG_(clo_instrument) off the request goes straight to
-   VG_(malloc) in VG_AR_CLIENT and no shadow chunk is made.  Otherwise
-   the block is obtained from client_malloc_shadow using
-   VG_(clo_alignment).  Returns the address of the client block. */
-
-void* VG_(client_malloc) ( ThreadState* tst, UInt size, VgAllocKind kind )
-{
-   ShadowChunk* sc;
-
-   VGP_PUSHCC(VgpCliMalloc);
-   client_malloc_init();
-#  ifdef DEBUG_CLIENTMALLOC
-   /* The allocation kind is the `kind' parameter; this used to name an
-      undeclared identifier, so the debug build did not compile. */
-   VG_(printf)("[m %d, f %d (%d)] client_malloc ( %d, %x )\n", 
-               count_malloclists(), 
-               count_freelist(), vg_freed_list_volume,
-               size, kind );
-#  endif
-
-   /* Statistics are kept whether or not we are instrumenting. */
-   vg_cmalloc_n_mallocs ++;
-   vg_cmalloc_bs_mallocd += size;
-
-   if (!VG_(clo_instrument)) {
-      VGP_POPCC;
-      return VG_(malloc) ( VG_AR_CLIENT, size );
-   }
-
-   sc = client_malloc_shadow ( tst, VG_(clo_alignment), size, kind );
-   VGP_POPCC;
-   return (void*)(sc->data);
-}
-
-
-/* Allocate `size' bytes for the client at the requested alignment.
-   When instrumenting, the block gets a shadow chunk of kind
-   Vg_AllocMalloc; otherwise the request is passed straight to
-   VG_(malloc_aligned) in VG_AR_CLIENT.  Returns the address of the
-   client block. */
-void* VG_(client_memalign) ( ThreadState* tst, UInt align, UInt size )
-{
-   VGP_PUSHCC(VgpCliMalloc);
-   client_malloc_init();
-#  ifdef DEBUG_CLIENTMALLOC
-   VG_(printf)("[m %d, f %d (%d)] client_memalign ( al %d, sz %d )\n", 
-               count_malloclists(), 
-               count_freelist(), vg_freed_list_volume,
-               align, size );
-#  endif
-
-   vg_cmalloc_bs_mallocd += size;
-   vg_cmalloc_n_mallocs ++;
-
-   if (VG_(clo_instrument)) {
-      ShadowChunk* shadow
-         = client_malloc_shadow ( tst, align, size, Vg_AllocMalloc );
-      VGP_POPCC;
-      return (void*)(shadow->data);
-   }
-
-   /* Not instrumenting: no shadow bookkeeping, just allocate. */
-   VGP_POPCC;
-   return VG_(malloc_aligned) ( VG_AR_CLIENT, align, size );
-}
-
-
-/* Free a client block.
-
-   tst  -- thread state; passed to the error recorders and used for
-           the context stored in the chunk at the point of freeing.
-   ptrV -- the address the client asked to free.
-   kind -- the kind of deallocation being performed; compared with
-           the kind recorded when the block was allocated.
-
-   With VG_(clo_instrument) off the pointer is handed straight to
-   VG_(free).  Otherwise the shadow chunk is looked up by address:
-   an unknown address is reported through VG_(record_free_error) and
-   nothing is freed; a kind mismatch is reported through
-   VG_(record_freemismatch_error) and the free still goes ahead.  The
-   block is not released at once: it is made inaccessible and put on
-   the freed-blocks queue. */
-void VG_(client_free) ( ThreadState* tst, void* ptrV, VgAllocKind kind )
-{
-   ShadowChunk* sc;
-   UInt         ml_no;
-
-   VGP_PUSHCC(VgpCliMalloc);
-   client_malloc_init();
-#  ifdef DEBUG_CLIENTMALLOC
-   /* The deallocation kind is the `kind' parameter; this used to name
-      an undeclared identifier, so the debug build did not compile. */
-   VG_(printf)("[m %d, f %d (%d)] client_free ( %p, %x )\n", 
-               count_malloclists(), 
-               count_freelist(), vg_freed_list_volume,
-               ptrV, kind );
-#  endif
-
-   vg_cmalloc_n_frees ++;
-
-   if (!VG_(clo_instrument)) {
-      VGP_POPCC;
-      VG_(free) ( VG_AR_CLIENT, ptrV );
-      return;
-   }
-
-   /* first, see if ptrV is one vg_client_malloc gave out. */
-   ml_no = VG_MALLOCLIST_NO(ptrV);
-   vg_mlist_frees++;
-   for (sc = vg_malloclist[ml_no]; sc != NULL; sc = sc->next) {
-      vg_mlist_tries++;
-      if ((Addr)ptrV == sc->data)
-         break;
-   }
-
-   if (sc == NULL) {
-      VG_(record_free_error) ( tst, (Addr)ptrV );
-      VGP_POPCC;
-      return;
-   }
-
-   /* check if its a matching free() / delete / delete [] */
-   if (kind != sc->allockind)
-      VG_(record_freemismatch_error) ( tst, (Addr) ptrV );
-
-   /* Remove the shadow chunk from the mallocd list. */
-   remove_from_malloclist ( ml_no, sc );
-
-   /* Declare it inaccessible. */
-   VGM_(make_noaccess) ( sc->data - VG_AR_CLIENT_REDZONE_SZB, 
-                         sc->size + 2*VG_AR_CLIENT_REDZONE_SZB );
-   VGM_(make_noaccess) ( (Addr)sc, sizeof(ShadowChunk) );
-   /* From here on `where' holds the context of the free, replacing
-      the context of the allocation. */
-   sc->where = VG_(get_ExeContext)(False, tst->m_eip, tst->m_ebp);
-
-   /* Put it out of harm's way for a while. */
-   add_to_freed_queue ( sc );
-   VGP_POPCC;
-}
-
-
-
-/* Allocate a zero-filled client block of nmemb * size1 bytes.
-
-   tst   -- thread state; its m_eip/m_ebp give the allocation context.
-   nmemb -- number of elements.
-   size1 -- size of one element, in bytes.
-
-   Returns the address of the block, or NULL if nmemb * size1 cannot
-   be represented in a UInt.  With VG_(clo_instrument) off the request
-   is handed to VG_(calloc); otherwise a shadow chunk of kind
-   Vg_AllocMalloc is created, the block is marked readable and its
-   redzones are marked inaccessible. */
-void* VG_(client_calloc) ( ThreadState* tst, UInt nmemb, UInt size1 )
-{
-   ShadowChunk* sc;
-   Addr         p;
-   UInt         size, i, ml_no;
-
-   VGP_PUSHCC(VgpCliMalloc);
-   client_malloc_init();
-
-#  ifdef DEBUG_CLIENTMALLOC
-   VG_(printf)("[m %d, f %d (%d)] client_calloc ( %d, %d )\n", 
-               count_malloclists(), 
-               count_freelist(), vg_freed_list_volume,
-               nmemb, size1 );
-#  endif
-
-   /* Reject requests whose byte count wraps around.  Without this
-      check the product below would silently wrap and the caller would
-      be given a block far smaller than it asked for. */
-   if (size1 != 0 && nmemb > (~(UInt)0) / size1) {
-      VGP_POPCC;
-      return NULL;
-   }
-   size = nmemb * size1;
-
-   vg_cmalloc_n_mallocs ++;
-   vg_cmalloc_bs_mallocd += size;
-
-   if (!VG_(clo_instrument)) {
-      VGP_POPCC;
-      return VG_(calloc) ( VG_AR_CLIENT, nmemb, size1 );
-   }
-
-   p         = (Addr)VG_(malloc)(VG_AR_CLIENT, size);
-   sc        = VG_(malloc)(VG_AR_PRIVATE, sizeof(ShadowChunk));
-   sc->where = VG_(get_ExeContext)(False, tst->m_eip, tst->m_ebp);
-   sc->size  = size;
-   sc->allockind = Vg_AllocMalloc; /* its a lie - but true. eat this :) */
-   sc->data  = p;
-   ml_no     = VG_MALLOCLIST_NO(p);
-   sc->next  = vg_malloclist[ml_no];
-   vg_malloclist[ml_no] = sc;
-
-   /* Readable rather than merely writable: the contents are defined,
-      because the block is zeroed just below. */
-   VGM_(make_readable)(p, size);
-   VGM_(make_noaccess)(p + size, 
-                       VG_AR_CLIENT_REDZONE_SZB);
-   VGM_(make_noaccess)(p - VG_AR_CLIENT_REDZONE_SZB, 
-                       VG_AR_CLIENT_REDZONE_SZB);
-
-   for (i = 0; i < size; i++) ((UChar*)p)[i] = 0;
-
-   VGP_POPCC;
-   return (void*)p;
-}
-
-
-/* Resize a client block to size_new bytes.
-
-   With VG_(clo_instrument) off, asserts that ptrV is non-NULL and
-   size_new is nonzero and hands the request to VG_(realloc).
-
-   Otherwise the shadow chunk for ptrV is looked up:
-     - not found: reported via VG_(record_free_error); returns NULL.
-     - found but not of kind Vg_AllocMalloc: reported via
-       VG_(record_freemismatch_error), then processing continues.
-     - same size: ptrV is returned unchanged.
-     - smaller: the tail is made inaccessible, the recorded size is
-       reduced, and ptrV is returned.
-     - bigger: a new block is allocated, the old contents and their
-       address-range permissions are copied over, and the old block
-       is unlinked, made inaccessible and put on the freed-blocks
-       queue.  The new address is returned.
-
-   NOTE(review): in the instrumented path a NULL ptrV or a zero
-   size_new gets no special treatment here; presumably the caller
-   deals with those cases -- confirm.
-   NOTE(review): unlike VG_(client_free), the old chunk's `where'
-   field is not updated before it is queued, so it still holds the
-   context from allocation time -- confirm this is intended. */
-void* VG_(client_realloc) ( ThreadState* tst, void* ptrV, UInt size_new )
-{
-   ShadowChunk *sc, *sc_new;
-   UInt         i, ml_no;
-
-   VGP_PUSHCC(VgpCliMalloc);
-   client_malloc_init();
-
-#  ifdef DEBUG_CLIENTMALLOC
-   VG_(printf)("[m %d, f %d (%d)] client_realloc ( %p, %d )\n", 
-               count_malloclists(), 
-               count_freelist(), vg_freed_list_volume,
-               ptrV, size_new );
-#  endif
-
-   /* A realloc is counted as one free plus one malloc of size_new
-      bytes, whichever path is taken below. */
-   vg_cmalloc_n_frees ++;
-   vg_cmalloc_n_mallocs ++;
-   vg_cmalloc_bs_mallocd += size_new;
-
-   if (!VG_(clo_instrument)) {
-      vg_assert(ptrV != NULL && size_new != 0);
-      VGP_POPCC;
-      return VG_(realloc) ( VG_AR_CLIENT, ptrV, size_new );
-   }
-
-   /* First try and find the block. */
-   ml_no = VG_MALLOCLIST_NO(ptrV);
-   for (sc = vg_malloclist[ml_no]; sc != NULL; sc = sc->next) {
-      if ((Addr)ptrV == sc->data)
-         break;
-   }
-  
-   if (sc == NULL) {
-      VG_(record_free_error) ( tst, (Addr)ptrV );
-      /* Perhaps we should keep going regardless. */
-      VGP_POPCC;
-      return NULL;
-   }
-
-   if (sc->allockind != Vg_AllocMalloc) {
-      /* can not realloc a range that was allocated with new or new [] */
-      VG_(record_freemismatch_error) ( tst, (Addr)ptrV );
-      /* but keep going anyway */
-   }
-
-   if (sc->size == size_new) {
-      /* size unchanged */
-      VGP_POPCC;
-      return ptrV;
-   }
-   if (sc->size > size_new) {
-      /* new size is smaller */
-      /* Shrink in place: only the cut-off tail changes state. */
-      VGM_(make_noaccess)( sc->data + size_new, 
-                           sc->size - size_new );
-      sc->size = size_new;
-      VGP_POPCC;
-      return ptrV;
-   } else {
-      /* new size is bigger */
-      sc_new = client_malloc_shadow ( tst, VG_(clo_alignment), 
-                                      size_new, Vg_AllocMalloc );
-      /* Copy the old contents, then the permissions that go with
-         them. */
-      for (i = 0; i < sc->size; i++)
-         ((UChar*)(sc_new->data))[i] = ((UChar*)(sc->data))[i];
-      VGM_(copy_address_range_perms) ( 
-         sc->data, sc_new->data, sc->size );
-      /* Retire the old block: unlink it, make it and its shadow
-         inaccessible, and queue it. */
-      remove_from_malloclist ( VG_MALLOCLIST_NO(sc->data), sc );
-      VGM_(make_noaccess) ( sc->data - VG_AR_CLIENT_REDZONE_SZB, 
-                            sc->size + 2*VG_AR_CLIENT_REDZONE_SZB );
-      VGM_(make_noaccess) ( (Addr)sc, sizeof(ShadowChunk) );
-      add_to_freed_queue ( sc );
-      VGP_POPCC;
-      return (void*)sc_new->data;
-   }  
-}
-
-
-/* Print the end-of-run malloc/free summary: bytes and blocks still
-   on the malloc lists, and the running allocation and free counts.
-   Nothing is printed when VG_(clo_verbosity) is zero. */
-void VG_(clientmalloc_done) ( void )
-{
-   UInt         bucket;
-   UInt         n_live_blocks = 0;
-   UInt         n_live_bytes  = 0;
-   ShadowChunk* cur;
-
-   client_malloc_init();
-
-   /* Add up everything that was never freed. */
-   for (bucket = 0; bucket < VG_N_MALLOCLISTS; bucket++) {
-      cur = vg_malloclist[bucket];
-      while (cur != NULL) {
-         n_live_blocks++;
-         n_live_bytes += cur->size;
-         cur = cur->next;
-      }
-   }
-
-   if (VG_(clo_verbosity) == 0)
-      return;
-
-   VG_(message)(Vg_UserMsg, 
-                "malloc/free: in use at exit: %d bytes in %d blocks.",
-                n_live_bytes, n_live_blocks);
-   VG_(message)(Vg_UserMsg, 
-                "malloc/free: %d allocs, %d frees, %d bytes allocated.",
-                vg_cmalloc_n_mallocs,
-                vg_cmalloc_n_frees, vg_cmalloc_bs_mallocd);
-
-   if (!VG_(clo_leak_check)) {
-      VG_(message)(Vg_UserMsg, 
-                   "For a detailed leak analysis,  rerun with: --leak-check=yes");
-   }
-
-   /* Lookup statistics for the free path; compiled in but disabled. */
-   if (0) {
-      VG_(message)(Vg_DebugMsg,
-                   "free search: %d tries, %d frees", 
-                   vg_mlist_tries, 
-                   vg_mlist_frees );
-   }
-
-   if (VG_(clo_verbosity) > 1) {
-      VG_(message)(Vg_UserMsg, "");
-   }
-}
-
-
-/* Describe an address as best you can, for error messages,
-   putting the result in ai. */
-
-void VG_(describe_addr) ( Addr a, AddrInfo* ai )
-{
-   ShadowChunk* sc;
-   UInt         ml_no;
-   Bool         ok;
-   ThreadId     tid;
-
-   /* Perhaps it's a user-def'd block ? */
-   ok = VG_(client_perm_maybe_describe)( a, ai );
-   if (ok)
-      return;
-   /* Perhaps it's on a thread's stack? */
-   tid = VG_(identify_stack_addr)(a);
-   if (tid != VG_INVALID_THREADID) {
-      ai->akind     = Stack;
-      ai->stack_tid = tid;
-      return;
-   }
-   /* Search for a freed block which might bracket it. */
-   for (sc = vg_freed_list_start; sc != NULL; sc = sc->next) {
-      if (sc->data - VG_AR_CLIENT_REDZONE_SZB <= a
-          && a < sc->data + sc->size + VG_AR_CLIENT_REDZONE_SZB) {
-         ai->akind      = Freed;
-         ai->blksize    = sc->size;
-         ai->rwoffset   = (Int)(a) - (Int)(sc->data);
-         ai->lastchange = sc->where;
-         return;
-      }
-   }
-   /* Search for a mallocd block which might bracket it. */
-   for (ml_no = 0; ml_no < VG_N_MALLOCLISTS; ml_no++) {
-      for (sc = vg_malloclist[ml_no]; sc != NULL; sc = sc->next) {
-         if (sc->data - VG_AR_CLIENT_REDZONE_SZB <= a
-             && a < sc->data + sc->size + VG_AR_CLIENT_REDZONE_SZB) {
-            ai->akind      = Mallocd;
-            ai->blksize    = sc->size;
-            ai->rwoffset   = (Int)(a) - (Int)(sc->data);
-            ai->lastchange = sc->where;
-            return;
-         }
-      }
-   }
-   /* Clueless ... */
-   ai->akind = Unknown;
-   return;
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                        vg_clientmalloc.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_constants.h b/coregrind/vg_constants.h
deleted file mode 100644
index d3da14b1a9..0000000000
--- a/coregrind/vg_constants.h
+++ /dev/null
@@ -1,100 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A header file containing constants (for assembly code).      ---*/
-/*---                                               vg_constants.h ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#ifndef __VG_CONSTANTS_H
-#define __VG_CONSTANTS_H
-
-
-/* This file is included in all Valgrind source files, including
-   assembly ones. */
-
-/* All symbols externally visible from valgrind.so are prefixed
-   as specified here.  The prefix can be changed, so as to avoid
-   namespace conflict problems.
-*/
-/* Token-pastes its two arguments into a single identifier. */
-#define VGAPPEND(str1,str2) str1##str2
-
-/* These macros should add different prefixes so the same base
-   name can safely be used across different macros. */
-#define VG_(str)    VGAPPEND(vgPlain_,str)
-#define VGM_(str)   VGAPPEND(vgMem_,str)
-#define VGP_(str)   VGAPPEND(vgProf_,str)
-#define VGOFF_(str) VGAPPEND(vgOff_,str)
-
-
-/* Magic values that %ebp might be set to when returning to the
-   dispatcher.  The only other legitimate value is to point to the
-   start of VG_(baseBlock).  These also are return values from
-   VG_(run_innerloop) to the scheduler.
-
-   EBP means %ebp can legitimately have this value when a basic block
-   returns to the dispatch loop.  TRC means that this value is a valid
-   thread return code, which the dispatch loop may return to the
-   scheduler.  */
-#define VG_TRC_EBP_JMP_STKADJ     17 /* EBP only; handled by dispatcher */
-#define VG_TRC_EBP_JMP_SYSCALL    19 /* EBP and TRC */
-#define VG_TRC_EBP_JMP_CLIENTREQ  23 /* EBP and TRC */
-
-#define VG_TRC_INNER_COUNTERZERO  29  /* TRC only; means bb ctr == 0 */
-#define VG_TRC_INNER_FASTMISS     31  /* TRC only; means fast-cache miss. */
-#define VG_TRC_UNRESUMABLE_SIGNAL 37  /* TRC only; got sigsegv/sigbus */
-
-
-/* Debugging hack for assembly code ... sigh. */
-/* Both macros currently expand to nothing.  Changing the relevant
-   "#if 0" to "#if 1" makes the macro emit a call to VG_(oynk) with
-   nnn as its argument, with all registers saved and restored around
-   the call. */
-#if 0
-#define OYNK(nnn) pushal;  pushl $nnn; call VG_(oynk) ; addl $4,%esp; popal
-#else
-#define OYNK(nnn)
-#endif
-
-#if 0
-#define OYNNK(nnn) pushal;  pushl $nnn; call VG_(oynk) ; addl $4,%esp; popal
-#else
-#define OYNNK(nnn)
-#endif
-
-
-/* Constants for the fast translation lookup cache. */
-/* The cache has 2^VG_TT_FAST_BITS entries; VG_TT_FAST_MASK reduces
-   a value to an index into it. */
-#define VG_TT_FAST_BITS 15
-#define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS)
-#define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1)
-
-/* Constants for the fast original-code-write check cache.
-   (None are defined at present.) */
-
-
-/* Assembly code stubs make this request */
-#define VG_USERREQ__SIGNAL_RETURNS          0x4001
-
-#endif /* ndef __VG_CONSTANTS_H */
-
-/*--------------------------------------------------------------------*/
-/*--- end                                           vg_constants.h ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_demangle.c b/coregrind/vg_demangle.c
deleted file mode 100644
index f07f7f3465..0000000000
--- a/coregrind/vg_demangle.c
+++ /dev/null
@@ -1,73 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Demangling of C++ mangled names.                             ---*/
-/*---                                                vg_demangle.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-#include "demangle.h"
-
-#define ADD_TO_RESULT(zzstr,zzn)                   \
-{                                                  \
-   Char* zz = (zzstr);                             \
-   Int nn = (zzn);                                 \
-   Int ii;                                         \
-   for (ii = 0; ii < nn; ii++) {                   \
-      result[n_result] = zz[ii];                   \
-      if (n_result < result_size-1) n_result++;    \
-      result[n_result] = 0;                        \
-   }                                               \
-}
-
-void VG_(demangle) ( Char* orig, Char* result, Int result_size )
-{
-   Int   n_result  = 0;
-   Char* demangled = NULL;
-
-   if (VG_(clo_demangle))
-      demangled = VG_(cplus_demangle) ( orig, DMGL_ANSI | DMGL_PARAMS );
-
-   if (demangled) {
-      ADD_TO_RESULT(demangled, VG_(strlen)(demangled));
-      VG_(free) (VG_AR_DEMANGLE, demangled);
-   } else {
-      ADD_TO_RESULT(orig, VG_(strlen)(orig));
-   }
-
-   /* Check that the demangler isn't leaking. */
-   /* 15 Feb 02: if this assertion fails, this is not a disaster.
-      Comment it out, and let me know.  (jseward@acm.org). */
-   vg_assert(VG_(is_empty_arena)(VG_AR_DEMANGLE));
-
-   /* VG_(show_all_arena_stats)(); */
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                            vg_demangle.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_dispatch.S b/coregrind/vg_dispatch.S
deleted file mode 100644
index bd1c5b959f..0000000000
--- a/coregrind/vg_dispatch.S
+++ /dev/null
@@ -1,212 +0,0 @@
-
-##--------------------------------------------------------------------##
-##--- The core dispatch loop, for jumping to a code address.       ---##
-##---                                                vg_dispatch.S ---##
-##--------------------------------------------------------------------##
-
-/*
-  This file is part of Valgrind, an x86 protected-mode emulator 
-  designed for debugging and profiling binaries on x86-Unixes.
-
-  Copyright (C) 2000-2002 Julian Seward 
-     jseward@acm.org
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of the
-  License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-  02111-1307, USA.
-
-  The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_constants.h"
-
-
-/*------------------------------------------------------------*/
-/*--- The normal-case dispatch machinery.                  ---*/
-/*------------------------------------------------------------*/
-	
-/* To transfer to an (original) code address, load it into %eax and
-   jump to vg_dispatch.  This fragment of code tries to find the
-   address of the corresponding translation by searching the translation
-   table.   If it fails, a new translation is made, added to the
-   translation table, and then jumped to.  Almost all the hard
-   work is done by C routines; this code simply handles the
-   common case fast -- when the translation address is found in
-   the translation cache.
-
-   At entry, %eax is the only live (real-machine) register; the
-   entire simulated state is tidily saved in vg_m_state.  
-*/
-
-	
-/* The C world needs a way to get started simulating.  So we provide
-   a function VG_(run_innerloop), which starts running from the
-   simulated %EIP held in VG_(baseBlock)[VGOFF_(m_eip)] and returns
-   one of the VG_TRC_* codes in %eax -- for example
-   VG_TRC_INNER_COUNTERZERO when VG_(dispatch_ctr) reaches zero.  This
-   loop can also exit if vg_oursignalhandler() catches a non-resumable
-   signal, for example SIGSEGV.  It then longjmp()s back past here.
-*/
-	
-.globl VG_(run_innerloop)
-VG_(run_innerloop):
-	#OYNK(1000)
-
-	# ----- entry point to VG_(run_innerloop) -----
-	# Save the registers used below.  They are restored, in the
-	# reverse order, at run_innerloop_exit.
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	pushl	%esi
-	pushl	%edi
-	pushl	%ebp
-
-	# Set up the baseBlock pointer
-	movl	$VG_(baseBlock), %ebp
-
-	# fetch m_eip into %eax
-	# (VGOFF_(m_eip) is a word index, hence the scale factor of 4)
-	movl	VGOFF_(m_eip), %esi
-	movl	(%ebp, %esi, 4), %eax
-	
-	# Start off dispatching paranoically, since we no longer have
-	# any indication whether or not this might be a special call/ret
-	# transfer.
-	jmp	dispatch_stkadj
-	
-	
-dispatch_main:
-	# Jump here to do a new dispatch.
-	# %eax holds destination (original) address.
-	# %ebp indicates further details of the control transfer
-	# requested to the address in %eax.
-	#
-	# If ebp == & VG_(baseBlock), just jump next to %eax.
-	# 
-	# If ebp == VG_TRC_EBP_JMP_STKADJ, check for client stack
-	# blocks to delete and then continue at eax.
-	#
-	# If ebp == VG_TRC_EBP_JMP_SYSCALL, do a system call before 
-	# continuing at eax.
-	#
-	# If ebp == VG_TRC_EBP_JMP_CLIENTREQ, do a client request before 
-	# continuing at eax.
-	#
-	# If %ebp has any other value, we panic.
-	
-	cmpl	$VG_(baseBlock), %ebp
-	jnz	dispatch_exceptional
-
-dispatch_boring:
-	# save the jump address at VG_(baseBlock)[VGOFF_(m_eip)],
-	movl	VGOFF_(m_eip), %esi
-	movl	%eax, (%ebp, %esi, 4)
-	
-	# do a timeslice check.
-	# are we out of timeslice?  If yes, defer to scheduler.
-	#OYNK(1001)
-	decl	VG_(dispatch_ctr)
-	jz	counter_is_zero
-
-	#OYNK(1002)
-	# try a fast lookup in the translation cache
-	movl	%eax, %ebx
-	andl	$VG_TT_FAST_MASK, %ebx	
-	# ebx = tt_fast index
-	movl	VG_(tt_fast)(,%ebx,4), %ebx	
-	# ebx points at a tt entry
-	# now compare target with the tte.orig_addr field (+0)
-	cmpl	%eax, (%ebx)
-	jnz	fast_lookup_failed
-
-	# Found a match.  Set the tte.mru_epoch field (+8)
-	# and call the tte.trans_addr field (+4)
-	movl	VG_(current_epoch), %ecx
-	movl	%ecx, 8(%ebx)
-	call	*4(%ebx)
-	# Back from the translation: dispatch again on the %eax and
-	# %ebp values it left behind.
-	jmp	dispatch_main
-	
-fast_lookup_failed:
-	# %EIP is up to date here since dispatch_boring dominates
-	# Return VG_TRC_INNER_FASTMISS to the caller.
-	movl	$VG_TRC_INNER_FASTMISS, %eax
-	jmp	run_innerloop_exit
-
-counter_is_zero:
-	# %EIP is up to date here since dispatch_boring dominates
-	# Return VG_TRC_INNER_COUNTERZERO to the caller.
-	movl	$VG_TRC_INNER_COUNTERZERO, %eax
-	jmp	run_innerloop_exit
-	
-run_innerloop_exit:
-	# Common exit.  The return code is already in %eax; restore the
-	# registers saved at entry and return.
-	popl	%ebp
-	popl	%edi
-	popl	%esi
-	popl	%edx
-	popl	%ecx
-	popl	%ebx
-	ret	
-
-
-
-/* Other ways of getting out of the inner loop.  Placed out-of-line to
-   make it look cleaner. 
-
-   On arrival at dispatch_exceptional, %ebp is known not to equal the
-   address of VG_(baseBlock), so it must be one of the
-   VG_TRC_EBP_JMP_* magic values.  Every handler below first stores
-   %eax into VG_(baseBlock)[VGOFF_(m_eip)].
-*/
-dispatch_exceptional:
-	# this is jumped to only, not fallen-through from above
-	cmpl	$VG_TRC_EBP_JMP_STKADJ, %ebp
-	jz	dispatch_stkadj
-	cmpl	$VG_TRC_EBP_JMP_SYSCALL, %ebp
-	jz	dispatch_syscall
-	cmpl	$VG_TRC_EBP_JMP_CLIENTREQ, %ebp
-	jz	dispatch_clientreq
-
-	# ebp has an invalid value ... crap out.
-	# The address of the message is the argument to VG_(panic).
-	pushl	$panic_msg_ebp
-	call	VG_(panic)
-	#	(never returns)
-
-dispatch_syscall:
-	# save %eax in %EIP and defer to sched
-	# (returns VG_TRC_EBP_JMP_SYSCALL to the caller)
-	movl	$VG_(baseBlock), %ebp
-	movl	VGOFF_(m_eip), %esi
-	movl	%eax, (%ebp, %esi, 4)
-	movl	$VG_TRC_EBP_JMP_SYSCALL, %eax
-	jmp	run_innerloop_exit
-	
-dispatch_clientreq:
-	# save %eax in %EIP and defer to sched
-	# (returns VG_TRC_EBP_JMP_CLIENTREQ to the caller)
-	movl	$VG_(baseBlock), %ebp
-	movl	VGOFF_(m_eip), %esi
-	movl	%eax, (%ebp, %esi, 4)
-	movl	$VG_TRC_EBP_JMP_CLIENTREQ, %eax
-	jmp	run_innerloop_exit
-
-dispatch_stkadj:
-	# save %eax in %EIP
-	movl	$VG_(baseBlock), %ebp
-	movl	VGOFF_(m_eip), %esi
-	movl	%eax, (%ebp, %esi, 4)
-
-	# see if we need to mess with stack blocks
-	# %eax is kept safe across the call by the push/pop pair, and
-	# %ebp is reloaded afterwards.
-	pushl	%eax
-	call	VG_(delete_client_stack_blocks_following_ESP_change)
-	popl	%eax
-	movl	$VG_(baseBlock), %ebp
-		
-	# ok, its not interesting.  Handle the normal way.
-	jmp	dispatch_boring
-
-
-/* NUL-terminated message passed to VG_(panic) when %ebp holds an
-   unrecognised value. */
-.data
-panic_msg_ebp:
-.ascii	"vg_dispatch: %ebp has invalid value!"
-.byte	0
-.text	
-
-
-##--------------------------------------------------------------------##
-##--- end                                            vg_dispatch.S ---##
-##--------------------------------------------------------------------##
diff --git a/coregrind/vg_errcontext.c b/coregrind/vg_errcontext.c
deleted file mode 100644
index 46838b603f..0000000000
--- a/coregrind/vg_errcontext.c
+++ /dev/null
@@ -1,1234 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Management of error messages.                vg_errcontext.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-#include "vg_constants.h"
-
-
-/*------------------------------------------------------------*/
-/*--- Defns                                                ---*/
-/*------------------------------------------------------------*/
-
-/* Suppression is a type describing an error which we want to
-   suppress, ie, not show the user, usually because it is caused by a
-   problem in a library which we can't fix, replace or work around.
-   Suppressions are read from a file at startup time, specified by
-   vg_clo_suppressions, and placed in the vg_suppressions list.  This
-   gives flexibility so that new suppressions can be added to the file
-   as and when needed. 
-*/
-typedef /* The kinds of error a suppression record can name. */
-   enum { 
-      /* Bad syscall params; the record then also carries the param name */
-      Param, 
-      /* Use of invalid values of given size; Value0 is also spelt "Cond" */
-      Value0, Value1, Value2, Value4, Value8, 
-      /* Invalid read/write attempt at given size */
-      Addr1, Addr2, Addr4, Addr8,
-      /* Invalid or mismatching free (spelt "Free" in suppression files) */
-      FreeS,
-      /* Pthreading error */
-      PThread
-   } 
-   SuppressionKind;
-
-
-/* For each caller specified for a suppression, record whether the
-   caller name refers to a shared object or to a function. */
-typedef
-   enum { 
-      /* Name is of a shared object file (written obj:NAME). */
-      ObjName,
-      /* Name is of a function (written fun:NAME). */
-      FunName 
-   }
-   SuppressionLocTy;
-
-
-/* A complete suppression record; one node of the vg_suppressions list. */
-typedef
-   struct _Suppression {
-      struct _Suppression* next;   /* next record in the list, or NULL */
-      /* The number of times this error has been suppressed. */
-      Int count;
-      /* The name by which the suppression is referred to. */
-      Char* sname;
-      /* What kind of suppression. */
-      SuppressionKind skind;
-      /* Name of syscall param if skind==Param; NULL otherwise. */
-      Char* param;
-      /* Name of fn where err occurs (mandatory), then its immediate caller. */
-      SuppressionLocTy caller0_ty;
-      Char*            caller0;   /* points just past the fun:/obj: tag */
-      SuppressionLocTy caller1_ty;
-      Char*            caller1;   /* NULL when the file gives only one caller */
-      /* Optional extra callers; NULL when absent. */
-      SuppressionLocTy caller2_ty;
-      Char*            caller2;
-      SuppressionLocTy caller3_ty;
-      Char*            caller3;
-   } 
-   Suppression;
-
-
-/* ErrContext is a type for recording just enough info to generate an
-   error report for an illegal memory access.  The idea is that
-   (typically) the same few points in the program generate thousands
-   of illegal accesses, and we don't want to spew out a fresh error
-   message for each one.  Instead, we use these structures to common
-   up duplicates.  
-*/
-
-/* What kind of error it is.  Stored in ErrContext.ekind. */
-typedef 
-   enum { ValueErr, AddrErr, /* uninitialised value; invalid access or jump */
-          ParamErr, UserErr, /* behaves like an anonymous ParamErr */
-          FreeErr, FreeMismatchErr, /* invalid free; mismatched free */
-          PThreadErr /* pthread API error */
-   }
-   ErrKind;
-
-/* What kind of memory access is involved in the error? */
-typedef /* ExecAxs is reported as a jump to an invalid address */
-   enum { ReadAxs, WriteAxs, ExecAxs }
-   AxsKind;
-
-/* Top-level struct for recording errors. */
-typedef /* the comment before each field names the error kinds using it */
-   struct _ErrContext {
-      /* ALL */
-      struct _ErrContext* next;   /* next record in vg_err_contexts */
-      /* ALL */
-      /* NULL if unsuppressed; or ptr to suppression record. */
-      Suppression* supp;
-      /* ALL */
-      Int count;   /* number of times this error has been observed */
-      /* ALL */
-      ErrKind ekind;
-      /* ALL */
-      ExeContext* where;   /* context captured where the error was raised */
-      /* Addr */
-      AxsKind axskind;
-      /* Addr, Value */
-      Int size;   /* for Value, 0 means a conditional jump or move */
-      /* Addr, Free, Param, User */
-      Addr addr;
-      /* Addr, Free, Param, User */
-      AddrInfo addrinfo;   /* Undescribed until the error is actually kept */
-      /* Param; hijacked for PThread as a description */
-      Char* syscall_param;
-      /* Param, User */
-      Bool isWriteableLack;   /* True: report speaks of unaddressable bytes only */
-      /* ALL */
-      ThreadId tid;
-      /* ALL */
-      /* These record %EIP, %ESP and %EBP at the error point.  They
-         are only used to make GDB-attaching convenient; there is no
-         other purpose; specifically they are not used to do
-         comparisons between errors. */
-      UInt m_eip;
-      UInt m_esp;
-      UInt m_ebp;
-   } 
-   ErrContext;
-
-/* The list of error contexts found, both suppressed and unsuppressed.
-   Initially empty, and grows as distinct errors are detected. */
-static ErrContext* vg_err_contexts = NULL;
-
-/* The list of suppression directives, as read from the specified
-   suppressions file.  New records are pushed on the front. */
-static Suppression* vg_suppressions = NULL;
-
-/* Running count of unsuppressed errors detected, repeats included. */
-static UInt vg_n_errs_found = 0;
-
-/* Running count of suppressed errors detected, repeats included. */
-static UInt vg_n_errs_suppressed = 0;
-
-/* Used to disable further error reporting once some huge number of
-   errors have already been logged.  Tested by every record_* fn. */
-static Bool vg_ignore_errors = False;
-
-/* forwards ... */
-static Suppression* is_suppressible_error ( ErrContext* ec );
-
-
-/*------------------------------------------------------------*/
-/*--- Helper fns                                           ---*/
-/*------------------------------------------------------------*/
-
-/* Reset *ai to the "nothing known about this address" state. */
-static void clear_AddrInfo ( AddrInfo* ai )
-{
-   ai->maybe_gcc  = False;
-   ai->stack_tid  = VG_INVALID_THREADID;
-   ai->lastchange = NULL;
-   ai->rwoffset   = 0;
-   ai->blksize    = 0;
-   ai->akind      = Unknown;
-}
-
-static void clear_ErrContext ( ErrContext* ec )
-{
-   /* Put *ec into a known blank state.  The register fields get
-      obviously bogus values, presumably so that a forgotten
-      initialisation is easy to spot. */
-   ec->tid     = VG_INVALID_THREADID;
-   ec->m_ebp   = 0xDEADB0EF;
-   ec->m_esp   = 0xDEADBE0F;
-   ec->m_eip   = 0xDEADB00F;
-   ec->isWriteableLack = False;
-   ec->syscall_param   = NULL;
-   clear_AddrInfo ( &ec->addrinfo );
-   ec->addr    = 0;
-   ec->size    = 0;
-   ec->axskind = ReadAxs;   ec->where = NULL;
-   ec->ekind   = ValueErr;  ec->count = 0;
-   ec->supp    = NULL;      ec->next  = NULL;
-}
-
-
-/* Compare two execution contexts: only their top 2 frames when
-   top_2_only is set, otherwise their top 4.  Frames after the 4th
-   are always ignored. */
-static __inline__
-Bool vg_eq_ExeContext ( Bool top_2_only,
-                        ExeContext* e1, ExeContext* e2 )
-{
-   if (!top_2_only)
-      return VG_(eq_ExeContext_top4)(e1, e2);
-   return VG_(eq_ExeContext_top2)(e1, e2);
-}
-
-/* Compare the address descriptions of two errors.  Undescribed acts
-   as a wildcard: such a record has not been through
-   VG_(describe_addr) yet, so its blksize and lastchange are only the
-   cleared defaults (0 / NULL) and must not be compared. */
-static Bool eq_AddrInfo ( Bool cheap_addr_cmp,
-                          AddrInfo* ai1, AddrInfo* ai2 )
-{
-   if (ai1->akind == Undescribed || ai2->akind == Undescribed)
-      return True;
-   if (ai1->akind != ai2->akind)
-      return False;
-   if (ai1->akind != Freed && ai1->akind != Mallocd)
-      return True;
-   return ai1->blksize == ai2->blksize
-          && vg_eq_ExeContext(cheap_addr_cmp,
-                              ai1->lastchange, ai2->lastchange);
-}
-
-/* Compare error contexts, to detect duplicates.  Faulting addrs and
-   rwoffsets may differ, except that the two Free kinds do compare
-   addr.  cheap_addr_cmp: compare only the top 2 stack frames.  */
-
-static Bool eq_ErrContext ( Bool cheap_addr_cmp,
-                            ErrContext* e1, ErrContext* e2 )
-{
-   if (e1->ekind != e2->ekind) 
-      return False;
-   if (!vg_eq_ExeContext(cheap_addr_cmp, e1->where, e2->where))
-      return False;
-
-   switch (e1->ekind) {
-      case PThreadErr:
-         /* syscall_param holds the description text for this kind */
-         if (e1->syscall_param == e2->syscall_param) 
-            return True;
-         if (0 == VG_(strcmp)(e1->syscall_param, e2->syscall_param))
-            return True;
-         return False;
-      case UserErr:
-      case ParamErr:
-         if (e1->isWriteableLack != e2->isWriteableLack) return False;
-         /* only ParamErr carries a param name to compare */
-         if (e1->ekind == ParamErr 
-             && 0 != VG_(strcmp)(e1->syscall_param, e2->syscall_param))
-            return False;
-         return True;
-      case FreeErr:
-      case FreeMismatchErr:
-         if (e1->addr != e2->addr) return False;
-         if (!eq_AddrInfo(cheap_addr_cmp, &e1->addrinfo, &e2->addrinfo)) 
-            return False;
-         return True;
-      case AddrErr:
-         if (e1->axskind != e2->axskind) return False;
-         if (e1->size != e2->size) return False;
-         if (!eq_AddrInfo(cheap_addr_cmp, &e1->addrinfo, &e2->addrinfo)) 
-            return False;
-         return True;
-      case ValueErr:
-         if (e1->size != e2->size) return False;
-         return True;
-      default: 
-         VG_(panic)("eq_ErrContext");
-   }
-}
-
-static void pp_AddrInfo ( Addr a, AddrInfo* ai )
-{  /* Print, as user messages, what ai records about address a. */
-   switch (ai->akind) {
-      case Stack: 
-         VG_(message)(Vg_UserMsg, 
-                      "   Address 0x%x is on thread %d's stack", 
-                      a, ai->stack_tid);
-         break;
-      case Unknown:
-         if (ai->maybe_gcc) {
-            VG_(message)(Vg_UserMsg, 
-               "   Address 0x%x is just below %%esp.  Possibly a bug in GCC/G++",
-               a);
-            VG_(message)(Vg_UserMsg, 
-               "   v 2.96 or 3.0.X.  To suppress, use: --workaround-gcc296-bugs=yes");
-	 } else {
-            VG_(message)(Vg_UserMsg, 
-               "   Address 0x%x is not stack'd, malloc'd or free'd", a);
-         }
-         break;
-      case Freed: case Mallocd: case UserG: case UserS: {
-         UInt delta;        /* distance printed in the message */
-         UChar* relative;   /* "before", "after" or "inside" */
-         if (ai->rwoffset < 0) {
-            delta    = (UInt)(- ai->rwoffset);
-            relative = "before";
-         } else if (ai->rwoffset >= ai->blksize) {
-            delta    = ai->rwoffset - ai->blksize;
-            relative = "after";
-         } else {
-            delta    = ai->rwoffset;
-            relative = "inside";
-         }
-         if (ai->akind == UserS) {
-            VG_(message)(Vg_UserMsg, 
-               "   Address 0x%x is %d bytes %s a %d-byte stack red-zone created",
-               a, delta, relative, 
-               ai->blksize );
-	 } else {
-            VG_(message)(Vg_UserMsg, 
-               "   Address 0x%x is %d bytes %s a block of size %d %s",
-               a, delta, relative, 
-               ai->blksize,
-               ai->akind==Mallocd ? "alloc'd" 
-                  : ai->akind==Freed ? "free'd" 
-                                     : "client-defined");
-         }
-         /* printed straight after; presumably completes the sentence */
-         VG_(pp_ExeContext)(ai->lastchange);
-         break;
-      }
-      default:   /* any other akind, Undescribed included, is fatal */
-         VG_(panic)("pp_AddrInfo");
-   }
-}
-
-static void pp_ErrContext ( ErrContext* ec, Bool printCount )
-{  /* Print one error as user messages; printCount adds a count line. */
-   if (printCount)
-      VG_(message)(Vg_UserMsg, "Observed %d times:", ec->count );
-   if (ec->tid > 1)   /* no thread line is printed for tid 0 or 1 */
-      VG_(message)(Vg_UserMsg, "Thread %d:", ec->tid );
-   switch (ec->ekind) {
-      case ValueErr:
-         if (ec->size == 0) {   /* size 0 stands for the conditional case */
-             VG_(message)(
-                Vg_UserMsg,
-                "Conditional jump or move depends on uninitialised value(s)");
-         } else {
-             VG_(message)(Vg_UserMsg,
-                          "Use of uninitialised value of size %d",
-                          ec->size);
-         }
-         VG_(pp_ExeContext)(ec->where);
-         break;
-      case AddrErr:
-         switch (ec->axskind) {
-            case ReadAxs:
-               VG_(message)(Vg_UserMsg, "Invalid read of size %d", 
-                                        ec->size ); 
-               break;
-            case WriteAxs:
-               VG_(message)(Vg_UserMsg, "Invalid write of size %d", 
-                                        ec->size ); 
-               break;
-            case ExecAxs:
-               VG_(message)(Vg_UserMsg, "Jump to the invalid address "
-                                        "stated on the next line");
-               break;
-            default: 
-               VG_(panic)("pp_ErrContext(axskind)");
-         }
-         VG_(pp_ExeContext)(ec->where);
-         pp_AddrInfo(ec->addr, &ec->addrinfo);
-         break;
-      case FreeErr:
-         VG_(message)(Vg_UserMsg,"Invalid free() / delete / delete[]");
-         /* fall through: both Free kinds share the trailer below */
-      case FreeMismatchErr:
-         if (ec->ekind == FreeMismatchErr)
-            VG_(message)(Vg_UserMsg, 
-                         "Mismatched free() / delete / delete []");
-         VG_(pp_ExeContext)(ec->where);
-         pp_AddrInfo(ec->addr, &ec->addrinfo);
-         break;
-      case ParamErr:
-         if (ec->isWriteableLack) {
-            VG_(message)(Vg_UserMsg, 
-               "Syscall param %s contains unaddressable byte(s)",
-                ec->syscall_param );
-         } else {
-            VG_(message)(Vg_UserMsg, 
-                "Syscall param %s contains uninitialised or "
-                "unaddressable byte(s)",
-            ec->syscall_param);
-         }
-         VG_(pp_ExeContext)(ec->where);
-         pp_AddrInfo(ec->addr, &ec->addrinfo);
-         break;
-      case UserErr:
-         if (ec->isWriteableLack) {
-            VG_(message)(Vg_UserMsg, 
-               "Unaddressable byte(s) found during client check request");
-         } else {
-            VG_(message)(Vg_UserMsg, 
-               "Uninitialised or "
-               "unaddressable byte(s) found during client check request");
-         }
-         VG_(pp_ExeContext)(ec->where);
-         pp_AddrInfo(ec->addr, &ec->addrinfo);
-         break;
-      case PThreadErr:   /* syscall_param holds the whole message here */
-         VG_(message)(Vg_UserMsg, "%s", ec->syscall_param );
-         VG_(pp_ExeContext)(ec->where);
-         break;
-      default: 
-         VG_(panic)("pp_ErrContext");
-   }
-}
-
-
-/* Figure out if we want to attach for GDB for this error, possibly
-   by asking the user.  True only for a line holding just Y or y; an
-   empty line or N/n means no; C/c, EOF or a read error mean no and
-   stop asking.  Any other line is discarded whole and the prompt is
-   repeated, so the tail of a mistyped answer is not read as a reply. */
-static
-Bool vg_is_GDB_attach_requested ( void )
-{
-   Char ch, ch2;
-   Int res;
-
-   if (VG_(clo_GDB_attach) == False)
-      return False;
-
-   VG_(message)(Vg_UserMsg, "");
-
-   while (True) {
-      VG_(printf)(
-         "==%d== "
-         "---- Attach to GDB ? --- [Return/N/n/Y/y/C/c] ---- ", 
-         VG_(getpid)()
-      );
-      res = VG_(read)(0 /*stdin*/, &ch, 1);
-      if (res != 1) break;
-      if (ch == '\n') return False;
-      res = VG_(read)(0 /*stdin*/, &ch2, 1);
-      if (res != 1) break;
-      if (ch2 != '\n') {
-         /* More than one char typed: drop the rest of the line. */
-         do res = VG_(read)(0 /*stdin*/, &ch2, 1);
-         while (res == 1 && ch2 != '\n');
-         if (res != 1) break;
-         continue;
-      }
-      if (ch == 'n' || ch == 'N') return False;
-      if (ch == 'y' || ch == 'Y') return True;
-      if (ch == 'c' || ch == 'C') break;   /* anything else: ask again */
-   }
-   /* "C", EOF or read error: switch the question off for good. */
-   VG_(clo_GDB_attach) = False;
-   return False;
-}
-
-
-/* Top-level entry point to the error management subsystem.  All
-   detected errors are notified here; this routine decides if/when the
-   user should see the error.  *ec is copied if it is kept. */
-static void VG_(maybe_add_context) ( ErrContext* ec )
-{
-   ErrContext* p;
-   ErrContext* p_prev;
-   Bool        cheap_addr_cmp         = False;
-   static Bool is_first_shown_context = True;
-   static Bool stopping_message       = False;
-   static Bool slowdown_message       = False;
-   static Int  vg_n_errs_shown        = 0;
-
-   vg_assert(ec->tid >= 0 && ec->tid < VG_N_THREADS);
-
-   /* After M_VG_COLLECT_NO_ERRORS_AFTER_SHOWN different errors have
-      been found, or M_VG_COLLECT_NO_ERRORS_AFTER_FOUND total errors
-      have been found, just refuse to collect any more.  This stops
-      the burden of the error-management system becoming excessive in
-      extremely buggy programs, although it does make it pretty
-      pointless to continue the Valgrind run after this point. */
-   if (VG_(clo_error_limit) 
-       && (vg_n_errs_shown >= M_VG_COLLECT_NO_ERRORS_AFTER_SHOWN
-           || vg_n_errs_found >= M_VG_COLLECT_NO_ERRORS_AFTER_FOUND)) {
-      if (!stopping_message) {
-         VG_(message)(Vg_UserMsg, "");
-
-	 if (vg_n_errs_shown >= M_VG_COLLECT_NO_ERRORS_AFTER_SHOWN) {
-            VG_(message)(Vg_UserMsg, 
-               "More than %d different errors detected.  "
-               "I'm not reporting any more.",
-               M_VG_COLLECT_NO_ERRORS_AFTER_SHOWN );
-         } else {
-            VG_(message)(Vg_UserMsg, 
-               "More than %d total errors detected.  "
-               "I'm not reporting any more.",
-               M_VG_COLLECT_NO_ERRORS_AFTER_FOUND );
-	 }
-
-         VG_(message)(Vg_UserMsg, 
-            "Final error counts will be inaccurate.  Go fix your program!");
-         VG_(message)(Vg_UserMsg, 
-            "Rerun with --error-limit=no to disable this cutoff.  Note");
-         VG_(message)(Vg_UserMsg, 
-            "that your program may now segfault without prior warning from");
-         VG_(message)(Vg_UserMsg, 
-            "Valgrind, because errors are no longer being displayed.");
-         VG_(message)(Vg_UserMsg, "");
-         stopping_message = True;
-         vg_ignore_errors = True;   /* makes the record_* fns return early */
-      }
-      return;
-   }
-
-   /* After M_VG_COLLECT_ERRORS_SLOWLY_AFTER different errors have
-      been found, be much more conservative about collecting new
-      ones: duplicates are then judged on 2 stack frames, not 4. */
-   if (vg_n_errs_shown >= M_VG_COLLECT_ERRORS_SLOWLY_AFTER) {
-      cheap_addr_cmp = True;
-      if (!slowdown_message) {
-         VG_(message)(Vg_UserMsg, "");
-         VG_(message)(Vg_UserMsg, 
-            "More than %d errors detected.  Subsequent errors",
-            M_VG_COLLECT_ERRORS_SLOWLY_AFTER);
-         VG_(message)(Vg_UserMsg, 
-            "will still be recorded, but in less detail than before.");
-         slowdown_message = True;
-      }
-   }
-
-
-   /* First, see if we've got an error record matching this one. */
-   p      = vg_err_contexts;
-   p_prev = NULL;
-   while (p != NULL) {
-      if (eq_ErrContext(cheap_addr_cmp, p, ec)) {
-         /* Found it. */
-         p->count++;
-	 if (p->supp != NULL) {
-            /* Deal correctly with suppressed errors. */
-            p->supp->count++;
-            vg_n_errs_suppressed++;	 
-         } else {
-            vg_n_errs_found++;
-         }
-
-         /* Move p to the front of the list so that future searches
-            for it are faster. */
-         if (p_prev != NULL) {
-            vg_assert(p_prev->next == p);
-            p_prev->next    = p->next;
-            p->next         = vg_err_contexts;
-            vg_err_contexts = p;
-	 }
-         return;
-      }
-      p_prev = p;
-      p      = p->next;
-   }
-
-   /* Didn't see it.  Copy and add. */
-
-   /* OK, we're really going to collect it.  First, describe any addr
-      info in the error.  This is done on *ec, before the copy. */
-   if (ec->addrinfo.akind == Undescribed)
-      VG_(describe_addr) ( ec->addr, &ec->addrinfo );
-
-   p = VG_(malloc)(VG_AR_ERRCTXT, sizeof(ErrContext));
-   *p = *ec;
-   p->next = vg_err_contexts;
-   p->supp = is_suppressible_error(ec);
-   vg_err_contexts = p;
-   if (p->supp == NULL) {
-      vg_n_errs_found++;
-      if (!is_first_shown_context)
-         VG_(message)(Vg_UserMsg, "");   /* blank line between reports */
-      pp_ErrContext(p, False);      
-      is_first_shown_context = False;
-      vg_n_errs_shown++;
-      /* Perhaps we want a GDB attach at this point? */
-      if (vg_is_GDB_attach_requested()) {
-         VG_(swizzle_esp_then_start_GDB)(
-            ec->m_eip, ec->m_esp, ec->m_ebp);
-      }
-   } else {
-      vg_n_errs_suppressed++;
-      p->supp->count++;
-   }
-}
-
-
-
-
-/*------------------------------------------------------------*/
-/*--- Exported fns                                         ---*/
-/*------------------------------------------------------------*/
-
-/* These two are called from generated code, so that the %EIP/%EBP
-   values that we need in order to create proper error messages are
-   picked up out of VG_(baseBlock) rather than from the thread table
-   (vg_threads in vg_scheduler.c). */
-
-void VG_(record_value_error) ( Int size )
-{
-   /* Record a use of an uninitialised value of the given size (0 is
-      the conditional jump/move case); registers are read from baseBlock. */
-   ErrContext ec;
-   if (vg_ignore_errors) return;
-   clear_ErrContext( &ec );
-   ec.ekind = ValueErr;
-   ec.size  = size;
-   ec.count = 1;
-   ec.m_eip = VG_(baseBlock)[VGOFF_(m_eip)];
-   ec.m_esp = VG_(baseBlock)[VGOFF_(m_esp)];
-   ec.m_ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
-   ec.where = VG_(get_ExeContext)( False, ec.m_eip, ec.m_ebp );
-   ec.tid   = VG_(get_current_tid)();
-   VG_(maybe_add_context) ( &ec );
-}
-
-void VG_(record_address_error) ( Addr a, Int size, Bool isWrite )
-{
-   /* Record an invalid read (or write, if isWrite) of size bytes at
-      address a; register values are read from VG_(baseBlock). */
-   ErrContext ec;
-   Bool       below_esp;
-
-   if (vg_ignore_errors) return;
-
-   /* Is the access immediately below %ESP?  If so, and the user asks
-      nicely, we just ignore it; otherwise the fact is kept in
-      addrinfo.maybe_gcc for the eventual report. */
-   below_esp = VG_(is_just_below_ESP)( VG_(baseBlock)[VGOFF_(m_esp)], a );
-   if (below_esp && VG_(clo_workaround_gcc296_bugs))
-      return;
-
-   clear_ErrContext( &ec );
-   ec.ekind   = AddrErr;
-   ec.axskind = isWrite ? WriteAxs : ReadAxs;
-   ec.count   = 1;
-   ec.size    = size;
-   ec.addr    = a;
-   ec.addrinfo.akind     = Undescribed;
-   ec.addrinfo.maybe_gcc = below_esp;
-   ec.m_eip   = VG_(baseBlock)[VGOFF_(m_eip)];
-   ec.m_esp   = VG_(baseBlock)[VGOFF_(m_esp)];
-   ec.m_ebp   = VG_(baseBlock)[VGOFF_(m_ebp)];
-   ec.where   = VG_(get_ExeContext)( False, ec.m_eip, ec.m_ebp );
-   ec.tid     = VG_(get_current_tid)();
-   VG_(maybe_add_context) ( &ec );
-}
-
-
-/* These five are called not from generated code but in response to
-   requests passed back to the scheduler.  So we pick up %EIP/%EBP
-   values from the stored thread state, not from VG_(baseBlock).  */
-
-void VG_(record_free_error) ( ThreadState* tst, Addr a )
-{
-   /* Record an invalid free of address a, using tst's saved registers. */
-   ErrContext ec;
-   if (vg_ignore_errors) return;
-   clear_ErrContext( &ec );
-   ec.ekind   = FreeErr;
-   ec.count   = 1;
-   ec.addr    = a;
-   ec.addrinfo.akind = Undescribed;
-   ec.tid     = tst->tid;
-   ec.m_eip   = tst->m_eip;
-   ec.m_esp   = tst->m_esp;
-   ec.m_ebp   = tst->m_ebp;
-   ec.where   = VG_(get_ExeContext)( False, tst->m_eip, tst->m_ebp );
-   VG_(maybe_add_context) ( &ec );
-}
-
-void VG_(record_freemismatch_error) ( ThreadState* tst, Addr a )
-{
-   /* Record a mismatched free of address a, using tst's saved registers. */
-   ErrContext ec;
-   if (vg_ignore_errors) return;
-   clear_ErrContext( &ec );
-   ec.ekind   = FreeMismatchErr;
-   ec.count   = 1;
-   ec.addr    = a;
-   ec.addrinfo.akind = Undescribed;
-   ec.tid     = tst->tid;
-   ec.m_eip   = tst->m_eip;
-   ec.m_esp   = tst->m_esp;
-   ec.m_ebp   = tst->m_ebp;
-   ec.where   = VG_(get_ExeContext)( False, tst->m_eip, tst->m_ebp );
-   VG_(maybe_add_context) ( &ec );
-}
-
-void VG_(record_jump_error) ( ThreadState* tst, Addr a )
-{
-   /* Record a jump to the invalid address a, as an AddrErr/ExecAxs. */
-   ErrContext ec;
-   if (vg_ignore_errors) return;
-   clear_ErrContext( &ec );
-   ec.ekind   = AddrErr;
-   ec.axskind = ExecAxs;
-   ec.count   = 1;
-   ec.addr    = a;
-   ec.addrinfo.akind = Undescribed;
-   ec.tid     = tst->tid;
-   ec.m_eip   = tst->m_eip;
-   ec.m_esp   = tst->m_esp;
-   ec.m_ebp   = tst->m_ebp;
-   ec.where   = VG_(get_ExeContext)( False, tst->m_eip, tst->m_ebp );
-   VG_(maybe_add_context) ( &ec );
-}
-
-void VG_(record_param_err) ( ThreadState* tst, Addr a, Bool isWriteLack, 
-                             Char* msg )
-{
-   /* Record a bad syscall param; msg names it and is kept by pointer. */
-   ErrContext ec;
-   if (vg_ignore_errors) return;
-   clear_ErrContext( &ec );
-   ec.ekind   = ParamErr;
-   ec.count   = 1;
-   ec.addr    = a;
-   ec.addrinfo.akind  = Undescribed;
-   ec.syscall_param   = msg;
-   ec.isWriteableLack = isWriteLack;
-   ec.tid     = tst->tid;
-   ec.m_eip   = tst->m_eip;
-   ec.m_esp   = tst->m_esp;
-   ec.m_ebp   = tst->m_ebp;
-   ec.where   = VG_(get_ExeContext)( False, tst->m_eip, tst->m_ebp );
-   VG_(maybe_add_context) ( &ec );
-}
-
-void VG_(record_user_err) ( ThreadState* tst, Addr a, Bool isWriteLack )
-{
-   /* Record a failed client check request concerning address a. */
-   ErrContext ec;
-   if (vg_ignore_errors) return;
-   clear_ErrContext( &ec );
-   ec.ekind   = UserErr;
-   ec.count   = 1;
-   ec.addr    = a;
-   ec.addrinfo.akind  = Undescribed;
-   ec.isWriteableLack = isWriteLack;
-   ec.tid     = tst->tid;
-   ec.m_eip   = tst->m_eip;
-   ec.m_esp   = tst->m_esp;
-   ec.m_ebp   = tst->m_ebp;
-   ec.where   = VG_(get_ExeContext)( False, tst->m_eip, tst->m_ebp );
-   VG_(maybe_add_context) ( &ec );
-}
-
-void VG_(record_pthread_err) ( ThreadId tid, Char* msg )
-{
-   /* Record a pthread API error for thread tid; msg is the text shown
-      to the user and is kept by pointer.  No-op unless instrumenting. */
-   ErrContext ec;
-   if (vg_ignore_errors) return;
-   if (!VG_(clo_instrument)) return;
-   clear_ErrContext( &ec );
-   ec.ekind   = PThreadErr;
-   ec.count   = 1;
-   ec.tid     = tid;
-   ec.syscall_param = msg;
-   ec.m_eip   = VG_(threads)[tid].m_eip;
-   ec.m_esp   = VG_(threads)[tid].m_esp;
-   ec.m_ebp   = VG_(threads)[tid].m_ebp;
-   ec.where   = VG_(get_ExeContext)( False, ec.m_eip, ec.m_ebp );
-   VG_(maybe_add_context) ( &ec );
-}
-
-
-/*------------------------------*/
-
-void VG_(show_all_errors) ( void )
-{  /* Print the error summary; how much depends on VG_(clo_verbosity). */
-   Int         i, n_min;
-   Int         n_err_contexts, n_supp_contexts;
-   ErrContext  *p, *p_min;
-   Suppression *su;
-   Bool        any_supp;
-
-   if (VG_(clo_verbosity) == 0)
-      return;
-
-   n_err_contexts = 0;   /* number of unsuppressed contexts */
-   for (p = vg_err_contexts; p != NULL; p = p->next) {
-      if (p->supp == NULL)
-         n_err_contexts++;
-   }
-
-   n_supp_contexts = 0;   /* number of suppressions used at least once */
-   for (su = vg_suppressions; su != NULL; su = su->next) {
-      if (su->count > 0)
-         n_supp_contexts++;
-   }
-
-   VG_(message)(Vg_UserMsg,
-                "ERROR SUMMARY: "
-                "%d errors from %d contexts (suppressed: %d from %d)",
-                vg_n_errs_found, n_err_contexts, 
-                vg_n_errs_suppressed, n_supp_contexts );
-
-   if (VG_(clo_verbosity) <= 1)
-      return;
-
-   /* Print the contexts in order of increasing error count.  Each
-      pass picks the smallest remaining count (a selection sort). */
-   for (i = 0; i < n_err_contexts; i++) {
-      n_min = (1 << 30) - 1;
-      p_min = NULL;
-      for (p = vg_err_contexts; p != NULL; p = p->next) {
-         if (p->supp != NULL) continue;
-         if (p->count < n_min) {
-            n_min = p->count;
-            p_min = p;
-         }
-      }
-      if (p_min == NULL) VG_(panic)("pp_AllErrContexts");
-
-      VG_(message)(Vg_UserMsg, "");
-      VG_(message)(Vg_UserMsg, "%d errors in context %d of %d:",
-                   p_min->count,
-                   i+1, n_err_contexts);
-      pp_ErrContext( p_min, False );
-
-      if ((i+1 == VG_(clo_dump_error))) {
-	VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to below NULLs */,
-                         p_min->where->eips[0], NULL, NULL, NULL );
-      }
-
-      /* Mark as printed; note this destroys the stored count. */
-      p_min->count = 1 << 30;
-   } 
-
-   if (n_supp_contexts > 0) 
-      VG_(message)(Vg_DebugMsg, "");
-   any_supp = False;
-   for (su = vg_suppressions; su != NULL; su = su->next) {
-      if (su->count > 0) {
-         any_supp = True;
-         VG_(message)(Vg_DebugMsg, "supp: %4d %s", su->count, 
-                                   su->sname);
-      }
-   }
-
-   if (n_err_contexts > 0) {   /* repeat the totals after the detail */
-      if (any_supp) 
-         VG_(message)(Vg_UserMsg, "");
-      VG_(message)(Vg_UserMsg,
-                   "IN SUMMARY: "
-                   "%d errors from %d contexts (suppressed: %d from %d)",
-                   vg_n_errs_found, n_err_contexts, 
-                   vg_n_errs_suppressed,
-                   n_supp_contexts );
-      VG_(message)(Vg_UserMsg, "");
-   }
-}
-
-/*------------------------------------------------------------*/
-/*--- Standard suppressions                                ---*/
-/*------------------------------------------------------------*/
-
-/* Get a non-blank, non-comment line of at most nBuf-1 chars from fd
-   into buf (NUL-terminated; nBuf must be at least 2).  Skips leading
-   spaces on the line and strips trailing ones; an over-long line is
-   cut short.  Return True if EOF or a read error was hit instead. */
-
-#define VG_ISSPACE(ch) (((ch)==' ') || ((ch)=='\n') || ((ch)=='\t'))
-
-static Bool getLine ( Int fd, Char* buf, Int nBuf )
-{
-   Char ch;
-   Int  n, i;
-   while (True) {
-      /* First, read until a non-blank char appears. */
-      while (True) {
-         n = VG_(read)(fd, &ch, 1);
-         if (n == 1 && !VG_ISSPACE(ch)) break;
-         /* n < 0 is a read error: give up as for EOF, do not spin. */
-         if (n <= 0) return True;
-      }
-      /* Now, read the line into buf. */
-      i = 0;
-      buf[i++] = ch; buf[i] = 0;
-      while (True) {
-         n = VG_(read)(fd, &ch, 1);
-         /* EOF or error ends the line; the next call returns True. */
-         if (n <= 0 || ch == '\n') break;
-         /* Buffer full: the last slot gets overwritten. */
-         if (i > 0 && i == nBuf-1) i--;
-         buf[i++] = ch; buf[i] = 0;
-      }
-      while (i > 1 && VG_ISSPACE(buf[i-1])) { 
-         i--; buf[i] = 0; 
-      }
-      /* Ok, we have a line.  If a non-comment line, return.
-         If a comment line, start all over again. */
-      if (buf[0] != '#') return False;
-   }
-}
-
-
-/* *p_caller points at the raw text of one caller line, which should
-   read either
-       fun:some_function_name   or
-       obj:some_object_name.
-   On success, store the matching kind in *p_ty, step *p_caller past
-   the 4-character fun: / obj: tag and return True.  Otherwise print
-   a complaint and return False, leaving both untouched. */
-static Bool setLocationTy ( Char** p_caller, SuppressionLocTy* p_ty )
-{
-   Bool is_fun = VG_(strncmp)(*p_caller, "fun:", 4) == 0;
-   Bool is_obj = !is_fun && VG_(strncmp)(*p_caller, "obj:", 4) == 0;
-
-   if (!is_fun && !is_obj) {
-      VG_(printf)("location should start with fun: or obj:\n");
-      return False;
-   }
-
-   /* Skip the tag so that only the bare name remains. */
-   *p_ty = is_fun ? FunName : ObjName;
-   (*p_caller) += 4;
-   return True;
-}
-
-
-/* Read suppressions from the file specified in vg_clo_suppressions
-   and place them in the suppressions list.  If there's any difficulty
-   doing this, just give up -- there's no point in trying to recover.  
-*/
-#define STREQ(s1,s2) ((s1) != NULL && (s2) != NULL \
-                      && VG_(strcmp)((s1),(s2))==0)  /* NULL-safe string equality */
-
-/* Return a VG_AR_PRIVATE copy of str, which must not be empty. */
-static Char* copyStr ( Char* str )
-{
-   Int   len = VG_(strlen)(str);
-   Char* dup = VG_(malloc)(VG_AR_PRIVATE, len+1);
-   Char* dst = dup;
-   vg_assert(len > 0);
-   do { *dst++ = *str; } while (*str++ != 0);   /* copies the NUL too */
-   return dup;
-}
-
-/* Parse the suppressions file `filename' and push every record it
-   contains onto the front of vg_suppressions.  Lines are read with
-   getLine; a record consists of, in order:
-
-      {
-      <suppression name>
-      <kind>            Param, Value0 (or Cond), Value1/2/4/8,
-                        Addr1/2/4/8, Free or PThread
-      <syscall param>   only when <kind> is Param
-      fun:... | obj:... between one and four caller lines
-      }
-
-   Failure to open or parse the file is fatal: a message is printed
-   and Valgrind exits. */
-static void load_one_suppressions_file ( Char* filename )
-{
-#  define N_BUF 200
-   Int  fd;
-   Bool eof;
-   Char buf[N_BUF+1];
-   fd = VG_(open_read)( filename );
-   if (fd == -1) {
-      VG_(message)(Vg_UserMsg, 
-                   "FATAL: can't open suppressions file `%s'", 
-                   filename );
-      VG_(exit)(1);
-   }
-
-   while (True) {
-      Suppression* supp;
-
-      /* Look for the start of the next record before allocating
-         anything, so that reaching end-of-file here (the normal way
-         out of this loop) does not leak an unused Suppression. */
-      eof = getLine ( fd, buf, N_BUF );
-      if (eof) break;
-
-      if (!STREQ(buf, "{")) goto syntax_error;
-
-      supp = VG_(malloc)(VG_AR_PRIVATE, sizeof(Suppression));
-      supp->count = 0;
-      supp->param = supp->caller0 = supp->caller1 
-                  = supp->caller2 = supp->caller3 = NULL;
-
-      /* The suppression's name. */
-      eof = getLine ( fd, buf, N_BUF );
-      if (eof || STREQ(buf, "}")) goto syntax_error;
-      supp->sname = copyStr(buf);
-
-      /* The kind of error being suppressed. */
-      eof = getLine ( fd, buf, N_BUF );
-      if (eof) goto syntax_error;
-      else if (STREQ(buf, "Param"))  supp->skind = Param;
-      else if (STREQ(buf, "Value0")) supp->skind = Value0; /* backwards compat */
-      else if (STREQ(buf, "Cond"))   supp->skind = Value0;
-      else if (STREQ(buf, "Value1")) supp->skind = Value1;
-      else if (STREQ(buf, "Value2")) supp->skind = Value2;
-      else if (STREQ(buf, "Value4")) supp->skind = Value4;
-      else if (STREQ(buf, "Value8")) supp->skind = Value8;
-      else if (STREQ(buf, "Addr1"))  supp->skind = Addr1;
-      else if (STREQ(buf, "Addr2"))  supp->skind = Addr2;
-      else if (STREQ(buf, "Addr4"))  supp->skind = Addr4;
-      else if (STREQ(buf, "Addr8"))  supp->skind = Addr8;
-      else if (STREQ(buf, "Free"))   supp->skind = FreeS;
-      else if (STREQ(buf, "PThread")) supp->skind = PThread;
-      else goto syntax_error;
-
-      /* Param suppressions carry one extra line, kept in ->param. */
-      if (supp->skind == Param) {
-         eof = getLine ( fd, buf, N_BUF );
-         if (eof) goto syntax_error;
-         supp->param = copyStr(buf);
-      }
-
-      /* The first caller line is mandatory.  setLocationTy checks
-         the "fun:"/"obj:" prefix and advances the stored pointer
-         past it. */
-      eof = getLine ( fd, buf, N_BUF );
-      if (eof) goto syntax_error;
-      supp->caller0 = copyStr(buf);
-      if (!setLocationTy(&(supp->caller0), &(supp->caller0_ty)))
-         goto syntax_error;
-
-      /* Up to three further caller lines; the record may be closed
-         by "}" after any of them, and must be closed after the
-         fourth. */
-      eof = getLine ( fd, buf, N_BUF );
-      if (eof) goto syntax_error;
-      if (!STREQ(buf, "}")) {
-         supp->caller1 = copyStr(buf);
-         if (!setLocationTy(&(supp->caller1), &(supp->caller1_ty)))
-            goto syntax_error;
-      
-         eof = getLine ( fd, buf, N_BUF );
-         if (eof) goto syntax_error;
-         if (!STREQ(buf, "}")) {
-            supp->caller2 = copyStr(buf);
-            if (!setLocationTy(&(supp->caller2), &(supp->caller2_ty)))
-               goto syntax_error;
-
-            eof = getLine ( fd, buf, N_BUF );
-            if (eof) goto syntax_error;
-            if (!STREQ(buf, "}")) {
-               supp->caller3 = copyStr(buf);
-               if (!setLocationTy(&(supp->caller3), &(supp->caller3_ty)))
-                  goto syntax_error;
-
-               eof = getLine ( fd, buf, N_BUF );
-               if (eof || !STREQ(buf, "}")) goto syntax_error;
-            }
-         }
-      }
-
-      /* Record is complete: link it in at the head of the list. */
-      supp->next = vg_suppressions;
-      vg_suppressions = supp;
-   }
-
-   VG_(close)(fd);
-   return;
-
-  syntax_error:
-   if (eof) {
-      VG_(message)(Vg_UserMsg, 
-                   "FATAL: in suppressions file `%s': unexpected EOF", 
-                   filename );
-   } else {
-      VG_(message)(Vg_UserMsg, 
-                   "FATAL: in suppressions file `%s': syntax error on: %s", 
-                   filename, buf );
-   }
-   VG_(close)(fd);
-   VG_(message)(Vg_UserMsg, "exiting now.");
-   VG_(exit)(1);
-
-#  undef N_BUF   
-}
-
-
-/* Empty the global suppressions list, then load each file named in
-   VG_(clo_suppressions) in order, announcing each one first when
-   the verbosity level is above 1. */
-void VG_(load_suppressions) ( void )
-{
-   Int file_no = 0;
-   vg_suppressions = NULL;
-   while (file_no < VG_(clo_n_suppressions)) {
-      if (VG_(clo_verbosity) > 1)
-         VG_(message)(Vg_UserMsg, "Reading suppressions file: %s", 
-                                  VG_(clo_suppressions)[file_no] );
-      load_one_suppressions_file( VG_(clo_suppressions)[file_no] );
-      file_no++;
-   }
-}
-
-
-/* Does an error context match a suppression?  ie is this a
-   suppressible error?  If so, return a pointer to the first matching
-   Suppression record in vg_suppressions, otherwise NULL.
-
-   The object and function names of the top (up to four) callers of
-   ec are looked up once, before the list is scanned, so as to
-   minimise the number of calls to VG_(what_obj_and_fun_is_this),
-   since they are expensive.
-*/
-static Suppression* is_suppressible_error ( ErrContext* ec )
-{
-#  define STREQ(s1,s2) (s1 != NULL && s2 != NULL \
-                        && VG_(strcmp)((s1),(s2))==0)
-
-   Char caller0_obj[M_VG_ERRTXT];
-   Char caller0_fun[M_VG_ERRTXT];
-   Char caller1_obj[M_VG_ERRTXT];
-   Char caller1_fun[M_VG_ERRTXT];
-   Char caller2_obj[M_VG_ERRTXT];
-   Char caller2_fun[M_VG_ERRTXT];
-   Char caller3_obj[M_VG_ERRTXT];
-   Char caller3_fun[M_VG_ERRTXT];
-
-   Suppression* su;
-   Int          su_size;
-
-   /* vg_what_fn_or_object_is_this returns:
-         <function_name>      or
-         <object_name>        or
-         ???
-      so the strings in the suppression file should match these.
-   */
-
-   /* Initialise these strs so they are always safe to compare, even
-      if what_fn_or_object_is_this doesn't write anything to them.
-      All eight buffers must be covered: the second line clears the
-      _fun buffers (it previously cleared caller2_obj/caller3_obj a
-      second time and left caller2_fun/caller3_fun uninitialised). */
-   caller0_obj[0] = caller1_obj[0] = caller2_obj[0] = caller3_obj[0] = 0;
-   caller0_fun[0] = caller1_fun[0] = caller2_fun[0] = caller3_fun[0] = 0;
-
-   /* Callers 0 and 1 are always looked up; callers 2 and 3 only when
-      the configured backtrace size is large enough. */
-   VG_(what_obj_and_fun_is_this)
-      ( ec->where->eips[0], caller0_obj, M_VG_ERRTXT,
-                            caller0_fun, M_VG_ERRTXT );
-   VG_(what_obj_and_fun_is_this)
-      ( ec->where->eips[1], caller1_obj, M_VG_ERRTXT,
-                            caller1_fun, M_VG_ERRTXT );
-
-   if (VG_(clo_backtrace_size) > 2) {
-      VG_(what_obj_and_fun_is_this)
-         ( ec->where->eips[2], caller2_obj, M_VG_ERRTXT,
-                               caller2_fun, M_VG_ERRTXT );
-
-      if (VG_(clo_backtrace_size) > 3) {
-         VG_(what_obj_and_fun_is_this)
-            ( ec->where->eips[3], caller3_obj, M_VG_ERRTXT,
-                                  caller3_fun, M_VG_ERRTXT );
-      }
-   }
-
-   /* See if the error context matches any suppression. */
-   for (su = vg_suppressions; su != NULL; su = su->next) {
-      /* Access size implied by the suppression kind (0 = no size). */
-      switch (su->skind) {
-         case FreeS:  case PThread:
-         case Param:  case Value0: su_size = 0; break;
-         case Value1: case Addr1:  su_size = 1; break;
-         case Value2: case Addr2:  su_size = 2; break;
-         case Value4: case Addr4:  su_size = 4; break;
-         case Value8: case Addr8:  su_size = 8; break;
-         default: VG_(panic)("errcontext_matches_suppression");
-      }
-      /* The error kind (and size or syscall param, where relevant)
-         must agree with the suppression kind. */
-      switch (su->skind) {
-         case Param:
-            if (ec->ekind != ParamErr) continue;
-            if (!STREQ(su->param, ec->syscall_param)) continue;
-            break;
-         case Value0: case Value1: case Value2: case Value4: case Value8:
-            if (ec->ekind != ValueErr) continue;
-            if (ec->size  != su_size)  continue;
-            break;
-         case Addr1: case Addr2: case Addr4: case Addr8:
-            if (ec->ekind != AddrErr) continue;
-            if (ec->size  != su_size) continue;
-            break;
-         case FreeS:
-            if (ec->ekind != FreeErr 
-                && ec->ekind != FreeMismatchErr) continue;
-            break;
-         case PThread:
-            if (ec->ekind != PThreadErr) continue;
-            break;
-      }
-
-      /* Caller 0 is mandatory in every suppression. */
-      switch (su->caller0_ty) {
-         case ObjName: if (!VG_(stringMatch)(su->caller0, 
-                                             caller0_obj)) continue;
-                       break;
-         case FunName: if (!VG_(stringMatch)(su->caller0, 
-                                             caller0_fun)) continue;
-                       break;
-         default: goto baaaad;
-      }
-
-      /* Callers 1..3 are optional (NULL when absent); callers 2 and
-         3 are ignored if the backtrace is too short to hold them. */
-      if (su->caller1 != NULL) {
-         vg_assert(VG_(clo_backtrace_size) >= 2);
-         switch (su->caller1_ty) {
-            case ObjName: if (!VG_(stringMatch)(su->caller1, 
-                                                caller1_obj)) continue;
-                          break;
-            case FunName: if (!VG_(stringMatch)(su->caller1, 
-                                                caller1_fun)) continue;
-                          break;
-            default: goto baaaad;
-         }
-      }
-
-      if (VG_(clo_backtrace_size) > 2 && su->caller2 != NULL) {
-         switch (su->caller2_ty) {
-            case ObjName: if (!VG_(stringMatch)(su->caller2, 
-                                                caller2_obj)) continue;
-                          break;
-            case FunName: if (!VG_(stringMatch)(su->caller2, 
-                                                caller2_fun)) continue;
-                          break;
-            default: goto baaaad;
-         }
-      }
-
-      if (VG_(clo_backtrace_size) > 3 && su->caller3 != NULL) {
-         switch (su->caller3_ty) {
-            case ObjName: if (!VG_(stringMatch)(su->caller3,
-                                                caller3_obj)) continue;
-                          break;
-            case FunName: if (!VG_(stringMatch)(su->caller3, 
-                                                caller3_fun)) continue;
-                          break;
-            default: goto baaaad;
-         }
-      }
-
-      /* Every check passed: this suppression applies. */
-      return su;
-   }
-
-   return NULL;
-
-  baaaad:
-   VG_(panic)("is_suppressible_error");
-
-#  undef STREQ
-}
-
-/*--------------------------------------------------------------------*/
-/*--- end                                          vg_errcontext.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_execontext.c b/coregrind/vg_execontext.c
deleted file mode 100644
index 4da1b31e18..0000000000
--- a/coregrind/vg_execontext.c
+++ /dev/null
@@ -1,258 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Storage, and equality on, execution contexts (backtraces).   ---*/
-/*---                                              vg_execontext.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-#include "vg_constants.h"
-
-
-/*------------------------------------------------------------*/
-/*--- Low-level ExeContext storage.                        ---*/
-/*------------------------------------------------------------*/
-
-/* The idea is only to ever store any one context once, so as to save
-   space and make exact comparisons faster. */
-
-static ExeContext* vg_ec_list[VG_N_EC_LISTS];
-
-/* Stats only: the number of times the system was searched to locate a
-   context. */
-static UInt vg_ec_searchreqs;
-
-/* Stats only: the number of full context comparisons done. */
-static UInt vg_ec_searchcmps;
-
-/* Stats only: total number of stored contexts. */
-static UInt vg_ec_totstored;
-
-/* Number of 2, 4 and (fast) full cmps done. */
-static UInt vg_ec_cmp2s;
-static UInt vg_ec_cmp4s;
-static UInt vg_ec_cmpAlls;
-
-
-/*------------------------------------------------------------*/
-/*--- Exported functions.                                  ---*/
-/*------------------------------------------------------------*/
-
-
-/* Initialise this subsystem: empty every hash chain and zero all the
-   statistics counters. */
-void VG_(init_ExeContext_storage) ( void )
-{
-   Int bucket;
-
-   for (bucket = VG_N_EC_LISTS - 1; bucket >= 0; bucket--)
-      vg_ec_list[bucket] = NULL;
-
-   vg_ec_searchreqs = vg_ec_searchcmps = vg_ec_totstored = 0;
-   vg_ec_cmp2s = vg_ec_cmp4s = vg_ec_cmpAlls = 0;
-}
-
-
-/* Show stats: three debug messages covering list occupancy, search
-   cost, and the number of each kind of comparison done. */
-void VG_(show_ExeContext_stats) ( void )
-{
-   /* Full compares per 1000 searches; reported as 0 when no search
-      has been made yet.  Computed in 64 bits, as the product may not
-      fit in a UInt. */
-   UInt cmps_per_1000 = 0;
-   if (vg_ec_searchreqs != 0) {
-      ULong scaled = ((ULong)vg_ec_searchcmps) * 1000;
-      cmps_per_1000 = (UInt)( scaled / ((ULong)vg_ec_searchreqs) );
-   }
-
-   VG_(message)(Vg_DebugMsg, 
-      "exectx: %d lists, %d contexts (avg %d per list)",
-      VG_N_EC_LISTS, vg_ec_totstored, 
-      vg_ec_totstored / VG_N_EC_LISTS );
-   VG_(message)(Vg_DebugMsg, 
-      "exectx: %d searches, %d full compares (%d per 1000)",
-      vg_ec_searchreqs, vg_ec_searchcmps, cmps_per_1000 );
-   VG_(message)(Vg_DebugMsg, 
-      "exectx: %d cmp2, %d cmp4, %d cmpAll",
-      vg_ec_cmp2s, vg_ec_cmp4s, vg_ec_cmpAlls );
-}
-
-
-/* Print an ExeContext, by handing it to VG_(mini_stack_dump). */
-void VG_(pp_ExeContext) ( ExeContext* e )
-{
-   VG_(mini_stack_dump) ( e );
-}
-
-
-/* Compare two ExeContexts on all callers.  This is done as a plain
-   pointer comparison.  Also bumps the cmpAll statistics counter. */
-Bool VG_(eq_ExeContext_all) ( ExeContext* e1, ExeContext* e2 )
-{
-   vg_ec_cmpAlls++;
-   return (e1 == e2) ? True : False;
-}
-
-
-/* Compare two ExeContexts on their top two callers only.  Also bumps
-   the cmp2 statistics counter. */
-Bool VG_(eq_ExeContext_top2) ( ExeContext* e1, ExeContext* e2 )
-{
-   vg_ec_cmp2s++;
-   if (e1->eips[0] == e2->eips[0]
-       && e1->eips[1] == e2->eips[1]) 
-      return True;
-   return False;
-}
-
-
-/* Compare two ExeContexts on their top four callers, or on as many
-   of those as VG_(clo_backtrace_size) says are present.  Also bumps
-   the cmp4 statistics counter. */
-Bool VG_(eq_ExeContext_top4) ( ExeContext* e1, ExeContext* e2 )
-{
-   vg_ec_cmp4s++;
-
-   /* Entries 0 and 1 are always compared. */
-   if (e1->eips[0] != e2->eips[0]) return False;
-   if (e1->eips[1] != e2->eips[1]) return False;
-
-   /* Entries 2 and 3 are compared only if the backtrace is that deep. */
-   if (VG_(clo_backtrace_size) >= 3) {
-      if (e1->eips[2] != e2->eips[2]) return False;
-      if (VG_(clo_backtrace_size) >= 4
-          && e1->eips[3] != e2->eips[3]) return False;
-   }
-
-   return True;
-}
-
-
-/* This guy is the head honcho here.  Take a snapshot of the client's
-   stack.  Search our collection of ExeContexts to see if we already
-   have it, and if not, allocate a new one.  Either way, return a
-   pointer to the context.  If there is a matching context we
-   guarantee to not allocate a new one.  Thus we never store
-   duplicates, and so exact equality can be quickly done as equality
-   on the returned ExeContext* values themselves.  Inspired by Hugs's
-   Text type.  
-
-   In order to be thread-safe, we pass in the thread's %EIP and %EBP.
-
-   skip_top_frame: if True, eip is ignored and every entry, including
-                   the first, comes from walking the %ebp chain; if
-                   False, eip becomes entry 0 and the walk supplies
-                   the remaining entries.
-   eip, ebp:       the thread's program counter and frame pointer.
-
-   Exactly VG_(clo_backtrace_size) entries are recorded, hashed and
-   compared.
-*/
-ExeContext* VG_(get_ExeContext) ( Bool skip_top_frame,
-                                  Addr eip, Addr ebp )
-{
-   Int         i;
-   Addr        eips[VG_DEEPEST_BACKTRACE];
-   Bool        same;
-   UInt        hash;
-   ExeContext* new_ec;
-   ExeContext* list;
-
-   VGP_PUSHCC(VgpExeContext);
-
-   vg_assert(VG_(clo_backtrace_size) >= 2 
-             && VG_(clo_backtrace_size) <= VG_DEEPEST_BACKTRACE);
-
-   /* First snaffle %EIPs from the client's stack into eips[0
-      .. VG_(clo_backtrace_size)-1], putting zeroes in when the trail
-      goes cold.  GET_CALLER reads the return address and the saved
-      %ebp from the frame at ebp, provided ebp is non-zero and those 8
-      bytes pass VGM_(check_readable); otherwise it zeroes both the
-      entry and ebp, so every later entry comes out as zero too. */
-
-   for (i = 0; i < VG_(clo_backtrace_size); i++)
-      eips[i] = 0;
-   
-#  define GET_CALLER(lval)                                        \
-   if (ebp != 0 && VGM_(check_readable)(ebp, 8, NULL)) {          \
-      lval = ((UInt*)ebp)[1];  /* ret addr */                     \
-      ebp  = ((UInt*)ebp)[0];  /* old ebp */                      \
-   } else {                                                       \
-      lval = ebp = 0;                                             \
-   }
-
-   if (skip_top_frame) {
-      for (i = 0; i < VG_(clo_backtrace_size); i++)
-         GET_CALLER(eips[i]);
-   } else {
-      eips[0] = eip;
-      for (i = 1; i < VG_(clo_backtrace_size); i++)
-         GET_CALLER(eips[i]);
-   }
-#  undef GET_CALLER
-
-   /* Now figure out if we've seen this one before.  First hash it so
-      as to determine the list number.  The hash xors in each entry
-      and then rotates the 32-bit value right by 3 bits. */
-
-   hash = 0;
-   for (i = 0; i < VG_(clo_backtrace_size); i++) {
-      hash ^= (UInt)eips[i];
-      hash = (hash << 29) | (hash >> 3);
-   }
-   hash = hash % VG_N_EC_LISTS;
-
-   /* And (the expensive bit) look for a matching entry in the list. */
-
-   vg_ec_searchreqs++;
-
-   list = vg_ec_list[hash];
-
-   while (True) {
-      if (list == NULL) break;
-      vg_ec_searchcmps++;
-      same = True;
-      for (i = 0; i < VG_(clo_backtrace_size); i++) {
-         if (list->eips[i] != eips[i]) {
-            same = False;
-            break; 
-         }
-      }
-      if (same) break;
-      list = list->next;
-   }
-
-   if (list != NULL) {
-      /* Yay!  We found it.  */
-      VGP_POPCC;
-      return list;
-   }
-
-   /* Bummer.  We have to allocate a new context record: room for one
-      pointer (presumably the `next' link -- confirm against the
-      ExeContext declaration) plus VG_(clo_backtrace_size) addresses.
-      The new record goes at the head of its hash chain. */
-   vg_ec_totstored++;
-
-   new_ec 
-      = VG_(malloc)( 
-           VG_AR_EXECTXT, 
-           sizeof(struct _ExeContextRec *) 
-              + VG_(clo_backtrace_size) * sizeof(Addr) 
-        );
-
-   for (i = 0; i < VG_(clo_backtrace_size); i++)
-      new_ec->eips[i] = eips[i];
-
-   new_ec->next = vg_ec_list[hash];
-   vg_ec_list[hash] = new_ec;
-
-   VGP_POPCC;
-   return new_ec;
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                          vg_execontext.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
deleted file mode 100644
index 573ee93271..0000000000
--- a/coregrind/vg_from_ucode.c
+++ /dev/null
@@ -1,2647 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- The JITter: translate ucode back to x86 code.                ---*/
-/*---                                              vg_from_ucode.c ---*/
-/*--------------------------------------------------------------------*/
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-
-/*------------------------------------------------------------*/
-/*--- Renamings of frequently-used global functions.       ---*/
-/*------------------------------------------------------------*/
-
-#define dis       VG_(disassemble)
-#define nameIReg  VG_(nameOfIntReg)
-#define nameISize VG_(nameOfIntSize)
-
-
-/*------------------------------------------------------------*/
-/*--- Instruction emission -- turning final uinstrs back   ---*/
-/*--- into x86 code.                                       ---*/
-/*------------------------------------------------------------*/
-
-/* [2001-07-08 This comment is now somewhat out of date.]
-
-   This is straightforward but for one thing: to facilitate generating
-   code in a single pass, we generate position-independent code.  To
-   do this, calls and jmps to fixed addresses must specify the address
-   by first loading it into a register, and jump to/call that
-   register.  Fortunately, the only jump to a literal is the jump back
-   to vg_dispatch, and only %eax is live then, conveniently.  Ucode
-   call insns may only have a register as target anyway, so there's no
-   need to do anything fancy for them.
-
-   The emit_* routines constitute the lowest level of instruction
-   emission.  They simply emit the sequence of bytes corresponding to
-   the relevant instruction, with no further ado.  In particular there
-   is no checking about whether uses of byte registers makes sense,
-   nor whether shift insns have their first operand in %cl, etc.
-
-   These issues are taken care of by the level above, the synth_*
-   routines.  These detect impossible operand combinations and turn
-   them into sequences of legal instructions.  Finally, emitUInstr is
-   phrased in terms of the synth_* abstraction layer.  */
-
-static UChar* emitted_code;
-static Int    emitted_code_used;
-static Int    emitted_code_size;
-
-/* Double the capacity of the emitted-code buffer: allocate a buffer
-   twice the size, copy the old contents across, and free the old
-   buffer. */
-static void expandEmittedCode ( void )
-{
-   Int    new_size = 2 * emitted_code_size;
-   UChar* bigger   = VG_(jitmalloc)(new_size);
-   Int    k        = 0;
-   while (k < emitted_code_size) {
-      bigger[k] = emitted_code[k];
-      k++;
-   }
-   VG_(jitfree)(emitted_code);
-   emitted_code      = bigger;
-   emitted_code_size = new_size;
-}
-
-/* Append the low 8 bits of b to the emitted code, growing the buffer
-   first if it is full.  When disassembly printing is on, the byte is
-   also printed as two hex digits (zero-padded below 0x10). */
-static __inline__ void emitB ( UInt b )
-{
-   if (dis)
-      VG_(printf)( (b < 16) ? "0%x " : "%2x ", b );
-
-   if (emitted_code_used == emitted_code_size)
-      expandEmittedCode();
-
-   emitted_code[emitted_code_used++] = (UChar)b;
-}
-
-/* Emit the low 16 bits of l, least significant byte first. */
-static __inline__ void emitW ( UInt l )
-{
-   UInt lo = l & 0x000000FF;
-   UInt hi = (l >> 8) & 0x000000FF;
-   emitB ( lo );
-   emitB ( hi );
-}
-
-/* Emit all 32 bits of l, least significant byte first. */
-static __inline__ void emitL ( UInt l )
-{
-   Int shift;
-   for (shift = 0; shift < 32; shift += 8)
-      emitB ( (l >> shift) & 0x000000FF );
-}
-
-/* Called at the start of each emitted instruction.  When disassembly
-   printing is on, prints the current offset into the emitted code;
-   otherwise does nothing. */
-static __inline__ void newEmit ( void )
-{
-   if (!dis)
-      return;
-   VG_(printf)("\t       %4d: ", emitted_code_used );
-}
-
-/* Is this a callee-save register, in the normal C calling convention?
-   The argument is parenthesised so that any expression may be passed
-   safely; note that it is evaluated up to three times. */
-#define VG_CALLEE_SAVED(reg) \
-   ((reg) == R_EBX || (reg) == R_ESI || (reg) == R_EDI)
-
-
-/*----------------------------------------------------*/
-/*--- Addressing modes                             ---*/
-/*----------------------------------------------------*/
-
-/* Assemble an x86 ModRM byte: mod in bits 7..6, reg in bits 5..3 and
-   regmem in bits 2..0.  Each field is masked to its width. */
-static __inline__ UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
-{
-   UInt modrm = regmem & 7;
-   modrm |= (reg & 7) << 3;
-   modrm |= (mod & 3) << 6;
-   return (UChar)modrm;
-}
-
-/* Assemble an x86 SIB byte: log2(scale) in bits 7..6, regindex in
-   bits 5..3 and regbase in bits 2..0.  scale must be 1, 2, 4 or 8;
-   anything else panics. */
-static __inline__ UChar mkSIB ( Int scale, Int regindex, Int regbase )
-{
-   Int shift;
-   if      (scale == 1) shift = 0;
-   else if (scale == 2) shift = 1;
-   else if (scale == 4) shift = 2;
-   else if (scale == 8) shift = 3;
-   else VG_(panic)( "mkSIB" );
-   return ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7);
-}
-
-/* Emit the addressing-mode bytes for `($ADDR), reg': a ModRM byte
-   with mod=0 and r/m=5, followed by the 32-bit address. */
-static __inline__ void emit_amode_litmem_reg ( Addr addr, Int reg )
-{
-   UChar modrm = mkModRegRM(0, reg, 5);
-   emitB ( modrm );
-   emitL ( addr );
-}
-
-/* Emit the addressing-mode bytes for `(regmem), reg'.  %esp is not
-   supported as regmem and panics. */
-static __inline__ void emit_amode_regmem_reg ( Int regmem, Int reg )
-{
-   if (regmem == R_ESP) 
-      VG_(panic)("emit_amode_regmem_reg");
-
-   if (regmem != R_EBP) {
-      /* Ordinary case: mod=0, no displacement. */
-      emitB( mkModRegRM(0, reg, regmem) );
-      return;
-   }
-
-   /* %ebp is emitted as 0(%ebp): mod=1 with a zero 8-bit displacement. */
-   emitB ( mkModRegRM(1, reg, 5) );
-   emitB ( 0x00 );
-}
-
-/* Emit the addressing-mode bytes for `off(regmem), reg', using an
-   8-bit displacement when off fits in a signed byte and a 32-bit one
-   otherwise.  %esp is not supported as regmem and panics. */
-static __inline__ void emit_amode_offregmem_reg ( Int off, Int regmem, Int reg )
-{
-   if (regmem == R_ESP)
-      VG_(panic)("emit_amode_offregmem_reg(ESP)");
-
-   if (off >= -128 && off <= 127) {
-      /* d8(regmem), reg */
-      emitB ( mkModRegRM(1, reg, regmem) );
-      emitB ( off & 0xFF );
-      return;
-   }
-
-   /* d32(regmem), reg */
-   emitB ( mkModRegRM(2, reg, regmem) );
-   emitL ( off );
-}
-
-/* Emit the addressing-mode bytes for `off(regbase,regindex,scale),
-   reg': a ModRM byte with r/m=4 (SIB follows), the SIB byte, and
-   then an 8-bit displacement if off fits in a signed byte or a
-   32-bit one otherwise.  %esp as regindex panics. */
-static __inline__ void emit_amode_sib_reg ( Int off, Int scale, Int regbase, 
-                                            Int regindex, Int reg )
-{
-   Bool small_disp;
-
-   if (regindex == R_ESP)
-      VG_(panic)("emit_amode_sib_reg(ESP)");
-
-   small_disp = (off >= -128 && off <= 127);
-
-   /* mod=1 selects an 8-bit displacement, mod=2 a 32-bit one. */
-   emitB ( mkModRegRM(small_disp ? 1 : 2, reg, 4) );
-   emitB ( mkSIB( scale, regindex, regbase ) );
-   if (small_disp)
-      emitB ( off & 0xFF );
-   else
-      emitL ( off );
-}
-
-/* Emit a register-direct (mod=3) ModRM byte with g_reg in the reg
-   field and e_reg in the r/m field.  Arguments are given E first. */
-static __inline__ void emit_amode_ereg_greg ( Int e_reg, Int g_reg )
-{
-   UChar modrm = mkModRegRM(3, g_reg, e_reg);
-   emitB ( modrm );
-}
-
-/* Emit a register-direct (mod=3) ModRM byte with g_reg in the reg
-   field and e_reg in the r/m field.  Arguments are given G first. */
-static __inline__ void emit_amode_greg_ereg ( Int g_reg, Int e_reg )
-{
-   UChar modrm = mkModRegRM(3, g_reg, e_reg);
-   emitB ( modrm );
-}
-
-
-/*----------------------------------------------------*/
-/*--- Opcode translation                           ---*/
-/*----------------------------------------------------*/
-
-/* Sub-opcode (ModRM reg-field value) selecting opc within x86 opcode
-   group 1.  Panics on any opcode not listed. */
-static __inline__ Int mkGrp1opcode ( Opcode opc )
-{
-   if (opc == ADD) return 0;
-   if (opc == OR)  return 1;
-   if (opc == ADC) return 2;
-   if (opc == SBB) return 3;
-   if (opc == AND) return 4;
-   if (opc == SUB) return 5;
-   if (opc == XOR) return 6;
-   VG_(panic)("mkGrp1opcode");
-}
-
-/* Sub-opcode (ModRM reg-field value) selecting the shift or rotate
-   opc within x86 opcode group 2.  Panics on any opcode not listed. */
-static __inline__ Int mkGrp2opcode ( Opcode opc )
-{
-   if (opc == ROL) return 0;
-   if (opc == ROR) return 1;
-   if (opc == RCL) return 2;
-   if (opc == RCR) return 3;
-   if (opc == SHL) return 4;
-   if (opc == SHR) return 5;
-   if (opc == SAR) return 7;
-   VG_(panic)("mkGrp2opcode");
-}
-
-/* Sub-opcode (ModRM reg-field value) selecting opc within x86 opcode
-   group 3.  Only NOT and NEG are handled; anything else panics. */
-static __inline__ Int mkGrp3opcode ( Opcode opc )
-{
-   if (opc == NOT) return 2;
-   if (opc == NEG) return 3;
-   VG_(panic)("mkGrp3opcode");
-}
-
-/* Sub-opcode (ModRM reg-field value) selecting opc within x86 opcode
-   group 4.  Only INC and DEC are handled; anything else panics. */
-static __inline__ Int mkGrp4opcode ( Opcode opc )
-{
-   if (opc == INC) return 0;
-   if (opc == DEC) return 1;
-   VG_(panic)("mkGrp4opcode");
-}
-
-/* Sub-opcode (ModRM reg-field value) selecting opc within x86 opcode
-   group 5.  Only CALLM and JMP are handled; anything else panics. */
-static __inline__ Int mkGrp5opcode ( Opcode opc )
-{
-   if (opc == CALLM) return 2;
-   if (opc == JMP)   return 4;
-   VG_(panic)("mkGrp5opcode");
-}
-
-/* Base value of the primary opcode byte for the two-operand ALU
-   instruction opc; callers add a small constant to it to select the
-   operand form.  Panics on any opcode not listed. */
-static __inline__ UChar mkPrimaryOpcode ( Opcode opc )
-{
-   if (opc == ADD) return 0x00;
-   if (opc == OR)  return 0x08;
-   if (opc == ADC) return 0x10;
-   if (opc == SBB) return 0x18;
-   if (opc == AND) return 0x20;
-   if (opc == SUB) return 0x28;
-   if (opc == XOR) return 0x30;
-   VG_(panic)("mkPrimaryOpcode");
-}
-
-/*----------------------------------------------------*/
-/*--- v-size (4, or 2 with OSO) insn emitters      ---*/
-/*----------------------------------------------------*/
-
-/* Emit `mov off(areg), reg' for operand size sz; sz == 2 gets the
-   0x66 operand-size prefix. */
-static void emit_movv_offregmem_reg ( Int sz, Int off, Int areg, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 0x8B ); /* MOV Ev, Gv */
-   emit_amode_offregmem_reg ( off, areg, reg );
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\tmov%c\t0x%x(%s), %s\n", 
-                nameISize(sz), off, nameIReg(4,areg), nameIReg(sz,reg));
-}
-
-/* Emit `mov reg, off(areg)' for operand size sz; sz == 2 gets the
-   0x66 operand-size prefix. */
-static void emit_movv_reg_offregmem ( Int sz, Int reg, Int off, Int areg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 0x89 ); /* MOV Gv, Ev */
-   emit_amode_offregmem_reg ( off, areg, reg );
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\tmov%c\t%s, 0x%x(%s)\n", 
-                nameISize(sz), nameIReg(sz,reg), off, nameIReg(4,areg));
-}
-
-/* Emit `mov (reg1), reg2' for operand size sz; sz == 2 gets the 0x66
-   operand-size prefix. */
-static void emit_movv_regmem_reg ( Int sz, Int reg1, Int reg2 )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 0x8B ); /* MOV Ev, Gv */
-   emit_amode_regmem_reg ( reg1, reg2 );
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\tmov%c\t(%s), %s\n",
-                nameISize(sz),  nameIReg(4,reg1), nameIReg(sz,reg2));
-}
-
-/* Emit `mov reg1, (reg2)' for operand size sz; sz == 2 gets the 0x66
-   operand-size prefix. */
-static void emit_movv_reg_regmem ( Int sz, Int reg1, Int reg2 )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 0x89 ); /* MOV Gv, Ev */
-   emit_amode_regmem_reg ( reg2, reg1 );
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\tmov%c\t%s, (%s)\n", 
-                nameISize(sz), nameIReg(sz,reg1), nameIReg(4,reg2));
-}
-
-/* Emit `mov reg1, reg2' (reg1 is the source) for operand size sz;
-   sz == 2 gets the 0x66 operand-size prefix. */
-static void emit_movv_reg_reg ( Int sz, Int reg1, Int reg2 )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 0x89 ); /* MOV Gv, Ev */
-   emit_amode_ereg_greg ( reg2, reg1 );
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\tmov%c\t%s, %s\n", 
-                nameISize(sz), nameIReg(sz,reg1), nameIReg(sz,reg2));
-}
-
-/* Emit `opc $lit, reg' for a group-1 ALU opcode and operand size sz.
-   The one-byte immediate form is used whenever lit equals the sign
-   extension of its own low byte; otherwise a full sz-sized immediate
-   is emitted. */
-static void emit_nonshiftopv_lit_reg ( Int sz, Opcode opc, 
-                                       UInt lit, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   if (lit != VG_(extend_s_8to32)(lit & 0x000000FF)) {
-      /* long form needed */
-      emitB ( 0x81 ); /* Grp1 Iv,Ev */
-      emit_amode_ereg_greg ( reg, mkGrp1opcode(opc) );
-      if (sz == 2) 
-         emitW ( lit ); 
-      else 
-         emitL ( lit );
-   } else {
-      /* short form OK */
-      emitB ( 0x83 ); /* Grp1 Ib,Ev */
-      emit_amode_ereg_greg ( reg, mkGrp1opcode(opc) );
-      emitB ( lit & 0x000000FF );
-   }
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\t%s%c\t$0x%x, %s\n", 
-                VG_(nameUOpcode)(False,opc), nameISize(sz), 
-                lit, nameIReg(sz,reg));
-}
-
-/* Emit `opc $lit, reg' for a group-2 shift or rotate opcode and
-   operand size sz.  The shift count is emitted as a single byte. */
-static void emit_shiftopv_lit_reg ( Int sz, Opcode opc, UInt lit, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 0xC1 ); /* Grp2 Ib,Ev */
-   emit_amode_ereg_greg ( reg, mkGrp2opcode(opc) );
-   emitB ( lit );
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\t%s%c\t$%d, %s\n", 
-                VG_(nameUOpcode)(False,opc), nameISize(sz), 
-                lit, nameIReg(sz,reg));
-}
-
-/* Emit `opc %cl, 0(%esp)' for a group-2 shift or rotate opcode and
-   operand size sz: the value on top of the stack is shifted by the
-   count in %cl. */
-static void emit_shiftopv_cl_stack0 ( Int sz, Opcode opc )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 0xD3 ); /* Grp2 CL,Ev */
-   /* mod=1, r/m=4: SIB byte and an 8-bit displacement follow. */
-   emitB ( mkModRegRM ( 1, mkGrp2opcode(opc), 4 ) );
-   emitB ( 0x24 ); /* SIB: no index, base %esp */
-   emitB ( 0x00 ); /* the d8 displacement */
-   if (!dis)
-      return;
-   VG_(printf)("\n\t\t%s%c %%cl, 0(%%esp)\n",
-               VG_(nameUOpcode)(False,opc), nameISize(sz) );
-}
-
-/* Byte-sized variant of emit_shiftopv_cl_stack0: emit
-   `opc %cl, 0(%esp)' operating on a single byte. */
-static void emit_shiftopb_cl_stack0 ( Opcode opc )
-{
-   newEmit();
-   emitB ( 0xD2 ); /* Grp2 CL,Eb */
-   /* mod=1, r/m=4: SIB byte and an 8-bit displacement follow. */
-   emitB ( mkModRegRM ( 1, mkGrp2opcode(opc), 4 ) );
-   emitB ( 0x24 ); /* SIB: no index, base %esp */
-   emitB ( 0x00 ); /* the d8 displacement */
-   if (!dis)
-      return;
-   VG_(printf)("\n\t\t%s%c %%cl, 0(%%esp)\n",
-               VG_(nameUOpcode)(False,opc), nameISize(1) );
-}
-
-/* Emit `opc off(areg), reg' for a two-operand ALU opcode and operand
-   size sz; sz == 2 gets the 0x66 operand-size prefix. */
-static void emit_nonshiftopv_offregmem_reg ( Int sz, Opcode opc, 
-                                             Int off, Int areg, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 3 + mkPrimaryOpcode(opc) ); /* op Ev, Gv */
-   emit_amode_offregmem_reg ( off, areg, reg );
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\t%s%c\t0x%x(%s), %s\n", 
-                VG_(nameUOpcode)(False,opc), nameISize(sz),
-                off, nameIReg(4,areg), nameIReg(sz,reg));
-}
-
-/* Emit `opc reg1, reg2' for a two-operand ALU opcode and operand
-   size sz.
-
-   The `op Gv, Ev' encoding (primary opcode + 1) is used.  The
-   `op Ev, Gv' encoding (primary opcode + 3, with the registers the
-   other way round in the ModRM byte) would be perfectly correct too,
-   but the GNU assembler uses the Gv,Ev form, so we do the same to aid
-   verification. */
-static void emit_nonshiftopv_reg_reg ( Int sz, Opcode opc, 
-                                       Int reg1, Int reg2 )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   emitB ( 1 + mkPrimaryOpcode(opc) ); /* op Gv, Ev */
-   emit_amode_greg_ereg ( reg1, reg2 );
-   if (!dis)
-      return;
-   VG_(printf)( "\n\t\t%s%c\t%s, %s\n", 
-                VG_(nameUOpcode)(False,opc), nameISize(sz), 
-                nameIReg(sz,reg1), nameIReg(sz,reg2));
-}
-
-/* Load the literal lit into reg, for operand size sz.  Zero is
-   loaded by xor-ing reg with itself; anything else uses a
-   move-immediate. */
-static void emit_movv_lit_reg ( Int sz, UInt lit, Int reg )
-{
-   if (lit != 0) {
-      newEmit();
-      if (sz == 2) {
-         emitB ( 0x66 );
-      }
-      emitB ( 0xB8+reg ); /* MOV imm, Gv */
-      if (sz == 2) 
-         emitW ( lit ); 
-      else 
-         emitL ( lit );
-      if (dis)
-         VG_(printf)( "\n\t\tmov%c\t$0x%x, %s\n", 
-                      nameISize(sz), lit, nameIReg(sz,reg));
-   } else {
-      emit_nonshiftopv_reg_reg ( sz, XOR, reg, reg );
-   }
-}
-
-/* Emit the one-operand instruction opc (NEG, NOT, DEC or INC) on reg
-   for operand size sz.  NEG and NOT use opcode 0xF7 with a group-3
-   sub-opcode; DEC and INC use the one-byte 0x48+reg / 0x40+reg
-   forms.  Any other opcode panics. */
-static void emit_unaryopv_reg ( Int sz, Opcode opc, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   }
-   switch (opc) {
-      case NEG: 
-      case NOT:
-         emitB ( 0xF7 );
-         emit_amode_ereg_greg ( reg, mkGrp3opcode(opc) );
-         if (dis)
-            VG_(printf)( (opc == NEG) ? "\n\t\tneg%c\t%s\n"
-                                      : "\n\t\tnot%c\t%s\n", 
-                         nameISize(sz), nameIReg(sz,reg));
-         break;
-      case DEC: 
-      case INC:
-         emitB ( ((opc == DEC) ? 0x48 : 0x40) + reg );
-         if (dis)
-            VG_(printf)( (opc == DEC) ? "\n\t\tdec%c\t%s\n"
-                                      : "\n\t\tinc%c\t%s\n", 
-                         nameISize(sz), nameIReg(sz,reg));
-         break;
-      default: 
-         VG_(panic)("emit_unaryopv_reg");
-   }
-}
-
-static void emit_pushv_reg ( Int sz, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 ); 
-   } else {
-      vg_assert(sz == 4);
-   }
-   emitB ( 0x50 + reg );
-   if (dis)
-      VG_(printf)("\n\t\tpush%c %s\n", nameISize(sz), nameIReg(sz,reg));
-}
-
-static void emit_popv_reg ( Int sz, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 ); 
-   } else {
-      vg_assert(sz == 4);
-   }
-   emitB ( 0x58 + reg );
-   if (dis)
-      VG_(printf)("\n\t\tpop%c %s\n", nameISize(sz), nameIReg(sz,reg));
-}
-
-static void emit_pushl_lit8 ( Int lit8 )
-{
-   vg_assert(lit8 >= -128 && lit8 < 128);
-   newEmit();
-   emitB ( 0x6A );
-   emitB ( (UChar)((UInt)lit8) );
-   if (dis)
-      VG_(printf)("\n\t\tpushl $%d\n", lit8 );
-}
-
-static void emit_pushl_lit32 ( UInt int32 )
-{
-   newEmit();
-   emitB ( 0x68 );
-   emitL ( int32 );
-   if (dis)
-      VG_(printf)("\n\t\tpushl $0x%x\n", int32 );
-}
-
-static void emit_cmpl_zero_reg ( Int reg )
-{
-   newEmit();
-   emitB ( 0x83 );
-   emit_amode_ereg_greg ( reg, 7 /* Grp 3 opcode for CMP */ );
-   emitB ( 0x00 );
-   if (dis)
-      VG_(printf)("\n\t\tcmpl $0, %s\n", nameIReg(4,reg));
-}
-
-static void emit_swapl_reg_ECX ( Int reg )
-{
-   newEmit();
-   emitB ( 0x87 ); /* XCHG Gv,Ev */
-   emit_amode_ereg_greg ( reg, R_ECX );
-   if (dis) 
-      VG_(printf)("\n\t\txchgl %%ecx, %s\n", nameIReg(4,reg));
-}
-
-static void emit_swapl_reg_EAX ( Int reg )
-{
-   newEmit();
-   emitB ( 0x90 + reg ); /* XCHG Gv,eAX */
-   if (dis) 
-      VG_(printf)("\n\t\txchgl %%eax, %s\n", nameIReg(4,reg));
-}
-
-static void emit_swapl_reg_reg ( Int reg1, Int reg2 )
-{
-   newEmit();
-   emitB ( 0x87 ); /* XCHG Gv,Ev */
-   emit_amode_ereg_greg ( reg1, reg2 );
-   if (dis) 
-      VG_(printf)("\n\t\txchgl %s, %s\n", nameIReg(4,reg1), 
-                  nameIReg(4,reg2));
-}
-
-static void emit_bswapl_reg ( Int reg )
-{
-   newEmit();
-   emitB ( 0x0F );
-   emitB ( 0xC8 + reg ); /* BSWAP r32 */
-   if (dis) 
-      VG_(printf)("\n\t\tbswapl %s\n", nameIReg(4,reg));
-}
-
-static void emit_movl_reg_reg ( Int regs, Int regd )
-{
-   newEmit();
-   emitB ( 0x89 ); /* MOV Gv,Ev */
-   emit_amode_ereg_greg ( regd, regs );
-   if (dis) 
-      VG_(printf)("\n\t\tmovl %s, %s\n", nameIReg(4,regs), nameIReg(4,regd));
-}
-
-static void emit_testv_lit_reg ( Int sz, UInt lit, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   } else {
-      vg_assert(sz == 4);
-   }
-   emitB ( 0xF7 ); /* Grp3 Ev */
-   emit_amode_ereg_greg ( reg, 0 /* Grp3 subopcode for TEST */ );
-   if (sz == 2) emitW ( lit ); else emitL ( lit );
-   if (dis)
-      VG_(printf)("\n\t\ttest%c $0x%x, %s\n", nameISize(sz), 
-                                            lit, nameIReg(sz,reg));
-}
-
-static void emit_testv_lit_offregmem ( Int sz, UInt lit, Int off, Int reg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   } else {
-      vg_assert(sz == 4);
-   }
-   emitB ( 0xF7 ); /* Grp3 Ev */
-   emit_amode_offregmem_reg ( off, reg, 0 /* Grp3 subopcode for TEST */ );
-   if (sz == 2) emitW ( lit ); else emitL ( lit );
-   if (dis)
-      VG_(printf)("\n\t\ttest%c $%d, 0x%x(%s)\n", 
-                  nameISize(sz), lit, off, nameIReg(4,reg) );
-}
-
-static void emit_movv_lit_offregmem ( Int sz, UInt lit, Int off, Int memreg )
-{
-   newEmit();
-   if (sz == 2) {
-      emitB ( 0x66 );
-   } else {
-      vg_assert(sz == 4);
-   }
-   emitB ( 0xC7 ); /* Grp11 Ev */
-   emit_amode_offregmem_reg ( off, memreg, 0 /* Grp11 subopcode for MOV */ );
-   if (sz == 2) emitW ( lit ); else emitL ( lit );
-   if (dis)
-      VG_(printf)( "\n\t\tmov%c\t$0x%x, 0x%x(%s)\n", 
-                   nameISize(sz), lit, off, nameIReg(4,memreg) );
-}
-
-
-/*----------------------------------------------------*/
-/*--- b-size (1 byte) instruction emitters         ---*/
-/*----------------------------------------------------*/
-
-/* There is some doubt as to whether C6 (Grp 11) is in the
-   486 insn set.  ToDo: investigate. */
-static void emit_movb_lit_offregmem ( UInt lit, Int off, Int memreg )
-{
-   newEmit();
-   emitB ( 0xC6 ); /* Grp11 Eb */
-   emit_amode_offregmem_reg ( off, memreg, 0 /* Grp11 subopcode for MOV */ );
-   emitB ( lit );
-   if (dis)
-      VG_(printf)( "\n\t\tmovb\t$0x%x, 0x%x(%s)\n", 
-                   lit, off, nameIReg(4,memreg) );
-}
-
-static void emit_nonshiftopb_offregmem_reg ( Opcode opc, 
-                                             Int off, Int areg, Int reg )
-{
-   newEmit();
-   emitB ( 2 + mkPrimaryOpcode(opc) ); /* op Eb, Gb */
-   emit_amode_offregmem_reg ( off, areg, reg );
-   if (dis)
-      VG_(printf)( "\n\t\t%sb\t0x%x(%s), %s\n", 
-                   VG_(nameUOpcode)(False,opc), off, nameIReg(4,areg), 
-                   nameIReg(1,reg));
-}
-
-static void emit_movb_reg_offregmem ( Int reg, Int off, Int areg )
-{
-   /* Could do better when reg == %al. */
-   newEmit();
-   emitB ( 0x88 ); /* MOV G1, E1 */
-   emit_amode_offregmem_reg ( off, areg, reg );
-   if (dis)
-      VG_(printf)( "\n\t\tmovb\t%s, 0x%x(%s)\n", 
-                   nameIReg(1,reg), off, nameIReg(4,areg));
-}
-
-static void emit_nonshiftopb_reg_reg ( Opcode opc, Int reg1, Int reg2 )
-{
-   newEmit();
-   emitB ( 2 + mkPrimaryOpcode(opc) ); /* op Eb, Gb */
-   emit_amode_ereg_greg ( reg1, reg2 );
-   if (dis)
-      VG_(printf)( "\n\t\t%sb\t%s, %s\n", 
-                   VG_(nameUOpcode)(False,opc),
-                   nameIReg(1,reg1), nameIReg(1,reg2));
-}
-
-static void emit_movb_reg_regmem ( Int reg1, Int reg2 )
-{
-   newEmit();
-   emitB ( 0x88 ); /* MOV G1, E1 */
-   emit_amode_regmem_reg ( reg2, reg1 );
-   if (dis)
-      VG_(printf)( "\n\t\tmovb\t%s, (%s)\n", nameIReg(1,reg1), 
-                                             nameIReg(4,reg2));
-}
-
-static void emit_nonshiftopb_lit_reg ( Opcode opc, UInt lit, Int reg )
-{
-   newEmit();
-   emitB ( 0x80 ); /* Grp1 Ib,Eb */
-   emit_amode_ereg_greg ( reg, mkGrp1opcode(opc) );
-   emitB ( lit & 0x000000FF );
-   if (dis)
-      VG_(printf)( "\n\t\t%sb\t$0x%x, %s\n", VG_(nameUOpcode)(False,opc),
-                                             lit, nameIReg(1,reg));
-}
-
-static void emit_shiftopb_lit_reg ( Opcode opc, UInt lit, Int reg )
-{
-   newEmit();
-   emitB ( 0xC0 ); /* Grp2 Ib,Eb */
-   emit_amode_ereg_greg ( reg, mkGrp2opcode(opc) );
-   emitB ( lit );
-   if (dis)
-      VG_(printf)( "\n\t\t%sb\t$%d, %s\n", 
-                   VG_(nameUOpcode)(False,opc),
-                   lit, nameIReg(1,reg));
-}
-
-static void emit_unaryopb_reg ( Opcode opc, Int reg )
-{
-   newEmit();
-   switch (opc) {
-      case INC:
-         emitB ( 0xFE );
-         emit_amode_ereg_greg ( reg, mkGrp4opcode(INC) );
-         if (dis)
-            VG_(printf)( "\n\t\tincb\t%s\n", nameIReg(1,reg));
-         break;
-      case DEC:
-         emitB ( 0xFE );
-         emit_amode_ereg_greg ( reg, mkGrp4opcode(DEC) );
-         if (dis)
-            VG_(printf)( "\n\t\tdecb\t%s\n", nameIReg(1,reg));
-         break;
-      case NOT:
-         emitB ( 0xF6 );
-         emit_amode_ereg_greg ( reg, mkGrp3opcode(NOT) );
-         if (dis)
-            VG_(printf)( "\n\t\tnotb\t%s\n", nameIReg(1,reg));
-         break;
-      case NEG:
-         emitB ( 0xF6 );
-         emit_amode_ereg_greg ( reg, mkGrp3opcode(NEG) );
-         if (dis)
-            VG_(printf)( "\n\t\tnegb\t%s\n", nameIReg(1,reg));
-         break;
-      default: 
-         VG_(panic)("emit_unaryopb_reg");
-   }
-}
-
-static void emit_testb_lit_reg ( UInt lit, Int reg )
-{
-   newEmit();
-   emitB ( 0xF6 ); /* Grp3 Eb */
-   emit_amode_ereg_greg ( reg, 0 /* Grp3 subopcode for TEST */ );
-   emitB ( lit );
-   if (dis)
-      VG_(printf)("\n\t\ttestb $0x%x, %s\n", lit, nameIReg(1,reg));
-}
-
-
-/*----------------------------------------------------*/
-/*--- zero-extended load emitters                  ---*/
-/*----------------------------------------------------*/
-
-static void emit_movzbl_offregmem_reg ( Int off, Int regmem, Int reg )
-{
-   newEmit();
-   emitB ( 0x0F ); emitB ( 0xB6 ); /* MOVZBL */
-   emit_amode_offregmem_reg ( off, regmem, reg );
-   if (dis)
-      VG_(printf)( "\n\t\tmovzbl\t0x%x(%s), %s\n", 
-                   off, nameIReg(4,regmem), nameIReg(4,reg));
-}
-
-static void emit_movzbl_regmem_reg ( Int reg1, Int reg2 )
-{
-   newEmit();
-   emitB ( 0x0F ); emitB ( 0xB6 ); /* MOVZBL */
-   emit_amode_regmem_reg ( reg1, reg2 );
-   if (dis)
-      VG_(printf)( "\n\t\tmovzbl\t(%s), %s\n", nameIReg(4,reg1), 
-                                               nameIReg(4,reg2));
-}
-
-static void emit_movzwl_offregmem_reg ( Int off, Int areg, Int reg )
-{
-   newEmit();
-   emitB ( 0x0F ); emitB ( 0xB7 ); /* MOVZWL */
-   emit_amode_offregmem_reg ( off, areg, reg );
-   if (dis)
-      VG_(printf)( "\n\t\tmovzwl\t0x%x(%s), %s\n",
-                   off, nameIReg(4,areg), nameIReg(4,reg));
-}
-
-static void emit_movzwl_regmem_reg ( Int reg1, Int reg2 )
-{
-   newEmit();
-   emitB ( 0x0F ); emitB ( 0xB7 ); /* MOVZWL */
-   emit_amode_regmem_reg ( reg1, reg2 );
-   if (dis)
-      VG_(printf)( "\n\t\tmovzwl\t(%s), %s\n", nameIReg(4,reg1), 
-                                             nameIReg(4,reg2));
-}
-
-/*----------------------------------------------------*/
-/*--- FPU instruction emitters                     ---*/
-/*----------------------------------------------------*/
-
-static void emit_get_fpu_state ( void )
-{
-   Int off = 4 * VGOFF_(m_fpustate);
-   newEmit();
-   emitB ( 0xDD ); emitB ( 0xA5 ); /* frstor d32(%ebp) */
-   emitL ( off );
-   if (dis)
-      VG_(printf)("\n\t\tfrstor\t%d(%%ebp)\n", off );
-}
-
-static void emit_put_fpu_state ( void )
-{
-   Int off = 4 * VGOFF_(m_fpustate);
-   newEmit();
-   emitB ( 0xDD ); emitB ( 0xB5 ); /* fnsave d32(%ebp) */
-   emitL ( off );
-   if (dis)
-      VG_(printf)("\n\t\tfnsave\t%d(%%ebp)\n", off );
-}
-
-static void emit_fpu_no_mem ( UChar first_byte, 
-                              UChar second_byte )
-{
-   newEmit();
-   emitB ( first_byte );
-   emitB ( second_byte );
-   if (dis)
-      VG_(printf)("\n\t\tfpu-0x%x:0x%x\n", 
-                  (UInt)first_byte, (UInt)second_byte );
-}
-
-static void emit_fpu_regmem ( UChar first_byte, 
-                              UChar second_byte_masked, 
-                              Int reg )
-{
-   newEmit();
-   emitB ( first_byte );
-   emit_amode_regmem_reg ( reg, second_byte_masked >> 3 );
-   if (dis)
-      VG_(printf)("\n\t\tfpu-0x%x:0x%x-(%s)\n", 
-                  (UInt)first_byte, (UInt)second_byte_masked,
-                  nameIReg(4,reg) );
-}
-
-
-/*----------------------------------------------------*/
-/*--- misc instruction emitters                    ---*/
-/*----------------------------------------------------*/
-
-static void emit_call_reg ( Int reg )
-{
-   newEmit();
-   emitB ( 0xFF ); /* Grp5 */
-   emit_amode_ereg_greg ( reg, mkGrp5opcode(CALLM) );
-   if (dis)
-      VG_(printf)( "\n\t\tcall\t*%s\n", nameIReg(4,reg) );
-}
-
-
-static void emit_call_star_EBP_off ( Int byte_off )
-{
-  newEmit();
-  if (byte_off < -128 || byte_off > 127) {
-     emitB ( 0xFF );
-     emitB ( 0x95 );
-     emitL ( byte_off );
-  } else {
-     emitB ( 0xFF );
-     emitB ( 0x55 );
-     emitB ( byte_off );
-  }
-  if (dis)
-     VG_(printf)( "\n\t\tcall * %d(%%ebp)\n", byte_off );
-}
-
-
-static void emit_addlit8_offregmem ( Int lit8, Int regmem, Int off )
-{
-   vg_assert(lit8 >= -128 && lit8 < 128);
-   newEmit();
-   emitB ( 0x83 ); /* Grp1 Ib,Ev */
-   emit_amode_offregmem_reg ( off, regmem, 
-                              0 /* Grp1 subopcode for ADD */ );
-   emitB ( lit8 & 0xFF );
-   if (dis)
-      VG_(printf)( "\n\t\taddl $%d, %d(%s)\n", lit8, off, 
-                                               nameIReg(4,regmem));
-}
-
-
-static void emit_add_lit_to_esp ( Int lit )
-{
-   if (lit < -128 || lit > 127) VG_(panic)("emit_add_lit_to_esp");
-   newEmit();
-   emitB ( 0x83 );
-   emitB ( 0xC4 );
-   emitB ( lit & 0xFF );
-   if (dis)
-      VG_(printf)( "\n\t\taddl $%d, %%esp\n", lit );
-}
-
-
-static void emit_movb_AL_zeroESPmem ( void )
-{
-   /* movb %al, 0(%esp) */
-   /* 88442400              movb    %al, 0(%esp) */
-   newEmit();
-   emitB ( 0x88 );
-   emitB ( 0x44 );
-   emitB ( 0x24 );
-   emitB ( 0x00 );
-   if (dis)
-      VG_(printf)( "\n\t\tmovb %%al, 0(%%esp)\n" );
-}
-
-static void emit_movb_zeroESPmem_AL ( void )
-{
-   /* movb 0(%esp), %al */
-   /* 8A442400              movb    0(%esp), %al */
-   newEmit();
-   emitB ( 0x8A );
-   emitB ( 0x44 );
-   emitB ( 0x24 );
-   emitB ( 0x00 );
-   if (dis)
-      VG_(printf)( "\n\t\tmovb 0(%%esp), %%al\n" );
-}
-
-
-/* Emit a jump short with an 8-bit signed offset.  Note that the
-   offset is that which should be added to %eip once %eip has been
-   advanced over this insn.  */
-static void emit_jcondshort_delta ( Condcode cond, Int delta )
-{
-   vg_assert(delta >= -128 && delta <= 127);
-   newEmit();
-   emitB ( 0x70 + (UInt)cond );
-   emitB ( (UChar)delta );
-   if (dis)
-      VG_(printf)( "\n\t\tj%s-8\t%%eip+%d\n", 
-                   VG_(nameCondcode)(cond), delta );
-}
-
-static void emit_get_eflags ( void )
-{
-   Int off = 4 * VGOFF_(m_eflags);
-   vg_assert(off >= 0 && off < 128);
-   newEmit();
-   emitB ( 0xFF ); /* PUSHL off(%ebp) */
-   emitB ( 0x75 );
-   emitB ( off );
-   emitB ( 0x9D ); /* POPFL */
-   if (dis)
-      VG_(printf)( "\n\t\tpushl %d(%%ebp) ; popfl\n", off );
-}
-
-static void emit_put_eflags ( void )
-{
-   Int off = 4 * VGOFF_(m_eflags);
-   vg_assert(off >= 0 && off < 128);
-   newEmit();
-   emitB ( 0x9C ); /* PUSHFL */
-   emitB ( 0x8F ); /* POPL vg_m_state.m_eflags */
-   emitB ( 0x45 );
-   emitB ( off );
-   if (dis)
-      VG_(printf)( "\n\t\tpushfl ; popl %d(%%ebp)\n", off );
-}
-
-static void emit_setb_reg ( Int reg, Condcode cond )
-{
-   newEmit();
-   emitB ( 0x0F ); emitB ( 0x90 + (UChar)cond );
-   emit_amode_ereg_greg ( reg, 0 );
-   if (dis)
-      VG_(printf)("\n\t\tset%s %s\n", 
-                  VG_(nameCondcode)(cond), nameIReg(1,reg));
-}
-
-static void emit_ret ( void )
-{
-   newEmit();
-   emitB ( 0xC3 ); /* RET */
-   if (dis)
-      VG_(printf)("\n\t\tret\n");
-}
-
-static void emit_pushal ( void )
-{
-   newEmit();
-   emitB ( 0x60 ); /* PUSHAL */
-   if (dis)
-      VG_(printf)("\n\t\tpushal\n");
-}
-
-static void emit_popal ( void )
-{
-   newEmit();
-   emitB ( 0x61 ); /* POPAL */
-   if (dis)
-      VG_(printf)("\n\t\tpopal\n");
-}
-
-static void emit_lea_litreg_reg ( UInt lit, Int regmem, Int reg )
-{
-   newEmit();
-   emitB ( 0x8D ); /* LEA M,Gv */
-   emit_amode_offregmem_reg ( (Int)lit, regmem, reg );
-   if (dis)
-      VG_(printf)("\n\t\tleal 0x%x(%s), %s\n",
-                  lit, nameIReg(4,regmem), nameIReg(4,reg) );
-}
-
-static void emit_lea_sib_reg ( UInt lit, Int scale,
-			       Int regbase, Int regindex, Int reg )
-{
-   newEmit();
-   emitB ( 0x8D ); /* LEA M,Gv */
-   emit_amode_sib_reg ( (Int)lit, scale, regbase, regindex, reg );
-   if (dis)
-      VG_(printf)("\n\t\tleal 0x%x(%s,%s,%d), %s\n",
-                  lit, nameIReg(4,regbase), 
-                       nameIReg(4,regindex), scale,
-                       nameIReg(4,reg) );
-}
-
-static void emit_AMD_prefetch_reg ( Int reg )
-{
-   newEmit();
-   emitB ( 0x0F );
-   emitB ( 0x0D );
-   emit_amode_regmem_reg ( reg, 1 /* 0 is prefetch; 1 is prefetchw */ );
-   if (dis)
-      VG_(printf)("\n\t\tamd-prefetch (%s)\n", nameIReg(4,reg) );
-}
-
-/*----------------------------------------------------*/
-/*--- Instruction synthesisers                     ---*/
-/*----------------------------------------------------*/
-
-static Condcode invertCondition ( Condcode cond )
-{
-   return (Condcode)(1 ^ (UInt)cond);
-}
-
-
-/* Synthesise a call to *baseBlock[offset], ie,
-   call * (4 x offset)(%ebp).
-*/
-static void synth_call_baseBlock_method ( Bool ensure_shortform, 
-                                          Int word_offset )
-{
-   vg_assert(word_offset >= 0);
-   vg_assert(word_offset < VG_BASEBLOCK_WORDS);
-   if (ensure_shortform)
-      vg_assert(word_offset < 32);
-   emit_call_star_EBP_off ( 4 * word_offset );
-}
-
-
-static void load_ebp_from_JmpKind ( JmpKind jmpkind )
-{
-   switch (jmpkind) {
-      case JmpBoring: 
-         break;
-      case JmpCall:
-      case JmpRet: 
-         emit_movv_lit_reg ( 4, VG_TRC_EBP_JMP_STKADJ, R_EBP );
-         break;
-      case JmpSyscall: 
-         emit_movv_lit_reg ( 4, VG_TRC_EBP_JMP_SYSCALL, R_EBP );
-         break;
-      case JmpClientReq: 
-         emit_movv_lit_reg ( 4, VG_TRC_EBP_JMP_CLIENTREQ, R_EBP );
-         break;
-      default: 
-         VG_(panic)("load_ebp_from_JmpKind");
-   }
-}
-
-/* Jump to the next translation, by loading its original addr into
-   %eax and returning to the scheduler.  Signal special requirements
-   by loading a special value into %ebp first.  
-*/
-static void synth_jmp_reg ( Int reg, JmpKind jmpkind )
-{
-   load_ebp_from_JmpKind ( jmpkind );
-   if (reg != R_EAX)
-      emit_movv_reg_reg ( 4, reg, R_EAX );
-   emit_ret();
-}
-
-
-/* Same deal as synth_jmp_reg. */
-static void synth_jmp_lit ( Addr addr, JmpKind jmpkind )
-{
-   load_ebp_from_JmpKind ( jmpkind );
-   emit_movv_lit_reg ( 4, addr, R_EAX );
-   emit_ret();
-}
-
-
-static void synth_jcond_lit ( Condcode cond, Addr addr )
-{
-  /* Do the following:
-        get eflags
-        jmp short if not cond to xyxyxy
-        addr -> eax
-        ret
-        xyxyxy
-
-   2 0000 750C                  jnz     xyxyxy
-   3 0002 B877665544            movl    $0x44556677, %eax
-   4 0007 C3                    ret
-   5 0008 FFE3                  jmp     *%ebx
-   6                    xyxyxy:
-  */
-   emit_get_eflags();
-   emit_jcondshort_delta ( invertCondition(cond), 5+1 );
-   synth_jmp_lit ( addr, JmpBoring );
-}
-
-
-static void synth_jmp_ifzero_reg_lit ( Int reg, Addr addr )
-{
-   /* 0000 83FF00                cmpl    $0, %edi
-      0003 750A                  jnz     next
-      0005 B844332211            movl    $0x11223344, %eax
-      000a C3                    ret
-      next:
-   */
-   emit_cmpl_zero_reg ( reg );
-   emit_jcondshort_delta ( CondNZ, 5+1 );
-   synth_jmp_lit ( addr, JmpBoring );
-}
-
-
-static void synth_mov_lit_reg ( Int size, UInt lit, Int reg ) 
-{
-   /* Load the zero-extended literal into reg, at size l,
-      regardless of the request size. */
-   emit_movv_lit_reg ( 4, lit, reg );
-}
-
-
-static void synth_mov_regmem_reg ( Int size, Int reg1, Int reg2 ) 
-{
-   switch (size) {
-      case 4: emit_movv_regmem_reg ( 4, reg1, reg2 ); break;
-      case 2: emit_movzwl_regmem_reg ( reg1, reg2 ); break;
-      case 1: emit_movzbl_regmem_reg ( reg1, reg2 ); break;
-      default: VG_(panic)("synth_mov_regmem_reg");
-   }  
-}
-
-
-static void synth_mov_offregmem_reg ( Int size, Int off, Int areg, Int reg ) 
-{
-   switch (size) {
-      case 4: emit_movv_offregmem_reg ( 4, off, areg, reg ); break;
-      case 2: emit_movzwl_offregmem_reg ( off, areg, reg ); break;
-      case 1: emit_movzbl_offregmem_reg ( off, areg, reg ); break;
-      default: VG_(panic)("synth_mov_offregmem_reg");
-   }  
-}
-
-
-static void synth_mov_reg_offregmem ( Int size, Int reg, 
-                                      Int off, Int areg )
-{
-   switch (size) {
-      case 4: emit_movv_reg_offregmem ( 4, reg, off, areg ); break;
-      case 2: emit_movv_reg_offregmem ( 2, reg, off, areg ); break;
-      case 1: if (reg < 4) {
-                 emit_movb_reg_offregmem ( reg, off, areg ); 
-              }
-              else {
-                 emit_swapl_reg_EAX ( reg );
-                 emit_movb_reg_offregmem ( R_AL, off, areg );
-                 emit_swapl_reg_EAX ( reg );
-              }
-              break;
-      default: VG_(panic)("synth_mov_reg_offregmem");
-   }
-}
-
-
-static void synth_mov_reg_memreg ( Int size, Int reg1, Int reg2 )
-{
-   Int s1;
-   switch (size) {
-      case 4: emit_movv_reg_regmem ( 4, reg1, reg2 ); break;
-      case 2: emit_movv_reg_regmem ( 2, reg1, reg2 ); break;
-      case 1: if (reg1 < 4) {
-                 emit_movb_reg_regmem ( reg1, reg2 ); 
-              }
-              else {
-                 /* Choose a swap reg which is < 4 and not reg1 or reg2. */
-                 for (s1 = 0; s1 == reg1 || s1 == reg2; s1++) ;
-                 emit_swapl_reg_reg ( s1, reg1 );
-                 emit_movb_reg_regmem ( s1, reg2 );
-                 emit_swapl_reg_reg ( s1, reg1 );
-              }
-              break;
-      default: VG_(panic)("synth_mov_reg_litmem");
-   }
-}
-
-
-static void synth_unaryop_reg ( Bool upd_cc,
-                                Opcode opcode, Int size,
-                                Int reg )
-{
-   /* NB! opcode is a uinstr opcode, not an x86 one! */
-   switch (size) {
-      case 4: if (upd_cc) emit_get_eflags();
-              emit_unaryopv_reg ( 4, opcode, reg );
-              if (upd_cc) emit_put_eflags();
-              break;
-      case 2: if (upd_cc) emit_get_eflags();
-              emit_unaryopv_reg ( 2, opcode, reg );
-              if (upd_cc) emit_put_eflags();
-              break;
-      case 1: if (reg < 4) {
-                 if (upd_cc) emit_get_eflags();
-                 emit_unaryopb_reg ( opcode, reg );
-                 if (upd_cc) emit_put_eflags();
-              } else {
-                 emit_swapl_reg_EAX ( reg );
-                 if (upd_cc) emit_get_eflags();
-                 emit_unaryopb_reg ( opcode, R_AL );
-                 if (upd_cc) emit_put_eflags();
-                 emit_swapl_reg_EAX ( reg );
-              }
-              break;
-      default: VG_(panic)("synth_unaryop_reg");
-   }
-}
-
-
-
-static void synth_nonshiftop_reg_reg ( Bool upd_cc, 
-                                       Opcode opcode, Int size, 
-                                       Int reg1, Int reg2 )
-{
-   /* NB! opcode is a uinstr opcode, not an x86 one! */
-   switch (size) {
-      case 4: if (upd_cc) emit_get_eflags();
-              emit_nonshiftopv_reg_reg ( 4, opcode, reg1, reg2 );
-              if (upd_cc) emit_put_eflags();
-              break;
-      case 2: if (upd_cc) emit_get_eflags();
-              emit_nonshiftopv_reg_reg ( 2, opcode, reg1, reg2 );
-              if (upd_cc) emit_put_eflags();
-              break;
-      case 1: { /* Horrible ... */
-         Int s1, s2;
-         /* Choose s1 and s2 to be x86 regs which we can talk about the
-            lowest 8 bits, ie either %eax, %ebx, %ecx or %edx.  Make
-            sure s1 != s2 and that neither of them equal either reg1 or
-            reg2. Then use them as temporaries to make things work. */
-         if (reg1 < 4 && reg2 < 4) {
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, reg1, reg2); 
-            if (upd_cc) emit_put_eflags();
-            break;
-         }
-         for (s1 = 0; s1 == reg1 || s1 == reg2; s1++) ;
-         if (reg1 >= 4 && reg2 < 4) {
-            emit_swapl_reg_reg ( reg1, s1 );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, s1, reg2);
-            if (upd_cc) emit_put_eflags();
-            emit_swapl_reg_reg ( reg1, s1 );
-            break;
-         }
-         for (s2 = 0; s2 == reg1 || s2 == reg2 || s2 == s1; s2++) ;
-         if (reg1 < 4 && reg2 >= 4) {
-            emit_swapl_reg_reg ( reg2, s2 );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, reg1, s2);
-            if (upd_cc) emit_put_eflags();
-            emit_swapl_reg_reg ( reg2, s2 );
-            break;
-         }
-         if (reg1 >= 4 && reg2 >= 4 && reg1 != reg2) {
-            emit_swapl_reg_reg ( reg1, s1 );
-            emit_swapl_reg_reg ( reg2, s2 );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, s1, s2);
-            if (upd_cc) emit_put_eflags();
-            emit_swapl_reg_reg ( reg1, s1 );
-            emit_swapl_reg_reg ( reg2, s2 );
-            break;
-         }
-         if (reg1 >= 4 && reg2 >= 4 && reg1 == reg2) {
-            emit_swapl_reg_reg ( reg1, s1 );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_reg_reg(opcode, s1, s1);
-            if (upd_cc) emit_put_eflags();
-            emit_swapl_reg_reg ( reg1, s1 );
-            break;
-         }
-         VG_(panic)("synth_nonshiftopb_reg_reg");
-      }
-      default: VG_(panic)("synth_nonshiftop_reg_reg");
-   }
-}
-
-
-static void synth_nonshiftop_offregmem_reg ( 
-   Bool upd_cc,
-   Opcode opcode, Int size, 
-   Int off, Int areg, Int reg )
-{
-   switch (size) {
-      case 4: 
-         if (upd_cc) emit_get_eflags();
-         emit_nonshiftopv_offregmem_reg ( 4, opcode, off, areg, reg ); 
-         if (upd_cc) emit_put_eflags();
-         break;
-      case 2: 
-         if (upd_cc) emit_get_eflags();
-         emit_nonshiftopv_offregmem_reg ( 2, opcode, off, areg, reg ); 
-         if (upd_cc) emit_put_eflags();
-         break;
-      case 1: 
-         if (reg < 4) {
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_offregmem_reg ( opcode, off, areg, reg );
-            if (upd_cc) emit_put_eflags();
-         } else {
-            emit_swapl_reg_EAX ( reg );
-            if (upd_cc) emit_get_eflags();
-            emit_nonshiftopb_offregmem_reg ( opcode, off, areg, R_AL );
-            if (upd_cc) emit_put_eflags();
-            emit_swapl_reg_EAX ( reg );
-         }
-         break;
-      default: 
-         VG_(panic)("synth_nonshiftop_litmem_reg");
-   }
-}
-
-
-static void synth_nonshiftop_lit_reg ( Bool upd_cc,
-                                       Opcode opcode, Int size, 
-                                       UInt lit, Int reg )
-{
-   switch (size) {
-      case 4: if (upd_cc) emit_get_eflags();
-              emit_nonshiftopv_lit_reg ( 4, opcode, lit, reg );
-              if (upd_cc) emit_put_eflags();
-              break;
-      case 2: if (upd_cc) emit_get_eflags();
-              emit_nonshiftopv_lit_reg ( 2, opcode, lit, reg );
-              if (upd_cc) emit_put_eflags();
-              break;
-      case 1: if (reg < 4) {
-                 if (upd_cc) emit_get_eflags();
-                 emit_nonshiftopb_lit_reg ( opcode, lit, reg );
-                 if (upd_cc) emit_put_eflags();
-              } else {
-                 emit_swapl_reg_EAX ( reg );
-                 if (upd_cc) emit_get_eflags();
-                 emit_nonshiftopb_lit_reg ( opcode, lit, R_AL );
-                 if (upd_cc) emit_put_eflags();
-                 emit_swapl_reg_EAX ( reg );
-              }
-              break;
-      default: VG_(panic)("synth_nonshiftop_lit_reg");
-   }
-}
-
-
-static void synth_push_reg ( Int size, Int reg )
-{
-   switch (size) {
-      case 4: 
-         emit_pushv_reg ( 4, reg ); 
-         break;
-      case 2: 
-         emit_pushv_reg ( 2, reg ); 
-         break;
-      /* Pray that we don't have to generate this really cruddy bit of
-         code very often.  Could do better, but can I be bothered? */
-      case 1: 
-         vg_assert(reg != R_ESP); /* duh */
-         emit_add_lit_to_esp(-1);
-         if (reg != R_EAX) emit_swapl_reg_EAX ( reg );
-         emit_movb_AL_zeroESPmem();
-         if (reg != R_EAX) emit_swapl_reg_EAX ( reg );
-         break;
-     default: 
-         VG_(panic)("synth_push_reg");
-   }
-}
-
-
-static void synth_pop_reg ( Int size, Int reg )
-{
-   switch (size) {
-      case 4: 
-         emit_popv_reg ( 4, reg ); 
-         break;
-      case 2: 
-         emit_popv_reg ( 2, reg ); 
-         break;
-      case 1:
-         /* Same comment as above applies. */
-         vg_assert(reg != R_ESP); /* duh */
-         if (reg != R_EAX) emit_swapl_reg_EAX ( reg );
-         emit_movb_zeroESPmem_AL();
-         if (reg != R_EAX) emit_swapl_reg_EAX ( reg );
-         emit_add_lit_to_esp(1);
-         break;
-      default: VG_(panic)("synth_pop_reg");
-   }
-}
-
-
-static void synth_shiftop_reg_reg ( Bool upd_cc,
-                                    Opcode opcode, Int size, 
-                                    Int regs, Int regd )
-{
-   synth_push_reg ( size, regd );
-   if (regs != R_ECX) emit_swapl_reg_ECX ( regs );
-   if (upd_cc) emit_get_eflags();
-   switch (size) {
-      case 4: emit_shiftopv_cl_stack0 ( 4, opcode ); break;
-      case 2: emit_shiftopv_cl_stack0 ( 2, opcode ); break;
-      case 1: emit_shiftopb_cl_stack0 ( opcode ); break;
-      default: VG_(panic)("synth_shiftop_reg_reg");
-   }
-   if (upd_cc) emit_put_eflags();
-   if (regs != R_ECX) emit_swapl_reg_ECX ( regs );
-   synth_pop_reg ( size, regd );
-}
-
-
-static void synth_shiftop_lit_reg ( Bool upd_cc,
-                                    Opcode opcode, Int size, 
-                                    UInt lit, Int reg )
-{
-   switch (size) {
-      case 4: if (upd_cc) emit_get_eflags();
-              emit_shiftopv_lit_reg ( 4, opcode, lit, reg );
-              if (upd_cc) emit_put_eflags();
-              break;
-      case 2: if (upd_cc) emit_get_eflags();
-              emit_shiftopv_lit_reg ( 2, opcode, lit, reg );
-              if (upd_cc) emit_put_eflags();
-              break;
-      case 1: if (reg < 4) {
-                 if (upd_cc) emit_get_eflags();
-                 emit_shiftopb_lit_reg ( opcode, lit, reg );
-                 if (upd_cc) emit_put_eflags();
-              } else {
-                 emit_swapl_reg_EAX ( reg );
-                 if (upd_cc) emit_get_eflags();
-                 emit_shiftopb_lit_reg ( opcode, lit, R_AL );
-                 if (upd_cc) emit_put_eflags();
-                 emit_swapl_reg_EAX ( reg );
-              }
-              break;
-      default: VG_(panic)("synth_nonshiftop_lit_reg");
-   }
-}
-
-
-static void synth_setb_reg ( Int reg, Condcode cond )
-{
-   emit_get_eflags();
-   if (reg < 4) {
-      emit_setb_reg ( reg, cond );
-   } else {
-      emit_swapl_reg_EAX ( reg );
-      emit_setb_reg ( R_AL, cond );
-      emit_swapl_reg_EAX ( reg );
-   }
-}
-
-
-static void synth_fpu_regmem ( UChar first_byte,
-                               UChar second_byte_masked, 
-                               Int reg )
-{
-   emit_get_fpu_state();
-   emit_fpu_regmem ( first_byte, second_byte_masked, reg );
-   emit_put_fpu_state();
-}
-
-
-static void synth_fpu_no_mem ( UChar first_byte,
-                               UChar second_byte )
-{
-   emit_get_fpu_state();
-   emit_fpu_no_mem ( first_byte, second_byte );
-   emit_put_fpu_state();
-}
-
-
-static void synth_movl_reg_reg ( Int src, Int dst )
-{
-   emit_movl_reg_reg ( src, dst );
-}
-
-static void synth_cmovl_reg_reg ( Condcode cond, Int src, Int dst )
-{
-   emit_get_eflags();
-   emit_jcondshort_delta ( invertCondition(cond), 
-                           2 /* length of the next insn */ );
-   emit_movl_reg_reg ( src, dst );
-}
-
-
-/* Synthesise a minimal test (and which discards result) of reg32
-   against lit.  It's always safe do simply
-      emit_testv_lit_reg ( 4, lit, reg32 )
-   but we try to do better when possible.
-*/
-static void synth_minimal_test_lit_reg ( UInt lit, Int reg32 )
-{
-   if ((lit & 0xFFFFFF00) == 0 && reg32 < 4) {
-      /* We can get away with a byte insn. */
-      emit_testb_lit_reg ( lit, reg32 );
-   }
-   else 
-   if ((lit & 0xFFFF0000) == 0) {
-      /* Literal fits in 16 bits; do a word insn. */
-      emit_testv_lit_reg ( 2, lit, reg32 );
-   }
-   else {
-      /* Totally general ... */
-      emit_testv_lit_reg ( 4, lit, reg32 );
-   }
-}
-
-
-/*----------------------------------------------------*/
-/*--- Top level of the uinstr -> x86 translation.  ---*/
-/*----------------------------------------------------*/
-
-/* Return the byte offset from %ebp (ie, into baseBlock)
-   for the specified ArchReg or SpillNo.
-
-   tag == SpillNo:  value is a spill slot number; size must be 4.
-   tag == ArchReg:  value is an x86 register number.  For R_ESP..R_EDI
-                    with size == 1 the result is byte 1 of the m_eax,
-                    m_ecx, m_edx or m_ebx slot respectively (x86 uses
-                    these register numbers for %ah/%ch/%dh/%bh in byte
-                    instructions); otherwise it is the start of the
-                    register's own slot.
-
-   Any other tag, or an unknown register number, panics. */
-
-static Int spillOrArchOffset ( Int size, Tag tag, UInt value )
-{
-   if (tag == SpillNo) {
-      vg_assert(size == 4);
-      /* value is unsigned, so only the upper bound needs checking. */
-      vg_assert(value < VG_MAX_SPILLSLOTS);
-      return 4 * (value + VGOFF_(spillslots));
-   }
-   if (tag == ArchReg) {
-      switch (value) {
-         case R_EAX: return 4 * VGOFF_(m_eax);
-         case R_ECX: return 4 * VGOFF_(m_ecx);
-         case R_EDX: return 4 * VGOFF_(m_edx);
-         case R_EBX: return 4 * VGOFF_(m_ebx);
-         case R_ESP:
-           if (size == 1) return 4 * VGOFF_(m_eax) + 1;
-                     else return 4 * VGOFF_(m_esp);
-         case R_EBP:
-           if (size == 1) return 4 * VGOFF_(m_ecx) + 1;
-                     else return 4 * VGOFF_(m_ebp);
-         case R_ESI:
-           if (size == 1) return 4 * VGOFF_(m_edx) + 1;
-                     else return 4 * VGOFF_(m_esi);
-         case R_EDI:
-           if (size == 1) return 4 * VGOFF_(m_ebx) + 1;
-                     else return 4 * VGOFF_(m_edi);
-      }
-   }
-   VG_(panic)("spillOrArchOffset");
-}
-
-
-/* Byte offset of the m_eflags slot: its word offset VGOFF_(m_eflags)
-   scaled by 4. */
-static Int eflagsOffset ( void )
-{
-   return 4 * VGOFF_(m_eflags);
-}
-
-
-/* Byte offset of the shadow (sh_*) slot belonging to the integer
-   register numbered arch.  Panics if arch is not one of the eight
-   register numbers R_EAX .. R_EDI. */
-static Int shadowOffset ( Int arch )
-{
-   Int offw;   /* offset in words; scaled to bytes on return */
-
-   switch (arch) {
-      case R_EAX: offw = VGOFF_(sh_eax); break;
-      case R_ECX: offw = VGOFF_(sh_ecx); break;
-      case R_EDX: offw = VGOFF_(sh_edx); break;
-      case R_EBX: offw = VGOFF_(sh_ebx); break;
-      case R_ESP: offw = VGOFF_(sh_esp); break;
-      case R_EBP: offw = VGOFF_(sh_ebp); break;
-      case R_ESI: offw = VGOFF_(sh_esi); break;
-      case R_EDI: offw = VGOFF_(sh_edi); break;
-      default:    VG_(panic)( "shadowOffset");
-   }
-   return 4 * offw;
-}
-
-
-/* Byte offset of the sh_eflags slot: its word offset
-   VGOFF_(sh_eflags) scaled by 4. */
-static Int shadowFlagsOffset ( void )
-{
-   return 4 * VGOFF_(sh_eflags);
-}
-
-
-/* Emit a call to the helperc_LOADV{4,2,1} helper selected by sz.
-   a_reg is pushed last, immediately before the call; the helper's
-   result arrives in %eax and is copied to tv_reg.
-
-   Every real register which is not VG_CALLEE_SAVED, other than tv_reg
-   and a_reg, is pushed before the call and popped afterwards in
-   reverse order.  a_reg's stack slot is popped back into a_reg, unless
-   a_reg is tv_reg, in which case the slot is simply dropped so that
-   the result is not overwritten.  Panics for any other sz. */
-static void synth_LOADV ( Int sz, Int a_reg, Int tv_reg )
-{
-   Int rank, rreg, k, helper_offw;
-   Int saved[VG_MAX_REALREGS+2];
-   Int n_saved = 0;
-
-   if (sz == 4)
-      helper_offw = VGOFF_(helperc_LOADV4);
-   else if (sz == 2)
-      helper_offw = VGOFF_(helperc_LOADV2);
-   else if (sz == 1)
-      helper_offw = VGOFF_(helperc_LOADV1);
-   else
-      VG_(panic)("synth_LOADV");
-
-   /* Save the live caller-save registers. */
-   for (rank = 0; rank < VG_MAX_REALREGS; rank++) {
-      rreg = VG_(rankToRealRegNo) ( rank );
-      if (VG_CALLEE_SAVED(rreg)) continue;
-      if (rreg == tv_reg || rreg == a_reg) continue;
-      emit_pushv_reg ( 4, rreg );
-      saved[n_saved] = rreg;
-      n_saved++;
-   }
-
-   /* The address goes on the stack last. */
-   emit_pushv_reg ( 4, a_reg );
-   saved[n_saved] = a_reg;
-   n_saved++;
-   vg_assert(n_saved <= VG_MAX_REALREGS+1);
-
-   synth_call_baseBlock_method ( False, helper_offw );
-   /* Result is in %eax; we need to get it to tv_reg. */
-   if (tv_reg != R_EAX)
-      emit_movv_reg_reg ( 4, R_EAX, tv_reg );
-
-   /* Unwind the stack in reverse order of pushing. */
-   for (k = n_saved - 1; k >= 0; k--) {
-      if (saved[k] == tv_reg)
-         emit_add_lit_to_esp ( 4 );
-      else
-         emit_popv_reg ( 4, saved[k] );
-   }
-}
-
-
-/* Emit a call to the helperc_STOREV{4,2,1} helper selected by sz.
-   The value operand is either a real register (tv_tag == RealReg,
-   tv_val is the register) or a literal (tv_tag == Literal, tv_val is
-   the literal); a_reg is pushed after it, immediately before the call.
-
-   Every real register which is not VG_CALLEE_SAVED and is not one of
-   the operand registers is pushed first and popped again at the end,
-   in reverse order.  After the call a_reg (and tv_val, when it is a
-   register) are popped back; a literal operand is just dropped from
-   the stack.  Panics for any other sz. */
-static void synth_STOREV ( Int sz,
-                           Int tv_tag, Int tv_val,
-                           Int a_reg )
-{
-   Int rank, rreg, helper_offw;
-
-   vg_assert(tv_tag == RealReg || tv_tag == Literal);
-
-   if (sz == 4)
-      helper_offw = VGOFF_(helperc_STOREV4);
-   else if (sz == 2)
-      helper_offw = VGOFF_(helperc_STOREV2);
-   else if (sz == 1)
-      helper_offw = VGOFF_(helperc_STOREV1);
-   else
-      VG_(panic)("synth_STOREV");
-
-   /* Save caller-save registers other than the operands. */
-   for (rank = 0; rank < VG_MAX_REALREGS; rank++) {
-      rreg = VG_(rankToRealRegNo) ( rank );
-      if (VG_CALLEE_SAVED(rreg)) continue;
-      if (rreg == a_reg) continue;
-      if (tv_tag == RealReg && rreg == tv_val) continue;
-      emit_pushv_reg ( 4, rreg );
-   }
-
-   /* Push the value operand, using the short literal form if the
-      literal survives sign-extension from 8 bits. */
-   if (tv_tag != RealReg) {
-      if (tv_val == VG_(extend_s_8to32)(tv_val))
-         emit_pushl_lit8 ( VG_(extend_s_8to32)(tv_val) );
-      else
-         emit_pushl_lit32(tv_val);
-   } else {
-      emit_pushv_reg ( 4, tv_val );
-   }
-
-   /* Then the address, and make the call. */
-   emit_pushv_reg ( 4, a_reg );
-   synth_call_baseBlock_method ( False, helper_offw );
-
-   /* Take the two operands back off the stack. */
-   emit_popv_reg ( 4, a_reg );
-   if (tv_tag != RealReg)
-      emit_add_lit_to_esp ( 4 );
-   else
-      emit_popv_reg ( 4, tv_val );
-
-   /* Restore the saved registers, last-pushed first. */
-   for (rank = VG_MAX_REALREGS-1; rank >= 0; rank--) {
-      rreg = VG_(rankToRealRegNo) ( rank );
-      if (VG_CALLEE_SAVED(rreg)) continue;
-      if (rreg == a_reg) continue;
-      if (tv_tag == RealReg && rreg == tv_val) continue;
-      emit_popv_reg ( 4, rreg );
-   }
-}
-
-
-/* Sign-extend the low sz_src bytes of reg to sz_dst bytes, in place,
-   by shifting left and then arithmetically right by the same amount.
-   Supported combinations are 1->4, 2->4 and 1->2; anything else
-   panics. */
-static void synth_WIDEN_signed ( Int sz_src, Int sz_dst, Int reg )
-{
-   Int opsz  = 0;   /* operand size used for both shifts */
-   Int nbits = 0;   /* shift distance */
-
-   if (sz_src == 1 && sz_dst == 4) {
-      opsz = 4;  nbits = 24;
-   }
-   else if (sz_src == 2 && sz_dst == 4) {
-      opsz = 4;  nbits = 16;
-   }
-   else if (sz_src == 1 && sz_dst == 2) {
-      opsz = 2;  nbits = 8;
-   }
-   else {
-      VG_(panic)("synth_WIDEN");
-      return;
-   }
-
-   emit_shiftopv_lit_reg ( opsz, SHL, nbits, reg );
-   emit_shiftopv_lit_reg ( opsz, SAR, nbits, reg );
-}
-
-
-/* Load reg with the constant for size sz, using a 32-bit move:
-      sz 4 -> 0x00000000      sz 2 -> 0xFFFF0000
-      sz 1 -> 0xFFFFFF00      sz 0 -> 0xFFFFFFFE
-   ie the low sz bytes (or, for sz 0, the lowest bit) are zero and all
-   higher bits are one.  Panics for any other sz. */
-static void synth_SETV ( Int sz, Int reg )
-{
-   UInt val;
-
-   if (sz == 4)
-      val = 0x00000000;
-   else if (sz == 2)
-      val = 0xFFFF0000;
-   else if (sz == 1)
-      val = 0xFFFFFF00;
-   else if (sz == 0)
-      val = 0xFFFFFFFE;
-   else
-      VG_(panic)("synth_SETV");
-
-   emit_movv_lit_reg ( 4, val, reg );
-}
-
-
-/* Emit a test of the low sz bytes (or, for sz == 0, the lowest bit) of
-   a tag value, followed by a conditional call to the matching
-   helper_value_check{4,2,1,0}_fail helper.  The call is skipped by a
-   short jump when the test yields zero.
-
-   tag == ArchReg: val is an arch register number and the test is made
-                   directly on its shadow slot.  For sz == 1 and
-                   val >= 4 the second byte of slot val-4 is tested.
-                   sz == 0 is not allowed here and panics.
-   tag == RealReg: val is a real register holding the tag value. */
-static void synth_TESTV ( Int sz, Int tag, Int val )
-{
-   Int fail_helper;
-
-   vg_assert(tag == ArchReg || tag == RealReg);
-
-   if (tag == ArchReg) {
-
-      if (sz == 4) {
-         emit_testv_lit_offregmem (
-            4, 0xFFFFFFFF, shadowOffset(val), R_EBP );
-      }
-      else if (sz == 2) {
-         emit_testv_lit_offregmem (
-            4, 0x0000FFFF, shadowOffset(val), R_EBP );
-      }
-      else if (sz == 1) {
-         if (val < 4)
-            emit_testv_lit_offregmem (
-               4, 0x000000FF, shadowOffset(val), R_EBP );
-         else
-            emit_testv_lit_offregmem (
-               4, 0x0000FF00, shadowOffset(val-4), R_EBP );
-      }
-      else {
-         /* sz == 0 should never happen; nor should anything else. */
-         VG_(panic)("synth_TESTV(ArchReg)");
-      }
-
-   } else {
-
-      if (sz == 4) {
-         /* synth_minimal_test_lit_reg ( 0xFFFFFFFF, val ) would also
-            work, but carries the entire 32-bit literal and so gives a
-            6-byte insn.  Since we want to know whether any bit of the
-            full register is set, comparing it against zero does the
-            same job with a shorter insn. */
-         emit_cmpl_zero_reg ( val );
-      }
-      else if (sz == 2) {
-         synth_minimal_test_lit_reg ( 0x0000FFFF, val );
-      }
-      else if (sz == 1) {
-         synth_minimal_test_lit_reg ( 0x000000FF, val );
-      }
-      else if (sz == 0) {
-         synth_minimal_test_lit_reg ( 0x00000001, val );
-      }
-      else {
-         VG_(panic)("synth_TESTV(RealReg)");
-      }
-
-   }
-
-   /* Skip the (3-byte) call below if the test gave zero. */
-   emit_jcondshort_delta ( CondZ, 3 );
-
-   switch (sz) {
-      case 4:  fail_helper = VGOFF_(helper_value_check4_fail); break;
-      case 2:  fail_helper = VGOFF_(helper_value_check2_fail); break;
-      case 1:  fail_helper = VGOFF_(helper_value_check1_fail); break;
-      default: fail_helper = VGOFF_(helper_value_check0_fail); break;
-   }
-   synth_call_baseBlock_method (
-      True, /* needed to guarantee that this insn is indeed 3 bytes long */
-      fail_helper
-   );
-}
-
-
-/* Load the shadow of the low sz bytes of arch register arch into reg.
-   For sz 2 and sz 1 the value is zero-extended from the shadow slot
-   and the unused upper bits of reg are then set to one.  For sz 1 and
-   arch >= 4 the second byte of the shadow slot of arch-4 is read.
-   Panics for any other sz. */
-static void synth_GETV ( Int sz, Int arch, Int reg )
-{
-   Int off;
-
-   /* VG_(printf)("synth_GETV %d of Arch %s\n", sz, nameIReg(sz, arch)); */
-   if (sz == 4) {
-      emit_movv_offregmem_reg ( 4, shadowOffset(arch), R_EBP, reg );
-   }
-   else if (sz == 2) {
-      emit_movzwl_offregmem_reg ( shadowOffset(arch), R_EBP, reg );
-      emit_nonshiftopv_lit_reg ( 4, OR, 0xFFFF0000, reg );
-   }
-   else if (sz == 1) {
-      off = (arch < 4) ? shadowOffset(arch)
-                       : shadowOffset(arch-4)+1;
-      emit_movzbl_offregmem_reg ( off, R_EBP, reg );
-      emit_nonshiftopv_lit_reg ( 4, OR, 0xFFFFFF00, reg );
-   }
-   else {
-      VG_(panic)("synth_GETV");
-   }
-}
-
-
-/* Store a tag value into the shadow slot of arch register arch.
-
-   srcTag == Literal: lit_or_reg must be the literal expected for sz
-      (0x00000000, 0xFFFF0000 or 0xFFFFFF00 for sz 4, 2, 1) and zero is
-      written to the low sz bytes of the slot.  PUTV with a Literal is
-      only ever used to set the corresponding ArchReg to `all valid';
-      it should really be a kind of SETV.
-
-   srcTag == RealReg: lit_or_reg is a real register whose low sz bytes
-      are written to the slot.  A byte store from a register numbered
-      4 or above is done by temporarily swapping that register with
-      %eax.
-
-   In both cases, for sz 1 and arch >= 4 the byte written is the second
-   byte of the shadow slot of arch-4.  Panics for any other sz. */
-static void synth_PUTV ( Int sz, Int srcTag, UInt lit_or_reg, Int arch )
-{
-   UInt lit, reg;
-   Bool via_eax;
-
-   if (srcTag == Literal) {
-      lit = lit_or_reg;
-      if (sz == 4) {
-         vg_assert(lit == 0x00000000);
-         emit_movv_lit_offregmem ( 4, 0x00000000,
-                                   shadowOffset(arch), R_EBP );
-      }
-      else if (sz == 2) {
-         vg_assert(lit == 0xFFFF0000);
-         emit_movv_lit_offregmem ( 2, 0x0000,
-                                   shadowOffset(arch), R_EBP );
-      }
-      else if (sz == 1) {
-         vg_assert(lit == 0xFFFFFF00);
-         if (arch < 4)
-            emit_movb_lit_offregmem ( 0x00,
-                                      shadowOffset(arch), R_EBP );
-         else
-            emit_movb_lit_offregmem ( 0x00,
-                                      shadowOffset(arch-4)+1, R_EBP );
-      }
-      else {
-         VG_(panic)("synth_PUTV(lit)");
-      }
-      return;
-   }
-
-   vg_assert(srcTag == RealReg);
-
-   /* Byte stores need one of the first four registers as source. */
-   via_eax = (sz == 1 && lit_or_reg >= 4);
-   if (via_eax) {
-      emit_swapl_reg_EAX ( lit_or_reg );
-      reg = R_EAX;
-   } else {
-      reg = lit_or_reg;
-   }
-
-   if (sz == 1) {
-      vg_assert(reg < 4);
-   }
-
-   switch (sz) {
-      case 4:
-      case 2:
-         emit_movv_reg_offregmem ( sz, reg,
-                                   shadowOffset(arch), R_EBP );
-         break;
-      case 1:
-         if (arch < 4)
-            emit_movb_reg_offregmem ( reg,
-                                      shadowOffset(arch), R_EBP );
-         else
-            emit_movb_reg_offregmem ( reg,
-                                      shadowOffset(arch-4)+1, R_EBP );
-         break;
-      default:
-         VG_(panic)("synth_PUTV(reg)");
-   }
-
-   /* Undo the swap, if one was made. */
-   if (via_eax) {
-      emit_swapl_reg_EAX ( lit_or_reg );
-   }
-}
-
-
-/* Load the 32-bit word at shadowFlagsOffset() (the sh_eflags slot)
-   into reg. */
-static void synth_GETVF ( Int reg )
-{
-   Int flags_off = shadowFlagsOffset();
-   emit_movv_offregmem_reg ( 4, flags_off, R_EBP, reg );
-   /* paranoia only; should be unnecessary ... */
-   /* emit_nonshiftopv_lit_reg ( 4, OR, 0xFFFFFFFE, reg ); */
-}
-
-
-/* Store the 32-bit contents of reg to the word at shadowFlagsOffset()
-   (the sh_eflags slot). */
-static void synth_PUTVF ( UInt reg )
-{
-   Int flags_off = shadowFlagsOffset();
-   emit_movv_reg_offregmem ( 4, reg, flags_off, R_EBP );
-}
-
-
-/* Emit a call to the handle_esp_assignment helper, passing the
-   contents of reg as its single stacked argument.  The call is
-   bracketed by pushal/popal, and the argument is removed afterwards
-   by adding 4 to %esp. */
-static void synth_handle_esp_assignment ( Int reg )
-{
-   Int helper_offw = VGOFF_(handle_esp_assignment);
-
-   emit_pushal();
-   emit_pushv_reg ( 4, reg );                       /* the argument */
-   synth_call_baseBlock_method ( False, helper_offw );
-   emit_add_lit_to_esp ( 4 );                       /* drop it again */
-   emit_popal();
-}
-
-
-/* Emit a call to the fpu_write_check helper (if isWrite) or the
-   fpu_read_check helper (otherwise).  size is pushed first, as an
-   8-bit literal, then the contents of a_reg.  The call is bracketed
-   by pushal/popal and both arguments are removed afterwards by adding
-   8 to %esp. */
-static void synth_fpu_mem_check_actions ( Bool isWrite,
-                                          Int size, Int a_reg )
-{
-   Int helper_offw;
-
-   if (isWrite)
-      helper_offw = VGOFF_(fpu_write_check);
-   else
-      helper_offw = VGOFF_(fpu_read_check);
-
-   emit_pushal();
-   emit_pushl_lit8 ( size );
-   emit_pushv_reg ( 4, a_reg );
-   synth_call_baseBlock_method ( False, helper_offw );
-   emit_add_lit_to_esp ( 8 );
-   emit_popal();
-}
-
-
-#if 0
-/* FixMe.  Useful for debugging.  This whole section is compiled out.
-   VG_(oink) prints "OiNk(n): " and then calls VG_(show_reg_tags) on
-   VG_(m_shadow).  synth_OINK emits code which calls VG_(oink)(n):
-   pushal/popal bracket the call, and %ebp is used as scratch both for
-   the argument and for the call target. */
-void VG_(oink) ( Int n )
-{
-   VG_(printf)("OiNk(%d): ", n );
-   VG_(show_reg_tags)( &VG_(m_shadow) );
-}
-
-static void synth_OINK ( Int n )
-{
-   emit_pushal();
-   emit_movv_lit_reg ( 4, n, R_EBP );
-   emit_pushl_reg ( R_EBP );
-   emit_movv_lit_reg ( 4, (Addr)&VG_(oink), R_EBP );
-   emit_call_reg ( R_EBP );
-   emit_add_lit_to_esp ( 4 );
-   emit_popal();
-}
-#endif
-
-/* Emit code for the unary tag operation op, applied in place to the
-   tag value held in reg.  Covers the PCast, Left, SWiden and ZWiden
-   families; any other op panics. */
-static void synth_TAG1_op ( VgTagOp op, Int reg )
-{
-   switch (op) {
-
-      /* Scheme is
-            neg<sz> %reg          -- CF = %reg==0 ? 0 : 1
-            sbbl %reg, %reg       -- %reg = -CF
-            or 0xFFFFFFFE, %reg   -- invalidate all bits except lowest
-      */
-      case VgT_PCast40:
-         emit_unaryopv_reg(4, NEG, reg);
-         emit_nonshiftopv_reg_reg(4, SBB, reg, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFFFFFE, reg);
-         break;
-      case VgT_PCast20:
-         emit_unaryopv_reg(2, NEG, reg);
-         emit_nonshiftopv_reg_reg(4, SBB, reg, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFFFFFE, reg);
-         break;
-      case VgT_PCast10:
-         /* The byte-sized neg needs one of the first four registers;
-            otherwise do it in %eax via a swap. */
-         if (reg >= 4) {
-            emit_swapl_reg_EAX(reg);
-            emit_unaryopb_reg(NEG, R_EAX);
-            emit_swapl_reg_EAX(reg);
-         } else {
-            emit_unaryopb_reg(NEG, reg);
-         }
-         emit_nonshiftopv_reg_reg(4, SBB, reg, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFFFFFE, reg);
-         break;
-
-      /* Scheme is
-            andl $1, %reg -- %reg is 0 or 1
-            negl %reg -- %reg is 0 or 0xFFFFFFFF
-            and possibly an OR to invalidate unused bits.
-      */
-      case VgT_PCast04:
-         emit_nonshiftopv_lit_reg(4, AND, 0x00000001, reg);
-         emit_unaryopv_reg(4, NEG, reg);
-         break;
-      case VgT_PCast02:
-         emit_nonshiftopv_lit_reg(4, AND, 0x00000001, reg);
-         emit_unaryopv_reg(4, NEG, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFF0000, reg);
-         break;
-      case VgT_PCast01:
-         emit_nonshiftopv_lit_reg(4, AND, 0x00000001, reg);
-         emit_unaryopv_reg(4, NEG, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFFFF00, reg);
-         break;
-
-      /* Scheme is
-            shl $24, %reg -- make irrelevant bits disappear
-            negl %reg             -- CF = %reg==0 ? 0 : 1
-            sbbl %reg, %reg       -- %reg = -CF
-            and possibly an OR to invalidate unused bits.
-      */
-      case VgT_PCast14:
-         emit_shiftopv_lit_reg(4, SHL, 24, reg);
-         emit_unaryopv_reg(4, NEG, reg);
-         emit_nonshiftopv_reg_reg(4, SBB, reg, reg);
-         break;
-      case VgT_PCast12:
-         emit_shiftopv_lit_reg(4, SHL, 24, reg);
-         emit_unaryopv_reg(4, NEG, reg);
-         emit_nonshiftopv_reg_reg(4, SBB, reg, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFF0000, reg);
-         break;
-      case VgT_PCast11:
-         emit_shiftopv_lit_reg(4, SHL, 24, reg);
-         emit_unaryopv_reg(4, NEG, reg);
-         emit_nonshiftopv_reg_reg(4, SBB, reg, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFFFF00, reg);
-         break;
-
-      /* We use %edi as a temporary:
-            movl %reg, %edi
-            negl %edi
-            orl %edi, %reg
-         No push/pop is emitted around this, so whatever %edi held is
-         overwritten; reg itself must not be %edi (asserted below).
-         NOTE(review): this presumably relies on %edi never being
-         handed out by the register allocator -- confirm.  (An earlier
-         version of this comment described a push/pop of %ebp, which
-         is not what the code does.)
-         This sequence turns out to be correct regardless of the
-         operation width.
-      */
-      case VgT_Left4:
-      case VgT_Left2:
-      case VgT_Left1:
-         vg_assert(reg != R_EDI);
-         emit_movv_reg_reg(4, reg, R_EDI);
-         emit_unaryopv_reg(4, NEG, R_EDI);
-         emit_nonshiftopv_reg_reg(4, OR, R_EDI, reg);
-         break;
-
-      /* These are all fairly obvious; do the op and then, if
-         necessary, invalidate unused bits. */
-      case VgT_SWiden14:
-         emit_shiftopv_lit_reg(4, SHL, 24, reg);
-         emit_shiftopv_lit_reg(4, SAR, 24, reg);
-         break;
-      case VgT_SWiden24:
-         emit_shiftopv_lit_reg(4, SHL, 16, reg);
-         emit_shiftopv_lit_reg(4, SAR, 16, reg);
-         break;
-      case VgT_SWiden12:
-         emit_shiftopv_lit_reg(4, SHL, 24, reg);
-         emit_shiftopv_lit_reg(4, SAR, 24, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFF0000, reg);
-         break;
-      case VgT_ZWiden14:
-         emit_nonshiftopv_lit_reg(4, AND, 0x000000FF, reg);
-         break;
-      case VgT_ZWiden24:
-         emit_nonshiftopv_lit_reg(4, AND, 0x0000FFFF, reg);
-         break;
-      case VgT_ZWiden12:
-         emit_nonshiftopv_lit_reg(4, AND, 0x000000FF, reg);
-         emit_nonshiftopv_lit_reg(4, OR, 0xFFFF0000, reg);
-         break;
-
-      default:
-         VG_(panic)("synth_TAG1_op");
-   }
-}
-
-
-/* Emit code for the binary tag operation op.  regs is the source
-   operand and is left unchanged; regd is the second operand and
-   receives the result.  Any op not listed below panics. */
-static void synth_TAG2_op ( VgTagOp op, Int regs, Int regd )
-{
-   switch (op) {
-
-      /* UifU: 1 means Undefined, so the combination is a plain OR. */
-      case VgT_UifU4:
-      case VgT_UifU2:
-      case VgT_UifU1:
-      case VgT_UifU0:
-         emit_nonshiftopv_reg_reg(4, OR, regs, regd);
-         break;
-
-      /* DifD: 0 means Defined, so the combination is a plain AND. */
-      case VgT_DifD4:
-      case VgT_DifD2:
-      case VgT_DifD1:
-         emit_nonshiftopv_reg_reg(4, AND, regs, regd);
-         break;
-
-      /* ImproveAND(value, tags) = value OR tags.
-         Defined (0) value 0s give defined (0); all other -> undefined (1).
-         value is in regs; tags is in regd.
-         For the 2- and 1-byte forms the unused upper bits are then
-         invalidated; this is paranoia, and it is not known whether it
-         is actually necessary. */
-      case VgT_ImproveAND4_TQ:
-      case VgT_ImproveAND2_TQ:
-      case VgT_ImproveAND1_TQ:
-         emit_nonshiftopv_reg_reg(4, OR, regs, regd);
-         if (op == VgT_ImproveAND2_TQ)
-            emit_nonshiftopv_lit_reg(4, OR, 0xFFFF0000, regd);
-         else if (op == VgT_ImproveAND1_TQ)
-            emit_nonshiftopv_lit_reg(4, OR, 0xFFFFFF00, regd);
-         break;
-
-      /* ImproveOR(value, tags) = (not value) OR tags.
-         Defined (0) value 1s give defined (0); all other -> undefined (1).
-         value is in regs; tags is in regd.
-         So as not to trash value, this is computed (by de Morgan) as
-               not (value AND (not tags))
-         Again the unused upper bits of the 2- and 1-byte forms are
-         invalidated out of paranoia. */
-      case VgT_ImproveOR4_TQ:
-      case VgT_ImproveOR2_TQ:
-      case VgT_ImproveOR1_TQ:
-         emit_unaryopv_reg(4, NOT, regd);
-         emit_nonshiftopv_reg_reg(4, AND, regs, regd);
-         emit_unaryopv_reg(4, NOT, regd);
-         if (op == VgT_ImproveOR2_TQ)
-            emit_nonshiftopv_lit_reg(4, OR, 0xFFFF0000, regd);
-         else if (op == VgT_ImproveOR1_TQ)
-            emit_nonshiftopv_lit_reg(4, OR, 0xFFFFFF00, regd);
-         break;
-
-      default:
-         VG_(panic)("synth_TAG2_op");
-   }
-}
-
-/*----------------------------------------------------*/
-/*--- Generate code for a single UInstr.           ---*/
-/*----------------------------------------------------*/
-
-/* Emit x86 code for the single UInstr u.  i is the index of u in its
-   code block and is used only when printing the instruction (which
-   happens when dis is set).  Each case first checks the operand tags
-   it relies on with vg_assert and then hands over to the matching
-   synth_ or emit_ routine.  An opcode not handled here is printed and
-   causes a panic. */
-static void emitUInstr ( Int i, UInstr* u )
-{
-   if (dis)
-      VG_(ppUInstr)(i, u);
-
-#  if 0
-   if (0&& VG_(translations_done) >= 600) {
-      Bool old_dis = dis;
-      dis = False; 
-      synth_OINK(i);
-      dis = old_dis;
-   }
-#  endif
-
-   switch (u->opcode) {
-
-      /* These generate no code at all. */
-      case NOP: case CALLM_S: case CALLM_E: break;
-
-      case INCEIP: {
-         vg_assert(u->tag1 == Lit16);
-         emit_addlit8_offregmem ( u->val1, R_EBP, 4 * VGOFF_(m_eip) );
-         break;
-      }
-
-      case LEA1: {
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == RealReg);
-         emit_lea_litreg_reg ( u->lit32, u->val1, u->val2 );
-         break;
-      }
-
-      case LEA2: {
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == RealReg);
-         vg_assert(u->tag3 == RealReg);
-         emit_lea_sib_reg ( u->lit32, u->extra4b, 
-                            u->val1, u->val2, u->val3 );
-         break;
-      }
-
-      case WIDEN: {
-         vg_assert(u->tag1 == RealReg);
-         if (u->signed_widen) {
-            synth_WIDEN_signed ( u->extra4b, u->size, u->val1 );
-         } else {
-            /* no need to generate any code. */
-         }
-         break;
-      }
-
-      case SETV: {
-         vg_assert(VG_(clo_instrument));
-         vg_assert(u->tag1 == RealReg);
-         synth_SETV ( u->size, u->val1 );
-         break;
-      }
-
-      case STOREV: {
-         vg_assert(VG_(clo_instrument));
-         vg_assert(u->tag1 == RealReg || u->tag1 == Literal);
-         vg_assert(u->tag2 == RealReg);
-         synth_STOREV ( u->size, u->tag1, 
-                                 u->tag1==Literal ? u->lit32 : u->val1, 
-                                 u->val2 );
-         break;
-      }
-
-      case STORE: {
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == RealReg);
-         synth_mov_reg_memreg ( u->size, u->val1, u->val2 );
-         /* No longer possible, but retained for illustrative purposes.
-         if (u->smc_check) 
-            synth_orig_code_write_check ( u->size, u->val2 );
-         */
-         break;
-      }
-
-      case LOADV: {
-         vg_assert(VG_(clo_instrument));
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == RealReg);
-         /* Prefetch is permanently disabled by the leading 0. */
-         if (0 && VG_(clo_instrument))
-            emit_AMD_prefetch_reg ( u->val1 );
-         synth_LOADV ( u->size, u->val1, u->val2 );
-         break;
-      }
-
-      case LOAD: {
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == RealReg);
-         synth_mov_regmem_reg ( u->size, u->val1, u->val2 );
-         break;
-      }
-
-      case TESTV: {
-         vg_assert(VG_(clo_instrument));
-         vg_assert(u->tag1 == RealReg || u->tag1 == ArchReg);
-         synth_TESTV(u->size, u->tag1, u->val1);
-         break;
-      }
-
-      case GETV: {
-         vg_assert(VG_(clo_instrument));
-         vg_assert(u->tag1 == ArchReg);
-         vg_assert(u->tag2 == RealReg);
-         synth_GETV(u->size, u->val1, u->val2);
-         break;
-      }
-
-      case GETVF: {
-         vg_assert(VG_(clo_instrument));
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->size == 0);
-         synth_GETVF(u->val1);
-         break;
-      }
-
-      case PUTV: {
-         vg_assert(VG_(clo_instrument));
-         vg_assert(u->tag1 == RealReg || u->tag1 == Literal);
-         vg_assert(u->tag2 == ArchReg);
-         synth_PUTV(u->size, u->tag1, 
-                             u->tag1==Literal ? u->lit32 : u->val1, 
-                             u->val2 );
-         break;
-      }
-
-      case PUTVF: {
-         vg_assert(VG_(clo_instrument));
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->size == 0);
-         synth_PUTVF(u->val1);
-         break;
-      }
-
-      case GET: {
-         vg_assert(u->tag1 == ArchReg || u->tag1 == SpillNo);
-         vg_assert(u->tag2 == RealReg);
-         synth_mov_offregmem_reg ( 
-            u->size, 
-            spillOrArchOffset( u->size, u->tag1, u->val1 ),
-            R_EBP,
-            u->val2 
-         );
-         break;
-      }
-            
-      case PUT: {
-         vg_assert(u->tag2 == ArchReg || u->tag2 == SpillNo);
-         vg_assert(u->tag1 == RealReg);
-         /* A 4-byte write to %esp is reported to the
-            handle_esp_assignment helper first, when instrumenting. */
-         if (u->tag2 == ArchReg 
-             && u->val2 == R_ESP
-             && u->size == 4
-             && VG_(clo_instrument)) {
-            synth_handle_esp_assignment ( u->val1 );
-         }
-         synth_mov_reg_offregmem ( 
-            u->size, 
-            u->val1, 
-            spillOrArchOffset( u->size, u->tag2, u->val2 ),
-            R_EBP
-         );
-         break;
-      }
-
-      case GETF: {
-         vg_assert(u->size == 2 || u->size == 4);
-         vg_assert(u->tag1 == RealReg);
-         synth_mov_offregmem_reg ( 
-            u->size, 
-            eflagsOffset(),
-            R_EBP,
-            u->val1
-         );
-         break;
-      }
-            
-      case PUTF: {
-         vg_assert(u->size == 2 || u->size == 4);
-         vg_assert(u->tag1 == RealReg);
-         synth_mov_reg_offregmem ( 
-            u->size, 
-            u->val1,
-            eflagsOffset(),
-            R_EBP
-         );
-         break;
-      }
-            
-      case MOV: {
-         vg_assert(u->tag1 == RealReg || u->tag1 == Literal);
-         vg_assert(u->tag2 == RealReg);
-         switch (u->tag1) {
-            case RealReg: vg_assert(u->size == 4);
-                          /* A move of a register to itself emits nothing. */
-                          if (u->val1 != u->val2)
-                             synth_movl_reg_reg ( u->val1, u->val2 ); 
-                          break;
-            case Literal: synth_mov_lit_reg ( u->size, u->lit32, u->val2 ); 
-                          break;
-            default: VG_(panic)("emitUInstr:mov");
-         }
-         break;
-      }
-
-      case SBB:
-      case ADC:
-      case XOR:
-      case OR:
-      case AND:
-      case SUB:
-      case ADD: {
-         vg_assert(u->tag2 == RealReg);
-         switch (u->tag1) {
-            case Literal: synth_nonshiftop_lit_reg (
-                             VG_(anyFlagUse)(u), 
-                             u->opcode, u->size, u->lit32, u->val2 );
-                          break;
-            case RealReg: synth_nonshiftop_reg_reg (
-                             VG_(anyFlagUse)(u), 
-                             u->opcode, u->size, u->val1, u->val2 );
-                          break;
-            case ArchReg: synth_nonshiftop_offregmem_reg (
-                             VG_(anyFlagUse)(u), 
-                             u->opcode, u->size, 
-                             spillOrArchOffset( u->size, u->tag1, u->val1 ), 
-                             R_EBP,
-                             u->val2 );
-                          break;
-            default: VG_(panic)("emitUInstr:non-shift-op");
-         }
-         break;
-      }
-
-      case RCR:
-      case RCL:
-      case ROR:
-      case ROL:
-      case SAR:
-      case SHR:
-      case SHL: {
-         vg_assert(u->tag2 == RealReg);
-         switch (u->tag1) {
-            case Literal: synth_shiftop_lit_reg (
-                             VG_(anyFlagUse)(u), 
-                             u->opcode, u->size, u->lit32, u->val2 );
-                          break;
-            case RealReg: synth_shiftop_reg_reg (
-                             VG_(anyFlagUse)(u), 
-                             u->opcode, u->size, u->val1, u->val2 );
-                          break;
-            /* This is the shift-op case, so say so: the message used
-               to be a copy of the non-shift-op one above. */
-            default: VG_(panic)("emitUInstr:shift-op");
-         }
-         break;
-      }
-
-      case INC:
-      case DEC:
-      case NEG:
-      case NOT:
-         vg_assert(u->tag1 == RealReg);
-         synth_unaryop_reg ( 
-            VG_(anyFlagUse)(u), u->opcode, u->size, u->val1 );
-         break;
-
-      case BSWAP:
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->size == 4);
-         vg_assert(!VG_(anyFlagUse)(u));
-         emit_bswapl_reg ( u->val1 );
-         break;
-
-      case CMOV: 
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == RealReg);
-         vg_assert(u->cond != CondAlways);
-         vg_assert(u->size == 4);
-         synth_cmovl_reg_reg ( u->cond, u->val1, u->val2 );
-         break;
-
-      case JMP: {
-         vg_assert(u->tag2 == NoValue);
-         vg_assert(u->tag1 == RealReg || u->tag1 == Literal);
-         if (u->cond == CondAlways) {
-            switch (u->tag1) {
-               case RealReg:
-                  synth_jmp_reg ( u->val1, u->jmpkind );
-                  break;
-               case Literal:
-                  synth_jmp_lit ( u->lit32, u->jmpkind );
-                  break;
-               default: 
-                  VG_(panic)("emitUInstr(JMP, unconditional, default)");
-                  break;
-            }
-         } else {
-            /* Conditional jumps are only supported to literal targets. */
-            switch (u->tag1) {
-               case RealReg:
-                  VG_(panic)("emitUInstr(JMP, conditional, RealReg)");
-                  break;
-               case Literal:
-                  vg_assert(u->jmpkind == JmpBoring);
-                  synth_jcond_lit ( u->cond, u->lit32 );
-                  break;
-               default: 
-                  VG_(panic)("emitUInstr(JMP, conditional, default)");
-                  break;
-            }
-         }
-         break;
-      }
-
-      case JIFZ:
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == Literal);
-         vg_assert(u->size == 4);
-         synth_jmp_ifzero_reg_lit ( u->val1, u->lit32 );
-         break;
-
-      case TAG1:
-         synth_TAG1_op ( u->val3, u->val1 );
-         break;
-
-      case TAG2:
-         if (u->val3 != VgT_DebugFn) {
-            synth_TAG2_op ( u->val3, u->val1, u->val2 );
-         } else {
-            /* Assume a call to VgT_DebugFn passing both args
-               and placing the result back in the second. */
-            Int j, k;
-            /* u->val2 is the reg into which the result is written.  So
-               don't save/restore it.  And it can be used at a temp for
-               the call target, too.  Since %eax is used for the return
-               value from the C procedure, it is preserved only by
-               virtue of not being mentioned as a VG_CALLEE_SAVED reg. */
-            for (k = 0; k < VG_MAX_REALREGS; k++) {
-               j = VG_(rankToRealRegNo) ( k );
-               if (VG_CALLEE_SAVED(j)) continue;
-               if (j == u->val2) continue;
-               emit_pushv_reg ( 4, j );
-            }
-            emit_pushv_reg(4, u->val2);
-            emit_pushv_reg(4, u->val1);
-            emit_movv_lit_reg ( 4, (UInt)(&VG_(DebugFn)), u->val2 );
-            emit_call_reg ( u->val2 );
-            if (u->val2 != R_EAX)
-               emit_movv_reg_reg ( 4, R_EAX, u->val2 );
-            /* nuke args */
-            emit_add_lit_to_esp(8);
-            for (k = VG_MAX_REALREGS-1; k >= 0; k--) {
-               j = VG_(rankToRealRegNo) ( k );
-               if (VG_CALLEE_SAVED(j)) continue;
-               if (j == u->val2) continue;
-               emit_popv_reg ( 4, j );
-            }
-         }
-         break;
-
-      case PUSH:
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == NoValue);
-         emit_pushv_reg ( 4, u->val1 );
-         break;
-
-      case POP:
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == NoValue);
-         emit_popv_reg ( 4, u->val1 );
-         break;
-
-      case CALLM:
-         vg_assert(u->tag1 == Lit16);
-         vg_assert(u->tag2 == NoValue);
-         vg_assert(u->size == 0);
-         if (u->flags_r != FlagsEmpty || u->flags_w != FlagsEmpty) 
-            emit_get_eflags();
-         synth_call_baseBlock_method ( False, u->val1 );
-         if (u->flags_w != FlagsEmpty) 
-            emit_put_eflags();
-         break;
-
-      case CLEAR:
-         vg_assert(u->tag1 == Lit16);
-         vg_assert(u->tag2 == NoValue);
-         emit_add_lit_to_esp ( u->val1 );
-         break;
-
-      case CC2VAL:
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == NoValue);
-         vg_assert(VG_(anyFlagUse)(u));
-         synth_setb_reg ( u->val1, u->cond );
-         break;
-
-      /* We assume that writes to memory done by FPU_Ws are not going
-         to be used to create new code, so there's no orig-code-write
-         checks done by default. */
-      case FPU_R: 
-      case FPU_W:         
-         vg_assert(u->tag1 == Lit16);
-         vg_assert(u->tag2 == RealReg);
-         if (VG_(clo_instrument))
-            synth_fpu_mem_check_actions ( 
-               u->opcode==FPU_W, u->size, u->val2 );
-         /* val1 carries the two opcode bytes: high byte first. */
-         synth_fpu_regmem ( (u->val1 >> 8) & 0xFF,
-                            u->val1 & 0xFF,
-                            u->val2 );
-         /* No longer possible, but retained for illustrative purposes.
-         if (u->opcode == FPU_W && u->smc_check) 
-            synth_orig_code_write_check ( u->size, u->val2 );
-         */
-         break;
-
-      case FPU:
-         vg_assert(u->tag1 == Lit16);
-         vg_assert(u->tag2 == NoValue);
-         if (u->flags_r != FlagsEmpty || u->flags_w != FlagsEmpty) 
-            emit_get_eflags();
-         synth_fpu_no_mem ( (u->val1 >> 8) & 0xFF,
-                            u->val1 & 0xFF );
-         if (u->flags_w != FlagsEmpty) 
-            emit_put_eflags();
-         break;
-
-      default: 
-         VG_(printf)("emitUInstr: unhandled insn:\n");
-         VG_(ppUInstr)(0,u);
-         VG_(panic)("emitUInstr: unimplemented opcode");
-   }
-
-}
-
-
-/* Emit x86 for the ucode in cb, returning the address of the
-   generated code and setting *nbytes to its size.
-
-   The output buffer is obtained from VG_(jitmalloc) with an initial
-   size of 500 bytes.  NOP uinstrs are skipped.  Every other uinstr is
-   first checked with VG_(saneUInstr) (an insane one is printed and
-   then trips an assertion) and then handed to emitUInstr. */
-UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes )
-{
-   Int i;
-
-   emitted_code_used = 0;
-   emitted_code_size = 500; /* reasonable initial size */
-   emitted_code = VG_(jitmalloc)(emitted_code_size);
-
-   if (dis) VG_(printf)("Generated code:\n");
-
-   for (i = 0; i < cb->used; i++) {
-      UInstr* u = &cb->instrs[i];
-
-      if (u->opcode == NOP)
-         continue;
-
-#     if 1
-      /* Check on the sanity of this insn. */
-      {
-         Bool sane = VG_(saneUInstr)( False, u );
-         if (!sane) {
-            VG_(printf)("\ninsane instruction\n");
-            VG_(ppUInstr)( i, u );
-         }
-         vg_assert(sane);
-      }
-#     endif
-#     if 0
-      /* Pass args to TAG1/TAG2 to vg_DebugFn for sanity checking.
-         Requires a suitable definition of vg_DebugFn. */
-      if (u->opcode == TAG1) {
-         UInstr t1;
-         vg_assert(u->tag1 == RealReg);
-         VG_(emptyUInstr)( &t1 );
-         t1.opcode = TAG2;
-         t1.tag1 = t1.tag2 = RealReg;
-         t1.val1 = t1.val2 = u->val1;
-         t1.tag3 = Lit16;
-         t1.val3 = VgT_DebugFn;
-         emitUInstr( i, &t1 );
-      }
-      if (u->opcode == TAG2) {
-         UInstr t1;
-         vg_assert(u->tag1 == RealReg);
-         vg_assert(u->tag2 == RealReg);
-         VG_(emptyUInstr)( &t1 );
-         t1.opcode = TAG2;
-         t1.tag1 = t1.tag2 = RealReg;
-         t1.val1 = t1.val2 = u->val1;
-         t1.tag3 = Lit16;
-         t1.val3 = VgT_DebugFn;
-         if (u->val3 == VgT_UifU1 || u->val3 == VgT_UifU2 
-             || u->val3 == VgT_UifU4 || u->val3 == VgT_DifD1 
-             || u->val3 == VgT_DifD2 || u->val3 == VgT_DifD4)
-            emitUInstr( i, &t1 );
-         t1.val1 = t1.val2 = u->val2;
-         emitUInstr( i, &t1 );
-      }
-#     endif
-
-      emitUInstr( i, u );
-   }
-
-   /* Returns a pointer to the emitted code.  This will have to be
-      copied by the caller into the translation cache, and then freed
-      using VG_(jitfree). */
-   *nbytes = emitted_code_used;
-   return emitted_code;
-}
-
-/*--------------------------------------------------------------------*/
-/*--- end                                          vg_from_ucode.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_helpers.S b/coregrind/vg_helpers.S
deleted file mode 100644
index 82627377d6..0000000000
--- a/coregrind/vg_helpers.S
+++ /dev/null
@@ -1,571 +0,0 @@
-
-##--------------------------------------------------------------------##
-##--- Support routines for the JITter output.                      ---##
-##---                                                 vg_helpers.S ---##
-##--------------------------------------------------------------------##
-
-/*
-  This file is part of Valgrind, an x86 protected-mode emulator 
-  designed for debugging and profiling binaries on x86-Unixes.
-
-  Copyright (C) 2000-2002 Julian Seward 
-     jseward@acm.org
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of the
-  License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-  02111-1307, USA.
-
-  The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_constants.h"
-
-/* ------------------ SIMULATED CPU HELPERS ------------------ */
-/* Stubs for returns which we want to catch: signal returns and
-   pthread returns.  In the latter case, the thread's
-   return value is in %EAX, so we pass this as the first argument
-   to the request.  In both cases we use the user request mechanism.
-   You need to read the definition of VALGRIND_MAGIC_SEQUENCE
-   in valgrind.h to make sense of this.
-*/
-.global VG_(signalreturn_bogusRA)
-VG_(signalreturn_bogusRA):
-	subl	$20, %esp	# allocate arg block
-	movl	%esp, %edx	# %edx == &_zzq_args[0]
-	movl	$VG_USERREQ__SIGNAL_RETURNS, 0(%edx)	# request
-	movl	$0, 4(%edx)	# arg1
-	movl	$0, 8(%edx)	# arg2
-	movl	$0, 12(%edx)	# arg3
-	movl	$0, 16(%edx)	# arg4
-	movl	%edx, %eax	# %eax == address of the arg block
-	# and now the magic sequence itself:
-	roll $29, %eax	# each pair of rotates below totals 32 bits,
-	roll $3, %eax	# so %eax holds the same value afterwards
-	rorl $27, %eax
-	rorl $5, %eax
-	roll $13, %eax
-	roll $19, %eax
-	# should never get here
-	pushl	$signalreturn_bogusRA_panic_msg
-	call	VG_(panic)
-	
-.data
-signalreturn_bogusRA_panic_msg:
-.ascii	"vg_signalreturn_bogusRA: VG_USERREQ__SIGNAL_RETURNS was missed"
-.byte	0
-.text	
-	
-
-
-	
-/* ------------------ REAL CPU HELPERS ------------------ */
-/* The rest of this lot run on the real CPU. */
-	
-/* Various helper routines, for instructions which are just too
-   darn tedious for the JITter to output code in-line:
-	
-	* integer division
-	* integer multiplication
-        * setting and getting obscure eflags
-	* double-length shifts
-	
-   All routines use a standard calling convention designed for
-   calling from translations, in which the incoming args are
-   underneath the return address, the callee saves _all_ registers,
-   and the incoming parameters can be modified, to return results.
-*/
-
-
-.global VG_(helper_value_check0_fail)
-VG_(helper_value_check0_fail):
-	pushal	# save every general register around the C call
-	call	VG_(helperc_value_check0_fail)
-	popal	# restore them: the callee saves _all_ registers
-	ret
-
-.global VG_(helper_value_check1_fail)
-VG_(helper_value_check1_fail):
-	pushal	# same wrapper shape as the check0 variant
-	call	VG_(helperc_value_check1_fail)
-	popal
-	ret
-
-.global VG_(helper_value_check2_fail)
-VG_(helper_value_check2_fail):
-	pushal	# same wrapper shape as the check0 variant
-	call	VG_(helperc_value_check2_fail)
-	popal
-	ret
-
-.global VG_(helper_value_check4_fail)
-VG_(helper_value_check4_fail):
-	pushal	# same wrapper shape as the check0 variant
-	call	VG_(helperc_value_check4_fail)
-	popal
-	ret
-
-
-/* Fetch the time-stamp-ctr reg.
-   On entry:
-	dummy, replaced by %EAX value
-	dummy, replaced by %EDX value
-	RA   <- %esp
-*/
-.global VG_(helper_RDTSC)
-VG_(helper_RDTSC):
-	pushl	%eax	# two words pushed, so RA is now at 8(%esp)
-	pushl	%edx
-	rdtsc	# counter value arrives in %edx:%eax
-	movl	%edx, 12(%esp)	# slot just above RA receives %edx
-	movl	%eax, 16(%esp)	# slot above that receives %eax
-	popl	%edx
-	popl	%eax
-	ret
-
-
-/* Do the CPUID instruction.
-   On entry:
-	dummy, replaced by %EAX value
-	dummy, replaced by %EBX value
-	dummy, replaced by %ECX value
-	dummy, replaced by %EDX value
-	RA   <- %esp
-
-   As emulating a real CPUID is kinda hard, as it
-   has to return different values depending on EAX, 
-   we just pretend to not support CPUID at all until
-   it becomes a problem. This will for sure disable
-   all MMX / 3dnow checks so they don't bother us
-   with code we don't understand. (Dirk <dirk@kde.org>)
-   
-   http://www.sandpile.org/ia32/cpuid.htm
-
-   (Later: we instead pretend to be like Werner's P54C P133, that is
-    an original pre-MMX Pentium).
-   <werner> cpuid words (0): 0x1 0x756e6547 0x6c65746e 0x49656e69
-   <werner> cpuid words (1): 0x52b 0x0 0x0 0x1bf
-*/
-.global VG_(helper_CPUID)
-VG_(helper_CPUID):
-	pushl	%eax	# four words pushed, so RA is now at 16(%esp)
-	pushl	%ebx
-	pushl	%ecx
-	pushl	%edx
-	movl	32(%esp), %eax	# fetch the request number from the %EAX slot
-/*
-	cpuid
-*/
-/*
-        xor     %eax,%eax
-        xor     %ebx,%ebx
-        xor     %ecx,%ecx
-        xor     %edx,%edx
-*/
-	cmpl	$0, %eax
-	jz	cpuid__0
-	movl	$0x52b, %eax	# every nonzero request gets the words (1) reply
-	movl	$0x0,   %ebx
-	movl	$0x0,   %ecx
-	movl	$0x1bf, %edx
-	jmp	cpuid__99
-cpuid__0:
-	movl	$0x1,        %eax	# request 0 gets the words (0) reply
-	movl	$0x756e6547, %ebx
-	movl	$0x6c65746e, %ecx
-	movl	$0x49656e69, %edx
-cpuid__99:
-		
-	movl	%edx, 20(%esp)	# write the four results over the dummy slots
-	movl	%ecx, 24(%esp)
-	movl	%ebx, 28(%esp)
-	movl	%eax, 32(%esp)
-	popl	%edx
-	popl	%ecx
-	popl	%ebx
-	popl	%eax
-	ret
-
-
-/* Fetch the FPU status register.
-   On entry:
-	dummy, replaced by result
-	RA   <- %esp
-*/
-.global VG_(helper_fstsw_AX)
-VG_(helper_fstsw_AX):
-	pushl	%eax
-	pushl	%esi
-	movl	VGOFF_(m_fpustate), %esi	# offset of the FPU state; scaled by 4 below
-	frstor	(%ebp, %esi, 4)	# load FPU state from %ebp + 4 * %esi
-	fstsw	%ax	# FPU status word into %ax
-	popl	%esi	# after this pop the result slot is at 8(%esp)
-	movw	%ax, 8(%esp)	# overwrite the dummy slot just above RA
-	popl	%eax
-	ret
-
-
-/* Copy %ah into %eflags.
-   On entry:
-	value of %eax
-	RA   <- %esp
-*/
-.global VG_(helper_SAHF)
-VG_(helper_SAHF):
-	pushl	%eax
-	movl	8(%esp), %eax	# fetch the passed %eax value (slot just above RA)
-	sahf	# copy %ah into the flags
-	popl	%eax	# restore the real %eax
-	ret
-
-
-/* Do %al = DAS(%al).  Note that the passed param has %AL as the least
-   significant 8 bits, since it was generated with GETB %AL,
-   some-temp.  Fortunately %al is the least significant 8 bits of
-   %eax anyway, which is why it's safe to work with %eax as a
-   whole. 
- 
-   On entry:
-	value of %eax
-	RA   <- %esp
-*/
-.global VG_(helper_DAS)
-VG_(helper_DAS):
-	pushl	%eax
-	movl	8(%esp), %eax	# fetch the passed value (slot just above RA)
-	das	# adjust %al in place
- 	movl	%eax, 8(%esp)
-	popl	%eax
-	ret
-
-
-/* Similarly, do %al = DAA(%al).  Same stack layout as for DAS. */
-.global VG_(helper_DAA)
-VG_(helper_DAA):
-       pushl   %eax
-       movl    8(%esp), %eax
-       daa
-       movl    %eax, 8(%esp)
-       popl    %eax
-       ret
-	
-
-/* Bit scan forwards/reverse.  Sets flags (??).  On entry:
-	dst: old value, replaced by result
-	src: the value that is scanned
-	RA   <- %esp
-*/
-.global VG_(helper_bsr)
-VG_(helper_bsr):
-	pushl	%eax	# one word pushed, so RA is now at 4(%esp)
-	movl	12(%esp), %eax	# preload %eax with the old dst value
-	bsrl	8(%esp), %eax	# scan src, result into %eax
-	movl	%eax, 12(%esp)	# write %eax back over the dst slot
-	popl	%eax
-	ret
-
-.global VG_(helper_bsf)
-VG_(helper_bsf):
-	pushl	%eax	# same layout as the bsr helper
-	movl	12(%esp), %eax	# preload %eax with the old dst value
-	bsfl	8(%esp), %eax	# scan src, result into %eax
-	movl	%eax, 12(%esp)	# write %eax back over the dst slot
-	popl	%eax
-	ret
-
-
-/* Double-length shift left/right, 32-bit (l) and 16-bit (w) forms.
-   On entry:
-	amount
-	src
-	dst
-	RA   <- %esp
-*/
-.global VG_(helper_shldl)
-VG_(helper_shldl):
-	pushl	%eax	# three words pushed, so RA is now at 12(%esp)
-	pushl	%ebx
-	pushl	%ecx
-
-	movb	24(%esp), %cl	# amount (low byte only)
-	movl	20(%esp), %ebx	# src
-	movl	16(%esp), %eax	# dst
-	shldl	%cl, %ebx, %eax
-	movl	%eax, 16(%esp)	# result replaces dst
-	
-	popl	%ecx
-	popl	%ebx
-	popl	%eax
-	ret
-
-.global VG_(helper_shldw)
-VG_(helper_shldw):
-	pushl	%eax	# same layout as shldl, 16-bit operands
-	pushl	%ebx
-	pushl	%ecx
-
-	movb	24(%esp), %cl	# amount (low byte only)
-	movw	20(%esp), %bx	# src
-	movw	16(%esp), %ax	# dst
-	shldw	%cl, %bx, %ax
-	movw	%ax, 16(%esp)	# only the low 16 bits of the dst slot are written
-	
-	popl	%ecx
-	popl	%ebx
-	popl	%eax
-	ret
-
-.global VG_(helper_shrdl)
-VG_(helper_shrdl):
-	pushl	%eax	# same layout as shldl
-	pushl	%ebx
-	pushl	%ecx
-
-	movb	24(%esp), %cl	# amount (low byte only)
-	movl	20(%esp), %ebx	# src
-	movl	16(%esp), %eax	# dst
-	shrdl	%cl, %ebx, %eax
-	movl	%eax, 16(%esp)	# result replaces dst
-	
-	popl	%ecx
-	popl	%ebx
-	popl	%eax
-	ret
-
-.global VG_(helper_shrdw)
-VG_(helper_shrdw):
-	pushl	%eax	# same layout as shldl, 16-bit operands
-	pushl	%ebx
-	pushl	%ecx
-
-	movb	24(%esp), %cl	# amount (low byte only)
-	movw	20(%esp), %bx	# src
-	movw	16(%esp), %ax	# dst
-	shrdw	%cl, %bx, %ax
-	movw	%ax, 16(%esp)	# only the low 16 bits of the dst slot are written
-	
-	popl	%ecx
-	popl	%ebx
-	popl	%eax
-	ret
-
-	
-/* Get the direction flag, and return either 1 or -1. */
-.global VG_(helper_get_dirflag)
-VG_(helper_get_dirflag):
-	pushfl	# preserve the flags: shrl/andl below modify them
-	pushl	%eax
-
-	pushfl
-	popl	%eax	# %eax = current flags word
-	shrl	$10, %eax	# bring bit 10 down to bit 0
-	andl	$1, %eax
-	jnz	L1
-	movl	$1, %eax	# bit clear: result is 1
-	jmp	L2
-L1:	movl	$-1, %eax	# bit set: result is -1
-L2:	movl	%eax, 12(%esp)	# result slot just above RA (two words pushed)
-
-	popl %eax
-	popfl
-	ret
-
-
-/* Clear/set the direction flag.  No stack arguments are used. */
-.global VG_(helper_CLD)
-VG_(helper_CLD):
-	cld
-	ret
-
-.global VG_(helper_STD)
-VG_(helper_STD):
-	std
-	ret
-
-/* Clear/set the carry flag.  No stack arguments are used. */
-.global VG_(helper_CLC)
-VG_(helper_CLC):
-        clc
-        ret
-
-.global VG_(helper_STC)  
-VG_(helper_STC):
-        stc
-        ret
-
-/* Signed 32-to-64 multiply.  Operands are the two slots above RA. */
-.globl VG_(helper_imul_32_64)
-VG_(helper_imul_32_64):
-	pushl	%eax	# two words pushed, so RA is now at 8(%esp)
-	pushl	%edx
-	movl	16(%esp), %eax	# first operand
-	imull	12(%esp)	# %edx:%eax = %eax * second operand
-	movl	%eax, 16(%esp)	# low half of the product
-	movl	%edx, 12(%esp)	# high half of the product
-	popl	%edx
-	popl	%eax
-	ret
-	
-/* Signed 16-to-32 multiply.  Same layout, 16-bit accesses only. */
-.globl VG_(helper_imul_16_32)
-VG_(helper_imul_16_32):
-	pushl	%eax
-	pushl	%edx
-	movw	16(%esp), %ax	# first operand
-	imulw	12(%esp)	# %dx:%ax = %ax * second operand
-	movw	%ax, 16(%esp)	# low half of the product
-	movw	%dx, 12(%esp)	# high half of the product
-	popl	%edx
-	popl	%eax
-	ret
-	
-/* Signed 8-to-16 multiply.  The whole product lands in one slot. */
-.globl VG_(helper_imul_8_16)
-VG_(helper_imul_8_16):
-	pushl	%eax
-	pushl	%edx	# saved and restored, although not written here
-	movb	16(%esp), %al	# first operand
-	imulb	12(%esp)	# %ax = %al * second operand
-	movw	%ax, 16(%esp)	# 16-bit product; the 12(%esp) slot is left as is
-	popl	%edx
-	popl	%eax
-	ret
-	
-
-	
-	
-	
-	
-/* Unsigned 32-to-64 multiply.  Operands are the two slots above RA. */
-.globl VG_(helper_mul_32_64)
-VG_(helper_mul_32_64):
-	pushl	%eax	# two words pushed, so RA is now at 8(%esp)
-	pushl	%edx
-	movl	16(%esp), %eax	# first operand
-	mull	12(%esp)	# %edx:%eax = %eax * second operand
-	movl	%eax, 16(%esp)	# low half of the product
-	movl	%edx, 12(%esp)	# high half of the product
-	popl	%edx
-	popl	%eax
-	ret
-	
-/* Unsigned 16-to-32 multiply.  Same layout, 16-bit accesses only. */
-.globl VG_(helper_mul_16_32)
-VG_(helper_mul_16_32):
-	pushl	%eax
-	pushl	%edx
-	movw	16(%esp), %ax	# first operand
-	mulw	12(%esp)	# %dx:%ax = %ax * second operand
-	movw	%ax, 16(%esp)	# low half of the product
-	movw	%dx, 12(%esp)	# high half of the product
-	popl	%edx
-	popl	%eax
-	ret
-	
-/* Unsigned 8-to-16 multiply.  The whole product lands in one slot. */
-.globl VG_(helper_mul_8_16)
-VG_(helper_mul_8_16):
-	pushl	%eax
-	pushl	%edx	# saved and restored, although not written here
-	movb	16(%esp), %al	# first operand
-	mulb	12(%esp)	# %ax = %al * second operand
-	movw	%ax, 16(%esp)	# 16-bit product; the 12(%esp) slot is left as is
-	popl	%edx
-	popl	%eax
-	ret
-	
-	
-	
-		
-/* Unsigned 64-into-32 divide.  Divisor is the topmost of three slots. */
-.globl	VG_(helper_div_64_32)
-VG_(helper_div_64_32):
-	pushl	%eax	# two words pushed, so RA is now at 8(%esp)
-	pushl	%edx
-	movl	16(%esp),%eax	# low half of the dividend
-	movl	12(%esp),%edx	# high half of the dividend
-	divl	20(%esp)	# divide %edx:%eax by the divisor slot
-	movl	%eax,16(%esp)	# quotient
-	movl	%edx,12(%esp)	# remainder
-	popl	%edx
-	popl	%eax
-	ret
-
-/* Signed 64-into-32 divide.  Same layout as the unsigned version. */
-.globl	VG_(helper_idiv_64_32)
-VG_(helper_idiv_64_32):
-	pushl	%eax
-	pushl	%edx
-	movl	16(%esp),%eax	# low half of the dividend
-	movl	12(%esp),%edx	# high half of the dividend
-	idivl	20(%esp)	# divide %edx:%eax by the divisor slot
-	movl	%eax,16(%esp)	# quotient
-	movl	%edx,12(%esp)	# remainder
-	popl	%edx
-	popl	%eax
-	ret
-
-/* Unsigned 32-into-16 divide.  Same layout, 16-bit accesses only. */
-.globl	VG_(helper_div_32_16)
-VG_(helper_div_32_16):
-	pushl	%eax
-	pushl	%edx
-	movw	16(%esp),%ax	# low half of the dividend
-	movw	12(%esp),%dx	# high half of the dividend
-	divw	20(%esp)	# divide %dx:%ax by the divisor slot
-	movw	%ax,16(%esp)	# quotient
-	movw	%dx,12(%esp)	# remainder
-	popl	%edx
-	popl	%eax
-	ret
-
-/* Signed 32-into-16 divide.  Same layout, 16-bit accesses only. */
-.globl	VG_(helper_idiv_32_16)
-VG_(helper_idiv_32_16):
-	pushl	%eax
-	pushl	%edx
-	movw	16(%esp),%ax	# low half of the dividend
-	movw	12(%esp),%dx	# high half of the dividend
-	idivw	20(%esp)	# divide %dx:%ax by the divisor slot
-	movw	%ax,16(%esp)	# quotient
-	movw	%dx,12(%esp)	# remainder
-	popl	%edx
-	popl	%eax
-	ret
-
-/* Unsigned 16-into-8 divide.  Slot use differs from the wider forms. */
-.globl	VG_(helper_div_16_8)
-VG_(helper_div_16_8):
-	pushl	%eax	# one word pushed, so RA is now at 4(%esp)
-	movw	12(%esp),%ax	# whole 16-bit dividend comes from one slot
-	divb	16(%esp)	# divide %ax by the divisor slot
-	movb	%ah,12(%esp)	# remainder goes back to the dividend slot
-	movb	%al,8(%esp)	# quotient goes to the slot just above RA
-	popl	%eax
-	ret
-
-/* Signed 16-into-8 divide.  Same layout as the unsigned 16-into-8 form. */
-.globl	VG_(helper_idiv_16_8)
-VG_(helper_idiv_16_8):
-	pushl	%eax
-	movw	12(%esp),%ax	# whole 16-bit dividend comes from one slot
-	idivb	16(%esp)	# divide %ax by the divisor slot
-	movb	%ah,12(%esp)	# remainder goes back to the dividend slot
-	movb	%al,8(%esp)	# quotient goes to the slot just above RA
-	popl	%eax
-	ret
-
-		
-##--------------------------------------------------------------------##
-##--- end                                             vg_helpers.S ---##
-##--------------------------------------------------------------------##
diff --git a/coregrind/vg_include.h b/coregrind/vg_include.h
deleted file mode 100644
index 0d38c92332..0000000000
--- a/coregrind/vg_include.h
+++ /dev/null
@@ -1,2023 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A header file for all parts of Valgrind.                     ---*/
-/*--- Include no other!                                            ---*/
-/*---                                                 vg_include.h ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#ifndef __VG_INCLUDE_H
-#define __VG_INCLUDE_H
-
-
-#include <stdarg.h>       /* ANSI varargs stuff  */
-#include <setjmp.h>       /* for jmp_buf         */
-
-
-/* ---------------------------------------------------------------------
-   Where to send bug reports to.
-   ------------------------------------------------------------------ */
-
-#define VG_EMAIL_ADDR "jseward@acm.org"
-
-
-/* ---------------------------------------------------------------------
-   Build options and table sizes.  You should be able to change these
-   options or sizes, recompile, and still have a working system.
-   ------------------------------------------------------------------ */
-
-#include "vg_constants.h"
-
-
-/* Set to 1 to enable time profiling.  Since this uses SIGPROF, we
-   don't want this permanently enabled -- only for profiling
-   builds. */
-#if 0
-#  define VG_PROFILE
-#endif
-
-
-/* Total number of integer registers available for allocation.  That's
-   all of them except %esp, %edi and %ebp.  %edi is a general spare
-   temporary.  %ebp permanently points at VG_(baseBlock).  Note that
-   it's important that this tie in with what rankToRealRegNo() says.
-   DO NOT CHANGE THIS VALUE FROM 5. !  */
-#define VG_MAX_REALREGS 5
-
-/* Total number of spill slots available for allocation, if a TempReg
-   doesn't make it into a RealReg.  Just bomb the entire system if
-   this value is too small; we don't expect it will ever get
-   particularly high. */
-#define VG_MAX_SPILLSLOTS 24
-
-
-/* Constants for the slow translation lookup cache. */
-#define VG_TRANSTAB_SLOW_BITS 11
-#define VG_TRANSTAB_SLOW_SIZE (1 << VG_TRANSTAB_SLOW_BITS)
-#define VG_TRANSTAB_SLOW_MASK ((VG_TRANSTAB_SLOW_SIZE) - 1)
-
-/* Size of a buffer used for creating messages. */
-#define M_VG_MSGBUF 10000
-
-/* Size of a smallish table used to read /proc/self/maps entries. */
-#define M_PROCMAP_BUF 50000
-
-/* Max length of pathname to a .so/executable file. */
-#define M_VG_LIBNAMESTR 100
-
-/* Max length of a text fragment used to construct error messages. */
-#define M_VG_ERRTXT 512
-
-/* Max length of the string copied from env var VG_ARGS at startup. */
-#define M_VG_CMDLINE_STRLEN 1000
-
-/* Max number of options for Valgrind which we can handle. */
-#define M_VG_CMDLINE_OPTS 100
-
-/* After this many different unsuppressed errors have been observed,
-   be more conservative about collecting new ones. */
-#define M_VG_COLLECT_ERRORS_SLOWLY_AFTER 50
-
-/* After this many different unsuppressed errors have been observed,
-   stop collecting errors at all, and tell the user their program is
-   evidently a steaming pile of camel dung. */
-#define M_VG_COLLECT_NO_ERRORS_AFTER_SHOWN 300
-
-/* After this many total errors have been observed, stop collecting
-   errors at all.  Counterpart to M_VG_COLLECT_NO_ERRORS_AFTER_SHOWN. */
-#define M_VG_COLLECT_NO_ERRORS_AFTER_FOUND 30000
-
-/* This many bytes below %ESP are considered addressable if we're
-   doing the --workaround-gcc296-bugs hack. */
-#define VG_GCC296_BUG_STACK_SLOP 1024
-
-/* The maximum number of calls we're prepared to save in a
-   backtrace. */
-#define VG_DEEPEST_BACKTRACE 50
-
-/* Number of lists in which we keep track of malloc'd but not free'd
-   blocks.  Should be prime. */
-#define VG_N_MALLOCLISTS 997
-
-/* Number of lists in which we keep track of ExeContexts.  Should be
-   prime. */
-#define VG_N_EC_LISTS /*997*/ 4999
-
-/* Defines the thread-scheduling timeslice, in terms of the number of
-   basic blocks we attempt to run each thread for.  Smaller values
-   give finer interleaving but much increased scheduling overheads. */
-#define VG_SCHEDULING_QUANTUM   50000
-
-/* The maximum number of pthreads that we support.  This is
-   deliberately not very high since our implementation of some of the
-   scheduler algorithms is surely O(N) in the number of threads, since
-   that's simple, at least.  And (in practice) we hope that most
-   programs do not need many threads. */
-#define VG_N_THREADS 50
-
-/* Maximum number of pthread keys available.  Again, we start low until
-   the need for a higher number presents itself. */
-#define VG_N_THREAD_KEYS 50
-
-/* Number of file descriptors that can simultaneously be waited on for
-   I/O to complete.  Perhaps this should be the same as VG_N_THREADS
-   (surely a thread can't wait on more than one fd at once?  Who
-   knows.) */
-#define VG_N_WAITING_FDS 10
-
-/* Stack size for a thread.  We try and check that they do not go
-   beyond it. */
-#define VG_PTHREAD_STACK_SIZE (1 << 20)
-
-/* Number of entries in the semaphore-remapping table. */
-#define VG_N_SEMAPHORES 50
-
-/* Number of entries in the rwlock-remapping table. */
-#define VG_N_RWLOCKS 50
-
-/* Number of entries in each thread's cleanup stack. */
-#define VG_N_CLEANUPSTACK 8
-
-/* Number of entries in each thread's fork-handler stack. */
-#define VG_N_FORKHANDLERSTACK 2
-
-
-/* ---------------------------------------------------------------------
-   Basic types
-   ------------------------------------------------------------------ */
-
-typedef unsigned char          UChar;
-typedef unsigned short         UShort;
-typedef unsigned int           UInt;
-typedef unsigned long long int ULong;
-
-typedef signed char          Char;
-typedef signed short         Short;
-typedef signed int           Int;
-typedef signed long long int Long;
-
-typedef unsigned int Addr;
-
-typedef unsigned char Bool;
-#define False ((Bool)0)
-#define True ((Bool)1)
-
-#define mycat_wrk(aaa,bbb) aaa##bbb
-#define mycat(aaa,bbb) mycat_wrk(aaa,bbb)
-
-/* Just pray that gcc's constant folding works properly ... */
-#define BITS(bit7,bit6,bit5,bit4,bit3,bit2,bit1,bit0)               \
-   ( ((bit7) << 7) | ((bit6) << 6) | ((bit5) << 5) | ((bit4) << 4)  \
-     | ((bit3) << 3) | ((bit2) << 2) | ((bit1) << 1) | (bit0))
-
-/* For cache simulation */
-typedef struct { 
-    int size;       /* bytes */
-    int assoc;
-    int line_size;  /* bytes */
-} cache_t;
-
-#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })
-
-/* ---------------------------------------------------------------------
-   Now the basic types are set up, we can haul in the kernel-interface
-   definitions.
-   ------------------------------------------------------------------ */
-
-#include "./vg_kerneliface.h"
-
-
-/* ---------------------------------------------------------------------
-   Command-line-settable options
-   ------------------------------------------------------------------ */
-
-#define VG_CLO_SMC_NONE 0
-#define VG_CLO_SMC_SOME 1
-#define VG_CLO_SMC_ALL  2
-
-#define VG_CLO_MAX_SFILES 10
-
-/* Should we stop collecting errors if too many appear?  default: YES */
-extern Bool  VG_(clo_error_limit);
-/* Shall we V-check addrs (they are always A checked too): default: YES */
-extern Bool  VG_(clo_check_addrVs);
-/* Enquire about whether to attach to GDB at errors?   default: NO */
-extern Bool  VG_(clo_GDB_attach);
-/* Sanity-check level: 0 = none, 1 (default), > 1 = expensive. */
-extern Int   VG_(sanity_level);
-/* Verbosity level: 0 = silent, 1 (default), > 1 = more verbose. */
-extern Int   VG_(clo_verbosity);
-/* Automatically attempt to demangle C++ names?  default: YES */
-extern Bool  VG_(clo_demangle);
-/* Do leak check at exit?  default: NO */
-extern Bool  VG_(clo_leak_check);
-/* In leak check, show reachable-but-not-freed blocks?  default: NO */
-extern Bool  VG_(clo_show_reachable);
-/* How closely should we compare ExeContexts in leak records? default: 2 */
-extern Int   VG_(clo_leak_resolution);
-/* Round malloc sizes upwards to integral number of words? default:
-   NO */
-extern Bool  VG_(clo_sloppy_malloc);
-/* Minimum alignment in functions that don't specify alignment explicitly.
-   default: 0, i.e. use default of the machine (== 4) */
-extern Int   VG_(clo_alignment);
-/* Allow loads from partially-valid addresses?  default: YES */
-extern Bool  VG_(clo_partial_loads_ok);
-/* Simulate child processes? default: NO */
-extern Bool  VG_(clo_trace_children);
-/* The file id on which we send all messages.  default: 2 (stderr). */
-extern Int   VG_(clo_logfile_fd);
-/* Max volume of the freed blocks queue. */
-extern Int   VG_(clo_freelist_vol);
-/* Assume accesses immediately below %esp are due to gcc-2.96 bugs.
-   default: NO */
-extern Bool  VG_(clo_workaround_gcc296_bugs);
-
-/* The number of suppression files specified. */
-extern Int   VG_(clo_n_suppressions);
-/* The names of the suppression files. */
-extern Char* VG_(clo_suppressions)[VG_CLO_MAX_SFILES];
-
-/* Single stepping?  default: NO */
-extern Bool  VG_(clo_single_step);
-/* Code improvement?  default: YES */
-extern Bool  VG_(clo_optimise);
-/* Memory-check instrumentation?  default: YES */
-extern Bool  VG_(clo_instrument);
-/* DEBUG: clean up instrumented code?  default: YES */
-extern Bool  VG_(clo_cleanup);
-/* Cache simulation instrumentation?  default: NO */
-extern Bool  VG_(clo_cachesim);
-/* I1 cache configuration.  default: undefined */
-extern cache_t VG_(clo_I1_cache);
-/* D1 cache configuration.  default: undefined */
-extern cache_t VG_(clo_D1_cache);
-/* L2 cache configuration.  default: undefined */
-extern cache_t VG_(clo_L2_cache);
-/* SMC write checks?  default: SOME (1,2,4 byte movs to mem) */
-extern Int   VG_(clo_smc_check);
-/* DEBUG: print system calls?  default: NO */
-extern Bool  VG_(clo_trace_syscalls);
-/* DEBUG: print signal details?  default: NO */
-extern Bool  VG_(clo_trace_signals);
-/* DEBUG: print symtab details?  default: NO */
-extern Bool  VG_(clo_trace_symtab);
-/* DEBUG: print malloc details?  default: NO */
-extern Bool  VG_(clo_trace_malloc);
-/* DEBUG: print thread scheduling events?  default: NO */
-extern Bool  VG_(clo_trace_sched);
-/* DEBUG: print pthread (mutex etc) events?  default: 0 (none), 1
-   (some), 2 (all) */
-extern Int   VG_(clo_trace_pthread_level);
-/* Stop after this many basic blocks.  default: Infinity. */
-extern ULong VG_(clo_stop_after);
-/* Display gory details for the k'th most popular error.  default:
-   Infinity. */
-extern Int   VG_(clo_dump_error);
-/* Number of parents of a backtrace.  Default: 8.  */
-extern Int   VG_(clo_backtrace_size);
-/* Engage miscellaneous weird hacks needed for some progs. */
-extern Char* VG_(clo_weird_hacks);
-
-
-/* ---------------------------------------------------------------------
-   Debugging and profiling stuff
-   ------------------------------------------------------------------ */
-
-/* No, really.  I _am_ that strange. */
-#define OINK(nnn) VG_(message)(Vg_DebugMsg, "OINK %d",nnn)
-
-/* Tools for building messages from multiple parts. */
-typedef
-   enum { Vg_UserMsg, Vg_DebugMsg, Vg_DebugExtraMsg }
-   VgMsgKind;
-
-extern void VG_(start_msg)  ( VgMsgKind kind );
-extern void VG_(add_to_msg) ( Char* format, ... );
-extern void VG_(end_msg)    ( void );
-
-/* Send a simple, single-part message. */
-extern void VG_(message)    ( VgMsgKind kind, Char* format, ... );
-
-/* Create a logfile into which messages can be dumped. */
-extern void VG_(startup_logging) ( void );
-extern void VG_(shutdown_logging) ( void );
-
-
-/* Profiling stuff */
-#ifdef VG_PROFILE
-
-#define VGP_M_STACK 10
-
-#define VGP_M_CCS 26  /* == the # of elems in VGP_LIST */
-#define VGP_LIST \
-   VGP_PAIR(VgpUnc=0,      "unclassified"),           \
-   VGP_PAIR(VgpRun,        "running"),                \
-   VGP_PAIR(VgpSched,      "scheduler"),              \
-   VGP_PAIR(VgpMalloc,     "low-lev malloc/free"),    \
-   VGP_PAIR(VgpCliMalloc,  "client  malloc/free"),    \
-   VGP_PAIR(VgpTranslate,  "translate-main"),         \
-   VGP_PAIR(VgpToUCode,    "to-ucode"),               \
-   VGP_PAIR(VgpFromUcode,  "from-ucode"),             \
-   VGP_PAIR(VgpImprove,    "improve"),                \
-   VGP_PAIR(VgpInstrument, "instrument"),             \
-   VGP_PAIR(VgpCleanup,    "cleanup"),                \
-   VGP_PAIR(VgpRegAlloc,   "reg-alloc"),              \
-   VGP_PAIR(VgpDoLRU,      "do-lru"),                 \
-   VGP_PAIR(VgpSlowFindT,  "slow-search-transtab"),   \
-   VGP_PAIR(VgpInitAudit,  "init-mem-audit"),         \
-   VGP_PAIR(VgpExeContext, "exe-context"),            \
-   VGP_PAIR(VgpReadSyms,   "read-syms"),              \
-   VGP_PAIR(VgpAddToT,     "add-to-transtab"),        \
-   VGP_PAIR(VgpSARP,       "set-addr-range-perms"),   \
-   VGP_PAIR(VgpSyscall,    "syscall wrapper"),        \
-   VGP_PAIR(VgpCacheInstrument, "cache instrument"),  \
-   VGP_PAIR(VgpCacheGetBBCC,"cache get BBCC"),        \
-   VGP_PAIR(VgpCacheSimulate, "cache simulate"),      \
-   VGP_PAIR(VgpCacheDump,  "cache stats dump"),       \
-   VGP_PAIR(VgpSpare1,     "spare 1"),                \
-   VGP_PAIR(VgpSpare2,     "spare 2")
-
-#define VGP_PAIR(enumname,str) enumname
-typedef enum { VGP_LIST } VgpCC;
-#undef VGP_PAIR
-
-extern void VGP_(init_profiling) ( void );
-extern void VGP_(done_profiling) ( void );
-extern void VGP_(pushcc) ( VgpCC );
-extern void VGP_(popcc) ( void );
-
-#define VGP_PUSHCC(cc) VGP_(pushcc)(cc)
-#define VGP_POPCC      VGP_(popcc)()
-
-#else
-
-#define VGP_PUSHCC(cc) /* */
-#define VGP_POPCC      /* */
-
-#endif /* VG_PROFILE */
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_malloc2.c
-   ------------------------------------------------------------------ */
-
-/* Allocation arenas.  
-      SYMTAB    is for Valgrind's symbol table storage.
-      CLIENT    is for the client's mallocs/frees.
-      DEMANGLE  is for the C++ demangler.
-      EXECTXT   is for storing ExeContexts.
-      ERRCTXT   is for storing ErrContexts.
-      PRIVATE   is for Valgrind general stuff.
-      TRANSIENT is for very short-term use.  It should be empty
-                in between uses.
-   When adding a new arena, remember also to add it
-   to ensure_mm_init(). 
-*/
-typedef Int ArenaId;
-
-#define VG_N_ARENAS 7
-
-#define VG_AR_PRIVATE   0    /* :: ArenaId */
-#define VG_AR_SYMTAB    1    /* :: ArenaId */
-#define VG_AR_CLIENT    2    /* :: ArenaId */
-#define VG_AR_DEMANGLE  3    /* :: ArenaId */
-#define VG_AR_EXECTXT   4    /* :: ArenaId */
-#define VG_AR_ERRCTXT   5    /* :: ArenaId */
-#define VG_AR_TRANSIENT 6    /* :: ArenaId */
-
-extern void* VG_(malloc)  ( ArenaId arena, Int nbytes );
-extern void  VG_(free)    ( ArenaId arena, void* ptr );
-extern void* VG_(calloc)  ( ArenaId arena, Int nmemb, Int nbytes );
-extern void* VG_(realloc) ( ArenaId arena, void* ptr, Int size );
-extern void* VG_(malloc_aligned) ( ArenaId aid, Int req_alignB, 
-                                                Int req_pszB );
-
-extern void  VG_(mallocSanityCheckArena) ( ArenaId arena );
-extern void  VG_(mallocSanityCheckAll)   ( void );
-
-extern void  VG_(show_all_arena_stats) ( void );
-extern Bool  VG_(is_empty_arena) ( ArenaId aid );
-
-
-/* The red-zone size for the client.  This can be arbitrary, but
-   unfortunately must be set at compile time. */
-#define VG_AR_CLIENT_REDZONE_SZW 4
-
-#define VG_AR_CLIENT_REDZONE_SZB \
-   (VG_AR_CLIENT_REDZONE_SZW * VKI_BYTES_PER_WORD)
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_clientfuncs.c
-   ------------------------------------------------------------------ */
-
-/* This doesn't export code or data that valgrind.so needs to link
-   against.  However, the scheduler does need to know the following
-   request codes.  A few publicly visible request codes are also
-   defined in valgrind.h. */
-
-#define VG_USERREQ__MALLOC              0x2001
-#define VG_USERREQ__BUILTIN_NEW         0x2002
-#define VG_USERREQ__BUILTIN_VEC_NEW     0x2003
-
-#define VG_USERREQ__FREE                0x2004
-#define VG_USERREQ__BUILTIN_DELETE      0x2005
-#define VG_USERREQ__BUILTIN_VEC_DELETE  0x2006
-
-#define VG_USERREQ__CALLOC              0x2007
-#define VG_USERREQ__REALLOC             0x2008
-#define VG_USERREQ__MEMALIGN            0x2009
-
-
-/* (Fn, Arg): Create a new thread and run Fn applied to Arg in it.  Fn
-   MUST NOT return -- ever.  Eventually it will do either __QUIT or
-   __WAIT_JOINER.  */
-#define VG_USERREQ__APPLY_IN_NEW_THREAD     0x3001
-
-/* ( no-args ): calling thread disappears from the system forever.
-   Reclaim resources. */
-#define VG_USERREQ__QUIT                    0x3002
-
-/* ( void* ): calling thread waits for joiner and returns the void* to
-   it. */
-#define VG_USERREQ__WAIT_JOINER             0x3003
-
-/* ( ThreadId, void** ): wait to join a thread. */
-#define VG_USERREQ__PTHREAD_JOIN            0x3004
-
-/* Set cancellation state and type for this thread. */
-#define VG_USERREQ__SET_CANCELSTATE         0x3005
-#define VG_USERREQ__SET_CANCELTYPE          0x3006
-
-/* ( no-args ): Test if we are at a cancellation point. */
-#define VG_USERREQ__TESTCANCEL              0x3007
-
-/* ( ThreadId, &thread_exit_wrapper is the only allowable arg ): call
-   with this arg to indicate that a cancel is now pending for the
-   specified thread. */
-#define VG_USERREQ__SET_CANCELPEND          0x3008
-
-/* Set/get detach state for this thread. */
-#define VG_USERREQ__SET_OR_GET_DETACH       0x3009
-
-#define VG_USERREQ__PTHREAD_GET_THREADID    0x300B
-#define VG_USERREQ__PTHREAD_MUTEX_LOCK      0x300C
-#define VG_USERREQ__PTHREAD_MUTEX_TRYLOCK   0x300D
-#define VG_USERREQ__PTHREAD_MUTEX_UNLOCK    0x300E
-#define VG_USERREQ__PTHREAD_COND_WAIT       0x300F
-#define VG_USERREQ__PTHREAD_COND_TIMEDWAIT  0x3010
-#define VG_USERREQ__PTHREAD_COND_SIGNAL     0x3011
-#define VG_USERREQ__PTHREAD_COND_BROADCAST  0x3012
-#define VG_USERREQ__PTHREAD_KEY_CREATE      0x3013
-#define VG_USERREQ__PTHREAD_KEY_DELETE      0x3014
-#define VG_USERREQ__PTHREAD_SETSPECIFIC     0x3015
-#define VG_USERREQ__PTHREAD_GETSPECIFIC     0x3016
-#define VG_USERREQ__READ_MILLISECOND_TIMER  0x3017
-#define VG_USERREQ__PTHREAD_SIGMASK         0x3018
-#define VG_USERREQ__SIGWAIT                 0x3019
-#define VG_USERREQ__PTHREAD_KILL            0x301A
-#define VG_USERREQ__PTHREAD_YIELD           0x301B
-
-#define VG_USERREQ__CLEANUP_PUSH            0x3020
-#define VG_USERREQ__CLEANUP_POP             0x3021
-#define VG_USERREQ__GET_KEY_D_AND_S         0x3022
-
-#define VG_USERREQ__NUKE_OTHER_THREADS      0x3023
-
-/* Ask how many signal handler returns have happened to this
-   thread. */
-#define VG_USERREQ__GET_N_SIGS_RETURNED     0x3024
-
-/* Get/set entries for a thread's pthread_atfork stack. */
-#define VG_USERREQ__SET_FHSTACK_USED        0x3025
-#define VG_USERREQ__GET_FHSTACK_USED        0x3026
-#define VG_USERREQ__SET_FHSTACK_ENTRY       0x3027
-#define VG_USERREQ__GET_FHSTACK_ENTRY       0x3028
-
-/* Denote the finish of VG_(__libc_freeres_wrapper). */
-#define VG_USERREQ__LIBC_FREERES_DONE       0x3029
-
-/* Cosmetic ... */
-#define VG_USERREQ__GET_PTHREAD_TRACE_LEVEL 0x3101
-/* Log a pthread error from client-space.  Cosmetic. */
-#define VG_USERREQ__PTHREAD_ERROR           0x3102
-
-/* 
-In vg_constants.h:
-#define VG_USERREQ__SIGNAL_RETURNS          0x4001
-*/
-
-/* The scheduler does need to know the address of it so it can be
-   called at program exit. */
-extern void VG_(__libc_freeres_wrapper)( void );
-
-
-/* ---------------------------------------------------------------------
-   Constants pertaining to the simulated CPU state, VG_(baseBlock),
-   which need to go here to avoid ugly circularities.
-   ------------------------------------------------------------------ */
-
-/* How big is the saved FPU state? */
-#define VG_SIZE_OF_FPUSTATE 108
-/* ... and in words ... */
-#define VG_SIZE_OF_FPUSTATE_W ((VG_SIZE_OF_FPUSTATE+3)/4)
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_scheduler.c
-   ------------------------------------------------------------------ */
-
-/* ThreadIds are simply indices into the VG_(threads)[] array. */
-typedef 
-   UInt 
-   ThreadId;
-
-/* Special magic value for an invalid ThreadId.  It corresponds to
-   LinuxThreads using zero as the initial value for
-   pthread_mutex_t.__m_owner and pthread_cond_t.__c_waiting. */
-#define VG_INVALID_THREADID ((ThreadId)(0))
-
-typedef
-   enum { 
-      VgTs_Empty,      /* this slot is not in use */
-      VgTs_Runnable,   /* waiting to be scheduled */
-      VgTs_WaitJoiner, /* waiting for someone to do join on me */
-      VgTs_WaitJoinee, /* waiting for the thread I did join on */
-      VgTs_WaitFD,     /* waiting for I/O completion on a fd */
-      VgTs_WaitMX,     /* waiting on a mutex */
-      VgTs_WaitCV,     /* waiting on a condition variable */
-      VgTs_WaitSIG,    /* waiting due to sigwait() */
-      VgTs_Sleeping    /* sleeping for a while */
-   }
-   ThreadStatus;
-
-/* An entry in a thread's cleanup stack. */
-typedef
-   struct {
-      void (*fn)(void*);
-      void* arg;
-   }
-   CleanupEntry;
-
-/* An entry in a thread's fork-handler stack. */
-typedef
-   struct {
-      void (*prepare)(void);
-      void (*parent)(void);
-      void (*child)(void);
-   }
-   ForkHandlerEntry;
-
-
-typedef
-   struct {
-      /* ThreadId == 0 (and hence VG_(threads)[0]) is NEVER USED.
-         The thread identity is simply the index in VG_(threads)[].
-         ThreadId == 1 is the root thread and has the special property
-         that we don't try and allocate or deallocate its stack.  For
-         convenience of generating error messages, we also put the
-         ThreadId in this tid field, but be aware that it should
-         ALWAYS == the index in VG_(threads)[]. */
-      ThreadId tid;
-
-      /* Current scheduling status. 
-
-         Complications: whenever this is set to VgTs_WaitMX, you
-         should also set .m_edx to whatever the required return value
-         is for pthread_mutex_lock / pthread_cond_timedwait for when
-         the mutex finally gets unblocked. */
-      ThreadStatus status;
-
-      /* When .status == WaitMX, points to the mutex I am waiting for.
-         When .status == WaitCV, points to the mutex associated with
-         the condition variable indicated by the .associated_cv field.
-         In all other cases, should be NULL. */
-      void* /* pthread_mutex_t* */ associated_mx;
-
-      /* When .status == WaitCV, points to the condition variable I am
-         waiting for.  In all other cases, should be NULL. */
-      void* /* pthread_cond_t* */ associated_cv;
-
-      /* If VgTs_Sleeping, this is when we should wake up, measured in
-         milliseconds as supplied by VG_(read_millisecond_timer). 
- 
-         If VgTs_WaitCV, this indicates the time at which
-         pthread_cond_timedwait should wake up.  If == 0xFFFFFFFF,
-         this means infinitely far in the future, viz,
-         pthread_cond_wait. */
-      UInt awaken_at;
-
-      /* If VgTs_WaitJoiner, return value, as generated by joinees. */
-      void* joinee_retval;
-
-      /* If VgTs_WaitJoinee, place to copy the return value to, and
-         the identity of the thread we're waiting for. */
-      void**   joiner_thread_return;
-      ThreadId joiner_jee_tid;      
-
-      /* Whether or not detached. */
-      Bool detached;
-
-      /* Cancelability state and type. */
-      Bool cancel_st; /* False==PTH_CANCEL_DISABLE; True==.._ENABLE */
-      Bool cancel_ty; /* False==PTH_CANC_ASYNCH; True==..._DEFERRED */
-     
-      /* Pointer to fn to call to do cancellation.  Indicates whether
-         or not cancellation is pending.  If NULL, not pending.  Else
-         should be &thread_exit_wrapper(), indicating that
-         cancellation is pending. */
-      void (*cancel_pend)(void*);
-
-      /* The cleanup stack. */
-      Int          custack_used;
-      CleanupEntry custack[VG_N_CLEANUPSTACK];
-
-      /* thread-specific data */
-      void* specifics[VG_N_THREAD_KEYS];
-
-      /* This thread's blocked-signals mask.  Semantics is that for a
-         signal to be delivered to this thread, the signal must not be
-         blocked by either the process-wide signal mask or by this
-         one.  So, if this thread is prepared to handle any signal that
-         the process as a whole is prepared to handle, this mask should
-         be made empty -- and that is its default, starting
-         state. */
-      vki_ksigset_t sig_mask;
-
-      /* When not VgTs_WaitSIG, has no meaning.  When VgTs_WaitSIG,
-         is the set of signals for which we are sigwait()ing. */
-      vki_ksigset_t sigs_waited_for;
-
-      /* Counts the number of times a signal handler for this thread
-         has returned.  This makes it easy to implement pause(), by
-         polling this value, of course interspersed with nanosleeps,
-         and waiting till it changes. */
-      UInt n_signals_returned;
-
-      /* Stacks.  When a thread slot is freed, we don't deallocate its
-         stack; we just leave it lying around for the next use of the
-         slot.  If the next use of the slot requires a larger stack,
-         only then is the old one deallocated and a new one
-         allocated. 
- 
-         For the main thread (ThreadId == 1), this mechanism doesn't
-         apply.  We don't know the size of the stack since we didn't
-         allocate it, and furthermore we never reallocate it. */
-
-      /* The allocated size of this thread's stack (permanently zero
-         if this is ThreadId == 1, since we didn't allocate its stack) */
-      UInt stack_size;
-
-      /* Address of the lowest word in this thread's stack.  NULL means
-         not allocated yet.
-      */
-      Addr stack_base;
-
-     /* Address of the highest legitimate word in this stack.  This is
-        used for error messages only -- not critical for execution
-        correctness.  It is set for all stacks, specifically including
-        ThreadId == 1 (the main thread). */
-      Addr stack_highest_word;
-
-      /* Saved machine context. */
-      UInt m_eax;
-      UInt m_ebx;
-      UInt m_ecx;
-      UInt m_edx;
-      UInt m_esi;
-      UInt m_edi;
-      UInt m_ebp;
-      UInt m_esp;
-      UInt m_eflags;
-      UInt m_eip;
-      UInt m_fpu[VG_SIZE_OF_FPUSTATE_W];
-
-      UInt sh_eax;
-      UInt sh_ebx;
-      UInt sh_ecx;
-      UInt sh_edx;
-      UInt sh_esi;
-      UInt sh_edi;
-      UInt sh_ebp;
-      UInt sh_esp;
-      UInt sh_eflags;
-   }
-   ThreadState;
-
-
-/* The thread table. */
-extern ThreadState VG_(threads)[VG_N_THREADS];
-
-/* Check that tid is in range and denotes a non-Empty thread. */
-extern Bool VG_(is_valid_tid) ( ThreadId tid );
-
-/* Check that tid is in range. */
-extern Bool VG_(is_valid_or_empty_tid) ( ThreadId tid );
-
-/* Copy the specified thread's state into VG_(baseBlock) in
-   preparation for running it. */
-extern void VG_(load_thread_state)( ThreadId );
-
-/* Save the specified thread's state back from VG_(baseBlock), and
-   then fill VG_(baseBlock) with junk, for sanity-check reasons. */
-extern void VG_(save_thread_state)( ThreadId );
-
-/* And for the currently running one, if valid. */
-extern ThreadState* VG_(get_current_thread_state) ( void );
-
-/* Similarly ... */
-extern ThreadId VG_(get_current_tid) ( void );
-
-/* Which thread is this address in the stack of, if any?  Used for
-   error message generation. */
-extern ThreadId VG_(identify_stack_addr)( Addr a );
-
-/* Nuke all threads except tid. */
-extern void VG_(nuke_all_threads_except) ( ThreadId me );
-
-
-/* Return codes from the scheduler. */
-typedef
-   enum { 
-      VgSrc_Deadlock,    /* no runnable threads and no prospect of any
-                            even if we wait for a long time */
-      VgSrc_ExitSyscall, /* client called exit().  This is the normal
-                            route out. */
-      VgSrc_BbsDone      /* In a debugging run, the specified number of
-                            bbs has been completed. */
-   }
-   VgSchedReturnCode;
-
-
-/* The scheduler. */
-extern VgSchedReturnCode VG_(scheduler) ( void );
-
-extern void VG_(scheduler_init) ( void );
-
-extern void VG_(pp_sched_status) ( void );
-
-/* vg_oursignalhandler() might longjmp().  Here's the jmp_buf. */
-extern jmp_buf VG_(scheduler_jmpbuf);
-/* This says whether scheduler_jmpbuf is actually valid.  Needed so
-   that our signal handler doesn't longjmp when the buffer isn't
-   actually valid. */
-extern Bool    VG_(scheduler_jmpbuf_valid);
-/* ... and if so, here's the signal which caused it to do so. */
-extern Int     VG_(longjmpd_on_signal);
-
-
-/* Possible places where the main stack might be based.  We check that
-   the initial stack, which we can't move, is allocated here.
-   VG_(scheduler_init) checks this.  Andrea Arcangeli's 2.4 kernels
-   have been rumoured to start stacks at 0x80000000, so that too is
-   considered.  It seems systems with longer uptimes sometimes use
-   stacks which start at 0x40000000.
-*/
-#define VG_STARTUP_STACK_BASE_1  (Addr)0xC0000000
-#define VG_STARTUP_STACK_BASE_2  (Addr)0x80000000
-#define VG_STARTUP_STACK_BASE_3  (Addr)0x40000000
-#define VG_STARTUP_STACK_SMALLERTHAN  0x100000 /* 1024k */
-
-#define VG_STACK_MATCHES_BASE(zzstack, zzbase)                 \
-   (                                                           \
-      ((zzstack) & ((zzbase) - VG_STARTUP_STACK_SMALLERTHAN))  \
-      ==                                                       \
-      ((zzbase) - VG_STARTUP_STACK_SMALLERTHAN)                \
-   )
-
-
-/* The red-zone size which we put at the bottom (highest address) of
-   thread stacks, for paranoia reasons.  This can be arbitrary, and
-   doesn't really need to be set at compile time. */
-#define VG_AR_CLIENT_STACKBASE_REDZONE_SZW 4
-
-#define VG_AR_CLIENT_STACKBASE_REDZONE_SZB \
-   (VG_AR_CLIENT_STACKBASE_REDZONE_SZW * VKI_BYTES_PER_WORD)
-
-
-/* SET_EDX / SET_EAX: write a value to thread zztid's saved %EDX (the
-   request return value register) or %EAX, and mark its shadow valid. */
-#define SET_EDX(zztid, zzval)                          \
-   do { VG_(threads)[zztid].m_edx = (zzval);             \
-        VG_(threads)[zztid].sh_edx = VGM_WORD_VALID;     \
-   } while (0)
-
-#define SET_EAX(zztid, zzval)                          \
-   do { VG_(threads)[zztid].m_eax = (zzval);             \
-        VG_(threads)[zztid].sh_eax = VGM_WORD_VALID;     \
-   } while (0)
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_signals.c
-   ------------------------------------------------------------------ */
-
-extern void VG_(sigstartup_actions) ( void );
-
-extern Bool VG_(deliver_signals) ( void );
-extern void VG_(unblock_host_signal) ( Int sigNo );
-extern void VG_(handle_SCSS_change) ( Bool force_update );
-
-
-/* Fake system calls for signal handling. */
-extern void VG_(do__NR_sigaltstack)   ( ThreadId tid );
-extern void VG_(do__NR_sigaction)     ( ThreadId tid );
-extern void VG_(do__NR_sigprocmask)   ( ThreadId tid,
-                                        Int how, 
-                                        vki_ksigset_t* set,
-                                        vki_ksigset_t* oldset );
-extern void VG_(do_pthread_sigmask_SCSS_upd) ( ThreadId tid,
-                                               Int how, 
-                                               vki_ksigset_t* set,
-                                               vki_ksigset_t* oldset );
-extern void VG_(send_signal_to_thread) ( ThreadId thread,
-                                         Int signo );
-
-extern void VG_(do_sigpending) ( ThreadId tid, vki_ksigset_t* set );
-
-
-/* Modify the current thread's state once we have detected it is
-   returning from a signal handler. */
-extern Bool VG_(signal_returns) ( ThreadId );
-
-/* Handy utilities to block/restore all host signals. */
-extern void VG_(block_all_host_signals) 
-                  ( /* OUT */ vki_ksigset_t* saved_mask );
-extern void VG_(restore_all_host_signals) 
-                  ( /* IN */ vki_ksigset_t* saved_mask );
-
-/* ---------------------------------------------------------------------
-   Exports of vg_mylibc.c
-   ------------------------------------------------------------------ */
-
-
-#if !defined(NULL)
-#  define NULL ((void*)0)
-#endif
-
-extern void VG_(exit)( Int status )
-            __attribute__ ((__noreturn__));
-
-extern void VG_(printf) ( const char *format, ... );
-/* too noisy ...  __attribute__ ((format (printf, 1, 2))) ; */
-
-extern void VG_(sprintf) ( Char* buf, Char *format, ... );
-
-extern void VG_(vprintf) ( void(*send)(Char), 
-                          const Char *format, va_list vargs );
-
-extern Bool VG_(isspace) ( Char c );
-extern Bool VG_(isdigit) ( Char c );
-
-extern Int VG_(strlen) ( const Char* str );
-
-extern Long VG_(atoll) ( Char* str );
-extern Long VG_(atoll36) ( Char* str );
-
-extern Char* VG_(strcat) ( Char* dest, const Char* src );
-extern Char* VG_(strncat) ( Char* dest, const Char* src, Int n );
-extern Char* VG_(strpbrk) ( const Char* s, const Char* accept );
-
-extern Char* VG_(strcpy) ( Char* dest, const Char* src );
-
-extern Int VG_(strcmp)    ( const Char* s1, const Char* s2 );
-extern Int VG_(strcmp_ws) ( const Char* s1, const Char* s2 );
-
-extern Int VG_(strncmp)    ( const Char* s1, const Char* s2, Int nmax );
-extern Int VG_(strncmp_ws) ( const Char* s1, const Char* s2, Int nmax );
-
-extern Char* VG_(strstr) ( const Char* haystack, Char* needle );
-extern Char* VG_(strchr) ( const Char* s, Char c );
-extern Char* VG_(strdup) ( ArenaId aid, const Char* s);
-
-extern Char* VG_(getenv) ( Char* name );
-extern Int   VG_(getpid) ( void );
-
-extern void VG_(start_rdtsc_calibration) ( void );
-extern void VG_(end_rdtsc_calibration) ( void );
-extern UInt VG_(read_millisecond_timer) ( void );
-
-
-extern Char VG_(toupper) ( Char c );
-
-extern void VG_(strncpy_safely) ( Char* dest, const Char* src, Int ndest );
-
-extern void VG_(strncpy) ( Char* dest, const Char* src, Int ndest );
-
-extern Bool VG_(stringMatch) ( Char* pat, Char* str );
-
-
-#define VG__STRING(__str)  #__str
-
-/* Asserts are permanently enabled.  Hurrah! */
-#define vg_assert(expr)                                               \
-  ((void) ((expr) ? 0 :						      \
-	   (VG_(assert_fail) (VG__STRING(expr),			      \
-			      __FILE__, __LINE__,                     \
-                              __PRETTY_FUNCTION__), 0)))
-
-extern void VG_(assert_fail) ( Char* expr, Char* file, 
-                               Int line, Char* fn )
-            __attribute__ ((__noreturn__));
-
-/* Reading and writing files. */
-extern Int  VG_(open_read) ( Char* pathname );
-extern Int  VG_(open_write)       ( Char* pathname );
-extern Int  VG_(create_and_write) ( Char* pathname );
-extern void VG_(close)     ( Int fd );
-extern Int  VG_(read)      ( Int fd, void* buf, Int count);
-extern Int  VG_(write)     ( Int fd, void* buf, Int count);
-extern Int  VG_(stat) ( Char* file_name, struct vki_stat* buf );
-
-extern Int  VG_(fcntl) ( Int fd, Int cmd, Int arg );
-
-extern Int VG_(select)( Int n, 
-                        vki_fd_set* readfds, 
-                        vki_fd_set* writefds, 
-                        vki_fd_set* exceptfds, 
-                        struct vki_timeval * timeout );
-extern Int VG_(nanosleep)( const struct vki_timespec *req, 
-                           struct vki_timespec *rem );
-
-
-/* mmap-ery ... */
-extern void* VG_(mmap)( void* start, UInt length, 
-                        UInt prot, UInt flags, UInt fd, UInt offset );
-
-extern Int  VG_(munmap)( void* start, Int length );
-
-extern void* VG_(brk) ( void* end_data_segment );
-
-
-/* Print a (panic) message, and abort. */
-extern void VG_(panic) ( Char* str )
-            __attribute__ ((__noreturn__));
-
-/* Get memory by anonymous mmap. */
-extern void* VG_(get_memory_from_mmap) ( Int nBytes, Char* who );
-
-/* Crude stand-in for the glibc system() call. */
-extern Int VG_(system) ( Char* cmd );
-
-
-/* Signal stuff.  Note that these use the vki_ (kernel) structure
-   definitions, which are different in places from those that glibc
-   defines.  Since we're operating right at the kernel interface,
-   glibc's view of the world is entirely irrelevant. */
-
-/* --- Signal set ops --- */
-extern Int  VG_(ksigfillset)( vki_ksigset_t* set );
-extern Int  VG_(ksigemptyset)( vki_ksigset_t* set );
-
-extern Bool VG_(kisfullsigset)( vki_ksigset_t* set );
-extern Bool VG_(kisemptysigset)( vki_ksigset_t* set );
-
-extern Int  VG_(ksigaddset)( vki_ksigset_t* set, Int signum );
-extern Int  VG_(ksigdelset)( vki_ksigset_t* set, Int signum );
-extern Int  VG_(ksigismember) ( vki_ksigset_t* set, Int signum );
-
-extern void VG_(ksigaddset_from_set)( vki_ksigset_t* dst, 
-                                      vki_ksigset_t* src );
-extern void VG_(ksigdelset_from_set)( vki_ksigset_t* dst, 
-                                      vki_ksigset_t* src );
-
-/* --- Mess with the kernel's sig state --- */
-extern Int VG_(ksigprocmask)( Int how, const vki_ksigset_t* set, 
-                                       vki_ksigset_t* oldset );
-extern Int VG_(ksigaction) ( Int signum,  
-                             const vki_ksigaction* act,  
-                             vki_ksigaction* oldact );
-
-extern Int VG_(ksignal)(Int signum, void (*sighandler)(Int));
-
-extern Int VG_(ksigaltstack)( const vki_kstack_t* ss, vki_kstack_t* oss );
-
-extern Int VG_(kill)( Int pid, Int signo );
-extern Int VG_(sigpending) ( vki_ksigset_t* set );
-
-
-/* ---------------------------------------------------------------------
-   Definitions for the JITter (vg_translate.c, vg_to_ucode.c,
-   vg_from_ucode.c).
-   ------------------------------------------------------------------ */
-
-/* Tags which describe what operands are. */
-typedef
-   enum { TempReg=0, ArchReg=1, RealReg=2, 
-          SpillNo=3, Literal=4, Lit16=5, 
-          NoValue=6 }
-   Tag;
-
-
-/* Microinstruction opcodes. */
-typedef
-   enum {
-      NOP,
-      GET,
-      PUT,
-      LOAD,
-      STORE,
-      MOV,
-      CMOV, /* Used for cmpxchg and cmov */
-      WIDEN,
-      JMP,
-
-      /* Read/write the %EFLAGS register into a TempReg. */
-      GETF, PUTF,
-
-      ADD, ADC, AND, OR,  XOR, SUB, SBB,
-      SHL, SHR, SAR, ROL, ROR, RCL, RCR,
-      NOT, NEG, INC, DEC, BSWAP,
-      CC2VAL,
-
-      /* Not strictly needed, but useful for making better
-         translations of address calculations. */
-      LEA1,  /* reg2 := const + reg1 */
-      LEA2,  /* reg3 := const + reg1 + reg2 * 1,2,4 or 8 */
-
-      /* not for translating x86 calls -- only to call helpers */
-      CALLM_S, CALLM_E, /* Mark start and end of push/pop sequences
-                           for CALLM. */
-      PUSH, POP, CLEAR, /* Add/remove/zap args for helpers. */
-      CALLM,  /* call to a machine-code helper */
-
-      /* Hack for translating string (REP-) insns.  Jump to literal if
-         TempReg/RealReg is zero. */
-      JIFZ,
-
-      /* FPU ops which read/write mem or don't touch mem at all. */
-      FPU_R,
-      FPU_W,
-      FPU,
-
-      /* Advance the simulated %eip by some small (< 128) number. */
-      INCEIP,
-
-      /* uinstrs which are not needed for mere translation of x86 code,
-         only for instrumentation of it. */
-      LOADV,
-      STOREV,
-      GETV,
-      PUTV,
-      TESTV,
-      SETV,
-      /* Get/set the v-bit (and it is only one bit) for the simulated
-         %eflags register. */
-      GETVF,
-      PUTVF,
-
-      /* Do a unary or binary tag op.  Only for post-instrumented
-         code.  For TAG1, first and only arg is a TempReg, and is both
-         arg and result reg.  For TAG2, first arg is src, second is
-         dst, in the normal way; both are TempRegs.  In both cases,
-         3rd arg is a RiCHelper with a Lit16 tag.  This indicates
-         which tag op to do. */
-      TAG1,
-      TAG2
-   }
-   Opcode;
-
-
-/* Condition codes, observing the Intel encoding (values 0..15).
-   CondAlways (16) is an extra. */
-typedef
-   enum {
-      CondO      = 0,  /* overflow           */
-      CondNO     = 1,  /* no overflow        */
-      CondB      = 2,  /* below              */
-      CondNB     = 3,  /* not below          */
-      CondZ      = 4,  /* zero               */
-      CondNZ     = 5,  /* not zero           */
-      CondBE     = 6,  /* below or equal     */
-      CondNBE    = 7,  /* not below or equal */
-      CondS      = 8,  /* negative           */
-      ConsNS     = 9,  /* not negative (sic: spelt ConsNS, not CondNS) */
-      CondP      = 10, /* parity even        */
-      CondNP     = 11, /* not parity even    */
-      CondL      = 12, /* less               */
-      CondNL     = 13, /* not less           */
-      CondLE     = 14, /* less or equal      */
-      CondNLE    = 15, /* not less or equal  */
-      CondAlways = 16  /* Jump always        */
-   } 
-   Condcode;
-
-
-/* Descriptions of additional properties of *unconditional* jumps. */
-typedef
-   enum {
-     JmpBoring=0,   /* boring unconditional jump */
-     JmpCall=1,     /* jump due to an x86 call insn */
-     JmpRet=2,      /* jump due to an x86 ret insn */
-     JmpSyscall=3,  /* do a system call, then jump */
-     JmpClientReq=4 /* do a client request, then jump */
-   }
-   JmpKind;
-
-
-/* Flags.  User-level code can only read/write O(verflow), S(ign),
-   Z(ero), A(ux-carry), C(arry), P(arity), and may also write
-   D(irection).  That's a total of 7 flags.  A FlagSet is a bitset,
-   thusly: 
-      76543210
-       DOSZACP
-   and bit 7 must always be zero since it is unused.
-*/
-typedef UChar FlagSet;
-
-#define FlagD (1<<6)
-#define FlagO (1<<5)
-#define FlagS (1<<4)
-#define FlagZ (1<<3)
-#define FlagA (1<<2)
-#define FlagC (1<<1)
-#define FlagP (1<<0)
-
-#define FlagsOSZACP (FlagO | FlagS | FlagZ | FlagA | FlagC | FlagP)
-#define FlagsOSZAP  (FlagO | FlagS | FlagZ | FlagA |         FlagP)
-#define FlagsOSZCP  (FlagO | FlagS | FlagZ |         FlagC | FlagP)
-#define FlagsOSACP  (FlagO | FlagS |         FlagA | FlagC | FlagP)
-#define FlagsSZACP  (        FlagS | FlagZ | FlagA | FlagC | FlagP)
-#define FlagsSZAP   (        FlagS | FlagZ | FlagA |         FlagP)
-#define FlagsZCP    (                FlagZ         | FlagC | FlagP)
-#define FlagsOC     (FlagO |                         FlagC        )
-#define FlagsAC     (                        FlagA | FlagC        )
-
-#define FlagsALL    (FlagsOSZACP | FlagD)
-#define FlagsEmpty  (FlagSet)0
-
-/* True iff every flag in set1 is also in set2 (set1 is expanded twice). */
-#define VG_IS_FLAG_SUBSET(set1,set2) \
-   (( ((FlagSet)(set1)) & ((FlagSet)(set2)) ) == ((FlagSet)(set1)) )
-
-/* The union of two flag sets.  Arguments may be arbitrary expressions. */
-#define VG_UNION_FLAG_SETS(set1,set2) \
-   ( ((FlagSet)(set1)) | ((FlagSet)(set2)) )
-
-/* A Micro (u)-instruction. */
-typedef
-   struct {
-      /* word 1 */
-      UInt    lit32;      /* 32-bit literal */
-
-      /* word 2 */
-      UShort  val1;       /* first operand */
-      UShort  val2;       /* second operand */
-
-      /* word 3 */
-      UShort  val3;       /* third operand */
-      UChar   opcode;     /* opcode */
-      UChar   size;       /* data transfer size */
-
-      /* word 4 */
-      FlagSet flags_r;    /* :: FlagSet */
-      FlagSet flags_w;    /* :: FlagSet */
-      UChar   tag1:4;     /* first  operand tag */
-      UChar   tag2:4;     /* second operand tag */
-      UChar   tag3:4;     /* third  operand tag */
-      UChar   extra4b:4;  /* Spare field, used by WIDEN for src
-                             -size, and by LEA2 for scale 
-                             (1,2,4 or 8), and by unconditional JMPs for
-                             orig x86 instr size if --cachesim=yes */
-
-
-      /* word 5 */
-      UChar   cond;            /* condition, for jumps */
-      Bool    smc_check:1;     /* do a smc test, if writes memory. */
-      Bool    signed_widen:1;  /* signed or unsigned WIDEN ? */
-      JmpKind jmpkind:3;       /* additional properties of unconditional JMP */
-   }
-   UInstr;
-
-
-/* Expandable arrays of uinstrs. */
-typedef 
-   struct { 
-      Int     used; 
-      Int     size; 
-      UInstr* instrs;
-      Int     nextTemp;
-   }
-   UCodeBlock;
-
-/* Refer to `the last instruction stuffed in', including as an
-   lvalue. */
-#define LAST_UINSTR(cb) (cb)->instrs[(cb)->used-1]
-
-/* An invalid temporary number :-) */
-#define INVALID_TEMPREG 999999999
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_demangle.c
-   ------------------------------------------------------------------ */
-
-extern void VG_(demangle) ( Char* orig, Char* result, Int result_size );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_from_ucode.c
-   ------------------------------------------------------------------ */
-
-extern UChar* VG_(emit_code) ( UCodeBlock* cb, Int* nbytes );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_to_ucode.c
-   ------------------------------------------------------------------ */
-
-extern Int   VG_(disBB)          ( UCodeBlock* cb, Addr eip0 );
-extern Char* VG_(nameOfIntReg)   ( Int size, Int reg );
-extern Char  VG_(nameOfIntSize)  ( Int size );
-extern UInt  VG_(extend_s_8to32) ( UInt x );
-extern Int   VG_(getNewTemp)     ( UCodeBlock* cb );
-extern Int   VG_(getNewShadow)   ( UCodeBlock* cb );
-
-#define SHADOW(tempreg)  ((tempreg)+1)
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_translate.c
-   ------------------------------------------------------------------ */
-
-extern void  VG_(translate)  ( ThreadState* tst,
-                               Addr  orig_addr,
-                               UInt* orig_size,
-                               Addr* trans_addr,
-                               UInt* trans_size );
-
-extern void  VG_(emptyUInstr) ( UInstr* u );
-extern void  VG_(newUInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz );
-extern void  VG_(newUInstr1) ( UCodeBlock* cb, Opcode opcode, Int sz,
-                               Tag tag1, UInt val1 );
-extern void  VG_(newUInstr2) ( UCodeBlock* cb, Opcode opcode, Int sz,
-                               Tag tag1, UInt val1,
-                               Tag tag2, UInt val2 );
-extern void  VG_(newUInstr3) ( UCodeBlock* cb, Opcode opcode, Int sz,
-                               Tag tag1, UInt val1,
-                               Tag tag2, UInt val2,
-                               Tag tag3, UInt val3 );
-extern void VG_(setFlagRW) ( UInstr* u, 
-                             FlagSet fr, FlagSet fw );
-
-extern void VG_(setLiteralField) ( UCodeBlock* cb, UInt lit32 );
-extern Bool VG_(anyFlagUse) ( UInstr* u );
-
-
-
-extern void  VG_(ppUInstr)        ( Int instrNo, UInstr* u );
-extern void  VG_(ppUCodeBlock)    ( UCodeBlock* cb, Char* title );
-
-extern UCodeBlock* VG_(allocCodeBlock) ( void );
-extern void  VG_(freeCodeBlock)        ( UCodeBlock* cb );
-extern void  VG_(copyUInstr)                ( UCodeBlock* cb, UInstr* instr );
-
-extern Char* VG_(nameCondcode)    ( Condcode cond );
-extern Bool  VG_(saneUInstr)      ( Bool beforeRA, UInstr* u );
-extern Bool  VG_(saneUCodeBlock)  ( UCodeBlock* cb );
-extern Char* VG_(nameUOpcode)     ( Bool upper, Opcode opc );
-extern Int   VG_(rankToRealRegNo) ( Int rank );
-
-extern void* VG_(jitmalloc) ( Int nbytes );
-extern void  VG_(jitfree)   ( void* ptr );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_execontext.c.
-   ------------------------------------------------------------------ */
-
-/* Records the PC and a bit of the call chain.  The first 4 %eip
-   values are used in comparisons to remove duplicate errors, and for
-   comparing against suppression specifications.  The rest are purely
-   informational (but often important). */
-
-typedef
-   struct _ExeContextRec {
-      struct _ExeContextRec * next;
-      /* The size of this array is VG_(clo_backtrace_size); at least
-         2, at most VG_DEEPEST_BACKTRACE.  [0] is the current %eip,
-         [1] is its caller, [2] is the caller of [1], etc. */
-      Addr eips[0];
-   }
-   ExeContext;
-
-
-/* Initialise the ExeContext storage mechanism. */
-extern void VG_(init_ExeContext_storage) ( void );
-
-/* Print stats (informational only). */
-extern void VG_(show_ExeContext_stats) ( void );
-
-
-/* Take a snapshot of the client's stack.  Search our collection of
-   ExeContexts to see if we already have it, and if not, allocate a
-   new one.  Either way, return a pointer to the context. */
-extern ExeContext* VG_(get_ExeContext) ( Bool skip_top_frame,
-                                         Addr eip, Addr ebp );
-
-/* Print an ExeContext. */
-extern void VG_(pp_ExeContext) ( ExeContext* );
-
-/* Compare two ExeContexts, just comparing the top two callers. */
-extern Bool VG_(eq_ExeContext_top2) ( ExeContext* e1, ExeContext* e2 );
-
-/* Compare two ExeContexts, just comparing the top four callers. */
-extern Bool VG_(eq_ExeContext_top4) ( ExeContext* e1, ExeContext* e2 );
-
-/* Compare two ExeContexts, comparing all callers. */
-extern Bool VG_(eq_ExeContext_all) ( ExeContext* e1, ExeContext* e2 );
-
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_errcontext.c.
-   ------------------------------------------------------------------ */
-
-extern void VG_(load_suppressions)    ( void );
-extern void VG_(show_all_errors)      ( void );
-extern void VG_(record_value_error)   ( Int size );
-extern void VG_(record_free_error)    ( ThreadState* tst, Addr a );
-extern void VG_(record_freemismatch_error)    ( ThreadState* tst, Addr a );
-extern void VG_(record_address_error) ( Addr a, Int size, 
-                                        Bool isWrite );
-
-extern void VG_(record_jump_error) ( ThreadState* tst, Addr a );
-
-extern void VG_(record_param_err) ( ThreadState* tst,
-                                    Addr a, 
-                                    Bool isWriteLack, 
-                                    Char* msg );
-extern void VG_(record_user_err) ( ThreadState* tst,
-                                   Addr a, Bool isWriteLack );
-extern void VG_(record_pthread_err) ( ThreadId tid, Char* msg );
-
-
-
-/* The classification of a faulting address. */
-typedef 
-   enum { Undescribed, /* as-yet unclassified */
-          Stack, 
-          Unknown, /* classification yielded nothing useful */
-          Freed, Mallocd, 
-          UserG, UserS }
-   AddrKind;
-
-/* Records info about a faulting address. */
-typedef
-   struct {
-      /* ALL */
-      AddrKind akind;
-      /* Freed, Mallocd */
-      Int blksize;
-      /* Freed, Mallocd */
-      Int rwoffset;
-      /* Freed, Mallocd */
-      ExeContext* lastchange;
-      /* Stack */
-      ThreadId stack_tid;
-      /* True if is just-below %esp -- could be a gcc bug. */
-      Bool maybe_gcc;
-   }
-   AddrInfo;
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_clientperms.c
-   ------------------------------------------------------------------ */
-
-extern Bool VG_(client_perm_maybe_describe)( Addr a, AddrInfo* ai );
-
-extern UInt VG_(handle_client_request) ( ThreadState* tst, UInt* arg_block );
-
-extern void VG_(delete_client_stack_blocks_following_ESP_change) ( void );
-
-extern void VG_(show_client_block_stats) ( void );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_procselfmaps.c
-   ------------------------------------------------------------------ */
-
-extern 
-void VG_(read_procselfmaps) (
-   void (*record_mapping)( Addr, UInt, Char, Char, Char, UInt, UChar* )
-);
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_symtab2.c
-   ------------------------------------------------------------------ */
-
-/* We assume the executable is loaded here ... can't really find
-   out.  There is a hacky sanity check in vg_init_memory_audit()
-   which should trip up most stupidities.
-*/
-#define VG_ASSUMED_EXE_BASE  (Addr)0x8048000
-
-extern void VG_(read_symbols) ( void );
-extern void VG_(mini_stack_dump) ( ExeContext* ec );
-extern void VG_(what_obj_and_fun_is_this)
-                                     ( Addr a,
-                                       Char* obj_buf, Int n_obj_buf,
-                                       Char* fun_buf, Int n_fun_buf );
-extern Bool VG_(what_line_is_this) ( Addr a,
-                                     UChar* filename, Int n_filename,
-                                     UInt* lineno );
-extern Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a,
-                                     Char* fn_name, Int n_fn_name);
-
-extern Bool VG_(symtab_notify_munmap) ( Addr start, UInt length );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_clientmalloc.c
-   ------------------------------------------------------------------ */
-
-typedef
-   enum { 
-      Vg_AllocMalloc = 0,
-      Vg_AllocNew    = 1,
-      Vg_AllocNewVec = 2 
-   }
-   VgAllocKind;
-
-/* Description of a malloc'd chunk. */
-typedef 
-   struct _ShadowChunk {
-      struct _ShadowChunk* next;
-      ExeContext*   where;          /* where malloc'd/free'd */
-      UInt          size : 30;      /* size requested.       */
-      VgAllocKind   allockind : 2;  /* which wrapper did the allocation */
-      Addr          data;           /* ptr to actual block.  */
-   } 
-   ShadowChunk;
-
-extern void          VG_(clientmalloc_done) ( void );
-extern void          VG_(describe_addr) ( Addr a, AddrInfo* ai );
-extern ShadowChunk** VG_(get_malloc_shadows) ( /*OUT*/ UInt* n_shadows );
-
-/* These are called from the scheduler, when it intercepts a user
-   request. */
-extern void* VG_(client_malloc)   ( ThreadState* tst, 
-                                    UInt size, VgAllocKind kind );
-extern void* VG_(client_memalign) ( ThreadState* tst, 
-                                    UInt align, UInt size );
-extern void  VG_(client_free)     ( ThreadState* tst, 
-                                    void* ptrV, VgAllocKind  kind );
-extern void* VG_(client_calloc)   ( ThreadState* tst, 
-                                    UInt nmemb, UInt size1 );
-extern void* VG_(client_realloc)  ( ThreadState* tst, 
-                                    void* ptrV, UInt size_new );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_main.c
-   ------------------------------------------------------------------ */
-
-/* A structure used as an intermediary when passing the simulated
-   CPU's state to some assembly fragments, particularly system calls.
-   Stuff is copied from baseBlock to here, the assembly magic runs,
-   and then the inverse copy is done. */
-
-extern UInt VG_(m_state_static) [8 /* int regs, in Intel order */ 
-                                 + 1 /* %eflags */ 
-                                 + 1 /* %eip */
-                                 + VG_SIZE_OF_FPUSTATE_W /* FPU state */
-                                ];
-
-/* Handy fns for doing the copy back and forth. */
-extern void VG_(copy_baseBlock_to_m_state_static) ( void );
-extern void VG_(copy_m_state_static_to_baseBlock) ( void );
-
-/* Called when some unhandleable client behaviour is detected.
-   Prints a msg and aborts. */
-extern void VG_(unimplemented) ( Char* msg );
-extern void VG_(nvidia_moan) ( void );
-
-/* The stack on which Valgrind runs.  We can't use the same stack as the
-   simulatee -- that's an important design decision.  */
-extern UInt VG_(stack)[10000];
-
-/* Similarly, we have to ask for signals to be delivered on an
-   alternative stack, since it is possible, although unlikely, that
-   we'll have to run client code from inside the Valgrind-installed
-   signal handler.  If this happens it will be done by
-   vg_deliver_signal_immediately(). */
-extern UInt VG_(sigstack)[10000];
-
-/* Holds client's %esp at the point we gained control.  From this the
-   client's argc, argv and envp are deduced. */
-extern Addr   VG_(esp_at_startup);
-extern Int    VG_(client_argc);
-extern Char** VG_(client_argv);
-extern Char** VG_(client_envp);
-
-/* Remove valgrind.so from a LD_PRELOAD=... string so child processes
-   don't get traced into.  Also mess up $libdir/valgrind so that our
-   libpthread.so disappears from view. */
-void VG_(mash_LD_PRELOAD_and_LD_LIBRARY_PATH) ( Char* ld_preload_str,
-                                                Char* ld_library_path_str );
-
-/* Something of a function looking for a home ... start up GDB.  This
-   is called from VG_(swizzle_esp_then_start_GDB) and so runs on the
-   *client's* stack.  This is necessary to give GDB the illusion that
-   the client program really was running on the real cpu. */
-extern void VG_(start_GDB_whilst_on_client_stack) ( void );
-
-/* Spew out vast amounts of junk during JITting? */
-extern Bool  VG_(disassemble);
-
-/* 64-bit counter for the number of basic blocks done. */
-extern ULong VG_(bbs_done);
-/* 64-bit counter for the number of bbs to go before a debug exit. */
-extern ULong VG_(bbs_to_go);
-
-/* Counts downwards in vg_run_innerloop. */
-extern UInt VG_(dispatch_ctr);
-
-/* Is the client running on the simulated CPU or the real one? */
-extern Bool VG_(running_on_simd_CPU); /* Initially False */
-
-/* The current LRU epoch. */
-extern UInt VG_(current_epoch);
-
-/* This is the ThreadId of the last thread the scheduler ran. */
-extern ThreadId VG_(last_run_tid);
-
-
-/* --- Counters, for informational purposes only. --- */
-
-/* Number of lookups which miss the fast tt helper. */
-extern UInt VG_(tt_fast_misses);
-
-/* Counts for LRU informational messages. */
-
-/* Number and total o/t size of new translations this epoch. */
-extern UInt VG_(this_epoch_in_count);
-extern UInt VG_(this_epoch_in_osize);
-extern UInt VG_(this_epoch_in_tsize);
-/* Number and total o/t size of discarded translations this epoch. */
-extern UInt VG_(this_epoch_out_count);
-extern UInt VG_(this_epoch_out_osize);
-extern UInt VG_(this_epoch_out_tsize);
-/* Number and total o/t size of translations overall. */
-extern UInt VG_(overall_in_count);
-extern UInt VG_(overall_in_osize);
-extern UInt VG_(overall_in_tsize);
-/* Number and total o/t size of discards overall. */
-extern UInt VG_(overall_out_count);
-extern UInt VG_(overall_out_osize);
-extern UInt VG_(overall_out_tsize);
-
-/* The number of LRU-clearings of TT/TC. */
-extern UInt VG_(number_of_lrus);
-
-/* Counts pertaining to the register allocator. */
-
-/* total number of uinstrs input to reg-alloc */
-extern UInt VG_(uinstrs_prealloc);
-
-/* total number of uinstrs added due to spill code */
-extern UInt VG_(uinstrs_spill);
-
-/* number of bbs requiring spill code */
-extern UInt VG_(translations_needing_spill);
-
-/* total of register ranks over all translations */
-extern UInt VG_(total_reg_rank);
-
-/* Counts pertaining to internal sanity checking. */
-extern UInt VG_(sanity_fast_count);
-extern UInt VG_(sanity_slow_count);
-
-/* Counts pertaining to the scheduler. */
-extern UInt VG_(num_scheduling_events_MINOR);
-extern UInt VG_(num_scheduling_events_MAJOR);
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_memory.c
-   ------------------------------------------------------------------ */
-
-extern void VGM_(init_memory_audit) ( void );
-extern Addr VGM_(curr_dataseg_end);
-extern void VG_(show_reg_tags) ( void );
-extern void VG_(detect_memory_leaks) ( void );
-extern void VG_(done_prof_mem) ( void );
-
-/* Set permissions for an address range.  Not speed-critical. */
-extern void VGM_(make_noaccess) ( Addr a, UInt len );
-extern void VGM_(make_writable) ( Addr a, UInt len );
-extern void VGM_(make_readable) ( Addr a, UInt len );
-/* Use with care! (read: use for shmat only) */
-extern void VGM_(make_readwritable) ( Addr a, UInt len );
-extern void VGM_(copy_address_range_perms) ( Addr src, Addr dst,
-                                             UInt len );
-
-/* Check permissions for an address range.  Not speed-critical. */
-extern Bool VGM_(check_writable) ( Addr a, UInt len, Addr* bad_addr );
-extern Bool VGM_(check_readable) ( Addr a, UInt len, Addr* bad_addr );
-extern Bool VGM_(check_readable_asciiz) ( Addr a, Addr* bad_addr );
-
-/* Sanity checks which may be done at any time.  The scheduler decides
-   when. */
-extern void VG_(do_sanity_checks) ( Bool force_expensive );
-/* Very cheap ... */
-extern Bool VG_(first_and_last_secondaries_look_plausible) ( void );
-
-/* These functions are called from generated code. */
-extern void VG_(helperc_STOREV4) ( UInt, Addr );
-extern void VG_(helperc_STOREV2) ( UInt, Addr );
-extern void VG_(helperc_STOREV1) ( UInt, Addr );
-
-extern UInt VG_(helperc_LOADV1) ( Addr );
-extern UInt VG_(helperc_LOADV2) ( Addr );
-extern UInt VG_(helperc_LOADV4) ( Addr );
-
-extern void VGM_(handle_esp_assignment) ( Addr new_espA );
-extern void VGM_(fpu_write_check) ( Addr addr, Int size );
-extern void VGM_(fpu_read_check)  ( Addr addr, Int size );
-
-/* Safely (avoiding SIGSEGV / SIGBUS) scan the entire valid address
-   space and pass the addresses and values of all addressable,
-   defined, aligned words to notify_word.  This is the basis for the
-   leak detector.  Returns the number of calls made to notify_word.  */
-UInt VG_(scan_all_valid_memory) ( void (*notify_word)( Addr, UInt ) );
-
-/* Is this address within some small distance below %ESP?  Used only
-   for the --workaround-gcc296-bugs kludge. */
-extern Bool VG_(is_just_below_ESP)( Addr esp, Addr aa );
-
-/* Nasty kludgery to deal with applications which switch stacks,
-   like netscape. */
-#define VG_PLAUSIBLE_STACK_SIZE 8000000
-
-/* Needed by the pthreads implementation. */
-#define VGM_WORD_VALID     0
-#define VGM_WORD_INVALID   0xFFFFFFFF
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_syscall_mem.c
-   ------------------------------------------------------------------ */
-
-extern void VG_(perform_assumed_nonblocking_syscall) ( ThreadId tid );
-
-extern void VG_(check_known_blocking_syscall) ( ThreadId tid, 
-                                                Int syscallno,
-                                                Int* /*IN*/ res );
-
-extern Bool VG_(is_kerror) ( Int res );
-
-#define KERNEL_DO_SYSCALL(thread_id, result_lvalue)        \
-         VG_(load_thread_state)(thread_id);                \
-         VG_(copy_baseBlock_to_m_state_static)();          \
-         VG_(do_syscall)();                                \
-         VG_(copy_m_state_static_to_baseBlock)();          \
-         VG_(save_thread_state)(thread_id);                \
-         VG_(threads)[thread_id].sh_eax = VGM_WORD_VALID;  \
-         result_lvalue = VG_(threads)[thread_id].m_eax;
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_transtab.c
-   ------------------------------------------------------------------ */
-
-/* An entry in the translation table (TT). */
-typedef
-   struct {
-      /* +0 */  Addr   orig_addr;
-      /* +4 */  Addr   trans_addr;
-      /* +8 */  UInt   mru_epoch;
-      /* +12 */ UShort orig_size;
-      /* +14 */ UShort trans_size;
-   }
-   TTEntry;
-
-/* The number of basic blocks in an epoch (one age-step). */
-#define VG_BBS_PER_EPOCH 20000
-
-extern void VG_(get_tt_tc_used) ( UInt* tt_used, UInt* tc_used );
-extern void VG_(maybe_do_lru_pass) ( void );
-extern void VG_(flush_transtab) ( void );
-extern Addr VG_(copy_to_transcache) ( Addr trans_addr, Int trans_size );
-extern void VG_(add_to_trans_tab) ( TTEntry* tte );
-extern void VG_(invalidate_translations) ( Addr start, UInt range );
-
-extern void VG_(init_tt_tc) ( void );
-
-extern void VG_(sanity_check_tc_tt) ( void );
-extern Addr VG_(search_transtab) ( Addr original_addr );
-
-extern void VG_(invalidate_tt_fast)( void );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_vtagops.c
-   ------------------------------------------------------------------ */
-
-/* Lists the names of value-tag operations used in instrumented
-   code.  These are the third argument to TAG1 and TAG2 uinsns. */
-
-typedef
-   enum { 
-     /* Unary. */
-     VgT_PCast40, VgT_PCast20, VgT_PCast10,
-     VgT_PCast01, VgT_PCast02, VgT_PCast04,
-
-     VgT_PCast14, VgT_PCast12, VgT_PCast11,
-
-     VgT_Left4, VgT_Left2, VgT_Left1,
-
-     VgT_SWiden14, VgT_SWiden24, VgT_SWiden12,
-     VgT_ZWiden14, VgT_ZWiden24, VgT_ZWiden12,
-
-     /* Binary; 1st is rd; 2nd is rd+wr */
-     VgT_UifU4, VgT_UifU2, VgT_UifU1, VgT_UifU0,
-     VgT_DifD4, VgT_DifD2, VgT_DifD1,
-
-     VgT_ImproveAND4_TQ, VgT_ImproveAND2_TQ, VgT_ImproveAND1_TQ, 
-     VgT_ImproveOR4_TQ, VgT_ImproveOR2_TQ, VgT_ImproveOR1_TQ,
-     VgT_DebugFn
-   }
-   VgTagOp;
-
-extern Char* VG_(nameOfTagOp) ( VgTagOp );
-extern UInt VG_(DebugFn) ( UInt a1, UInt a2 );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_syscall.S
-   ------------------------------------------------------------------ */
-
-extern void VG_(do_syscall) ( void );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_startup.S
-   ------------------------------------------------------------------ */
-
-extern void VG_(switch_to_real_CPU) ( void );
-
-extern void VG_(swizzle_esp_then_start_GDB) ( Addr m_eip_at_error,
-                                              Addr m_esp_at_error,
-                                              Addr m_ebp_at_error );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_dispatch.S
-   ------------------------------------------------------------------ */
-
-/* Run a thread for a (very short) while, until some event happens
-   which means we need to defer to the scheduler. */
-extern UInt VG_(run_innerloop) ( void );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_helpers.S
-   ------------------------------------------------------------------ */
-
-/* Mul, div, etc, -- we don't codegen these directly. */
-extern void VG_(helper_idiv_64_32);
-extern void VG_(helper_div_64_32);
-extern void VG_(helper_idiv_32_16);
-extern void VG_(helper_div_32_16);
-extern void VG_(helper_idiv_16_8);
-extern void VG_(helper_div_16_8);
-
-extern void VG_(helper_imul_32_64);
-extern void VG_(helper_mul_32_64);
-extern void VG_(helper_imul_16_32);
-extern void VG_(helper_mul_16_32);
-extern void VG_(helper_imul_8_16);
-extern void VG_(helper_mul_8_16);
-
-extern void VG_(helper_CLD);
-extern void VG_(helper_STD);
-extern void VG_(helper_get_dirflag);
-
-extern void VG_(helper_CLC);
-extern void VG_(helper_STC);
-
-extern void VG_(helper_shldl);
-extern void VG_(helper_shldw);
-extern void VG_(helper_shrdl);
-extern void VG_(helper_shrdw);
-
-extern void VG_(helper_RDTSC);
-extern void VG_(helper_CPUID);
-
-extern void VG_(helper_bsf);
-extern void VG_(helper_bsr);
-
-extern void VG_(helper_fstsw_AX);
-extern void VG_(helper_SAHF);
-extern void VG_(helper_DAS);
-extern void VG_(helper_DAA);
-
-extern void VG_(helper_value_check4_fail);
-extern void VG_(helper_value_check2_fail);
-extern void VG_(helper_value_check1_fail);
-extern void VG_(helper_value_check0_fail);
-
-/* NOT A FUNCTION; this is a bogus RETURN ADDRESS. */
-extern void VG_(signalreturn_bogusRA)( void );
-
-
-/* ---------------------------------------------------------------------
-   Exports of vg_cachesim.c
-   ------------------------------------------------------------------ */
-
-extern Int VG_(log2) ( Int x );
-
-extern UCodeBlock* VG_(cachesim_instrument) ( UCodeBlock* cb_in, 
-                                              Addr orig_addr );
-
-typedef struct  _iCC  iCC;
-typedef struct _idCC idCC;
-
-extern void VG_(init_cachesim)      ( void );
-extern void VG_(do_cachesim_results)( Int client_argc, Char** client_argv );
-
-extern void VG_(cachesim_log_non_mem_instr)(  iCC* cc );
-extern void VG_(cachesim_log_mem_instr)    ( idCC* cc, Addr data_addr );
-
-extern void VG_(cachesim_notify_discard) ( TTEntry* tte );
-
-
-/* ---------------------------------------------------------------------
-   The state of the simulated CPU.
-   ------------------------------------------------------------------ */
-
-/* This is the Intel register encoding. */
-#define R_EAX 0
-#define R_ECX 1
-#define R_EDX 2
-#define R_EBX 3
-#define R_ESP 4
-#define R_EBP 5
-#define R_ESI 6
-#define R_EDI 7
-
-#define R_AL (0+R_EAX)
-#define R_CL (0+R_ECX)
-#define R_DL (0+R_EDX)
-#define R_BL (0+R_EBX)
-#define R_AH (4+R_EAX)
-#define R_CH (4+R_ECX)
-#define R_DH (4+R_EDX)
-#define R_BH (4+R_EBX)
-
-
-/* ---------------------------------------------------------------------
-   Offsets into baseBlock for everything which needs to be referred to
-   from generated code.  The order of these decls does not imply 
-   what the order of the actual offsets is.  The latter is important
-   and is set up in vg_main.c.
-   ------------------------------------------------------------------ */
-
-/* An array of words.  In generated code, %ebp always points to the
-   start of this array.  Useful stuff, like the simulated CPU state,
-   and the addresses of helper functions, can then be found by
-   indexing off %ebp.  The following declares variables which, at
-   startup time, are given values denoting offsets into baseBlock.
-   These offsets are in *words* from the start of baseBlock. */
-
-#define VG_BASEBLOCK_WORDS 200
-
-extern UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS];
-
-
-/* -----------------------------------------------------
-   Read-write parts of baseBlock.
-   -------------------------------------------------- */
-
-/* State of the simulated CPU. */
-extern Int VGOFF_(m_eax);
-extern Int VGOFF_(m_ecx);
-extern Int VGOFF_(m_edx);
-extern Int VGOFF_(m_ebx);
-extern Int VGOFF_(m_esp);
-extern Int VGOFF_(m_ebp);
-extern Int VGOFF_(m_esi);
-extern Int VGOFF_(m_edi);
-extern Int VGOFF_(m_eflags);
-extern Int VGOFF_(m_fpustate);
-extern Int VGOFF_(m_eip);
-
-/* Reg-alloc spill area (VG_MAX_SPILLSLOTS words long). */
-extern Int VGOFF_(spillslots);
-
-/* Records the valid bits for the 8 integer regs & flags reg. */
-extern Int VGOFF_(sh_eax);
-extern Int VGOFF_(sh_ecx);
-extern Int VGOFF_(sh_edx);
-extern Int VGOFF_(sh_ebx);
-extern Int VGOFF_(sh_esp);
-extern Int VGOFF_(sh_ebp);
-extern Int VGOFF_(sh_esi);
-extern Int VGOFF_(sh_edi);
-extern Int VGOFF_(sh_eflags);
-
-
-/* -----------------------------------------------------
-   Read-only parts of baseBlock.
-   -------------------------------------------------- */
-
-/* Offsets of addresses of helper functions.  A "helper" function is
-   one which is called from generated code. */
-
-extern Int VGOFF_(helper_idiv_64_32);
-extern Int VGOFF_(helper_div_64_32);
-extern Int VGOFF_(helper_idiv_32_16);
-extern Int VGOFF_(helper_div_32_16);
-extern Int VGOFF_(helper_idiv_16_8);
-extern Int VGOFF_(helper_div_16_8);
-
-extern Int VGOFF_(helper_imul_32_64);
-extern Int VGOFF_(helper_mul_32_64);
-extern Int VGOFF_(helper_imul_16_32);
-extern Int VGOFF_(helper_mul_16_32);
-extern Int VGOFF_(helper_imul_8_16);
-extern Int VGOFF_(helper_mul_8_16);
-
-extern Int VGOFF_(helper_CLD);
-extern Int VGOFF_(helper_STD);
-extern Int VGOFF_(helper_get_dirflag);
-
-extern Int VGOFF_(helper_CLC);
-extern Int VGOFF_(helper_STC);
-
-extern Int VGOFF_(helper_shldl);
-extern Int VGOFF_(helper_shldw);
-extern Int VGOFF_(helper_shrdl);
-extern Int VGOFF_(helper_shrdw);
-
-extern Int VGOFF_(helper_RDTSC);
-extern Int VGOFF_(helper_CPUID);
-
-extern Int VGOFF_(helper_bsf);
-extern Int VGOFF_(helper_bsr);
-
-extern Int VGOFF_(helper_fstsw_AX);
-extern Int VGOFF_(helper_SAHF);
-extern Int VGOFF_(helper_DAS);
-extern Int VGOFF_(helper_DAA);
-
-extern Int VGOFF_(helper_value_check4_fail);
-extern Int VGOFF_(helper_value_check2_fail);
-extern Int VGOFF_(helper_value_check1_fail);
-extern Int VGOFF_(helper_value_check0_fail);
-
-extern Int VGOFF_(helperc_STOREV4); /* :: UInt -> Addr -> void */
-extern Int VGOFF_(helperc_STOREV2); /* :: UInt -> Addr -> void */
-extern Int VGOFF_(helperc_STOREV1); /* :: UInt -> Addr -> void */
-
-extern Int VGOFF_(helperc_LOADV4); /* :: Addr -> UInt */
-extern Int VGOFF_(helperc_LOADV2); /* :: Addr -> UInt */
-extern Int VGOFF_(helperc_LOADV1); /* :: Addr -> UInt */
-
-extern Int VGOFF_(handle_esp_assignment); /* :: Addr -> void */
-extern Int VGOFF_(fpu_write_check);       /* :: Addr -> Int -> void */
-extern Int VGOFF_(fpu_read_check);        /* :: Addr -> Int -> void */
-
-extern Int VGOFF_(cachesim_log_non_mem_instr);
-extern Int VGOFF_(cachesim_log_mem_instr);
-
-#endif /* ndef __VG_INCLUDE_H */
-
-
-/* ---------------------------------------------------------------------
-   Finally - autoconf-generated settings
-   ------------------------------------------------------------------ */
-
-#include "config.h"
-
-/*--------------------------------------------------------------------*/
-/*--- end                                             vg_include.h ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_kerneliface.h b/coregrind/vg_kerneliface.h
deleted file mode 100644
index bcc10f55e2..0000000000
--- a/coregrind/vg_kerneliface.h
+++ /dev/null
@@ -1,354 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A header file defining structures and constants which are    ---*/
-/*--- important at the kernel boundary for this platform.          ---*/
-/*---                                             vg_kerneliface.h ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#ifndef __VG_KERNELIFACE_H
-#define __VG_KERNELIFACE_H
-
-/* This file is ONLY to be included into vg_include.h.  Do not include
-   it directly into valgrind source .c files.  This file defines types
-   and constants for the kernel interface, and to make that clear
-   everything is prefixed VKI. */
-
-/*--- All the following stuff is correct for Linux kernels 2.2.X and
-      2.4.X. 
----*/
-
-/* Should really get this from an include file somewhere. */
-#define VKI_BYTES_PER_PAGE_BITS 12
-#define VKI_BYTES_PER_PAGE (1 << VKI_BYTES_PER_PAGE_BITS)
-
-#define VKI_BYTES_PER_WORD 4
-#define VKI_WORDS_PER_PAGE (VKI_BYTES_PER_PAGE / VKI_BYTES_PER_WORD)
-
-
-/* For system call numbers __NR_... */
-#include <asm/unistd.h>
-
-/* An implementation of signal sets.  These are the same as the sigset
-   implementations in the relevant Linux kernels.  Note carefully that
-   this has nothing to do with glibc's signal sets.  We work entirely
-   at the kernel boundary, so the libc stuff is invisible and
-   irrelevant.  */
-
-/* The following is copied from
-   /usr/src/linux-2.4.9-13/include/asm-i386/signal.h */
-#define VKI_KNSIG       64  /* true for linux 2.2.X and 2.4.X */
-#define VKI_KNSIG_BPW   32  /* since we're using UInts */
-#define VKI_KNSIG_WORDS (VKI_KNSIG / VKI_KNSIG_BPW)
-
-typedef 
-   struct { 
-      UInt ws[VKI_KNSIG_WORDS]; 
-   }
-   vki_ksigset_t;
-
-
-typedef
-   struct {
-      void*         ksa_handler;
-      unsigned long ksa_flags;
-      void (*ksa_restorer)(void);
-      vki_ksigset_t ksa_mask;
-   }
-   vki_ksigaction;
-
-typedef 
-   struct {
-      void* ss_sp;
-      Int   ss_flags;
-      UInt  ss_size;
-   } 
-   vki_kstack_t;
-
-
-/* sigaltstack controls */
-#define VKI_SS_ONSTACK      1
-#define VKI_SS_DISABLE      2
-
-#define VKI_MINSIGSTKSZ     2048
-#define VKI_SIGSTKSZ        8192
-
-
-
-#define VKI_SIG_BLOCK          0    /* for blocking signals */
-#define VKI_SIG_UNBLOCK        1    /* for unblocking signals */
-#define VKI_SIG_SETMASK        2    /* for setting the signal mask */
-
-#define VKI_SIG_DFL ((void*)0)     /* default signal handling */
-#define VKI_SIG_IGN ((void*)1)     /* ignore signal */
-#define VKI_SIG_ERR ((void*)-1)    /* error return from signal */
-
-#define VKI_SA_ONSTACK      0x08000000
-#define VKI_SA_RESTART      0x10000000
-#define VKI_SA_NOCLDSTOP    0x00000001
-#define VKI_SA_RESETHAND    0x80000000
-#define VKI_SA_ONESHOT      VKI_SA_RESETHAND
-#define VKI_SA_NODEFER      0x40000000
-#define VKI_SA_NOMASK       VKI_SA_NODEFER
-#if 0
-#define VKI_SA_NOCLDWAIT    0x00000002 /* not supported yet */
-#define VKI_SA_SIGINFO      0x00000004
-#define VKI_SA_INTERRUPT    0x20000000 /* dummy -- ignored */
-#define VKI_SA_RESTORER     0x04000000
-#endif
-
-#define VKI_SIGSEGV         11
-#define VKI_SIGBUS           7
-#define VKI_SIGILL           4
-#define VKI_SIGFPE           8
-#define VKI_SIGKILL          9
-#define VKI_SIGSTOP         19
-#define VKI_SIGTERM         15
-#define VKI_SIGUSR1         10
-
-/* The following are copied from include/asm-i386/mman.h .*/
-
-#define VKI_PROT_READ      0x1             /* Page can be read.  */
-#define VKI_PROT_WRITE     0x2             /* Page can be written.  */
-#define VKI_PROT_EXEC      0x4             /* Page can be executed.  */
-#define VKI_MAP_ANONYMOUS  0x20            /* Don't use a file.  */
-#define VKI_MAP_PRIVATE    0x02            /* Changes are private.  */
-#define VKI_MAP_FIXED      0x10            /* Interpret addr exactly */
-
-
-/* Copied from /usr/src/linux-2.4.9-13/include/asm/errno.h */
-
-#define VKI_EPERM            1      /* Operation not permitted */
-#define VKI_EINTR            4      /* Interrupted system call */
-#define VKI_EINVAL          22      /* Invalid argument */
-#define VKI_ENOMEM          12      /* Out of memory */
-#define	VKI_EFAULT          14      /* Bad address */
-#define VKI_ESRCH            3      /* No such process */
-
-#define VKI_EWOULDBLOCK     VKI_EAGAIN  /* Operation would block */
-#define VKI_EAGAIN          11      /* Try again */
-
-
-/* Gawd ... hack ... */
-
-typedef struct vki__user_cap_header_struct {
-        UInt version;
-        int pid;
-} vki_cap_user_header_t;
- 
-typedef struct vki__user_cap_data_struct {
-        UInt effective;
-        UInt permitted;
-        UInt inheritable;
-} vki_cap_user_data_t;
-  
-
-/* "Byrial Jensen" <byrial@image.dk> says:
-               [various] ioctls take a pointer to a "struct
-               termios" but this is another and shorter "struct
-               termios" than the one defined in <termios.h> and used
-               by tcgetattr(3) and tcsetattr(3) and other library
-               functions. GNU libc translate between its library
-               termios and the kernel termios. 
-*/
-
-#define VKI_SIZEOF_STRUCT_TERMIOS 36
-
-/* Adam Gundy <arg@cyberscience.com>, 20 Mar 2002, says: */
-#define VKI_SIZEOF_STRUCT_TERMIO 17
-
-
-/* File descriptor sets, for doing select().  Copied from
-   /usr/src/linux-2.4.9-31/include/linux/posix_types.h 
-*/
-/*
- * This allows for 1024 file descriptors: if NR_OPEN is ever grown
- * beyond that you'll have to change this too. But 1024 fd's seem to be
- * enough even for such "real" unices like OSF/1, so hopefully this is
- * one limit that doesn't have to be changed [again].
- *
- * Note that POSIX wants the FD_CLEAR(fd,fdsetp) defines to be in
- * <sys/time.h> (and thus <linux/time.h>) - but this is a more logical
- * place for them. Solved by having dummy defines in <sys/time.h>.
- */
-
-/*
- * Those macros may have been defined in <gnu/types.h>. But we always
- * use the ones here. 
- */
-#undef VKI_NFDBITS
-#define VKI_NFDBITS       (8 * sizeof(unsigned long))
-
-#undef VKI_FD_SETSIZE
-#define VKI_FD_SETSIZE    1024
-
-#undef VKI_FDSET_LONGS
-#define VKI_FDSET_LONGS   (VKI_FD_SETSIZE/VKI_NFDBITS)
-
-#undef VKI_FDELT
-#define VKI_FDELT(d)      ((d) / VKI_NFDBITS)
-
-#undef VKI_FDMASK
-#define VKI_FDMASK(d)     (1UL << ((d) % VKI_NFDBITS))
-
-typedef struct {
-        unsigned long vki_fds_bits [VKI_FDSET_LONGS];
-} vki_fd_set;
-
-
-/* Gawd ...
-   Copied from /usr/src/linux-2.4.9-31/./include/asm-i386/posix_types.h
-*/
-#undef  VKI_FD_SET
-#define VKI_FD_SET(fd,fdsetp) \
-                __asm__ __volatile__("btsl %1,%0": \
-                        "=m" (*(vki_fd_set *) (fdsetp)):"r" ((int) (fd)))
-
-#undef  VKI_FD_CLR
-#define VKI_FD_CLR(fd,fdsetp) \
-                __asm__ __volatile__("btrl %1,%0": \
-                        "=m" (*(vki_fd_set *) (fdsetp)):"r" ((int) (fd)))
-
-#undef  VKI_FD_ISSET
-#define VKI_FD_ISSET(fd,fdsetp) (__extension__ ({ \
-                unsigned char __result; \
-                __asm__ __volatile__("btl %1,%2 ; setb %0" \
-                        :"=q" (__result) :"r" ((int) (fd)), \
-                        "m" (*(vki_fd_set *) (fdsetp))); \
-                __result; }))
-
-#undef  VKI_FD_ZERO
-#define VKI_FD_ZERO(fdsetp) \
-do { \
-        int __d0, __d1; \
-        __asm__ __volatile__("cld ; rep ; stosl" \
-                        :"=m" (*(vki_fd_set *) (fdsetp)), \
-                          "=&c" (__d0), "=&D" (__d1) \
-                        :"a" (0), "1" (VKI_FDSET_LONGS), \
-                        "2" ((vki_fd_set *) (fdsetp)) : "memory"); \
-} while (0)
-
-
-
-/* 
-./include/asm-i386/posix_types.h:typedef long           __kernel_suseconds_t;
-./include/linux/types.h:typedef __kernel_suseconds_t    suseconds_t;
-
-./include/asm-i386/posix_types.h:typedef long           __kernel_time_t;
-./include/linux/types.h:typedef __kernel_time_t         time_t;
-*/
-
-struct vki_timeval {
-        /* time_t */ long         tv_sec;         /* seconds */
-        /* suseconds_t */ long    tv_usec;        /* microseconds */
-};
-
-
-
-/* For fcntl on fds ..
-   from ./include/asm-i386/fcntl.h */
-#define VKI_F_GETFL         3       /* get file->f_flags */
-#define VKI_F_SETFL         4       /* set file->f_flags */
-
-#define VKI_O_NONBLOCK        04000
-
-/* For nanosleep ... 
-   from ./include/linux/time.h */
-struct vki_timespec {
-        /* time_t */ long tv_sec;         /* seconds */
-        long    tv_nsec;        /* nanoseconds */
-};
-
-
-/* STAT stuff 
-   from /usr/src/linux-2.4.9-31/include/asm-i386/stat.h */
-struct vki_stat {
-        unsigned short st_dev;
-        unsigned short __pad1;
-        unsigned long st_ino;
-        unsigned short st_mode;
-        unsigned short st_nlink;
-        unsigned short st_uid;
-        unsigned short st_gid;
-        unsigned short st_rdev;
-        unsigned short __pad2;
-        unsigned long  st_size;
-        unsigned long  st_blksize;
-        unsigned long  st_blocks;
-        unsigned long  st_atime;
-        unsigned long  __unused1;
-        unsigned long  st_mtime;
-        unsigned long  __unused2;
-        unsigned long  st_ctime;
-        unsigned long  __unused3;
-        unsigned long  __unused4;
-        unsigned long  __unused5;
-};
-
-
-/* To do with the ELF frame constructed by the kernel on a process'
-   stack just before it transfers control to the program's interpreter
-   (to use the ELF parlance).
-   Constants from /usr/src/linux-2.4.9-31/include/linux/elf.h
-   Logic from     /usr/src/linux-2.4.9-31/fs/binfmt_elf.c
-                  and its counterpart in the 2.2.14 kernel sources 
-                  in Red Hat 6.2.  */
-#define VKI_AT_CLKTCK 17    /* frequency at which times() increments */
-#define VKI_AT_HWCAP  16    /* arch dependent hints at CPU capabilities */
-#define VKI_AT_BASE   7     /* base address of interpreter */
-#define VKI_AT_PAGESZ 6     /* system page size */
-#define VKI_AT_PHNUM  5     /* number of program headers */
-#define VKI_AT_PHENT  4     /* size of program header entry */
-#define VKI_AT_PHDR   3     /* program headers for program */
-#define VKI_AT_USER_AUX_SEGMENT 23  /* tell glibc what address segment
-                                       0x3B points to.  (Needed for
-                                       Red Hat Limbo, 7.3.92) */
-
-/* Including <linux/module.h> leads to loads of hassle because then we
-   need <asm/atomic.h> sometimes (RedHat 7.3) and that is a
-   kernel-only header which deliberately #errors on gcc-3.1.  Mucho
-   hassle considering that we only want to know sizeof(struct module).
-   Hence ...
- 
-   #include <stdio.h>
-   #include <asm/atomic.h>
-   #include <linux/module.h>
-
-   int main ( void )
-   {
-      printf ("sizeof(struct module) = %d\n", sizeof(struct module) );
-      return 0;
-    }
-*/
-
-#define VKI_SIZEOF_STRUCT_MODULE 96
-
-#endif /* ndef __VG_KERNELIFACE_H */
-
-/*--------------------------------------------------------------------*/
-/*--- end                                         vg_kerneliface.h ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_libpthread.c b/coregrind/vg_libpthread.c
deleted file mode 100644
index 60c4dc95ff..0000000000
--- a/coregrind/vg_libpthread.c
+++ /dev/null
@@ -1,2850 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A replacement for the standard libpthread.so.                ---*/
-/*---                                              vg_libpthread.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-/* ALL THIS CODE RUNS ON THE SIMULATED CPU.
-
-   This is a replacement for the standard libpthread.so.  It is loaded
-   as part of the client's image (if required) and directs pthread
-   calls through to Valgrind's request mechanism. 
-
-   A couple of caveats.
- 
-   1.  Since it's a binary-compatible replacement for an existing library, 
-       we must take care to use exactly the same data layouts, etc, as
-       the standard pthread.so does.  
-
-   2.  Since this runs as part of the client, there are no specific
-       restrictions on what headers etc we can include, so long as
-       this libpthread.so does not end up having dependencies on .so's
-       which the real one doesn't.
-
-   Later ... it appears we cannot call file-related stuff in libc here,
-   perhaps fair enough.  Be careful what you call from here.  Even exit()
-   doesn't work (gives infinite recursion and then stack overflow); hence
-   myexit().  Also fprintf doesn't seem safe.
-*/
-
-#include "valgrind.h"    /* For the request-passing mechanism */
-#include "vg_include.h"  /* For the VG_USERREQ__* constants */
-
-#define __USE_UNIX98
-#include <sys/types.h>
-#include <pthread.h>
-#undef __USE_UNIX98
-
-#include <unistd.h>
-#include <string.h>
-#ifdef GLIBC_2_1
-#include <sys/time.h>
-#endif
-
-#include <stdio.h>
-
-
-/* ---------------------------------------------------------------------
-   Forwardses.
-   ------------------------------------------------------------------ */
-
-static void wait_for_fd_to_be_readable_or_erring ( int fd );
-
-static
-int my_do_syscall2 ( int syscallno, 
-                     int arg1, int arg2 );
-
-
-/* ---------------------------------------------------------------------
-   Helpers.  We have to be pretty self-sufficient.
-   ------------------------------------------------------------------ */
-
-/* Number of times any given error message is printed. */
-#define N_MOANS 3
-
-/* Extract from Valgrind the value of VG_(clo_trace_pthread_level).
-   Returns 0 (none) if not running on Valgrind. */
-static
-int get_pt_trace_level ( void )
-{
-   int res;
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__GET_PTHREAD_TRACE_LEVEL,
-                           0, 0, 0, 0);
-   return res;
-}
-
-
-static
-void my_exit ( int arg )
-{
-   int __res;
-   __asm__ volatile ("movl %%ecx, %%ebx ; int $0x80"
-                     : "=a" (__res)
-                     : "0" (__NR_exit),
-                       "c" (arg) );
-   /* We don't bother to mention the fact that this asm trashes %ebx,
-      since it won't return.  If you ever do let it return ... fix
-      this! */
-}
-
-
-/* We need this guy -- it's in valgrind.so. */
-extern void VG_(startup) ( void );
-
-
-/* Just start up Valgrind if it's not already going.  VG_(startup)()
-   detects and ignores second and subsequent calls. */
-static __inline__
-void ensure_valgrind ( char* caller )
-{
-   VG_(startup)();
-}
-
-/* While we're at it ... hook our own startup function into this
-   game. */
-__asm__ (
-   ".section .init\n"
-   "\tcall vgPlain_startup"
-);
-
-
-/* Print a fatal message on stderr (fd 2), framed as
-   "\nvalgrind's libpthread.so: <str>\n\n", then terminate the process
-   with exit code 1 via my_exit().  Never returns.
-
-   The pieces are written straight to the fd rather than being
-   assembled in a fixed-size local buffer, so a long message (for
-   example the one built by pthread_attr_setstacksize) cannot overrun
-   the stack. */
-static
-__attribute__((noreturn))
-void barf ( char* str )
-{
-   const char* prefix = "\nvalgrind's libpthread.so: ";
-   const char* suffix = "\n\n";
-   write(2, prefix, strlen(prefix));
-   write(2, str, strlen(str));
-   write(2, suffix, strlen(suffix));
-   my_exit(1);
-   /* We have to persuade gcc into believing this doesn't return. */
-   while (1) { };
-}
-
-
-static void ignored ( char* msg )
-{
-   if (get_pt_trace_level() >= 0) {
-      char* ig = "valgrind's libpthread.so: IGNORED call to: ";
-      write(2, ig, strlen(ig));
-      write(2, msg, strlen(msg));
-      ig = "\n";
-      write(2, ig, strlen(ig));
-   }
-}
-
-static void kludged ( char* msg )
-{
-   if (get_pt_trace_level() >= 0) {
-      char* ig = "valgrind's libpthread.so: KLUDGED call to: ";
-      write(2, ig, strlen(ig));
-      write(2, msg, strlen(msg));
-      ig = "\n";
-      write(2, ig, strlen(ig));
-   }
-}
-
-static void not_inside ( char* msg )
-{
-   VG_(startup)();
-}
-
-__attribute__((noreturn))
-void vgPlain_unimp ( char* what )
-{
-   char* ig = "valgrind's libpthread.so: UNIMPLEMENTED FUNCTION: ";
-   write(2, ig, strlen(ig));
-   write(2, what, strlen(what));
-   ig = "\n";
-   write(2, ig, strlen(ig));
-   barf("Please report this bug to me at: jseward@acm.org");
-}
-
-
-static
-void my_assert_fail ( Char* expr, Char* file, Int line, Char* fn )
-{
-   static Bool entered = False;
-   if (entered) 
-      my_exit(2);
-   entered = True;
-   fprintf(stderr, "\n%s: %s:%d (%s): Assertion `%s' failed.\n",
-                   "valgrind", file, line, fn, expr );
-   fprintf(stderr, "Please report this bug to me at: %s\n\n", 
-                   VG_EMAIL_ADDR);
-   my_exit(1);
-}
-
-#define MY__STRING(__str)  #__str
-
-#define my_assert(expr)                                               \
-  ((void) ((expr) ? 0 :						      \
-	   (my_assert_fail  (MY__STRING(expr),			      \
-			      __FILE__, __LINE__,                     \
-                              __PRETTY_FUNCTION__), 0)))
-
-
-/* ---------------------------------------------------------------------
-   Pass pthread_ calls to Valgrind's request mechanism.
-   ------------------------------------------------------------------ */
-
-#include <errno.h>
-#include <sys/time.h> /* gettimeofday */
-
-
-/* ---------------------------------------------------
-   Ummm ..
-   ------------------------------------------------ */
-
-static
-void pthread_error ( const char* msg )
-{
-   int res;
-   VALGRIND_MAGIC_SEQUENCE(res, 0,
-                           VG_USERREQ__PTHREAD_ERROR, 
-                           msg, 0, 0, 0);
-}
-
-
-/* ---------------------------------------------------
-   THREAD ATTRIBUTES
-   ------------------------------------------------ */
-
-int pthread_attr_init(pthread_attr_t *attr)
-{
-   /* Just initialise the fields which we might look at. */
-   attr->__detachstate = PTHREAD_CREATE_JOINABLE;
-   return 0;
-}
-
-int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate)
-{
-   if (detachstate != PTHREAD_CREATE_JOINABLE 
-       && detachstate != PTHREAD_CREATE_DETACHED) {
-      pthread_error("pthread_attr_setdetachstate: "
-                    "detachstate is invalid");
-      return EINVAL;
-   }
-   attr->__detachstate = detachstate;
-   return 0;
-}
-
-int pthread_attr_setinheritsched(pthread_attr_t *attr, int inherit)
-{
-   static int moans = N_MOANS;
-   if (moans-- > 0) 
-      ignored("pthread_attr_setinheritsched");
-   return 0;
-}
-
-/* The requested size is not recorded in __attr.  A request smaller
-   than the usable part of a Valgrind thread stack (VG_PTHREAD_STACK_SIZE
-   less the redzone and a safety margin) is accepted and returns 0;
-   anything at or above that limit is reported via barf(), which does
-   not return. */
-__attribute__((weak))
-int pthread_attr_setstacksize (pthread_attr_t *__attr,
-                               size_t __stacksize)
-{
-   size_t limit;
-   char buf[1024];
-   ensure_valgrind("pthread_attr_setstacksize");
-   limit = VG_PTHREAD_STACK_SIZE - VG_AR_CLIENT_STACKBASE_REDZONE_SZB 
-                                 - 1000; /* paranoia */
-   if (__stacksize < limit)
-      return 0;
-   /* size_t is not int: cast so the argument matches the conversion
-      specifier. */
-   snprintf(buf, sizeof(buf), "pthread_attr_setstacksize: "
-            "requested size %lu >= VG_PTHREAD_STACK_SIZE\n   "
-            "edit vg_include.h and rebuild.",
-            (unsigned long)__stacksize);
-   buf[sizeof(buf)-1] = '\0'; /* Make sure it is zero terminated */
-   barf(buf);
-}
-
-
-/* This is completely bogus. */
-int  pthread_attr_getschedparam(const  pthread_attr_t  *attr,  
-                                struct sched_param *param)
-{
-   static int moans = N_MOANS;
-   if (moans-- > 0) 
-      kludged("pthread_attr_getschedparam");
-#  ifdef HAVE_SCHED_PRIORITY
-   if (param) param->sched_priority = 0; /* who knows */
-#  else
-   if (param) param->__sched_priority = 0; /* who knows */
-#  endif
-   return 0;
-}
-
-int  pthread_attr_setschedparam(pthread_attr_t  *attr,
-                                const  struct sched_param *param)
-{
-   static int moans = N_MOANS;
-   if (moans-- > 0) 
-      ignored("pthread_attr_setschedparam");
-   return 0;
-}
-
-int pthread_attr_destroy(pthread_attr_t *attr)
-{
-   static int moans = N_MOANS;
-   if (moans-- > 0) 
-      ignored("pthread_attr_destroy");
-   return 0;
-}
-
-/* These are no-ops, as with LinuxThreads. */
-int pthread_attr_setscope ( pthread_attr_t *attr, int scope )
-{
-   ensure_valgrind("pthread_attr_setscope");
-   if (scope == PTHREAD_SCOPE_SYSTEM)
-      return 0;
-   pthread_error("pthread_attr_setscope: "
-                 "invalid or unsupported scope");
-   if (scope == PTHREAD_SCOPE_PROCESS)
-      return ENOTSUP;
-   return EINVAL;
-}
-
-/* Report the contention scope.  attr is not consulted: the answer is
-   always PTHREAD_SCOPE_SYSTEM, the only scope pthread_attr_setscope
-   accepts.  A NULL scope pointer is tolerated.  Always returns 0. */
-int pthread_attr_getscope ( const pthread_attr_t *attr, int *scope )
-{
-   ensure_valgrind("pthread_attr_getscope");
-   if (scope)
-      *scope = PTHREAD_SCOPE_SYSTEM;
-   return 0;
-}
-
-
-/* Pretty bogus.  Avoid if possible. */
-int pthread_getattr_np (pthread_t thread, pthread_attr_t *attr)
-{
-   int    detached;
-   size_t limit;
-   ensure_valgrind("pthread_getattr_np");
-   kludged("pthread_getattr_np");
-   limit = VG_PTHREAD_STACK_SIZE - VG_AR_CLIENT_STACKBASE_REDZONE_SZB 
-                                 - 1000; /* paranoia */
-   attr->__detachstate = PTHREAD_CREATE_JOINABLE;
-   attr->__schedpolicy = SCHED_OTHER;
-   attr->__schedparam.sched_priority = 0;
-   attr->__inheritsched = PTHREAD_EXPLICIT_SCHED;
-   attr->__scope = PTHREAD_SCOPE_SYSTEM;
-   attr->__guardsize = VKI_BYTES_PER_PAGE;
-   attr->__stackaddr = NULL;
-   attr->__stackaddr_set = 0;
-   attr->__stacksize = limit;
-   VALGRIND_MAGIC_SEQUENCE(detached, (-1) /* default */,
-                           VG_USERREQ__SET_OR_GET_DETACH, 
-                           2 /* get */, thread, 0, 0);
-   my_assert(detached == 0 || detached == 1);
-   if (detached)
-      attr->__detachstate = PTHREAD_CREATE_DETACHED;
-   return 0;
-}
-
-
-/* Bogus ... */
-int pthread_attr_getstackaddr ( const pthread_attr_t * attr,
-                                void ** stackaddr )
-{
-   ensure_valgrind("pthread_attr_getstackaddr");
-   kludged("pthread_attr_getstackaddr");
-   if (stackaddr)
-      *stackaddr = NULL;
-   return 0;
-}
-
-/* Not bogus (!) */
-int pthread_attr_getstacksize ( const pthread_attr_t * _attr, 
-                                size_t * __stacksize )
-{
-   size_t limit;
-   ensure_valgrind("pthread_attr_getstacksize");
-   limit = VG_PTHREAD_STACK_SIZE - VG_AR_CLIENT_STACKBASE_REDZONE_SZB 
-                                 - 1000; /* paranoia */
-   if (__stacksize)
-      *__stacksize = limit;
-   return 0;
-}
-
-int pthread_attr_setschedpolicy(pthread_attr_t *attr, int policy)
-{
-  if (policy != SCHED_OTHER && policy != SCHED_FIFO && policy != SCHED_RR)
-    return EINVAL;
-  attr->__schedpolicy = policy;
-  return 0;
-}
-
-int pthread_attr_getschedpolicy(const pthread_attr_t *attr, int *policy)
-{
-  *policy = attr->__schedpolicy;
-  return 0;
-}
-
-
-/* --------------------------------------------------- 
-   Helper functions for running a thread 
-   and for clearing up afterwards.
-   ------------------------------------------------ */
-
-/* All exiting threads eventually pass through here, bearing the
-   return value, or PTHREAD_CANCELED, in ret_val. */
-static
-__attribute__((noreturn))
-void thread_exit_wrapper ( void* ret_val )
-{
-   int           detached, res;
-   CleanupEntry  cu;
-   pthread_key_t key;
-
-   /* Run this thread's cleanup handlers. */
-   while (1) {
-      VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                              VG_USERREQ__CLEANUP_POP,
-                              &cu, 0, 0, 0);
-      if (res == -1) break; /* stack empty */
-      my_assert(res == 0);
-      if (0) printf("running exit cleanup handler");
-      cu.fn ( cu.arg );
-   }
-
-   /* Run this thread's key finalizers.  Really this should be run
-      PTHREAD_DESTRUCTOR_ITERATIONS times. */
-   for (key = 0; key < VG_N_THREAD_KEYS; key++) {
-      VALGRIND_MAGIC_SEQUENCE(res, (-2) /* default */,
-                              VG_USERREQ__GET_KEY_D_AND_S,
-                              key, &cu, 0, 0 );
-      if (res == 0) {
-         /* valid key */
-         if (cu.fn && cu.arg)
-            cu.fn /* destructor for key */ 
-                  ( cu.arg /* specific for key for this thread */ );
-         continue;
-      }
-      my_assert(res == -1);
-   }
-
-   /* Decide on my final disposition. */
-   VALGRIND_MAGIC_SEQUENCE(detached, (-1) /* default */,
-                           VG_USERREQ__SET_OR_GET_DETACH, 
-                           2 /* get */, pthread_self(), 0, 0);
-   my_assert(detached == 0 || detached == 1);
-
-   if (detached) {
-      /* Detached; I just quit right now. */
-      VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                              VG_USERREQ__QUIT, 0, 0, 0, 0);
-   } else {
-      /* Not detached; so I wait for a joiner. */
-      VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                              VG_USERREQ__WAIT_JOINER, ret_val, 0, 0, 0);
-   }
-   /* NOTREACHED */
-   barf("thread_exit_wrapper: still alive?!");
-}
-
-
-/* This function is a wrapper function for running a thread.  It runs
-   the root function specified in pthread_create, and then, should the
-   root function return a value, it arranges to run the thread's
-   cleanup handlers and exit correctly. */
-
-/* Struct used to convey info from pthread_create to thread_wrapper.
-   Must be careful not to pass to the child thread any pointers to
-   objects which might be on the parent's stack.  */
-typedef
-   struct {
-      int   attr__detachstate;
-      void* (*root_fn) ( void* );
-      void* arg;
-   }
-   NewThreadInfo;
-
-
-/* This is passed to the VG_USERREQ__APPLY_IN_NEW_THREAD and so must
-   not return.  Note that this runs in the new thread, not the
-   parent. */
-static
-__attribute__((noreturn))
-void thread_wrapper ( NewThreadInfo* info )
-{
-   int   res;
-   int   attr__detachstate;
-   void* (*root_fn) ( void* );
-   void* arg;
-   void* ret_val;
-
-   attr__detachstate = info->attr__detachstate;
-   root_fn           = info->root_fn;
-   arg               = info->arg;
-
-   /* Free up the arg block that pthread_create malloced. */
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__FREE, info, 0, 0, 0);
-   my_assert(res == 0);
-
-   /* Minimally observe the attributes supplied. */
-   if (attr__detachstate != PTHREAD_CREATE_DETACHED
-       && attr__detachstate != PTHREAD_CREATE_JOINABLE)
-      pthread_error("thread_wrapper: invalid attr->__detachstate");
-   if (attr__detachstate == PTHREAD_CREATE_DETACHED)
-      pthread_detach(pthread_self());
-
-   /* The root function might not return.  But if it does we simply
-      move along to thread_exit_wrapper.  All other ways out for the
-      thread (cancellation, or calling pthread_exit) lead there
-      too. */
-   ret_val = root_fn(arg);
-   thread_exit_wrapper(ret_val);
-   /* NOTREACHED */
-}
-
-
-/* ---------------------------------------------------
-   THREADs
-   ------------------------------------------------ */
-
-__attribute__((weak))
-int pthread_yield ( void )
-{
-   int res;
-   ensure_valgrind("pthread_yield");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_YIELD, 0, 0, 0, 0);
-   return 0;
-}
-
-
-int pthread_equal(pthread_t thread1, pthread_t thread2)
-{
-   return thread1 == thread2 ? 1 : 0;
-}
-
-
-/* Bundle up the args into a malloc'd block and create a new thread
-   consisting of thread_wrapper() applied to said malloc'd block. */
-int
-pthread_create (pthread_t *__restrict __thredd,
-                __const pthread_attr_t *__restrict __attr,
-                void *(*__start_routine) (void *),
-                void *__restrict __arg)
-{
-   int            tid_child;
-   NewThreadInfo* info;
-
-   ensure_valgrind("pthread_create");
-
-   /* Allocate space for the arg block.  thread_wrapper will free
-      it. */
-   VALGRIND_MAGIC_SEQUENCE(info, NULL /* default */,
-                           VG_USERREQ__MALLOC, 
-                           sizeof(NewThreadInfo), 0, 0, 0);
-   my_assert(info != NULL);
-
-   if (__attr)
-      info->attr__detachstate = __attr->__detachstate;
-   else 
-      info->attr__detachstate = PTHREAD_CREATE_JOINABLE;
-
-   info->root_fn = __start_routine;
-   info->arg     = __arg;
-   VALGRIND_MAGIC_SEQUENCE(tid_child, VG_INVALID_THREADID /* default */,
-                           VG_USERREQ__APPLY_IN_NEW_THREAD,
-                           &thread_wrapper, info, 0, 0);
-   my_assert(tid_child != VG_INVALID_THREADID);
-
-   if (__thredd)
-      *__thredd = tid_child;
-   return 0; /* success */
-}
-
-
-int 
-pthread_join (pthread_t __th, void **__thread_return)
-{
-   int res;
-   ensure_valgrind("pthread_join");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_JOIN,
-                           __th, __thread_return, 0, 0);
-   return res;
-}
-
-
-void pthread_exit(void *retval)
-{
-   ensure_valgrind("pthread_exit");
-   /* Simple! */
-   thread_exit_wrapper(retval);
-}
-
-
-pthread_t pthread_self(void)
-{
-   int tid;
-   ensure_valgrind("pthread_self");
-   VALGRIND_MAGIC_SEQUENCE(tid, 1 /* default */,
-                           VG_USERREQ__PTHREAD_GET_THREADID,
-                           0, 0, 0, 0);
-   if (tid < 1 || tid >= VG_N_THREADS)
-      barf("pthread_self: invalid ThreadId");
-   return tid;
-}
-
-
-int pthread_detach(pthread_t th)
-{
-   int res;
-   ensure_valgrind("pthread_detach");
-   /* First we enquire as to the current detach state. */
-   VALGRIND_MAGIC_SEQUENCE(res, (-2) /* default */,
-                           VG_USERREQ__SET_OR_GET_DETACH,
-                           2 /* get */, th, 0, 0);
-   if (res == -1) {
-      /* not found */ 
-      pthread_error("pthread_detach: "
-                    "invalid target thread");
-      return ESRCH;
-   }
-   if (res == 1) { 
-      /* already detached */
-      pthread_error("pthread_detach: "
-                    "target thread is already detached");
-      return EINVAL;
-   }
-   if (res == 0) {
-      VALGRIND_MAGIC_SEQUENCE(res, (-2) /* default */,
-                              VG_USERREQ__SET_OR_GET_DETACH,
-                              1 /* set */, th, 0, 0);
-      my_assert(res == 0);
-      return 0;
-   }
-   barf("pthread_detach");
-}
-
-
-/* ---------------------------------------------------
-   CLEANUP STACKS
-   ------------------------------------------------ */
-
-void _pthread_cleanup_push (struct _pthread_cleanup_buffer *__buffer,
-                            void (*__routine) (void *),
-                            void *__arg)
-{
-   int          res;
-   CleanupEntry cu;
-   ensure_valgrind("_pthread_cleanup_push");
-   cu.fn  = __routine;
-   cu.arg = __arg;
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__CLEANUP_PUSH,
-                           &cu, 0, 0, 0);
-   my_assert(res == 0);
-}
-
-
-void _pthread_cleanup_push_defer (struct _pthread_cleanup_buffer *__buffer,
-                                  void (*__routine) (void *),
-                                  void *__arg)
-{
-   /* As _pthread_cleanup_push, but first save the thread's original
-      cancellation type in __buffer and set it to Deferred. */
-   int orig_ctype;
-   ensure_valgrind("_pthread_cleanup_push_defer");
-   /* Set to Deferred, and put the old cancellation type in orig_ctype. */
-   my_assert(-1 != PTHREAD_CANCEL_DEFERRED);
-   my_assert(-1 != PTHREAD_CANCEL_ASYNCHRONOUS);
-   my_assert(sizeof(struct _pthread_cleanup_buffer) >= sizeof(int));
-   VALGRIND_MAGIC_SEQUENCE(orig_ctype, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELTYPE,
-                           PTHREAD_CANCEL_DEFERRED, 0, 0, 0);   
-   my_assert(orig_ctype != -1);
-   *((int*)(__buffer)) = orig_ctype;
-   /* Now push the cleanup. */
-   _pthread_cleanup_push(NULL, __routine, __arg);
-}
-
-
-/* Pop the most recently pushed cleanup handler for this thread and,
-   if __execute is nonzero, run it.  __buffer is not used here.  A
-   result of -1 from the pop request (stack underflow) is silently
-   ignored; any result other than 0 or -1 is fatal. */
-void _pthread_cleanup_pop (struct _pthread_cleanup_buffer *__buffer,
-                           int __execute)
-{
-   int          res;
-   CleanupEntry cu;
-   ensure_valgrind("_pthread_cleanup_pop");
-   cu.fn = cu.arg = NULL; /* paranoia */
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__CLEANUP_POP,
-                           &cu, 0, 0, 0);
-   if (res == 0) {
-      /* pop succeeded */
-      if (__execute) {
-         cu.fn ( cu.arg );
-      }
-      return;
-   }
-   if (res == -1) {
-      /* stack underflow */
-      return;
-   }
-   barf("_pthread_cleanup_pop");
-}
-
-
-void _pthread_cleanup_pop_restore (struct _pthread_cleanup_buffer *__buffer,
-                                   int __execute)
-{
-   int orig_ctype, fake_ctype;
-   /* As _pthread_cleanup_pop, but after popping/running the handler,
-      restore the thread's original cancellation type from the first
-      word of __buffer. */
-   _pthread_cleanup_pop(NULL, __execute);
-   orig_ctype = *((int*)(__buffer));
-   my_assert(orig_ctype == PTHREAD_CANCEL_DEFERRED
-          || orig_ctype == PTHREAD_CANCEL_ASYNCHRONOUS);
-   my_assert(-1 != PTHREAD_CANCEL_DEFERRED);
-   my_assert(-1 != PTHREAD_CANCEL_ASYNCHRONOUS);
-   my_assert(sizeof(struct _pthread_cleanup_buffer) >= sizeof(int));
-   VALGRIND_MAGIC_SEQUENCE(fake_ctype, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELTYPE,
-                           orig_ctype, 0, 0, 0); 
-   my_assert(fake_ctype == PTHREAD_CANCEL_DEFERRED);
-}
-
-
-/* ---------------------------------------------------
-   MUTEX ATTRIBUTES
-   ------------------------------------------------ */
-
-int __pthread_mutexattr_init(pthread_mutexattr_t *attr)
-{
-   attr->__mutexkind = PTHREAD_MUTEX_ERRORCHECK_NP;
-   return 0;
-}
-
-int __pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type)
-{
-   switch (type) {
-#     ifndef GLIBC_2_1    
-      case PTHREAD_MUTEX_TIMED_NP:
-      case PTHREAD_MUTEX_ADAPTIVE_NP:
-#     endif
-#     ifdef GLIBC_2_1    
-      case PTHREAD_MUTEX_FAST_NP:
-#     endif
-      case PTHREAD_MUTEX_RECURSIVE_NP:
-      case PTHREAD_MUTEX_ERRORCHECK_NP:
-         attr->__mutexkind = type;
-         return 0;
-      default:
-         pthread_error("pthread_mutexattr_settype: "
-                       "invalid type");
-         return EINVAL;
-   }
-}
-
-int __pthread_mutexattr_destroy(pthread_mutexattr_t *attr)
-{
-   return 0;
-}
-
-
-/* ---------------------------------------------------
-   MUTEXes
-   ------------------------------------------------ */
-
-int __pthread_mutex_init(pthread_mutex_t *mutex, 
-                         const  pthread_mutexattr_t *mutexattr)
-{
-   mutex->__m_count = 0;
-   mutex->__m_owner = (_pthread_descr)VG_INVALID_THREADID;
-   mutex->__m_kind  = PTHREAD_MUTEX_ERRORCHECK_NP;
-   if (mutexattr)
-      mutex->__m_kind = mutexattr->__mutexkind;
-   return 0;
-}
-
-
-int __pthread_mutex_lock(pthread_mutex_t *mutex)
-{
-   int res;
-   static int moans = N_MOANS;
-   if (RUNNING_ON_VALGRIND) {
-      VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                              VG_USERREQ__PTHREAD_MUTEX_LOCK,
-                              mutex, 0, 0, 0);
-      return res;
-   } else {
-      if (moans-- > 0)
-         not_inside("pthread_mutex_lock");
-      return 0; /* success */
-   }
-}
-
-
-int __pthread_mutex_trylock(pthread_mutex_t *mutex)
-{
-   int res;
-   static int moans = N_MOANS;
-   if (RUNNING_ON_VALGRIND) {
-      VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                              VG_USERREQ__PTHREAD_MUTEX_TRYLOCK,
-                              mutex, 0, 0, 0);
-      return res;
-   } else {
-      if (moans-- > 0)
-         not_inside("pthread_mutex_trylock");
-      return 0;
-   }
-}
-
-
-int __pthread_mutex_unlock(pthread_mutex_t *mutex)
-{
-   int res;
-   static int moans = N_MOANS;
-   if (RUNNING_ON_VALGRIND) {
-      VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                              VG_USERREQ__PTHREAD_MUTEX_UNLOCK,
-                              mutex, 0, 0, 0);
-      return res;
-   } else {
-      if (moans-- > 0)
-         not_inside("pthread_mutex_unlock");
-      return 0;
-   }
-}
-
-
-int __pthread_mutex_destroy(pthread_mutex_t *mutex)
-{
-   /* Valgrind doesn't hold any resources on behalf of the mutex, so no
-      need to involve it. */
-   if (mutex->__m_count > 0) {
-       pthread_error("pthread_mutex_destroy: "
-                     "mutex is still in use");
-       return EBUSY;
-   }
-   mutex->__m_count = 0;
-   mutex->__m_owner = (_pthread_descr)VG_INVALID_THREADID;
-   mutex->__m_kind  = PTHREAD_MUTEX_ERRORCHECK_NP;
-   return 0;
-}
-
-
-/* ---------------------------------------------------
-   CONDITION VARIABLES
-   ------------------------------------------------ */
-
-/* LinuxThreads supports no attributes for conditions.  Hence ... */
-
-int pthread_condattr_init(pthread_condattr_t *attr)
-{
-   return 0;
-}
-
-int pthread_condattr_destroy(pthread_condattr_t *attr)
-{
-   return 0;
-}
-
-int pthread_cond_init( pthread_cond_t *cond,
-                       const pthread_condattr_t *cond_attr)
-{
-   cond->__c_waiting = (_pthread_descr)VG_INVALID_THREADID;
-   return 0;
-}
-
-int pthread_cond_destroy(pthread_cond_t *cond)
-{
-   /* should check that no threads are waiting on this CV */
-   static int moans = N_MOANS;
-   if (moans-- > 0) 
-      kludged("pthread_cond_destroy");
-   return 0;
-}
-
-/* ---------------------------------------------------
-   SCHEDULING
-   ------------------------------------------------ */
-
-/* This is completely bogus. */
-int   pthread_getschedparam(pthread_t  target_thread,  
-                            int  *policy,
-                            struct sched_param *param)
-{
-   static int moans = N_MOANS;
-   if (moans-- > 0) 
-      kludged("pthread_getschedparam");
-   if (policy) *policy = SCHED_OTHER;
-#  ifdef HAVE_SCHED_PRIORITY
-   if (param) param->sched_priority = 0; /* who knows */
-#  else
-   if (param) param->__sched_priority = 0; /* who knows */
-#  endif
-   return 0;
-}
-
-int pthread_setschedparam(pthread_t target_thread, 
-                          int policy, 
-                          const struct sched_param *param)
-{
-   static int moans = N_MOANS;
-   if (moans-- > 0) 
-      ignored("pthread_setschedparam");
-   return 0;
-}
-
-int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
-{
-   int res;
-   ensure_valgrind("pthread_cond_wait");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_COND_WAIT,
-			   cond, mutex, 0, 0);
-   return res;
-}
-
-/* Wait on cond, with mutex, until the absolute wall-clock time
-   abstime.  The deadline is handed to Valgrind in terms of its own
-   millisecond timer: read that timer and the wall clock now, work out
-   how many milliseconds remain until abstime (zero if abstime is
-   already in the past), and add that to the timer reading.  Returns
-   whatever the VG_USERREQ__PTHREAD_COND_TIMEDWAIT request returns. */
-int pthread_cond_timedwait ( pthread_cond_t *cond, 
-                             pthread_mutex_t *mutex, 
-                             const struct  timespec *abstime )
-{
-   int res;
-   unsigned int ms_now, ms_end;
-   struct  timeval timeval_now;
-   unsigned long long int ull_ms_now_after_1970;
-   unsigned long long int ull_ms_end_after_1970;
-
-   ensure_valgrind("pthread_cond_timedwait");
-   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__READ_MILLISECOND_TIMER,
-                           0, 0, 0, 0);
-   my_assert(ms_now != 0xFFFFFFFF);
-   res = gettimeofday(&timeval_now, NULL);
-   my_assert(res == 0);
-
-   /* tv_usec is in microseconds: divide by 1000 to get milliseconds. */
-   ull_ms_now_after_1970 
-      = 1000ULL * ((unsigned long long int)(timeval_now.tv_sec))
-        + ((unsigned long long int)(timeval_now.tv_usec / 1000));
-   /* tv_nsec is in nanoseconds: divide by 1000000 to get milliseconds. */
-   ull_ms_end_after_1970
-      = 1000ULL * ((unsigned long long int)(abstime->tv_sec))
-        + ((unsigned long long int)(abstime->tv_nsec / 1000000));
-   /* A deadline in the past means "time out immediately". */
-   if (ull_ms_end_after_1970 < ull_ms_now_after_1970)
-      ull_ms_end_after_1970 = ull_ms_now_after_1970;
-   ms_end 
-      = ms_now + (unsigned int)(ull_ms_end_after_1970 - ull_ms_now_after_1970);
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_COND_TIMEDWAIT,
-			   cond, mutex, ms_end, 0);
-   return res;
-}
-
-
-int pthread_cond_signal(pthread_cond_t *cond)
-{
-   int res;
-   ensure_valgrind("pthread_cond_signal");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_COND_SIGNAL,
-			   cond, 0, 0, 0);
-   return res;
-}
-
-int pthread_cond_broadcast(pthread_cond_t *cond)
-{
-   int res;
-   ensure_valgrind("pthread_cond_broadcast");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_COND_BROADCAST,
-			   cond, 0, 0, 0);
-   return res;
-}
-
-
-/* ---------------------------------------------------
-   CANCELLATION
-   ------------------------------------------------ */
-
-int pthread_setcancelstate(int state, int *oldstate)
-{
-   int res;
-   ensure_valgrind("pthread_setcancelstate");
-   if (state != PTHREAD_CANCEL_ENABLE
-       && state != PTHREAD_CANCEL_DISABLE) {
-      pthread_error("pthread_setcancelstate: "
-                    "invalid state");
-      return EINVAL;
-   }
-   my_assert(-1 != PTHREAD_CANCEL_ENABLE);
-   my_assert(-1 != PTHREAD_CANCEL_DISABLE);
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELSTATE,
-                           state, 0, 0, 0);
-   my_assert(res != -1);
-   if (oldstate) 
-      *oldstate = res;
-   return 0;
-}
-
-int pthread_setcanceltype(int type, int *oldtype)
-{
-   int res;
-   ensure_valgrind("pthread_setcanceltype");
-   if (type != PTHREAD_CANCEL_DEFERRED
-       && type != PTHREAD_CANCEL_ASYNCHRONOUS) {
-      pthread_error("pthread_setcanceltype: "
-                    "invalid type");
-      return EINVAL;
-   }
-   my_assert(-1 != PTHREAD_CANCEL_DEFERRED);
-   my_assert(-1 != PTHREAD_CANCEL_ASYNCHRONOUS);
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELTYPE,
-                           type, 0, 0, 0);
-   my_assert(res != -1);
-   if (oldtype) 
-      *oldtype = res;
-   return 0;
-}
-
-int pthread_cancel(pthread_t thread)
-{
-   int res;
-   ensure_valgrind("pthread_cancel");
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__SET_CANCELPEND,
-                           thread, &thread_exit_wrapper, 0, 0);
-   my_assert(res != -1);
-   return res;
-}
-
-static __inline__
-void __my_pthread_testcancel(void)
-{
-   int res;
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__TESTCANCEL,
-                           0, 0, 0, 0);
-   my_assert(res == 0);
-}
-
-void pthread_testcancel ( void )
-{
-   __my_pthread_testcancel();
-}
-
-
-/* Not really sure what this is for.  I suspect for doing the POSIX
-   requirements for fork() and exec().  We do this internally anyway
-   whenever those syscalls are observed, so this could be superfluous,
-   but hey ... 
-*/
-void __pthread_kill_other_threads_np ( void )
-{
-   int res;
-   ensure_valgrind("__pthread_kill_other_threads_np");
-   /* Hand the job to the scheduler via VG_USERREQ__NUKE_OTHER_THREADS;
-      anything other than a 0 reply trips the assertion. */
-   VALGRIND_MAGIC_SEQUENCE(res, (-1) /* default */,
-                           VG_USERREQ__NUKE_OTHER_THREADS,
-                           0, 0, 0, 0);
-   my_assert(res == 0);
-}
-
-
-/* ---------------------------------------------------
-   SIGNALS
-   ------------------------------------------------ */
-
-#include <signal.h>
-
-/* Examine and/or change the calling thread's signal mask.
-
-   how     - SIG_SETMASK, SIG_BLOCK or SIG_UNBLOCK; not significant
-             when newmask is NULL.
-   newmask - the set to apply, or NULL to leave the mask unchanged
-             (query only).
-   oldmask - passed through to the scheduler, possibly NULL.
-
-   Returns 0 on success, EINVAL for an unrecognised `how', or EFAULT
-   if the scheduler reports a non-zero result. */
-int pthread_sigmask(int how, const sigset_t *newmask, 
-                             sigset_t *oldmask)
-{
-   int      res;
-   sigset_t no_change;
-
-   /* A bit subtle, because the scheduler expects newmask and oldmask
-      to be vki_sigset_t* rather than sigset_t*, and the two are
-      different.  Fortunately the first 64 bits of a sigset_t are
-      exactly a vki_sigset_t, so we just pass the pointers through
-      unmodified.  Haaaack! 
-
-      Also translate the how value from glibc's SIG_ constants to the
-      VKI_ constants, so that the former do not have to be included
-      into vg_scheduler.c. */
-
-   ensure_valgrind("pthread_sigmask");
-
-   /* POSIX: a NULL newmask means "leave the mask alone" and `how' is
-      then not significant.  Express that as blocking the empty set,
-      which changes nothing but still forwards oldmask with the
-      request, instead of rejecting the call with EFAULT. */
-   if (newmask == NULL) {
-      sigemptyset(&no_change);
-      newmask = &no_change;
-      how     = SIG_BLOCK;
-   }
-
-   switch (how) {
-      case SIG_SETMASK: how = VKI_SIG_SETMASK; break;
-      case SIG_BLOCK:   how = VKI_SIG_BLOCK; break;
-      case SIG_UNBLOCK: how = VKI_SIG_UNBLOCK; break;
-      default: pthread_error("pthread_sigmask: invalid how");
-               return EINVAL;
-   }
-
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_SIGMASK,
-                           how, newmask, oldmask, 0);
-
-   /* The scheduler tells us of any memory violations. */
-   return res == 0 ? 0 : EFAULT;
-}
-
-
-/* Forward `set' and `sig' to the scheduler in a VG_USERREQ__SIGWAIT
-   request and return its result (0 if the request yields nothing). */
-int sigwait ( const sigset_t* set, int* sig )
-{
-   int res;
-   ensure_valgrind("sigwait");
-   /* As with pthread_sigmask we deliberately confuse sigset_t with
-      vki_ksigset_t. */
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__SIGWAIT,
-                           set, sig, 0, 0);
-   return res;
-}
-
-
-/* Forward `thread' and `signo' to the scheduler in a
-   VG_USERREQ__PTHREAD_KILL request and return its result (0 if the
-   request yields nothing). */
-int pthread_kill(pthread_t thread, int signo)
-{
-   int res;
-   ensure_valgrind("pthread_kill");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_KILL, 
-                           thread, signo, 0, 0);
-   return res;
-}
-
-
-/* Copied verbatim from Linuxthreads */
-/* Redefine raise() to send signal to calling thread only,
-   as per POSIX 1003.1c */
-int raise (int sig)
-{
-  /* pthread_kill hands back an error number rather than setting
-     errno, so convert to the raise() convention here. */
-  int err = pthread_kill(pthread_self(), sig);
-  if (err != 0) {
-    errno = err;
-    return -1;
-  }
-  return 0;
-}
-
-
-/* pause() built on polling: record the scheduler's "signals returned"
-   counter, then sleep in 52ms steps until the counter changes.
-   Always returns -1 with errno set to EINTR. */
-int pause ( void )
-{
-   unsigned int n_orig, n_now;
-   struct vki_timespec nanosleep_interval;
-   ensure_valgrind("pause");
-
-   /* This is surely a cancellation point. */
-   __my_pthread_testcancel();
-
-   /* Baseline value of the counter; 0xFFFFFFFF is only the fallback
-      and is rejected by the assertion. */
-   VALGRIND_MAGIC_SEQUENCE(n_orig, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__GET_N_SIGS_RETURNED, 
-                           0, 0, 0, 0);
-   my_assert(n_orig != 0xFFFFFFFF);
-
-   while (1) {
-      VALGRIND_MAGIC_SEQUENCE(n_now, 0xFFFFFFFF /* default */,
-                              VG_USERREQ__GET_N_SIGS_RETURNED, 
-                              0, 0, 0, 0);
-      my_assert(n_now != 0xFFFFFFFF);
-      my_assert(n_now >= n_orig);
-      /* Any change in the counter ends the wait. */
-      if (n_now != n_orig) break;
-
-      nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 52 * 1000 * 1000; /* 52 milliseconds */
-      /* It's critical here that valgrind's nanosleep implementation
-         is nonblocking. */
-      (void)my_do_syscall2(__NR_nanosleep, 
-                           (int)(&nanosleep_interval), (int)NULL);
-   }
-
-   * (__errno_location()) = EINTR;
-   return -1;
-}
-
-
-/* ---------------------------------------------------
-   THREAD-SPECIFICs
-   ------------------------------------------------ */
-
-/* Forward `key' and `destr_function' to the scheduler in a
-   VG_USERREQ__PTHREAD_KEY_CREATE request and return its result. */
-int __pthread_key_create(pthread_key_t *key,  
-                         void  (*destr_function)  (void *))
-{
-   int res;
-   ensure_valgrind("pthread_key_create");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_KEY_CREATE,
-                           key, destr_function, 0, 0);
-   return res;
-}
-
-/* Not implemented: the key is left alone and 0 is always returned.
-   The first N_MOANS calls report the fact through ignored(). */
-int pthread_key_delete(pthread_key_t key)
-{
-   static int moans = N_MOANS;
-   if (moans-- > 0) 
-      ignored("pthread_key_delete");
-   return 0;
-}
-
-/* Forward `key' and `pointer' to the scheduler in a
-   VG_USERREQ__PTHREAD_SETSPECIFIC request and return its result. */
-int __pthread_setspecific(pthread_key_t key, const void *pointer)
-{
-   int res;
-   ensure_valgrind("pthread_setspecific");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_SETSPECIFIC,
-                           key, pointer, 0, 0);
-   return res;
-}
-
-/* Issue a VG_USERREQ__PTHREAD_GETSPECIFIC request for `key' and
-   return the answer reinterpreted as a pointer (0, i.e. NULL, if the
-   request yields nothing). */
-void * __pthread_getspecific(pthread_key_t key)
-{
-   int res;
-   ensure_valgrind("pthread_getspecific");
-   VALGRIND_MAGIC_SEQUENCE(res, 0 /* default */,
-                           VG_USERREQ__PTHREAD_GETSPECIFIC,
-                           key, 0 , 0, 0);
-   /* The value travels as an int and is cast back to a pointer. */
-   return (void*)res;
-}
-
-
-/* ---------------------------------------------------
-   ONCEry
-   ------------------------------------------------ */
-
-static pthread_mutex_t once_masterlock = PTHREAD_MUTEX_INITIALIZER;
-
-
-/* Run `init_routine' if `*once_control' is still 0.  All callers,
-   whatever their once_control, are serialised on once_masterlock.
-   Always returns 0. */
-int __pthread_once ( pthread_once_t *once_control, 
-                     void (*init_routine) (void) )
-{
-   int res;
-   ensure_valgrind("pthread_once");
-
-   res = __pthread_mutex_lock(&once_masterlock);
-
-   if (res != 0) {
-      barf("pthread_once: Looks like your program's "
-           "init routine calls back to pthread_once() ?!");
-   }
-
-   /* The flag is set before init_routine runs, and once_masterlock
-      stays held for the whole of init_routine. */
-   if (*once_control == 0) {
-      *once_control = 1;
-      init_routine();
-   }
-
-   __pthread_mutex_unlock(&once_masterlock);
-
-   return 0;
-}
-
-
-/* ---------------------------------------------------
-   MISC
-   ------------------------------------------------ */
-
-static pthread_mutex_t pthread_atfork_lock 
-   = PTHREAD_MUTEX_INITIALIZER;
-
-/* Register a (prepare, parent, child) handler triple by appending it
-   to the scheduler-held fork-handler stack.  The whole update is done
-   under pthread_atfork_lock.  Always returns 0; a full stack is
-   reported through barf(). */
-int __pthread_atfork ( void (*prepare)(void),
-                       void (*parent)(void),
-                       void (*child)(void) )
-{
-   int n, res;
-   ForkHandlerEntry entry;
-
-   ensure_valgrind("pthread_atfork");
-   __pthread_mutex_lock(&pthread_atfork_lock);
-
-   /* Fetch old counter */
-   VALGRIND_MAGIC_SEQUENCE(n, -2 /* default */,
-                           VG_USERREQ__GET_FHSTACK_USED,
-                           0, 0, 0, 0);
-   my_assert(n >= 0 && n < VG_N_FORKHANDLERSTACK);
-   /* The last slot is never handed out: reaching it is treated as
-      "stack too small". */
-   if (n == VG_N_FORKHANDLERSTACK-1)
-      barf("pthread_atfork: VG_N_FORKHANDLERSTACK is too low; "
-           "increase and recompile");
-
-   /* Add entry */
-   entry.prepare = *prepare;
-   entry.parent  = *parent;
-   entry.child   = *child;   
-   VALGRIND_MAGIC_SEQUENCE(res, -2 /* default */,
-                           VG_USERREQ__SET_FHSTACK_ENTRY,
-                           n, &entry, 0, 0);
-   my_assert(res == 0);
-
-   /* Bump counter */
-   VALGRIND_MAGIC_SEQUENCE(res, -2 /* default */,
-                           VG_USERREQ__SET_FHSTACK_USED,
-                           n+1, 0, 0, 0);
-   my_assert(res == 0);
-
-   __pthread_mutex_unlock(&pthread_atfork_lock);
-   return 0;
-}
-
-
-/* Weak definition whose only action is the ensure_valgrind() check. */
-__attribute__((weak)) 
-void __pthread_initialize ( void )
-{
-   ensure_valgrind("__pthread_initialize");
-}
-
-
-/* ---------------------------------------------------
-   LIBRARY-PRIVATE THREAD SPECIFIC STATE
-   ------------------------------------------------ */
-
-#include <resolv.h>
-static int thread_specific_errno[VG_N_THREADS];
-static int thread_specific_h_errno[VG_N_THREADS];
-static struct __res_state
-           thread_specific_res_state[VG_N_THREADS];
-
-/* Return the address of the calling thread's slot in
-   thread_specific_errno[].  Unlike most entry points in this file no
-   ensure_valgrind() call is made; the thread-id request falls back
-   to 1 when it yields nothing. */
-int* __errno_location ( void )
-{
-   int me;
-   VALGRIND_MAGIC_SEQUENCE(me, 1 /* default */,
-                           VG_USERREQ__PTHREAD_GET_THREADID,
-                           0, 0, 0, 0);
-   /* Never index outside the per-thread table. */
-   if (!(me >= 1 && me < VG_N_THREADS))
-      barf("__errno_location: invalid ThreadId");
-   return thread_specific_errno + me;
-}
-
-/* Return the address of the calling thread's slot in
-   thread_specific_h_errno[].  No ensure_valgrind() call is made; the
-   thread-id request falls back to 1 when it yields nothing. */
-int* __h_errno_location ( void )
-{
-   int me;
-   VALGRIND_MAGIC_SEQUENCE(me, 1 /* default */,
-                           VG_USERREQ__PTHREAD_GET_THREADID,
-                           0, 0, 0, 0);
-   /* Never index outside the per-thread table. */
-   if (!(me >= 1 && me < VG_N_THREADS))
-      barf("__h_errno_location: invalid ThreadId");
-   return thread_specific_h_errno + me;
-}
-
-/* Return the address of the calling thread's slot in
-   thread_specific_res_state[].  No ensure_valgrind() call is made;
-   the thread-id request falls back to 1 when it yields nothing. */
-struct __res_state* __res_state ( void )
-{
-   int me;
-   VALGRIND_MAGIC_SEQUENCE(me, 1 /* default */,
-                           VG_USERREQ__PTHREAD_GET_THREADID,
-                           0, 0, 0, 0);
-   /* Never index outside the per-thread table. */
-   if (!(me >= 1 && me < VG_N_THREADS))
-      barf("__res_state: invalid ThreadId");
-   return thread_specific_res_state + me;
-}
-
-
-/* ---------------------------------------------------
-   LIBC-PRIVATE SPECIFIC DATA
-   ------------------------------------------------ */
-
-/* Relies on assumption that initial private data is NULL.  This
-   should be fixed somehow. */
-
-/* The allowable keys (indices) (all 2 of them). 
-   From sysdeps/pthread/bits/libc-tsd.h
-*/
-#define N_LIBC_TSD_EXTRA_KEYS 1
-
-enum __libc_tsd_key_t { _LIBC_TSD_KEY_MALLOC = 0,
-                        _LIBC_TSD_KEY_DL_ERROR,
-                        _LIBC_TSD_KEY_N };
-
-/* Auto-initialising subsystem.  libc_specifics_inited is set 
-   after initialisation.  libc_specifics_inited_mx guards it. */
-static int             libc_specifics_inited    = 0;
-static pthread_mutex_t libc_specifics_inited_mx = PTHREAD_MUTEX_INITIALIZER;
-
-/* These are the keys we must initialise the first time. */
-static pthread_key_t libc_specifics_keys[_LIBC_TSD_KEY_N
-                                         + N_LIBC_TSD_EXTRA_KEYS];
-
-/* Create the libc-private TSD keys on first use.  Callers are
-   serialised by libc_specifics_inited_mx, and libc_specifics_inited
-   records that the keys already exist, so repeat calls do nothing. */
-static
-void init_libc_tsd_keys ( void )
-{
-   int           rc, slot;
-   pthread_key_t fresh;
-
-   rc = pthread_mutex_lock(&libc_specifics_inited_mx);
-   if (rc != 0) barf("init_libc_tsd_keys: lock");
-
-   if (!libc_specifics_inited) {
-      /* Mark as done first, then fill every slot with a new key. */
-      libc_specifics_inited = 1;
-      slot = 0;
-      while (slot < _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS) {
-         rc = pthread_key_create(&fresh, NULL);
-         if (rc != 0) barf("init_libc_tsd_keys: create");
-         libc_specifics_keys[slot] = fresh;
-         slot++;
-      }
-   }
-
-   rc = pthread_mutex_unlock(&libc_specifics_inited_mx);
-   if (rc != 0) barf("init_libc_tsd_keys: unlock");
-}
-
-
-/* Store `pointer' as the calling thread's value for libc-private key
-   `key'.  Out-of-range keys are fatal; keys in the "extra" range get
-   a warning on stderr for the first N_MOANS occurrences.  Returns 0. */
-static int
-libc_internal_tsd_set ( enum __libc_tsd_key_t key, 
-                        const void * pointer )
-{
-   static int moans = N_MOANS;
-   int        key_ok;
-
-   key_ok = key >= _LIBC_TSD_KEY_MALLOC
-            && key < _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS;
-   if (!key_ok)
-      barf("libc_internal_tsd_set: invalid key");
-   if (key >= _LIBC_TSD_KEY_N && moans-- > 0)
-      fprintf(stderr, 
-         "valgrind's libpthread.so: libc_internal_tsd_set: "
-         "dubious key %d\n", key);
-   /* Make sure the backing pthread keys exist before using one. */
-   init_libc_tsd_keys();
-   if (pthread_setspecific(libc_specifics_keys[key], pointer) != 0)
-      barf("libc_internal_tsd_set: setspecific failed");
-   return 0;
-}
-
-/* Fetch the calling thread's value for libc-private key `key'.
-   Out-of-range keys are fatal; keys in the "extra" range get a
-   warning on stderr for the first N_MOANS occurrences.  A NULL
-   result is returned as-is, not treated as an error. */
-static void *
-libc_internal_tsd_get ( enum __libc_tsd_key_t key )
-{
-   static int moans = N_MOANS;
-   int        key_ok;
-
-   key_ok = key >= _LIBC_TSD_KEY_MALLOC
-            && key < _LIBC_TSD_KEY_N + N_LIBC_TSD_EXTRA_KEYS;
-   if (!key_ok)
-      barf("libc_internal_tsd_get: invalid key");
-   if (key >= _LIBC_TSD_KEY_N && moans-- > 0)
-      fprintf(stderr, 
-         "valgrind's libpthread.so: libc_internal_tsd_get: "
-         "dubious key %d\n", key);
-   /* Make sure the backing pthread keys exist before using one. */
-   init_libc_tsd_keys();
-   return pthread_getspecific(libc_specifics_keys[key]);
-}
-
-
-
-
-int (*__libc_internal_tsd_set)
-    (enum __libc_tsd_key_t key, const void * pointer)
-   = libc_internal_tsd_set;
-
-void* (*__libc_internal_tsd_get)
-      (enum __libc_tsd_key_t key)
-   = libc_internal_tsd_get;
-
-
-/* ---------------------------------------------------------------------
-   These are here (I think) because they are deemed cancellation
-   points by POSIX.  For the moment we'll simply pass the call along
-   to the corresponding thread-unaware (?) libc routine.
-   ------------------------------------------------------------------ */
-
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-
-/* sigaction(): run the cancellation test, then forward to libc's own
-   implementation, which is named __sigaction when GLIBC_2_1 is
-   defined and __libc_sigaction otherwise. */
-#ifdef GLIBC_2_1
-extern
-int __sigaction
-             (int signum, 
-              const struct sigaction *act,  
-              struct  sigaction *oldact);
-#else
-extern
-int __libc_sigaction
-             (int signum, 
-              const struct sigaction *act,  
-              struct  sigaction *oldact);
-#endif
-int sigaction(int signum, 
-              const struct sigaction *act,  
-              struct  sigaction *oldact)
-{
-   __my_pthread_testcancel();
-#  ifdef GLIBC_2_1
-   return __sigaction(signum, act, oldact);
-#  else
-   return __libc_sigaction(signum, act, oldact);
-#  endif
-}
-
-
-/* connect(): run the cancellation test, then forward unchanged to
-   __libc_connect. */
-extern
-int  __libc_connect(int  sockfd,  
-                    const  struct  sockaddr  *serv_addr, 
-                    socklen_t addrlen);
-__attribute__((weak))
-int  connect(int  sockfd,  
-             const  struct  sockaddr  *serv_addr, 
-             socklen_t addrlen)
-{
-   __my_pthread_testcancel();
-   return __libc_connect(sockfd, serv_addr, addrlen);
-}
-
-
-/* fcntl(): run the cancellation test, then forward unchanged to
-   __libc_fcntl.  Note the fixed three-argument signature with a
-   `long' third argument. */
-extern
-int __libc_fcntl(int fd, int cmd, long arg);
-__attribute__((weak))
-int fcntl(int fd, int cmd, long arg)
-{
-   __my_pthread_testcancel();
-   return __libc_fcntl(fd, cmd, arg);
-}
-
-
-/* write(): run the cancellation test, then forward unchanged to
-   __libc_write. */
-extern 
-ssize_t __libc_write(int fd, const void *buf, size_t count);
-__attribute__((weak))
-ssize_t write(int fd, const void *buf, size_t count)
-{
-   __my_pthread_testcancel();
-   return __libc_write(fd, buf, count);
-}
-
-
-/* read(): run the cancellation test, then forward unchanged to
-   __libc_read. */
-extern 
-ssize_t __libc_read(int fd, void *buf, size_t count);
-__attribute__((weak))
-ssize_t read(int fd, void *buf, size_t count)
-{
-   __my_pthread_testcancel();
-   return __libc_read(fd, buf, count);
-}
-
- 
-/* open64(): run the cancellation test, then forward unchanged to
-   __libc_open64. */
-extern
-int __libc_open64(const char *pathname, int flags, mode_t mode);
-__attribute__((weak))
-int open64(const char *pathname, int flags, mode_t mode)
-{
-   __my_pthread_testcancel();
-   return __libc_open64(pathname, flags, mode);
-}
-
-
-/* open(): run the cancellation test, then forward unchanged to
-   __libc_open. */
-extern
-int __libc_open(const char *pathname, int flags, mode_t mode);
-__attribute__((weak))
-int open(const char *pathname, int flags, mode_t mode)
-{
-   __my_pthread_testcancel();
-   return __libc_open(pathname, flags, mode);
-}
-
-
-/* close(): run the cancellation test, then forward unchanged to
-   __libc_close. */
-extern
-int __libc_close(int fd);
-__attribute__((weak))
-int close(int fd)
-{
-   __my_pthread_testcancel();
-   return __libc_close(fd);
-}
-
-
-/* accept(): run the cancellation test, wait via
-   wait_for_fd_to_be_readable_or_erring() on `s', run the cancellation
-   test again, and only then forward to __libc_accept. */
-extern
-int __libc_accept(int s, struct sockaddr *addr, socklen_t *addrlen);
-__attribute__((weak))
-int accept(int s, struct sockaddr *addr, socklen_t *addrlen)
-{
-   __my_pthread_testcancel();
-   wait_for_fd_to_be_readable_or_erring(s);
-   __my_pthread_testcancel();
-   return __libc_accept(s, addr, addrlen);
-}
-
-
-/* waitpid(): run the cancellation test, then forward unchanged to
-   __libc_waitpid. */
-extern
-pid_t __libc_waitpid(pid_t pid, int *status, int options);
-__attribute__((weak))
-pid_t waitpid(pid_t pid, int *status, int options)
-{
-   __my_pthread_testcancel();
-   return __libc_waitpid(pid, status, options);
-}
-
-
-/* nanosleep(): run the cancellation test, then forward unchanged to
-   __libc_nanosleep. */
-extern
-int __libc_nanosleep(const struct timespec *req, struct timespec *rem);
-__attribute__((weak))
-int nanosleep(const struct timespec *req, struct timespec *rem)
-{
-   __my_pthread_testcancel();
-   return __libc_nanosleep(req, rem);
-}
-
-
-/* fsync(): run the cancellation test, then forward unchanged to
-   __libc_fsync. */
-extern
-int __libc_fsync(int fd);
-__attribute__((weak))
-int fsync(int fd)
-{
-   __my_pthread_testcancel();
-   return __libc_fsync(fd);
-}
-
-
-/* lseek(): run the cancellation test, then forward unchanged to
-   __libc_lseek. */
-extern
-off_t __libc_lseek(int fildes, off_t offset, int whence);
-__attribute__((weak))
-off_t lseek(int fildes, off_t offset, int whence)
-{
-   __my_pthread_testcancel();
-   return __libc_lseek(fildes, offset, whence);
-}
-
-
-/* lseek64(): run the cancellation test, then forward unchanged to
-   __libc_lseek64. */
-extern
-__off64_t __libc_lseek64(int fildes, __off64_t offset, int whence);
-__attribute__((weak))
-__off64_t lseek64(int fildes, __off64_t offset, int whence)
-{
-   __my_pthread_testcancel();
-   return __libc_lseek64(fildes, offset, whence);
-}
-
-
-/* __pread64(): run the cancellation test, then forward unchanged to
-   __libc_pread64.  Unlike most wrappers here this one is not weak. */
-extern 
-ssize_t __libc_pread64 (int __fd, void *__buf, size_t __nbytes,
-                        __off64_t __offset);
-ssize_t __pread64 (int __fd, void *__buf, size_t __nbytes,
-                   __off64_t __offset)
-{
-   __my_pthread_testcancel();
-   return __libc_pread64(__fd, __buf, __nbytes, __offset);
-}
-
-
-/* __pwrite64(): run the cancellation test, then forward unchanged to
-   __libc_pwrite64.  Unlike most wrappers here this one is not weak. */
-extern
-ssize_t __libc_pwrite64 (int __fd, const void *__buf, size_t __nbytes,
-                        __off64_t __offset);
-ssize_t __pwrite64 (int __fd, const void *__buf, size_t __nbytes,
-                   __off64_t __offset)
-{
-   __my_pthread_testcancel();
-   return __libc_pwrite64(__fd, __buf, __nbytes, __offset);
-}
-
-
-/* pwrite(): run the cancellation test, then forward unchanged to
-   __libc_pwrite. */
-extern 
-ssize_t __libc_pwrite(int fd, const void *buf, size_t count, off_t offset);
-__attribute__((weak))
-ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)
-{
-   __my_pthread_testcancel();
-   return __libc_pwrite(fd, buf, count, offset);
-}
-
-
-/* pread(): run the cancellation test, then forward unchanged to
-   __libc_pread. */
-extern 
-ssize_t __libc_pread(int fd, void *buf, size_t count, off_t offset);
-__attribute__((weak))
-ssize_t pread(int fd, void *buf, size_t count, off_t offset)
-{
-   __my_pthread_testcancel();
-   return __libc_pread(fd, buf, count, offset);
-}
-
-
-/* longjmp(): a plain pass-through to __libc_longjmp.  No cancellation
-   test is made, and the definition is intentionally strong. */
-extern  
-void __libc_longjmp(jmp_buf env, int val) __attribute((noreturn));
-/* not weak: __attribute__((weak)) */
-void longjmp(jmp_buf env, int val)
-{
-   __libc_longjmp(env, val);
-}
-
-
-/* siglongjmp(): report through kludged() that cleanup handlers are
-   ignored, then pass through to __libc_siglongjmp. */
-extern void __libc_siglongjmp (sigjmp_buf env, int val)
-                               __attribute__ ((noreturn));
-void siglongjmp(sigjmp_buf env, int val)
-{
-   kludged("siglongjmp (cleanup handlers are ignored)");
-   __libc_siglongjmp(env, val);
-}
-
-
-/* send(): run the cancellation test, then forward unchanged to
-   __libc_send. */
-extern
-int __libc_send(int s, const void *msg, size_t len, int flags);
-__attribute__((weak))
-int send(int s, const void *msg, size_t len, int flags)
-{
-   __my_pthread_testcancel();
-   return __libc_send(s, msg, len, flags);
-}
-
-
-/* recv(): run the cancellation test, wait via
-   wait_for_fd_to_be_readable_or_erring() on `s', run the cancellation
-   test again, and only then forward to __libc_recv. */
-extern
-int __libc_recv(int s, void *buf, size_t len, int flags);
-__attribute__((weak))
-int recv(int s, void *buf, size_t len, int flags)
-{
-   __my_pthread_testcancel();
-   wait_for_fd_to_be_readable_or_erring(s);
-   __my_pthread_testcancel();
-   return __libc_recv(s, buf, len, flags);
-}
-
-
-/* sendmsg(): run the cancellation test, then forward unchanged to
-   __libc_sendmsg. */
-extern 
-int __libc_sendmsg(int s, const struct msghdr *msg, int flags);
-__attribute__((weak))
-int sendmsg(int s, const struct msghdr *msg, int flags)
-{
-   __my_pthread_testcancel();
-   return __libc_sendmsg(s, msg, flags);
-}
-
-
-/* recvmsg(): run the cancellation test, then forward unchanged to
-   __libc_recvmsg.  Unlike recv() and recvfrom(), no readability wait
-   is done first. */
-extern
-int __libc_recvmsg(int s, struct msghdr *msg, int flags);
-__attribute__((weak))
-int recvmsg(int s, struct msghdr *msg, int flags)
-{
-   __my_pthread_testcancel();
-   return __libc_recvmsg(s, msg, flags);
-}
-
-
-/* recvfrom(): run the cancellation test, wait via
-   wait_for_fd_to_be_readable_or_erring() on `s', run the cancellation
-   test again, and only then forward to __libc_recvfrom. */
-extern
-int __libc_recvfrom(int s, void *buf, size_t len, int flags,
-                    struct sockaddr *from, socklen_t *fromlen);
-__attribute__((weak))
-int recvfrom(int s, void *buf, size_t len, int flags,
-             struct sockaddr *from, socklen_t *fromlen)
-{
-   __my_pthread_testcancel();
-   wait_for_fd_to_be_readable_or_erring(s);
-   __my_pthread_testcancel();
-   return __libc_recvfrom(s, buf, len, flags, from, fromlen);
-}
-
-
-/* sendto(): run the cancellation test, then forward unchanged to
-   __libc_sendto. */
-extern
-int __libc_sendto(int s, const void *msg, size_t len, int flags, 
-                  const struct sockaddr *to, socklen_t tolen);
-__attribute__((weak))
-int sendto(int s, const void *msg, size_t len, int flags, 
-           const struct sockaddr *to, socklen_t tolen)
-{
-   __my_pthread_testcancel();
-   return __libc_sendto(s, msg, len, flags, to, tolen);
-}
-
-
-/* system(): run the cancellation test, then forward unchanged to
-   __libc_system. */
-extern 
-int __libc_system(const char* str);
-__attribute__((weak))
-int system(const char* str)
-{
-   __my_pthread_testcancel();
-   return __libc_system(str);
-}
-
-
-/* wait(): run the cancellation test, then forward unchanged to
-   __libc_wait. */
-extern
-pid_t __libc_wait(int *status);
-__attribute__((weak))
-pid_t wait(int *status)
-{
-   __my_pthread_testcancel();
-   return __libc_wait(status);
-}
-
-
-/* msync(): run the cancellation test, then forward unchanged to
-   __libc_msync. */
-extern
-int __libc_msync(const void *start, size_t length, int flags);
-__attribute__((weak))
-int msync(const void *start, size_t length, int flags)
-{
-   __my_pthread_testcancel();
-   return __libc_msync(start, length, flags);
-}
-
-
-/*--- fork and its helper ---*/
-
-/* Run one kind of registered atfork handler: what == 0 selects the
-   prepare handlers, 1 the parent handlers and 2 the child handlers.
-   Once parent or child handlers have been run, the handler stack's
-   used-count is reset to zero. */
-static
-void run_fork_handlers ( int what )
-{
-   ForkHandlerEntry entry;
-   int n_h, n_handlers, i, res;
-
-   my_assert(what == 0 || what == 1 || what == 2);
-
-   /* How many handlers are currently registered? */
-   VALGRIND_MAGIC_SEQUENCE(n_handlers, -2 /* default */,
-                           VG_USERREQ__GET_FHSTACK_USED,
-                           0, 0, 0, 0);
-   my_assert(n_handlers >= 0 && n_handlers < VG_N_FORKHANDLERSTACK);
-
-   for (i = 0; i < n_handlers; i++) {
-      /* Prepare handlers run in the reverse of registration order;
-         parent and child handlers run in registration order. */
-      n_h = (what == 0) ? (n_handlers - 1 - i) : i;
-
-      VALGRIND_MAGIC_SEQUENCE(res, -2 /* default */,
-                              VG_USERREQ__GET_FHSTACK_ENTRY,
-                              n_h, &entry, 0, 0);
-      my_assert(res == 0);
-
-      /* A NULL slot in the entry simply means "no handler". */
-      if (what == 0) {
-         if (entry.prepare) entry.prepare();
-      } else if (what == 1) {
-         if (entry.parent) entry.parent();
-      } else if (what == 2) {
-         if (entry.child) entry.child();
-      } else {
-         barf("run_fork_handlers: invalid what");
-      }
-   }
-
-   if (what == 0 /* prepare */)
-      return;
-
-   /* Empty out the stack. */
-   VALGRIND_MAGIC_SEQUENCE(res, -2 /* default */,
-                           VG_USERREQ__SET_FHSTACK_USED,
-                           0, 0, 0, 0);
-   my_assert(res == 0);
-}
-
-/* fork() with atfork support.  pthread_atfork_lock is taken, the
-   prepare handlers run, and __libc_fork does the real work.  Any
-   non-zero result (including a failure value) takes the parent path:
-   parent handlers, then unlock.  The child runs the child handlers
-   and re-initialises the lock instead of unlocking it. */
-extern
-pid_t __libc_fork(void);
-pid_t __fork(void)
-{
-   pid_t child_pid;
-   __my_pthread_testcancel();
-   __pthread_mutex_lock(&pthread_atfork_lock);
-
-   run_fork_handlers(0 /* prepare */);
-   child_pid = __libc_fork();
-   if (child_pid != 0) {
-      /* I am the parent */
-      run_fork_handlers(1 /* parent */);
-      __pthread_mutex_unlock(&pthread_atfork_lock);
-      return child_pid;
-   }
-
-   /* I am the child */
-   run_fork_handlers(2 /* child */);
-   __pthread_mutex_init(&pthread_atfork_lock, NULL);
-   return child_pid;
-}
-
-
-
-
-/* ---------------------------------------------------------------------
-   Nonblocking implementations of select() and poll().  This stuff will
-   surely rot your mind.
-   ------------------------------------------------------------------ */
-
-/*--------------------------------------------------*/
-
-#include "vg_kerneliface.h"
-
-/* Return 1 if `res' lies in -4095 .. -1, the range this file treats
-   as a kernel error return, and 0 otherwise. */
-static
-__inline__
-int is_kerror ( int res )
-{
-   return (res <= -1 && res >= -4095) ? 1 : 0;
-}
-
-
-/* Make a one-argument Linux/x86 system call through int $0x80 and
-   return the kernel's raw result (see is_kerror for the error
-   range).  The argument arrives in %edx and is moved into %ebx by
-   hand, with %ebx saved and restored around the trap. */
-static
-int my_do_syscall1 ( int syscallno, int arg1 )
-{ 
-   int __res;
-   /* "memory" clobber: the kernel may read or write memory reachable
-      through the argument (e.g. an argument block), so gcc must
-      complete pending stores before the trap and must not reuse
-      cached values after it. */
-   __asm__ volatile ("pushl %%ebx; movl %%edx,%%ebx ; int $0x80 ; popl %%ebx"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "d" (arg1)
-                     : "memory" );
-   return __res;
-}
-
-/* Make a two-argument Linux/x86 system call through int $0x80 and
-   return the kernel's raw result (see is_kerror for the error
-   range).  arg1 arrives in %edx and is moved into %ebx by hand, with
-   %ebx saved and restored around the trap; arg2 is in %ecx. */
-static
-int my_do_syscall2 ( int syscallno, 
-                     int arg1, int arg2 )
-{ 
-   int __res;
-   /* "memory" clobber: the kernel may read or write memory reachable
-      through the arguments (e.g. a timespec), so gcc must complete
-      pending stores before the trap and must not reuse cached values
-      after it. */
-   __asm__ volatile ("pushl %%ebx; movl %%edx,%%ebx ; int $0x80 ; popl %%ebx"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "d" (arg1),
-                       "c" (arg2)
-                     : "memory" );
-   return __res;
-}
-
-/* Make a three-argument Linux/x86 system call through int $0x80 and
-   return the kernel's raw result (see is_kerror for the error
-   range).  arg1 arrives in %esi and is moved into %ebx by hand, with
-   %ebx saved and restored around the trap; arg2 is in %ecx and arg3
-   in %edx. */
-static
-int my_do_syscall3 ( int syscallno, 
-                     int arg1, int arg2, int arg3 )
-{ 
-   int __res;
-   /* "memory" clobber: the kernel may read or write memory reachable
-      through the arguments (e.g. a pollfd array), so gcc must
-      complete pending stores before the trap and must not reuse
-      cached values after it. */
-   __asm__ volatile ("pushl %%ebx; movl %%esi,%%ebx ; int $0x80 ; popl %%ebx"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "S" (arg1),
-                       "c" (arg2),
-                       "d" (arg3)
-                     : "memory" );
-   return __res;
-}
-
-/* Invoke the select system call directly.  The five arguments are
-   packed into an int array whose address is passed as the single
-   syscall argument.  Returns the kernel's raw result. */
-static
-int do_syscall_select( int n, 
-                       vki_fd_set* readfds, 
-                       vki_fd_set* writefds, 
-                       vki_fd_set* exceptfds, 
-                       struct vki_timeval * timeout )
-{
-   int packed[5];
-
-   packed[0] = n;
-   packed[1] = (int)readfds;
-   packed[2] = (int)writefds;
-   packed[3] = (int)exceptfds;
-   packed[4] = (int)timeout;
-
-   return my_do_syscall1(__NR_select, (int)packed);
-}
-
-
-/* This is a wrapper round select(), which makes it thread-safe,
-   meaning that only this thread will block, rather than the entire
-   process.  This wrapper in turn depends on nanosleep() not to block
-   the entire process, but I think (hope? suspect?) that POSIX
-   pthreads guarantees that to be the case.
-
-   Basic idea is: modify the timeout parameter to select so that it
-   returns immediately.  Poll like this until select returns non-zero,
-   indicating something interesting happened, or until our time is up.
-   Space out the polls with nanosleeps of 50 milliseconds; nanosleep
-   is required to be nonblocking, which allows other threads to run.
-
-   Assumes:
-   * (checked via my_assert) types fd_set and vki_fd_set are identical.
-   * (checked via my_assert) types timeval and vki_timeval are identical.
-   * (unchecked) libc error numbers (EINTR etc) are the negation of the
-     kernel's error numbers (VKI_EINTR etc).
-*/
-
-/* __attribute__((weak)) */
-/* Thread-friendly select().  Rather than blocking in the kernel, it
-   issues zero-timeout selects on copies of the caller's sets,
-   separated by 50ms nanosleeps, until an fd is ready, an error
-   occurs, the sleep is interrupted by a signal, or the timeout
-   expires.
-
-   n, rfds, wfds, xfds - as for select(2); any set may be NULL.
-   timeout             - NULL waits indefinitely; a zero timeout (or
-                         not running on Valgrind) goes straight to
-                         the kernel.
-
-   Returns the number of ready fds (sets updated), 0 on timeout (sets
-   zeroed), or -1 with errno set. */
-int select ( int n, 
-             fd_set *rfds, 
-             fd_set *wfds, 
-             fd_set *xfds, 
-             struct timeval *timeout )
-{
-   unsigned int ms_now, ms_end;
-   int    res;
-   fd_set rfds_copy;
-   fd_set wfds_copy;
-   fd_set xfds_copy;
-   struct vki_timeval  zero_timeout;
-   struct vki_timespec nanosleep_interval;
-
-   __my_pthread_testcancel();
-
-   /* gcc complains about ms_end being used uninitialised -- classic
-      case it can't understand, where ms_end is both defined and used
-      only if timeout != NULL.  Hence ... */
-   ms_end = 0;
-
-   /* We assume that the kernel and libc data layouts are identical
-      for the following types.  These asserts provide a crude
-      check. */
-   if (sizeof(fd_set) != sizeof(vki_fd_set)
-       || sizeof(struct timeval) != sizeof(struct vki_timeval))
-      barf("valgrind's hacky non-blocking select(): data sizes error");
-
-   /* Detect the current time and simultaneously find out if we are
-      running on Valgrind. */
-   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__READ_MILLISECOND_TIMER,
-                           0, 0, 0, 0);
-
-   /* If a zero timeout specified, this call is harmless.  Also go
-      this route if we're not running on Valgrind, for whatever
-      reason. */
-   if ( (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0)
-        || (ms_now == 0xFFFFFFFF) ) {
-      res = do_syscall_select( n, (vki_fd_set*)rfds, 
-                                   (vki_fd_set*)wfds, 
-                                   (vki_fd_set*)xfds, 
-                                   (struct vki_timeval*)timeout);
-      if (is_kerror(res)) {
-         * (__errno_location()) = -res;
-         return -1;
-      } else {
-         return res;
-      }
-   }
-
-   /* If a timeout was specified, set ms_end to be the end millisecond
-      counter time.  Only the scheduler's millisecond timer (ms_now)
-      is needed for this, so no gettimeofday call is made. */
-   if (timeout) {
-      ms_end = ms_now;
-      ms_end += (timeout->tv_usec / 1000);
-      ms_end += (timeout->tv_sec * 1000);
-      /* Stay sane ...  (this trips if the sum wraps around 32 bits) */
-      my_assert (ms_end >= ms_now);
-   }
-
-   /* fprintf(stderr, "MY_SELECT: before loop\n"); */
-
-   /* Either timeout == NULL, meaning wait indefinitely, or timeout !=
-      NULL, in which case ms_end holds the end time. */
-
-   while (1) {
-
-      /* First, do a return-immediately select(). */
-
-      /* These could be trashed each time round the loop, so restore
-         them each time. */
-      if (rfds) rfds_copy = *rfds;
-      if (wfds) wfds_copy = *wfds;
-      if (xfds) xfds_copy = *xfds;
-
-      zero_timeout.tv_sec = zero_timeout.tv_usec = 0;
-
-      res = do_syscall_select( n, 
-                               rfds ? (vki_fd_set*)(&rfds_copy) : NULL,
-                               wfds ? (vki_fd_set*)(&wfds_copy) : NULL,
-                               xfds ? (vki_fd_set*)(&xfds_copy) : NULL,
-                               & zero_timeout );
-      if (is_kerror(res)) {
-         /* Some kind of error (including EINTR).  Set errno and
-            return.  The sets are unspecified in this case. */
-         * (__errno_location()) = -res;
-         return -1;
-      }
-      if (res > 0) {
-         /* one or more fds is ready.  Copy out resulting sets and
-            return. */
-         if (rfds) *rfds = rfds_copy;
-         if (wfds) *wfds = wfds_copy;
-         if (xfds) *xfds = xfds_copy;
-         return res;
-      }
-
-      /* Nothing interesting happened, so we go to sleep for a
-         while. */
-
-      /* fprintf(stderr, "MY_SELECT: nanosleep\n"); */
-      /* nanosleep and go round again */
-      nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 50 * 1000 * 1000; /* 50 milliseconds */
-      /* It's critical here that valgrind's nanosleep implementation
-         is nonblocking. */
-      res = my_do_syscall2(__NR_nanosleep, 
-                           (int)(&nanosleep_interval), (int)NULL);
-      if (res == -VKI_EINTR) {
-         /* The nanosleep was interrupted by a signal.  So we do the
-            same. */
-         * (__errno_location()) = EINTR;
-         return -1;
-      }
-
-      /* Sleeping finished.  If a finite timeout, check to see if it
-         has expired yet. */
-      if (timeout) {
-         VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                                 VG_USERREQ__READ_MILLISECOND_TIMER,
-                                 0, 0, 0, 0);
-         my_assert(ms_now != 0xFFFFFFFF);
-         if (ms_now >= ms_end) {
-            /* timeout; nothing interesting happened. */
-            if (rfds) FD_ZERO(rfds);
-            if (wfds) FD_ZERO(wfds);
-            if (xfds) FD_ZERO(xfds);
-            return 0;
-         }
-      }
-
-   }
-}
-
-
-
-
-#include <sys/poll.h>
-
-#ifndef HAVE_NFDS_T
-typedef unsigned long int nfds_t;
-#endif
-
-
-/* __attribute__((weak)) */
-/* Thread-friendly poll().  Rather than blocking in the kernel, it
-   issues zero-timeout polls separated by 51ms nanosleeps until an fd
-   is ready, an error occurs, the sleep is interrupted by a signal,
-   or the timeout expires.
-
-   __fds, __nfds - as for poll(2).
-   __timeout     - milliseconds; negative waits indefinitely; zero
-                   (or not running on Valgrind) goes straight to the
-                   kernel.
-
-   Returns the number of ready fds, 0 on timeout (all revents
-   cleared), or -1 with errno set. */
-int poll (struct pollfd *__fds, nfds_t __nfds, int __timeout)
-{
-   unsigned int        ms_now, ms_end;
-   int                 res;
-   nfds_t              i;   /* same type as __nfds: no mixed-sign compare */
-   struct vki_timespec nanosleep_interval;
-
-   __my_pthread_testcancel();
-   ensure_valgrind("poll");
-
-   /* Detect the current time and simultaneously find out if we are
-      running on Valgrind. */
-   VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                           VG_USERREQ__READ_MILLISECOND_TIMER,
-                           0, 0, 0, 0);
-
-   if (/* CHECK SIZES FOR struct pollfd */
-       sizeof(struct timeval) != sizeof(struct vki_timeval))
-      barf("valgrind's hacky non-blocking poll(): data sizes error");
-
-   /* dummy initialisation to keep gcc -Wall happy */
-   ms_end = 0;
-
-   /* If a zero timeout specified, this call is harmless.  Also do
-      this if not running on Valgrind. */
-   if (__timeout == 0 || ms_now == 0xFFFFFFFF) {
-      res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, __timeout);
-      if (is_kerror(res)) {
-         * (__errno_location()) = -res;
-         return -1;
-      } else {
-         return res;
-      }
-   }
-
-   /* If a timeout was specified, set ms_end to be the end wallclock
-      time.  Easy considering that __timeout is in milliseconds. */
-   if (__timeout > 0) {
-      ms_end = ms_now + (unsigned int)__timeout;
-   }
-
-   /* fprintf(stderr, "MY_POLL: before loop\n"); */
-
-   /* Either timeout < 0, meaning wait indefinitely, or timeout > 0,
-      in which case ms_end holds the end time. */
-
-   my_assert(__timeout != 0);
-
-   while (1) {
-
-      /* Do a return-immediately poll. */
-
-      res = my_do_syscall3(__NR_poll, (int)__fds, __nfds, 0 );
-      if (is_kerror(res)) {
-         /* Some kind of error.  Set errno and return.  */
-         * (__errno_location()) = -res;
-         return -1;
-      }
-      if (res > 0) {
-         /* One or more fds is ready.  Return now. */
-         return res;
-      }
-
-      /* Nothing interesting happened, so we go to sleep for a
-         while. */
-
-      /* fprintf(stderr, "MY_POLL: nanosleep\n"); */
-      /* nanosleep and go round again */
-      nanosleep_interval.tv_sec  = 0;
-      nanosleep_interval.tv_nsec = 51 * 1000 * 1000; /* 51 milliseconds */
-      /* It's critical here that valgrind's nanosleep implementation
-         is nonblocking. */
-      res = my_do_syscall2(__NR_nanosleep, 
-                           (int)(&nanosleep_interval), (int)NULL);
-      if (res == -VKI_EINTR) {
-         /* The nanosleep was interrupted by a signal.  Report that to
-            the caller, just as select() in this file does, instead of
-            silently carrying on polling. */
-         * (__errno_location()) = EINTR;
-         return -1;
-      }
-
-      /* Sleeping finished.  If a finite timeout, check to see if it
-         has expired yet. */
-      if (__timeout > 0) {
-         VALGRIND_MAGIC_SEQUENCE(ms_now, 0xFFFFFFFF /* default */,
-                                 VG_USERREQ__READ_MILLISECOND_TIMER,
-                                 0, 0, 0, 0);
-         my_assert(ms_now != 0xFFFFFFFF);
-         if (ms_now >= ms_end) {
-            /* timeout; nothing interesting happened. */
-            for (i = 0; i < __nfds; i++) 
-               __fds[i].revents = 0;
-            return 0;
-         }
-      }
-
-   }
-}
-
-
-/* Helper function used to make accept() non-blocking.  Idea is to use
-   the above nonblocking poll() to make this thread ONLY wait for the
-   specified fd to become ready, and then return. */
-
-/* Sigh -- a hack.  We're not supposed to include this file directly;
-   should do it via /usr/include/fcntl.h, but that introduces a
-   varargs prototype for fcntl itself, which we can't mimic. */
-#define _FCNTL_H
-#include <bits/fcntl.h>
-
-/* Block the calling thread, via the poll() defined in this file,
-   until fd has input or an error condition pending.  Returns at once
-   if fcntl(F_GETFL) fails on fd or if fd is in nonblocking mode. */
-static void wait_for_fd_to_be_readable_or_erring ( int fd )
-{
-   struct pollfd waiter;
-   int           fl_flags = __libc_fcntl(fd, F_GETFL, 0);
-
-   /* Only a valid, blocking fd needs to be waited on. */
-   if (fl_flags != -1 && (fl_flags & O_NONBLOCK) == 0) {
-      waiter.revents = 0;
-      /* Everything except POLLOUT is of interest. */
-      waiter.events  = POLLIN | POLLPRI | POLLERR | POLLHUP | POLLNVAL;
-      waiter.fd      = fd;
-      (void)poll(&waiter, 1, -1 /* forever */);
-   }
-}
-
-
-/* ---------------------------------------------------------------------
-   Hacky implementation of semaphores.
-   ------------------------------------------------------------------ */
-
-#include <semaphore.h>
-
-/* This is a terrible way to do the remapping.  Plan is to import an
-   AVL tree at some point. */
-
-/* Shadow semaphore: a counter guarded by a mutex, plus a condition
-   variable on which waiters sleep. */
-typedef
-   struct {
-      pthread_mutex_t se_mx;   /* guards count */
-      pthread_cond_t se_cv;    /* waited on while count == 0 */
-      int count;               /* current semaphore value */
-   }
-   vg_sem_t;
-
-/* Held while the remapping tables below are searched or extended. */
-static pthread_mutex_t se_remap_mx = PTHREAD_MUTEX_INITIALIZER;
-
-/* For 0 <= i < se_remap_used, se_remap_orig[i] is a client sem_t
-   address and se_remap_new[i] is the shadow that stands in for it. */
-static int      se_remap_used = 0;
-static sem_t*   se_remap_orig[VG_N_SEMAPHORES];
-static vg_sem_t se_remap_new[VG_N_SEMAPHORES];
-
-/* Map a client sem_t address to its shadow vg_sem_t, allocating a
-   fresh table slot the first time an address is seen.  The lookup is
-   a linear search done with se_remap_mx held.  Calls barf() if all
-   VG_N_SEMAPHORES slots are already taken. */
-static vg_sem_t* se_remap ( sem_t* orig )
-{
-   int res, i;
-   res = __pthread_mutex_lock(&se_remap_mx);
-   my_assert(res == 0);
-
-   /* Look for an existing mapping. */
-   for (i = 0; i < se_remap_used; i++) {
-      if (se_remap_orig[i] == orig)
-         break;
-   }
-   if (i == se_remap_used) {
-      /* Not seen before: claim the next free slot. */
-      if (se_remap_used == VG_N_SEMAPHORES) {
-         /* Use the __-prefixed entry point, as everywhere else in
-            this function. */
-         res = __pthread_mutex_unlock(&se_remap_mx);
-         my_assert(res == 0);
-         barf("VG_N_SEMAPHORES is too low.  Increase and recompile.");
-      }
-      se_remap_used++;
-      se_remap_orig[i] = orig;
-      /* printf("allocated semaphore %d\n", i); */
-   }
-   res = __pthread_mutex_unlock(&se_remap_mx);
-   my_assert(res == 0);
-   return &se_remap_new[i];
-}
-
-
-/* Set up the shadow of *sem with the given initial count.  Only
-   pshared == 0 is supported; any other value reports an error, sets
-   errno to ENOSYS and returns -1.  Returns 0 on success. */
-int sem_init(sem_t *sem, int pshared, unsigned int value)
-{
-   vg_sem_t* shadow;
-   int       res;
-
-   ensure_valgrind("sem_init");
-
-   if (pshared == 0) {
-      shadow = se_remap(sem);
-
-      res = pthread_mutex_init(&shadow->se_mx, NULL);
-      my_assert(res == 0);
-
-      res = pthread_cond_init(&shadow->se_cv, NULL);
-      my_assert(res == 0);
-
-      shadow->count = value;
-      return 0;
-   }
-
-   /* Process-shared semaphores are not handled. */
-   pthread_error("sem_init: unsupported pshared value");
-   errno = ENOSYS;
-   return -1;
-}
-
-
-/* Decrement the semaphore, sleeping on its condition variable for
-   as long as the count is zero.  Always returns 0. */
-int sem_wait ( sem_t* sem ) 
-{
-   vg_sem_t* shadow;
-   int       res;
-
-   ensure_valgrind("sem_wait");
-   shadow = se_remap(sem);
-
-   res = __pthread_mutex_lock(&shadow->se_mx);
-   my_assert(res == 0);
-
-   for (;;) {
-      if (shadow->count != 0)
-         break;
-      /* Nothing to take yet; wait to be woken by a post. */
-      res = pthread_cond_wait(&shadow->se_cv, &shadow->se_mx);
-      my_assert(res == 0);
-   }
-   shadow->count -= 1;
-
-   res = __pthread_mutex_unlock(&shadow->se_mx);
-   my_assert(res == 0);
-   return 0;
-}
-
-/* Increment the semaphore.  If the count was zero beforehand, all
-   threads waiting on the condition variable are woken.  Always
-   returns 0. */
-int sem_post ( sem_t* sem ) 
-{
-   vg_sem_t* shadow;
-   int       res;
-   int       was_zero;
-
-   ensure_valgrind("sem_post");
-   shadow = se_remap(sem);
-
-   res = __pthread_mutex_lock(&shadow->se_mx);
-   my_assert(res == 0);
-
-   was_zero = (shadow->count == 0);
-   shadow->count++;
-   if (was_zero) {
-      /* Waiters can only exist when the count was zero. */
-      res = pthread_cond_broadcast(&shadow->se_cv);
-      my_assert(res == 0);
-   }
-
-   res = __pthread_mutex_unlock(&shadow->se_mx);
-   my_assert(res == 0);
-   return 0;
-}
-
-
-/* Decrement the semaphore if its count is positive and return 0;
-   otherwise leave it alone, set errno to EAGAIN and return -1. */
-int sem_trywait ( sem_t* sem ) 
-{
-   vg_sem_t* shadow;
-   int       res;
-   int       ret = -1;   /* assume failure until a unit is taken */
-
-   ensure_valgrind("sem_trywait");
-   shadow = se_remap(sem);
-
-   res = __pthread_mutex_lock(&shadow->se_mx);
-   my_assert(res == 0);
-
-   if (shadow->count <= 0) {
-      errno = EAGAIN;
-   } else {
-      shadow->count--;
-      ret = 0;
-   }
-
-   res = __pthread_mutex_unlock(&shadow->se_mx);
-   my_assert(res == 0);
-   return ret;
-}
-
-
-/* Store the semaphore's current count in *sval.  Always returns 0.
-   The count is read without taking the semaphore's mutex. */
-int sem_getvalue(sem_t* sem, int * sval)
-{
-   vg_sem_t* vg_sem; 
-   /* Pass this function's own name (it used to say "sem_trywait"). */
-   ensure_valgrind("sem_getvalue");
-   vg_sem = se_remap(sem);
-   *sval = vg_sem->count;
-   return 0;
-}
-
-
-/* Placeholder: reports itself through kludged() and returns 0
-   without touching the shadow semaphore or its remap-table slot. */
-int sem_destroy(sem_t * sem)
-{
-   kludged("sem_destroy");
-   /* if someone waiting on this semaphore, errno = EBUSY, return -1 */
-   /* (the check described above is not implemented here) */
-   return 0;
-}
-
-
-/* ---------------------------------------------------------------------
-   Reader-writer locks.
-   ------------------------------------------------------------------ */
-
-/* Shadow reader-writer lock.  All fields other than mx are read and
-   written by the lock operations with mx held. */
-typedef 
-   struct {
-      int             initted;  /* != 0 --> in use; sanity check only */
-      int             prefer_w; /* != 0 --> prefer writer */
-      int             nwait_r;  /* # of waiting readers */
-      int             nwait_w;  /* # of waiting writers */
-      pthread_cond_t  cv_r;     /* for signalling readers */
-      pthread_cond_t  cv_w;     /* for signalling writers */
-      pthread_mutex_t mx;
-      int             status;
-      /* allowed range for status: >= -1.  -1 means 1 writer currently
-         active, >= 0 means N readers currently active. */
-   } 
-   vg_rwlock_t;
-
-
-/* Held while the remapping tables below are searched or extended. */
-static pthread_mutex_t rw_remap_mx = PTHREAD_MUTEX_INITIALIZER;
-
-/* For 0 <= i < rw_remap_used, rw_remap_orig[i] is a client
-   pthread_rwlock_t address and rw_remap_new[i] is its shadow. */
-static int                 rw_remap_used = 0;
-static pthread_rwlock_t*   rw_remap_orig[VG_N_RWLOCKS];
-static vg_rwlock_t         rw_remap_new[VG_N_RWLOCKS];
-
-
-/* Put a shadow rwlock into its freshly-initialised state: marked in
-   use, writer-preferring, no waiters, nobody holding it, and with
-   its mutex and both condition variables initialised. */
-static 
-void init_vg_rwlock ( vg_rwlock_t* vg_rwl )
-{
-   int res;
-
-   /* Plain bookkeeping fields. */
-   vg_rwl->status   = 0;
-   vg_rwl->nwait_w  = 0;
-   vg_rwl->nwait_r  = 0;
-   vg_rwl->prefer_w = 1;
-   vg_rwl->initted  = 1;
-
-   /* Synchronisation objects; accumulate the results so that a
-      single assertion covers all three. */
-   res = pthread_mutex_init(&vg_rwl->mx, NULL);
-   res = res | pthread_cond_init(&vg_rwl->cv_r, NULL);
-   res = res | pthread_cond_init(&vg_rwl->cv_w, NULL);
-   my_assert(res == 0);
-}
-
-
-/* Take the address of a LinuxThreads rwlock_t and return the shadow
-   address of our version.  Further, if the LinuxThreads version
-   appears to have been statically initialised, do the same to the one
-   we allocate here.  The pthread_rwlock_t.__rw_readers field is set
-   to zero by PTHREAD_RWLOCK_INITIALIZER, so we take zero as meaning
-   uninitialised and non-zero meaning initialised. 
-
-   Calls barf() if all VG_N_RWLOCKS table slots are already taken.
-*/
-static vg_rwlock_t* rw_remap ( pthread_rwlock_t* orig )
-{
-   int          res, i;
-   vg_rwlock_t* vg_rwl;
-   res = __pthread_mutex_lock(&rw_remap_mx);
-   my_assert(res == 0);
-
-   /* Linear search for an existing mapping. */
-   for (i = 0; i < rw_remap_used; i++) {
-      if (rw_remap_orig[i] == orig)
-         break;
-   }
-   if (i == rw_remap_used) {
-      /* First sighting of this address: claim the next slot. */
-      if (rw_remap_used == VG_N_RWLOCKS) {
-         res = __pthread_mutex_unlock(&rw_remap_mx);
-         my_assert(res == 0);
-         barf("VG_N_RWLOCKS is too low.  Increase and recompile.");
-      }
-      rw_remap_used++;
-      rw_remap_orig[i] = orig;
-      rw_remap_new[i].initted = 0;
-      if (0) printf("allocated rwlock %d\n", i);
-   }
-   res = __pthread_mutex_unlock(&rw_remap_mx);
-   my_assert(res == 0);
-   vg_rwl = &rw_remap_new[i];
-
-   /* Initialise the shadow, if required. */
-   /* NOTE(review): this runs after rw_remap_mx has been released --
-      confirm that concurrent first uses of the same lock are safe. */
-   if (orig->__rw_readers == 0) {
-      /* Mark the client's lock so this branch is not taken again. */
-      orig->__rw_readers = 1;
-      init_vg_rwlock(vg_rwl);
-      if (orig->__rw_kind == PTHREAD_RWLOCK_PREFER_READER_NP)
-         vg_rwl->prefer_w = 0;
-   }
-
-   return vg_rwl;
-}
-
-
-/* Initialise the rwlock at orig.  The lock kind is taken from attr
-   when attr is non-NULL, otherwise PTHREAD_RWLOCK_DEFAULT_NP is
-   used.  Always returns 0. */
-int pthread_rwlock_init ( pthread_rwlock_t* orig,
-                          const pthread_rwlockattr_t* attr )
-{
-   if (0) printf ("pthread_rwlock_init\n");
-   /* Force the remapper to initialise the shadow. */
-   orig->__rw_readers = 0;
-   /* Install the lock preference; the remapper needs to know it. */
-   orig->__rw_kind = PTHREAD_RWLOCK_DEFAULT_NP;
-   if (attr)
-      orig->__rw_kind = attr->__lockkind;
-   /* Called only for its side effect of setting up the shadow; the
-      returned pointer is not needed here. */
-   (void) rw_remap ( orig );
-   return 0;
-}
-
-
-/* Cancellation cleanup handler for a reader blocked in
-   pthread_rwlock_rdlock: drop it from the waiting-reader count and
-   release the shadow lock's mutex.  rwl_v is the vg_rwlock_t*. */
-static 
-void pthread_rwlock_rdlock_CANCEL_HDLR ( void* rwl_v )
-{
-   vg_rwlock_t* shadow = rwl_v;
-
-   shadow->nwait_r -= 1;
-   pthread_mutex_unlock (&shadow->mx);
-}
-
-
-/* Acquire the rwlock at orig for reading.  If a writer currently
-   holds it, wait on the readers' condition variable first.  Returns
-   0 on success, or EINVAL if the shadow is not marked initialised. */
-int pthread_rwlock_rdlock ( pthread_rwlock_t* orig )
-{
-   int res;
-   vg_rwlock_t* rwl;
-   if (0) printf ("pthread_rwlock_rdlock\n");
-   rwl = rw_remap ( orig );
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-   if (!rwl->initted) {
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EINVAL;
-   }
-   if (rwl->status < 0) {
-      /* A writer is active: register as a waiting reader. */
-      my_assert(rwl->status == -1);
-      rwl->nwait_r++;
-      /* If cancelled while waiting, the handler undoes the nwait_r
-         increment and releases rwl->mx. */
-      pthread_cleanup_push( pthread_rwlock_rdlock_CANCEL_HDLR, rwl );
-      while (1) {
-         /* NOTE(review): this waits for the lock to be completely
-            free (status == 0), not merely writer-free (status >= 0)
-            -- confirm that is intended. */
-         if (rwl->status == 0) break;
-         res = pthread_cond_wait(&rwl->cv_r, &rwl->mx);
-         my_assert(res == 0);
-      }
-      pthread_cleanup_pop(0);
-      rwl->nwait_r--;
-   }
-   /* No writer active: join the set of active readers. */
-   my_assert(rwl->status >= 0);
-   rwl->status++;
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return 0;
-}
-
-
-/* Try to acquire the rwlock at orig for reading without waiting.
-   Returns 0 on success, EBUSY if a writer holds the lock, or EINVAL
-   if the shadow is not marked initialised. */
-int pthread_rwlock_tryrdlock ( pthread_rwlock_t* orig )
-{
-   int          res;
-   int          outcome;
-   vg_rwlock_t* rwl;
-
-   if (0) printf ("pthread_rwlock_tryrdlock\n");
-   rwl = rw_remap ( orig );
-
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-
-   if (!rwl->initted) {
-      outcome = EINVAL;
-   } else if (rwl->status == -1) {
-      /* Writer active; we have to give up. */
-      outcome = EBUSY;
-   } else {
-      /* Success: become one more active reader. */
-      my_assert(rwl->status >= 0);
-      rwl->status++;
-      outcome = 0;
-   }
-
-   /* Single exit: every path releases the mutex exactly once. */
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return outcome;
-}
-
-
-/* Cancellation cleanup handler for a writer blocked in
-   pthread_rwlock_wrlock: drop it from the waiting-writer count and
-   release the shadow lock's mutex.  rwl_v is the vg_rwlock_t*. */
-static 
-void pthread_rwlock_wrlock_CANCEL_HDLR ( void* rwl_v )
-{
-   vg_rwlock_t* shadow = rwl_v;
-
-   shadow->nwait_w -= 1;
-   pthread_mutex_unlock (&shadow->mx);
-}
-
-
-/* Acquire the rwlock at orig for writing.  If any reader or writer
-   currently holds it, wait on the writers' condition variable until
-   it is free.  Returns 0 on success, or EINVAL if the shadow is not
-   marked initialised. */
-int pthread_rwlock_wrlock ( pthread_rwlock_t* orig )
-{
-   int res;
-   vg_rwlock_t* rwl;
-   if (0) printf ("pthread_rwlock_wrlock\n");
-   rwl = rw_remap ( orig );
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-   if (!rwl->initted) {
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EINVAL;
-   }
-   if (rwl->status != 0) {
-      /* Lock is held: register as a waiting writer. */
-      rwl->nwait_w++;
-      /* If cancelled while waiting, the handler undoes the nwait_w
-         increment and releases rwl->mx. */
-      pthread_cleanup_push( pthread_rwlock_wrlock_CANCEL_HDLR, rwl );
-      while (1) {
-         if (rwl->status == 0) break;
-         res = pthread_cond_wait(&rwl->cv_w, &rwl->mx);
-         my_assert(res == 0);
-      }
-      pthread_cleanup_pop(0);
-      rwl->nwait_w--;
-   }
-   /* Lock is free: mark it as held by one writer. */
-   my_assert(rwl->status == 0);
-   rwl->status = -1;
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return 0;
-}
-
-
-/* Try to acquire the rwlock at orig for writing without waiting.
-   Returns 0 on success, EBUSY if any reader or writer holds the
-   lock, or EINVAL if the shadow is not marked initialised. */
-int pthread_rwlock_trywrlock ( pthread_rwlock_t* orig )
-{
-   int res;
-   vg_rwlock_t* rwl;
-   /* Debug trace (disabled); the name used to be misspelled. */
-   if (0) printf ("pthread_rwlock_trywrlock\n");
-   rwl = rw_remap ( orig );
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-   if (!rwl->initted) {
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EINVAL;
-   }
-   if (rwl->status != 0) {
-      /* Reader(s) or a writer active; we have to give up. */
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EBUSY;
-   }
-   /* Success */
-   my_assert(rwl->status == 0);
-   rwl->status = -1;
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return 0;
-}
-
-
-/* Release one hold on the rwlock at orig: the writer's hold if a
-   writer is active, otherwise one reader's hold.  Afterwards, signal
-   at most one waiter, chosen according to the lock's prefer_w flag.
-   Returns 0 on success, EINVAL if the shadow is not marked
-   initialised, or EPERM if the lock is not held at all. */
-int pthread_rwlock_unlock ( pthread_rwlock_t* orig )
-{
-   int res;
-   vg_rwlock_t* rwl;
-   if (0) printf ("pthread_rwlock_unlock\n");
-   /* One lookup suffices; a second identical call was redundant. */
-   rwl = rw_remap ( orig );
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-   if (!rwl->initted) {
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EINVAL;
-   }
-   if (rwl->status == 0) {
-      /* Nobody holds the lock, so there is nothing to release. */
-      res = __pthread_mutex_unlock(&rwl->mx);
-      my_assert(res == 0);
-      return EPERM;
-   }
-   my_assert(rwl->status != 0);
-   if (rwl->status == -1) {
-     /* The writer is leaving. */
-     rwl->status = 0;
-   } else {
-     /* One reader is leaving. */
-     my_assert(rwl->status > 0);
-     rwl->status--;
-   }
-
-   my_assert(rwl->status >= 0);
-
-   if (rwl->prefer_w) {
-
-      /* Favour waiting writers, if any. */
-      if (rwl->nwait_w > 0) {
-         /* Writer(s) are waiting. */
-         if (rwl->status == 0) {
-            /* We can let a writer in. */
-            res = pthread_cond_signal(&rwl->cv_w);
-            my_assert(res == 0);
-         } else {
-            /* There are still readers active.  Do nothing; eventually
-               they will disappear, at which point a writer will be
-               admitted. */
-         }
-      } 
-      else
-      /* No waiting writers. */
-      if (rwl->nwait_r > 0) {
-         /* Let in a waiting reader. */
-         res = pthread_cond_signal(&rwl->cv_r);
-         my_assert(res == 0);
-      }
-
-   } else {
-
-      /* Favour waiting readers, if any. */
-      if (rwl->nwait_r > 0) {
-         /* Reader(s) are waiting; let one in. */
-         res = pthread_cond_signal(&rwl->cv_r);
-         my_assert(res == 0);
-      } 
-      else
-      /* No waiting readers. */
-      if (rwl->nwait_w > 0 && rwl->status == 0) {
-         /* We have waiting writers and no active readers; let a
-            writer in. */
-         res = pthread_cond_signal(&rwl->cv_w);
-         my_assert(res == 0);
-      }
-   }
-
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return 0;   
-}
-
-
-/* Mark the shadow of the rwlock at orig as no longer in use.
-   Returns 0 on success, EBUSY if the lock is held or has waiters,
-   or EINVAL if the shadow is not marked initialised. */
-int pthread_rwlock_destroy ( pthread_rwlock_t *orig )
-{
-   int          res;
-   int          outcome;
-   vg_rwlock_t* rwl;
-
-   if (0) printf ("pthread_rwlock_destroy\n");
-   rwl = rw_remap ( orig );
-
-   res = __pthread_mutex_lock(&rwl->mx);
-   my_assert(res == 0);
-
-   if (!rwl->initted) {
-      outcome = EINVAL;
-   } else if (rwl->status != 0 || rwl->nwait_r > 0 || rwl->nwait_w > 0) {
-      /* Still held, or somebody is queued up for it. */
-      outcome = EBUSY;
-   } else {
-      rwl->initted = 0;
-      outcome = 0;
-   }
-
-   /* Single exit: every path releases the mutex exactly once. */
-   res = __pthread_mutex_unlock(&rwl->mx);
-   my_assert(res == 0);
-   return outcome;
-}
-
-
-/* Reset an rwlock attribute object to its defaults: lock kind 0 and
-   process-private sharing.  Always returns 0.
-   (Copied directly from LinuxThreads.) */
-int
-pthread_rwlockattr_init (pthread_rwlockattr_t *attr)
-{
-  attr->__pshared = PTHREAD_PROCESS_PRIVATE;
-  attr->__lockkind = 0;
-  return 0;
-}
-
-/* Record the process-sharing mode in an rwlock attribute object.
-   Only PTHREAD_PROCESS_PRIVATE is accepted (returns 0);
-   PTHREAD_PROCESS_SHARED yields ENOSYS and any other value EINVAL.
-   (Copied directly from LinuxThreads.) */
-int
-pthread_rwlockattr_setpshared (pthread_rwlockattr_t *attr, int pshared)
-{
-  if (pshared == PTHREAD_PROCESS_PRIVATE)
-    {
-      attr->__pshared = pshared;
-      return 0;
-    }
-
-  /* For now it is not possible to share the lock between processes.  */
-  if (pshared == PTHREAD_PROCESS_SHARED)
-    return ENOSYS;
-
-  /* Neither of the two recognised values.  */
-  return EINVAL;
-}
-
-
-/* ---------------------------------------------------------------------
-   B'stard.
-   ------------------------------------------------------------------ */
-
-/* strong_alias(name, aliasname): declare aliasname, with the same
-   type as name, as an alias for name's definition.  The expansion
-   already ends in ';', so uses need no trailing semicolon. */
-# define strong_alias(name, aliasname) \
-  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
-
-/* weak_alias(name, aliasname): as strong_alias, but the alias is
-   additionally given the 'weak' attribute. */
-# define weak_alias(name, aliasname) \
-  extern __typeof (name) aliasname __attribute__ ((weak, alias (#name)));
-
-strong_alias(__pthread_mutex_lock, pthread_mutex_lock)
-strong_alias(__pthread_mutex_trylock, pthread_mutex_trylock)
-strong_alias(__pthread_mutex_unlock, pthread_mutex_unlock)
-strong_alias(__pthread_mutexattr_init, pthread_mutexattr_init)
-  weak_alias(__pthread_mutexattr_settype, pthread_mutexattr_settype)
-strong_alias(__pthread_mutex_init, pthread_mutex_init)
-strong_alias(__pthread_mutexattr_destroy, pthread_mutexattr_destroy)
-strong_alias(__pthread_mutex_destroy, pthread_mutex_destroy)
-strong_alias(__pthread_once, pthread_once)
-strong_alias(__pthread_atfork, pthread_atfork)
-strong_alias(__pthread_key_create, pthread_key_create)
-strong_alias(__pthread_getspecific, pthread_getspecific)
-strong_alias(__pthread_setspecific, pthread_setspecific)
-
-#ifndef GLIBC_2_1
-strong_alias(sigaction, __sigaction)
-#endif
-     
-strong_alias(close, __close)
-strong_alias(fcntl, __fcntl)
-strong_alias(lseek, __lseek)
-strong_alias(open, __open)
-strong_alias(open64, __open64)
-strong_alias(read, __read)
-strong_alias(wait, __wait)
-strong_alias(write, __write)
-strong_alias(connect, __connect)
-strong_alias(send, __send)
-
-weak_alias (__pread64, pread64)
-weak_alias (__pwrite64, pwrite64)
-weak_alias(__fork, fork)
-
-weak_alias (__pthread_kill_other_threads_np, pthread_kill_other_threads_np)
-
-/*--------------------------------------------------*/
-
-weak_alias(pthread_rwlock_rdlock, __pthread_rwlock_rdlock)
-weak_alias(pthread_rwlock_unlock, __pthread_rwlock_unlock)
-weak_alias(pthread_rwlock_wrlock, __pthread_rwlock_wrlock)
-
-weak_alias(pthread_rwlock_destroy, __pthread_rwlock_destroy)
-weak_alias(pthread_rwlock_init, __pthread_rwlock_init)
-weak_alias(pthread_rwlock_tryrdlock, __pthread_rwlock_tryrdlock)
-weak_alias(pthread_rwlock_trywrlock, __pthread_rwlock_trywrlock)
-
-
-/* I've no idea what these are, but they get called quite a lot.
-   Anybody know? */
-
-/* Lock the mutex that file->_lock points to.  Also exported under
-   the weak name flockfile.  The #undef removes any macro of the
-   same name so that a real function can be defined. */
-#undef _IO_flockfile
-void _IO_flockfile ( _IO_FILE * file )
-{
-   pthread_mutex_lock(file->_lock);
-}
-weak_alias(_IO_flockfile, flockfile);
-
-
-/* Unlock the mutex that file->_lock points to.  Also exported under
-   the weak name funlockfile.  The #undef removes any macro of the
-   same name so that a real function can be defined. */
-#undef _IO_funlockfile
-void _IO_funlockfile ( _IO_FILE * file )
-{
-   pthread_mutex_unlock(file->_lock);
-}
-weak_alias(_IO_funlockfile, funlockfile);
-
-
-/* This doesn't seem to be needed to simulate libpthread.so's external
-   interface, but many people complain about its absence. */
-
-strong_alias(__pthread_mutexattr_settype, __pthread_mutexattr_setkind_np)
-weak_alias(__pthread_mutexattr_setkind_np, pthread_mutexattr_setkind_np)
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                          vg_libpthread.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_libpthread.vs b/coregrind/vg_libpthread.vs
deleted file mode 100644
index 69efdcc363..0000000000
--- a/coregrind/vg_libpthread.vs
+++ /dev/null
@@ -1,19 +0,0 @@
-
-GLIBC_2.0 {
-};
-
-GLIBC_2.1 {
-} GLIBC_2.0;
-
-GLIBC_2.2 {
-} GLIBC_2.1;
-
-GLIBC_2.2.3 {
-   __pthread_clock_gettime;
-   __pthread_clock_settime;
-} GLIBC_2.2;
-
-GLIBC_PRIVATE {
-   __pthread_clock_gettime;
-   __pthread_clock_settime;
-};
diff --git a/coregrind/vg_libpthread_unimp.c b/coregrind/vg_libpthread_unimp.c
deleted file mode 100644
index f413887f27..0000000000
--- a/coregrind/vg_libpthread_unimp.c
+++ /dev/null
@@ -1,262 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Give dummy bindings for everything the real libpthread.so    ---*/
-/*--- binds.                                 vg_libpthread_unimp.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-/* ---------------------------------------------------------------------
-   ALL THIS CODE RUNS ON THE SIMULATED CPU.
-   Give a binding for everything the real libpthread.so binds.
-   ------------------------------------------------------------------ */
-
-/* Every stub in this file hands its own name to vgPlain_unimp(),
-   which is defined elsewhere; unimp() is shorthand for that call. */
-extern void vgPlain_unimp ( char* );
-#define unimp(str) vgPlain_unimp(str)
-
-//void _IO_flockfile ( void )  { unimp("_IO_flockfile"); }
-void _IO_ftrylockfile ( void )  { unimp("_IO_ftrylockfile"); }
-//void _IO_funlockfile ( void )  { unimp("_IO_funlockfile"); }
-//void __close ( void )  { unimp("__close"); }
-//void __connect ( void )  { unimp("__connect"); }
-//void __errno_location ( void )  { unimp("__errno_location"); }
-//void __fcntl ( void )  { unimp("__fcntl"); }
-//void __fork ( void )  { unimp("__fork"); }
-//void __h_errno_location ( void )  { unimp("__h_errno_location"); }
-void __libc_allocate_rtsig ( void )  { unimp("__libc_allocate_rtsig"); }
-void __libc_current_sigrtmax ( void )  { unimp("__libc_current_sigrtmax"); }
-void __libc_current_sigrtmin ( void )  { unimp("__libc_current_sigrtmin"); }
-//void __lseek ( void )  { unimp("__lseek"); }
-//void __open ( void )  { unimp("__open"); }
-//void __open64 ( void )  { unimp("__open64"); }
-//void __pread64 ( void )  { unimp("__pread64"); }
-//void __pthread_atfork ( void )  { unimp("__pthread_atfork"); }
-//void __pthread_getspecific ( void )  { unimp("__pthread_getspecific"); }
-//void __pthread_key_create ( void )  { unimp("__pthread_key_create"); }
-//void __pthread_kill_other_threads_np ( void )  { unimp("__pthread_kill_other_threads_np"); }
-//void __pthread_mutex_destroy ( void )  { unimp("__pthread_mutex_destroy"); }
-//void __pthread_mutex_init ( void )  { unimp("__pthread_mutex_init"); }
-//void __pthread_mutex_lock ( void )  { unimp("__pthread_mutex_lock"); }
-//void __pthread_mutex_trylock ( void )  { unimp("__pthread_mutex_trylock"); }
-//void __pthread_mutex_unlock ( void )  { unimp("__pthread_mutex_unlock"); }
-//void __pthread_mutexattr_destroy ( void )  { unimp("__pthread_mutexattr_destroy"); }
-//void __pthread_mutexattr_init ( void )  { unimp("__pthread_mutexattr_init"); }
-//void __pthread_mutexattr_settype ( void )  { unimp("__pthread_mutexattr_settype"); }
-//void __pthread_once ( void )  { unimp("__pthread_once"); }
-//void __pthread_setspecific ( void )  { unimp("__pthread_setspecific"); }
-//void __pwrite64 ( void )  { unimp("__pwrite64"); }
-//void __read ( void )  { unimp("__read"); }
-//void __res_state ( void )  { unimp("__res_state"); }
-//void __send ( void )  { unimp("__send"); }
-//void __sigaction ( void )  { unimp("__sigaction"); }
-//--//void __vfork ( void )  { unimp("__vfork"); }
-//void __wait ( void )  { unimp("__wait"); }
-//void __write ( void )  { unimp("__write"); }
-//void _pthread_cleanup_pop ( void )  { unimp("_pthread_cleanup_pop"); }
-//void _pthread_cleanup_pop_restore ( void )  { unimp("_pthread_cleanup_pop_restore"); }
-//void _pthread_cleanup_push ( void )  { unimp("_pthread_cleanup_push"); }
-//void _pthread_cleanup_push_defer ( void )  { unimp("_pthread_cleanup_push_defer"); }
-//void longjmp ( void )  { unimp("longjmp"); }
-//void pthread_atfork ( void )  { unimp("pthread_atfork"); }
-//void pthread_attr_destroy ( void )  { unimp("pthread_attr_destroy"); }
-void pthread_attr_getdetachstate ( void )  { unimp("pthread_attr_getdetachstate"); }
-void pthread_attr_getinheritsched ( void )  { unimp("pthread_attr_getinheritsched"); }
-//void pthread_attr_getschedparam ( void )  { unimp("pthread_attr_getschedparam"); }
-//void pthread_attr_getschedpolicy ( void )  { unimp("pthread_attr_getschedpolicy"); }
-//void pthread_attr_getscope ( void )  { unimp("pthread_attr_getscope"); }
-
-//void pthread_attr_setdetachstate ( void )  { unimp("pthread_attr_setdetachstate"); }
-//void pthread_attr_setinheritsched ( void )  { unimp("pthread_attr_setinheritsched"); }
-//void pthread_attr_setschedparam ( void )  { unimp("pthread_attr_setschedparam"); }
-//void pthread_attr_setschedpolicy ( void )  { unimp("pthread_attr_setschedpolicy"); }
-//void pthread_attr_setscope ( void )  { unimp("pthread_attr_setscope"); }
-void pthread_barrier_destroy ( void )  { unimp("pthread_barrier_destroy"); }
-void pthread_barrier_init ( void )  { unimp("pthread_barrier_init"); }
-void pthread_barrier_wait ( void )  { unimp("pthread_barrier_wait"); }
-void pthread_barrierattr_destroy ( void )  { unimp("pthread_barrierattr_destroy"); }
-void pthread_barrierattr_init ( void )  { unimp("pthread_barrierattr_init"); }
-void pthread_barrierattr_setpshared ( void )  { unimp("pthread_barrierattr_setpshared"); }
-//void pthread_cancel ( void )  { unimp("pthread_cancel"); }
-//void pthread_cond_broadcast ( void )  { unimp("pthread_cond_broadcast"); }
-//void pthread_cond_destroy ( void )  { unimp("pthread_cond_destroy"); }
-//void pthread_cond_init ( void )  { unimp("pthread_cond_init"); }
-//void pthread_cond_signal ( void )  { unimp("pthread_cond_signal"); }
-//void pthread_cond_timedwait ( void )  { unimp("pthread_cond_timedwait"); }
-//void pthread_cond_wait ( void )  { unimp("pthread_cond_wait"); }
-//void pthread_condattr_destroy ( void )  { unimp("pthread_condattr_destroy"); }
-void pthread_condattr_getpshared ( void )  { unimp("pthread_condattr_getpshared"); }
-//void pthread_condattr_init ( void )  { unimp("pthread_condattr_init"); }
-void pthread_condattr_setpshared ( void )  { unimp("pthread_condattr_setpshared"); }
-//void pthread_detach ( void )  { unimp("pthread_detach"); }
-//void pthread_equal ( void )  { unimp("pthread_equal"); }
-//void pthread_exit ( void )  { unimp("pthread_exit"); }
-//void pthread_getattr_np ( void )  { unimp("pthread_getattr_np"); }
-void pthread_getcpuclockid ( void )  { unimp("pthread_getcpuclockid"); }
-//void pthread_getschedparam ( void )  { unimp("pthread_getschedparam"); }
-//void pthread_getspecific ( void )  { unimp("pthread_getspecific"); }
-//void pthread_join ( void )  { unimp("pthread_join"); }
-//void pthread_key_create ( void )  { unimp("pthread_key_create"); }
-//void pthread_key_delete ( void )  { unimp("pthread_key_delete"); }
-//void pthread_kill ( void )  { unimp("pthread_kill"); }
-//void pthread_mutex_destroy ( void )  { unimp("pthread_mutex_destroy"); }
-//void pthread_mutex_init ( void )  { unimp("pthread_mutex_init"); }
-//void pthread_mutex_lock ( void )  { unimp("pthread_mutex_lock"); }
-void pthread_mutex_timedlock ( void )  { unimp("pthread_mutex_timedlock"); }
-//void pthread_mutex_trylock ( void )  { unimp("pthread_mutex_trylock"); }
-//void pthread_mutex_unlock ( void )  { unimp("pthread_mutex_unlock"); }
-//void pthread_mutexattr_destroy ( void )  { unimp("pthread_mutexattr_destroy"); }
-//void pthread_mutexattr_init ( void )  { unimp("pthread_mutexattr_init"); }
-//void pthread_once ( void )  { unimp("pthread_once"); }
-//void pthread_rwlock_destroy ( void )  { unimp("pthread_rwlock_destroy"); }
-//void pthread_rwlock_init ( void )  { unimp("pthread_rwlock_init"); }
-//void pthread_rwlock_rdlock ( void )  { unimp("pthread_rwlock_rdlock"); }
-void pthread_rwlock_timedrdlock ( void )  { unimp("pthread_rwlock_timedrdlock"); }
-void pthread_rwlock_timedwrlock ( void )  { unimp("pthread_rwlock_timedwrlock"); }
-//void pthread_rwlock_tryrdlock ( void )  { unimp("pthread_rwlock_tryrdlock"); }
-//void pthread_rwlock_trywrlock ( void )  { unimp("pthread_rwlock_trywrlock"); }
-//void pthread_rwlock_unlock ( void )  { unimp("pthread_rwlock_unlock"); }
-//void pthread_rwlock_wrlock ( void )  { unimp("pthread_rwlock_wrlock"); }
-void pthread_rwlockattr_destroy ( void )  { unimp("pthread_rwlockattr_destroy"); }
-void pthread_rwlockattr_getkind_np ( void )  { unimp("pthread_rwlockattr_getkind_np"); }
-void pthread_rwlockattr_getpshared ( void )  { unimp("pthread_rwlockattr_getpshared"); }
-//void pthread_rwlockattr_init ( void )  { unimp("pthread_rwlockattr_init"); }
-void pthread_rwlockattr_setkind_np ( void )  { unimp("pthread_rwlockattr_setkind_np"); }
-//void pthread_rwlockattr_setpshared ( void )  { unimp("pthread_rwlockattr_setpshared"); }
-//void pthread_self ( void )  { unimp("pthread_self"); }
-//void pthread_setcancelstate ( void )  { unimp("pthread_setcancelstate"); }
-//void pthread_setcanceltype ( void )  { unimp("pthread_setcanceltype"); }
-//void pthread_setschedparam ( void )  { unimp("pthread_setschedparam"); }
-//void pthread_setspecific ( void )  { unimp("pthread_setspecific"); }
-//void pthread_sigmask ( void )  { unimp("pthread_sigmask"); }
-//void pthread_testcancel ( void )  { unimp("pthread_testcancel"); }
-//void raise ( void )  { unimp("raise"); }
-void sem_close ( void )  { unimp("sem_close"); }
-void sem_open ( void )  { unimp("sem_open"); }
-void sem_timedwait ( void )  { unimp("sem_timedwait"); }
-void sem_unlink ( void )  { unimp("sem_unlink"); }
-//void sigaction ( void )  { unimp("sigaction"); }
-//void siglongjmp ( void )  { unimp("siglongjmp"); }
-//void sigwait ( void )  { unimp("sigwait"); }
-
-void __pthread_clock_gettime ( void ) { unimp("__pthread_clock_gettime"); }
-void __pthread_clock_settime ( void ) { unimp("__pthread_clock_settime"); }
-
-#if 0
-void pthread_create@@GLIBC_2.1 ( void )  { unimp("pthread_create@@GLIBC_2.1"); }
-void pthread_create@GLIBC_2.0 ( void )  { unimp("pthread_create@GLIBC_2.0"); }
-
-void sem_wait@@GLIBC_2.1 ( void )  { unimp("sem_wait@@GLIBC_2.1"); }
-void sem_wait@GLIBC_2.0 ( void )  { unimp("sem_wait@GLIBC_2.0"); }
-
-void sem_trywait@@GLIBC_2.1 ( void )  { unimp("sem_trywait@@GLIBC_2.1"); }
-void sem_trywait@GLIBC_2.0 ( void )  { unimp("sem_trywait@GLIBC_2.0"); }
-
-void sem_post@@GLIBC_2.1 ( void )  { unimp("sem_post@@GLIBC_2.1"); }
-void sem_post@GLIBC_2.0 ( void )  { unimp("sem_post@GLIBC_2.0"); }
-
-void sem_destroy@@GLIBC_2.1 ( void )  { unimp("sem_destroy@@GLIBC_2.1"); }
-void sem_destroy@GLIBC_2.0 ( void )  { unimp("sem_destroy@GLIBC_2.0"); }
-void sem_getvalue@@GLIBC_2.1 ( void )  { unimp("sem_getvalue@@GLIBC_2.1"); }
-void sem_getvalue@GLIBC_2.0 ( void )  { unimp("sem_getvalue@GLIBC_2.0"); }
-void sem_init@@GLIBC_2.1 ( void )  { unimp("sem_init@@GLIBC_2.1"); }
-void sem_init@GLIBC_2.0 ( void )  { unimp("sem_init@GLIBC_2.0"); }
-
-void pthread_attr_init@@GLIBC_2.1 ( void )  { unimp("pthread_attr_init@@GLIBC_2.1"); }
-void pthread_attr_init@GLIBC_2.0 ( void )  { unimp("pthread_attr_init@GLIBC_2.0"); }
-#endif
-
-
-
-/* strong_alias(name, aliasname): declare aliasname, with the same
-   type as name, as an alias for name's definition.  The expansion
-   already ends in ';'. */
-# define strong_alias(name, aliasname) \
-  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
-
-/* weak_alias(name, aliasname): as strong_alias, but the alias is
-   additionally given the 'weak' attribute. */
-# define weak_alias(name, aliasname) \
-  extern __typeof (name) aliasname __attribute__ ((weak, alias (#name)));
-
-//weak_alias(pthread_rwlock_destroy, __pthread_rwlock_destroy)
-//weak_alias(pthread_rwlock_init, __pthread_rwlock_init)
-//weak_alias(pthread_rwlock_tryrdlock, __pthread_rwlock_tryrdlock)
-//weak_alias(pthread_rwlock_trywrlock, __pthread_rwlock_trywrlock)
-//weak_alias(pthread_rwlock_wrlock, __pthread_rwlock_wrlock)
-weak_alias(_IO_ftrylockfile, ftrylockfile)
-
-/* Weak placeholder definitions.  VG_WEAK_UNIMP(fn) expands to a weak,
-   argument-less function named fn whose body passes the string "fn"
-   to vgPlain_unimp().  Entries that were commented out stay commented
-   out, in their original position in the list. */
-#define VG_WEAK_UNIMP(fn) \
-   __attribute__((weak)) void fn ( void ) { vgPlain_unimp(#fn); }
-
-//VG_WEAK_UNIMP(pread)
-//VG_WEAK_UNIMP(pwrite)
-//VG_WEAK_UNIMP(msync)
-//VG_WEAK_UNIMP(pause)
-//VG_WEAK_UNIMP(recvfrom)
-//VG_WEAK_UNIMP(recvmsg)
-//VG_WEAK_UNIMP(sendmsg)
-VG_WEAK_UNIMP(tcdrain)
-//--//VG_WEAK_UNIMP(vfork)
-
-VG_WEAK_UNIMP(pthread_attr_getguardsize)
-VG_WEAK_UNIMP(pthread_attr_getstack)
-VG_WEAK_UNIMP(pthread_attr_getstackaddr)
-VG_WEAK_UNIMP(pthread_attr_getstacksize)
-VG_WEAK_UNIMP(pthread_attr_setguardsize)
-VG_WEAK_UNIMP(pthread_attr_setstack)
-VG_WEAK_UNIMP(pthread_attr_setstackaddr)
-//VG_WEAK_UNIMP(pthread_attr_setstacksize)
-VG_WEAK_UNIMP(pthread_getconcurrency)
-//VG_WEAK_UNIMP(pthread_kill_other_threads_np)
-VG_WEAK_UNIMP(pthread_mutexattr_getkind_np)
-VG_WEAK_UNIMP(pthread_mutexattr_getpshared)
-VG_WEAK_UNIMP(pthread_mutexattr_gettype)
-VG_WEAK_UNIMP(pthread_mutexattr_setkind_np)
-VG_WEAK_UNIMP(pthread_mutexattr_setpshared)
-VG_WEAK_UNIMP(pthread_setconcurrency)
-VG_WEAK_UNIMP(pthread_spin_destroy)
-VG_WEAK_UNIMP(pthread_spin_init)
-VG_WEAK_UNIMP(pthread_spin_lock)
-VG_WEAK_UNIMP(pthread_spin_trylock)
-VG_WEAK_UNIMP(pthread_spin_unlock)
-
-#undef VG_WEAK_UNIMP
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                    vg_libpthread_unimp.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
deleted file mode 100644
index 5cce13d2e8..0000000000
--- a/coregrind/vg_main.c
+++ /dev/null
@@ -1,1411 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- C startup stuff, reached from vg_startup.S.                  ---*/
-/*---                                                    vg_main.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-#include "vg_constants.h"
-
-
-/* ---------------------------------------------------------------------
-   Compute offsets into baseBlock.  See comments in vg_include.h.
-   ------------------------------------------------------------------ */
-
-/* The variables storing offsets. */
-
-/* Each VGOFF_(x) below holds the offset, in words, of slot x within
-   VG_(baseBlock).  All of them start out as INVALID_OFFSET; the real
-   values are assigned by vg_init_baseBlock() later in this file. */
-
-/* Marker for "offset not assigned yet". */
-#define INVALID_OFFSET (-1)
-
-/* Slots that are allocated without an initial value. */
-Int VGOFF_(m_eax) = INVALID_OFFSET;
-Int VGOFF_(m_ecx) = INVALID_OFFSET;
-Int VGOFF_(m_edx) = INVALID_OFFSET;
-Int VGOFF_(m_ebx) = INVALID_OFFSET;
-Int VGOFF_(m_esp) = INVALID_OFFSET;
-Int VGOFF_(m_ebp) = INVALID_OFFSET;
-Int VGOFF_(m_esi) = INVALID_OFFSET;
-Int VGOFF_(m_edi) = INVALID_OFFSET;
-Int VGOFF_(m_eflags) = INVALID_OFFSET;
-Int VGOFF_(m_fpustate) = INVALID_OFFSET;
-Int VGOFF_(m_eip) = INVALID_OFFSET;
-Int VGOFF_(spillslots) = INVALID_OFFSET;
-Int VGOFF_(sh_eax) = INVALID_OFFSET;
-Int VGOFF_(sh_ecx) = INVALID_OFFSET;
-Int VGOFF_(sh_edx) = INVALID_OFFSET;
-Int VGOFF_(sh_ebx) = INVALID_OFFSET;
-Int VGOFF_(sh_esp) = INVALID_OFFSET;
-Int VGOFF_(sh_ebp) = INVALID_OFFSET;
-Int VGOFF_(sh_esi) = INVALID_OFFSET;
-Int VGOFF_(sh_edi) = INVALID_OFFSET;
-Int VGOFF_(sh_eflags) = INVALID_OFFSET;
-/* Slots that vg_init_baseBlock() fills with the address of the
-   function of the same name. */
-Int VGOFF_(helper_idiv_64_32) = INVALID_OFFSET;
-Int VGOFF_(helper_div_64_32) = INVALID_OFFSET;
-Int VGOFF_(helper_idiv_32_16) = INVALID_OFFSET;
-Int VGOFF_(helper_div_32_16) = INVALID_OFFSET;
-Int VGOFF_(helper_idiv_16_8) = INVALID_OFFSET;
-Int VGOFF_(helper_div_16_8) = INVALID_OFFSET;
-Int VGOFF_(helper_imul_32_64) = INVALID_OFFSET;
-Int VGOFF_(helper_mul_32_64) = INVALID_OFFSET;
-Int VGOFF_(helper_imul_16_32) = INVALID_OFFSET;
-Int VGOFF_(helper_mul_16_32) = INVALID_OFFSET;
-Int VGOFF_(helper_imul_8_16) = INVALID_OFFSET;
-Int VGOFF_(helper_mul_8_16) = INVALID_OFFSET;
-Int VGOFF_(helper_CLD) = INVALID_OFFSET;
-Int VGOFF_(helper_STD) = INVALID_OFFSET;
-Int VGOFF_(helper_get_dirflag) = INVALID_OFFSET;
-Int VGOFF_(helper_CLC) = INVALID_OFFSET;
-Int VGOFF_(helper_STC) = INVALID_OFFSET;
-Int VGOFF_(helper_shldl) = INVALID_OFFSET;
-Int VGOFF_(helper_shldw) = INVALID_OFFSET;
-Int VGOFF_(helper_shrdl) = INVALID_OFFSET;
-Int VGOFF_(helper_shrdw) = INVALID_OFFSET;
-Int VGOFF_(helper_RDTSC) = INVALID_OFFSET;
-Int VGOFF_(helper_CPUID) = INVALID_OFFSET;
-/* NOTE(review): helper_BSWAP is not assigned in vg_init_baseBlock(),
-   so it stays at INVALID_OFFSET. */
-Int VGOFF_(helper_BSWAP) = INVALID_OFFSET;
-Int VGOFF_(helper_bsf) = INVALID_OFFSET;
-Int VGOFF_(helper_bsr) = INVALID_OFFSET;
-Int VGOFF_(helper_fstsw_AX) = INVALID_OFFSET;
-Int VGOFF_(helper_SAHF) = INVALID_OFFSET;
-Int VGOFF_(helper_DAS) = INVALID_OFFSET;
-Int VGOFF_(helper_DAA) = INVALID_OFFSET;
-Int VGOFF_(helper_value_check4_fail) = INVALID_OFFSET;
-Int VGOFF_(helper_value_check2_fail) = INVALID_OFFSET;
-Int VGOFF_(helper_value_check1_fail) = INVALID_OFFSET;
-Int VGOFF_(helper_value_check0_fail) = INVALID_OFFSET;
-Int VGOFF_(helperc_LOADV4) = INVALID_OFFSET;
-Int VGOFF_(helperc_LOADV2) = INVALID_OFFSET;
-Int VGOFF_(helperc_LOADV1) = INVALID_OFFSET;
-Int VGOFF_(helperc_STOREV4) = INVALID_OFFSET;
-Int VGOFF_(helperc_STOREV2) = INVALID_OFFSET;
-Int VGOFF_(helperc_STOREV1) = INVALID_OFFSET;
-Int VGOFF_(handle_esp_assignment) = INVALID_OFFSET;
-Int VGOFF_(fpu_write_check) = INVALID_OFFSET;
-Int VGOFF_(fpu_read_check) = INVALID_OFFSET;
-Int VGOFF_(cachesim_log_non_mem_instr) = INVALID_OFFSET;
-Int VGOFF_(cachesim_log_mem_instr)     = INVALID_OFFSET;
-
-/* This is the actual defn of baseblock: VG_BASEBLOCK_WORDS elements of
-   UInt, handed out word by word by alloc_BaB() below. */
-UInt VG_(baseBlock)[VG_BASEBLOCK_WORDS];
-
-/* Index, in words, of the next unallocated word of VG_(baseBlock).
-   Advanced by alloc_BaB() and reset to 0 by vg_init_baseBlock(). */
-static Int baB_off = 0;
-
-/* Claim `words' consecutive words of VG_(baseBlock).  The allocation
-   cursor is advanced first; if it then reaches or passes
-   VG_BASEBLOCK_WORDS, VG_(panic) is called.  The result is the word
-   offset at which the claimed range starts. */
-static Int alloc_BaB ( Int words )
-{
-   const Int first_word = baB_off;
-
-   baB_off = first_word + words;
-   if (!(baB_off < VG_BASEBLOCK_WORDS)) {
-      VG_(panic)( "alloc_BaB: baseBlock is too small");
-   }
-
-   return first_word;
-}
-
-/* Claim a single word of VG_(baseBlock) via alloc_BaB(1), store `a'
-   (converted to UInt) in it, and hand back that word's offset. */
-static Int alloc_BaB_1_set ( Addr a )
-{
-   const Int slot = alloc_BaB(1);
-
-   VG_(baseBlock)[slot] = (UInt)a;
-
-   return slot;
-}
-
-
-/* Here we assign actual offsets.  It's important to get the most
-   popular referents within 128 bytes of the start, so we can take
-   advantage of short addressing modes relative to %ebp.  Popularity
-   of offsets was measured on 22 Feb 02 running a KDE application, and
-   the slots rearranged accordingly, with a 1.5% reduction in total
-   size of translations. */
-
-/* Lays out VG_(baseBlock): resets the allocation cursor, then claims
-   slots strictly in the order written below, recording each slot's
-   word offset in the matching VGOFF_ variable.  Slots claimed with
-   alloc_BaB_1_set() are also filled with the address of the named
-   function.  The order of the statements IS the layout; do not
-   reorder them.
-
-   NOTE(review): VGOFF_(helper_BSWAP) is declared earlier in this file
-   but is not assigned here, so it stays at INVALID_OFFSET. */
-static void vg_init_baseBlock ( void )
-{
-   baB_off = 0;
-
-   /* Those with offsets under 128 are carefully chosen. */
-
-   /* WORD offsets in this column */
-   /* 0   */ VGOFF_(m_eax)     = alloc_BaB(1);
-   /* 1   */ VGOFF_(m_ecx)     = alloc_BaB(1);
-   /* 2   */ VGOFF_(m_edx)     = alloc_BaB(1);
-   /* 3   */ VGOFF_(m_ebx)     = alloc_BaB(1);
-   /* 4   */ VGOFF_(m_esp)     = alloc_BaB(1);
-   /* 5   */ VGOFF_(m_ebp)     = alloc_BaB(1);
-   /* 6   */ VGOFF_(m_esi)     = alloc_BaB(1);
-   /* 7   */ VGOFF_(m_edi)     = alloc_BaB(1);
-   /* 8   */ VGOFF_(m_eflags)  = alloc_BaB(1);
-
-   /* 9   */ VGOFF_(sh_eax)    = alloc_BaB(1);
-   /* 10  */ VGOFF_(sh_ecx)    = alloc_BaB(1);
-   /* 11  */ VGOFF_(sh_edx)    = alloc_BaB(1);
-   /* 12  */ VGOFF_(sh_ebx)    = alloc_BaB(1);
-   /* 13  */ VGOFF_(sh_esp)    = alloc_BaB(1);
-   /* 14  */ VGOFF_(sh_ebp)    = alloc_BaB(1);
-   /* 15  */ VGOFF_(sh_esi)    = alloc_BaB(1);
-   /* 16  */ VGOFF_(sh_edi)    = alloc_BaB(1);
-   /* 17  */ VGOFF_(sh_eflags) = alloc_BaB(1);
-
-   /* NOTE: 17a and 17b each take a full word (alloc_BaB_1_set claims
-      one word), so every slot after them sits two words higher than
-      the number in its comment: "18" is really word 20, and so on. */
-   /* 17a */ 
-   VGOFF_(cachesim_log_non_mem_instr)  
-      = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_non_mem_instr) );
-   /* 17b */ 
-   VGOFF_(cachesim_log_mem_instr)  
-      = alloc_BaB_1_set( (Addr) & VG_(cachesim_log_mem_instr) );
-
-   /* 18  */ 
-   VGOFF_(helper_value_check4_fail) 
-      = alloc_BaB_1_set( (Addr) & VG_(helper_value_check4_fail) );
-   /* 19 */
-   VGOFF_(helper_value_check0_fail)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_value_check0_fail) );
-
-   /* 20  */
-   VGOFF_(helperc_STOREV4)
-      = alloc_BaB_1_set( (Addr) & VG_(helperc_STOREV4) );
-   /* 21  */
-   VGOFF_(helperc_STOREV1)
-      = alloc_BaB_1_set( (Addr) & VG_(helperc_STOREV1) );
-
-   /* 22  */
-   VGOFF_(helperc_LOADV4)
-      = alloc_BaB_1_set( (Addr) & VG_(helperc_LOADV4) );
-   /* 23  */
-   VGOFF_(helperc_LOADV1)
-      = alloc_BaB_1_set( (Addr) & VG_(helperc_LOADV1) );
-
-   /* 24  */
-   VGOFF_(handle_esp_assignment)
-      = alloc_BaB_1_set( (Addr) & VGM_(handle_esp_assignment) );
-
-   /* 25 */
-   VGOFF_(m_eip) = alloc_BaB(1);
-
-   /* There are currently 24 spill slots */
-   /* 26 .. 49  This overlaps the magic boundary at >= 32 words, but
-      most spills are to low numbered spill slots, so the ones above
-      the boundary don't see much action. */
-   /* The number of words claimed here is VG_MAX_SPILLSLOTS, whatever
-      its current value is. */
-   VGOFF_(spillslots) = alloc_BaB(VG_MAX_SPILLSLOTS);
-
-   /* These two pushed beyond the boundary because 2-byte transactions
-      are rare. */
-   /* 50  */
-   VGOFF_(helperc_STOREV2)
-      = alloc_BaB_1_set( (Addr) & VG_(helperc_STOREV2) );
-   /* 51  */
-   VGOFF_(helperc_LOADV2)
-      = alloc_BaB_1_set( (Addr) & VG_(helperc_LOADV2) );
-
-   /* 52  */
-   VGOFF_(fpu_write_check)
-      = alloc_BaB_1_set( (Addr) & VGM_(fpu_write_check) );
-   /* 53  */
-   VGOFF_(fpu_read_check)
-      = alloc_BaB_1_set( (Addr) & VGM_(fpu_read_check) );
-
-   /* Actually I don't think these two are ever used. */
-   /* 54  */ 
-   VGOFF_(helper_value_check2_fail)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_value_check2_fail) );
-   /* 55  */ 
-   VGOFF_(helper_value_check1_fail)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_value_check1_fail) );
-
-   /* I gave up counting at this point.  Since they're way above the
-      short-amode-boundary, there's no point. */
-
-   /* FPU state area: VG_SIZE_OF_FPUSTATE_W words, no initial value. */
-   VGOFF_(m_fpustate) = alloc_BaB(VG_SIZE_OF_FPUSTATE_W);
-
-   /* Division helpers. */
-   VGOFF_(helper_idiv_64_32)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_idiv_64_32) );
-   VGOFF_(helper_div_64_32)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_div_64_32) );
-   VGOFF_(helper_idiv_32_16)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_idiv_32_16) );
-   VGOFF_(helper_div_32_16)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_div_32_16) );
-   VGOFF_(helper_idiv_16_8)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_idiv_16_8) );
-   VGOFF_(helper_div_16_8)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_div_16_8) );
-
-   /* Multiplication helpers. */
-   VGOFF_(helper_imul_32_64)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_imul_32_64) );
-   VGOFF_(helper_mul_32_64)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_mul_32_64) );
-   VGOFF_(helper_imul_16_32)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_imul_16_32) );
-   VGOFF_(helper_mul_16_32)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_mul_16_32) );
-   VGOFF_(helper_imul_8_16)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_imul_8_16) );
-   VGOFF_(helper_mul_8_16)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_mul_8_16) );
-
-   /* Direction-flag helpers. */
-   VGOFF_(helper_CLD)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_CLD) );
-   VGOFF_(helper_STD)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_STD) );
-   VGOFF_(helper_get_dirflag)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_get_dirflag) );
-
-   /* Carry-flag helpers. */
-   VGOFF_(helper_CLC)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_CLC) );
-    VGOFF_(helper_STC)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_STC) );
-
-   /* Double-shift helpers. */
-   VGOFF_(helper_shldl)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_shldl) );
-   VGOFF_(helper_shldw)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_shldw) );
-   VGOFF_(helper_shrdl)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_shrdl) );
-   VGOFF_(helper_shrdw)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_shrdw) );
-
-   VGOFF_(helper_RDTSC)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_RDTSC) );
-   VGOFF_(helper_CPUID)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_CPUID) );
-
-   /* Bit-scan helpers. */
-   VGOFF_(helper_bsf)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_bsf) );
-   VGOFF_(helper_bsr)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_bsr) );
-
-   VGOFF_(helper_fstsw_AX)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_fstsw_AX) );
-   VGOFF_(helper_SAHF)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_SAHF) );
-   VGOFF_(helper_DAS)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_DAS) );
-   VGOFF_(helper_DAA)
-      = alloc_BaB_1_set( (Addr) & VG_(helper_DAA) );
-}
-
-
-/* ---------------------------------------------------------------------
-   Global entities which are not referenced from generated code.
-   ------------------------------------------------------------------ */
-
-/* The stack on which Valgrind runs.  We can't use the same stack as
-   the simulatee -- that's an important design decision.  Note the
-   size is given in UInt elements, not bytes. */
-UInt VG_(stack)[10000];
-
-/* Ditto our signal delivery stack (also 10000 UInt elements). */
-UInt VG_(sigstack)[10000];
-
-/* Saving stuff across system calls.  The FPU save area has the same
-   length, VG_SIZE_OF_FPUSTATE_W words, as the m_fpustate area that
-   vg_init_baseBlock() reserves in VG_(baseBlock). */
-UInt VG_(real_fpu_state_saved_over_syscall)[VG_SIZE_OF_FPUSTATE_W];
-Addr VG_(esp_saved_over_syscall);
-
-/* Counts downwards in vg_run_innerloop. */
-UInt VG_(dispatch_ctr);
-
-
-/* 64-bit counter for the number of basic blocks done. */
-ULong VG_(bbs_done);
-/* 64-bit counter for the number of bbs to go before a debug exit. */
-ULong VG_(bbs_to_go);
-
-/* Produce debugging output?  Off by default. */
-Bool VG_(disassemble) = False;
-
-/* The current LRU epoch.  Starts at 0. */
-UInt VG_(current_epoch) = 0;
-
-/* This is the ThreadId of the last thread the scheduler ran.
-   Initialised to 0. */
-ThreadId VG_(last_run_tid) = 0;
-
-
-/* ---------------------------------------------------------------------
-   Counters, for informational purposes only.
-   ------------------------------------------------------------------ */
-
-/* All counters in this section are UInt and start at zero. */
-
-/* Number of lookups which miss the fast tt helper. */
-UInt VG_(tt_fast_misses) = 0;
-
-
-/* Counts for LRU informational messages. */
-/* NOTE(review): "o/t size" presumably means original-code size and
-   translated-code size (the _osize and _tsize variables) -- confirm
-   against the code that updates them. */
-
-/* Number and total o/t size of new translations this epoch. */
-UInt VG_(this_epoch_in_count) = 0;
-UInt VG_(this_epoch_in_osize) = 0;
-UInt VG_(this_epoch_in_tsize) = 0;
-/* Number and total o/t size of discarded translations this epoch. */
-UInt VG_(this_epoch_out_count) = 0;
-UInt VG_(this_epoch_out_osize) = 0;
-UInt VG_(this_epoch_out_tsize) = 0;
-/* Number and total o/t size of translations overall. */
-UInt VG_(overall_in_count) = 0;
-UInt VG_(overall_in_osize) = 0;
-UInt VG_(overall_in_tsize) = 0;
-/* Number and total o/t size of discards overall. */
-UInt VG_(overall_out_count) = 0;
-UInt VG_(overall_out_osize) = 0;
-UInt VG_(overall_out_tsize) = 0;
-
-/* The number of LRU-clearings of TT/TC. */
-UInt VG_(number_of_lrus) = 0;
-
-
-/* Counts pertaining to the register allocator. */
-
-/* total number of uinstrs input to reg-alloc */
-UInt VG_(uinstrs_prealloc) = 0;
-
-/* total number of uinstrs added due to spill code */
-UInt VG_(uinstrs_spill) = 0;
-
-/* number of bbs requiring spill code */
-UInt VG_(translations_needing_spill) = 0;
-
-/* total of register ranks over all translations */
-UInt VG_(total_reg_rank) = 0;
-
-
-/* Counts pertaining to internal sanity checking. */
-UInt VG_(sanity_fast_count) = 0;
-UInt VG_(sanity_slow_count) = 0;
-
-/* Counts pertaining to the scheduler. */
-UInt VG_(num_scheduling_events_MINOR) = 0;
-UInt VG_(num_scheduling_events_MAJOR) = 0;
-
-
-/* ---------------------------------------------------------------------
-   Values derived from command-line options.
-   ------------------------------------------------------------------ */
-
-/* None of these has an initialiser here; the defaults are assigned at
-   the top of process_cmd_line_options() below, before VG_ARGS is
-   parsed. */
-Bool   VG_(clo_error_limit);
-Bool   VG_(clo_check_addrVs);
-Bool   VG_(clo_GDB_attach);
-/* Set by --sanity-level=; note it lacks the clo_ prefix. */
-Int    VG_(sanity_level);
-Int    VG_(clo_verbosity);
-Bool   VG_(clo_demangle);
-Bool   VG_(clo_leak_check);
-Bool   VG_(clo_show_reachable);
-Int    VG_(clo_leak_resolution);
-Bool   VG_(clo_sloppy_malloc);
-Int    VG_(clo_alignment);
-Bool   VG_(clo_partial_loads_ok);
-Bool   VG_(clo_trace_children);
-Int    VG_(clo_logfile_fd);
-Int    VG_(clo_freelist_vol);
-Bool   VG_(clo_workaround_gcc296_bugs);
-/* Number of entries of VG_(clo_suppressions) in use. */
-Int    VG_(clo_n_suppressions);
-/* Suppression file names from --suppressions=.  The entries point
-   into vg_cmdline_copy; they are not separately allocated. */
-Char*  VG_(clo_suppressions)[VG_CLO_MAX_SFILES];
-Bool   VG_(clo_single_step);
-Bool   VG_(clo_optimise);
-Bool   VG_(clo_instrument);
-Bool   VG_(clo_cleanup);
-Bool   VG_(clo_cachesim);
-/* Cache geometries from --I1=, --D1= and --L2=; filled in by
-   parse_cache_opt(), UNDEFINED_CACHE by default. */
-cache_t VG_(clo_I1_cache);
-cache_t VG_(clo_D1_cache);
-cache_t VG_(clo_L2_cache);
-Int    VG_(clo_smc_check);
-Bool   VG_(clo_trace_syscalls);
-Bool   VG_(clo_trace_signals);
-Bool   VG_(clo_trace_symtab);
-Bool   VG_(clo_trace_malloc);
-Bool   VG_(clo_trace_sched);
-Int    VG_(clo_trace_pthread_level);
-ULong  VG_(clo_stop_after);
-Int    VG_(clo_dump_error);
-Int    VG_(clo_backtrace_size);
-Char*  VG_(clo_weird_hacks);
-
-/* This Bool is needed by wrappers in vg_clientmalloc.c to decide how
-   to behave.  Initially we say False. */
-Bool VG_(running_on_simd_CPU) = False;
-
-/* Holds client's %esp at the point we gained control. */
-Addr VG_(esp_at_startup);
-
-/* As deduced from VG_(esp_at_startup), the client's argc, argv[] and
-   envp[] as extracted from the client's stack at startup-time.  They
-   are filled in by process_cmd_line_options(). */
-Int    VG_(client_argc);
-Char** VG_(client_argv);
-Char** VG_(client_envp);
-
-/* A place into which to copy the value of env var VG_ARGS, so we
-   don't have to modify the original.  process_cmd_line_options()
-   splits the copy in place by overwriting whitespace with NULs. */
-static Char vg_cmdline_copy[M_VG_CMDLINE_STRLEN];
-
-
-/* ---------------------------------------------------------------------
-   Processing of command-line options.
-   ------------------------------------------------------------------ */
-
-/* Complain about an unacceptable command-line option and terminate.
-   `opt' is the offending option text, echoed in the message.  Logging
-   is shut down and the log descriptor pointed at fd 2 (stderr) before
-   the message is printed; the function ends in VG_(exit)(1). */
-static void bad_option ( Char* opt )
-{
-   const Int fd_stderr = 2;
-
-   VG_(shutdown_logging)();
-   VG_(clo_logfile_fd) = fd_stderr;
-
-   VG_(printf)("valgrind.so: Bad option `%s'; aborting.\n", opt);
-
-   VG_(exit)(1);
-}
-
-/* Report a startup/configuration failure and terminate.  `msg' is the
-   explanation, printed indented beneath a fixed heading and followed
-   by a fixed "giving up" line.  Logging is shut down and the log
-   descriptor pointed at fd 2 (stderr) first; the function ends in
-   VG_(exit)(1). */
-static void config_error ( Char* msg )
-{
-   const Int fd_stderr = 2;
-
-   VG_(shutdown_logging)();
-   VG_(clo_logfile_fd) = fd_stderr;
-
-   /* Heading plus the caller's explanation. */
-   VG_(printf)("valgrind.so: Startup or configuration error:\n   %s\n",
-               msg);
-   /* Fixed closing line. */
-   VG_(printf)("valgrind.so: Unable to start up properly.  Giving up.\n");
-
-   VG_(exit)(1);
-}
-
-/* Report that the client's argc/argv/envp could not be located on the
-   startup stack, then give up.
-
-   msg  detail of what went wrong; printed after a fixed preamble.
-
-   Logging is shut down and the log descriptor pointed at fd 2 (stderr)
-   before printing.  Control then passes to config_error(), which
-   finishes with VG_(exit)(1). */
-static void args_grok_error ( Char* msg )
-{
-   VG_(shutdown_logging)();
-   VG_(clo_logfile_fd) = 2; /* stderr */
-   /* Both messages used to read "argc/argc/envp"; argv was meant. */
-   VG_(printf)("valgrind.so: When searching for "
-               "client's argc/argv/envp:\n\t%s\n", msg);
-   config_error("couldn't find client's argc/argv/envp");
-}
-
-/* Parse a cache-geometry option of the form
-   "--XX=<size>,<assoc>,<line_size>" (e.g. "--I1=65536,2,64") into
-   *cache.
-
-   cache     receives the three numbers (size, assoc, line_size).
-   orig_opt  the complete option string; it is not modified.
-   opt_len   length of the "--XX=" prefix, i.e. the index in orig_opt
-             at which the first number starts.
-
-   Each field must be a non-empty run of decimal digits; the first two
-   must be followed by ',' and the third must end the string.  Anything
-   else is reported through bad_option(), which ends by calling
-   VG_(exit)(1). */
-static void parse_cache_opt ( cache_t* cache, char* orig_opt, int opt_len )
-{
-   int   start[3];   /* index in opt of the first digit of each field */
-   int   i, k;
-   /* Work on a private copy so the separators can be overwritten with
-      NULs without touching the caller's string. */
-   char *opt = VG_(strdup)(VG_AR_PRIVATE, orig_opt);
-
-   i = opt_len;
-   for (k = 0; k < 3; k++) {
-      start[k] = i;
-      while (VG_(isdigit)(opt[i])) i++;
-      /* An empty field would silently parse as 0; reject it instead. */
-      if (i == start[k]) goto bad;
-      if (k < 2) {
-         /* First two fields: must be followed by a comma, which is
-            replaced by NUL so the field is a string of its own. */
-         if (',' != opt[i]) goto bad;
-         opt[i++] = '\0';
-      } else {
-         /* Last field: nothing may follow it. */
-         if ('\0' != opt[i]) goto bad;
-      }
-   }
-
-   cache->size      = (Int)VG_(atoll)(opt + start[0]);
-   cache->assoc     = (Int)VG_(atoll)(opt + start[1]);
-   cache->line_size = (Int)VG_(atoll)(opt + start[2]);
-
-   VG_(free)(VG_AR_PRIVATE, opt);
-   return;
-
-  bad:
-   /* Release the copy before reporting; the message uses orig_opt,
-      which is still intact. */
-   VG_(free)(VG_AR_PRIVATE, opt);
-   bad_option(orig_opt);
-}
-
-static void process_cmd_line_options ( void )
-{
-   UChar* argv[M_VG_CMDLINE_OPTS];
-   UInt   argc;
-   UChar* p;
-   UChar* str;
-   Int    i, eventually_logfile_fd, ctr;
-
-#  define ISSPACE(cc)      ((cc) == ' ' || (cc) == '\t' || (cc) == '\n')
-#  define STREQ(s1,s2)     (0==VG_(strcmp_ws)((s1),(s2)))
-#  define STREQN(nn,s1,s2) (0==VG_(strncmp_ws)((s1),(s2),(nn)))
-
-   /* Set defaults. */
-   VG_(clo_error_limit)      = True;
-   VG_(clo_check_addrVs)     = True;
-   VG_(clo_GDB_attach)       = False;
-   VG_(sanity_level)         = 1;
-   VG_(clo_verbosity)        = 1;
-   VG_(clo_demangle)         = True;
-   VG_(clo_leak_check)       = False;
-   VG_(clo_show_reachable)   = False;
-   VG_(clo_leak_resolution)  = 2;
-   VG_(clo_sloppy_malloc)    = False;
-   VG_(clo_alignment)        = 4;
-   VG_(clo_partial_loads_ok) = True;
-   VG_(clo_trace_children)   = False;
-   VG_(clo_logfile_fd)       = 2; /* stderr */
-   VG_(clo_freelist_vol)     = 1000000;
-   VG_(clo_workaround_gcc296_bugs) = False;
-   VG_(clo_n_suppressions)   = 0;
-   VG_(clo_single_step)      = False;
-   VG_(clo_optimise)         = True;
-   VG_(clo_instrument)       = True;
-   VG_(clo_cachesim)         = False;
-   VG_(clo_I1_cache)         = UNDEFINED_CACHE;
-   VG_(clo_D1_cache)         = UNDEFINED_CACHE;
-   VG_(clo_L2_cache)         = UNDEFINED_CACHE;
-   VG_(clo_cleanup)          = True;
-   VG_(clo_smc_check)        = /* VG_CLO_SMC_SOME */ VG_CLO_SMC_NONE;
-   VG_(clo_trace_syscalls)   = False;
-   VG_(clo_trace_signals)    = False;
-   VG_(clo_trace_symtab)     = False;
-   VG_(clo_trace_malloc)     = False;
-   VG_(clo_trace_sched)      = False;
-   VG_(clo_trace_pthread_level) = 0;
-   VG_(clo_stop_after)       = 1000000000000LL;
-   VG_(clo_dump_error)       = 0;
-   VG_(clo_backtrace_size)   = 4;
-   VG_(clo_weird_hacks)      = NULL;
-
-   eventually_logfile_fd = VG_(clo_logfile_fd);
-
-   /* Once logging is started, we can safely send messages pertaining
-      to failures in initialisation. */
-   VG_(startup_logging)();
-
-   /* Check for sane path in ./configure --prefix=... */
-   if (VG_(strlen)(VG_LIBDIR) < 1 
-       || VG_LIBDIR[0] != '/') 
-     config_error("Please use absolute paths in "
-                  "./configure --prefix=... or --libdir=...");
-
-   /* (Suggested by Fabrice Bellard ... )
-      We look for the Linux ELF table and go down until we find the
-      envc & envp. It is not fool-proof, but these structures should
-      change less often than the libc ones. */
-   {
-       UInt* sp = 0; /* bogus init to keep gcc -O happy */
-
-       /* locate the top of the stack */
-       if (VG_STACK_MATCHES_BASE( VG_(esp_at_startup), 
-                                  VG_STARTUP_STACK_BASE_1 )) {
-          sp = (UInt*)VG_STARTUP_STACK_BASE_1;
-       } else
-       if (VG_STACK_MATCHES_BASE( VG_(esp_at_startup), 
-                                  VG_STARTUP_STACK_BASE_2 )) {
-          sp = (UInt*)VG_STARTUP_STACK_BASE_2;
-       } else 
-       if (VG_STACK_MATCHES_BASE( VG_(esp_at_startup), 
-                                  VG_STARTUP_STACK_BASE_3 )) {
-          sp = (UInt*)VG_STARTUP_STACK_BASE_3;
- 
-       } else {
-          args_grok_error(
-             "startup %esp is not near any VG_STARTUP_STACK_BASE_*\n   "
-             "constants defined in vg_include.h.  You should investigate."
-          );
-       }
- 
-       /* we locate: NEW_AUX_ENT(1, AT_PAGESZ, ELF_EXEC_PAGESIZE) in
-          the elf interpreter table */
-       sp -= 2;
-       while (sp[0] != VKI_AT_PAGESZ || sp[1] != 4096) {
-           /* VG_(printf)("trying %p\n", sp); */
-           sp--;
-       }
-
-       if (sp[2] == VKI_AT_BASE 
-           && sp[0] == VKI_AT_PAGESZ
-           && sp[-2] == VKI_AT_PHNUM
-           && sp[-4] == VKI_AT_PHENT
-           && sp[-6] == VKI_AT_PHDR
-           && sp[-6-1] == 0) {
-          if (0)
-             VG_(printf)("Looks like you've got a 2.2.X kernel here.\n");
-          sp -= 6;
-       } else
-       if (sp[2] == VKI_AT_CLKTCK
-           && sp[0] == VKI_AT_PAGESZ
-           && sp[-2] == VKI_AT_HWCAP
-           && sp[-2-1] == 0) {
-          if (0)
-             VG_(printf)("Looks like you've got a 2.4.X kernel here.\n");
-          sp -= 2;
-       } else
-       if (sp[2] == VKI_AT_CLKTCK
-           && sp[0] == VKI_AT_PAGESZ
-           && sp[-2] == VKI_AT_HWCAP
-           && sp[-4] == VKI_AT_USER_AUX_SEGMENT
-           && sp[-4-1] == 0) {
-          if (0)
-             VG_(printf)("Looks like you've got a R-H Limbo 2.4.X "
-                         "kernel here.\n");
-          sp -= 4;
-       } else
-       if (sp[2] == VKI_AT_CLKTCK
-           && sp[0] == VKI_AT_PAGESZ
-           && sp[-2] == VKI_AT_HWCAP
-           && sp[-2-20-1] == 0) {
-          if (0)
-             VG_(printf)("Looks like you've got a early 2.4.X kernel here.\n");
-          sp -= 22;
-       } else
-         args_grok_error(
-            "ELF frame does not look like 2.2.X or 2.4.X.\n   "
-            "See kernel sources linux/fs/binfmt_elf.c to make sense of this."
-         );
-
-       sp--;
-       if (*sp != 0)
-	 args_grok_error("can't find NULL at end of env[]");
-
-       /* sp now points to NULL at the end of env[] */
-       ctr = 0;
-       while (True) {
-           sp --;
-           if (*sp == 0) break;
-           if (++ctr >= 1000)
-              args_grok_error(
-                 "suspiciously many (1000) env[] entries; giving up");
-           
-       }
-       /* sp now points to NULL at the end of argv[] */
-       VG_(client_envp) = (Char**)(sp+1);
-
-       ctr = 0;
-       VG_(client_argc) = 0;
-       while (True) {
-          sp--;
-          if (*sp == VG_(client_argc))
-             break;
-          VG_(client_argc)++;
-           if (++ctr >= 1000)
-              args_grok_error(
-                 "suspiciously many (1000) argv[] entries; giving up");
-       }
-
-       VG_(client_argv) = (Char**)(sp+1);
-   }
-
-   /* Now that VG_(client_envp) has been set, we can extract the args
-      for Valgrind itself.  Copy into global var so that we don't have to
-      write zeroes to the getenv'd value itself. */
-   str = VG_(getenv)("VG_ARGS");
-   argc = 0;
-
-   if (!str) {
-      config_error("Can't read options from env var VG_ARGS.");
-   }
-
-   if (VG_(strlen)(str) >= M_VG_CMDLINE_STRLEN-1) {
-      config_error("Command line length exceeds M_CMDLINE_STRLEN.");
-   }
-   VG_(strcpy)(vg_cmdline_copy, str);
-   str = NULL;
-
-   p = &vg_cmdline_copy[0];
-   while (True) {
-      while (ISSPACE(*p)) { *p = 0; p++; }
-      if (*p == 0) break;
-      if (argc < M_VG_CMDLINE_OPTS-1) { 
-         argv[argc] = p; argc++; 
-      } else {
-         config_error(
-            "Found more than M_CMDLINE_OPTS command-line opts.");
-      }
-      while (*p != 0 && !ISSPACE(*p)) p++;
-   }
-
-   for (i = 0; i < argc; i++) {
-
-      if (STREQ(argv[i], "-v") || STREQ(argv[i], "--verbose"))
-         VG_(clo_verbosity)++;
-      else if (STREQ(argv[i], "-q") || STREQ(argv[i], "--quiet"))
-         VG_(clo_verbosity)--;
-
-      else if (STREQ(argv[i], "--error-limit=yes"))
-         VG_(clo_error_limit) = True;
-      else if (STREQ(argv[i], "--error-limit=no"))
-         VG_(clo_error_limit) = False;
-
-      else if (STREQ(argv[i], "--check-addrVs=yes"))
-         VG_(clo_check_addrVs) = True;
-      else if (STREQ(argv[i], "--check-addrVs=no"))
-         VG_(clo_check_addrVs) = False;
-
-      else if (STREQ(argv[i], "--gdb-attach=yes"))
-         VG_(clo_GDB_attach) = True;
-      else if (STREQ(argv[i], "--gdb-attach=no"))
-         VG_(clo_GDB_attach) = False;
-
-      else if (STREQ(argv[i], "--demangle=yes"))
-         VG_(clo_demangle) = True;
-      else if (STREQ(argv[i], "--demangle=no"))
-         VG_(clo_demangle) = False;
-
-      else if (STREQ(argv[i], "--partial-loads-ok=yes"))
-         VG_(clo_partial_loads_ok) = True;
-      else if (STREQ(argv[i], "--partial-loads-ok=no"))
-         VG_(clo_partial_loads_ok) = False;
-
-      else if (STREQ(argv[i], "--leak-check=yes"))
-         VG_(clo_leak_check) = True;
-      else if (STREQ(argv[i], "--leak-check=no"))
-         VG_(clo_leak_check) = False;
-
-      else if (STREQ(argv[i], "--show-reachable=yes"))
-         VG_(clo_show_reachable) = True;
-      else if (STREQ(argv[i], "--show-reachable=no"))
-         VG_(clo_show_reachable) = False;
-
-      else if (STREQ(argv[i], "--leak-resolution=low"))
-         VG_(clo_leak_resolution) = 2;
-      else if (STREQ(argv[i], "--leak-resolution=med"))
-         VG_(clo_leak_resolution) = 4;
-      else if (STREQ(argv[i], "--leak-resolution=high"))
-         VG_(clo_leak_resolution) = VG_DEEPEST_BACKTRACE;
-
-      else if (STREQ(argv[i], "--sloppy-malloc=yes"))
-         VG_(clo_sloppy_malloc) = True;
-      else if (STREQ(argv[i], "--sloppy-malloc=no"))
-         VG_(clo_sloppy_malloc) = False;
-
-      else if (STREQN(12, argv[i], "--alignment="))
-         VG_(clo_alignment) = (Int)VG_(atoll)(&argv[i][12]);
-
-      else if (STREQ(argv[i], "--trace-children=yes"))
-         VG_(clo_trace_children) = True;
-      else if (STREQ(argv[i], "--trace-children=no"))
-         VG_(clo_trace_children) = False;
-
-      else if (STREQ(argv[i], "--workaround-gcc296-bugs=yes"))
-         VG_(clo_workaround_gcc296_bugs) = True;
-      else if (STREQ(argv[i], "--workaround-gcc296-bugs=no"))
-         VG_(clo_workaround_gcc296_bugs) = False;
-
-      else if (STREQN(15, argv[i], "--sanity-level="))
-         VG_(sanity_level) = (Int)VG_(atoll)(&argv[i][15]);
-
-      else if (STREQN(13, argv[i], "--logfile-fd="))
-         eventually_logfile_fd = (Int)VG_(atoll)(&argv[i][13]);
-
-      else if (STREQN(15, argv[i], "--freelist-vol=")) {
-         VG_(clo_freelist_vol) = (Int)VG_(atoll)(&argv[i][15]);
-         if (VG_(clo_freelist_vol) < 0) VG_(clo_freelist_vol) = 2;
-      }
-
-      else if (STREQN(15, argv[i], "--suppressions=")) {
-         if (VG_(clo_n_suppressions) >= VG_CLO_MAX_SFILES) {
-            VG_(message)(Vg_UserMsg, "Too many logfiles specified.");
-            VG_(message)(Vg_UserMsg, 
-                         "Increase VG_CLO_MAX_SFILES and recompile.");
-            bad_option(argv[i]);
-         }
-         VG_(clo_suppressions)[VG_(clo_n_suppressions)] = &argv[i][15];
-         VG_(clo_n_suppressions)++;
-      }
-      else if (STREQ(argv[i], "--single-step=yes"))
-         VG_(clo_single_step) = True;
-      else if (STREQ(argv[i], "--single-step=no"))
-         VG_(clo_single_step) = False;
-
-      else if (STREQ(argv[i], "--optimise=yes"))
-         VG_(clo_optimise) = True;
-      else if (STREQ(argv[i], "--optimise=no"))
-         VG_(clo_optimise) = False;
-
-      else if (STREQ(argv[i], "--instrument=yes"))
-         VG_(clo_instrument) = True;
-      else if (STREQ(argv[i], "--instrument=no"))
-         VG_(clo_instrument) = False;
-
-      else if (STREQ(argv[i], "--cleanup=yes"))
-         VG_(clo_cleanup) = True;
-      else if (STREQ(argv[i], "--cleanup=no"))
-         VG_(clo_cleanup) = False;
-
-      else if (STREQ(argv[i], "--cachesim=yes"))
-         VG_(clo_cachesim) = True;     
-      else if (STREQ(argv[i], "--cachesim=no"))
-         VG_(clo_cachesim) = False;
-
-      /* 5 is length of "--I1=" */
-      else if (0 == VG_(strncmp)(argv[i], "--I1=",    5))
-         parse_cache_opt(&VG_(clo_I1_cache), argv[i], 5);
-      else if (0 == VG_(strncmp)(argv[i], "--D1=",    5))
-         parse_cache_opt(&VG_(clo_D1_cache), argv[i], 5);
-      else if (0 == VG_(strncmp)(argv[i], "--L2=",    5))
-         parse_cache_opt(&VG_(clo_L2_cache), argv[i], 5);
-
-      else if (STREQ(argv[i], "--smc-check=none"))
-         VG_(clo_smc_check) = VG_CLO_SMC_NONE;
-      else if (STREQ(argv[i], "--smc-check=some"))
-         VG_(clo_smc_check) = VG_CLO_SMC_SOME;
-      else if (STREQ(argv[i], "--smc-check=all"))
-         VG_(clo_smc_check) = VG_CLO_SMC_ALL;
-
-      else if (STREQ(argv[i], "--trace-syscalls=yes"))
-         VG_(clo_trace_syscalls) = True;
-      else if (STREQ(argv[i], "--trace-syscalls=no"))
-         VG_(clo_trace_syscalls) = False;
-
-      else if (STREQ(argv[i], "--trace-signals=yes"))
-         VG_(clo_trace_signals) = True;
-      else if (STREQ(argv[i], "--trace-signals=no"))
-         VG_(clo_trace_signals) = False;
-
-      else if (STREQ(argv[i], "--trace-symtab=yes"))
-         VG_(clo_trace_symtab) = True;
-      else if (STREQ(argv[i], "--trace-symtab=no"))
-         VG_(clo_trace_symtab) = False;
-
-      else if (STREQ(argv[i], "--trace-malloc=yes"))
-         VG_(clo_trace_malloc) = True;
-      else if (STREQ(argv[i], "--trace-malloc=no"))
-         VG_(clo_trace_malloc) = False;
-
-      else if (STREQ(argv[i], "--trace-sched=yes"))
-         VG_(clo_trace_sched) = True;
-      else if (STREQ(argv[i], "--trace-sched=no"))
-         VG_(clo_trace_sched) = False;
-
-      else if (STREQ(argv[i], "--trace-pthread=none"))
-         VG_(clo_trace_pthread_level) = 0;
-      else if (STREQ(argv[i], "--trace-pthread=some"))
-         VG_(clo_trace_pthread_level) = 1;
-      else if (STREQ(argv[i], "--trace-pthread=all"))
-         VG_(clo_trace_pthread_level) = 2;
-
-      else if (STREQN(14, argv[i], "--weird-hacks="))
-         VG_(clo_weird_hacks) = &argv[i][14];
-
-      else if (STREQN(13, argv[i], "--stop-after="))
-         VG_(clo_stop_after) = VG_(atoll)(&argv[i][13]);
-
-      else if (STREQN(13, argv[i], "--dump-error="))
-         VG_(clo_dump_error) = (Int)VG_(atoll)(&argv[i][13]);
-
-      else if (STREQN(14, argv[i], "--num-callers=")) {
-         /* Make sure it's sane. */
-	 VG_(clo_backtrace_size) = (Int)VG_(atoll)(&argv[i][14]);
-         if (VG_(clo_backtrace_size) < 2)
-            VG_(clo_backtrace_size) = 2;
-         if (VG_(clo_backtrace_size) >= VG_DEEPEST_BACKTRACE)
-            VG_(clo_backtrace_size) = VG_DEEPEST_BACKTRACE;
-      }
-
-      else
-         bad_option(argv[i]);
-   }
-
-#  undef ISSPACE
-#  undef STREQ
-#  undef STREQN
-
-   if (VG_(clo_verbosity < 0))
-      VG_(clo_verbosity) = 0;
-
-   if (VG_(clo_alignment) < 4 
-       || VG_(clo_alignment) > 4096
-       || VG_(log2)( VG_(clo_alignment) ) == -1 /* not a power of 2 */) {
-      VG_(message)(Vg_UserMsg, "");
-      VG_(message)(Vg_UserMsg, 
-         "Invalid --alignment= setting.  "
-         "Should be a power of 2, >= 4, <= 4096.");
-      bad_option("--alignment");
-   }
-
-   if (VG_(clo_GDB_attach) && VG_(clo_trace_children)) {
-      VG_(message)(Vg_UserMsg, "");
-      VG_(message)(Vg_UserMsg, 
-         "--gdb-attach=yes conflicts with --trace-children=yes");
-      VG_(message)(Vg_UserMsg, 
-         "Please choose one or the other, but not both.");
-      bad_option("--gdb-attach=yes and --trace-children=yes");
-   }
-
-   VG_(clo_logfile_fd) = eventually_logfile_fd;
-
-   /* Don't do memory checking if simulating the cache. */
-   if (VG_(clo_cachesim)) {
-       VG_(clo_instrument) = False;
-   }
-
-   if (VG_(clo_verbosity > 0)) {
-      if (VG_(clo_cachesim)) {
-         VG_(message)(Vg_UserMsg, 
-            "cachegrind-%s, an I1/D1/L2 cache profiler for x86 GNU/Linux.",
-            VERSION);
-      } else {
-         VG_(message)(Vg_UserMsg, 
-            "valgrind-%s, a memory error detector for x86 GNU/Linux.",
-            VERSION);
-      }
-   }
-
-   if (VG_(clo_verbosity > 0))
-      VG_(message)(Vg_UserMsg, 
-                   "Copyright (C) 2000-2002, and GNU GPL'd, by Julian Seward.");
-   if (VG_(clo_verbosity) > 1) {
-      VG_(message)(Vg_UserMsg, "Startup, with flags:");
-      for (i = 0; i < argc; i++) {
-         VG_(message)(Vg_UserMsg, "   %s", argv[i]);
-      }
-   }
-
-   if (VG_(clo_n_suppressions) == 0 && !VG_(clo_cachesim)) {
-      config_error("No error-suppression files were specified.");
-   }
-}
-
-
-/* ---------------------------------------------------------------------
-   Copying to/from m_state_static.
-   ------------------------------------------------------------------ */
-
-UInt VG_(m_state_static) [8 /* int regs, in Intel order: words 0..7 hold
-                               eax, ecx, edx, ebx, esp, ebp, esi, edi */ 
-                          + 1 /* %eflags (word 8) */ 
-                          + 1 /* %eip (word 9) */
-                          + VG_SIZE_OF_FPUSTATE_W /* FPU state (word 10 onwards) */
-                         ];
-
-void VG_(copy_baseBlock_to_m_state_static) ( void )
-{
-   /* Snapshot the machine state held in VG_(baseBlock) into
-      VG_(m_state_static): the eight integer registers go to words
-      0..7, %eflags to word 8, %eip to word 9, and the FPU state words
-      follow from word 10. */
-   UInt* dst = VG_(m_state_static);
-   Int   w;
-
-   dst[0] = VG_(baseBlock)[VGOFF_(m_eax)];
-   dst[1] = VG_(baseBlock)[VGOFF_(m_ecx)];
-   dst[2] = VG_(baseBlock)[VGOFF_(m_edx)];
-   dst[3] = VG_(baseBlock)[VGOFF_(m_ebx)];
-   dst[4] = VG_(baseBlock)[VGOFF_(m_esp)];
-   dst[5] = VG_(baseBlock)[VGOFF_(m_ebp)];
-   dst[6] = VG_(baseBlock)[VGOFF_(m_esi)];
-   dst[7] = VG_(baseBlock)[VGOFF_(m_edi)];
-
-   dst[8] = VG_(baseBlock)[VGOFF_(m_eflags)];
-   dst[9] = VG_(baseBlock)[VGOFF_(m_eip)];
-
-   w = 0;
-   while (w < VG_SIZE_OF_FPUSTATE_W) {
-      dst[10 + w] = VG_(baseBlock)[VGOFF_(m_fpustate) + w];
-      w++;
-   }
-}
-
-
-void VG_(copy_m_state_static_to_baseBlock) ( void )
-{
-   /* Inverse of VG_(copy_baseBlock_to_m_state_static): load the
-      machine state saved in VG_(m_state_static) (integer registers in
-      words 0..7, %eflags in word 8, %eip in word 9, FPU state from
-      word 10) into the corresponding slots of VG_(baseBlock). */
-   UInt* src = VG_(m_state_static);
-   Int   w;
-
-   VG_(baseBlock)[VGOFF_(m_eax)] = src[0];
-   VG_(baseBlock)[VGOFF_(m_ecx)] = src[1];
-   VG_(baseBlock)[VGOFF_(m_edx)] = src[2];
-   VG_(baseBlock)[VGOFF_(m_ebx)] = src[3];
-   VG_(baseBlock)[VGOFF_(m_esp)] = src[4];
-   VG_(baseBlock)[VGOFF_(m_ebp)] = src[5];
-   VG_(baseBlock)[VGOFF_(m_esi)] = src[6];
-   VG_(baseBlock)[VGOFF_(m_edi)] = src[7];
-
-   VG_(baseBlock)[VGOFF_(m_eflags)] = src[8];
-   VG_(baseBlock)[VGOFF_(m_eip)]    = src[9];
-
-   w = 0;
-   while (w < VG_SIZE_OF_FPUSTATE_W) {
-      VG_(baseBlock)[VGOFF_(m_fpustate) + w] = src[10 + w];
-      w++;
-   }
-}
-
-
-/* ---------------------------------------------------------------------
-   Show accumulated counts.
-   ------------------------------------------------------------------ */
-
-static void vg_show_counts ( void )
-{
-   /* Emits five debug-level lines summarising the internal counters;
-      nothing is modified. */
-
-   /* LRU epochs and clearings. */
-   VG_(message)(Vg_DebugMsg, "      lru: %d epochs, %d clearings.",
-                VG_(current_epoch), VG_(number_of_lrus) );
-
-   /* Translations created and discarded. */
-   VG_(message)(Vg_DebugMsg,
-                "translate: new %d (%d -> %d), discard %d (%d -> %d).",
-                VG_(overall_in_count),  VG_(overall_in_osize),
-                VG_(overall_in_tsize),  VG_(overall_out_count),
-                VG_(overall_out_osize), VG_(overall_out_tsize) );
-
-   /* Dispatcher and scheduler activity. */
-   VG_(message)(Vg_DebugMsg,
-      " dispatch: %lu basic blocks, %d/%d sched events, %d tt_fast misses.", 
-      VG_(bbs_done),
-      VG_(num_scheduling_events_MAJOR), VG_(num_scheduling_events_MINOR),
-      VG_(tt_fast_misses) );
-
-   /* Register allocator statistics. */
-   VG_(message)(Vg_DebugMsg, 
-                "reg-alloc: %d t-req-spill, "
-                "%d+%d orig+spill uis, %d total-reg-r.",
-                VG_(translations_needing_spill), VG_(uinstrs_prealloc),
-                VG_(uinstrs_spill), VG_(total_reg_rank) );
-
-   /* Number of cheap and expensive sanity checks. */
-   VG_(message)(Vg_DebugMsg, "   sanity: %d cheap, %d expensive checks.",
-                VG_(sanity_fast_count), VG_(sanity_slow_count) );
-}
-
-
-/* ---------------------------------------------------------------------
-   Main!
-   ------------------------------------------------------------------ */
-
-/* Where we jump to once Valgrind has got control, and the real
-   machine's state has been copied to the m_state_static.
-
-   In order, this: plants the stack sanity-check words, loads the saved
-   machine state into baseBlock, parses the command-line options,
-   initialises the scheduler, signal handling, (optionally) the memory
-   audit, the symbol tables and the translation table/cache, then runs
-   the client under VG_(scheduler).  When the scheduler returns it
-   prints the requested reports, shuts logging down and exits in a way
-   that depends on the scheduler's return code.  The order of the
-   initialisation steps matters; see the individual comments. */
-
-void VG_(main) ( void )
-{
-   Int               i;
-   VgSchedReturnCode src;
-   ThreadState*      tst;
-
-   /* Set up our stack sanity-check words.  The first 10 and the last
-      10 words of VG_(stack) are each set to their own address XORed
-      with a fixed pattern.  NOTE(review): the indexing assumes
-      VG_(stack) has exactly 10000 words -- confirm against its
-      declaration. */
-   for (i = 0; i < 10; i++) {
-      VG_(stack)[i]         = (UInt)(&VG_(stack)[i])         ^ 0xA4B3C2D1;
-      VG_(stack)[10000-1-i] = (UInt)(&VG_(stack)[10000-i-1]) ^ 0xABCD4321;
-   }
-
-   /* Set up baseBlock offsets and copy the saved machine's state into
-      it. */
-   vg_init_baseBlock();
-   VG_(copy_m_state_static_to_baseBlock)();
-
-   /* Process Valgrind's command-line opts (from env var VG_OPTS). */
-   process_cmd_line_options();
-
-   /* Hook to delay things long enough so we can get the pid and
-      attach GDB in another shell.  Compiled out by the constant-false
-      condition; change it to 1 to enable the busy-wait. */
-   if (0) { 
-      Int p, q;
-      for (p = 0; p < 50000; p++)
-         for (q = 0; q < 50000; q++) ;
-   }
-
-   /* Initialise the scheduler, and copy the client's state from
-      baseBlock into VG_(threads)[1].  This has to come before signal
-      initialisations. */
-   VG_(scheduler_init)();
-
-   /* Initialise the signal handling subsystem, temporarily parking
-      the saved blocking-mask in saved_sigmask. */
-   VG_(sigstartup_actions)();
-
-   /* Perhaps we're profiling Valgrind? */
-#  ifdef VG_PROFILE
-   VGP_(init_profiling)();
-#  endif
-
-   /* Start calibration of our RDTSC-based clock. */
-   VG_(start_rdtsc_calibration)();
-
-   if (VG_(clo_instrument) || VG_(clo_cachesim)) {
-      VGP_PUSHCC(VgpInitAudit);
-      VGM_(init_memory_audit)();
-      VGP_POPCC;
-   }
-
-   VGP_PUSHCC(VgpReadSyms);
-   VG_(read_symbols)();
-   VGP_POPCC;
-
-   /* End calibration of our RDTSC-based clock, leaving it as long as
-      we can. */
-   VG_(end_rdtsc_calibration)();
-
-   /* This should come after init_memory_audit; otherwise the latter
-      carefully sets up the permissions maps to cover the anonymous
-      mmaps for the translation table and translation cache, which
-      wastes > 20M of virtual address space. */
-   VG_(init_tt_tc)();
-
-   if (VG_(clo_verbosity) == 1) {
-      VG_(message)(Vg_UserMsg, 
-                   "For more details, rerun with: -v");
-   }
-
-   /* Now it is safe for malloc et al in vg_clientmalloc.c to act
-      instrumented-ly.  When instrumenting, the first byte of each of
-      the four flags below is also marked readable. */
-   VG_(running_on_simd_CPU) = True;
-   if (VG_(clo_instrument)) {
-      VGM_(make_readable) ( (Addr)&VG_(running_on_simd_CPU), 1 );
-      VGM_(make_readable) ( (Addr)&VG_(clo_instrument), 1 );
-      VGM_(make_readable) ( (Addr)&VG_(clo_trace_malloc), 1 );
-      VGM_(make_readable) ( (Addr)&VG_(clo_sloppy_malloc), 1 );
-   }
-
-   if (VG_(clo_cachesim)) 
-      VG_(init_cachesim)();
-
-   if (VG_(clo_verbosity) > 0)
-      VG_(message)(Vg_UserMsg, "");
-
-   VG_(bbs_to_go) = VG_(clo_stop_after);
-
-   /* Run!  src records why the scheduler returned and selects the
-      exit path in the switch at the end of this function. */
-   VGP_PUSHCC(VgpSched);
-   src = VG_(scheduler)();
-   VGP_POPCC;
-
-   if (VG_(clo_verbosity) > 0)
-      VG_(message)(Vg_UserMsg, "");
-
-   if (src == VgSrc_Deadlock) {
-     VG_(message)(Vg_UserMsg, 
-        "Warning: pthread scheduler exited due to deadlock");
-   }
-
-   if (VG_(clo_instrument)) {
-      VG_(show_all_errors)();
-      VG_(clientmalloc_done)();
-      if (VG_(clo_verbosity) == 1) {
-         VG_(message)(Vg_UserMsg, 
-                      "For counts of detected errors, rerun with: -v");
-      }
-      if (VG_(clo_leak_check)) VG_(detect_memory_leaks)();
-   }
-   VG_(running_on_simd_CPU) = False;
-
-   if (VG_(clo_cachesim))
-      VG_(do_cachesim_results)(VG_(client_argc), VG_(client_argv));
-
-   VG_(do_sanity_checks)( True /*include expensive checks*/ );
-
-   if (VG_(clo_verbosity) > 1)
-      vg_show_counts();
-
-   if (0) {
-      VG_(message)(Vg_DebugMsg, "");
-      VG_(message)(Vg_DebugMsg, 
-         "------ Valgrind's internal memory use stats follow ------" );
-      VG_(mallocSanityCheckAll)();
-      VG_(show_all_arena_stats)();
-      VG_(message)(Vg_DebugMsg, 
-         "------ Valgrind's ExeContext management stats follow ------" );
-      VG_(show_ExeContext_stats)();
-      VG_(message)(Vg_DebugMsg, 
-         "------ Valgrind's client block stats follow ---------------" );
-      VG_(show_client_block_stats)();
-   }
- 
-#  ifdef VG_PROFILE
-   VGP_(done_profiling)();
-#  endif
-
-   VG_(done_prof_mem)();
-
-   VG_(shutdown_logging)();
-
-   /* Remove valgrind.so from a LD_PRELOAD=... string so child
-      processes don't get traced into.  Also mess up $libdir/valgrind
-      so that our libpthread.so disappears from view. */
-   if (!VG_(clo_trace_children)) { 
-      VG_(mash_LD_PRELOAD_and_LD_LIBRARY_PATH)(
-         VG_(getenv)("LD_PRELOAD"),
-         VG_(getenv)("LD_LIBRARY_PATH") 
-      );
-   }
-
-   /* Decide how to exit.  This depends on what the scheduler
-      returned. */
-   switch (src) {
-      case VgSrc_ExitSyscall: /* the normal way out */
-         vg_assert(VG_(last_run_tid) > 0 
-                   && VG_(last_run_tid) < VG_N_THREADS);
-         tst = & VG_(threads)[VG_(last_run_tid)];
-         vg_assert(tst->status == VgTs_Runnable);
-         /* The thread's %EBX will hold the arg to exit(), so we just
-            do exit with that arg. */
-         VG_(exit)( tst->m_ebx );
-         /* NOT ALIVE HERE! */
-         VG_(panic)("entered the afterlife in vg_main() -- ExitSyscall");
-         break; /* what the hell :) */
-
-      case VgSrc_Deadlock:
-         /* Just exit now.  No point in continuing. */
-         VG_(exit)(0);
-         VG_(panic)("entered the afterlife in vg_main() -- Deadlock");
-         break;
-
-      case VgSrc_BbsDone: 
-         /* Tricky; we have to try and switch back to the real CPU.
-            This is all very dodgy and won't work at all in the
-            presence of threads, or if the client happened to be
-            running a signal handler. */
-         /* Prepare to restore state to the real CPU. */
-         VG_(load_thread_state)(1 /* root thread */ );
-         VG_(copy_baseBlock_to_m_state_static)();
-
-         /* This pushes a return address on the simulator's stack,
-            which is abandoned.  We call vg_sigshutdown_actions() at
-            the end of vg_switch_to_real_CPU(), so as to ensure that
-            the original stack and machine state is restored before
-            the real signal mechanism is restored.  */
-         VG_(switch_to_real_CPU)(); /* presumably never returns; there is no break, so a return would fall into the default panic */
-
-      default:
-         VG_(panic)("vg_main(): unexpected scheduler return code");
-   }
-}
-
-
-/* Debugging thing .. can be called from assembly with OYNK macro.
-   It only forwards n to the OINK macro, which is defined outside this
-   function. */
-void VG_(oynk) ( Int n )
-{
-   OINK(n);
-}
-
-
-/* Find "valgrind.so" in a LD_PRELOAD=... string, and convert it to
-   "valgrinq.so", which doesn't do anything.  This is used to avoid
-   tracing into child processes.  To make this work the build system
-   also supplies a dummy file, "valgrinq.so". 
-
-   Also look for VG_LIBDIR "/valgrind" in LD_LIBRARY_PATH and change
-   it to VG_LIBDIR "/valgrinq", so as to make our libpthread.so
-   disappear.
-
-   Both strings are edited in place (one character each).  If the
-   strings already look mashed the function returns without touching
-   them.  Any other unexpected shape is treated as an internal error:
-   diagnostics are printed and VG_(panic) is called.  The `what' code
-   printed in the diagnostics identifies the failed step:
-      0  one of the two strings is NULL
-      1  no "valgrind.so" in LD_PRELOAD, and not already mashed
-      2  VG_LIBDIR not found in LD_LIBRARY_PATH
-      3  unexpected character where the 'd' of "valgrind.so" should be
-      4  VG_LIBDIR in LD_LIBRARY_PATH is not followed by '/'
-      5  the path component after VG_LIBDIR "/" is not "valgrind"
-
-   Both strings are fully validated before either is modified, so the
-   diagnostics always show the strings as they were passed in.
-*/
-void VG_(mash_LD_PRELOAD_and_LD_LIBRARY_PATH) ( Char* ld_preload_str,
-                                                Char* ld_library_path_str )
-{
-   Char* p_prel = NULL;
-   Char* p_path = NULL;
-   Int   what = 0;
-   if (ld_preload_str == NULL || ld_library_path_str == NULL)
-      goto mutancy;
-
-   /* VG_(printf)("%s %s\n", ld_preload_str, ld_library_path_str); */
-
-   p_prel = VG_(strstr)(ld_preload_str, "valgrind.so");
-   p_path = VG_(strstr)(ld_library_path_str, VG_LIBDIR);
-
-   if (p_prel == NULL) {
-      /* perhaps already happened? */
-      what = 1;
-      if (VG_(strstr)(ld_preload_str, "valgrinq.so") == NULL)
-         goto mutancy;
-      if (VG_(strstr)(ld_library_path_str, "lib/valgrinq") == NULL)
-         goto mutancy;
-      return;
-   }
-
-   what = 2;
-   if (p_path == NULL) goto mutancy;
-
-   /* p_prel points at "valgrind.so", so index 7 is its 'd'. */
-   what = 3;
-   if (p_prel[7] != 'd') goto mutancy;
-
-   /* In LD_LIBRARY_PATH, VG_LIBDIR (as configure'd) must be followed
-      by "/valgrind".  Compare the whole component rather than peeking
-      at index 7 alone: if fewer than 8 characters follow the '/',
-      that peek would read past the end of the string. */
-   p_path += VG_(strlen)(VG_LIBDIR);
-   what = 4;
-   if (p_path[0] != '/') goto mutancy;
-   p_path++; /* step over / */
-   what = 5;
-   if (VG_(strncmp)(p_path, "valgrind", 8) != 0) goto mutancy;
-
-   /* Everything checks out; now do both edits.  valgrind.so becomes
-      valgrinq.so, and .../valgrind becomes .../valgrinq, which
-      doesn't exist, so that our own libpthread.so goes out of
-      scope. */
-   p_prel[7] = 'q';
-   p_path[7] = 'q';
-   return;
-
-  mutancy:
-   VG_(printf)(
-      "\nVG_(mash_LD_PRELOAD_and_LD_LIBRARY_PATH): internal error:\n"
-      "   what                = %d\n"
-      "   ld_preload_str      = `%s'\n"
-      "   ld_library_path_str = `%s'\n"
-      "   p_prel              = `%s'\n"
-      "   p_path              = `%s'\n"
-      "   VG_LIBDIR           = `%s'\n",
-      what, ld_preload_str, ld_library_path_str, 
-      p_prel, p_path, VG_LIBDIR 
-   );
-   VG_(printf)(
-      "\n"
-      "Note that this is often caused by mis-installation of valgrind.\n"
-      "Correct installation procedure is:\n"
-      "   ./configure --prefix=/install/dir\n"
-      "   make install\n"
-      "And then use /install/dir/bin/valgrind\n"
-      "Moving the installation directory elsewhere after 'make install'\n"
-      "will cause the above error.  Hand-editing the paths in the shell\n"
-      "scripts is also likely to cause problems.\n"
-      "\n"
-   );
-   VG_(panic)("VG_(mash_LD_PRELOAD_and_LD_LIBRARY_PATH) failed\n");
-}
-
-
-/* RUNS ON THE CLIENT'S STACK, but on the real CPU.  Builds the command
-   line "/usr/bin/gdb -nw /proc/<pid>/exe <pid>" for this process,
-   announces it, and runs it via VG_(system).  Intended for use after
-   an error has been shown, so the user can inspect parameters, memory
-   and so on.  GDB cannot meaningfully continue the program; quitting
-   GDB is how execution resumes.  A zero result from VG_(system) is
-   reported as GDB having detached, anything else as a failure. */
-extern void VG_(start_GDB_whilst_on_client_stack) ( void )
-{
-   UChar cmd[100];
-   Int   status;
-
-   VG_(sprintf)(cmd, "/usr/bin/gdb -nw /proc/%d/exe %d", 
-                VG_(getpid)(), VG_(getpid)());
-   VG_(message)(Vg_UserMsg, "starting GDB with cmd: %s", cmd);
-
-   status = VG_(system)(cmd);
-   if (status != 0) {
-      VG_(message)(Vg_UserMsg, "Apparently failed!");
-      VG_(message)(Vg_UserMsg, "");
-   } else {
-      VG_(message)(Vg_UserMsg, "");
-      VG_(message)(Vg_UserMsg, 
-         "GDB has detached.  Valgrind regains control.  We continue.");
-   }
-}
-
-
-/* Report that the client needs something Valgrind does not implement,
-   then give up: prints an explanatory message containing msg, dumps
-   the scheduler status and exits with status 1. */
-void VG_(unimplemented) ( Char* msg )
-{
-   VG_(message)(Vg_UserMsg, "");
-   VG_(message)(Vg_UserMsg, "Valgrind detected that your program requires");
-   VG_(message)(Vg_UserMsg, "the following unimplemented functionality:");
-   VG_(message)(Vg_UserMsg, "   %s", msg);
-   VG_(message)(Vg_UserMsg,
-                "This may be because the functionality is hard to implement,");
-   VG_(message)(Vg_UserMsg,
-                "or because no reasonable program would behave this way,");
-   VG_(message)(Vg_UserMsg,
-                "or because nobody has yet needed it.  In any case, let me know");
-   VG_(message)(Vg_UserMsg,
-                "(jseward@acm.org) and/or try to work around the problem, if you can.");
-   VG_(message)(Vg_UserMsg, "");
-   VG_(message)(Vg_UserMsg, "Valgrind has to exit now.  Sorry.  Bye!");
-   VG_(message)(Vg_UserMsg, "");
-
-   /* Show the scheduler status before exiting. */
-   VG_(pp_sched_status)();
-   VG_(exit)(1);
-}
-
-
-void VG_(nvidia_moan) ( void )
-{
-   /* A single user-level message suggesting that NVidia's libGL.so
-      might be behind the failure reported next.  The embedded
-      newlines and indentation are part of the message text. */
-   VG_(message)(
-      Vg_UserMsg,
-      "The following failure _might_ be caused by linking to NVidia's\n   "
-      "libGL.so, so avoiding it, if you can, _might_ help you.  For example,\n   "
-      "re-build any Qt libraries you are using without OpenGL support."
-   );
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                                vg_main.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_malloc2.c b/coregrind/vg_malloc2.c
deleted file mode 100644
index 87f580d01a..0000000000
--- a/coregrind/vg_malloc2.c
+++ /dev/null
@@ -1,1299 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- An implementation of malloc/free which doesn't use sbrk.     ---*/
-/*---                                                 vg_malloc2.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-
-#include "vg_include.h"
-
-/* Define to turn on (heavyweight) debugging machinery. */
-/* #define DEBUG_MALLOC */
-
-
-/*------------------------------------------------------------*/
-/*--- Structs n stuff                                      ---*/
-/*------------------------------------------------------------*/
-
-#define VG_REDZONE_LO_MASK 0x31415927 /* NOTE(review): presumably the pattern for lower red-zone words -- confirm at the use sites */
-#define VG_REDZONE_HI_MASK 0x14141356 /* NOTE(review): presumably the pattern for upper red-zone words -- confirm at the use sites */
-
-#define VG_N_MALLOC_LISTS 16 /* do not change this; it is the number of free lists in each Arena */
-
-
-typedef UInt Word;  /* the allocator's unit of storage; sizes ending in W are counted in Words */
-typedef Word WordF; /* a WordF* points at the first word of a block */
-typedef Word WordL; /* a WordL* points at the last word of a block */
-
-
-/* A superblock: one chunk of memory obtained by newSuperblock().  The
-   superblocks of an arena form a singly linked list through .next,
-   which findSb() walks starting from the arena's .sblocks field.  The
-   two header fields are followed directly by the payload words. */
-typedef 
-   struct _Superblock {
-      struct _Superblock* next; /* next superblock in the list; NULL ends it */
-      /* number of payload words in this superblock. */
-      Int  n_payload_words;
-      Word payload_words[0]; /* zero-length array marking where the payload starts */
-   }
-   Superblock;
-
-
-/* An arena: an independent heap with its own red-zone policy, free
-   lists and list of superblocks.  All fields are set up by
-   arena_init(). */
-typedef 
-   struct {
-      Char*       name; /* label printed in statistics and diagnostics */
-      Int         rz_szW; /* Red zone size in words */
-      Bool        rz_check; /* Check red-zone on free? */
-      Int         min_sblockW; /* Minimum superblock size, in words; must be a multiple of VKI_WORDS_PER_PAGE */
-      WordF*      freelist[VG_N_MALLOC_LISTS]; /* heads of the free lists; NULL when empty */
-      Superblock* sblocks; /* head of this arena's superblock list */
-      /* Stats only. */
-      UInt bytes_on_loan; /* reported as "current useful" */
-      UInt bytes_mmaped; /* bytes obtained for superblocks so far */
-      UInt bytes_on_loan_max; /* reported as "max useful" */
-   } 
-   Arena;
-
-
-/* Block layout:
-
-     this block total sizeW   (1 word)
-     freelist previous ptr    (1 word)
-     freelist next  ptr       (1 word)
-     red zone words (depends on .rz_szW field of Arena)
-     (payload words)
-     red zone words (depends on .rz_szW field of Arena)
-     this block total sizeW  (1 word)
-
-     Total size in words (bszW) and payload size in words (pszW)
-     are related by
-        bszW == pszW + 4 + 2 * a->rz_szW
-
-     Furthermore, both size fields in the block are negative if it is
-     not in use, and positive if it is in use.  A block size of zero
-     is not possible, because a block always has at least four words
-     of overhead.  
-
-     The struct below mirrors only the leading part of this layout
-     (the first three words and the start of the lower red zone); the
-     accessor functions further down address all fields by word index
-     instead.
-*/
-typedef
-   struct {
-      Int   bszW_lo; /* word 0: total size in words; the sign carries the in-use flag */
-      Word* prev; /* word 1: freelist previous ptr */
-      Word* next; /* word 2: freelist next ptr */
-      Word  redzone[0]; /* word 3 onwards: lower red zone, then the payload */
-   } 
-   BlockHeader;
-
-
-/*------------------------------------------------------------*/
-/*--- Forwardses ... and misc ...                          ---*/
-/*------------------------------------------------------------*/
-
-static Bool blockSane ( Arena* a, Word* b );
-
-/* Round pointer p up to the next multiple of align.  A pointer that
-   is already a multiple of align is returned unchanged. */
-static
-void* align_upwards ( void* p, Int align )
-{
-   Addr addr = (Addr)p;
-   Addr rem  = addr % align;
-   if (rem != 0)
-      addr = addr - rem + align;
-   return (void*)addr;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Arena management stuff                               ---*/
-/*------------------------------------------------------------*/
-
-/* Storage for every arena, indexed by ArenaId. */
-static Arena vg_arena[VG_N_ARENAS];
-
-/* Code outside this module names arenas by ArenaId rather than by
-   Arena*.  Translate an id into the address of its arena, asserting
-   that the id is in range. */
-static Arena* arenaId_to_ArenaP ( ArenaId arena )
-{
-   vg_assert(arena >= 0 && arena < VG_N_ARENAS);
-   return vg_arena + arena;
-}
-
-
-/* Put arena a into its initial, empty state: record its name and
-   red-zone / superblock parameters, empty every free list, and zero
-   the statistics.  min_sblockW must be a whole number of pages. */
-static
-void arena_init ( Arena* a, Char* name, 
-                  Int rz_szW, Bool rz_check, Int min_sblockW )
-{
-   Int list_no;
-
-   vg_assert((min_sblockW % VKI_WORDS_PER_PAGE) == 0);
-
-   /* Configuration. */
-   a->name        = name;
-   a->rz_szW      = rz_szW;
-   a->rz_check    = rz_check;
-   a->min_sblockW = min_sblockW;
-
-   /* No free blocks and no superblocks yet. */
-   list_no = 0;
-   while (list_no < VG_N_MALLOC_LISTS) {
-      a->freelist[list_no] = NULL;
-      list_no++;
-   }
-   a->sblocks = NULL;
-
-   /* Statistics. */
-   a->bytes_on_loan     = 0;
-   a->bytes_mmaped      = 0;
-   a->bytes_on_loan_max = 0;
-}
-
-
-/* Print one debug-level line of vital statistics for every arena:
-   the high-water mark of bytes on loan, bytes mmap'd, and bytes
-   currently on loan. */
-void VG_(show_all_arena_stats) ( void )
-{
-   Arena* a;
-   for (a = &vg_arena[0]; a < &vg_arena[VG_N_ARENAS]; a++) {
-      VG_(message)(Vg_DebugMsg,
-         "Arena `%s': %7d max useful, %7d mmap'd, %7d current useful",
-         a->name, a->bytes_on_loan_max, a->bytes_mmaped, a->bytes_on_loan
-      );
-   }
-}
-
-
-/* This library has to initialise itself, because it may be called
-   very early on -- as a result of C++ static constructor
-   initialisations -- before Valgrind itself is initialised.  The
-   original author's note says vg_malloc() and vg_free() therefore
-   always call ensure_mm_init() first.  The first call sets up every
-   arena; later calls do nothing. */
-
-static
-void ensure_mm_init ( void )
-{
-   static Bool init_done = False;
-
-   if (!init_done) {
-
-      /* Our internal arenas get a small red zone which is checked
-         when a block is freed.  The client arena gets a red zone of
-         VG_AR_CLIENT_REDZONE_SZW words with that check switched off:
-         the client's red zone is really checked via the
-         addressibility maps, not by the mechanism implemented here,
-         which merely verifies at free time that the red zone words
-         are unchanged. */
-
-      arena_init ( &vg_arena[VG_AR_PRIVATE],   "private ", 
-                   1, True, 262144 );
-      arena_init ( &vg_arena[VG_AR_SYMTAB],    "symtab  ", 
-                   1, True, 262144 );
-      arena_init ( &vg_arena[VG_AR_CLIENT],    "client  ",  
-                   VG_AR_CLIENT_REDZONE_SZW, False, 262144 );
-      arena_init ( &vg_arena[VG_AR_DEMANGLE],  "demangle",  
-                   4 /*paranoid*/, True, 16384 );
-      arena_init ( &vg_arena[VG_AR_EXECTXT],   "exectxt ",  
-                   1, True, 16384 );
-      arena_init ( &vg_arena[VG_AR_ERRCTXT],   "errctxt ",  
-                   1, True, 16384 );
-      arena_init ( &vg_arena[VG_AR_TRANSIENT], "transien",  
-                   2, True, 16384 );
-
-      /* Set the flag before the optional sanity check below, as the
-         original did. */
-      init_done = True;
-#     ifdef DEBUG_MALLOC
-      VG_(mallocSanityCheckAll)();
-#     endif
-   }
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Superblock management stuff                          ---*/
-/*------------------------------------------------------------*/
-
-/* Obtain a new superblock for arena a with room for at least cszW
-   payload words.  The request is grown by the two header words
-   (sb->next and sb->n_payload_words), raised to the arena's minimum
-   superblock size if smaller, and rounded up to a whole number of
-   pages.  The arena's bytes_mmaped statistic is updated.  The next
-   field of the result is not set here. */
-static
-Superblock* newSuperblock ( Arena* a, Int cszW )
-{
-   Superblock* sb;
-   Int         totalW = cszW + 2;
-
-   if (totalW < a->min_sblockW)
-      totalW = a->min_sblockW;
-   for (; (totalW % VKI_WORDS_PER_PAGE) > 0; totalW++)
-      ;
-
-   sb = VG_(get_memory_from_mmap) ( totalW * sizeof(Word), 
-                                    "newSuperblock" );
-   sb->n_payload_words = totalW - 2;
-   a->bytes_mmaped += totalW * sizeof(Word);
-
-   if (0)
-      VG_(message)(Vg_DebugMsg, "newSuperblock, %d payload words", 
-                                sb->n_payload_words);
-   return sb;
-}
-
-
-/* Return the superblock of arena a whose payload area contains the
-   address ch.  If no superblock does, print a diagnostic and panic. */
-static
-Superblock* findSb ( Arena* a, UInt* ch )
-{
-   Superblock* sb = a->sblocks;
-   while (sb != NULL) {
-      if (ch >= &sb->payload_words[0]
-          && ch < &sb->payload_words[sb->n_payload_words]) 
-         return sb;
-      sb = sb->next;
-   }
-   VG_(printf)("findSb: can't find pointer %p in arena `%s'\n",
-               ch, a->name );
-   VG_(panic)("findSb: vg_free() in wrong arena?");
-   return NULL; /*NOTREACHED*/
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Low-level functions for working with blocks.         ---*/
-/*------------------------------------------------------------*/
-
-/* Return bszW tagged as not-in-use, i.e. in its negative form. */
-static __inline__
-Int mk_free_bszW ( Int bszW )
-{
-   vg_assert(bszW != 0);
-   if (bszW < 0)
-      return bszW;
-   return -bszW;
-}
-
-/* Return bszW tagged as in-use, i.e. in its positive form. */
-static __inline__
-Int mk_inuse_bszW ( Int bszW )
-{
-   vg_assert(bszW != 0);
-   if (bszW < 0)
-      return -bszW;
-   return bszW;
-}
-
-/* Strip the in-use/not-in-use tag from a bszW, leaving the plain
-   (positive) size. */
-static __inline__
-Int mk_plain_bszW ( Int bszW )
-{
-   vg_assert(bszW != 0);
-   if (bszW < 0)
-      return -bszW;
-   return bszW;
-}
-
-/* True iff this bszW carries the in-use tag, i.e. is not negative. */
-static __inline__
-Bool is_inuse_bszW ( Int bszW )
-{
-   vg_assert(bszW != 0);
-   if (bszW < 0)
-      return False;
-   return True;
-}
-
-
-/* Map the address of a block's first word to the address of its last
-   word, using the size stored in the first word. */
-static __inline__
-WordL* first_to_last ( WordF* fw )
-{
-   Int bszW = mk_plain_bszW(fw[0]);
-   return &fw[bszW - 1];
-}
-
-/* Map the address of a block's last word to the address of its first
-   word, using the size stored in the last word. */
-static __inline__
-WordF* last_to_first ( WordL* lw )
-{
-   Int bszW = mk_plain_bszW(lw[0]);
-   return lw - (bszW - 1);
-}
-
-
-/* Map the address of a block's first word to the address of the first
-   word of its payload: skip the three header words and the arena's
-   lower red zone. */
-static __inline__
-Word* first_to_payload ( Arena* a, WordF* fw )
-{
-   return fw + 3 + a->rz_szW;
-}
-
-/* Map the address of the first payload word of a block back to the
-   address of the block's first word: step back over the arena's lower
-   red zone and the three header words. */
-static __inline__
-Word* payload_to_first ( Arena* a, WordF* payload )
-{
-   return payload - (3 + a->rz_szW);
-}
-
-/* Store bszW (sign included) in the block's lower size field, which
-   is the first word of the block. */
-static __inline__
-void set_bszW_lo ( WordF* fw, Int bszW )
-{
-   *fw = bszW;
-}
-/* Fetch the block's lower size field (sign included) from the first
-   word of the block. */
-static __inline__
-Int get_bszW_lo ( WordF* fw )
-{
-   return *fw;
-}
-
-
-/* Store the previous-block link, which lives in word 1 of the
-   block. */
-static __inline__
-void set_prev_p  ( WordF* fw, Word* prev_p )
-{
-   *(fw + 1) = (Word)prev_p;
-}
-/* Store the next-block link, which lives in word 2 of the block. */
-static __inline__
-void set_next_p  ( WordF* fw, Word* next_p )
-{
-   *(fw + 2) = (Word)next_p;
-}
-/* Fetch the previous-block link from word 1 of the block. */
-static __inline__
-Word* get_prev_p  ( WordF* fw )
-{
-   return (Word*)(*(fw + 1));
-}
-/* Fetch the next-block link from word 2 of the block. */
-static __inline__
-Word* get_next_p  ( WordF* fw )
-{
-   return (Word*)(*(fw + 2));
-}
-
-
-/* Store bszW (sign included) in the block's upper size field, which
-   is the last word of the block.  The last word is located using the
-   lower size field, so set_bszW_lo must have been called first; the
-   assertion checks that the two sizes agree. */
-static __inline__
-void set_bszW_hi ( WordF* fw, Int bszW )
-{
-   WordL* lw = first_to_last(fw);
-   vg_assert(lw == fw + mk_plain_bszW(bszW) - 1);
-   *lw = bszW;
-}
-/* Fetch the block's upper size field (sign included) from the last
-   word of the block, which is located via the lower size field. */
-static __inline__
-Int get_bszW_hi ( WordF* fw )
-{
-   return first_to_last(fw)[0];
-}
-
-/* Get the size field of a block, given a pointer to its last word.
-   Note that, despite the name, the value returned is read from the
-   block's LOWER size field: the size in lw[0] is used only to find
-   the first word.  The two fields agree in a sane block. */
-static __inline__
-Int get_bszW_hi_from_last_word ( WordL* lw )
-{
-   return get_bszW_lo(last_to_first(lw));
-}
-
-
-/* Write word number rz_wordno of the block's lower red zone.  The
-   lower red zone starts right after the three header words.  The
-   arena argument is not used. */
-static __inline__
-void set_rz_lo_word ( Arena* a, WordF* fw, Int rz_wordno, Word w )
-{
-   WordF* rz_lo = fw + 3;
-   rz_lo[rz_wordno] = w;
-}
-/* Write word number rz_wordno of the block's upper red zone.  Upper
-   red-zone words are counted downwards from the word just below the
-   block's last word.  The arena argument is not used. */
-static __inline__
-void set_rz_hi_word ( Arena* a, WordF* fw, Int rz_wordno, Word w )
-{
-   WordL* rz_top = first_to_last(fw) - 1;
-   *(rz_top - rz_wordno) = w;
-}
-/* Read word number rz_wordno of the block's lower red zone, which
-   starts right after the three header words.  The arena argument is
-   not used. */
-static __inline__
-Word get_rz_lo_word ( Arena* a, WordF* fw, Int rz_wordno )
-{
-   WordF* rz_lo = fw + 3;
-   return rz_lo[rz_wordno];
-}
-/* Read word number rz_wordno of the block's upper red zone, counting
-   downwards from the word just below the block's last word.  The
-   arena argument is not used. */
-static __inline__
-Word get_rz_hi_word ( Arena* a, WordF* fw, Int rz_wordno )
-{
-   WordL* rz_top = first_to_last(fw) - 1;
-   return *(rz_top - rz_wordno);
-}
-
-
-/* Overhead, in words, at the low end of a block: the three header
-   words (size, prev link, next link) plus the arena's red zone.
-   Depends only on the arena. */
-static __inline__
-Int overhead_szW_lo ( Arena* a )
-{
-   return a->rz_szW + 3;
-}
-/* Overhead, in words, at the high end of a block: the arena's red
-   zone plus the trailing size word.  Depends only on the arena. */
-static __inline__
-Int overhead_szW_hi ( Arena* a )
-{
-   return a->rz_szW + 1;
-}
-/* Total per-block overhead, in words: low-end plus high-end
-   overhead.  Depends only on the arena. */
-static __inline__
-Int overhead_szW ( Arena* a )
-{
-   Int lo = overhead_szW_lo(a);
-   Int hi = overhead_szW_hi(a);
-   return lo + hi;
-}
-
-
-/* Convert a payload size in words to the size in words of the block
-   needed to hold it, by adding the arena's per-block overhead. */
-static __inline__
-Int pszW_to_bszW ( Arena* a, Int pszW )
-{
-   vg_assert(pszW >= 0);
-   return overhead_szW(a) + pszW;
-}
-/* Convert a (plain, unsigned) block size in words to the payload size
-   in words it provides, by removing the arena's per-block overhead.
-   Asserts that the block is at least as big as the overhead. */
-static __inline__
-Int bszW_to_pszW ( Arena* a, Int bszW )
-{
-   Int pszW;
-   pszW = bszW - overhead_szW(a);
-   vg_assert(pszW >= 0);
-   return pszW;
-}
-
-/*------------------------------------------------------------*/
-/*--- Functions for working with freelists.                ---*/
-/*------------------------------------------------------------*/
-
-/* Determination of which freelist a block lives on is based on the
-   payload size, not block size, in words. */
-
-/* Convert a payload size in words to a freelist number in the range
-   0 .. 15.  Sizes up to 12 words get (nearly) one list each; larger
-   sizes share progressively wider lists, and everything above 256
-   words goes on list 15. */
-
-static
-Int pszW_to_listNo ( Int pszW )
-{
-   /* Largest payload size, in words, accepted by lists 0 .. 14. */
-   static const Int upper[15]
-      = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 32, 64, 128, 256 };
-   Int lno;
-   vg_assert(pszW >= 0);
-   for (lno = 0; lno < 15; lno++) {
-      if (pszW <= upper[lno]) return lno;
-   }
-   return 15;
-}
-
-
-/* What are the minimum and maximum payload sizes for a given list? */
-
-/* Return the smallest payload size, in words, that pszW_to_listNo
-   maps to list listNo.  Found by probing sizes upwards from zero. */
-static
-Int listNo_to_pszW_min ( Int listNo )
-{
-   Int pszW = 0;
-   /* listNo must name a real list.  pszW_to_listNo never returns more
-      than 15, so probing for a list number beyond the last list would
-      never terminate (presumably VG_N_MALLOC_LISTS is 16 -- it is
-      defined elsewhere; TODO confirm).  Hence < and not <= here. */
-   vg_assert(listNo >= 0 && listNo < VG_N_MALLOC_LISTS);
-   while (pszW_to_listNo(pszW) < listNo) pszW++;
-   return pszW;
-}
-
-/* Return the largest payload size, in words, that belongs on list
-   listNo.  The last list is open-ended, so a very large sentinel is
-   returned for it; for any other list the answer is one less than
-   the minimum size of the following list. */
-static
-Int listNo_to_pszW_max ( Int listNo )
-{
-   /* listNo must name a real list: with listNo == VG_N_MALLOC_LISTS
-      the code below would ask for the minimum of list listNo+1, which
-      does not exist.  Hence < and not <= here. */
-   vg_assert(listNo >= 0 && listNo < VG_N_MALLOC_LISTS);
-   if (listNo == VG_N_MALLOC_LISTS-1) {
-      return 999999999;
-   } else {
-      return listNo_to_pszW_min(listNo+1) - 1;
-   }
-}
-
-
-/* Fragmentation-reduction hack.  Look up to 20 links forwards and 20
-   links backwards from the head of a->freelist[lno]; if any block
-   seen lies at a lower address than the current head, make the
-   lowest such block the new head.  The aim is to keep recycling the
-   same low-address blocks instead of wandering through the address
-   space.  Does nothing if the list is empty. */
-
-static 
-void swizzle ( Arena* a, Int lno )
-{
-   UInt* head;
-   UInt* lowest;
-   UInt* fwd;
-   UInt* bwd;
-   Int   steps;
-
-   head = a->freelist[lno];
-   if (head == NULL) return;
-
-   lowest = fwd = bwd = head;
-   for (steps = 20; steps > 0; steps--) {
-      /* The list is circular, so walking past the end is harmless. */
-      fwd = get_next_p(fwd);
-      bwd = get_prev_p(bwd);
-      if (fwd < lowest) lowest = fwd;
-      if (bwd < lowest) lowest = bwd;
-   }
-   if (lowest < head) {
-#     ifdef DEBUG_MALLOC
-      VG_(printf)("retreat by %d\n", 
-           ((Char*)head) - ((Char*)lowest));
-#     endif
-      a->freelist[lno] = lowest;
-   }
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Creating and deleting blocks.                        ---*/
-/*------------------------------------------------------------*/
-
-/* Turn the words b .. b+bszW-1 into a free block and insert it on
-   free list b_lno of arena a.  b_lno must be the list that the
-   block's payload size maps to.  The new block is linked in just
-   before the current list head (i.e. at the tail of the circular
-   list); if the list is empty the block becomes its only element. */
-
-static
-void mkFreeBlock ( Arena* a, Word* b, Int bszW, Int b_lno )
-{
-   Word* head;
-   Word* tail;
-   Int   pszW = bszW_to_pszW(a, bszW);
-   vg_assert(pszW >= 0);
-   vg_assert(b_lno == pszW_to_listNo(pszW));
-
-   /* Size fields, tagged not-in-use.  The lower field must be written
-      first, since set_bszW_hi uses it to find the last word. */
-   set_bszW_lo(b, mk_free_bszW(bszW));
-   set_bszW_hi(b, mk_free_bszW(bszW));
-
-   head = a->freelist[b_lno];
-   if (head != NULL) {
-      /* Splice b between the current tail and the head. */
-      tail = get_prev_p(head);
-      set_next_p(tail, b);
-      set_prev_p(head, b);
-      set_next_p(b, head);
-      set_prev_p(b, tail);
-   } else {
-      /* Empty list: b points at itself in both directions. */
-      set_prev_p(b, b);
-      set_next_p(b, b);
-      a->freelist[b_lno] = b;
-   }
-#  ifdef DEBUG_MALLOC
-   (void)blockSane(a,b);
-#  endif
-}
-
-
-/* Turn the words b .. b+bszW-1 into an in-use block: write both size
-   fields tagged in-use, clear the list links and, if the arena has
-   red-zone checking enabled, fill both red zones with values derived
-   from the block address. */
-static
-void mkInuseBlock ( Arena* a, UInt* b, UInt bszW )
-{
-   Int rz;
-   /* Lower size field first: set_bszW_hi relies on it. */
-   set_bszW_lo(b, mk_inuse_bszW(bszW));
-   set_bszW_hi(b, mk_inuse_bszW(bszW));
-   set_prev_p(b, NULL);
-   set_next_p(b, NULL);
-   if (a->rz_check) {
-      rz = 0;
-      while (rz < a->rz_szW) {
-         set_rz_lo_word(a, b, rz, (UInt)b ^ VG_REDZONE_LO_MASK);
-         set_rz_hi_word(a, b, rz, (UInt)b ^ VG_REDZONE_HI_MASK);
-         rz++;
-      }
-   }
-#  ifdef DEBUG_MALLOC
-   (void)blockSane(a,b);
-#  endif
-}
-
-
-/* Remove block b from free list listno of arena a and clear b's link
-   fields.  No check is made that b really is on that list.  When
-   other blocks remain, the list head is moved to b's predecessor and
-   then swizzle() gets a chance to pick a lower-addressed head. */
-static
-void unlinkBlock ( Arena* a, UInt* b, Int listno )
-{
-   UInt* b_prev;
-   UInt* b_next;
-   vg_assert(listno >= 0 && listno < VG_N_MALLOC_LISTS);
-   b_prev = get_prev_p(b);
-   if (b_prev != b) {
-      b_next = get_next_p(b);
-      a->freelist[listno] = b_prev;
-      set_next_p(b_prev, b_next);
-      set_prev_p(b_next, b_prev);
-      swizzle ( a, listno );
-   } else {
-      /* b is the only element, so the list becomes empty. */
-      vg_assert(get_next_p(b) == b);
-      a->freelist[listno] = NULL;
-   }
-   set_prev_p(b, NULL);
-   set_next_p(b, NULL);
-}
-
-
-/* Split the free block b, currently on list b_listno, in two.  The
-   first req_bszW words become an in-use block; the remainder (the
-   fragment, higher in memory) becomes a free block on whichever list
-   suits its payload size.  The fragment must be big enough to carry
-   the block overhead plus at least one payload word. */
-static
-void splitChunk ( Arena* a, UInt* b, Int b_listno, UInt req_bszW )
-{
-   Int b_bszW, frag_bszW, frag_lno;
-   b_bszW = mk_plain_bszW(get_bszW_lo(b));
-   vg_assert(req_bszW < b_bszW);
-   frag_bszW = b_bszW - req_bszW;
-   vg_assert(frag_bszW >= overhead_szW(a));
-   vg_assert(bszW_to_pszW(a, frag_bszW) > 0);
-   frag_lno = pszW_to_listNo(bszW_to_pszW(a, frag_bszW));
-
-   unlinkBlock(a, b, b_listno);
-   mkInuseBlock(a, b, req_bszW);
-   mkFreeBlock(a, b + req_bszW, frag_bszW, frag_lno);
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Sanity-check/debugging machinery.                    ---*/
-/*------------------------------------------------------------*/
-
-/* Crude sanity check on block b of arena a.  The two size fields must
-   agree and, for an in-use block in an arena with red-zone checking,
-   every red-zone word must still hold the value written by
-   mkInuseBlock.  Prints a message and returns False on the first
-   failure; returns True otherwise. */
-static 
-Bool blockSane ( Arena* a, Word* b )
-{
-#  define BLEAT(str) VG_(printf)("blockSane: fail -- %s\n",str)
-   Int i;
-   if (get_bszW_lo(b) != get_bszW_hi(b)) {
-      BLEAT("sizes");
-      return False;
-   }
-   /* Red zones are only filled in for in-use blocks. */
-   if (!a->rz_check || !is_inuse_bszW(get_bszW_lo(b)))
-      return True;
-   for (i = 0; i < a->rz_szW; i++) {
-      if (get_rz_lo_word(a, b, i) != ((Word)b ^ VG_REDZONE_LO_MASK)) {
-         BLEAT("redzone-lo");
-         return False;
-      }
-      if (get_rz_hi_word(a, b, i) != ((Word)b ^ VG_REDZONE_HI_MASK)) {
-         BLEAT("redzone-hi");
-         return False;
-      }
-   }
-   return True;
-#  undef BLEAT
-}
-
-
-/* Debugging aid: print every superblock of arena a and, for each one,
-   every block it contains (offset, size, in-use/free, sanity). */
-static 
-void ppSuperblocks ( Arena* a )
-{
-   Int         off, raw_bszW, sb_no;
-   UInt*       blk;
-   Superblock* sb;
-
-   sb_no = 1;
-   for (sb = a->sblocks; sb; sb = sb->next) {
-      VG_(printf)( "\n" );
-      VG_(printf)( "superblock %d at %p, sb->n_pl_ws = %d, next = %p\n", 
-                   sb_no++, sb, sb->n_payload_words, sb->next );
-      /* Hop from block to block using each block's own size. */
-      for (off = 0; off < sb->n_payload_words;
-                    off += mk_plain_bszW(raw_bszW)) {
-         blk      = &sb->payload_words[off];
-         raw_bszW = get_bszW_lo(blk);
-         VG_(printf)( "   block at %d, bszW %d: ", off, mk_plain_bszW(raw_bszW) );
-         VG_(printf)( "%s, ", is_inuse_bszW(raw_bszW) ? "inuse" : "free" );
-         VG_(printf)( "%s\n", blockSane(a,blk) ? "ok" : "BAD" );
-      }
-      if (off > sb->n_payload_words) 
-         VG_(printf)( "   last block overshoots end of SB\n");
-   }
-   VG_(printf)( "end of superblocks\n\n" );
-}
-
-
-/* Sanity check both the superblocks and the chains of arena aid.
-   Pass 1 walks every block of every superblock, checking each block
-   with blockSane, checking that no two adjacent blocks are both free,
-   and recomputing the number of bytes on loan.  Pass 2 walks every
-   free list, checking the back links and that each block is on the
-   list appropriate to its payload size.  Finally the number of free
-   blocks seen by the two passes must agree.  Any failure prints a
-   diagnostic and panics; on success a one-line summary is printed. */
-void VG_(mallocSanityCheckArena) ( ArenaId aid )
-{
-   Int         i, superblockctr, b_bszW, b_pszW, blockctr_sb, blockctr_li;
-   Int         blockctr_sb_free, listno, list_min_pszW, list_max_pszW;
-   Superblock* sb;
-   Bool        thisFree, lastWasFree;
-   Word*       b;
-   Word*       b_prev;
-   UInt        arena_bytes_on_loan;
-   Arena*      a;
-
-#  define BOMB VG_(panic)("vg_mallocSanityCheckArena")
-
-   a = arenaId_to_ArenaP(aid);
-   
-   /* First, traverse all the superblocks, inspecting the chunks in
-      each. */
-   superblockctr = blockctr_sb = blockctr_sb_free = 0;
-   arena_bytes_on_loan = 0;
-   sb = a->sblocks;
-   while (sb) {
-      /* Free blocks are never merged across superblocks, so the
-         adjacency check restarts for each superblock. */
-      lastWasFree = False;
-      superblockctr++;
-      /* i is the word offset of the current block in the payload. */
-      i = 0;
-      while (True) {
-         if (i >= sb->n_payload_words) break;
-         blockctr_sb++;
-         b     = &sb->payload_words[i];
-         b_bszW = get_bszW_lo(b);
-         if (!blockSane(a, b)) {
-            VG_(printf)( "mallocSanityCheck: sb %p, block %d (bszW %d): "
-                         "BAD\n",
-                         sb, i, b_bszW );
-            BOMB;
-         }
-         thisFree = !is_inuse_bszW(b_bszW);
-         /* Two neighbouring free blocks should have been merged. */
-         if (thisFree && lastWasFree) {
-            VG_(printf)( "mallocSanityCheck: sb %p, block %d (bszW %d): "
-                         "UNMERGED FREES\n",
-                         sb, i, b_bszW );
-            BOMB;
-         }
-         lastWasFree = thisFree;
-         if (thisFree) blockctr_sb_free++;
-         /* b_bszW is positive here, because the block is in use. */
-         if (!thisFree) 
-            arena_bytes_on_loan += sizeof(Word) * bszW_to_pszW(a, b_bszW);
-         /* Step to the next block. */
-         i += mk_plain_bszW(b_bszW);
-      }
-      /* The blocks must tile the superblock payload exactly. */
-      if (i > sb->n_payload_words) {
-         VG_(printf)( "mallocSanityCheck: sb %p: last block "
-                      "overshoots end\n", sb);
-         BOMB;
-      }
-      sb = sb->next;
-   }
-
-   /* The recomputed loan total must match the arena's running
-      total. */
-   if (arena_bytes_on_loan != a->bytes_on_loan) {
-            VG_(printf)( 
-                    "mallocSanityCheck: a->bytes_on_loan %d, "
-                    "arena_bytes_on_loan %d: "
-                    "MISMATCH\n", a->bytes_on_loan, arena_bytes_on_loan);
-      ppSuperblocks(a);
-      BOMB;
-   }
-
-   /* Second, traverse each list, checking that the back pointers make
-      sense, counting blocks encountered, and checking that each block
-      is an appropriate size for this list. */
-   blockctr_li = 0;
-   for (listno = 0; listno < VG_N_MALLOC_LISTS; listno++) {
-      list_min_pszW = listNo_to_pszW_min(listno);
-      list_max_pszW = listNo_to_pszW_max(listno);
-      b = a->freelist[listno];
-      if (b == NULL) continue;
-      /* The list is circular: start at the head, advance first, and
-         stop once the head has been reached (and checked) again. */
-      while (True) {
-         b_prev = b;
-         b = get_next_p(b);
-         if (get_prev_p(b) != b_prev) {
-            VG_(printf)( "mallocSanityCheck: list %d at %p: "
-                         "BAD LINKAGE\n", 
-                         listno, b );
-            BOMB;
-         }
-         b_pszW = bszW_to_pszW(a, mk_plain_bszW(get_bszW_lo(b)));
-         if (b_pszW < list_min_pszW || b_pszW > list_max_pszW) {
-            VG_(printf)( 
-               "mallocSanityCheck: list %d at %p: "
-               "WRONG CHAIN SIZE %d (%d, %d)\n", 
-               listno, b, b_pszW, list_min_pszW, list_max_pszW );
-            BOMB;
-         }
-         blockctr_li++;
-         if (b == a->freelist[listno]) break;
-      }
-   }
-
-   /* Every free block found in a superblock must be on exactly one
-      list, so the two counts have to agree. */
-   if (blockctr_sb_free != blockctr_li) {
-      VG_(printf)( 
-         "mallocSanityCheck: BLOCK COUNT MISMATCH "
-         "(via sbs %d, via lists %d)\n",
-         blockctr_sb_free, blockctr_li );
-      ppSuperblocks(a);
-      BOMB;
-   }
-
-   VG_(message)(Vg_DebugMsg,
-                "mSC [%s]: %2d sbs, %5d tot bs, %4d/%-4d free bs, "
-                "%2d lists, %7d mmap, %7d loan", 
-                a->name,
-                superblockctr,
-                blockctr_sb, blockctr_sb_free, blockctr_li, 
-                VG_N_MALLOC_LISTS, 
-                a->bytes_mmaped, a->bytes_on_loan);   
-#  undef BOMB
-}
-
-
-/* Run VG_(mallocSanityCheckArena) on every arena, in increasing order
-   of arena number. */
-void VG_(mallocSanityCheckAll) ( void )
-{
-   Int aid = 0;
-   while (aid < VG_N_ARENAS) {
-      VG_(mallocSanityCheckArena) ( aid );
-      aid++;
-   }
-}
-
-
-/* Find out whether an arena is empty, that is, currently has no bytes
-   on loan.  Useful when looking for memory leaks in valgrind itself
-   (not in the client).  Arguably this function belongs elsewhere.
-
-   An arena is empty iff every one of its superblocks consists of a
-   single free block spanning the whole payload area.
-*/
-Bool VG_(is_empty_arena) ( ArenaId aid )
-{
-   Arena*      a;
-   Superblock* sb;
-   WordF*      first;
-   Int         raw_bszW;
-   ensure_mm_init();
-   a  = arenaId_to_ArenaP(aid);
-   sb = a->sblocks;
-   while (sb != NULL) {
-      first    = &(sb->payload_words[0]);
-      raw_bszW = get_bszW_lo(first);
-      /* Either an in-use first block, or a free one which does not
-         cover the whole superblock, means something is on loan. */
-      if (is_inuse_bszW(raw_bszW)
-          || mk_plain_bszW(raw_bszW) != sb->n_payload_words)
-         return False;
-      sb = sb->next;
-   }
-   return True;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Externally-visible functions.                        ---*/
-/*------------------------------------------------------------*/
-
-/* Allocate at least req_pszB bytes from arena aid and return a
-   pointer to the payload.  req_pszB must be in the range
-   0 .. 0x7FFFFFEF.  The free lists are searched, starting with the
-   one matching the request size, for the first block that is big
-   enough; if none is found a new superblock is obtained from
-   newSuperblock.  The chosen block is split if the leftover would
-   hold at least one payload word.  Failure to get a superblock
-   trips an assertion rather than returning NULL. */
-void* VG_(malloc) ( ArenaId aid, Int req_pszB )
-{
-   Int         req_pszW, req_bszW, frag_bszW, b_bszW, lno;
-   Superblock* new_sb;
-   Word*       b;
-   Arena*      a;
-
-   VGP_PUSHCC(VgpMalloc);
-
-   ensure_mm_init();
-   a = arenaId_to_ArenaP(aid);
-
-   vg_assert(req_pszB >= 0);
-   vg_assert(req_pszB < 0x7FFFFFF0);
-
-   /* Round the request up to a whole number of words. */
-   req_pszW = (req_pszB + VKI_BYTES_PER_WORD - 1) / VKI_BYTES_PER_WORD;
-
-   /* Keep gcc -O happy: */
-   b = NULL;
-
-   /* Start searching at this list. */
-   lno = pszW_to_listNo(req_pszW);
-
-   /* This loop finds a list which has a block big enough, or leaves
-      lno == VG_N_MALLOC_LISTS if no such block exists. */
-   while (True) {
-      if (lno == VG_N_MALLOC_LISTS) break;
-      /* If this list is empty, try the next one. */
-      if (a->freelist[lno] == NULL) {
-         lno++;
-         continue;
-      }
-      /* Scan a->freelist[lno] to find a big-enough chunk.  The list
-         is circular, so stop on getting back to the head. */
-      b = a->freelist[lno];
-      b_bszW = mk_plain_bszW(get_bszW_lo(b));
-      while (True) {
-         if (bszW_to_pszW(a, b_bszW) >= req_pszW) break;
-         b = get_next_p(b);
-         b_bszW = mk_plain_bszW(get_bszW_lo(b));
-         if (b == a->freelist[lno]) break;
-      }
-      /* Did the scan stop because it found something suitable? */
-      if (bszW_to_pszW(a, b_bszW) >= req_pszW) break;
-      /* No luck?  Try a larger list. */
-      lno++;
-   }
-
-   /* Either lno < VG_N_MALLOC_LISTS and b points to the selected
-      block, or lno == VG_N_MALLOC_LISTS, and we have to allocate a
-      new superblock. */
-
-   if (lno == VG_N_MALLOC_LISTS) {
-      req_bszW = pszW_to_bszW(a, req_pszW);      
-      new_sb = newSuperblock(a, req_bszW);
-      vg_assert(new_sb != NULL);
-      /* Push the new superblock on the front of the arena's list and
-         make its whole payload area into one free block. */
-      new_sb->next = a->sblocks;
-      a->sblocks = new_sb;
-      b = &(new_sb->payload_words[0]);
-      lno = pszW_to_listNo(bszW_to_pszW(a, new_sb->n_payload_words));
-      mkFreeBlock ( a, b, new_sb->n_payload_words, lno);
-   }
-
-   /* Ok, we can allocate from b, which lives in list lno. */
-   vg_assert(b != NULL);
-   vg_assert(lno >= 0 && lno < VG_N_MALLOC_LISTS);
-   vg_assert(a->freelist[lno] != NULL);
-   b_bszW = mk_plain_bszW(get_bszW_lo(b));
-   req_bszW = pszW_to_bszW(a, req_pszW);
-   /* req_bszW is the size of the block we are after.  b_bszW is the
-      size of what we've actually got. */
-   vg_assert(b_bszW >= req_bszW);
-
-   /* Could we split this block and still get a useful fragment?
-      Where "useful" means that the payload size of the frag is at
-      least one word.  */
-   frag_bszW = b_bszW - req_bszW;
-   if (frag_bszW > overhead_szW(a)) {
-      splitChunk(a, b, lno, req_bszW);
-   } else {
-      /* No, mark as in use and use as-is. */
-      unlinkBlock(a, b, lno);
-      /*
-      set_bszW_lo(b, mk_inuse_bszW(b_bszW));
-      set_bszW_hi(b, mk_inuse_bszW(b_bszW));
-      */
-      mkInuseBlock(a, b, b_bszW);
-   }
-   vg_assert(req_bszW <= mk_plain_bszW(get_bszW_lo(b)));
-
-   /* Account for the payload actually handed out, which may exceed
-      the request if the block was not split. */
-   a->bytes_on_loan 
-      += sizeof(Word) 
-         * bszW_to_pszW(a, mk_plain_bszW(get_bszW_lo(b)));
-   if (a->bytes_on_loan > a->bytes_on_loan_max)
-      a->bytes_on_loan_max = a->bytes_on_loan;
-
-#  ifdef DEBUG_MALLOC
-   VG_(mallocSanityCheckArena)(aid);
-#  endif
-
-   VGP_POPCC;
-   return first_to_payload(a, b);
-}
-
- 
-/* Return the block whose payload starts at ptr to arena aid.  Freeing
-   NULL is a no-op.  The block is put back on the appropriate free
-   list and then merged with its successor and/or predecessor in the
-   same superblock if they are free too.  ptr must have come from this
-   arena; findSb panics otherwise. */
-void VG_(free) ( ArenaId aid, void* ptr )
-{
-   Superblock* sb;
-   UInt*       sb_payl_firstw;
-   UInt*       sb_payl_lastw;
-   UInt*       other;
-   UInt*       ch;
-   Int         ch_bszW, ch_pszW, other_bszW, ch_listno;
-   Arena*      a;
-
-   VGP_PUSHCC(VgpMalloc);
-
-   ensure_mm_init();
-   a = arenaId_to_ArenaP(aid);
-
-   if (ptr == NULL) {
-      /* Balance the VGP_PUSHCC above before leaving early. */
-      VGP_POPCC;
-      return;
-   }
-
-   ch = payload_to_first(a, ptr);
-
-#  ifdef DEBUG_MALLOC
-   vg_assert(blockSane(a,ch));
-#  endif
-
-   a->bytes_on_loan 
-      -= sizeof(Word) 
-         * bszW_to_pszW(a, mk_plain_bszW(get_bszW_lo(ch)));
-
-   /* Find the limits of the enclosing superblock, so that merging
-      never looks outside it. */
-   sb             = findSb( a, ch );
-   sb_payl_firstw = &(sb->payload_words[0]);
-   sb_payl_lastw  = &(sb->payload_words[sb->n_payload_words-1]);
-
-   /* Put this chunk back on a list somewhere. */
-   ch_bszW    = get_bszW_lo(ch);
-   ch_pszW    = bszW_to_pszW(a, ch_bszW);
-   ch_listno  = pszW_to_listNo(ch_pszW);
-   mkFreeBlock( a, ch, ch_bszW, ch_listno );
-
-   /* See if this block can be merged with the following one. */
-   other = ch + ch_bszW;
-   /* overhead_szW(a) is the smallest possible bszW for this arena.
-      So the nearest possible end to the block beginning at other is
-      other+overhead_szW(a)-1.  Hence the test below. */
-   if (other+overhead_szW(a)-1 <= sb_payl_lastw) {
-      other_bszW = get_bszW_lo(other);
-      if (!is_inuse_bszW(other_bszW)) {
-         /* VG_(printf)( "merge-successor\n"); */
-         other_bszW = mk_plain_bszW(other_bszW);
-#        ifdef DEBUG_MALLOC
-         vg_assert(blockSane(a, other));
-#        endif
-         /* Take both blocks off their lists and re-insert the
-            combined block, which starts at ch. */
-         unlinkBlock( a, ch, ch_listno );
-         unlinkBlock( a, other, pszW_to_listNo(bszW_to_pszW(a,other_bszW)) );
-         ch_bszW += other_bszW; 
-         ch_listno = pszW_to_listNo(bszW_to_pszW(a, ch_bszW));
-         mkFreeBlock( a, ch, ch_bszW, ch_listno );
-      }
-   }
-
-   /* See if this block can be merged with its predecessor. */
-   if (ch-overhead_szW(a) >= sb_payl_firstw) {
-      /* ch-1 is the last word of the preceding block. */
-      other_bszW = get_bszW_hi_from_last_word( ch-1 );
-      if (!is_inuse_bszW(other_bszW)) {
-         /* VG_(printf)( "merge-predecessor\n"); */
-         other = last_to_first( ch-1 );
-         other_bszW = mk_plain_bszW(other_bszW);         
-         /* Take both blocks off their lists and re-insert the
-            combined block, which starts at the predecessor. */
-         unlinkBlock( a, ch, ch_listno );
-         unlinkBlock( a, other, pszW_to_listNo(bszW_to_pszW(a, other_bszW)) );
-         ch = other;
-         ch_bszW += other_bszW;
-         ch_listno = pszW_to_listNo(bszW_to_pszW(a, ch_bszW));
-         mkFreeBlock( a, ch, ch_bszW, ch_listno );
-      }
-   }
-
-#  ifdef DEBUG_MALLOC
-   VG_(mallocSanityCheckArena)(aid);
-#  endif
-
-   VGP_POPCC;
-}
-
-
-/*
-   The idea for malloc_aligned() is to allocate a big block, base, and
-   then split it into two parts: frag, which is returned to the
-   free pool, and align, which is the bit we're really after.  Here's
-   a picture.  L and H denote the block lower and upper overheads, in
-   words.  The details are gruesome.  Note it is slightly complicated
-   because the initial request to generate base may return a bigger
-   block than we asked for, so it is important to distinguish the base
-   request size and the base actual size.
-
-   frag_b                   align_b
-   |                        |
-   |    frag_p              |    align_p
-   |    |                   |    |
-   v    v                   v    v
-
-   +---+                +---+---+               +---+
-   | L |----------------| H | L |---------------| H |
-   +---+                +---+---+               +---+
-
-   ^    ^                        ^
-   |    |                        :
-   |    base_p                   this addr must be aligned
-   |
-   base_b
-
-   .    .               .   .   .               .   .
-   <------ frag_bszW ------->   .               .   .
-   .    <------------- base_pszW_act ----------->   .
-   .    .               .   .   .               .   .
-
-*/
-/* Allocate at least req_pszB bytes from arena aid, with the payload
-   aligned to req_alignB bytes.  req_alignB must be a power of 2 in
-   the range 4 .. 1048576; anything else causes a panic.  req_pszB
-   must be in the range 0 .. 0x7FFFFFEF.  Works by allocating an
-   oversized block with VG_(malloc) and cutting it into a free
-   fragment followed by the suitably aligned in-use block. */
-void* VG_(malloc_aligned) ( ArenaId aid, Int req_alignB, Int req_pszB )
-{
-   Int    req_alignW, req_pszW, base_pszW_req, base_pszW_act, frag_bszW;
-   Word   *base_b, *base_p, *align_p;
-   UInt   saved_bytes_on_loan;
-   Arena* a;
-
-   ensure_mm_init();
-   a = arenaId_to_ArenaP(aid);
-
-   vg_assert(req_pszB >= 0);
-   vg_assert(req_pszB < 0x7FFFFFF0);
-
-   /* Check that the requested alignment seems reasonable; that is, is
-      a power of 2 between 4 and 1048576.  Larger alignments are
-      deliberately not supported.  The arguments are printed in the
-      same order as in the parameter list. */
-   if (req_alignB < 4 || req_alignB > 1048576
-       || (req_alignB & (req_alignB - 1)) != 0) {
-      VG_(printf)("vg_malloc_aligned(%p, %d, %d)\nbad alignment request", 
-                  a, req_alignB, req_pszB );
-      VG_(panic)("vg_malloc_aligned");
-      /*NOTREACHED*/
-   }
-
-   /* Required alignment, in words.  Since it's constrained to be a
-      power of 2 >= word size, no need to align the alignment.  Still,
-      we check. */
-   req_alignW = req_alignB / VKI_BYTES_PER_WORD;
-   vg_assert(req_alignB == req_alignW * VKI_BYTES_PER_WORD);
-
-   /* Required payload size for the aligned chunk. */
-   req_pszW = (req_pszB + VKI_BYTES_PER_WORD - 1) / VKI_BYTES_PER_WORD;
-   
-   /* Payload size to request for the big block that we will split
-      up. */
-   base_pszW_req = req_pszW + overhead_szW(a) + req_alignW;
-
-   /* Payload ptr for the block we are going to split.  Note this
-      changes a->bytes_on_loan; we save and restore it ourselves. */
-   saved_bytes_on_loan = a->bytes_on_loan;
-   base_p = VG_(malloc) ( aid, base_pszW_req * VKI_BYTES_PER_WORD );
-   a->bytes_on_loan = saved_bytes_on_loan;
-
-   /* Block ptr for the block we are going to split. */
-   base_b = payload_to_first ( a, base_p );
-
-   /* Pointer to the payload of the aligned block we are going to
-      return.  This has to be suitably aligned.  The search starts far
-      enough along to leave room for a fragment with no payload (lower
-      plus upper overhead) and for the aligned block's own lower
-      overhead. */
-   align_p = align_upwards ( base_b + 2 * overhead_szW_lo(a) 
-                                    + overhead_szW_hi(a),
-                             req_alignB );
-
-   /* The block size of the fragment we will create.  This must be big
-      enough to actually create a fragment. */
-   frag_bszW = align_p - overhead_szW_lo(a) - base_b;
-   vg_assert(frag_bszW >= overhead_szW(a));
-
-   /* The actual payload size of the block we are going to split.
-      This must be read before mkFreeBlock overwrites the size field
-      at base_b. */
-   base_pszW_act = bszW_to_pszW(a, mk_plain_bszW(get_bszW_lo(base_b)));
-
-   /* Create the fragment block, and put it back on the relevant free
-      list. */
-   mkFreeBlock ( a, base_b, frag_bszW, 
-                 pszW_to_listNo(bszW_to_pszW(a, frag_bszW)) );
-
-   /* Create the aligned block.  It runs from its first word up to
-      the end of the original block. */
-   mkInuseBlock ( a,
-                  align_p - overhead_szW_lo(a), 
-                  base_p + base_pszW_act 
-                         + overhead_szW_hi(a) 
-                         - (align_p - overhead_szW_lo(a)) );
-
-   /* Final sanity checks. */
-   vg_assert(( (UInt)align_p % req_alignB) == 0);
-
-   vg_assert(is_inuse_bszW(get_bszW_lo(payload_to_first(a, align_p))));
-
-   vg_assert(req_pszW 
-             <= 
-             bszW_to_pszW(a, mk_plain_bszW(get_bszW_lo(
-                payload_to_first(a, align_p))))
-            );
-
-   /* Only the aligned block counts as being on loan. */
-   a->bytes_on_loan 
-      += sizeof(Word)
-         * bszW_to_pszW(a, mk_plain_bszW(get_bszW_lo(
-              payload_to_first(a, align_p))));
-   if (a->bytes_on_loan > a->bytes_on_loan_max)
-      a->bytes_on_loan_max = a->bytes_on_loan;
-
-#  ifdef DEBUG_MALLOC
-   VG_(mallocSanityCheckArena)(aid);
-#  endif
-
-   return align_p;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Services layered on top of malloc/free.              ---*/
-/*------------------------------------------------------------*/
-
-/* Allocate zero-filled space for nmemb objects of nbytes bytes each
-   from arena aid.  Both counts must be non-negative and their product
-   must be representable as an Int; VG_(malloc) then applies its own
-   upper limit to the total. */
-void* VG_(calloc) ( ArenaId aid, Int nmemb, Int nbytes )
-{
-   Int    i, size;
-   UChar* p;
-   vg_assert(nmemb >= 0 && nbytes >= 0);
-   /* Reject products that would overflow: a wrapped-around product
-      could be small and positive, giving an undersized block. */
-   vg_assert(nbytes == 0 || nmemb <= 0x7FFFFFFF / nbytes);
-   size = nmemb * nbytes;
-   vg_assert(size >= 0);
-   p = VG_(malloc) ( aid, size );
-   for (i = 0; i < size; i++) p[i] = 0;
-   return p;
-}
-
-
-/* Resize the block whose payload starts at ptr, in arena aid, so that
-   it can hold at least req_pszB bytes.  A NULL ptr is treated as a
-   plain allocation request.  If the existing payload is already big
-   enough, ptr itself is returned and nothing changes (blocks are
-   never shrunk).  Otherwise a new block is allocated, the old payload
-   is copied across, the old block is freed and the new payload
-   pointer is returned. */
-void* VG_(realloc) ( ArenaId aid, void* ptr, Int req_pszB )
-{
-   Arena* a;
-   Int    old_bszW, old_pszW, old_pszB, i;
-   UChar  *p_old, *p_new;
-   UInt*  ch;
-
-   ensure_mm_init();
-   a = arenaId_to_ArenaP(aid);
-
-   vg_assert(req_pszB >= 0);
-   vg_assert(req_pszB < 0x7FFFFFF0);
-
-   /* There is no old block to inspect or copy from. */
-   if (ptr == NULL)
-      return VG_(malloc) ( aid, req_pszB );
-
-   ch = payload_to_first(a, ptr);
-   vg_assert(blockSane(a, ch));
-
-   old_bszW = get_bszW_lo(ch);
-   vg_assert(is_inuse_bszW(old_bszW));
-   old_bszW = mk_plain_bszW(old_bszW);
-   old_pszW = bszW_to_pszW(a, old_bszW);
-   old_pszB = old_pszW * VKI_BYTES_PER_WORD;
-
-   if (req_pszB <= old_pszB) return ptr;
-
-   p_new = VG_(malloc) ( aid, req_pszB );
-   p_old = (UChar*)ptr;
-   /* The new payload is strictly bigger, so the whole of the old one
-      fits. */
-   for (i = 0; i < old_pszB; i++)
-      p_new[i] = p_old[i];
-
-   VG_(free)(aid, p_old);
-   return p_new;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- The original test driver machinery.                  ---*/
-/*------------------------------------------------------------*/
-
-/* NOTE: everything down to the matching #endif is compiled out.  It
-   is a standalone random stress test: slots of test_arr are randomly
-   allocated or freed, with a full sanity check every 50000
-   transactions.  It calls vg_malloc, vg_free, vg_memalign,
-   mallocSanityCheck and show_arena_stats; these are presumably older
-   names for the VG_(...) functions above -- TODO confirm before
-   trying to re-enable it. */
-#if 0
-
-/* Test size: the first set of parameters is the one selected. */
-#if 1
-#define N_TEST_TRANSACTIONS 100000000
-#define N_TEST_ARR 200000
-#define M_TEST_MALLOC 1000
-#else
-#define N_TEST_TRANSACTIONS 500000
-#define N_TEST_ARR 30000
-#define M_TEST_MALLOC 500
-#endif
-
-
-/* One slot per potentially-live allocation; NULL means unused. */
-void* test_arr[N_TEST_ARR];
-
-int main ( int argc, char** argv )
-{
-   Int i, j, k, nbytes, qq;
-   unsigned char* chp;
-   Arena* a = &arena[VG_AR_PRIVATE];
-   /* Fixed seed, so that runs are repeatable. */
-   srandom(1);
-   for (i = 0; i < N_TEST_ARR; i++)
-      test_arr[i] = NULL;
-
-   for (i = 0; i < N_TEST_TRANSACTIONS; i++) {
-      if (i % 50000 == 0) mallocSanityCheck(a);
-      /* Pick a slot: free it if occupied, else allocate into it. */
-      j = random() % N_TEST_ARR;
-      if (test_arr[j]) {
-         vg_free(a, test_arr[j]);
-         test_arr[j] = NULL;
-      } else {
-         nbytes = 1 + random() % M_TEST_MALLOC;
-         /* Occasionally ask for a much bigger block, or for zero
-            bytes. */
-         qq = random()%64;
-         if (qq == 32) 
-            nbytes *= 17;
-         else if (qq == 33)
-            nbytes = 0;
-         /* Every 17th transaction uses the aligned allocator, with a
-            random power-of-2 alignment from 8 to 4096. */
-         test_arr[j] 
-           = (i % 17) == 0
-                ? vg_memalign(a, nbytes, 1<< (3+(random()%10)))
-                : vg_malloc( a, nbytes );
-         /* Write to every byte of the new payload. */
-         chp = test_arr[j];
-         for (k = 0; k < nbytes; k++) 
-            chp[k] = (unsigned char)(k + 99);
-      }
-   }
-
-
-   /* Free whatever is still allocated, then do a final check. */
-   for (i = 0; i < N_TEST_ARR; i++) {
-      if (test_arr[i]) {
-         vg_free(a, test_arr[i]);
-         test_arr[i] = NULL;
-      }
-   }
-   mallocSanityCheck(a);
-
-   fprintf(stderr, "ALL DONE\n");
-
-   show_arena_stats(a);
-   fprintf(stderr, "%d max useful, %d bytes mmap'd (%4.1f%%), %d useful\n",
-           a->bytes_on_loan_max, 
-           a->bytes_mmaped, 
-	   100.0 * (double)a->bytes_on_loan_max / (double)a->bytes_mmaped,
-           a->bytes_on_loan );
-
-   return 0;
-}
-#endif /* 0 */
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                             vg_malloc2.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_memory.c b/coregrind/vg_memory.c
deleted file mode 100644
index eea79cbe9a..0000000000
--- a/coregrind/vg_memory.c
+++ /dev/null
@@ -1,2414 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Maintain bitmaps of memory, tracking the accessibility (A)   ---*/
-/*--- and validity (V) status of each byte.                        ---*/
-/*---                                                  vg_memory.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-/* Define to debug the mem audit system. */
-/* #define VG_DEBUG_MEMORY */
-
-/* Define to debug the memory-leak-detector. */
-/* #define VG_DEBUG_LEAKCHECK */
-
-/* Define to collect detailed performance info. */
-/* #define VG_PROFILE_MEMORY */
-
-
-/*------------------------------------------------------------*/
-/*--- Low-level support for memory checking.               ---*/
-/*------------------------------------------------------------*/
-
-/* 
-   All reads and writes are checked against a memory map, which
-   records the state of all memory in the process.  The memory map is
-   organised like this:
-
-   The top 16 bits of an address are used to index into a top-level
-   map table, containing 65536 entries.  Each entry is a pointer to a
-   second-level map, which records the accessibility and validity
-   permissions for the 65536 bytes indexed by the lower 16 bits of the
-   address.  Each byte is represented by nine bits, one indicating
-   accessibility, the other eight validity.  So each second-level map
-   contains 73728 bytes.  This two-level arrangement conveniently
-   divides the 4G address space into 64k lumps, each size 64k bytes.
-
-   All entries in the primary (top-level) map must point to a valid
-   secondary (second-level) map.  Since most of the 4G of address
-   space will not be in use -- ie, not mapped at all -- there is a
-   distinguished secondary map, which indicates `not addressable and
-   not valid' for all bytes.  Entries in the primary map for
-   which the entire 64k is not in use at all point at this
-   distinguished map.
-
-   [...] lots of stuff deleted due to out of date-ness
-
-   As a final optimisation, the alignment and address checks for
-   4-byte loads and stores are combined in a neat way.  The primary
-   map is extended to have 262144 entries (2^18), rather than 2^16.
-   The top 3/4 of these entries are permanently set to the
-   distinguished secondary map.  For a 4-byte load/store, the
-   top-level map is indexed not with (addr >> 16) but instead f(addr),
-   where
-
-    f( XXXX XXXX XXXX XXXX ____ ____ ____ __YZ )
-        = ____ ____ ____ __YZ XXXX XXXX XXXX XXXX  or 
-        = ____ ____ ____ __ZY XXXX XXXX XXXX XXXX
-
-   ie the lowest two bits are placed above the 16 high address bits.
-   If either of these two bits are nonzero, the address is misaligned;
-   this will select a secondary map from the upper 3/4 of the primary
-   map.  Because this is always the distinguished secondary map, a
-   (bogus) address check failure will result.  The failure handling
-   code can then figure out whether this is a genuine addr check
-   failure or whether it is a possibly-legitimate access at a
-   misaligned address.  
-*/
-
-
-/*------------------------------------------------------------*/
-/*--- Crude profiling machinery.                           ---*/
-/*------------------------------------------------------------*/
-
-#ifdef VG_PROFILE_MEMORY
-
-/* Number of profiling event counters; valid event numbers are
-   0 .. N_PROF_EVENTS-1. */
-#define N_PROF_EVENTS 150
-
-/* event_ctr[ev] is the number of times PROF_EVENT(ev) has run. */
-static UInt event_ctr[N_PROF_EVENTS];
-
-/* Reset every event counter to zero. */
-static void init_prof_mem ( void )
-{
-   Int i;
-   for (i = 0; i < N_PROF_EVENTS; i++)
-      event_ctr[i] = 0;
-}
-
-/* Print every nonzero event counter, emitting a blank line ahead of
-   each group of ten event numbers. */
-void VG_(done_prof_mem) ( void )
-{
-   Int i;
-   for (i = 0; i < N_PROF_EVENTS; i++) {
-      if ((i % 10) == 0) 
-         VG_(printf)("\n");
-      if (event_ctr[i] > 0)
-         VG_(printf)( "prof mem event %2d: %d\n", i, event_ctr[i] );
-   }
-   VG_(printf)("\n");
-}
-
-/* Count one occurrence of event `ev', asserting that it is in range.
-   The definition deliberately carries no trailing semicolon: the
-   caller's own `;' completes the statement, so `PROF_EVENT(n);' is
-   exactly one statement and is safe as the body of an if/else. */
-#define PROF_EVENT(ev)                                  \
-   do { vg_assert((ev) >= 0 && (ev) < N_PROF_EVENTS);   \
-        event_ctr[(ev)]++;                              \
-   } while (0)
-
-#else
-
-/* Profiling disabled: the entry points exist but do nothing, and
-   PROF_EVENT expands to nothing. */
-static void init_prof_mem ( void ) { }
-       void VG_(done_prof_mem) ( void ) { }
-
-#define PROF_EVENT(ev) /* */
-
-#endif
-
-/* Event index.  If just the name of the fn is given, this means the
-   number of calls to the fn.  Otherwise it is the specified event.
-
-   10   alloc_secondary_map
-
-   20   get_abit
-   21   get_vbyte
-   22   set_abit
-   23   set_vbyte
-   24   get_abits4_ALIGNED
-   25   get_vbytes4_ALIGNED
-
-   30   set_address_range_perms
-   31   set_address_range_perms(lower byte loop)
-   32   set_address_range_perms(quadword loop)
-   33   set_address_range_perms(upper byte loop)
-   
-   35   make_noaccess
-   36   make_writable
-   37   make_readable
-   38   make_readwritable
-
-   40   copy_address_range_perms
-   41   copy_address_range_perms(byte loop)
-   42   check_writable
-   43   check_writable(byte loop)
-   44   check_readable
-   45   check_readable(byte loop)
-   46   check_readable_asciiz
-   47   check_readable_asciiz(byte loop)
-
-   50   make_aligned_word_NOACCESS
-   51   make_aligned_word_WRITABLE
-
-   60   helperc_LOADV4
-   61   helperc_STOREV4
-   62   helperc_LOADV2
-   63   helperc_STOREV2
-   64   helperc_LOADV1
-   65   helperc_STOREV1
-
-   70   vgm_rd_V4_SLOWLY
-   71   vgm_wr_V4_SLOWLY
-   72   vgm_rd_V2_SLOWLY
-   73   vgm_wr_V2_SLOWLY
-   74   vgm_rd_V1_SLOWLY
-   75   vgm_wr_V1_SLOWLY
-
-   80   fpu_read
-   81   fpu_read aligned 4
-   82   fpu_read aligned 8
-   83   fpu_read 2
-   84   fpu_read 10
-
-   85   fpu_write
-   86   fpu_write aligned 4
-   87   fpu_write aligned 8
-   88   fpu_write 2
-   89   fpu_write 10
-
-   90   fpu_read_check_SLOWLY
-   91   fpu_read_check_SLOWLY(byte loop)
-   92   fpu_write_check_SLOWLY
-   93   fpu_write_check_SLOWLY(byte loop)
-
-   100  is_plausible_stack_addr
-   101  handle_esp_assignment
-   102  handle_esp_assignment(-4)
-   103  handle_esp_assignment(+4)
-   104  handle_esp_assignment(-12)
-   105  handle_esp_assignment(-8)
-   106  handle_esp_assignment(+16)
-   107  handle_esp_assignment(+12)
-   108  handle_esp_assignment(0)
-   109  handle_esp_assignment(+8)
-   110  handle_esp_assignment(-16)
-   111  handle_esp_assignment(+20)
-   112  handle_esp_assignment(-20)
-   113  handle_esp_assignment(+24)
-   114  handle_esp_assignment(-24)
-
-   120  vg_handle_esp_assignment_SLOWLY
-   121  vg_handle_esp_assignment_SLOWLY(normal; move down)
-   122  vg_handle_esp_assignment_SLOWLY(normal; move up)
-   123  vg_handle_esp_assignment_SLOWLY(normal)
-   124  vg_handle_esp_assignment_SLOWLY(>= HUGE_DELTA)
-*/
-
-/*------------------------------------------------------------*/
-/*--- Function declarations.                               ---*/
-/*------------------------------------------------------------*/
-
-/* Set permissions for an address range.  Not speed-critical. */
-void VGM_(make_noaccess) ( Addr a, UInt len );
-void VGM_(make_writable) ( Addr a, UInt len );
-void VGM_(make_readable) ( Addr a, UInt len );
-
-/* Check permissions for an address range.  Not speed-critical. */
-Bool VGM_(check_writable) ( Addr a, UInt len, Addr* bad_addr );
-Bool VGM_(check_readable) ( Addr a, UInt len, Addr* bad_addr );
-Bool VGM_(check_readable_asciiz) ( Addr a, Addr* bad_addr );
-
-static UInt vgm_rd_V4_SLOWLY ( Addr a );
-static UInt vgm_rd_V2_SLOWLY ( Addr a );
-static UInt vgm_rd_V1_SLOWLY ( Addr a );
-static void vgm_wr_V4_SLOWLY ( Addr a, UInt vbytes );
-static void vgm_wr_V2_SLOWLY ( Addr a, UInt vbytes );
-static void vgm_wr_V1_SLOWLY ( Addr a, UInt vbytes );
-static void fpu_read_check_SLOWLY ( Addr addr, Int size );
-static void fpu_write_check_SLOWLY ( Addr addr, Int size );
-
-
-/*------------------------------------------------------------*/
-/*--- Data defns.                                          ---*/
-/*------------------------------------------------------------*/
-
-/* A secondary map: the A (addressibility) and V (validity) state for
-   one 64k-byte chunk of the address space. */
-typedef 
-   struct {
-      /* One A bit per covered byte: 65536 bits packed into 8192
-         bytes. */
-      UChar abits[8192];
-      /* One V byte per covered byte. */
-      UChar vbyte[65536];
-   }
-   SecMap;
-
-/* These two are statically allocated.  Should they be non-public? */
-/* The primary map has 2^18 entries rather than 2^16; the 4-byte and
-   2-byte access helpers index it with the low address bit(s) placed
-   above the 16 high address bits. */
-SecMap* VG_(primary_map)[ /*65536*/ 262144 ];
-/* The single shared secondary map used for chunks that are not in
-   use at all. */
-static SecMap  vg_distinguished_secondary_map;
-
-/* True iff smap points at the distinguished secondary map. */
-#define IS_DISTINGUISHED_SM(smap) \
-   ((smap) == &vg_distinguished_secondary_map)
-
-/* Make sure the 64k chunk containing addr has its own secondary map:
-   if its primary-map entry still points at the distinguished map,
-   replace it with a freshly allocated one.  `caller' is handed on to
-   alloc_secondary_map.  Note that addr is evaluated more than once,
-   so it must be free of side effects. */
-#define ENSURE_MAPPABLE(addr,caller)                                   \
-   do {                                                                \
-      if (IS_DISTINGUISHED_SM(VG_(primary_map)[(addr) >> 16])) {       \
-         VG_(primary_map)[(addr) >> 16] = alloc_secondary_map(caller); \
-         /* VG_(printf)("new 2map because of %p\n", addr);   */       \
-      }                                                                \
-   } while(0)
-
-/* Bit-array helpers.  aaa_p is the base address of a byte array used
-   as a packed bit vector and iii_p is a bit index; bit i lives in
-   byte (i >> 3) at bit position (i & 7).  Both arguments are fully
-   parenthesised before being cast, so arbitrary expressions may be
-   passed. */
-
-/* Set bit iii_p of the bit array at aaa_p. */
-#define BITARR_SET(aaa_p,iii_p)                         \
-   do {                                                 \
-      UInt   iii = (UInt)(iii_p);                       \
-      UChar* aaa = (UChar*)(aaa_p);                     \
-      aaa[iii >> 3] |= (1 << (iii & 7));                \
-   } while (0)
-
-/* Clear bit iii_p of the bit array at aaa_p. */
-#define BITARR_CLEAR(aaa_p,iii_p)                       \
-   do {                                                 \
-      UInt   iii = (UInt)(iii_p);                       \
-      UChar* aaa = (UChar*)(aaa_p);                     \
-      aaa[iii >> 3] &= ~(1 << (iii & 7));               \
-   } while (0)
-
-/* Nonzero iff bit iii_p of the bit array at aaa_p is set.  This is
-   an expression, and iii_p is evaluated twice.  The final line has
-   no continuation backslash, so the macro ends here and cannot
-   absorb whatever follows it. */
-#define BITARR_TEST(aaa_p,iii_p)                        \
-      (0 != (((UChar*)(aaa_p))[ ((UInt)(iii_p)) >> 3 ]  \
-               & (1 << (((UInt)(iii_p)) & 7))))
-
-
-/* Encodings of A/V state.  At each width, all-zeroes means valid and
-   all-ones means invalid. */
-
-/* A single bit. */
-#define VGM_BIT_VALID      0
-#define VGM_BIT_INVALID    1
-
-/* Four bits, e.g. the A bits of one aligned 4-byte word. */
-#define VGM_NIBBLE_VALID   0
-#define VGM_NIBBLE_INVALID 0xF
-
-/* Eight bits, e.g. the V bits of one byte. */
-#define VGM_BYTE_VALID     0
-#define VGM_BYTE_INVALID   0xFF
-
-/* Now in vg_include.h.
-#define VGM_WORD_VALID     0
-#define VGM_WORD_INVALID   0xFFFFFFFF
-*/
-
-/* NOTE(review): the valid encoding clears only bit 0; presumably a
-   single V bit is tracked for %eflags -- confirm against the users
-   of these constants. */
-#define VGM_EFLAGS_VALID   0xFFFFFFFE
-#define VGM_EFLAGS_INVALID 0xFFFFFFFF
-
-
-/* True iff the two lowest bits of aaa_p are zero, ie it is a
-   multiple of 4. */
-#define IS_ALIGNED4_ADDR(aaa_p) (0 == (((UInt)(aaa_p)) & 3))
-
-
-/*------------------------------------------------------------*/
-/*--- Basic bitmap management, reading and writing.        ---*/
-/*------------------------------------------------------------*/
-
-/* Obtain a fresh secondary map from mmap'd memory and initialise it
-   so that every byte it covers is unaddressible and carries invalid
-   value bits.  `caller' is passed on to the memory allocator. */
-
-static SecMap* alloc_secondary_map ( __attribute__ ((unused)) 
-                                     Char* caller )
-{
-   UInt    idx;
-   SecMap* fresh;
-
-   PROF_EVENT(10);
-
-   /* A SecMap happens to occupy exactly 18 pages.  Nothing depends
-      on that, so this check is not strictly necessary. */
-   vg_assert(0 == (sizeof(SecMap) % VKI_BYTES_PER_PAGE));
-   fresh = VG_(get_memory_from_mmap)( sizeof(SecMap), caller );
-
-   /* Every A bit set: no covered byte is addressible. */
-   idx = 0;
-   while (idx < 8192) {
-      fresh->abits[idx] = VGM_BYTE_INVALID;
-      idx++;
-   }
-
-   /* Every V bit set: no covered byte holds a valid value. */
-   idx = 0;
-   while (idx < 65536) {
-      fresh->vbyte[idx] = VGM_BYTE_INVALID;
-      idx++;
-   }
-
-   /* VG_(printf)("ALLOC_2MAP(%s)\n", caller ); */
-   return fresh;
-}
-
-
-/* Basic reading/writing of the bitmaps, for byte-sized accesses. */
-
-/* Return the A bit for the byte at address a: VGM_BIT_VALID if it is
-   addressible, VGM_BIT_INVALID otherwise. */
-static __inline__ UChar get_abit ( Addr a )
-{
-   SecMap* secmap;
-   UInt    offset;
-   PROF_EVENT(20);
-   /* High 16 address bits pick the secondary map, low 16 the byte
-      within it. */
-   secmap = VG_(primary_map)[a >> 16];
-   offset = a & 0xFFFF;
-   if (BITARR_TEST(secmap->abits, offset))
-      return VGM_BIT_INVALID;
-   return VGM_BIT_VALID;
-}
-
-/* Return the V byte (eight validity bits) for the byte at address
-   a. */
-static __inline__ UChar get_vbyte ( Addr a )
-{
-   SecMap* secmap;
-   PROF_EVENT(21);
-   secmap = VG_(primary_map)[a >> 16];
-   return secmap->vbyte[a & 0xFFFF];
-}
-
-/* Set the A bit for the byte at address a.  A nonzero abit sets the
-   bit (unaddressible); zero clears it (addressible).  A private
-   secondary map is allocated first if the chunk has none. */
-static __inline__ void set_abit ( Addr a, UChar abit )
-{
-   SecMap* secmap;
-   UInt    offset;
-   PROF_EVENT(22);
-   ENSURE_MAPPABLE(a, "set_abit");
-   offset = a & 0xFFFF;
-   secmap = VG_(primary_map)[a >> 16];
-   if (abit == 0)
-      BITARR_CLEAR(secmap->abits, offset);
-   else
-      BITARR_SET(secmap->abits, offset);
-}
-
-/* Store vbyte as the V byte for the byte at address a, allocating a
-   private secondary map first if the chunk has none. */
-static __inline__ void set_vbyte ( Addr a, UChar vbyte )
-{
-   SecMap* secmap;
-   PROF_EVENT(23);
-   ENSURE_MAPPABLE(a, "set_vbyte");
-   secmap = VG_(primary_map)[a >> 16];
-   secmap->vbyte[a & 0xFFFF] = vbyte;
-}
-
-
-/* Reading/writing of the bitmaps, for aligned word-sized accesses. */
-
-/* Return, in the low four bits of the result, the A bits of the four
-   bytes of the word at a.  The caller must supply a 4-aligned
-   address; that is only asserted when VG_DEBUG_MEMORY is defined. */
-static __inline__ UChar get_abits4_ALIGNED ( Addr a )
-{
-   SecMap* secmap;
-   UChar   group;
-   PROF_EVENT(24);
-#  ifdef VG_DEBUG_MEMORY
-   vg_assert(IS_ALIGNED4_ADDR(a));
-#  endif
-   secmap = VG_(primary_map)[a >> 16];
-   /* One abits byte holds the A bits of an 8-byte group. */
-   group = secmap->abits[(a & 0xFFFF) >> 3];
-   /* a & 4 is 0 or 4: selects the low or the high nibble. */
-   return (group >> (a & 4)) & 0x0F;
-}
-
-/* Return the four V bytes of the word at a, fetched as one UInt.
-   The caller must supply a 4-aligned address; that is only asserted
-   when VG_DEBUG_MEMORY is defined. */
-static UInt __inline__ get_vbytes4_ALIGNED ( Addr a )
-{
-   SecMap* secmap;
-   UInt*   vwords;
-   PROF_EVENT(25);
-#  ifdef VG_DEBUG_MEMORY
-   vg_assert(IS_ALIGNED4_ADDR(a));
-#  endif
-   secmap = VG_(primary_map)[a >> 16];
-   vwords = (UInt*)(secmap->vbyte);
-   return vwords[(a & 0xFFFF) >> 2];
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Setting permissions over address ranges.             ---*/
-/*------------------------------------------------------------*/
-
-/* Give every byte in [a, a+len) the A bit example_a_bit and V bits
-   that are all example_v_bit.  Each example bit must be
-   VGM_BIT_VALID or VGM_BIT_INVALID, and a range made unaddressible
-   must also be made invalid; both conditions are asserted.  A
-   zero-length request does nothing.  Requests above 100,000,000
-   bytes draw a warning but are still carried out. */
-static void set_address_range_perms ( Addr a, UInt len, 
-                                      UInt example_a_bit,
-                                      UInt example_v_bit )
-{
-   UChar   vbyte, abyte8;
-   UInt    vword4, sm_off;
-   SecMap* sm;
-
-   PROF_EVENT(30);
-
-   if (len == 0)
-      return;
-
-   if (len > 100 * 1000 * 1000) 
-      VG_(message)(Vg_UserMsg, 
-                   "Warning: set address range perms: "
-                   "large range %d, a %d, v %d",
-                   len, example_a_bit, example_v_bit );
-
-   VGP_PUSHCC(VgpSARP);
-
-   /* Requests to change permissions of huge address ranges may
-      indicate bugs in our machinery.  30,000,000 is arbitrary, but so
-      far all legitimate requests have fallen beneath that size. */
-   /* 4 Mar 02: this is just stupid; get rid of it. */
-   /* vg_assert(len < 30000000); */
-
-   /* Check the permissions make sense. */
-   vg_assert(example_a_bit == VGM_BIT_VALID 
-             || example_a_bit == VGM_BIT_INVALID);
-   vg_assert(example_v_bit == VGM_BIT_VALID 
-             || example_v_bit == VGM_BIT_INVALID);
-   if (example_a_bit == VGM_BIT_INVALID)
-      vg_assert(example_v_bit == VGM_BIT_INVALID);
-
-   /* The validity bits to write. */
-   vbyte = example_v_bit==VGM_BIT_VALID 
-              ? VGM_BYTE_VALID : VGM_BYTE_INVALID;
-
-   /* In order that we can charge through the address space at 8
-      bytes/main-loop iteration, make up some perms. */
-   abyte8 = (example_a_bit << 7)
-            | (example_a_bit << 6)
-            | (example_a_bit << 5)
-            | (example_a_bit << 4)
-            | (example_a_bit << 3)
-            | (example_a_bit << 2)
-            | (example_a_bit << 1)
-            | (example_a_bit << 0);
-   /* Widen vbyte to UInt before shifting: as a UChar it would be
-      promoted to (signed) int, and 0xFF << 24 does not fit in one. */
-   vword4 = ((UInt)vbyte << 24) | ((UInt)vbyte << 16) 
-            | ((UInt)vbyte << 8) | (UInt)vbyte;
-
-#  ifdef VG_DEBUG_MEMORY
-   /* Debug build: do the whole range one byte at a time. */
-   while (True) {
-      PROF_EVENT(31);
-      if (len == 0) break;
-      set_abit ( a, example_a_bit );
-      set_vbyte ( a, vbyte );
-      a++;
-      len--;
-   }
-
-#  else
-   /* Slowly do parts preceding 8-byte alignment. */
-   while (True) {
-      PROF_EVENT(31);
-      if (len == 0) break;
-      if ((a % 8) == 0) break;
-      set_abit ( a, example_a_bit );
-      set_vbyte ( a, vbyte );
-      a++;
-      len--;
-   }   
-
-   if (len == 0) {
-      VGP_POPCC;
-      return;
-   }
-   vg_assert((a % 8) == 0 && len > 0);
-
-   /* Once aligned, go fast: each iteration writes one whole abits
-      byte and two V words, covering 8 bytes of address space. */
-   while (True) {
-      PROF_EVENT(32);
-      if (len < 8) break;
-      ENSURE_MAPPABLE(a, "set_address_range_perms(fast)");
-      sm = VG_(primary_map)[a >> 16];
-      sm_off = a & 0xFFFF;
-      sm->abits[sm_off >> 3] = abyte8;
-      ((UInt*)(sm->vbyte))[(sm_off >> 2) + 0] = vword4;
-      ((UInt*)(sm->vbyte))[(sm_off >> 2) + 1] = vword4;
-      a += 8;
-      len -= 8;
-   }
-
-   if (len == 0) {
-      VGP_POPCC;
-      return;
-   }
-   vg_assert((a % 8) == 0 && len > 0 && len < 8);
-
-   /* Finish the upper fragment. */
-   while (True) {
-      PROF_EVENT(33);
-      if (len == 0) break;
-      set_abit ( a, example_a_bit );
-      set_vbyte ( a, vbyte );
-      a++;
-      len--;
-   }   
-#  endif
-
-   /* Check that zero page and highest page have not been written to
-      -- this could happen with buggy syscall wrappers.  Today
-      (2001-04-26) had precisely such a problem with
-      __NR_setitimer. */
-   vg_assert(VG_(first_and_last_secondaries_look_plausible)());
-   VGP_POPCC;
-}
-
-
-/* Set permissions for address ranges ... */
-
-/* Mark [a, a+len) as unaddressible, with invalid value bits. */
-void VGM_(make_noaccess) ( Addr a, UInt len )
-{
-   PROF_EVENT(35);
-   set_address_range_perms ( a, len, VGM_BIT_INVALID, VGM_BIT_INVALID );
-}
-
-/* Mark [a, a+len) as addressible, but with invalid value bits. */
-void VGM_(make_writable) ( Addr a, UInt len )
-{
-   PROF_EVENT(36);
-   set_address_range_perms ( a, len, VGM_BIT_VALID, VGM_BIT_INVALID );
-}
-
-/* Mark [a, a+len) as addressible, with valid value bits. */
-void VGM_(make_readable) ( Addr a, UInt len )
-{
-   PROF_EVENT(37);
-   set_address_range_perms ( a, len, VGM_BIT_VALID, VGM_BIT_VALID );
-}
-
-/* Mark [a, a+len) as addressible, with valid value bits.  This sets
-   the same permissions as VGM_(make_readable); only the profiling
-   event number differs. */
-void VGM_(make_readwritable) ( Addr a, UInt len )
-{
-   PROF_EVENT(38);
-   set_address_range_perms ( a, len, VGM_BIT_VALID, VGM_BIT_VALID );
-}
-
-/* Block-copy permissions (needed for implementing realloc()). */
-
-/* Copy the A bit and V byte of each of the len bytes starting at src
-   to the corresponding byte starting at dst, in ascending address
-   order, one byte at a time. */
-void VGM_(copy_address_range_perms) ( Addr src, Addr dst, UInt len )
-{
-   PROF_EVENT(40);
-   while (len > 0) {
-      /* Read both pieces of state before writing either. */
-      UChar a_state = get_abit ( src );
-      UChar v_state = get_vbyte ( src );
-      PROF_EVENT(41);
-      set_abit ( dst, a_state );
-      set_vbyte ( dst, v_state );
-      src++;
-      dst++;
-      len--;
-   }
-}
-
-
-/* Check permissions for address range.  If inadequate permissions
-   exist, *bad_addr is set to the offending address, so the caller can
-   know what it is. */
-
-/* Return True iff every byte in [a, a+len) is addressible.  On
-   failure, the first unaddressible address is stored in *bad_addr
-   when bad_addr is non-NULL. */
-Bool VGM_(check_writable) ( Addr a, UInt len, Addr* bad_addr )
-{
-   UInt remaining;
-   PROF_EVENT(42);
-   for (remaining = len; remaining > 0; remaining--, a++) {
-      PROF_EVENT(43);
-      if (get_abit(a) != VGM_BIT_INVALID)
-         continue;
-      /* Found an unaddressible byte. */
-      if (bad_addr != NULL) 
-         *bad_addr = a;
-      return False;
-   }
-   return True;
-}
-
-/* Return True iff every byte in [a, a+len) is both addressible and
-   fully valid.  On failure, the first offending address is stored
-   in *bad_addr when bad_addr is non-NULL. */
-Bool VGM_(check_readable) ( Addr a, UInt len, Addr* bad_addr )
-{
-   UInt remaining;
-   PROF_EVENT(44);
-   for (remaining = len; remaining > 0; remaining--, a++) {
-      UChar a_state = get_abit(a);
-      UChar v_state = get_vbyte(a);
-      PROF_EVENT(45);
-      if (a_state == VGM_BIT_VALID && v_state == VGM_BYTE_VALID)
-         continue;
-      /* This byte is unaddressible or not fully valid. */
-      if (bad_addr != NULL) 
-         *bad_addr = a;
-      return False;
-   }
-   return True;
-}
-
-
-/* Check a zero-terminated ascii string.  Tricky -- don't want to
-   examine the actual bytes, to find the end, until we're sure it is
-   safe to do so. */
-
-/* Return True iff the zero-terminated string starting at a is, up to
-   and including its terminator, addressible and fully valid.  Each
-   byte's A and V state is checked before the byte itself is read.
-   On failure, the offending address is stored in *bad_addr when
-   bad_addr is non-NULL. */
-Bool VGM_(check_readable_asciiz) ( Addr a, Addr* bad_addr )
-{
-   PROF_EVENT(46);
-   for (;; a++) {
-      UChar a_state;
-      UChar v_state;
-      PROF_EVENT(47);
-      a_state = get_abit(a);
-      v_state = get_vbyte(a);
-      if (!(a_state == VGM_BIT_VALID && v_state == VGM_BYTE_VALID)) {
-         if (bad_addr != NULL) 
-            *bad_addr = a;
-         return False;
-      }
-      /* The byte at a is now known to be safe to read. */
-      if (* ((UChar*)a) == 0) 
-         return True;
-   }
-}
-
-
-/* Setting permissions for aligned words.  This supports fast stack
-   operations. */
-
-/* Make the 4-byte word at a unaddressible and give it invalid value
-   bits.  The caller must supply a 4-aligned address; that is only
-   asserted when VG_DEBUG_MEMORY is defined. */
-static __inline__ void make_aligned_word_NOACCESS ( Addr a )
-{
-   SecMap* secmap;
-   UInt    offset;
-   UChar   nibble_mask;
-   PROF_EVENT(50);
-#  ifdef VG_DEBUG_MEMORY
-   vg_assert(IS_ALIGNED4_ADDR(a));
-#  endif
-   ENSURE_MAPPABLE(a, "make_aligned_word_NOACCESS");
-   secmap = VG_(primary_map)[a >> 16];
-   offset = a & 0xFFFF;
-   /* The word's four A bits are the low nibble of the abits byte
-      when (a & 4) is 0, and the high nibble when it is 4. */
-   nibble_mask = (a & 4 /* 100b */) ? 0xF0 : 0x0F;
-   /* Setting those A bits makes the four bytes unaddressible. */
-   secmap->abits[offset >> 3] |= nibble_mask;
-   ((UInt*)(secmap->vbyte))[offset >> 2] = VGM_WORD_INVALID;
-}
-
-/* Make the 4-byte word at a addressible, but with invalid value
-   bits.  The caller must supply a 4-aligned address; that is only
-   asserted when VG_DEBUG_MEMORY is defined. */
-static __inline__ void make_aligned_word_WRITABLE ( Addr a )
-{
-   SecMap* secmap;
-   UInt    offset;
-   UChar   nibble_mask;
-   PROF_EVENT(51);
-#  ifdef VG_DEBUG_MEMORY
-   vg_assert(IS_ALIGNED4_ADDR(a));
-#  endif
-   ENSURE_MAPPABLE(a, "make_aligned_word_WRITABLE");
-   secmap = VG_(primary_map)[a >> 16];
-   offset = a & 0xFFFF;
-   /* The word's four A bits are the low nibble of the abits byte
-      when (a & 4) is 0, and the high nibble when it is 4. */
-   nibble_mask = (a & 4 /* 100b */) ? 0xF0 : 0x0F;
-   /* Clearing those A bits (to 0s) makes the four bytes
-      addressible. */
-   secmap->abits[offset >> 3] &= ~nibble_mask;
-   ((UInt*)(secmap->vbyte))[offset >> 2] = VGM_WORD_INVALID;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Functions called directly from generated code.       ---*/
-/*------------------------------------------------------------*/
-
-/* Exchange the upper and lower 16-bit halves of x, ie rotate it by
-   16 bits (assuming UInt is 32 bits wide -- TODO confirm). */
-static __inline__ UInt rotateRight16 ( UInt x )
-{
-   /* Amazingly, gcc turns this into a single rotate insn. */
-   return (x >> 16) | (x << 16);
-}
-
-
-/* Return x shifted right by 16 bits; for an address this is its
-   index into the first 65536 entries of the primary map. */
-static __inline__ UInt shiftRight16 ( UInt x )
-{
-   return x >> 16;
-}
-
-
-/* Read/write 1/2/4 sized V bytes, and emit an address error if
-   needed. */
-
-/* VG_(helperc_{LOAD,STORE}V{1,2,4}) handle the common case fast.
-   Under all other circumstances, they defer to the relevant _SLOWLY
-   function, which can handle all situations.
-*/
-/* Return the V bits for a 4-byte load from a.  The primary map is
-   indexed with the two lowest address bits placed above the 16 high
-   address bits, so a misaligned address selects an entry in the
-   upper 3/4 of the primary map.  Anything other than four
-   addressible bytes goes to vgm_rd_V4_SLOWLY. */
-UInt VG_(helperc_LOADV4) ( Addr a )
-{
-#  ifdef VG_DEBUG_MEMORY
-   return vgm_rd_V4_SLOWLY(a);
-#  else
-   SecMap* secmap = VG_(primary_map)[ rotateRight16(a) & 0x3FFFF ];
-   UChar   nibble = secmap->abits[ (a & 0xFFFF) >> 3 ];
-   /* Pick out the four A bits belonging to this word. */
-   nibble = (nibble >> (a & 4)) & 15;
-   PROF_EVENT(60);
-   if (nibble != VGM_NIBBLE_VALID) {
-      /* Slow but general case. */
-      return vgm_rd_V4_SLOWLY(a);
-   }
-   /* Common case: all four bytes are addressible. */
-   return ((UInt*)(secmap->vbyte))[ (a & 0xFFFF) >> 2 ];
-#  endif
-}
-
-/* Record vbytes as the V bits for a 4-byte store to a.  The primary
-   map is indexed with the two lowest address bits placed above the
-   16 high address bits, so a misaligned address selects an entry in
-   the upper 3/4 of the primary map.  Anything other than four
-   addressible bytes goes to vgm_wr_V4_SLOWLY. */
-void VG_(helperc_STOREV4) ( Addr a, UInt vbytes )
-{
-#  ifdef VG_DEBUG_MEMORY
-   vgm_wr_V4_SLOWLY(a, vbytes);
-#  else
-   SecMap* secmap = VG_(primary_map)[ rotateRight16(a) & 0x3FFFF ];
-   UChar   nibble = secmap->abits[ (a & 0xFFFF) >> 3 ];
-   /* Pick out the four A bits belonging to this word. */
-   nibble = (nibble >> (a & 4)) & 15;
-   PROF_EVENT(61);
-   if (nibble != VGM_NIBBLE_VALID) {
-      /* Slow but general case. */
-      vgm_wr_V4_SLOWLY(a, vbytes);
-      return;
-   }
-   /* Common case: all four bytes are addressible. */
-   ((UInt*)(secmap->vbyte))[ (a & 0xFFFF) >> 2 ] = vbytes;
-#  endif
-}
-
-/* Return the V bits for a 2-byte load from a, with the upper 16 bits
-   of the result set (invalid).  The lowest address bit is placed
-   above the 16 high address bits when indexing the primary map.
-   The fast path is taken only when the whole abits byte for the
-   enclosing 8-byte group is valid; everything else goes to
-   vgm_rd_V2_SLOWLY. */
-UInt VG_(helperc_LOADV2) ( Addr a )
-{
-#  ifdef VG_DEBUG_MEMORY
-   return vgm_rd_V2_SLOWLY(a);
-#  else
-   SecMap* secmap = VG_(primary_map)[ rotateRight16(a) & 0x1FFFF ];
-   PROF_EVENT(62);
-   if (secmap->abits[ (a & 0xFFFF) >> 3 ] != VGM_BYTE_VALID) {
-      /* Slow but general case. */
-      return vgm_rd_V2_SLOWLY(a);
-   }
-   /* Handle common case quickly. */
-   return 0xFFFF0000 
-          | (UInt)( ((UShort*)(secmap->vbyte))[ (a & 0xFFFF) >> 1 ] );
-#  endif
-}
-
-/* Record the low 16 bits of vbytes as the V bits for a 2-byte store
-   to a.  The lowest address bit is placed above the 16 high address
-   bits when indexing the primary map.  The fast path is taken only
-   when the whole abits byte for the enclosing 8-byte group is
-   valid; everything else goes to vgm_wr_V2_SLOWLY. */
-void VG_(helperc_STOREV2) ( Addr a, UInt vbytes )
-{
-#  ifdef VG_DEBUG_MEMORY
-   vgm_wr_V2_SLOWLY(a, vbytes);
-#  else
-   SecMap* secmap = VG_(primary_map)[ rotateRight16(a) & 0x1FFFF ];
-   PROF_EVENT(63);
-   if (secmap->abits[ (a & 0xFFFF) >> 3 ] != VGM_BYTE_VALID) {
-      /* Slow but general case. */
-      vgm_wr_V2_SLOWLY(a, vbytes);
-      return;
-   }
-   /* Handle common case quickly. */
-   ((UShort*)(secmap->vbyte))[ (a & 0xFFFF) >> 1 ] = vbytes & 0x0000FFFF;
-#  endif
-}
-
-/* Return the V bits for a 1-byte load from a, with the upper 24 bits
-   of the result set (invalid).  The fast path is taken only when
-   the whole abits byte for the enclosing 8-byte group is valid;
-   everything else goes to vgm_rd_V1_SLOWLY. */
-UInt VG_(helperc_LOADV1) ( Addr a )
-{
-#  ifdef VG_DEBUG_MEMORY
-   return vgm_rd_V1_SLOWLY(a);
-#  else
-   SecMap* secmap = VG_(primary_map)[ shiftRight16(a) ];
-   PROF_EVENT(64);
-   if (secmap->abits[ (a & 0xFFFF) >> 3 ] != VGM_BYTE_VALID) {
-      /* Slow but general case. */
-      return vgm_rd_V1_SLOWLY(a);
-   }
-   /* Handle common case quickly. */
-   return 0xFFFFFF00
-          | (UInt)( ((UChar*)(secmap->vbyte))[ a & 0xFFFF ] );
-#  endif
-}
-
-/* Record the low 8 bits of vbytes as the V bits for a 1-byte store
-   to a.  The fast path is taken only when the whole abits byte for
-   the enclosing 8-byte group is valid; everything else goes to
-   vgm_wr_V1_SLOWLY. */
-void VG_(helperc_STOREV1) ( Addr a, UInt vbytes )
-{
-#  ifdef VG_DEBUG_MEMORY
-   vgm_wr_V1_SLOWLY(a, vbytes);
-#  else
-   SecMap* secmap = VG_(primary_map)[ shiftRight16(a) ];
-   PROF_EVENT(65);
-   if (secmap->abits[ (a & 0xFFFF) >> 3 ] != VGM_BYTE_VALID) {
-      /* Slow but general case. */
-      vgm_wr_V1_SLOWLY(a, vbytes);
-      return;
-   }
-   /* Handle common case quickly. */
-   ((UChar*)(secmap->vbyte))[ a & 0xFFFF ] = vbytes & 0x000000FF;
-#  endif
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Fallback functions to handle cases that the above    ---*/
-/*--- VG_(helperc_{LD,ST}V{1,2,4}) can't manage.           ---*/
-/*------------------------------------------------------------*/
-
-/* General-case read of the V bits for the 4-byte value at a.  Works
-   byte by byte, so it copes with any alignment and with partially
-   addressible words.  Returns the V word, and records an address
-   error where the rules below call for one. */
-static UInt vgm_rd_V4_SLOWLY ( Addr a )
-{
-   Bool ok[4];
-   UInt vb[4];
-   UInt vw;
-   Int  n, n_ok;
-
-   PROF_EVENT(70);
-
-   /* Establish independently the addressibility of each of the four
-      bytes, then fetch their V bytes. */
-   for (n = 0; n < 4; n++)
-      ok[n] = get_abit(a+n) == VGM_BIT_VALID;
-   for (n = 0; n < 4; n++)
-      vb[n] = (UInt)get_vbyte(a+n);
-
-   n_ok = 0;
-   for (n = 0; n < 4; n++)
-      if (ok[n]) n_ok++;
-
-   /* Case 1: all four bytes are addressible.  No address error; the
-      result is the V bytes from memory, byte 0 least significant. */
-   if (n_ok == 4) {
-      vw = VGM_WORD_INVALID;
-      for (n = 3; n >= 0; n--)
-         vw = (vw << 8) | vb[n];
-      return vw;
-   }
-
-   /* Case 2: report an address error and return an all-valid V
-      word.  This applies when partial loads are not allowed, when
-      the address is not 4-aligned, or when none of the four bytes
-      is addressible.  Returning "valid" avoids a trail of confusing
-      undefined-value errors that would obscure the real cause, the
-      bad address. */
-   /* VG_(printf)("%p (%d %d %d %d)\n", a, a0ok, a1ok, a2ok, a3ok); */
-   if (!VG_(clo_partial_loads_ok) || (a & 3) != 0 || n_ok == 0) {
-      VG_(record_address_error)( a, 4, False );
-      return (VGM_BYTE_VALID << 24) | (VGM_BYTE_VALID << 16) 
-             | (VGM_BYTE_VALID << 8) | VGM_BYTE_VALID;
-   }
-
-   /* Case 3: a 4-aligned word that is only partly addressible, with
-      partial loads allowed.  No address error; each addressible
-      byte contributes its V byte from memory and each unaddressible
-      byte contributes an invalid V byte. */
-   vg_assert(VG_(clo_partial_loads_ok));
-   vw = VGM_WORD_INVALID;
-   for (n = 3; n >= 0; n--)
-      vw = (vw << 8) | (ok[n] ? vb[n] : VGM_BYTE_INVALID);
-   return vw;
-}
-
-/* General-case write of the V bits for the 4-byte value at a.  The
-   V bytes are always stored; an address error is then recorded if
-   any of the four bytes is unaddressible. */
-static void vgm_wr_V4_SLOWLY ( Addr a, UInt vbytes )
-{
-   Bool bad_address = False;
-   Int  n;
-   PROF_EVENT(71);
-
-   /* Look at the addressibility of all four bytes first. */
-   for (n = 0; n < 4; n++) {
-      if (get_abit(a+n) != VGM_BIT_VALID) 
-         bad_address = True;
-   }
-
-   /* Store the V bytes little-endian-ly: the lowest byte of vbytes
-      belongs to the lowest address. */
-   for (n = 0; n < 4; n++) {
-      set_vbyte( a+n, vbytes & 0x000000FF );
-      vbytes >>= 8;
-   }
-
-   if (bad_address)
-      VG_(record_address_error)( a, 4, True );
-}
-
-/* General-case read of the V bits for the 2-byte value at a.
-   Returns a V word whose upper 16 bits are invalid and whose lower
-   16 bits are the V bytes from memory.  If either byte is
-   unaddressible, an address error is recorded and the lower 16 bits
-   are returned as valid instead. */
-static UInt vgm_rd_V2_SLOWLY ( Addr a )
-{
-   /* Check the address for validity. */
-   UInt vw   = VGM_WORD_INVALID;
-   Bool aerr = False;
-   PROF_EVENT(72);
-
-   if (get_abit(a+0) != VGM_BIT_VALID) aerr = True;
-   if (get_abit(a+1) != VGM_BIT_VALID) aerr = True;
-
-   /* Fetch the V bytes, remembering to do it little-endian-ly. */
-   vw <<= 8; vw |= (UInt)get_vbyte(a+1);
-   vw <<= 8; vw |= (UInt)get_vbyte(a+0);
-
-   /* If an address error has happened, report it. */
-   if (aerr) {
-      VG_(record_address_error)( a, 2, False );
-      /* Shift as UInt: 0xFF << 24 does not fit in a signed int. */
-      vw = ((UInt)VGM_BYTE_INVALID << 24) 
-           | ((UInt)VGM_BYTE_INVALID << 16) 
-           | ((UInt)VGM_BYTE_VALID << 8) | (UInt)VGM_BYTE_VALID;
-   }
-   return vw;   
-}
-
-/* General-case write of the V bits for the 2-byte value at a.  The
-   two low V bytes of vbytes are always stored; an address error is
-   then recorded if either byte is unaddressible. */
-static void vgm_wr_V2_SLOWLY ( Addr a, UInt vbytes )
-{
-   Bool bad_address = False;
-   Int  n;
-   PROF_EVENT(73);
-
-   /* Look at the addressibility of both bytes first. */
-   for (n = 0; n < 2; n++) {
-      if (get_abit(a+n) != VGM_BIT_VALID) 
-         bad_address = True;
-   }
-
-   /* Store the V bytes little-endian-ly: the lowest byte of vbytes
-      belongs to the lowest address. */
-   for (n = 0; n < 2; n++) {
-      set_vbyte( a+n, vbytes & 0x000000FF );
-      vbytes >>= 8;
-   }
-
-   if (bad_address)
-      VG_(record_address_error)( a, 2, True );
-}
-
-/* General-case read of the V bits for the single byte at a.  Returns
-   a V word whose upper 24 bits are invalid and whose lowest 8 bits
-   are the V byte from memory.  If the byte is unaddressible, an
-   address error is recorded and the lowest 8 bits are returned as
-   valid instead. */
-static UInt vgm_rd_V1_SLOWLY ( Addr a )
-{
-   /* Check the address for validity. */
-   UInt vw   = VGM_WORD_INVALID;
-   Bool aerr = False;
-   PROF_EVENT(74);
-
-   if (get_abit(a+0) != VGM_BIT_VALID) aerr = True;
-
-   /* Fetch the V byte. */
-   vw <<= 8; vw |= (UInt)get_vbyte(a+0);
-
-   /* If an address error has happened, report it. */
-   if (aerr) {
-      VG_(record_address_error)( a, 1, False );
-      /* Shift as UInt: 0xFF << 24 does not fit in a signed int. */
-      vw = ((UInt)VGM_BYTE_INVALID << 24) 
-           | ((UInt)VGM_BYTE_INVALID << 16) 
-           | ((UInt)VGM_BYTE_INVALID << 8) | (UInt)VGM_BYTE_VALID;
-   }
-   return vw;   
-}
-
-/* General-case write of the V bits for the single byte at a.  The
-   low V byte of vbytes is always stored; an address error is then
-   recorded if the byte is unaddressible. */
-static void vgm_wr_V1_SLOWLY ( Addr a, UInt vbytes )
-{
-   Bool bad_address;
-   PROF_EVENT(75);
-
-   /* Note the addressibility before touching the V byte. */
-   bad_address = False;
-   if (get_abit(a+0) != VGM_BIT_VALID) 
-      bad_address = True;
-
-   /* Only the lowest byte of vbytes is stored. */
-   set_vbyte( a+0, vbytes & 0x000000FF );
-
-   if (bad_address)
-      VG_(record_address_error)( a, 1, True );
-}
-
-
-/* ---------------------------------------------------------------------
-   Called from generated code, or from the assembly helpers.
-   Handlers for value check failures.
-   ------------------------------------------------------------------ */
-
-/* Value-check failure handler: records a value error with argument
-   0.  NOTE(review): the argument is presumably the operand size in
-   bytes, with 0 as a special case -- confirm against
-   VG_(record_value_error). */
-void VG_(helperc_value_check0_fail) ( void )
-{
-   VG_(record_value_error) ( 0 );
-}
-
-/* Value-check failure handler: records a value error with argument
-   1 (presumably the operand size in bytes -- confirm against
-   VG_(record_value_error)). */
-void VG_(helperc_value_check1_fail) ( void )
-{
-   VG_(record_value_error) ( 1 );
-}
-
-/* Value-check failure handler: records a value error with argument
-   2 (presumably the operand size in bytes -- confirm against
-   VG_(record_value_error)). */
-void VG_(helperc_value_check2_fail) ( void )
-{
-   VG_(record_value_error) ( 2 );
-}
-
-/* Value-check failure handler: records a value error with argument
-   4 (presumably the operand size in bytes -- confirm against
-   VG_(record_value_error)). */
-void VG_(helperc_value_check4_fail) ( void )
-{
-   VG_(record_value_error) ( 4 );
-}
-
-
-/* ---------------------------------------------------------------------
-   FPU load and store checks, called from generated code.
-   ------------------------------------------------------------------ */
-
-void VGM_(fpu_read_check) ( Addr addr, Int size )
-{
-   /* Check that the 'size' bytes at 'addr' can be read: they must be
-      addressible and hold valid data.  Sizes 4 and 8 get an inline
-      fast path for 4-aligned addresses; every other case, and every
-      fast-path miss, is handed to fpu_read_check_SLOWLY, which does
-      the error reporting.  Sizes other than 2, 4, 8, 10 and 28 cause
-      a panic. */
-
-   SecMap* sm;
-   UInt    off;
-
-   PROF_EVENT(80);
-
-#  ifdef VG_DEBUG_MEMORY
-   fpu_read_check_SLOWLY ( addr, size );
-#  else
-
-   switch (size) {
-
-      case 4:
-         if (IS_ALIGNED4_ADDR(addr)) {
-            PROF_EVENT(81);
-            sm  = VG_(primary_map)[addr >> 16];
-            off = addr & 0xFFFF;
-            /* Done if the A byte covering this word and the word's V
-               bits are both fully valid. */
-            if (sm->abits[off >> 3] == VGM_BYTE_VALID
-                && ((UInt*)(sm->vbyte))[off >> 2] == VGM_WORD_VALID)
-               return;
-         }
-         fpu_read_check_SLOWLY ( addr, 4 );
-         return;
-
-      case 8:
-         if (IS_ALIGNED4_ADDR(addr)) {
-            PROF_EVENT(82);
-            /* Low word first ... */
-            sm  = VG_(primary_map)[addr >> 16];
-            off = addr & 0xFFFF;
-            if (sm->abits[off >> 3] == VGM_BYTE_VALID
-                && ((UInt*)(sm->vbyte))[off >> 2] == VGM_WORD_VALID) {
-               /* ... then the word four bytes further on, which may
-                  live in a different secondary map. */
-               sm  = VG_(primary_map)[(addr + 4) >> 16];
-               off = (addr + 4) & 0xFFFF;
-               if (sm->abits[off >> 3] == VGM_BYTE_VALID
-                   && ((UInt*)(sm->vbyte))[off >> 2] == VGM_WORD_VALID)
-                  return;
-            }
-         }
-         fpu_read_check_SLOWLY ( addr, 8 );
-         return;
-
-      /* No fast path for the remaining sizes. */
-      case 2:
-         PROF_EVENT(83);
-         fpu_read_check_SLOWLY ( addr, 2 );
-         return;
-
-      case 10:
-         PROF_EVENT(84);
-         fpu_read_check_SLOWLY ( addr, 10 );
-         return;
-
-      case 28:
-         PROF_EVENT(84); /* XXX shares its event number with size 10 */
-         fpu_read_check_SLOWLY ( addr, 28 );
-         return;
-
-      default:
-         break;
-   }
-
-   VG_(printf)("size is %d\n", size);
-   VG_(panic)("vgm_fpu_read_check: unhandled size");
-#  endif
-}
-
-
-void VGM_(fpu_write_check) ( Addr addr, Int size )
-{
-   /* Check that the 'size' bytes at 'addr' can be written.  On the
-      inline fast path (sizes 4 and 8, 4-aligned, addressible) the
-      V bits of each word are simply set to VGM_WORD_VALID.  Anything
-      else goes to fpu_write_check_SLOWLY, which also does the
-      complaining.  Sizes other than 2, 4, 8, 10 and 28 cause a
-      panic. */
-
-   SecMap* sm;
-   UInt    off;
-
-   PROF_EVENT(85);
-
-#  ifdef VG_DEBUG_MEMORY
-   fpu_write_check_SLOWLY ( addr, size );
-#  else
-
-   switch (size) {
-
-      case 4:
-         if (IS_ALIGNED4_ADDR(addr)) {
-            PROF_EVENT(86);
-            sm  = VG_(primary_map)[addr >> 16];
-            off = addr & 0xFFFF;
-            if (sm->abits[off >> 3] == VGM_BYTE_VALID) {
-               ((UInt*)(sm->vbyte))[off >> 2] = VGM_WORD_VALID;
-               return;
-            }
-         }
-         fpu_write_check_SLOWLY ( addr, 4 );
-         return;
-
-      case 8:
-         if (IS_ALIGNED4_ADDR(addr)) {
-            PROF_EVENT(87);
-            /* Low word: validate it as soon as it is known to be
-               addressible, before the second word is examined. */
-            sm  = VG_(primary_map)[addr >> 16];
-            off = addr & 0xFFFF;
-            if (sm->abits[off >> 3] == VGM_BYTE_VALID) {
-               ((UInt*)(sm->vbyte))[off >> 2] = VGM_WORD_VALID;
-               /* Word four bytes further on. */
-               sm  = VG_(primary_map)[(addr + 4) >> 16];
-               off = (addr + 4) & 0xFFFF;
-               if (sm->abits[off >> 3] == VGM_BYTE_VALID) {
-                  ((UInt*)(sm->vbyte))[off >> 2] = VGM_WORD_VALID;
-                  return;
-               }
-            }
-         }
-         fpu_write_check_SLOWLY ( addr, 8 );
-         return;
-
-      /* No fast path for the remaining sizes. */
-      case 2:
-         PROF_EVENT(88);
-         fpu_write_check_SLOWLY ( addr, 2 );
-         return;
-
-      case 10:
-         PROF_EVENT(89);
-         fpu_write_check_SLOWLY ( addr, 10 );
-         return;
-
-      case 28:
-         PROF_EVENT(89); /* XXX shares its event number with size 10 */
-         fpu_write_check_SLOWLY ( addr, 28 );
-         return;
-
-      default:
-         break;
-   }
-
-   VG_(printf)("size is %d\n", size);
-   VG_(panic)("vgm_fpu_write_check: unhandled size");
-#  endif
-}
-
-
-/* ---------------------------------------------------------------------
-   Slow, general cases for FPU load and store checks.
-   ------------------------------------------------------------------ */
-
-/* General byte-at-a-time read check for 'size' bytes starting at
-   'addr'.  Every byte has both its A bit and its V byte examined.
-   At most one error is recorded: an address error if any byte was
-   unaddressible, otherwise a value error if any byte held invalid
-   data. */
-
-void fpu_read_check_SLOWLY ( Addr addr, Int size )
-{
-   Int  k;
-   Bool bad_addr  = False;
-   Bool bad_value = False;
-
-   PROF_EVENT(90);
-
-   k = 0;
-   while (k < size) {
-      PROF_EVENT(91);
-      if (get_abit(addr+k) != VGM_BIT_VALID)
-         bad_addr = True;
-      if (get_vbyte(addr+k) != VGM_BYTE_VALID)
-         bad_value = True;
-      k++;
-   }
-
-   /* An address error takes precedence over a value error. */
-   if (bad_addr)
-      VG_(record_address_error)( addr, size, False );
-   else if (bad_value)
-      VG_(record_value_error)( size );
-}
-
-
-/* General byte-at-a-time write check for 'size' bytes starting at
-   'addr'.  Each addressible byte has its V byte set to valid, each
-   unaddressible byte has it set to invalid.  If any byte was
-   unaddressible, a single address error (write) covering the whole
-   range is recorded at the end. */
-
-void fpu_write_check_SLOWLY ( Addr addr, Int size )
-{
-   Int  k;
-   Addr here;
-   Bool any_bad = False;
-
-   PROF_EVENT(92);
-
-   for (k = 0; k < size; k++) {
-      PROF_EVENT(93);
-      here = addr + k;
-      if (get_abit(here) != VGM_BIT_VALID) {
-         set_vbyte(here, VGM_BYTE_INVALID);
-         any_bad = True;
-      } else {
-         set_vbyte(here, VGM_BYTE_VALID);
-      }
-   }
-
-   if (any_bad)
-      VG_(record_address_error)( addr, size, True );
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Tracking permissions around %esp changes.            ---*/
-/*------------------------------------------------------------*/
-
-/*
-   The stack
-   ~~~~~~~~~
-   The stack's segment seems to be dynamically extended downwards
-   by the kernel as the stack pointer moves down.  Initially, a
-   1-page (4k) stack is allocated.  When %esp moves below that for
-   the first time, presumably a page fault occurs.  The kernel
-   detects that the faulting address is in the range from %esp upwards
-   to the current valid stack.  It then extends the stack segment
-   downwards far enough to cover the faulting address, and resumes
-   the process (invisibly).  The process is unaware of any of this.
-
-   That means that Valgrind can't spot when the stack segment is
-   being extended.  Fortunately, we want to precisely and continuously
-   update stack permissions around %esp, so we need to spot all
-   writes to %esp anyway.
-
-   The deal is: when %esp is assigned a lower value, the stack is
-   being extended.  Create secondary maps to fill in any holes
-   between the old stack ptr and this one, if necessary.  Then 
-   mark all bytes in the area just "uncovered" by this %esp change
-   as write-only.
-
-   When %esp goes back up, mark the area receded over as unreadable
-   and unwritable.
-
-   Just to record the %esp boundary conditions somewhere convenient:
-   %esp always points to the lowest live byte in the stack.  All
-   addresses below %esp are not live; those at and above it are.  
-*/
-
-/* True iff 'aa' lies in the window of VG_PLAUSIBLE_STACK_SIZE bytes
-   that ends at (and includes) tst->stack_highest_word, i.e. it
-   could plausibly belong to that thread's stack. */
-static
-Bool is_plausible_stack_addr ( ThreadState* tst, Addr aa )
-{
-   UInt probe = (UInt)aa;
-   PROF_EVENT(100);
-
-   /* Above the top of the stack? */
-   if (probe > tst->stack_highest_word)
-      return False;
-
-   /* At or below the lower edge of the plausible window? */
-   if (probe <= tst->stack_highest_word - VG_PLAUSIBLE_STACK_SIZE)
-      return False;
-
-   return True;
-}
-
-
-/* True iff 'aa' is strictly below 'esp' and no more than
-   VG_GCC296_BUG_STACK_SLOP bytes away from it.  Used only for the
-   --workaround-gcc296-bugs kludge. */
-Bool VG_(is_just_below_ESP)( Addr esp, Addr aa )
-{
-   /* At or above the stack pointer: not "below" at all. */
-   if ((UInt)esp <= (UInt)aa)
-      return False;
-
-   return (((UInt)esp - (UInt)aa) <= VG_GCC296_BUG_STACK_SLOP)
-          ? True : False;
-}
-
-
-/* Kludgey ... how much does %esp have to change before we reckon that
-   the application is switching stacks ? */
-#define VG_HUGE_DELTA (VG_PLAUSIBLE_STACK_SIZE / 4)
-
-/* Round 'a' down to the start of the page containing it, by
-   removing the bits selected by (VKI_BYTES_PER_PAGE-1). */
-static Addr get_page_base ( Addr a )
-{
-   Addr within_page = a & (VKI_BYTES_PER_PAGE-1);
-   return a - within_page;
-}
-
-
-static void vg_handle_esp_assignment_SLOWLY ( Addr );
-
-/* Update stack permissions for an assignment of 'new_espA' to %esp.
-   The old value is read from VG_(baseBlock).  When the old %esp is
-   4-aligned and the change is one of the small multiples of 4
-   listed below, the affected words are handled one at a time here:
-   moving down makes [new_esp, old_esp) writable, moving up makes
-   [old_esp, new_esp) inaccessible.  Everything else is passed to
-   vg_handle_esp_assignment_SLOWLY. */
-void VGM_(handle_esp_assignment) ( Addr new_espA )
-{
-   UInt old_esp = VG_(baseBlock)[VGOFF_(m_esp)];
-   UInt new_esp = (UInt)new_espA;
-   Int  delta   = ((Int)new_esp) - ((Int)old_esp);
-
-   PROF_EVENT(101);
-
-#  ifndef VG_DEBUG_MEMORY
-
-   if (IS_ALIGNED4_ADDR(old_esp)) {
-      UInt word;
-      Bool is_common = True;
-
-      /* Pick the profiling event for each specially-handled delta;
-         any other delta is not a common case. */
-      switch (delta) {
-         case  -4: PROF_EVENT(102); break;
-         case   4: PROF_EVENT(103); break;
-         case -12: PROF_EVENT(104); break;
-         case  -8: PROF_EVENT(105); break;
-         case  16: PROF_EVENT(106); break;
-         case  12: PROF_EVENT(107); break;
-         case   0: PROF_EVENT(108); break;
-         case   8: PROF_EVENT(109); break;
-         case -16: PROF_EVENT(110); break;
-         case  20: PROF_EVENT(111); break;
-         case -20: PROF_EVENT(112); break;
-         case  24: PROF_EVENT(113); break;
-         case -24: PROF_EVENT(114); break;
-         default:  is_common = False; break;
-      }
-
-      if (is_common) {
-         if (delta < 0) {
-            /* Stack grew: newly exposed words, lowest first. */
-            for (word = new_esp; word != old_esp; word += 4)
-               make_aligned_word_WRITABLE(word);
-         } else {
-            /* Stack shrank (or did not move): words receded over,
-               lowest first. */
-            for (word = old_esp; word != new_esp; word += 4)
-               make_aligned_word_NOACCESS(word);
-         }
-         return;
-      }
-   }
-
-#  endif
-
-   /* Not one of the special cases above; use the slow-but-general
-      mechanism. */
-   vg_handle_esp_assignment_SLOWLY ( new_espA );
-}
-
-
-/* General handler for a change of %esp to 'new_espA'.
-
-   If the change is smaller in magnitude than VG_HUGE_DELTA it is
-   treated as ordinary stack growth or shrinkage: the bytes between
-   the old and new %esp become writable (moving down) or
-   inaccessible (moving up).
-
-   Otherwise the client is assumed to be switching stacks.  The part
-   of the new %esp's page lying below %esp is made inaccessible, and,
-   unless the new %esp looks like it is on the current thread's
-   stack, the rest of that page from %esp upwards is made readable.
-   Making it readable rather than writable is a kludge to keep the
-   error count down (netscape-4.07); to be investigated. */
-static void vg_handle_esp_assignment_SLOWLY ( Addr new_espA )
-{
-   UInt         old_esp = VG_(baseBlock)[VGOFF_(m_esp)];
-   UInt         new_esp = (UInt)new_espA;
-   Int          delta   = ((Int)new_esp) - ((Int)old_esp);
-   Addr         invalid_down_to;
-   Addr         valid_up_to;
-   ThreadState* tst;
-
-   PROF_EVENT(120);
-
-   if (-(VG_HUGE_DELTA) < delta && delta < VG_HUGE_DELTA) {
-      /* "Ordinary" stack change. */
-      if (new_esp < old_esp) {
-         /* Growing downwards. */
-         PROF_EVENT(121);
-         VGM_(make_writable) ( new_esp, old_esp - new_esp );
-      } else if (new_esp > old_esp) {
-         /* Shrinking upwards. */
-         PROF_EVENT(122);
-         VGM_(make_noaccess) ( old_esp, new_esp - old_esp );
-      } else {
-         /* No movement at all. */
-         PROF_EVENT(123);
-      }
-      return;
-   }
-
-   /* Stack switch.  The "0 *" terms leave room to widen either area
-      by whole pages; at present neither is widened. */
-   invalid_down_to = get_page_base(new_esp)
-                     - 0 * VKI_BYTES_PER_PAGE;
-   valid_up_to     = get_page_base(new_esp) + VKI_BYTES_PER_PAGE
-                     + 0 * VKI_BYTES_PER_PAGE;
-   tst             = VG_(get_current_thread_state)();
-
-   PROF_EVENT(124);
-
-   if (VG_(clo_verbosity) > 1)
-      VG_(message)(Vg_UserMsg, "Warning: client switching stacks?  "
-                               "%%esp: %p --> %p",
-                                old_esp, new_esp);
-
-   VGM_(make_noaccess) ( invalid_down_to, new_esp - invalid_down_to );
-
-   /* Leave the area at and above %esp alone if this looks like a
-      switch back to the thread's own stack. */
-   if (!is_plausible_stack_addr(tst, new_esp))
-      VGM_(make_readable) ( new_esp, valid_up_to - new_esp );
-}
-
-
-/*--------------------------------------------------------------*/
-/*--- Initialise the memory audit system on program startup. ---*/
-/*--------------------------------------------------------------*/
-
-/* Handle one entry derived from /proc/self/maps.
-
-   start, size   -- base address and length of the mapping.
-   rr, ww, xx    -- permission characters, compared against 'r',
-                    'w' and 'x' respectively.
-   foffset       -- file offset; only used by a disabled debug
-                    message.
-   filename      -- name of the mapped file, or NULL.
-
-   The whole mapping is given initial A/V permissions, and if it
-   contains the current %esp, everything in it below %esp is then
-   made inaccessible.  Panics if an executable mapping whose name
-   lacks ".so" does not start at VG_ASSUMED_EXE_BASE. */
-
-static
-void init_memory_audit_callback ( 
-        Addr start, UInt size, 
-        Char rr, Char ww, Char xx, 
-        UInt foffset, UChar* filename )
-{
-   UChar example_a_bit;
-   UChar example_v_bit;
-   UInt  r_esp;
-   Bool  is_stack_segment;
-
-   /* Sanity check ... if this is the executable's text segment,
-      ensure it is loaded where we think it ought to be.  Any file
-      name which doesn't contain ".so" is assumed to be the
-      executable. */
-   if (filename != NULL
-       && xx == 'x'
-       /* Name the function through VG_(..) and then call it, as is
-          done everywhere else, instead of wrapping the whole call
-          expression in the macro. */
-       && VG_(strstr)(filename, ".so") == NULL
-      ) {
-      /* We assume this is the executable. */
-      if (start != VG_ASSUMED_EXE_BASE) {
-         VG_(message)(Vg_UserMsg,
-                      "FATAL: executable base addr not as assumed.");
-         VG_(message)(Vg_UserMsg, "name %s, actual %p, assumed %p.",
-                      filename, start, VG_ASSUMED_EXE_BASE);
-         VG_(message)(Vg_UserMsg,
-            "One reason this could happen is that you have a shared object");
-         VG_(message)(Vg_UserMsg,
-            " whose name doesn't contain the characters \".so\", so Valgrind ");
-         VG_(message)(Vg_UserMsg,
-            "naively assumes it is the executable.  ");
-         VG_(message)(Vg_UserMsg,
-            "In that case, rename it appropriately.");
-         VG_(panic)("VG_ASSUMED_EXE_BASE doesn't match reality");
-      }
-   }
-    
-   /* Disabled debug output describing the mapping. */
-   if (0)
-      VG_(message)(Vg_DebugMsg, 
-                   "initial map %8x-%8x %c%c%c? %8x (%d) (%s)",
-                   start,start+size,rr,ww,xx,foffset,
-                   size, filename?filename:(UChar*)"NULL");
-
-   /* The segment holding the current stack pointer is the stack. */
-   r_esp = VG_(baseBlock)[VGOFF_(m_esp)];
-   is_stack_segment = start <= r_esp && r_esp < start+size;
-
-   /* Figure out the segment's permissions.
-
-      As the code stands, any segment with at least one of r, w or x
-      set is marked both addressible and valid in its entirety;
-      read-only and read-write segments are NOT distinguished.  Only
-      a segment with none of the three permissions would be marked
-      unaddressible and invalid.  */
-
-   /* ToDo: make this less bogus. */
-   if (rr != 'r' && xx != 'x' && ww != 'w') {
-      /* Very bogus; this path never gets taken. */
-      /* A no, V no */
-      example_a_bit = VGM_BIT_INVALID;
-      example_v_bit = VGM_BIT_INVALID;
-   } else {
-      /* A yes, V yes */
-      example_a_bit = VGM_BIT_VALID;
-      example_v_bit = VGM_BIT_VALID;
-      /* Causes a lot of errs for unknown reasons. 
-         if (filename is valgrind.so 
-               [careful about end conditions on filename]) {
-            example_a_bit = VGM_BIT_INVALID;
-            example_v_bit = VGM_BIT_INVALID;
-         }
-      */
-   }
-
-   set_address_range_perms ( start, size, 
-                             example_a_bit, example_v_bit );
-
-   if (is_stack_segment) {
-      /* This is the stack segment.  Mark all below %esp as
-         noaccess. */
-      if (0)
-         VG_(message)(Vg_DebugMsg, 
-                      "invalidating stack area: %x .. %x",
-                      start,r_esp);
-      VGM_(make_noaccess)( start, r_esp-start );
-   }
-}
-
-
-/* Initialise the memory audit system: fill the distinguished
-   secondary map with "invalid", point every primary map entry at
-   it, apply the initial /proc/self/maps layout, mark the shadow
-   registers valid, record the current end of the data segment and
-   load the suppressions. */
-void VGM_(init_memory_audit) ( void )
-{
-   Int k;
-
-   init_prof_mem();
-
-   /* The distinguished secondary map: all 8192 A bytes and all 65536
-      V bytes are set to invalid. */
-   for (k = 0; k < 65536; k++) {
-      if (k < 8192)
-         vg_distinguished_secondary_map.abits[k] = VGM_BYTE_INVALID;
-      vg_distinguished_secondary_map.vbyte[k] = VGM_BYTE_INVALID;
-   }
-
-   /* All 262144 primary map entries start out at the distinguished
-      map.  The first 65536 gradually get overwritten as the used
-      address space expands; the remainder should never change, and
-      it is a bug in Valgrind if they do. */
-   for (k = 0; k < 262144; k++)
-      VG_(primary_map)[k] = &vg_distinguished_secondary_map;
-
-   /* Set up our own maps from the process's initial memory layout,
-      as read from the /proc filesystem. */
-   VG_(read_procselfmaps) ( init_memory_audit_callback );
-
-   /* Claim that all the integer registers and %eflags hold valid
-      values to begin with. */
-   VG_(baseBlock)[VGOFF_(sh_eax)]    = VGM_WORD_VALID;
-   VG_(baseBlock)[VGOFF_(sh_ecx)]    = VGM_WORD_VALID;
-   VG_(baseBlock)[VGOFF_(sh_edx)]    = VGM_WORD_VALID;
-   VG_(baseBlock)[VGOFF_(sh_ebx)]    = VGM_WORD_VALID;
-   VG_(baseBlock)[VGOFF_(sh_esp)]    = VGM_WORD_VALID;
-   VG_(baseBlock)[VGOFF_(sh_ebp)]    = VGM_WORD_VALID;
-   VG_(baseBlock)[VGOFF_(sh_esi)]    = VGM_WORD_VALID;
-   VG_(baseBlock)[VGOFF_(sh_edi)]    = VGM_WORD_VALID;
-   VG_(baseBlock)[VGOFF_(sh_eflags)] = VGM_EFLAGS_VALID;
-
-   /* Remember where the data segment ends, so that vg_syscall_mem.c
-      can make sense of calls to brk(). */
-   VGM_(curr_dataseg_end) = (Addr)VG_(brk)(0);
-   if (VGM_(curr_dataseg_end) == (Addr)(-1))
-      VG_(panic)("vgm_init_memory_audit: can't determine data-seg end");
-
-   if (0)
-      VG_(printf)("DS END is %p\n", (void*)VGM_(curr_dataseg_end));
-
-   /* Finally, read the list of errors to suppress, from the file
-      specified by vg_clo_suppressions. */
-   VG_(load_suppressions)();
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Low-level address-space scanning, for the leak       ---*/
-/*--- detector.                                            ---*/
-/*------------------------------------------------------------*/
-
-/* Jump buffer shared between VG_(scan_all_valid_memory), which sets
-   it with __builtin_setjmp, and the signal handler below. */
-static 
-jmp_buf memscan_jmpbuf;
-
-/* Signal handler installed for SIGBUS and SIGSEGV during the memory
-   scan.  Ignores sigNo and jumps back to the point recorded in
-   memscan_jmpbuf. */
-static
-void vg_scan_all_valid_memory_sighandler ( Int sigNo )
-{
-   __builtin_longjmp(memscan_jmpbuf, 1);
-}
-
-/* Walk the whole 32-bit address space a page at a time and call
-   notify_word(addr, word) for every 4-aligned word whose A bits are
-   VGM_NIBBLE_VALID and whose V bytes are VGM_WORD_VALID.  Pages
-   whose secondary map is the distinguished one are skipped, as are
-   pages whose first word faults when read.  SIGBUS and SIGSEGV
-   handlers are installed for the duration of the scan and the
-   previous signal state is restored afterwards.  Returns the number
-   of words passed to notify_word. */
-UInt VG_(scan_all_valid_memory) ( void (*notify_word)( Addr, UInt ) )
-{
-   /* All volatile, because some gccs seem paranoid about longjmp(). */
-   volatile UInt res, numPages, page, vbytes, primaryMapNo, nWordsNotified;
-   volatile Addr pageBase, addr;
-   volatile SecMap* sm;
-   volatile UChar abits;
-   volatile UInt page_first_word;
-
-   vki_ksigaction sigbus_saved;
-   vki_ksigaction sigbus_new;
-   vki_ksigaction sigsegv_saved;
-   vki_ksigaction sigsegv_new;
-   vki_ksigset_t  blockmask_saved;
-   vki_ksigset_t  unblockmask_new;
-
-   /* Temporarily install a new sigsegv and sigbus handler, and make
-      sure SIGBUS, SIGSEGV and SIGTERM are unblocked.  (Perhaps the
-      first two can never be blocked anyway?)  */
-
-   /* NOTE(review): the asserts below compare against 0, 0+0, 0+0+0,
-      ... -- presumably so that each assertion has distinct text and
-      a failure identifies which call went wrong; confirm. */
-
-   sigbus_new.ksa_handler = vg_scan_all_valid_memory_sighandler;
-   sigbus_new.ksa_flags = VKI_SA_ONSTACK | VKI_SA_RESTART;
-   sigbus_new.ksa_restorer = NULL;
-   res = VG_(ksigemptyset)( &sigbus_new.ksa_mask );
-   vg_assert(res == 0);
-
-   sigsegv_new.ksa_handler = vg_scan_all_valid_memory_sighandler;
-   sigsegv_new.ksa_flags = VKI_SA_ONSTACK | VKI_SA_RESTART;
-   sigsegv_new.ksa_restorer = NULL;
-   res = VG_(ksigemptyset)( &sigsegv_new.ksa_mask );
-   vg_assert(res == 0+0);
-
-   /* Build the set of signals to unblock; the results are OR-ed
-      together so that a single assert covers all four calls. */
-   res =  VG_(ksigemptyset)( &unblockmask_new );
-   res |= VG_(ksigaddset)( &unblockmask_new, VKI_SIGBUS );
-   res |= VG_(ksigaddset)( &unblockmask_new, VKI_SIGSEGV );
-   res |= VG_(ksigaddset)( &unblockmask_new, VKI_SIGTERM );
-   vg_assert(res == 0+0+0);
-
-   /* Install the handlers, saving the old actions and signal mask
-      for restoration at the end. */
-   res = VG_(ksigaction)( VKI_SIGBUS, &sigbus_new, &sigbus_saved );
-   vg_assert(res == 0+0+0+0);
-
-   res = VG_(ksigaction)( VKI_SIGSEGV, &sigsegv_new, &sigsegv_saved );
-   vg_assert(res == 0+0+0+0+0);
-
-   res = VG_(ksigprocmask)( VKI_SIG_UNBLOCK, &unblockmask_new, &blockmask_saved );
-   vg_assert(res == 0+0+0+0+0+0);
-
-   /* The signal handlers are installed.  Actually do the memory scan. */
-   numPages = 1 << (32-VKI_BYTES_PER_PAGE_BITS);
-   vg_assert(numPages == 1048576);
-   vg_assert(4096 == (1 << VKI_BYTES_PER_PAGE_BITS));
-
-   nWordsNotified = 0;
-
-   for (page = 0; page < numPages; page++) {
-      pageBase = page << VKI_BYTES_PER_PAGE_BITS;
-      primaryMapNo = pageBase >> 16;
-      sm = VG_(primary_map)[primaryMapNo];
-      /* Nothing to scan in pages still covered by the distinguished
-         secondary map. */
-      if (IS_DISTINGUISHED_SM(sm)) continue;
-      /* The setjmp is redone for every page; the signal handler
-         longjmps back here if the probe read below faults. */
-      if (__builtin_setjmp(memscan_jmpbuf) == 0) {
-         /* try this ... */
-         page_first_word = * (volatile UInt*)pageBase;
-         /* we get here if we didn't get a fault */
-         /* Scan the page */
-         for (addr = pageBase; addr < pageBase+VKI_BYTES_PER_PAGE; addr += 4) {
-            abits  = get_abits4_ALIGNED(addr);
-            vbytes = get_vbytes4_ALIGNED(addr);
-            /* Only fully addressible, fully valid words are
-               reported. */
-            if (abits == VGM_NIBBLE_VALID 
-                && vbytes == VGM_WORD_VALID) {
-               nWordsNotified++;
-               notify_word ( addr, *(UInt*)addr );
-	    }
-         }
-      } else {
-         /* We get here if reading the first word of the page caused a
-            fault, which in turn caused the signal handler to longjmp.
-            Ignore this page. */
-         if (0)
-         VG_(printf)(
-            "vg_scan_all_valid_memory_sighandler: ignoring page at %p\n",
-            (void*)pageBase 
-         );
-      }
-   }
-
-   /* Restore signal state to whatever it was before. */
-   res = VG_(ksigaction)( VKI_SIGBUS, &sigbus_saved, NULL );
-   vg_assert(res == 0 +0);
-
-   res = VG_(ksigaction)( VKI_SIGSEGV, &sigsegv_saved, NULL );
-   vg_assert(res == 0 +0 +0);
-
-   res = VG_(ksigprocmask)( VKI_SIG_SETMASK, &blockmask_saved, NULL );
-   vg_assert(res == 0 +0 +0 +0);
-
-   return nWordsNotified;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Detecting leaked (unreachable) malloc'd blocks.      ---*/
-/*------------------------------------------------------------*/
-
-/* A block is either 
-   -- Proper-ly reached; a pointer to its start has been found
-   -- Interior-ly reached; only an interior pointer to it has been found
-   -- Unreached; so far, no pointers to any part of it have been found. 
-*/
-typedef 
-   enum { Unreached, Interior, Proper } 
-   Reachedness;
-
-/* A block record, used for generating err msgs.  One record
-   accumulates a block count and byte total for one allocation
-   context and reachability. */
-typedef
-   struct _LossRecord {
-      /* Link to another record -- presumably the next entry of a
-         singly-linked list; confirm against the code that builds
-         the list. */
-      struct _LossRecord* next;
-      /* Where these lost blocks were allocated. */
-      ExeContext*  allocated_at;
-      /* Their reachability. */
-      Reachedness  loss_mode;
-      /* Number of blocks and total # bytes involved. */
-      UInt         total_bytes;
-      UInt         num_blocks;
-   }
-   LossRecord;
-
-
-/* Find the i such that ptr points at or inside the block described by
-   shadows[i].  Return -1 if none found.  This assumes that shadows[]
-   has been sorted on the ->data field. */
-
-#ifdef VG_DEBUG_LEAKCHECK
-/* Straightforward linear search, kept only to sanity-check the fast
-   binary-search mechanism.  Returns the index of the first shadow
-   whose block contains ptr, or -1 if there is none. */
-static Int find_shadow_for_OLD ( Addr          ptr, 
-                                 ShadowChunk** shadows,
-                                 Int           n_shadows )
-
-{
-   Int  k = 0;
-   Addr first, last;
-   PROF_EVENT(70);
-   while (k < n_shadows) {
-      PROF_EVENT(71);
-      /* Inclusive address range occupied by block k. */
-      first = shadows[k]->data;
-      last  = ((Addr)shadows[k]->data) + shadows[k]->size - 1;
-      if (ptr >= first && ptr <= last)
-         return k;
-      k++;
-   }
-   return -1;
-}
-#endif
-
-
-/* Binary search over shadows[0 .. n_shadows-1] for the block that
-   ptr points at or into.  Returns its index, or -1 if ptr falls
-   inside none of the blocks. */
-static Int find_shadow_for ( Addr          ptr, 
-                             ShadowChunk** shadows,
-                             Int           n_shadows )
-{
-   Addr a_mid_lo, a_mid_hi;
-   Int  left   = 0;
-   Int  right  = n_shadows-1;
-   Int  probe;
-   Int  retVal = -1;
-
-   PROF_EVENT(70);
-
-   for (;;) {
-      PROF_EVENT(71);
-
-      /* [left, right], inclusive, is what remains to be searched;
-         once it is empty the search has failed. */
-      if (left > right)
-         break;
-
-      probe    = (left + right) / 2;
-      a_mid_lo = shadows[probe]->data;
-      a_mid_hi = ((Addr)shadows[probe]->data) + shadows[probe]->size - 1;
-
-      if (ptr < a_mid_lo) {
-         /* Target, if present, lies in the lower half. */
-         right = probe-1;
-      } else if (ptr > a_mid_hi) {
-         /* Target, if present, lies in the upper half. */
-         left = probe+1;
-      } else {
-         vg_assert(ptr >= a_mid_lo && ptr <= a_mid_hi);
-         retVal = probe;
-         break;
-      }
-   }
-
-#  ifdef VG_DEBUG_LEAKCHECK
-   vg_assert(retVal == find_shadow_for_OLD ( ptr, shadows, n_shadows ));
-#  endif
-   return retVal;
-}
-
-
-
-/* Sort shadows[0 .. n_shadows-1] in place into ascending order of
-   their ->data field, using a shell sort with the fixed increment
-   table incs[] (each entry is three times the previous one, plus
-   one).  Arrays of fewer than two elements are left untouched. */
-static void sort_malloc_shadows ( ShadowChunk** shadows, UInt n_shadows )
-{
-   Int   incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
-                      9841, 29524, 88573, 265720,
-                      797161, 2391484 };
-   Int          n_incs = sizeof(incs) / sizeof(incs[0]);
-   Int          lo = 0;
-   Int          hi = n_shadows-1;
-   Int          i, j, h, bigN, hp;
-   ShadowChunk* v;
-
-   PROF_EVENT(72);
-   bigN = hi - lo + 1; if (bigN < 2) return;
-
-   /* Find the largest increment smaller than bigN.  The search must
-      stop at the end of the table: without the bound, an array with
-      more elements than the last increment would make this loop
-      read past the end of incs[]. */
-   hp = 0; while (hp < n_incs && incs[hp] < bigN) hp++; hp--;
-
-   /* One insertion-sort pass per increment, largest first. */
-   for (; hp >= 0; hp--) {
-      PROF_EVENT(73);
-      h = incs[hp];
-      i = lo + h;
-      while (1) {
-         PROF_EVENT(74);
-         if (i > hi) break;
-         v = shadows[i];
-         j = i;
-         /* Shift larger elements up by h until v's slot is found. */
-         while (shadows[j-h]->data > v->data) {
-            PROF_EVENT(75);
-            shadows[j] = shadows[j-h];
-            j = j - h;
-            /* Stop before j-h would index below lo. */
-            if (j <= (lo + h - 1)) break;
-         }
-         shadows[j] = v;
-         i++;
-      }
-   }
-}
-
-/* Globals, for the callback used by VG_(detect_memory_leaks). */
-
-static ShadowChunk** vglc_shadows;
-static Int           vglc_n_shadows;
-static Reachedness*  vglc_reachedness;
-static Addr          vglc_min_mallocd_addr;
-static Addr          vglc_max_mallocd_addr;
-
-/* Callback for the memory scan: 'word_at_a' is the word found at
-   address 'a'.  If that word points at or into one of the blocks in
-   vglc_shadows, the block's entry in vglc_reachedness is upgraded:
-   to Proper for a pointer to the block's start, or from Unreached to
-   Interior for a pointer into its middle. */
-static 
-void vg_detect_memory_leaks_notify_addr ( Addr a, UInt word_at_a )
-{
-   Int  sh_no;
-   Addr ptr = (Addr)word_at_a;
-
-   /* Ignore words stored in places known to hold bogus pointers:
-      VG_(stack), VG_(m_state_static), and the two range variables
-      used below.  This is a complete kludge, mainly there to stop
-      trivial test programs such as
-
-         int main (void) { char* a = malloc(100); return 0; }
-
-      having their block reported as still reachable.  Ignoring all
-      of valgrind.so's .data and .bss would be better, but there is
-      no known reliable way of finding out where .bss has been put.
-      Note that both range tests include the address one past the
-      end of the object. */
-   if (a >= ((Addr)(&VG_(stack)))
-       && a <= ((Addr)(&VG_(stack))) + sizeof(VG_(stack)))
-      return;
-   if (a >= ((Addr)(&VG_(m_state_static)))
-       && a <= ((Addr)(&VG_(m_state_static))) + sizeof(VG_(m_state_static)))
-      return;
-   if (a == (Addr)(&vglc_min_mallocd_addr)
-       || a == (Addr)(&vglc_max_mallocd_addr))
-      return;
-
-   /* Cheap rejection: outside the overall span of malloc'd blocks. */
-   if (ptr < vglc_min_mallocd_addr || ptr > vglc_max_mallocd_addr)
-      return;
-
-   /* Might be legitimate; look for the block concerned. */
-   sh_no = find_shadow_for ( ptr, vglc_shadows, vglc_n_shadows );
-   if (sh_no == -1)
-      return;
-
-   /* Found a block at/into which ptr points. */
-   vg_assert(sh_no >= 0 && sh_no < vglc_n_shadows);
-   vg_assert(ptr < vglc_shadows[sh_no]->data 
-                   + vglc_shadows[sh_no]->size);
-
-   /* Decide whether Proper-ly or Interior-ly reached.  Proper always
-      wins; Interior never overrides an earlier Proper. */
-   if (ptr == vglc_shadows[sh_no]->data) {
-      if (0) VG_(printf)("pointer at %p to %p\n", a, word_at_a );
-      vglc_reachedness[sh_no] = Proper;
-   } else if (vglc_reachedness[sh_no] == Unreached) {
-      vglc_reachedness[sh_no] = Interior;
-   }
-}
-
-
-/* Leak detector.  Fetches the shadows of all not-freed malloc'd
-   blocks, scans all valid memory for words pointing at or into them,
-   and classifies each block as definitely lost (Unreached), possibly
-   lost (only Interior pointers found) or still reachable (a Proper
-   pointer to its start was found).  Prints totals, then one loss
-   record per (loss mode, allocation context) pair in order of
-   increasing total size, then a summary.  All storage allocated here
-   is released before returning. */
-void VG_(detect_memory_leaks) ( void )
-{
-   Int    i;
-   Int    blocks_leaked, bytes_leaked;
-   Int    blocks_dubious, bytes_dubious;
-   Int    blocks_reachable, bytes_reachable;
-   Int    n_lossrecords;
-   UInt   bytes_notified;
-   
-   LossRecord*  errlist;
-   LossRecord*  p;
-
-   Bool (*ec_comparer_fn) ( ExeContext*, ExeContext* );
-   PROF_EVENT(76);
-   vg_assert(VG_(clo_instrument));
-
-   /* Decide how closely we want to match ExeContexts in leak
-      records. */
-   switch (VG_(clo_leak_resolution)) {
-      case 2: 
-         ec_comparer_fn = VG_(eq_ExeContext_top2); 
-         break;
-      case 4: 
-         ec_comparer_fn = VG_(eq_ExeContext_top4); 
-         break;
-      case VG_DEEPEST_BACKTRACE: 
-         ec_comparer_fn = VG_(eq_ExeContext_all); 
-         break;
-      default: 
-         VG_(panic)("VG_(detect_memory_leaks): "
-                    "bad VG_(clo_leak_resolution)");
-         break;
-   }
-
-   /* vg_get_malloc_shadows allocates storage for shadows */
-   vglc_shadows = VG_(get_malloc_shadows)( &vglc_n_shadows );
-   if (vglc_n_shadows == 0) {
-      vg_assert(vglc_shadows == NULL);
-      VG_(message)(Vg_UserMsg, 
-                   "No malloc'd blocks -- no leaks are possible.\n");
-      return;
-   }
-
-   VG_(message)(Vg_UserMsg, 
-                "searching for pointers to %d not-freed blocks.", 
-                vglc_n_shadows );
-   sort_malloc_shadows ( vglc_shadows, vglc_n_shadows );
-
-   /* Sanity check; assert that the blocks are now in order and that
-      they don't overlap. */
-   for (i = 0; i < vglc_n_shadows-1; i++) {
-      vg_assert( ((Addr)vglc_shadows[i]->data)
-                 < ((Addr)vglc_shadows[i+1]->data) );
-      vg_assert( ((Addr)vglc_shadows[i]->data) + vglc_shadows[i]->size
-                 < ((Addr)vglc_shadows[i+1]->data) );
-   }
-
-   /* Address range spanned by the blocks; the notify function uses it
-      as a cheap first filter on candidate pointers. */
-   vglc_min_mallocd_addr = ((Addr)vglc_shadows[0]->data);
-   vglc_max_mallocd_addr = ((Addr)vglc_shadows[vglc_n_shadows-1]->data)
-                         + vglc_shadows[vglc_n_shadows-1]->size - 1;
-
-   vglc_reachedness 
-      = VG_(malloc)( VG_AR_PRIVATE, vglc_n_shadows * sizeof(Reachedness) );
-   for (i = 0; i < vglc_n_shadows; i++)
-      vglc_reachedness[i] = Unreached;
-
-   /* Do the scan of memory. */
-   bytes_notified
-       = VG_(scan_all_valid_memory)( &vg_detect_memory_leaks_notify_addr )
-         * VKI_BYTES_PER_WORD;
-
-   VG_(message)(Vg_UserMsg, "checked %d bytes.", bytes_notified);
-
-   blocks_leaked    = bytes_leaked    = 0;
-   blocks_dubious   = bytes_dubious   = 0;
-   blocks_reachable = bytes_reachable = 0;
-
-   for (i = 0; i < vglc_n_shadows; i++) {
-      if (vglc_reachedness[i] == Unreached) {
-         blocks_leaked++;
-         bytes_leaked += vglc_shadows[i]->size;
-      }
-      else if (vglc_reachedness[i] == Interior) {
-         blocks_dubious++;
-         bytes_dubious += vglc_shadows[i]->size;
-      }
-      else if (vglc_reachedness[i] == Proper) {
-         blocks_reachable++;
-         bytes_reachable += vglc_shadows[i]->size;
-      }
-   }
-
-   VG_(message)(Vg_UserMsg, "");
-   VG_(message)(Vg_UserMsg, "definitely lost: %d bytes in %d blocks.", 
-                            bytes_leaked, blocks_leaked );
-   VG_(message)(Vg_UserMsg, "possibly lost:   %d bytes in %d blocks.", 
-                            bytes_dubious, blocks_dubious );
-   VG_(message)(Vg_UserMsg, "still reachable: %d bytes in %d blocks.", 
-                            bytes_reachable, blocks_reachable );
-
-
-   /* Common up the lost blocks so we can print sensible error
-      messages. */
-
-   n_lossrecords = 0;
-   errlist       = NULL;
-   for (i = 0; i < vglc_n_shadows; i++) {
-      /* Look for an existing record with the same loss mode and a
-         matching allocation context. */
-      for (p = errlist; p != NULL; p = p->next) {
-         if (p->loss_mode == vglc_reachedness[i]
-             && ec_comparer_fn (
-                   p->allocated_at, 
-                   vglc_shadows[i]->where) ) {
-            break;
-         }
-      }
-      if (p != NULL) {
-         p->num_blocks  ++;
-         p->total_bytes += vglc_shadows[i]->size;
-      } else {
-         n_lossrecords ++;
-         p = VG_(malloc)(VG_AR_PRIVATE, sizeof(LossRecord));
-         p->loss_mode    = vglc_reachedness[i];
-         p->allocated_at = vglc_shadows[i]->where;
-         p->total_bytes  = vglc_shadows[i]->size;
-         p->num_blocks   = 1;
-         p->next         = errlist;
-         errlist         = p;
-      }
-   }
-   
-   /* Print the records smallest-first.  A record is marked as dealt
-      with by zeroing its num_blocks. */
-   for (i = 0; i < n_lossrecords; i++) {
-      LossRecord* p_min = NULL;
-      UInt        n_min = 0xFFFFFFFF;
-      for (p = errlist; p != NULL; p = p->next) {
-         if (p->num_blocks > 0 && p->total_bytes < n_min) {
-            n_min = p->total_bytes;
-            p_min = p;
-         }
-      }
-      vg_assert(p_min != NULL);
-
-      if ( (!VG_(clo_show_reachable)) && p_min->loss_mode == Proper) {
-         p_min->num_blocks = 0;
-         continue;
-      }
-
-      VG_(message)(Vg_UserMsg, "");
-      VG_(message)(
-         Vg_UserMsg,
-         "%d bytes in %d blocks are %s in loss record %d of %d",
-         p_min->total_bytes, p_min->num_blocks,
-         p_min->loss_mode==Unreached ? "definitely lost" :
-            (p_min->loss_mode==Interior ? "possibly lost"
-                                        : "still reachable"),
-         i+1, n_lossrecords
-      );
-      VG_(pp_ExeContext)(p_min->allocated_at);
-      p_min->num_blocks = 0;
-   }
-
-   VG_(message)(Vg_UserMsg, "");
-   VG_(message)(Vg_UserMsg, "LEAK SUMMARY:");
-   VG_(message)(Vg_UserMsg, "   definitely lost: %d bytes in %d blocks.", 
-                            bytes_leaked, blocks_leaked );
-   VG_(message)(Vg_UserMsg, "   possibly lost:   %d bytes in %d blocks.", 
-                            bytes_dubious, blocks_dubious );
-   VG_(message)(Vg_UserMsg, "   still reachable: %d bytes in %d blocks.", 
-                            bytes_reachable, blocks_reachable );
-   if (!VG_(clo_show_reachable)) {
-      VG_(message)(Vg_UserMsg, 
-         "Reachable blocks (those to which a pointer was found) are not shown.");
-      VG_(message)(Vg_UserMsg, 
-         "To see them, rerun with: --show-reachable=yes");
-   }
-   VG_(message)(Vg_UserMsg, "");
-
-   /* Release the loss records; they are not needed once printed. */
-   while (errlist != NULL) {
-      p = errlist->next;
-      VG_(free) ( VG_AR_PRIVATE, errlist );
-      errlist = p;
-   }
-
-   VG_(free) ( VG_AR_PRIVATE, vglc_shadows );
-   VG_(free) ( VG_AR_PRIVATE, vglc_reachedness );
-}
-
-
-/* ---------------------------------------------------------------------
-   Sanity check machinery (permanently engaged).
-   ------------------------------------------------------------------ */
-
-/* Check that nobody has spuriously claimed that the first or last 16
-   pages (64 KB) of address space have become accessible.  A failure
-   here does not per se indicate an internal consistency problem, but
-   it is so likely to mean one that we really want to know about it.
-   Returns True iff both primary map entries 0 and 65535 still refer
-   to the distinguished secondary map. */
-
-Bool VG_(first_and_last_secondaries_look_plausible) ( void )
-{
-   if (!IS_DISTINGUISHED_SM(VG_(primary_map)[0]))
-      return False;
-   if (!IS_DISTINGUISHED_SM(VG_(primary_map)[65535]))
-      return False;
-   return True;
-}
-
-
-/* A fast sanity check -- suitable for calling circa once per
-   millisecond.  Does nothing when VG_(sanity_level) is below 1.
-   Cheap checks run on every call; the expensive ones run when
-   force_expensive is set, when VG_(sanity_level) exceeds 1, or on
-   every 25th call at level 1. */
-
-void VG_(do_sanity_checks) ( Bool force_expensive )
-{
-   Int          i;
-   Bool         do_expensive_checks;
-
-   if (VG_(sanity_level) < 1) 
-      return;
-
-   /* --- The quick tests. --- */
-
-   VG_(sanity_fast_count)++;
-
-   /* The first and last 10 words of our private stack must still hold
-      their address-derived marker values, else we have overrun it. */
-   i = 0;
-   while (i < 10) {
-      vg_assert(VG_(stack)[i]
-                == ((UInt)(&VG_(stack)[i]) ^ 0xA4B3C2D1));
-      vg_assert(VG_(stack)[10000-1-i] 
-                == ((UInt)(&VG_(stack)[10000-i-1]) ^ 0xABCD4321));
-      i++;
-   }
-
-   /* Memory check system: the first and last 16 pages of memory must
-      not have been marked as accessible. */
-   if (VG_(clo_instrument))
-      vg_assert(VG_(first_and_last_secondaries_look_plausible)());
-
-   /* --- The expensive tests. --- */
-
-   if (force_expensive
-       || VG_(sanity_level) > 1
-       || (VG_(sanity_level) == 1 
-           && (VG_(sanity_fast_count) % 25) == 0))
-      do_expensive_checks = True;
-   else
-      do_expensive_checks = False;
-
-   if (do_expensive_checks) {
-      VG_(sanity_slow_count)++;
-
-#     if 0
-      { void zzzmemscan(void); zzzmemscan(); }
-#     endif
-
-      if ((VG_(sanity_fast_count) % 250) == 0)
-         VG_(sanity_check_tc_tt)();
-
-      if (VG_(clo_instrument)) {
-         /* The distinguished secondary must be untouched. */
-         i = 0;
-         while (i < 8192) {
-            vg_assert(vg_distinguished_secondary_map.abits[i] 
-                      == VGM_BYTE_INVALID);
-            i++;
-         }
-         i = 0;
-         while (i < 65536) {
-            vg_assert(vg_distinguished_secondary_map.vbyte[i] 
-                      == VGM_BYTE_INVALID);
-            i++;
-         }
-
-         /* The upper 3/4 of the primary map must still point at the
-            distinguished secondary. */
-         i = 65536;
-         while (i < 262144) {
-            vg_assert(VG_(primary_map)[i] 
-                      == & vg_distinguished_secondary_map);
-            i++;
-         }
-      }
-      /* 
-      if ((VG_(sanity_fast_count) % 500) == 0) VG_(mallocSanityCheckAll)(); 
-      */
-   }
-
-   /* Check sanity of the low-level memory manager.  Bugs in the
-      client's code can cause this to fail, and it is potentially very
-      expensive, so it only runs when specially asked for. */
-   if (VG_(sanity_level) > 1)
-      VG_(mallocSanityCheckAll)();
-}
-
-
-/* ---------------------------------------------------------------------
-   Debugging machinery (turn on to debug).  Something of a mess.
-   ------------------------------------------------------------------ */
-
-/* Print the value tags on the 8 integer registers & flag reg. */
-
-/* Helper: write x into str as 32 binary digits, most significant bit
-   first, with a space after each of the first three groups of 8
-   digits, followed by a terminating zero byte.  str must point to a
-   space of at least 36 bytes. */
-static void uint_to_bits ( UInt x, Char* str )
-{
-   Int bitno = 31;
-   Int w     = 0;
-   while (bitno >= 0) {
-      if (x & ( ((UInt)1) << bitno))
-         str[w] = '1';
-      else
-         str[w] = '0';
-      w++;
-      if (bitno == 24 || bitno == 16 || bitno == 8) {
-         str[w] = ' ';
-         w++;
-      }
-      bitno--;
-   }
-   str[w] = 0;
-   w++;
-   vg_assert(w == 36);
-}
-
-/* Print, as debug messages, the shadow (tag) words of %eflags and the
-   eight integer registers in binary.
-
-   Caution!  Not vthread-safe; looks in VG_(baseBlock), not the thread
-   state table. */
-
-void VG_(show_reg_tags) ( void )
-{
-   /* uint_to_bits needs 36 bytes per rendered word. */
-   Char buf1[36];
-   Char buf2[36];
-   UInt z_eax, z_ebx, z_ecx, z_edx, 
-        z_esi, z_edi, z_ebp, z_esp, z_eflags;
-
-   z_eax    = VG_(baseBlock)[VGOFF_(sh_eax)];
-   z_ebx    = VG_(baseBlock)[VGOFF_(sh_ebx)];
-   z_ecx    = VG_(baseBlock)[VGOFF_(sh_ecx)];
-   z_edx    = VG_(baseBlock)[VGOFF_(sh_edx)];
-   z_esi    = VG_(baseBlock)[VGOFF_(sh_esi)];
-   z_edi    = VG_(baseBlock)[VGOFF_(sh_edi)];
-   z_ebp    = VG_(baseBlock)[VGOFF_(sh_ebp)];
-   z_esp    = VG_(baseBlock)[VGOFF_(sh_esp)];
-   z_eflags = VG_(baseBlock)[VGOFF_(sh_eflags)];
-   
-   /* The format previously read "efl %\n": with no conversion letter
-      the bit string was never printed. */
-   uint_to_bits(z_eflags, buf1);
-   VG_(message)(Vg_DebugMsg, "efl %s\n", buf1);
-
-   uint_to_bits(z_eax, buf1);
-   uint_to_bits(z_ebx, buf2);
-   VG_(message)(Vg_DebugMsg, "eax %s   ebx %s\n", buf1, buf2);
-
-   uint_to_bits(z_ecx, buf1);
-   uint_to_bits(z_edx, buf2);
-   VG_(message)(Vg_DebugMsg, "ecx %s   edx %s\n", buf1, buf2);
-
-   uint_to_bits(z_esi, buf1);
-   uint_to_bits(z_edi, buf2);
-   VG_(message)(Vg_DebugMsg, "esi %s   edi %s\n", buf1, buf2);
-
-   uint_to_bits(z_ebp, buf1);
-   uint_to_bits(z_esp, buf2);
-   VG_(message)(Vg_DebugMsg, "ebp %s   esp %s\n", buf1, buf2);
-}
-
-
-#if 0
-/* Debugging aid only (compiled out).  Walks the address space and
-   touches every word that is allegedly addressible, which helps to
-   establish where Valgrind's idea of addressibility has diverged
-   from what the kernel believes. */
-
-/* Per-word callback; deliberately does nothing. */
-static 
-void zzzmemscan_notify_word ( Addr a, UInt w )
-{
-}
-
-/* Run the scan and print the number of bytes visited. */
-void zzzmemscan ( void )
-{
-   Int n_words;
-   n_words = VG_(scan_all_valid_memory)( zzzmemscan_notify_word );
-   VG_(printf)("zzzmemscan: n_bytes = %d\n", 4 * n_words );
-}
-#endif
-
-
-
-
-#if 0
-/* Debugging aid only (compiled out).  Prints the counter zzz and the
-   register tags, then translates the code at eip_next.  The call to
-   VG_(show_reg_tags) previously had an unbalanced parenthesis and
-   passed an argument to a function that takes none, so this block
-   could not have compiled if enabled. */
-static Int zzz = 0;
-
-void show_bb ( Addr eip_next )
-{
-   VG_(printf)("[%4d] ", zzz);
-   VG_(show_reg_tags)();
-   VG_(translate) ( eip_next, NULL, NULL, NULL );
-}
-#endif /* 0 */
-
-/*--------------------------------------------------------------------*/
-/*--- end                                              vg_memory.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_messages.c b/coregrind/vg_messages.c
deleted file mode 100644
index 3eaf8cd53e..0000000000
--- a/coregrind/vg_messages.c
+++ /dev/null
@@ -1,104 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- For sending error/informative messages.                      ---*/
-/*---                                                 vg_message.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-
-#include "vg_include.h"
-
-
-static char vg_mbuf[M_VG_MSGBUF];
-static int vg_n_mbuf;
-
-/* Append c to the message buffer, keeping the buffer zero-terminated.
-   The character is silently dropped once the buffer is full. */
-static void add_to_buf ( Char c )
-{
-  if (vg_n_mbuf < (M_VG_MSGBUF-1)) {
-     vg_mbuf[vg_n_mbuf] = c;
-     vg_n_mbuf++;
-     vg_mbuf[vg_n_mbuf] = 0;
-  }
-}
-
-
-/* Publicly visible from here onwards. */
-
-/* Format the arguments printf-style and append the result to the
-   message currently being built in the message buffer. */
-void
-VG_(add_to_msg) ( Char *format, ... )
-{
-   va_list ap;
-
-   va_start(ap, format);
-   VG_(vprintf) ( add_to_buf, format, ap );
-   va_end(ap);
-}
-
-/* Send a simple single-part message: start a message of the given
-   kind, append the printf-style formatted text, and end the message. */
-void VG_(message) ( VgMsgKind kind, Char* format, ... )
-{
-   va_list ap;
-
-   va_start(ap, format);
-   VG_(start_msg) ( kind );
-   VG_(vprintf) ( add_to_buf, format, ap );
-   va_end(ap);
-
-   VG_(end_msg)();
-}
-
-/* Begin a new message: empty the message buffer and write the prefix
-   "<cc><pid><cc> ", where <c> is '=' for user messages, '-' for debug
-   messages, '+' for extra debug messages and '?' for anything else. */
-void VG_(start_msg) ( VgMsgKind kind )
-{
-   Char tag;
-
-   vg_n_mbuf = 0;
-   vg_mbuf[vg_n_mbuf] = 0;
-
-   if (kind == Vg_UserMsg)
-      tag = '=';
-   else if (kind == Vg_DebugMsg)
-      tag = '-';
-   else if (kind == Vg_DebugExtraMsg)
-      tag = '+';
-   else
-      tag = '?';
-
-   VG_(add_to_msg)( "%c%c%d%c%c ", 
-                    tag,tag, VG_(getpid)(), tag,tag );
-}
-
-
-/* Finish the current message: if a log file descriptor is set,
-   append a newline and write the buffered text to it.  Otherwise the
-   message is discarded. */
-void VG_(end_msg) ( void )
-{
-   if (VG_(clo_logfile_fd) < 0)
-      return;
-
-   add_to_buf('\n');
-   VG_(write)(VG_(clo_logfile_fd), vg_mbuf, VG_(strlen)(vg_mbuf));
-}
-
-
-/* Logging start-up hook; currently does nothing. */
-void VG_(startup_logging) ( void )
-{
-}
-
-/* Logging shut-down hook; currently does nothing. */
-void VG_(shutdown_logging) ( void )
-{
-}
-
-/*--------------------------------------------------------------------*/
-/*--- end                                             vg_message.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_mylibc.c b/coregrind/vg_mylibc.c
deleted file mode 100644
index e32aee8d10..0000000000
--- a/coregrind/vg_mylibc.c
+++ /dev/null
@@ -1,1277 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Reimplementation of some C library stuff, to avoid depending ---*/
-/*--- on libc.so.                                                  ---*/
-/*---                                                  vg_mylibc.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-
-
-/* ---------------------------------------------------------------------
-   Really Actually DO system calls.
-   ------------------------------------------------------------------ */
-
-/* Ripped off from /usr/include/asm/unistd.h. */
-
-/* Issue system call number syscallno, with no arguments, via
-   int $0x80.  Returns the raw value the kernel leaves in %eax. */
-static
-UInt vg_do_syscall0 ( UInt syscallno )
-{ 
-   UInt __res;
-   /* The "memory" clobber tells the compiler that the kernel may read
-      or write arbitrary memory, so pending stores are completed
-      before the trap and cached values are reloaded afterwards. */
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno)
-                     : "memory" );
-   return __res;
-}
-
-
-/* Issue system call number syscallno via int $0x80 with arg1 in
-   %ebx.  Returns the raw value the kernel leaves in %eax. */
-static
-UInt vg_do_syscall1 ( UInt syscallno, UInt arg1 )
-{ 
-   UInt __res;
-   /* arg1 is often a pointer to an argument block or buffer; the
-      "memory" clobber makes the compiler complete pending stores
-      before the trap and reload memory afterwards. */
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1)
-                     : "memory" );
-   return __res;
-}
-
-
-/* Issue system call number syscallno via int $0x80 with arg1 in %ebx
-   and arg2 in %ecx.  Returns the raw value the kernel leaves in
-   %eax. */
-static
-UInt vg_do_syscall2 ( UInt syscallno, 
-                      UInt arg1, UInt arg2 )
-{ 
-   UInt __res;
-   /* The "memory" clobber makes the compiler complete pending stores
-      before the trap and reload memory afterwards, since the kernel
-      may read or write memory the arguments point at. */
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1),
-                       "c" (arg2)
-                     : "memory" );
-   return __res;
-}
-
-
-/* Issue system call number syscallno via int $0x80 with arg1..arg3
-   in %ebx, %ecx and %edx.  Returns the raw value the kernel leaves
-   in %eax. */
-static
-UInt vg_do_syscall3 ( UInt syscallno, 
-                      UInt arg1, UInt arg2, UInt arg3 )
-{ 
-   UInt __res;
-   /* The "memory" clobber makes the compiler complete pending stores
-      before the trap and reload memory afterwards, since the kernel
-      may read or write memory the arguments point at. */
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1),
-                       "c" (arg2),
-                       "d" (arg3)
-                     : "memory" );
-   return __res;
-}
-
-
-/* Issue system call number syscallno via int $0x80 with arg1..arg4
-   in %ebx, %ecx, %edx and %esi.  Returns the raw value the kernel
-   leaves in %eax. */
-static
-UInt vg_do_syscall4 ( UInt syscallno, 
-                      UInt arg1, UInt arg2, UInt arg3, UInt arg4 )
-{ 
-   UInt __res;
-   /* The "memory" clobber makes the compiler complete pending stores
-      before the trap and reload memory afterwards, since the kernel
-      may read or write memory the arguments point at. */
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1),
-                       "c" (arg2),
-                       "d" (arg3),
-                       "S" (arg4)
-                     : "memory" );
-   return __res;
-}
-
-
-#if 0
-/* Currently compiled out.  Issue system call number syscallno via
-   int $0x80 with arg1..arg5 in %ebx, %ecx, %edx, %esi and %edi.
-   Returns the raw value the kernel leaves in %eax. */
-static
-UInt vg_do_syscall5 ( UInt syscallno, 
-                      UInt arg1, UInt arg2, UInt arg3, UInt arg4, 
-                      UInt arg5 )
-{ 
-   UInt __res;
-   /* The "memory" clobber makes the compiler complete pending stores
-      before the trap and reload memory afterwards, since the kernel
-      may read or write memory the arguments point at. */
-   __asm__ volatile ("int $0x80"
-                     : "=a" (__res)
-                     : "0" (syscallno),
-                       "b" (arg1),
-                       "c" (arg2),
-                       "d" (arg3),
-                       "S" (arg4),
-                       "D" (arg5)
-                     : "memory" );
-   return __res;
-}
-#endif
-
-/* ---------------------------------------------------------------------
-   Wrappers around system calls, and other stuff, to do with signals.
-   ------------------------------------------------------------------ */
-
-/* sigemptyset, sigfillset, sigaddset and sigdelset return 0 on
-   success and -1 on error.  
-*/
-/* Set every bit in *set.  Returns -1 if set is NULL, else 0. */
-Int VG_(ksigfillset)( vki_ksigset_t* set )
-{
-   Int w = 0;
-   if (set == NULL)
-      return -1;
-   while (w < VKI_KNSIG_WORDS) {
-      set->ws[w] = 0xFFFFFFFF;
-      w++;
-   }
-   return 0;
-}
-
-/* Clear every bit in *set.  Returns -1 if set is NULL, else 0. */
-Int VG_(ksigemptyset)( vki_ksigset_t* set )
-{
-   Int w = 0;
-   if (set == NULL)
-      return -1;
-   while (w < VKI_KNSIG_WORDS) {
-      set->ws[w] = 0x0;
-      w++;
-   }
-   return 0;
-}
-
-/* True iff every word of *set is zero.  set must not be NULL. */
-Bool VG_(kisemptysigset)( vki_ksigset_t* set )
-{
-   Int w = 0;
-   vg_assert(set != NULL);
-   while (w < VKI_KNSIG_WORDS) {
-      if (set->ws[w] != 0x0) 
-         return False;
-      w++;
-   }
-   return True;
-}
-
-/* True iff every word of *set has all bits set.  set must not be
-   NULL. */
-Bool VG_(kisfullsigset)( vki_ksigset_t* set )
-{
-   Int w = 0;
-   vg_assert(set != NULL);
-   while (w < VKI_KNSIG_WORDS) {
-      if (set->ws[w] != ~0x0) 
-         return False;
-      w++;
-   }
-   return True;
-}
-
-
-/* Add signal signum (numbered from 1) to *set.  Returns 0 on
-   success, or -1 if set is NULL or signum is outside 1 .. VKI_KNSIG. */
-Int VG_(ksigaddset)( vki_ksigset_t* set, Int signum )
-{
-   if (set == NULL)
-      return -1;
-   /* Was "signum < 1 && signum > VKI_KNSIG", which can never be
-      true, so out-of-range signals indexed outside ws[]. */
-   if (signum < 1 || signum > VKI_KNSIG)
-      return -1;
-   signum--;
-   /* Unsigned 1 so that shifting into the top bit is well defined. */
-   set->ws[signum / VKI_KNSIG_BPW] |= (1U << (signum % VKI_KNSIG_BPW));
-   return 0;
-}
-
-/* Remove signal signum (numbered from 1) from *set.  Returns 0 on
-   success, or -1 if set is NULL or signum is outside 1 .. VKI_KNSIG. */
-Int VG_(ksigdelset)( vki_ksigset_t* set, Int signum )
-{
-   if (set == NULL)
-      return -1;
-   /* Was "signum < 1 && signum > VKI_KNSIG", which can never be
-      true, so out-of-range signals indexed outside ws[]. */
-   if (signum < 1 || signum > VKI_KNSIG)
-      return -1;
-   signum--;
-   /* Unsigned 1 so that shifting into the top bit is well defined. */
-   set->ws[signum / VKI_KNSIG_BPW] &= ~(1U << (signum % VKI_KNSIG_BPW));
-   return 0;
-}
-
-/* Returns 1 if signal signum (numbered from 1) is in *set, else 0.
-   Also returns 0 if set is NULL or signum is outside
-   1 .. VKI_KNSIG. */
-Int VG_(ksigismember) ( vki_ksigset_t* set, Int signum )
-{
-   if (set == NULL)
-      return 0;
-   /* Was "signum < 1 && signum > VKI_KNSIG", which can never be
-      true, so out-of-range signals indexed outside ws[]. */
-   if (signum < 1 || signum > VKI_KNSIG)
-      return 0;
-   signum--;
-   if (1 & ((set->ws[signum / VKI_KNSIG_BPW]) >> (signum % VKI_KNSIG_BPW)))
-      return 1;
-   else
-      return 0;
-}
-
-
-/* Add all signals in src to dst (word-wise OR).  Neither pointer may
-   be NULL. */
-void VG_(ksigaddset_from_set)( vki_ksigset_t* dst, vki_ksigset_t* src )
-{
-   Int w = 0;
-   vg_assert(dst != NULL && src != NULL);
-   while (w < VKI_KNSIG_WORDS) {
-      dst->ws[w] = dst->ws[w] | src->ws[w];
-      w++;
-   }
-}
-
-/* Remove all signals in src from dst (word-wise AND with the
-   complement).  Neither pointer may be NULL. */
-void VG_(ksigdelset_from_set)( vki_ksigset_t* dst, vki_ksigset_t* src )
-{
-   Int w = 0;
-   vg_assert(dst != NULL && src != NULL);
-   while (w < VKI_KNSIG_WORDS) {
-      dst->ws[w] = dst->ws[w] & ~(src->ws[w]);
-      w++;
-   }
-}
-
-
-/* The functions sigaction, sigprocmask, sigpending and sigsuspend
-   return 0 on success and -1 on error.  
-*/
-/* Wrapper for the rt_sigprocmask system call; the signal set size
-   passed to the kernel is VKI_KNSIG_WORDS words.  Returns 0 on
-   success and -1 on error. */
-Int VG_(ksigprocmask)( Int how, 
-                       const vki_ksigset_t* set, 
-                       vki_ksigset_t* oldset)
-{
-   Int kres;
-   kres = vg_do_syscall4(__NR_rt_sigprocmask, 
-                         how, (UInt)set, (UInt)oldset, 
-                         VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
-   if (VG_(is_kerror)(kres))
-      return -1;
-   return 0;
-}
-
-
-/* Wrapper for the rt_sigaction system call; the signal set size
-   passed to the kernel is VKI_KNSIG_WORDS words.  Returns 0 on
-   success and -1 on error. */
-Int VG_(ksigaction) ( Int signum,  
-                      const vki_ksigaction* act,  
-                      vki_ksigaction* oldact)
-{
-   Int kres;
-   kres = vg_do_syscall4(__NR_rt_sigaction,
-                         signum, (UInt)act, (UInt)oldact, 
-                         VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
-   /* VG_(printf)("res = %d\n",res); */
-   if (VG_(is_kerror)(kres))
-      return -1;
-   return 0;
-}
-
-
-/* Wrapper for the sigaltstack system call.  Returns 0 on success and
-   -1 on error. */
-Int VG_(ksigaltstack)( const vki_kstack_t* ss, vki_kstack_t* oss )
-{
-   Int kres;
-   kres = vg_do_syscall2(__NR_sigaltstack, (UInt)ss, (UInt)oss);
-   if (VG_(is_kerror)(kres))
-      return -1;
-   return 0;
-}
-
- 
-/* Install sighandler for signum via rt_sigaction, with an empty
-   signal mask, no restorer, and the flags VKI_SA_ONSTACK and
-   VKI_SA_RESTART.  The previous action is not retrieved.  Returns 0
-   on success and -1 on error. */
-Int VG_(ksignal)(Int signum, void (*sighandler)(Int))
-{
-   vki_ksigaction sa;
-   Int res;
-
-   res = VG_(ksigemptyset)( &sa.ksa_mask );
-   vg_assert(res == 0);
-   sa.ksa_restorer = NULL;
-   sa.ksa_flags    = VKI_SA_ONSTACK | VKI_SA_RESTART;
-   sa.ksa_handler  = sighandler;
-
-   res = vg_do_syscall4(__NR_rt_sigaction,
-                        signum, (UInt)(&sa), (UInt)NULL,
-                        VKI_KNSIG_WORDS * VKI_BYTES_PER_WORD);
-   if (VG_(is_kerror)(res))
-      return -1;
-   return 0;
-}
-
-
-/* Wrapper for the kill system call.  Returns 0 on success and -1 on
-   error. */
-Int VG_(kill)( Int pid, Int signo )
-{
-   Int kres;
-   kres = vg_do_syscall2(__NR_kill, pid, signo);
-   if (VG_(is_kerror)(kres))
-      return -1;
-   return 0;
-}
-
-
-/* Wrapper for the sigpending system call.  Returns 0 on success and
-   -1 on error. */
-Int VG_(sigpending) ( vki_ksigset_t* set )
-{
-   Int kres;
-   kres = vg_do_syscall1(__NR_sigpending, (UInt)set);
-   if (VG_(is_kerror)(kres))
-      return -1;
-   return 0;
-}
-
-
-/* ---------------------------------------------------------------------
-   mmap/munmap, exit, fcntl
-   ------------------------------------------------------------------ */
-
-/* Wrapper for the mmap system call.  The six arguments are handed to
-   the kernel as a pointer to an in-memory block.  Returns the mapped
-   address, or (void*)(-1) on failure. */
-void* VG_(mmap)( void* start, UInt length, 
-                 UInt prot, UInt flags, UInt fd, UInt offset)
-{
-   UInt argblock[6];
-   Int  kres;
-
-   argblock[5] = offset;
-   argblock[4] = fd;
-   argblock[3] = flags;
-   argblock[2] = prot;
-   argblock[1] = length;
-   argblock[0] = (UInt)start;
-
-   kres = vg_do_syscall1(__NR_mmap, (UInt)(&(argblock[0])) );
-   if (VG_(is_kerror)(kres))
-      return ((void*)(-1));
-   return (void*)kres;
-}
-
-/* Wrapper for the munmap system call.  Returns 0 on success and -1
-   on failure. */
-Int VG_(munmap)( void* start, Int length )
-{
-   Int kres;
-   kres = vg_do_syscall2(__NR_munmap, (UInt)start, (UInt)length );
-   if (VG_(is_kerror)(kres))
-      return -1;
-   return 0;
-}
-
-/* Terminate via the exit system call with the given status.  The
-   call is not expected to return; if it does, the assertion below
-   fails deliberately. */
-void VG_(exit)( Int status )
-{
-   UInt kstatus = (UInt)status;
-   (void)vg_do_syscall1(__NR_exit, kstatus );
-   /* Still alive?  That should be impossible. */
-   /*NOTREACHED*/
-   vg_assert(2+2 == 5);
-}
-
-/* Wrapper for the fcntl system call.  Returns the kernel's result,
-   or -1 on error. */
-Int VG_(fcntl) ( Int fd, Int cmd, Int arg )
-{
-   Int kres;
-   kres = vg_do_syscall3(__NR_fcntl, fd, cmd, arg);
-   if (VG_(is_kerror)(kres))
-      return -1;
-   return kres;
-}
-
-/* Wrapper for the select system call.  The five arguments are handed
-   to the kernel as a pointer to an in-memory block.  Returns the
-   kernel's result, or -1 on error. */
-Int VG_(select)( Int n, 
-                 vki_fd_set* readfds, 
-                 vki_fd_set* writefds, 
-                 vki_fd_set* exceptfds, 
-                 struct vki_timeval * timeout )
-{
-   UInt argblock[5];
-   Int  kres;
-
-   argblock[4] = (UInt)timeout;
-   argblock[3] = (UInt)exceptfds;
-   argblock[2] = (UInt)writefds;
-   argblock[1] = (UInt)readfds;
-   argblock[0] = n;
-
-   kres = vg_do_syscall1(__NR_select, (UInt)(&(argblock[0])) );
-   if (VG_(is_kerror)(kres))
-      return -1;
-   return kres;
-}
-
-/* Wrapper for the nanosleep system call.  Returns -1 if the kernel
-   reports EINVAL, 1 if it reports EINTR (interrupted), and 0 for any
-   other result. */
-Int VG_(nanosleep)( const struct vki_timespec *req, 
-                    struct vki_timespec *rem )
-{
-   Int kres = vg_do_syscall2(__NR_nanosleep, (UInt)req, (UInt)rem);
-
-   if (kres == -VKI_EINVAL) {
-      return -1;
-   } else if (kres == -VKI_EINTR) {
-      return 1;
-   } else {
-      return 0;
-   }
-}
-
-/* Wrapper for the brk system call.  Returns the kernel's result as a
-   pointer, or (void*)(-1) on error. */
-void* VG_(brk) ( void* end_data_segment )
-{
-   Int kres;
-   kres = vg_do_syscall1(__NR_brk, (UInt)end_data_segment);
-   if (VG_(is_kerror)(kres))
-      return (void*)( -1 );
-   return (void*)( kres );
-}
-
-
-/* ---------------------------------------------------------------------
-   printf implementation.  The key function, vg_vprintf(), emits chars 
-   into a caller-supplied function.  Distantly derived from:
-
-      vprintf replacement for Checker.
-      Copyright 1993, 1994, 1995 Tristan Gingold
-      Written September 1993 Tristan Gingold
-      Tristan Gingold, 8 rue Parmentier, F-91120 PALAISEAU, FRANCE
-
-   (Checker itself was GPL'd.)
-   ------------------------------------------------------------------ */
-
-
-/* Some flags.  */
-#define VG_MSG_SIGNED    1 /* The value is signed. */
-#define VG_MSG_ZJUSTIFY  2 /* Must justify with '0'. */
-#define VG_MSG_LJUSTIFY  4 /* Must justify on the left. */
-
-
-/* Emit str through send(), upper-casing each character if capitalise
-   is set.  width == 0 means "no field width": the whole string is
-   sent.  A string longer than width is truncated to width
-   characters.  A shorter one is padded to width with spaces: on the
-   right if VG_MSG_LJUSTIFY is set in flags (left-justified), on the
-   left otherwise.  The two cases used to be swapped, which
-   contradicted both myvprintf_int64 and the meaning of printf's '-'
-   flag. */
-static void
-myvprintf_str ( void(*send)(Char), Int flags, Int width, Char* str, 
-                Bool capitalise )
-{
-#  define MAYBE_TOUPPER(ch) (capitalise ? VG_(toupper)(ch) : (ch))
-
-   Int i, extra;
-   Int len = VG_(strlen)(str);
-
-   if (width == 0) {
-      for (i = 0; i < len; i++)
-         send(MAYBE_TOUPPER(str[i]));
-      return;
-   }
-
-   if (len > width) {
-      for (i = 0; i < width; i++)
-         send(MAYBE_TOUPPER(str[i]));
-      return;
-   }
-
-   extra = width - len;
-   /* Right-justified: padding goes first. */
-   if (!(flags & VG_MSG_LJUSTIFY)) {
-      for (i = 0; i < extra; i++)
-         send(' ');
-   }
-   for (i = 0; i < len; i++)
-      send(MAYBE_TOUPPER(str[i]));
-   /* Left-justified: padding goes last. */
-   if (flags & VG_MSG_LJUSTIFY) {
-      for (i = 0; i < extra; i++)
-         send(' ');
-   }
-
-#  undef MAYBE_TOUPPER
-}
-
-/* Emit the number P through send() according to these args:
- *  If VG_MSG_SIGNED is set in FLAGS, P is treated as signed.
- *  BASE is the base (2 .. 16; anything else emits nothing).
- *  If VG_MSG_ZJUSTIFY is set in FLAGS, padding uses '0', else ' '.
- *  If VG_MSG_LJUSTIFY is set in FLAGS, padding follows the number,
- *  otherwise it precedes it.
- *  WIDTH is the width of the field; 0 means no padding.
- */
-static void
-myvprintf_int64 ( void(*send)(Char), Int flags, Int base, Int width, ULong p)
-{
-   /* Digits are generated least-significant first into buf and sent
-      in reverse.  72 bytes holds the worst case of 64 binary digits
-      plus a sign; the previous 40 bytes could be overrun for small
-      bases. */
-   Char buf[72];
-   Int  ind = 0;
-   Int  i;
-   Bool neg = False;
-   Char *digits = "0123456789ABCDEF";
- 
-   if (base < 2 || base > 16)
-      return;
- 
-   if ((flags & VG_MSG_SIGNED) && (Long)p < 0) {
-      /* Negate in unsigned arithmetic: same result, but well defined
-         even for the most negative value. */
-      p   = (ULong)0 - p;
-      neg = True;
-   }
-
-   if (p == 0)
-      buf[ind++] = '0';
-   else {
-      while (p > 0) {
-         buf[ind++] = digits[p % base];
-         p /= base;
-       }
-   }
-
-   if (neg)
-      buf[ind++] = '-';
-
-   if (width > 0 && !(flags & VG_MSG_LJUSTIFY)) {
-      for(; ind < width; ind++) {
-         /* Padding is stored in buf too, so width is limited by its
-            size. */
-         vg_assert(ind < (Int)sizeof(buf));
-         buf[ind] = (flags & VG_MSG_ZJUSTIFY) ? '0': ' ';
-      }
-   }
-
-   /* Reverse copy to buffer.  */
-   for (i = ind -1; i >= 0; i--)
-      send(buf[i]);
-
-   if (width > 0 && (flags & VG_MSG_LJUSTIFY)) {
-      for(; ind < width; ind++)
-         send((flags & VG_MSG_ZJUSTIFY) ? '0': ' ');
-   }
-}
-
-
-/* A simple vprintf().  Expands format and passes the resulting
-   characters one at a time to send().  Supported: "%%"; an optional
-   '-' (left-justify), then an optional '0' (zero-pad), then an
-   optional decimal field width, then any number of 'l' modifiers
-   (argument is 64 bits wide), then one of d, u, x, p, c, s, S (S
-   upper-cases the string).  Any other conversion letter is skipped
-   without consuming an argument.
-
-   vargs must already have been initialised by the caller with
-   va_start, and the caller must clean up with va_end. */
-void
-VG_(vprintf) ( void(*send)(Char), const Char *format, va_list vargs )
-{
-   int i;
-   int flags;
-   int width;
-   Bool is_long;
-
-   for (i = 0; format[i] != 0; i++) {
-      if (format[i] != '%') {
-         send(format[i]);
-         continue;
-      }
-      i++;
-      /* A '%' has been found.  Ignore a trailing %. */
-      if (format[i] == 0)
-         break;
-      if (format[i] == '%') {
-         /* `%%' is replaced by `%'. */
-         send('%');
-         continue;
-      }
-      flags = 0;
-      is_long = False;
-      width = 0; /* length of the field. */
-      /* If '-' follows '%', justify on the left. */
-      if (format[i] == '-') {
-         flags |= VG_MSG_LJUSTIFY;
-         i++;
-      }
-      /* If '0' follows '%', pads will be inserted. */
-      if (format[i] == '0') {
-         flags |= VG_MSG_ZJUSTIFY;
-         i++;
-      }
-      /* Compute the field length. */
-      while (format[i] >= '0' && format[i] <= '9') {
-         width *= 10;
-         width += format[i++] - '0';
-      }
-      while (format[i] == 'l') {
-         i++;
-         is_long = True;
-      }
-
-      /* The format ended in the middle of a conversion specification
-         (eg "%5", "%l" or "%-").  Stop here; otherwise the loop's i++
-         would step past the terminating zero and read beyond the end
-         of the string. */
-      if (format[i] == 0)
-         break;
-
-      switch (format[i]) {
-         case 'd': /* %d */
-            flags |= VG_MSG_SIGNED;
-            if (is_long)
-               myvprintf_int64(send, flags, 10, width, 
-                               (ULong)(va_arg (vargs, Long)));
-            else
-               myvprintf_int64(send, flags, 10, width, 
-                               (ULong)(va_arg (vargs, Int)));
-            break;
-         case 'u': /* %u */
-            if (is_long)
-               myvprintf_int64(send, flags, 10, width, 
-                               (ULong)(va_arg (vargs, ULong)));
-            else
-               myvprintf_int64(send, flags, 10, width, 
-                               (ULong)(va_arg (vargs, UInt)));
-            break;
-         case 'p': /* %p */
-            send('0');
-            send('x');
-            myvprintf_int64(send, flags, 16, width, 
-                            (ULong)((UInt)va_arg (vargs, void *)));
-            break;
-         case 'x': /* %x */
-            if (is_long)
-               myvprintf_int64(send, flags, 16, width, 
-                               (ULong)(va_arg (vargs, ULong)));
-            else
-               myvprintf_int64(send, flags, 16, width, 
-                               (ULong)(va_arg (vargs, UInt)));
-            break;
-         case 'c': /* %c */
-            send(va_arg (vargs, int));
-            break;
-         case 's': case 'S': { /* %s */
-            char *str = va_arg (vargs, char *);
-            if (str == (char*) 0) str = "(null)";
-            myvprintf_str(send, flags, width, str, format[i]=='S');
-            break;
-         }
-         default:
-            break;
-      }
-   }
-}
-
-
-/* A general replacement for printf().  Note that only low-level 
-   debugging info should be sent via here.  The official route is
-   to use vg_message().  This interface is deprecated.
-*/
-static char myprintf_buf[100];
-static int  n_myprintf_buf;
-
-/* Append c to myprintf_buf, keeping it zero-terminated.  Once the
-   buffer holds 90 characters it is first written to the log file
-   descriptor (if one is set) and emptied. */
-static void add_to_myprintf_buf ( Char c )
-{
-   if (n_myprintf_buf >= 100-10 /*paranoia*/ ) {
-      /* Flush what we have, then start again from empty. */
-      if (VG_(clo_logfile_fd) >= 0) {
-         VG_(write)
-           (VG_(clo_logfile_fd), myprintf_buf, VG_(strlen)(myprintf_buf));
-      }
-      n_myprintf_buf  = 0;
-      myprintf_buf[0] = 0;
-   }
-   myprintf_buf[n_myprintf_buf] = c;
-   n_myprintf_buf++;
-   myprintf_buf[n_myprintf_buf] = 0;
-}
-
-/* Format into the static myprintf_buf via add_to_myprintf_buf and
-   write whatever remains buffered to VG_(clo_logfile_fd).  Nothing is
-   written when that fd is negative. */
-void VG_(printf) ( const char *format, ... )
-{
-   va_list vargs;
-   va_start(vargs,format);
-   
-   /* Start from an empty, terminated buffer. */
-   n_myprintf_buf = 0;
-   myprintf_buf[n_myprintf_buf] = 0;
-   VG_(vprintf) ( add_to_myprintf_buf, format, vargs );
-
-   /* Final flush.  The length is the byte counter, not VG_(strlen),
-      so an embedded zero byte does not truncate the output. */
-   if (n_myprintf_buf > 0 && VG_(clo_logfile_fd) >= 0)
-      VG_(write)
-         ( VG_(clo_logfile_fd), myprintf_buf, n_myprintf_buf );
-
-   va_end(vargs);
-}
-
-
-/* A general replacement for sprintf(). */
-static Char* vg_sprintf_ptr;
-
-/* Sink for VG_(sprintf): store c at the cursor, then advance it. */
-static void add_to_vg_sprintf_buf ( Char c )
-{
-   vg_sprintf_ptr[0] = c;
-   vg_sprintf_ptr += 1;
-}
-
-/* Format into buf and zero-terminate the result.  No limit is placed
-   on how many bytes are stored. */
-void VG_(sprintf) ( Char* buf, Char *format, ... )
-{
-   va_list ap;
-
-   /* Aim the sink at the caller's buffer before any output happens. */
-   vg_sprintf_ptr = buf;
-
-   va_start(ap, format);
-   VG_(vprintf) ( add_to_vg_sprintf_buf, format, ap );
-   va_end(ap);
-
-   add_to_vg_sprintf_buf('\0');
-}
-
-
-/* ---------------------------------------------------------------------
-   Misc str* functions.
-   ------------------------------------------------------------------ */
-
-/* True for space, newline, tab and the zero byte. */
-Bool VG_(isspace) ( Char c )
-{
-   switch (c) {
-      case ' ': case '\n': case '\t': case 0:
-         return True;
-      default:
-         return False;
-   }
-}
-
-/* True exactly for the characters '0' .. '9'. */
-Bool VG_(isdigit) ( Char c )
-{
-   if (c < '0') return False;
-   if (c > '9') return False;
-   return True;
-}
-
-/* Number of characters before the terminating zero. */
-Int VG_(strlen) ( const Char* str )
-{
-   const Char* p = str;
-   while (*p != 0) p++;
-   return (Int)(p - str);
-}
-
-
-/* Parse an optional leading '-' followed by decimal digits.  Parsing
-   stops at the first non-digit; no digits at all yields 0. */
-Long VG_(atoll) ( Char* str )
-{
-   Long acc      = 0;
-   Bool negative = False;
-   if (*str == '-') { negative = True; str++; }
-   for (; *str >= '0' && *str <= '9'; str++)
-      acc = 10*acc + (Long)(*str - '0');
-   return negative ? -acc : acc;
-}
-
-
-/* Base-36 variant of VG_(atoll): digits are 0-9 followed by A-Z or
-   a-z (case-insensitive, value 10..35).  Parsing stops at the first
-   character outside those ranges. */
-Long VG_(atoll36) ( Char* str )
-{
-   Long acc      = 0;
-   Bool negative = False;
-   if (*str == '-') { negative = True; str++; }
-   for (;; str++) {
-      Char ch = *str;
-      Int  digit;
-      if (ch >= '0' && ch <= '9')      digit = ch - '0';
-      else if (ch >= 'A' && ch <= 'Z') digit = (ch - 'A') + 10;
-      else if (ch >= 'a' && ch <= 'z') digit = (ch - 'a') + 10;
-      else break;
-      acc = 36*acc + (Long)digit;
-   }
-   return negative ? -acc : acc;
-}
-
-
-/* Append src to the end of dest and return dest. */
-Char* VG_(strcat) ( Char* dest, const Char* src )
-{
-   Char* out = dest;
-   while (*out != 0) out++;            /* find dest's terminator */
-   for (; *src != 0; src++, out++)
-      *out = *src;
-   *out = 0;
-   return dest;
-}
-
-
-/* Append at most n characters of src to dest, then zero-terminate.
-   Returns dest. */
-Char* VG_(strncat) ( Char* dest, const Char* src, Int n )
-{
-   Char* out = dest;
-   while (*out != 0) out++;
-   for (; *src != 0 && n > 0; n--) {
-      *out = *src;
-      out++;
-      src++;
-   }
-   *out = 0;
-   return dest;
-}
-
-
-/* Return a pointer to the first character of s that also occurs in
-   accept, or NULL if there is none. */
-Char* VG_(strpbrk) ( const Char* s, const Char* accept )
-{
-   for (; *s != 0; s++) {
-      const Char* candidate;
-      for (candidate = accept; *candidate != 0; candidate++) {
-         if (*candidate == *s)
-            return (Char *) s;
-      }
-   }
-   return NULL;
-}
-
-
-/* Copy src, including its terminating zero, to dest.  Returns dest. */
-Char* VG_(strcpy) ( Char* dest, const Char* src )
-{
-   Int k;
-   for (k = 0; src[k] != 0; k++)
-      dest[k] = src[k];
-   dest[k] = 0;
-   return dest;
-}
-
-
-/* Copy characters from src to dest, storing at most ndest bytes in
-   total (terminator included) and always leaving dest
-   zero-terminated.  ndest must be positive. */
-void VG_(strncpy_safely) ( Char* dest, const Char* src, Int ndest )
-{
-   Int k;
-   vg_assert(ndest > 0);
-   /* Stop at the end of src or when only the terminator slot is left. */
-   for (k = 0; src[k] != 0 && k < ndest-1; k++)
-      dest[k] = src[k];
-   dest[k] = 0;
-}
-
-
-/* Copy src to dest through VG_(strncpy_safely) with a limit of
-   ndest+1, so up to ndest characters plus a terminating zero may be
-   stored. */
-void VG_(strncpy) ( Char* dest, const Char* src, Int ndest )
-{
-   const Int limit = ndest + 1;   /* room for the terminator as well */
-   VG_(strncpy_safely)( dest, src, limit );
-}
-
-
-/* Compare two zero-terminated strings as unsigned bytes.  Returns -1,
-   0 or 1. */
-Int VG_(strcmp) ( const Char* s1, const Char* s2 )
-{
-   for (;; s1++, s2++) {
-      UChar a = *(UChar*)s1;
-      UChar b = *(UChar*)s2;
-      if (a == 0) return (b == 0) ? 0 : -1;
-      if (b == 0) return 1;
-      if (a != b) return (a < b) ? -1 : 1;
-   }
-}
-
-
-/* Like VG_(strcmp), except that each string ends at the first
-   character for which VG_(isspace) is true. */
-Int VG_(strcmp_ws) ( const Char* s1, const Char* s2 )
-{
-   for (;; s1++, s2++) {
-      Bool end1 = VG_(isspace)(*s1);
-      Bool end2 = VG_(isspace)(*s2);
-      if (end1) return end2 ? 0 : -1;
-      if (end2) return 1;
-
-      if (*(UChar*)s1 != *(UChar*)s2)
-         return (*(UChar*)s1 < *(UChar*)s2) ? -1 : 1;
-   }
-}
-
-
-/* Compare at most nmax characters of two zero-terminated strings as
-   unsigned bytes.  Returns -1, 0 or 1. */
-Int VG_(strncmp) ( const Char* s1, const Char* s2, Int nmax )
-{
-   Int k;
-   for (k = 0; k < nmax; k++, s1++, s2++) {
-      UChar a = *(UChar*)s1;
-      UChar b = *(UChar*)s2;
-      if (a == 0) return (b == 0) ? 0 : -1;
-      if (b == 0) return 1;
-      if (a != b) return (a < b) ? -1 : 1;
-   }
-   /* nmax characters compared without finding a difference. */
-   return 0;
-}
-
-
-/* Like VG_(strncmp), except that each string ends at the first
-   character for which VG_(isspace) is true. */
-Int VG_(strncmp_ws) ( const Char* s1, const Char* s2, Int nmax )
-{
-   Int k;
-   for (k = 0; k < nmax; k++, s1++, s2++) {
-      Bool end1 = VG_(isspace)(*s1);
-      Bool end2 = VG_(isspace)(*s2);
-      if (end1) return end2 ? 0 : -1;
-      if (end2) return 1;
-
-      if (*(UChar*)s1 != *(UChar*)s2)
-         return (*(UChar*)s1 < *(UChar*)s2) ? -1 : 1;
-   }
-   return 0;
-}
-
-
-/* Return a pointer to the first occurrence of needle in haystack, or
-   NULL if there is none or haystack is NULL.  An empty needle matches
-   at the start of haystack, including when haystack is itself empty. */
-Char* VG_(strstr) ( const Char* haystack, Char* needle )
-{
-   Int n; 
-   if (haystack == NULL)
-      return NULL;
-   n = VG_(strlen)(needle);
-   while (True) {
-      /* Test for a match before the end-of-haystack check, so that an
-         empty needle is found in an empty haystack.  At the end of
-         haystack a non-empty needle cannot match, because
-         VG_(strncmp) reports the shorter string as smaller. */
-      if (VG_(strncmp)(haystack, needle, n) == 0) 
-         return (Char*)haystack;
-      if (haystack[0] == 0) 
-         return NULL;
-      haystack++;
-   }
-}
-
-
-/* Return a pointer to the first occurrence of c in s, or NULL.  The
-   terminating zero is part of the search, so c == 0 finds the end. */
-Char* VG_(strchr) ( const Char* s, Char c )
-{
-   for (;; s++) {
-      if (*s == c) return (Char*)s;
-      if (*s == 0) break;
-   }
-   return NULL;
-}
-
-
-/* Map 'a'..'z' to 'A'..'Z'; every other character is returned as is. */
-Char VG_(toupper) ( Char c )
-{
-   Bool is_lower = (c >= 'a' && c <= 'z') ? True : False;
-   if (!is_lower)
-      return c;
-   return c + ('A' - 'a'); 
-}
-
-
-/* Return a copy of s held in a block obtained from VG_(malloc) in
-   arena aid. */
-Char* VG_(strdup) ( ArenaId aid, const Char* s )
-{
-    Int   nbytes = VG_(strlen)(s) + 1;   /* include the terminator */
-    Char* copy   = VG_(malloc) (aid, nbytes);
-    Int   k      = 0;
-    while (k < nbytes) {
-       copy[k] = s[k];
-       k++;
-    }
-    return copy;
-}
-
-
-/* ---------------------------------------------------------------------
-   A simple string matching routine, purloined from Hugs98.
-      `*'    matches any sequence of zero or more characters
-      `?'    matches any single character exactly 
-      `\c'   matches the character c only (ignoring special chars)
-      c      matches the character c only
-   ------------------------------------------------------------------ */
-
-/* Keep track of recursion depth. */
-static Int recDepth;
-
-/* Recursive worker for VG_(stringMatch): does str match pattern pat?
-   recDepth counts the active invocations and is asserted to stay
-   below 250.  Every return path must therefore undo the increment
-   made on entry. */
-static Bool stringMatch_wrk ( Char* pat, Char* str )
-{
-   vg_assert(recDepth >= 0 && recDepth < 250);
-   recDepth++;
-   for (;;) {
-      switch (*pat) {
-         /* End of pattern: matches only if str is exhausted too.  The
-            depth counter must be released here as well; otherwise
-            each attempt made from the '*' loop below leaks one level
-            and the assertion above eventually fires on long strings. */
-         case '\0' : recDepth--;
-                     return (*str=='\0');
-         /* '*': try the rest of the pattern at every suffix of str,
-            including the empty suffix. */
-         case '*'  : do {
-                        if (stringMatch_wrk(pat+1,str)) {
-                           recDepth--;
-                           return True;
-                        }
-                     } while (*str++);
-                     recDepth--;
-                     return False;
-         /* '?': consume exactly one character of str. */
-         case '?'  : if (*str++=='\0') {
-                        recDepth--;
-                        return False;
-                     }
-                     pat++;
-                     break;
-         /* '\c': skip the backslash and match c literally. */
-         case '\\' : if (*++pat == '\0') {
-                        recDepth--;
-                        return False; /* spurious trailing \ in pattern */
-                     }
-                     /* falls through to ... */
-         default   : if (*pat++ != *str++) {
-                        recDepth--;
-                        return False;
-                     }
-                     break;
-      }
-   }
-}
-
-/* Does str match the wildcard pattern pat?  Resets the recursion
-   depth counter and hands over to stringMatch_wrk. */
-Bool VG_(stringMatch) ( Char* pat, Char* str )
-{
-   recDepth = 0;
-   return stringMatch_wrk ( pat, str );
-}
-
-
-/* ---------------------------------------------------------------------
-   Assertery.
-   ------------------------------------------------------------------ */
-
-/* Report a failed assertion (expression text, file, line, function),
-   print the scheduler status, shut down logging and exit with
-   status 1. */
-void VG_(assert_fail) ( Char* expr, Char* file, Int line, Char* fn )
-{
-   /* Set once reporting has begun.  Entering again means something
-      failed while reporting, so exit at once with status 2. */
-   static Bool already_reporting = False;
-
-   if (already_reporting) 
-      VG_(exit)(2);
-   already_reporting = True;
-
-   VG_(printf)("\n%s: %s:%d (%s): Assertion `%s' failed.\n",
-               "valgrind", file, line, fn, expr );
-   VG_(pp_sched_status)();
-   VG_(printf)("Please report this bug to me at: %s\n\n", VG_EMAIL_ADDR);
-   VG_(shutdown_logging)();
-   VG_(exit)(1);
-}
-
-/* Report an internal "impossible" condition described by str, print
-   the approximate basic-block counter and the scheduler status, shut
-   down logging and exit with status 1. */
-void VG_(panic) ( Char* str )
-{
-   VG_(printf)("\nvalgrind: the `impossible' happened:\n   %s\n", str);
-   VG_(printf)("Basic block ctr is approximately %llu\n", VG_(bbs_done) );
-   VG_(pp_sched_status)();
-   VG_(printf)("Please report this bug to me at: %s\n\n", VG_EMAIL_ADDR);
-   VG_(shutdown_logging)();
-   VG_(exit)(1);
-}
-
-
-/* ---------------------------------------------------------------------
-   Primitive support for reading files.
-   ------------------------------------------------------------------ */
-
-/* Open pathname read-only with a direct open syscall.  Returns the
-   file descriptor, or -1 on failure. */
-Int VG_(open_read) ( Char* pathname )
-{
-   /* The libc call open( pathname, O_RDONLY ) is deliberately not
-      used: the original author saw it segfault when pathname was not
-      a valid file, apparently because glibc intercepts the call, and
-      found that going straight to the syscall works. */
-   const int O_RDONLY = 0; /* See /usr/include/bits/fcntl.h */
-   Int res = vg_do_syscall3(__NR_open, (UInt)pathname, O_RDONLY, 0);
-   return VG_(is_kerror)(res) ? -1 : res;
-}
-
-/* Issue fchmod on fd with mode 0600.  Returns the syscall result, or
-   -1 on failure. */
-static Int VG_(chmod_u_rw) ( Int fd )
-{
-   const int O_IRUSR_IWUSR = 000600; /* See /usr/include/cpio.h */
-   Int rc = vg_do_syscall2(__NR_fchmod, fd, O_IRUSR_IWUSR);
-   return VG_(is_kerror)(rc) ? -1 : rc;
-}
- 
-/* Open pathname with flags 0101 (create + write-only, per the
-   constant's name) and mode 0, then give it user read/write
-   permission via VG_(chmod_u_rw).  Returns the file descriptor, or -1
-   if either step fails. */
-Int VG_(create_and_write) ( Char* pathname )
-{
-   Int fd;
-
-   const int O_CR_AND_WR_ONLY = 0101; /* See /usr/include/bits/fcntl.h */
-   fd = vg_do_syscall3(__NR_open, (UInt)pathname, O_CR_AND_WR_ONLY, 0);
-   /* VG_(printf)("result = %d\n", fd); */
-   if (VG_(is_kerror)(fd))
-      return -1;
-
-   /* Test the result of the chmod itself.  Re-testing fd here, as the
-      code once did, could never detect a chmod failure.  On failure,
-      close the descriptor so that it does not leak. */
-   if (VG_(chmod_u_rw)(fd) == -1) {
-      VG_(close)(fd);
-      return -1;
-   }
-   return fd;
-}
- 
-/* Open pathname with flags 01001 (write-only + truncate, per the
-   constant's name).  Returns the file descriptor, or -1 on failure. */
-Int VG_(open_write) ( Char* pathname )
-{  
-   const int O_WRONLY_AND_TRUNC = 01001; /* See /usr/include/bits/fcntl.h */
-   Int res = vg_do_syscall3(__NR_open, (UInt)pathname, O_WRONLY_AND_TRUNC, 0);
-   return VG_(is_kerror)(res) ? -1 : res;
-}
-
-/* Close fd with a direct syscall; the result is not inspected. */
-void VG_(close) ( Int fd )
-{
-   (void)vg_do_syscall1(__NR_close, fd);
-}
-
-
-/* Direct read syscall.  Returns the syscall result, or -1 if it is a
-   kernel error code. */
-Int VG_(read) ( Int fd, void* buf, Int count)
-{
-   Int nread = vg_do_syscall3(__NR_read, fd, (UInt)buf, count);
-   return VG_(is_kerror)(nread) ? -1 : nread;
-}
-
-/* Direct write syscall.  Returns the syscall result, or -1 if it is
-   a kernel error code. */
-Int VG_(write) ( Int fd, void* buf, Int count)
-{
-   Int nwritten = vg_do_syscall3(__NR_write, fd, (UInt)buf, count);
-   return VG_(is_kerror)(nwritten) ? -1 : nwritten;
-}
-
-/* Direct stat syscall on file_name, filling *buf.  Returns 0 on
-   success and -1 on failure. */
-Int VG_(stat) ( Char* file_name, struct vki_stat* buf )
-{
-   Int rc = vg_do_syscall2(__NR_stat, (UInt)file_name, (UInt)buf);
-   if (VG_(is_kerror)(rc))
-      return -1;
-   return 0;
-}
-
-/* Misc functions looking for a proper home. */
-
-/* We do getenv without libc's help by snooping around in
-   VG_(client_envp) as determined at startup time.  Returns a pointer
-   to the text after the '=' of the first entry named varname, or NULL
-   if no entry matches. */
-Char* VG_(getenv) ( Char* varname )
-{
-   Int idx;
-   Int namelen = VG_(strlen)(varname);
-   for (idx = 0; VG_(client_envp)[idx] != NULL; idx++) {
-      Char* entry = VG_(client_envp)[idx];
-      if (VG_(strncmp)(varname, entry, namelen) != 0)
-         continue;
-      /* The name must be followed directly by '='. */
-      if (entry[namelen] != '=')
-         continue;
-      return & entry[namelen+1];
-   }
-   return NULL;
-}
-
-/* You'd be amazed how many places need to know the current pid.
-   Obtained with a direct getpid syscall. */
-Int VG_(getpid) ( void )
-{
-   return vg_do_syscall0(__NR_getpid);
-}
-
-/* Run "/bin/sh -c cmd" in a forked child with an empty environment
-   and wait for it.  Returns 1 if cmd is NULL, -1 if fork or waitpid
-   fails, and 0 otherwise.  NOTE does not indicate return code of
-   child! */
-Int VG_(system) ( Char* cmd )
-{
-   Int pid, res;
-   void* environ[1] = { NULL };
-   if (cmd == NULL)
-      return 1;
-   pid = vg_do_syscall0(__NR_fork);
-   if (VG_(is_kerror)(pid))
-      return -1;
-   if (pid == 0) {
-      /* child */
-      Char* argv[4];
-      argv[0] = "/bin/sh";
-      argv[1] = "-c";
-      argv[2] = cmd;
-      argv[3] = 0;
-      (void)vg_do_syscall3(__NR_execve, 
-                           (UInt)"/bin/sh", (UInt)argv, (UInt)&environ);
-      /* If we're still alive here, execve failed.  This is the forked
-         child, so terminate instead of returning: a return would
-         leave a second copy of the process running the caller's
-         code. */
-      VG_(exit)(1);
-      /* VG_(exit) is not expected to return; kept as a fallback. */
-      return -1;
-   } else {
-      /* parent */
-      res = vg_do_syscall3(__NR_waitpid, pid, (UInt)NULL, 0);
-      if (VG_(is_kerror)(res)) {
-         return -1;
-      } else {
-         return 0;
-      }
-   }
-}
-
-
-/* ---------------------------------------------------------------------
-   Support for a millisecond-granularity counter using RDTSC.
-   ------------------------------------------------------------------ */
-
-/* Read the CPU's time-stamp counter and return it as a 64-bit value.
-   The instruction is emitted as the raw opcode bytes 0x0f 0x31
-   (RDTSC); the "=A" output constraint takes the result from the
-   edx:eax register pair. */
-static __inline__ ULong do_rdtsc_insn ( void )
-{
-   ULong x;
-   __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
-   return x;
-}
-
-/* 0 = pre-calibration, 1 = calibration, 2 = running */
-static Int   rdtsc_calibration_state     = 0;
-static ULong rdtsc_ticks_per_millisecond = 0; /* invalid value */
-
-static struct vki_timeval rdtsc_cal_start_timeval;
-static struct vki_timeval rdtsc_cal_end_timeval;
-
-static ULong              rdtsc_cal_start_raw;
-static ULong              rdtsc_cal_end_raw;
-
-/* Milliseconds elapsed since the end of calibration, computed from
-   the RDTSC counter and the calibrated ticks-per-millisecond value.
-   Only valid once calibration has finished (state 2). */
-UInt VG_(read_millisecond_timer) ( void )
-{
-   ULong ticks_now;
-   ULong elapsed_ms;
-   vg_assert(rdtsc_calibration_state == 2);
-   ticks_now = do_rdtsc_insn();
-   vg_assert(ticks_now > rdtsc_cal_end_raw);
-   elapsed_ms = (ticks_now - rdtsc_cal_end_raw) / rdtsc_ticks_per_millisecond;
-   return (UInt)elapsed_ms;
-}
-
-
-/* Begin calibration (state 0 -> 1): record the RDTSC counter and the
-   wall-clock time at this moment. */
-void VG_(start_rdtsc_calibration) ( void )
-{
-   Int rc;
-
-   vg_assert(rdtsc_calibration_state == 0);
-   rdtsc_calibration_state = 1;
-
-   rdtsc_cal_start_raw = do_rdtsc_insn();
-   rc = vg_do_syscall2(__NR_gettimeofday,
-                       (UInt)&rdtsc_cal_start_timeval, (UInt)NULL);
-   vg_assert(!VG_(is_kerror)(rc));
-}
-
-/* Finish calibration (state 1 -> 2).  Sleeps for about 20 ms, reads
-   the RDTSC counter and the wall clock again, and derives
-   rdtsc_ticks_per_millisecond from the two deltas.  Panics if the
-   estimated clock rate falls outside 50 .. 10000 MHz. */
-void VG_(end_rdtsc_calibration) ( void )
-{
-   Int   res, loops;
-   ULong cpu_clock_MHZ;
-   ULong cal_clock_ticks;
-   ULong cal_wallclock_microseconds;
-   ULong wallclock_start_microseconds;
-   ULong wallclock_end_microseconds;
-   struct vki_timespec req;
-   struct vki_timespec rem;
-   
-   vg_assert(rdtsc_calibration_state == 1);
-   rdtsc_calibration_state = 2;
-
-   /* Try and delay for 20 milliseconds, so that we can at least have
-      some minimum level of accuracy. */
-   req.tv_sec = 0;
-   req.tv_nsec = 20 * 1000 * 1000;
-   loops = 0;
-   while (True) {
-      res = VG_(nanosleep)(&req, &rem);
-      vg_assert(res == 0 /*ok*/ || res == 1 /*interrupted*/);
-      if (res == 0)
-         break;
-      if (rem.tv_sec == 0 && rem.tv_nsec == 0) 
-         break;
-      /* Interrupted: sleep again for the remaining time. */
-      req = rem;
-      loops++;
-      if (loops > 100) 
-         VG_(panic)("calibration nanosleep loop failed?!");
-   }
-
-   /* Now read both timers, and do the Math. */
-   rdtsc_cal_end_raw = do_rdtsc_insn();
-   res = vg_do_syscall2(__NR_gettimeofday, (UInt)&rdtsc_cal_end_timeval, 
-                                           (UInt)NULL);
-   /* Check the result, as VG_(start_rdtsc_calibration) does for its
-      own gettimeofday call; otherwise a failure would feed stale
-      timeval contents into the arithmetic below. */
-   vg_assert(!VG_(is_kerror)(res));
-
-   vg_assert(rdtsc_cal_end_raw > rdtsc_cal_start_raw);
-   cal_clock_ticks = rdtsc_cal_end_raw - rdtsc_cal_start_raw;
-
-   wallclock_start_microseconds
-      = (1000000ULL * (ULong)(rdtsc_cal_start_timeval.tv_sec)) 
-         + (ULong)(rdtsc_cal_start_timeval.tv_usec);
-   wallclock_end_microseconds
-      = (1000000ULL * (ULong)(rdtsc_cal_end_timeval.tv_sec)) 
-         + (ULong)(rdtsc_cal_end_timeval.tv_usec);
-   vg_assert(wallclock_end_microseconds > wallclock_start_microseconds);
-   cal_wallclock_microseconds 
-      = wallclock_end_microseconds - wallclock_start_microseconds;
-
-   /* Since we just nanoslept for 20 ms ...  This also guarantees that
-      the divisor (microseconds / 1000) used below is non-zero. */
-   vg_assert(cal_wallclock_microseconds >= 20000);
-
-   /* Now we know (roughly) that cal_clock_ticks on RDTSC take
-      cal_wallclock_microseconds elapsed time.  Calculate the RDTSC
-      ticks-per-millisecond value. */
-   if (0)
-      VG_(printf)("%lld ticks in %lld microseconds\n", 
-                  cal_clock_ticks,  cal_wallclock_microseconds );
-
-   rdtsc_ticks_per_millisecond   
-      = cal_clock_ticks / (cal_wallclock_microseconds / 1000ULL);
-   cpu_clock_MHZ
-      = (1000ULL * rdtsc_ticks_per_millisecond) / 1000000ULL;
-   if (VG_(clo_verbosity) >= 1)
-      VG_(message)(Vg_UserMsg, "Estimated CPU clock rate is %d MHz",
-                               (UInt)cpu_clock_MHZ);
-   if (cpu_clock_MHZ < 50 || cpu_clock_MHZ > 10000)
-      VG_(panic)("end_rdtsc_calibration: "
-                 "estimated CPU MHz outside range 50 .. 10000");
-   /* Paranoia about division by zero later. */
-   vg_assert(rdtsc_ticks_per_millisecond != 0);
-   if (0)
-      VG_(printf)("ticks per millisecond %llu\n", 
-                  rdtsc_ticks_per_millisecond);
-}
-
-
-
-/* ---------------------------------------------------------------------
-   Primitive support for bagging memory via mmap.
-   ------------------------------------------------------------------ */
-
-/* Obtain nBytes of anonymous, private, read/write/execute memory from
-   VG_(mmap).  who names the caller and is used only in the (disabled)
-   debug printout.  Panics if the mapping fails. */
-void* VG_(get_memory_from_mmap) ( Int nBytes, Char* who )
-{
-   static UInt tot_alloc = 0;   /* running total of bytes handed out */
-   void* p = VG_(mmap)( 0, nBytes,
-                        VKI_PROT_READ|VKI_PROT_WRITE|VKI_PROT_EXEC, 
-                        VKI_MAP_PRIVATE|VKI_MAP_ANONYMOUS, -1, 0 );
-   if (p == ((void*)(-1))) {
-      VG_(printf)("vg_get_memory_from_mmap failed on request of %d\n", 
-                  nBytes);
-      VG_(panic)("vg_get_memory_from_mmap: out of memory!  Fatal!  Bye!\n");
-   }
-   tot_alloc += (UInt)nBytes;
-   if (0)
-      VG_(printf)(
-         "get_memory_from_mmap: %d tot, %d req = %p .. %p, caller %s\n",
-         tot_alloc, nBytes, p, ((char*)p) + nBytes - 1, who );
-   return p;
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                              vg_mylibc.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_procselfmaps.c b/coregrind/vg_procselfmaps.c
deleted file mode 100644
index ceba7b3bf0..0000000000
--- a/coregrind/vg_procselfmaps.c
+++ /dev/null
@@ -1,200 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A simple parser for /proc/self/maps on Linux 2.4.X           ---*/
-/*---                                            vg_procselfmaps.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-
-#include "vg_include.h"
-
-
-/* static ... to keep it out of the stack frame. */
-
-static Char procmap_buf[M_PROCMAP_BUF];
-
-
-/* Helper fns. */
-
-/* Value (0..15) of the hex digit c, either case, or -1 if c is not a
-   hex digit. */
-static Int hexdigit ( Char c )
-{
-   Int value = -1;
-   if (c >= '0' && c <= '9')      value = (Int)(c - '0');
-   else if (c >= 'a' && c <= 'f') value = 10 + (Int)(c - 'a');
-   else if (c >= 'A' && c <= 'F') value = 10 + (Int)(c - 'A');
-   return value;
-}
-
-/* Fetch the character at buf into *ch.  Returns the number of
-   characters consumed: 1, or 0 (with *ch untouched) at the
-   terminating zero. */
-static Int readchar ( Char* buf, Char* ch )
-{
-   if (buf[0] != 0) {
-      *ch = buf[0];
-      return 1;
-   }
-   return 0;
-}
-
-/* Parse the run of hex digits starting at buf into *val.  Returns the
-   number of digits consumed; with none, *val is 0. */
-static Int readhex ( Char* buf, UInt* val )
-{
-   Int  ndigits = 0;
-   UInt acc     = 0;
-   Int  d;
-   while ((d = hexdigit(buf[ndigits])) >= 0) {
-      acc = (acc << 4) + d;
-      ndigits++;
-   }
-   *val = acc;
-   return ndigits;
-}
-
-
-
-/* Read /proc/self/maps.  For each map entry, call
-   record_mapping, passing it, in this order:
-
-      start address in memory
-      length
-      r permissions char; either - or r
-      w permissions char; either - or w
-      x permissions char; either - or x
-      offset in file, or zero if no file
-      filename, zero terminated, or NULL if no file
-
-   So the sig of the called fn might be
-
-      void (*record_mapping)( Addr start, UInt size, 
-                              Char r, Char w, Char x, 
-                              UInt foffset, UChar* filename )
-
-   Note that the supplied filename is transiently stored; record_mapping 
-   should make a copy if it wants to keep it.
-
-   If there's a syntax error or other failure, just abort.  
-*/
-
-/* Slurp /proc/self/maps into procmap_buf, parse it line by line and
-   call record_mapping once per entry.  Any open, read or syntax
-   failure is fatal: a message is printed and VG_(exit)(1) is called. */
-void VG_(read_procselfmaps) (
-   void (*record_mapping)( Addr, UInt, Char, Char, Char, UInt, UChar* )
-)
-{
-   Int    i, j, n_tot, n_chunk, fd, i_eol;
-   Addr   start, endPlusOne;
-   UChar* filename;
-   UInt   foffset;
-   UChar  rr, ww, xx, pp, ch;
-
-   /* Read the initial memory mapping from the /proc filesystem. */
-   fd = VG_(open_read) ( "/proc/self/maps" );
-   if (fd == -1) {
-      VG_(message)(Vg_UserMsg, "FATAL: can't open /proc/self/maps");
-      VG_(exit)(1);
-   }
-   n_tot = 0;
-   do {
-      n_chunk = VG_(read) ( fd, &procmap_buf[n_tot], M_PROCMAP_BUF - n_tot );
-      if (n_chunk < 0) {
-         /* VG_(read) reports failure as -1.  Adding that to n_tot
-            would corrupt the byte count (and could make it negative),
-            so treat a failed read as "nothing usable was read"; the
-            I/O error check below then reports it. */
-         n_tot = 0;
-         break;
-      }
-      n_tot += n_chunk;
-   } while ( n_chunk > 0 && n_tot < M_PROCMAP_BUF );
-   VG_(close)(fd);
-   if (n_tot >= M_PROCMAP_BUF-5) {
-      VG_(message)(Vg_UserMsg, "FATAL: M_PROCMAP_BUF is too small; "
-                               "increase it and recompile");
-       VG_(exit)(1);
-   }
-   if (n_tot == 0) {
-      VG_(message)(Vg_UserMsg, "FATAL: I/O error on /proc/self/maps" );
-       VG_(exit)(1);
-   }
-   procmap_buf[n_tot] = 0;
-   if (0)
-      VG_(message)(Vg_DebugMsg, "raw:\n%s", procmap_buf );
-
-   /* Ok, it's safely aboard.  Parse the entries. */
-
-   i = 0;
-   while (True) {
-      if (i >= n_tot) break;
-
-      /* Read (without fscanf :) the pattern %8x-%8x %c%c%c%c %8x */
-      j = readhex(&procmap_buf[i], &start);
-      if (j > 0) i += j; else goto syntaxerror;
-      j = readchar(&procmap_buf[i], &ch);
-      if (j == 1 && ch == '-') i += j; else goto syntaxerror;
-      j = readhex(&procmap_buf[i], &endPlusOne);
-      if (j > 0) i += j; else goto syntaxerror;
-
-      j = readchar(&procmap_buf[i], &ch);
-      if (j == 1 && ch == ' ') i += j; else goto syntaxerror;
-
-      j = readchar(&procmap_buf[i], &rr);
-      if (j == 1 && (rr == 'r' || rr == '-')) i += j; else goto syntaxerror;
-      j = readchar(&procmap_buf[i], &ww);
-      if (j == 1 && (ww == 'w' || ww == '-')) i += j; else goto syntaxerror;
-      j = readchar(&procmap_buf[i], &xx);
-      if (j == 1 && (xx == 'x' || xx == '-')) i += j; else goto syntaxerror;
-      /* I haven't a clue what this last field means. */
-      j = readchar(&procmap_buf[i], &pp);
-      if (j == 1 && (pp == 'p' || pp == '-' || pp == 's')) 
-                                              i += j; else goto syntaxerror;
-
-      j = readchar(&procmap_buf[i], &ch);
-      if (j == 1 && ch == ' ') i += j; else goto syntaxerror;
-
-      j = readhex(&procmap_buf[i], &foffset);
-      if (j > 0) i += j; else goto syntaxerror;
-      
-      goto read_line_ok;
-
-    syntaxerror:
-      VG_(message)(Vg_UserMsg, "FATAL: syntax error reading /proc/self/maps");
-      { Int k;
-        VG_(printf)("last 50 chars: `");
-        /* Clamp the start of the dump at 0: the error may occur
-           within the first 50 characters of the buffer. */
-        for (k = (i > 50 ? i-50 : 0); k <= i; k++)
-           VG_(printf)("%c", procmap_buf[k]);
-        VG_(printf)("'\n");
-      }
-       VG_(exit)(1);
-
-    read_line_ok:
-      /* Try and find the name of the file mapped to this segment, if
-         it exists.  First find the end of the line, never looking
-         beyond the data just read. */
-      while (i < n_tot && procmap_buf[i] != '\n') i++;
-      i_eol = i;
-      i--;
-      /* Walk back to the start of the last whitespace-delimited word.
-         The index is tested before the buffer is read, so that
-         procmap_buf[-1] is never touched. */
-      while (i >= 0 && !VG_(isspace)(procmap_buf[i])) i--;
-      i++;
-      if (i < i_eol-1 && procmap_buf[i] == '/') {
-         /* Terminate the name in place, overwriting the newline. */
-         filename = &procmap_buf[i];
-         filename[i_eol - i] = '\0';
-      } else {
-         filename = NULL;
-         foffset = 0;
-      }
-
-      (*record_mapping) ( start, endPlusOne-start, 
-                          rr, ww, xx, 
-                          foffset, filename );
-
-      i = i_eol + 1;
-   }
-}
-
-/*--------------------------------------------------------------------*/
-/*--- end                                        vg_procselfmaps.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c
deleted file mode 100644
index 0ad56b10c5..0000000000
--- a/coregrind/vg_scheduler.c
+++ /dev/null
@@ -1,3426 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A user-space pthreads implementation.         vg_scheduler.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-#include "vg_constants.h"
-#include "valgrind.h" /* for VG_USERREQ__MAKE_NOACCESS and
-                         VG_USERREQ__DO_LEAK_CHECK */
-
-/* BORKAGE/ISSUES as of 29 May 02
-
-- Currently, when a signal is run, just the ThreadStatus.status fields 
-  are saved in the signal frame, along with the CPU state.  Question: 
-  should I also save and restore:
-     ThreadStatus.joiner 
-     ThreadStatus.waited_on_mid
-     ThreadStatus.awaken_at
-     ThreadStatus.retval
-  Currently unsure, and so am not doing so.
-
-- Signals interrupting read/write and nanosleep: SA_RESTART settings.
-  Read/write correctly return with EINTR when SA_RESTART isn't
-  specified and they are interrupted by a signal.  nanosleep just
-  pretends signals don't exist -- should be fixed.
-
-- So, what's the deal with signals and mutexes?  If a thread is
-  blocked on a mutex, or for a condition variable for that matter, can
-  signals still be delivered to it?  This has serious consequences --
-  deadlocks, etc.
-
-- Signals still not really right.  Each thread should have its
-  own pending-set, but there is just one process-wide pending set.
-
-  TODO for valgrind-1.0:
-
-- Update assertion checking in scheduler_sanity().
-
-  TODO sometime:
-
-- poll() in the vg_libpthread.c -- should it handle the nanosleep
-  being interrupted by a signal?  Ditto accept?
-
-- Mutex scrubbing - clearup_after_thread_exit: look for threads
-  blocked on mutexes held by the exiting thread, and release them
-  appropriately. (??)
-
-- pthread_atfork
-
-*/
-
-
-/* ---------------------------------------------------------------------
-   Types and globals for the scheduler.
-   ------------------------------------------------------------------ */
-
-/* type ThreadId is defined in vg_include.h. */
-
-/* struct ThreadState is defined in vg_include.h. */
-
-/* Globals.  A statically allocated array of threads.  NOTE: [0] is
-   never used, to simplify the simulation of initialisers for
-   LinuxThreads. */
-ThreadState VG_(threads)[VG_N_THREADS];
-
-/* The process' fork-handler stack. */
-static Int              vg_fhstack_used = 0;
-static ForkHandlerEntry vg_fhstack[VG_N_FORKHANDLERSTACK];
-
-
-/* The tid of the thread currently in VG_(baseBlock). */
-static Int vg_tid_currently_in_baseBlock = VG_INVALID_THREADID;
-
-
-/* vg_oursignalhandler() might longjmp().  Here's the jmp_buf. */
-jmp_buf VG_(scheduler_jmpbuf);
-/* This says whether scheduler_jmpbuf is actually valid.  Needed so
-   that our signal handler doesn't longjmp when the buffer isn't
-   actually valid. */
-Bool    VG_(scheduler_jmpbuf_valid) = False;
-/* ... and if so, here's the signal which caused it to do so. */
-Int     VG_(longjmpd_on_signal);
-
-
-/* Machinery to keep track of which threads are waiting on which
-   fds. */
-/* One slot of the vg_waiting_fds table: which thread is waiting on
-   which fd, in which syscall, and whether that fd is ready yet. */
-typedef
-   struct {
-      /* The thread which made the request. */
-      ThreadId tid;
-
-      /* The next two fields describe the request. */
-      /* File descriptor waited for.  -1 means this slot is not in use */
-      Int      fd;
-      /* The syscall number the fd is used in. */
-      Int      syscall_no;
-
-      /* False => still waiting for select to tell us the fd is ready
-         to go.  True => the fd is ready, but the results have not yet
-         been delivered back to the calling thread.  Once the latter
-         happens, this entire record is marked as no longer in use, by
-         making the fd field be -1.  */
-      Bool     ready; 
-   }
-   VgWaitedOnFd;
-
-static VgWaitedOnFd vg_waiting_fds[VG_N_WAITING_FDS];
-
-
-/* Keeping track of keys. */
-/* One slot of the vg_thread_keys table. */
-typedef
-   struct {
-      /* Has this key been allocated ? */
-      Bool inuse;
-      /* If .inuse==True, records the address of the associated
-         destructor, or NULL if none. */
-      void (*destructor)(void*);
-   }
-   ThreadKeyState;
-
-/* And our array of thread keys. */
-static ThreadKeyState vg_thread_keys[VG_N_THREAD_KEYS];
-
-typedef UInt ThreadKey;
-
-
-/* Forwards */
-static void do_client_request ( ThreadId tid );
-static void scheduler_sanity ( void );
-static void do_pthread_cond_timedwait_TIMEOUT ( ThreadId tid );
-
-
-/* ---------------------------------------------------------------------
-   Helper functions for the scheduler.
-   ------------------------------------------------------------------ */
-
-/* True iff tid is in 1 .. VG_N_THREADS-1 and its slot in
-   VG_(threads) is not VgTs_Empty. */
-__inline__
-Bool VG_(is_valid_tid) ( ThreadId tid )
-{
-   /* tid is unsigned, hence no < 0 test. */
-   if (tid == 0 || tid >= VG_N_THREADS)
-      return False;
-   return (VG_(threads)[tid].status == VgTs_Empty) ? False : True;
-}
-
-
-/* True iff tid is in 1 .. VG_N_THREADS-1, whether or not the slot is
-   in use. */
-__inline__
-Bool VG_(is_valid_or_empty_tid) ( ThreadId tid )
-{
-   /* tid is unsigned, hence no < 0 test. */
-   return (tid != 0 && tid < VG_N_THREADS) ? True : False;
-}
-
-
-/* For constructing error messages only: find a thread whose stack
-   currently contains address a, i.e. a lies between that thread's
-   stack pointer and its stack_highest_word.  Returns
-   VG_INVALID_THREADID if there is none.  The thread currently loaded
-   into VG_(baseBlock), if any, is checked against the stack pointer
-   held there rather than the one saved in its ThreadState. */
-ThreadId VG_(identify_stack_addr)( Addr a )
-{
-   ThreadId tid;
-   ThreadId resident = VG_INVALID_THREADID;
-
-   /* Deal with the baseBlock-resident thread first. */
-   if (vg_tid_currently_in_baseBlock != VG_INVALID_THREADID) {
-      resident = vg_tid_currently_in_baseBlock;
-      if (VG_(baseBlock)[VGOFF_(m_esp)] <= a
-          && a <= VG_(threads)[resident].stack_highest_word) 
-         return resident;
-   }
-
-   /* Then every other non-empty slot, using its saved m_esp. */
-   for (tid = 1; tid < VG_N_THREADS; tid++) {
-      if (VG_(threads)[tid].status == VgTs_Empty) continue;
-      if (tid == resident) continue;
-      if (a < VG_(threads)[tid].m_esp) continue;
-      if (a > VG_(threads)[tid].stack_highest_word) continue;
-      return tid;
-   }
-   return VG_INVALID_THREADID;
-}
- 
-
-/* Dump the scheduler's view of the world: for each in-use thread,
-   its status, associated mutex/condition variable, and an execution
-   context built from its saved %eip/%ebp. */
-void VG_(pp_sched_status) ( void )
-{
-   Int          t;
-   ThreadState* tst;
-
-   VG_(printf)("\nsched status:\n");
-
-   for (t = 1; t < VG_N_THREADS; t++) {
-      tst = &VG_(threads)[t];
-      if (tst->status == VgTs_Empty)
-         continue;
-
-      VG_(printf)("\nThread %d: status = ", t);
-      switch (tst->status) {
-         case VgTs_Runnable:
-            VG_(printf)("Runnable");
-            break;
-         case VgTs_WaitFD:
-            VG_(printf)("WaitFD");
-            break;
-         case VgTs_WaitJoinee:
-            VG_(printf)("WaitJoinee(%d)", tst->joiner_jee_tid);
-            break;
-         case VgTs_WaitJoiner:
-            VG_(printf)("WaitJoiner");
-            break;
-         case VgTs_Sleeping:
-            VG_(printf)("Sleeping");
-            break;
-         case VgTs_WaitMX:
-            VG_(printf)("WaitMX");
-            break;
-         case VgTs_WaitCV:
-            VG_(printf)("WaitCV");
-            break;
-         case VgTs_WaitSIG:
-            VG_(printf)("WaitSIG");
-            break;
-         default:
-            VG_(printf)("???");
-            break;
-      }
-
-      VG_(printf)(", associated_mx = %p, associated_cv = %p\n",
-                  tst->associated_mx, tst->associated_cv );
-      VG_(pp_ExeContext)(
-         VG_(get_ExeContext)( False, tst->m_eip, tst->m_ebp ));
-   }
-
-   VG_(printf)("\n");
-}
-
-/* Record that thread tid is waiting for syscall syscall_no to become
-   possible on fd.  Claims the first free slot in vg_waiting_fds[]
-   (a slot is free when its .fd is -1) and panics if there is none. */
-static
-void add_waiting_fd ( ThreadId tid, Int fd, Int syscall_no )
-{
-   Int slot = 0;
-
-   /* -1 is the "slot unused" marker, so it can never be stored as a
-      real fd. */
-   vg_assert(fd != -1); /* avoid total chaos */
-
-   while (slot < VG_N_WAITING_FDS && vg_waiting_fds[slot].fd != -1)
-      slot++;
-
-   if (slot == VG_N_WAITING_FDS)
-      VG_(panic)("add_waiting_fd: VG_N_WAITING_FDS is too low");
-
-   vg_waiting_fds[slot].syscall_no = syscall_no;
-   vg_waiting_fds[slot].ready      = False;
-   vg_waiting_fds[slot].tid        = tid;
-   vg_waiting_fds[slot].fd         = fd;
-}
-
-
-
-/* Emit a scheduler trace line as a debug message, tagged with the
-   id of the thread it concerns.  'what' is the text to show. */
-static
-void print_sched_event ( ThreadId tid, Char* what )
-{
-   VG_(message)(Vg_DebugMsg, "  SCHED[%d]: %s", tid, what );
-}
-
-
-/* Emit a pthread trace line as a debug message, tagged with the id
-   of the thread it concerns.  'what' is the text to show. */
-static
-void print_pthread_event ( ThreadId tid, Char* what )
-{
-   VG_(message)(Vg_DebugMsg, "PTHREAD[%d]: %s", tid, what );
-}
-
-
-/* Map a VG_TRC_* code to a short printable name, for tracing.
-   Unrecognised codes yield "??UNKNOWN??". */
-static
-Char* name_of_sched_event ( UInt event )
-{
-   if (event == VG_TRC_EBP_JMP_SYSCALL)    return "SYSCALL";
-   if (event == VG_TRC_EBP_JMP_CLIENTREQ)  return "CLIENTREQ";
-   if (event == VG_TRC_INNER_COUNTERZERO)  return "COUNTERZERO";
-   if (event == VG_TRC_INNER_FASTMISS)     return "FASTMISS";
-   if (event == VG_TRC_UNRESUMABLE_SIGNAL) return "FATALSIGNAL";
-   return "??UNKNOWN??";
-}
-
-
-/* Create a translation of the client basic block beginning at
-   orig_addr, and add it to the translation cache & translation table.
-   This probably doesn't really belong here, but, hey ... 
-
-   tid identifies the thread on whose behalf the translation is made;
-   its ThreadState is handed to VG_(translate).
-*/
-static
-void create_translation_for ( ThreadId tid, Addr orig_addr )
-{
-   Addr    trans_addr;
-   TTEntry tte;
-   Int orig_size, trans_size;
-   /* Ensure there is space to hold a translation. */
-   VG_(maybe_do_lru_pass)();
-   /* Translate.  The original and translated sizes and the address
-      of the (temporary) translated code come back via the out
-      parameters. */
-   VG_(translate)( &VG_(threads)[tid],
-                   orig_addr, &orig_size, &trans_addr, &trans_size );
-   /* Since the .orig_size and .trans_size fields are
-      UShort, be paranoid. */
-   vg_assert(orig_size > 0 && orig_size < 65536);
-   vg_assert(trans_size > 0 && trans_size < 65536);
-   tte.orig_size  = orig_size;
-   tte.orig_addr  = orig_addr;
-   tte.trans_size = trans_size;
-   /* Copy data at trans_addr into the translation cache.
-      Returned pointer is to the code, not to the 4-byte
-      header. */
-   tte.trans_addr = VG_(copy_to_transcache)
-                       ( trans_addr, trans_size );
-   tte.mru_epoch  = VG_(current_epoch);
-   /* Free the intermediary -- was allocated by VG_(emit_code). */
-   VG_(jitfree)( (void*)trans_addr );
-   /* Add to trans tab and set back pointer. */
-   VG_(add_to_trans_tab) ( &tte );
-   /* Update stats: both the per-epoch and the overall counters of
-      translations made and of original/translated bytes. */
-   VG_(this_epoch_in_count) ++;
-   VG_(this_epoch_in_osize) += orig_size;
-   VG_(this_epoch_in_tsize) += trans_size;
-   VG_(overall_in_count) ++;
-   VG_(overall_in_osize) += orig_size;
-   VG_(overall_in_tsize) += trans_size;
-}
-
-
-/* Find an unused ThreadState record and return its index.  Slot 0
-   is never considered.  If every slot is in use, complain and
-   panic. */
-static
-ThreadId vg_alloc_ThreadState ( void )
-{
-   Int slot = 1;
-   while (slot < VG_N_THREADS) {
-      if (VG_(threads)[slot].status == VgTs_Empty)
-         return slot;
-      slot++;
-   }
-   VG_(printf)("vg_alloc_ThreadState: no free slots available\n");
-   VG_(printf)("Increase VG_N_THREADS, rebuild and try again.\n");
-   VG_(panic)("VG_N_THREADS is too low");
-   /*NOTREACHED*/
-}
-
-
-/* Return a pointer to the ThreadState of the thread currently loaded
-   in VG_(baseBlock).  Asserts that such a thread exists and is
-   valid. */
-ThreadState* VG_(get_current_thread_state) ( void )
-{
-   vg_assert(VG_(is_valid_tid)(vg_tid_currently_in_baseBlock));
-   return & VG_(threads)[vg_tid_currently_in_baseBlock];
-}
-
-
-/* Return the ThreadId of the thread currently loaded in
-   VG_(baseBlock).  Asserts that such a thread exists and is valid. */
-ThreadId VG_(get_current_tid) ( void )
-{
-   vg_assert(VG_(is_valid_tid)(vg_tid_currently_in_baseBlock));
-   return vg_tid_currently_in_baseBlock;
-}
-
-
-/* Copy the saved state of a thread into VG_(baseBlock), ready for it
-   to be run.  Requires that no thread is currently loaded; on return
-   tid is recorded as the loaded thread. */
-__inline__
-void VG_(load_thread_state) ( ThreadId tid )
-{
-   Int i;
-   vg_assert(vg_tid_currently_in_baseBlock == VG_INVALID_THREADID);
-
-   /* Integer registers, flags and program counter. */
-   VG_(baseBlock)[VGOFF_(m_eax)] = VG_(threads)[tid].m_eax;
-   VG_(baseBlock)[VGOFF_(m_ebx)] = VG_(threads)[tid].m_ebx;
-   VG_(baseBlock)[VGOFF_(m_ecx)] = VG_(threads)[tid].m_ecx;
-   VG_(baseBlock)[VGOFF_(m_edx)] = VG_(threads)[tid].m_edx;
-   VG_(baseBlock)[VGOFF_(m_esi)] = VG_(threads)[tid].m_esi;
-   VG_(baseBlock)[VGOFF_(m_edi)] = VG_(threads)[tid].m_edi;
-   VG_(baseBlock)[VGOFF_(m_ebp)] = VG_(threads)[tid].m_ebp;
-   VG_(baseBlock)[VGOFF_(m_esp)] = VG_(threads)[tid].m_esp;
-   VG_(baseBlock)[VGOFF_(m_eflags)] = VG_(threads)[tid].m_eflags;
-   VG_(baseBlock)[VGOFF_(m_eip)] = VG_(threads)[tid].m_eip;
-
-   /* FPU state, one word at a time. */
-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      VG_(baseBlock)[VGOFF_(m_fpustate) + i] = VG_(threads)[tid].m_fpu[i];
-
-   /* Shadow (sh_*) copies of the integer registers and flags. */
-   VG_(baseBlock)[VGOFF_(sh_eax)] = VG_(threads)[tid].sh_eax;
-   VG_(baseBlock)[VGOFF_(sh_ebx)] = VG_(threads)[tid].sh_ebx;
-   VG_(baseBlock)[VGOFF_(sh_ecx)] = VG_(threads)[tid].sh_ecx;
-   VG_(baseBlock)[VGOFF_(sh_edx)] = VG_(threads)[tid].sh_edx;
-   VG_(baseBlock)[VGOFF_(sh_esi)] = VG_(threads)[tid].sh_esi;
-   VG_(baseBlock)[VGOFF_(sh_edi)] = VG_(threads)[tid].sh_edi;
-   VG_(baseBlock)[VGOFF_(sh_ebp)] = VG_(threads)[tid].sh_ebp;
-   VG_(baseBlock)[VGOFF_(sh_esp)] = VG_(threads)[tid].sh_esp;
-   VG_(baseBlock)[VGOFF_(sh_eflags)] = VG_(threads)[tid].sh_eflags;
-
-   vg_tid_currently_in_baseBlock = tid;
-}
-
-
-/* Copy the state of a thread from VG_(baseBlock), presumably after it
-   has been descheduled.  For sanity-check purposes, fill the vacated
-   VG_(baseBlock) with garbage so as to make the system more likely to
-   fail quickly if we erroneously continue to poke around inside
-   VG_(baseBlock) without first doing a load_thread_state().  
-
-   Requires that some thread is currently loaded; on return no thread
-   is recorded as loaded.  Note that the assertion only checks that
-   *a* thread is loaded, not that it is tid.
-*/
-__inline__
-void VG_(save_thread_state) ( ThreadId tid )
-{
-   Int i;
-   const UInt junk = 0xDEADBEEF;
-
-   vg_assert(vg_tid_currently_in_baseBlock != VG_INVALID_THREADID);
-
-   /* Integer registers, flags and program counter. */
-   VG_(threads)[tid].m_eax = VG_(baseBlock)[VGOFF_(m_eax)];
-   VG_(threads)[tid].m_ebx = VG_(baseBlock)[VGOFF_(m_ebx)];
-   VG_(threads)[tid].m_ecx = VG_(baseBlock)[VGOFF_(m_ecx)];
-   VG_(threads)[tid].m_edx = VG_(baseBlock)[VGOFF_(m_edx)];
-   VG_(threads)[tid].m_esi = VG_(baseBlock)[VGOFF_(m_esi)];
-   VG_(threads)[tid].m_edi = VG_(baseBlock)[VGOFF_(m_edi)];
-   VG_(threads)[tid].m_ebp = VG_(baseBlock)[VGOFF_(m_ebp)];
-   VG_(threads)[tid].m_esp = VG_(baseBlock)[VGOFF_(m_esp)];
-   VG_(threads)[tid].m_eflags = VG_(baseBlock)[VGOFF_(m_eflags)];
-   VG_(threads)[tid].m_eip = VG_(baseBlock)[VGOFF_(m_eip)];
-
-   /* FPU state, one word at a time. */
-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      VG_(threads)[tid].m_fpu[i] = VG_(baseBlock)[VGOFF_(m_fpustate) + i];
-
-   /* Shadow (sh_*) copies of the integer registers and flags. */
-   VG_(threads)[tid].sh_eax = VG_(baseBlock)[VGOFF_(sh_eax)];
-   VG_(threads)[tid].sh_ebx = VG_(baseBlock)[VGOFF_(sh_ebx)];
-   VG_(threads)[tid].sh_ecx = VG_(baseBlock)[VGOFF_(sh_ecx)];
-   VG_(threads)[tid].sh_edx = VG_(baseBlock)[VGOFF_(sh_edx)];
-   VG_(threads)[tid].sh_esi = VG_(baseBlock)[VGOFF_(sh_esi)];
-   VG_(threads)[tid].sh_edi = VG_(baseBlock)[VGOFF_(sh_edi)];
-   VG_(threads)[tid].sh_ebp = VG_(baseBlock)[VGOFF_(sh_ebp)];
-   VG_(threads)[tid].sh_esp = VG_(baseBlock)[VGOFF_(sh_esp)];
-   VG_(threads)[tid].sh_eflags = VG_(baseBlock)[VGOFF_(sh_eflags)];
-
-   /* Fill it up with junk.  Only the m_* registers and the FPU state
-      are overwritten; the sh_* slots are left as they are. */
-   VG_(baseBlock)[VGOFF_(m_eax)] = junk;
-   VG_(baseBlock)[VGOFF_(m_ebx)] = junk;
-   VG_(baseBlock)[VGOFF_(m_ecx)] = junk;
-   VG_(baseBlock)[VGOFF_(m_edx)] = junk;
-   VG_(baseBlock)[VGOFF_(m_esi)] = junk;
-   VG_(baseBlock)[VGOFF_(m_edi)] = junk;
-   VG_(baseBlock)[VGOFF_(m_ebp)] = junk;
-   VG_(baseBlock)[VGOFF_(m_esp)] = junk;
-   VG_(baseBlock)[VGOFF_(m_eflags)] = junk;
-   VG_(baseBlock)[VGOFF_(m_eip)] = junk;
-
-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      VG_(baseBlock)[VGOFF_(m_fpustate) + i] = junk;
-
-   vg_tid_currently_in_baseBlock = VG_INVALID_THREADID;
-}
-
-
-/* Run the thread tid for a while, and return a VG_TRC_* value to the
-   scheduler indicating what happened.  The thread must be valid and
-   Runnable, VG_(bbs_to_go) must be nonzero, and the scheduler jmpbuf
-   must not already be marked valid.  The thread's state is loaded
-   into VG_(baseBlock) for the duration and saved back afterwards. */
-static
-UInt run_thread_for_a_while ( ThreadId tid )
-{
-   /* volatile, presumably because trc is written between the setjmp
-      and a possible longjmp and read afterwards. */
-   volatile UInt trc = 0;
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-   vg_assert(VG_(bbs_to_go) > 0);
-   vg_assert(!VG_(scheduler_jmpbuf_valid));
-
-   VGP_PUSHCC(VgpRun);
-   VG_(load_thread_state) ( tid );
-   if (__builtin_setjmp(VG_(scheduler_jmpbuf)) == 0) {
-      /* try this ... */
-      /* The jmpbuf is flagged valid only while client code is
-         actually running. */
-      VG_(scheduler_jmpbuf_valid) = True;
-      trc = VG_(run_innerloop)();
-      VG_(scheduler_jmpbuf_valid) = False;
-      /* We get here if the client didn't take a fault. */
-   } else {
-      /* We get here if the client took a fault, which caused our
-         signal handler to longjmp. */
-      VG_(scheduler_jmpbuf_valid) = False;
-      /* The inner loop never returned, so trc must still hold its
-         initial value. */
-      vg_assert(trc == 0);
-      trc = VG_TRC_UNRESUMABLE_SIGNAL;
-   }
-
-   vg_assert(!VG_(scheduler_jmpbuf_valid));
-
-   VG_(save_thread_state) ( tid );
-   VGP_POPCC;
-   return trc;
-}
-
-
-/* Advance the LRU epoch: bump the epoch counter, report this epoch's
-   translation traffic if verbosity is above 2, and then zero the
-   per-epoch in/out statistics. */
-static
-void increment_epoch ( void )
-{
-   UInt tt_used, tc_used;
-
-   VG_(current_epoch)++;
-
-   if (VG_(clo_verbosity) > 2) {
-      VG_(get_tt_tc_used) ( &tt_used, &tc_used );
-      VG_(message)(
-         Vg_UserMsg,
-         "%lu bbs, in: %d (%d -> %d), out %d (%d -> %d), TT %d, TC %d",
-         VG_(bbs_done),
-         VG_(this_epoch_in_count),
-         VG_(this_epoch_in_osize), VG_(this_epoch_in_tsize),
-         VG_(this_epoch_out_count),
-         VG_(this_epoch_out_osize), VG_(this_epoch_out_tsize),
-         tt_used, tc_used
-      );
-   }
-
-   /* Start the new epoch with clean counters. */
-   VG_(this_epoch_out_tsize) = 0;
-   VG_(this_epoch_out_osize) = 0;
-   VG_(this_epoch_out_count) = 0;
-   VG_(this_epoch_in_tsize)  = 0;
-   VG_(this_epoch_in_osize)  = 0;
-   VG_(this_epoch_in_count)  = 0;
-}
-
-
-/* Reset thread slot tid to the Empty state: no associated mutex or
-   condition variable, no join partner, default cancellation
-   settings, empty signal sets and no thread-specific data.  The
-   stack_* fields are not touched here. */
-static 
-void mostly_clear_thread_record ( ThreadId tid )
-{
-   Int          k;
-   ThreadState* tst;
-
-   vg_assert(tid >= 0 && tid < VG_N_THREADS);
-   tst = &VG_(threads)[tid];
-
-   tst->tid                  = tid;
-   tst->status               = VgTs_Empty;
-   tst->associated_mx        = NULL;
-   tst->associated_cv        = NULL;
-   tst->awaken_at            = 0;
-   tst->joinee_retval        = NULL;
-   tst->joiner_thread_return = NULL;
-   tst->joiner_jee_tid       = VG_INVALID_THREADID;
-   tst->detached             = False;
-
-   /* Cancellation state. */
-   tst->cancel_st   = True; /* PTHREAD_CANCEL_ENABLE */
-   tst->cancel_ty   = True; /* PTHREAD_CANCEL_DEFERRED */
-   tst->cancel_pend = NULL; /* not pending */
-
-   tst->custack_used       = 0;
-   tst->n_signals_returned = 0;
-
-   VG_(ksigemptyset)(&tst->sig_mask);
-   VG_(ksigemptyset)(&tst->sigs_waited_for);
-
-   for (k = 0; k < VG_N_THREAD_KEYS; k++)
-      tst->specifics[k] = NULL;
-}
-
-
-/* Initialise the scheduler.  Create a single "main" thread ready to
-   run, with special ThreadId of one.  This is called at startup; the
-   caller takes care that the client's state is parked in
-   VG_(baseBlock).  
-*/
-void VG_(scheduler_init) ( void )
-{
-   Int      i;
-   Addr     startup_esp;
-   ThreadId tid_main;
-
-   startup_esp = VG_(baseBlock)[VGOFF_(m_esp)];
-
-   /* Refuse to start unless the client's initial %esp matches one of
-      the three known startup stack bases. */
-   if (VG_STACK_MATCHES_BASE(startup_esp, VG_STARTUP_STACK_BASE_1)
-       || VG_STACK_MATCHES_BASE(startup_esp, VG_STARTUP_STACK_BASE_2) 
-       || VG_STACK_MATCHES_BASE(startup_esp, VG_STARTUP_STACK_BASE_3)) {
-      /* Jolly good! */
-   } else {
-      VG_(printf)("%%esp at startup = %p is not near %p, %p or %p; aborting\n", 
-                  (void*)startup_esp, 
-                  (void*)VG_STARTUP_STACK_BASE_1,
-                  (void*)VG_STARTUP_STACK_BASE_2,
-                  (void*)VG_STARTUP_STACK_BASE_3 );
-      VG_(panic)("unexpected %esp at startup");
-   }
-
-   /* Mark every thread slot, including the never-allocated slot 0,
-      as empty and stackless. */
-   for (i = 0 /* NB; not 1 */; i < VG_N_THREADS; i++) {
-      mostly_clear_thread_record(i);
-      VG_(threads)[i].stack_size           = 0;
-      VG_(threads)[i].stack_base           = (Addr)NULL;
-      VG_(threads)[i].stack_highest_word   = (Addr)NULL;
-   }
-
-   for (i = 0; i < VG_N_WAITING_FDS; i++)
-      vg_waiting_fds[i].fd = -1; /* not in use */
-
-   for (i = 0; i < VG_N_THREAD_KEYS; i++) {
-      vg_thread_keys[i].inuse      = False;
-      vg_thread_keys[i].destructor = NULL;
-   }
-
-   vg_fhstack_used = 0;
-
-   /* Allocate the main thread's slot and assert that it is ThreadId
-      1, which has certain magic properties. */
-   tid_main = vg_alloc_ThreadState();
-   vg_assert(tid_main == 1); 
-   VG_(threads)[tid_main].status = VgTs_Runnable;
-
-   /* Copy VG_(baseBlock) state to tid_main's slot.  The main thread
-      is first declared to be the loaded one, since
-      VG_(save_thread_state) asserts that some thread is loaded. */
-   vg_tid_currently_in_baseBlock = tid_main;
-   VG_(save_thread_state) ( tid_main );
-
-   /* The main thread's stack is taken to extend up to its %esp at
-      startup. */
-   VG_(threads)[tid_main].stack_highest_word 
-      = VG_(threads)[tid_main].m_esp /* -4  ??? */;
-
-   /* So now ... */
-   vg_assert(vg_tid_currently_in_baseBlock == VG_INVALID_THREADID);
-
-   /* Not running client code right now. */
-   VG_(scheduler_jmpbuf_valid) = False;
-}
-
-
-/* Turn on O_NONBLOCK for fd, leaving its other status flags alone.
-   What if fd isn't a valid fd?  Then the assertions fail. */
-static
-void set_fd_nonblocking ( Int fd )
-{
-   Int flags;
-
-   flags = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
-   vg_assert(!VG_(is_kerror)(flags));
-
-   flags = VG_(fcntl)( fd, VKI_F_SETFL, flags | VKI_O_NONBLOCK );
-   vg_assert(!VG_(is_kerror)(flags));
-}
-
-/* Turn off O_NONBLOCK for fd, leaving its other status flags alone.
-   Asserts that both fcntl calls succeed. */
-static
-void set_fd_blocking ( Int fd )
-{
-   Int flags;
-
-   flags = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
-   vg_assert(!VG_(is_kerror)(flags));
-
-   flags = VG_(fcntl)( fd, VKI_F_SETFL, flags & ~VKI_O_NONBLOCK );
-   vg_assert(!VG_(is_kerror)(flags));
-}
-
-/* True iff fd is in blocking mode, that is, O_NONBLOCK is clear in
-   its status flags.  Asserts that fd can be queried. */
-static
-Bool fd_is_blockful ( Int fd )
-{
-   Int flags = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
-   vg_assert(!VG_(is_kerror)(flags));
-   if (flags & VKI_O_NONBLOCK)
-      return False;
-   return True;
-}
-
-/* True iff fd's status flags can be fetched with fcntl(F_GETFL)
-   without the kernel reporting an error. */
-static
-Bool fd_is_valid ( Int fd )
-{
-   Int flags = VG_(fcntl)( fd, VKI_F_GETFL, 0 );
-   if (VG_(is_kerror)(flags))
-      return False;
-   return True;
-}
-
-
-
-/* vthread tid is returning from a signal handler; modify its
-   stack/regs accordingly. */
-
-/* [Helper fn for handle_signal_return] tid, assumed to be in WaitFD
-   for read or write, has been interrupted by a signal.  Find and
-   clear the relevant vg_waiting_fds[] entry.  Most of the code in
-   this procedure is total paranoia, if you look closely.
-
-   Only slots that are in use (.fd != -1) are considered.  A slot
-   that has been released keeps its old .tid value, so matching on
-   .tid alone could count a stale slot as a second waiter, or clear
-   the wrong slot. */
-static
-void cleanup_waiting_fd_table ( ThreadId tid )
-{
-   Int  i, waiters, slot;
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(threads)[tid].status == VgTs_WaitFD);
-   vg_assert(VG_(threads)[tid].m_eax == __NR_read 
-             || VG_(threads)[tid].m_eax == __NR_write);
-
-   /* Excessively paranoidly ... find the fd this op was waiting
-      for, and mark it as not being waited on. */
-   waiters = 0;
-   slot    = -1;
-   for (i = 0; i < VG_N_WAITING_FDS; i++) {
-      if (vg_waiting_fds[i].fd == -1 /* not in use */)
-         continue;
-      if (vg_waiting_fds[i].tid != tid)
-         continue;
-      waiters++;
-      slot = i;
-      /* The recorded syscall must be the one the thread is blocked
-         in. */
-      vg_assert(vg_waiting_fds[i].syscall_no == VG_(threads)[tid].m_eax);
-   }
-
-   /* The thread must be waiting on exactly one fd. */
-   vg_assert(waiters == 1);
-   vg_assert(slot >= 0 && slot < VG_N_WAITING_FDS);
-   vg_assert(vg_waiting_fds[slot].fd != -1);
-   vg_waiting_fds[slot].fd = -1; /* not in use */
-}
-
-
-/* Thread tid is returning from a signal handler.  Bump its
-   signals-returned counter, let VG_(signal_returns) do its work, and
-   then, unless that call says blocked syscalls are to be restarted,
-   force an interrupted read()/write()/nanosleep() to return EINTR
-   and make the thread Runnable again.  Panics if the thread is in
-   WaitFD for any syscall other than read or write. */
-static
-void handle_signal_return ( ThreadId tid )
-{
-   Char msg_buf[100];
-   Bool restart_blocked_syscalls;
-   struct vki_timespec * rem;
-
-   vg_assert(VG_(is_valid_tid)(tid));
-
-   /* Increment signal-returned counter.  Used only to implement
-      pause(). */
-   VG_(threads)[tid].n_signals_returned++;
-
-   restart_blocked_syscalls = VG_(signal_returns)(tid);
-
-   if (restart_blocked_syscalls)
-      /* Easy; we don't have to do anything. */
-      return;
-
-   if (VG_(threads)[tid].status == VgTs_WaitFD
-       && (VG_(threads)[tid].m_eax == __NR_read 
-           || VG_(threads)[tid].m_eax == __NR_write)) {
-      /* read() or write() interrupted.  Force a return with EINTR.
-         The waiting-fd entry must be cleared first, while %eax still
-         holds the syscall number that cleanup_waiting_fd_table
-         checks against. */
-      cleanup_waiting_fd_table(tid);
-      /* Use SET_EAX, as the nanosleep case below does, rather than
-         writing m_eax directly, so that both EINTR paths set the
-         result in the same way.  NOTE(review): SET_EAX is defined
-         elsewhere; confirm it does everything wanted here. */
-      SET_EAX(tid, -VKI_EINTR);
-      VG_(threads)[tid].status = VgTs_Runnable;
-
-      if (VG_(clo_trace_sched)) {
-         VG_(sprintf)(msg_buf, 
-            "read() / write() interrupted by signal; return EINTR" );
-         print_sched_event(tid, msg_buf);
-      }
-      return;
-   }
-
-   if (VG_(threads)[tid].status == VgTs_Sleeping
-       && VG_(threads)[tid].m_eax == __NR_nanosleep) {
-      /* We interrupted a nanosleep().  The right thing to do is to
-         write the unused time to nanosleep's second param, but that's
-         too much effort ... we just say that 1 nanosecond was not
-         used, and return EINTR. */
-      rem = (struct vki_timespec *)VG_(threads)[tid].m_ecx; /* arg2 */
-      if (rem != NULL) {
-         rem->tv_sec = 0;
-         rem->tv_nsec = 1;
-      }
-      SET_EAX(tid, -VKI_EINTR);
-      VG_(threads)[tid].status = VgTs_Runnable;
-      return;
-   }
-
-   /* Any other syscall that left the thread in WaitFD is one we do
-      not know how to interrupt. */
-   if (VG_(threads)[tid].status == VgTs_WaitFD) {
-      VG_(panic)("handle_signal_return: unknown interrupted syscall");
-   }
-
-   /* All other cases?  Just return. */
-}
-
-
-/* Perform the syscall that Runnable thread tid has requested (the
-   syscall number is in its %eax).  Three cases:
-     - nanosleep: the thread is put into the Sleeping state with a
-       wake-up time; no kernel call is made here.
-     - anything other than read/write: assumed non-blocking and
-       handed to VG_(perform_assumed_nonblocking_syscall).
-     - read/write: tried speculatively with the fd temporarily set
-       non-blocking.  If it would have blocked, the thread is put
-       into WaitFD and the fd is entered in the waiting-fd table. */
-static
-void sched_do_syscall ( ThreadId tid )
-{
-   UInt saved_eax;
-   UInt res, syscall_no;
-   UInt fd;
-   Bool orig_fd_blockness;
-   Char msg_buf[100];
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-
-   syscall_no = VG_(threads)[tid].m_eax; /* syscall number */
-
-   if (syscall_no == __NR_nanosleep) {
-      UInt t_now, t_awaken;
-      struct vki_timespec* req;
-      req = (struct vki_timespec*)VG_(threads)[tid].m_ebx; /* arg1 */
-      t_now = VG_(read_millisecond_timer)();     
-      /* Wake-up time in milliseconds: now + seconds*1000 +
-         nanoseconds/1000000.  Note that req is dereferenced without
-         any check. */
-      t_awaken 
-         = t_now
-           + (UInt)1000ULL * (UInt)(req->tv_sec) 
-           + (UInt)(req->tv_nsec) / 1000000;
-      VG_(threads)[tid].status    = VgTs_Sleeping;
-      VG_(threads)[tid].awaken_at = t_awaken;
-      if (VG_(clo_trace_sched)) {
-         VG_(sprintf)(msg_buf, "at %d: nanosleep for %d", 
-                               t_now, t_awaken-t_now);
-	 print_sched_event(tid, msg_buf);
-      }
-      /* Force the scheduler to run something else for a while. */
-      return;
-   }
-
-   if (syscall_no != __NR_read && syscall_no != __NR_write) {
-      /* We think it's non-blocking.  Just do it in the normal way. */
-      VG_(perform_assumed_nonblocking_syscall)(tid);
-      /* The thread is still runnable. */
-      return;
-   }
-
-   /* Set the fd to nonblocking, and do the syscall, which will return
-      immediately, in order to lodge a request with the Linux kernel.
-      We later poll for I/O completion using select().  */
-
-   fd = VG_(threads)[tid].m_ebx /* arg1 */;
-
-   /* Deal with error case immediately. */
-   if (!fd_is_valid(fd)) {
-      VG_(message)(Vg_UserMsg, 
-         "Warning: invalid file descriptor %d in syscall %s",
-         fd, syscall_no == __NR_read ? "read()" : "write()" );
-      /* Do the call anyway, so that the client sees the kernel's
-         own result for it. */
-      VG_(check_known_blocking_syscall)(tid, syscall_no, NULL /* PRE */);
-      KERNEL_DO_SYSCALL(tid, res);
-      VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
-      /* We're still runnable. */
-      vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-      return;
-   }
-
-   /* From here onwards we know that fd is valid. */
-
-   /* Remember the fd's original mode so it can be restored after
-      the speculative call. */
-   orig_fd_blockness = fd_is_blockful(fd);
-   set_fd_nonblocking(fd);
-   vg_assert(!fd_is_blockful(fd));
-   VG_(check_known_blocking_syscall)(tid, syscall_no, NULL /* PRE */);
-
-   /* This trashes the thread's %eax; we have to preserve it. */
-   saved_eax = VG_(threads)[tid].m_eax;
-   KERNEL_DO_SYSCALL(tid,res);
-
-   /* Restore original blockfulness of the fd. */
-   if (orig_fd_blockness)
-      set_fd_blocking(fd);
-   else
-      set_fd_nonblocking(fd);
-
-   if (res != -VKI_EWOULDBLOCK || !orig_fd_blockness) {
-      /* Finish off in the normal way.  Don't restore %EAX, since that
-         now (correctly) holds the result of the call.  We get here if either:
-         1.  The call didn't block, or
-         2.  The fd was already in nonblocking mode before we started to
-             mess with it.  In this case, we're not expecting to handle 
-             the I/O completion -- the client is.  So don't file a 
-             completion-wait entry. 
-      */
-      VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
-      /* We're still runnable. */
-      vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-
-   } else {
-
-      vg_assert(res == -VKI_EWOULDBLOCK && orig_fd_blockness);
-
-      /* It would have blocked.  First, restore %EAX to what it was
-         before our speculative call. */
-      VG_(threads)[tid].m_eax = saved_eax;
-      /* Put this fd in a table of fds on which we are waiting for
-         completion. The arguments for select() later are constructed
-         from this table.  */
-      add_waiting_fd(tid, fd, saved_eax /* which holds the syscall # */);
-      /* Deschedule thread until an I/O completion happens. */
-      VG_(threads)[tid].status = VgTs_WaitFD;
-      if (VG_(clo_trace_sched)) {
-         VG_(sprintf)(msg_buf,"block until I/O ready on fd %d", fd);
-	 print_sched_event(tid, msg_buf);
-      }
-
-   }
-}
-
-
-/* Find out which of the fds in vg_waiting_fds are now ready to go, by
-   making enquiries with select(), and mark them as ready.  We have to
-   wait for the requesting threads to fall into the WaitFD state
-   before we can actually finally deliver the results, so this
-   procedure doesn't do that; complete_blocked_syscalls() does it.
-
-   It might seem odd that a thread which has done a blocking syscall
-   is not in WaitFD state; the way this can happen is if it initially
-   becomes WaitFD, but then a signal is delivered to it, so it becomes
-   Runnable for a while.  In this case we have to wait for the
-   sighandler to return, whereupon the WaitFD state is resumed, and
-   only at that point can the I/O result be delivered to it.  However,
-   this point may be long after the fd is actually ready.  
-
-   So, poll_for_ready_fds() merely detects fds which are ready.
-   complete_blocked_syscalls() does the second half of the trick,
-   possibly much later: it delivers the results from ready fds to
-   threads in WaitFD state. 
-
-   As a first step, this procedure also wakes any Sleeping thread
-   whose nanosleep() wake-up time has passed.
-*/
-static
-void poll_for_ready_fds ( void )
-{
-   vki_ksigset_t      saved_procmask;
-   vki_fd_set         readfds;
-   vki_fd_set         writefds;
-   vki_fd_set         exceptfds;
-   struct vki_timeval timeout;
-   Int                fd, fd_max, i, n_ready, syscall_no, n_ok;
-   ThreadId           tid;
-   Bool               rd_ok, wr_ok, ex_ok;
-   Char               msg_buf[100];
-
-   struct vki_timespec* rem;
-   UInt                 t_now;
-
-   /* Awaken any sleeping threads whose sleep has expired.  First
-      see whether anybody is sleeping at all. */
-   for (tid = 1; tid < VG_N_THREADS; tid++)
-      if (VG_(threads)[tid].status == VgTs_Sleeping)
-         break;
-
-   /* Avoid pointless calls to VG_(read_millisecond_timer). */
-   if (tid < VG_N_THREADS) {
-      t_now = VG_(read_millisecond_timer)();
-      for (tid = 1; tid < VG_N_THREADS; tid++) {
-         if (VG_(threads)[tid].status != VgTs_Sleeping)
-            continue;
-         if (t_now >= VG_(threads)[tid].awaken_at) {
-            /* Resume this thread.  Set to zero the remaining-time
-               (second) arg of nanosleep, since it's used up all its
-               time. */
-            vg_assert(VG_(threads)[tid].m_eax == __NR_nanosleep);
-            rem = (struct vki_timespec *)VG_(threads)[tid].m_ecx; /* arg2 */
-            if (rem != NULL) {
-               rem->tv_sec = 0;
-               rem->tv_nsec = 0;
-            }
-            /* Make the syscall return 0 (success).  SET_EAX is used
-               so that this agrees with the way handle_signal_return
-               sets the result of an interrupted nanosleep.
-               NOTE(review): SET_EAX is defined elsewhere; confirm it
-               does everything wanted here. */
-            SET_EAX(tid, 0);
-            /* Reschedule this thread. */
-            VG_(threads)[tid].status = VgTs_Runnable;
-            if (VG_(clo_trace_sched)) {
-               VG_(sprintf)(msg_buf, "at %d: nanosleep done", 
-                                     t_now);
-               print_sched_event(tid, msg_buf);
-            }
-         }
-      }
-   }
-
-   /* And look for threads waiting on file descriptors which are now
-      ready for I/O.  A zero timeout makes select() a pure poll. */
-   timeout.tv_sec = 0;
-   timeout.tv_usec = 0;
-
-   VKI_FD_ZERO(&readfds);
-   VKI_FD_ZERO(&writefds);
-   VKI_FD_ZERO(&exceptfds);
-   fd_max = -1;
-   for (i = 0; i < VG_N_WAITING_FDS; i++) {
-      if (vg_waiting_fds[i].fd == -1 /* not in use */) 
-         continue;
-      if (vg_waiting_fds[i].ready /* already ready? */) 
-         continue;
-      fd = vg_waiting_fds[i].fd;
-      /* VG_(printf)("adding QUERY for fd %d\n", fd); */
-      vg_assert(fd >= 0);
-      if (fd > fd_max) 
-         fd_max = fd;
-      tid = vg_waiting_fds[i].tid;
-      vg_assert(VG_(is_valid_tid)(tid));
-      syscall_no = vg_waiting_fds[i].syscall_no;
-      switch (syscall_no) {
-         case __NR_read:
-            /* In order to catch timeout events on fds which are
-               readable and which have been ioctl(TCSETA)'d with a
-               VTIMEout, we appear to need to ask if the fd is
-               writable, for some reason.  Ask me not why.  Since this
-               is strange and potentially troublesome we only do it if
-               the user asks specially. */
-            if (VG_(strstr)(VG_(clo_weird_hacks), "ioctl-VTIME") != NULL)
-               VKI_FD_SET(fd, &writefds);
-            VKI_FD_SET(fd, &readfds); break;
-         case __NR_write: 
-            VKI_FD_SET(fd, &writefds); break;
-         default: 
-            VG_(panic)("poll_for_ready_fds: unexpected syscall");
-            /*NOTREACHED*/
-            break;
-      }
-   }
-
-   /* Short cut: if no fds are waiting, give up now. */
-   if (fd_max == -1)
-      return;
-
-   /* BLOCK ALL SIGNALS.  We don't want the complication of select()
-      getting interrupted. */
-   VG_(block_all_host_signals)( &saved_procmask );
-
-   n_ready = VG_(select)
-                ( fd_max+1, &readfds, &writefds, &exceptfds, &timeout);
-   if (VG_(is_kerror)(n_ready)) {
-      VG_(printf)("poll_for_ready_fds: select returned %d\n", n_ready);
-      VG_(panic)("poll_for_ready_fds: select failed?!");
-      /*NOTREACHED*/
-   }
-   
-   /* UNBLOCK ALL SIGNALS */
-   VG_(restore_all_host_signals)( &saved_procmask );
-
-   /* VG_(printf)("poll_for_io_completions: %d fs ready\n", n_ready); */
-
-   if (n_ready == 0)
-      return;   
-
-   /* Inspect all the fds we know about, and handle any completions that
-      have happened. */
-   for (fd = 0; fd <= fd_max; fd++) {
-      rd_ok = VKI_FD_ISSET(fd, &readfds);
-      wr_ok = VKI_FD_ISSET(fd, &writefds);
-      ex_ok = VKI_FD_ISSET(fd, &exceptfds);
-
-      n_ok = (rd_ok ? 1 : 0) + (wr_ok ? 1 : 0) + (ex_ok ? 1 : 0);
-      if (n_ok == 0) 
-         continue;
-      if (n_ok > 1) {
-         VG_(printf)("offending fd = %d\n", fd);
-         VG_(panic)("poll_for_ready_fds: multiple events on fd");
-      }
-
-      /* An I/O event completed for fd.  Find the thread which
-         requested this.  Slots already marked ready were never put
-         into the select() sets above, so this event cannot belong to
-         one of them; skip them, so that an earlier, still-undelivered
-         completion on the same fd is not mistaken for the requester. */
-      for (i = 0; i < VG_N_WAITING_FDS; i++) {
-         if (vg_waiting_fds[i].fd == -1 /* not in use */) 
-            continue;
-         if (vg_waiting_fds[i].ready /* handled earlier */)
-            continue;
-         if (vg_waiting_fds[i].fd == fd) 
-            break;
-      }
-
-      /* And a bit more paranoia ... */
-      vg_assert(i >= 0 && i < VG_N_WAITING_FDS);
-
-      /* Mark the fd as ready. */      
-      vg_assert(! vg_waiting_fds[i].ready);
-      vg_waiting_fds[i].ready = True;
-   }
-}
-
-
-/* See comment attached to poll_for_ready_fds() for explanation.
-   For every waiting-fd slot marked ready whose thread is in WaitFD,
-   perform the pending syscall for real, make the thread Runnable and
-   release the slot. */
-static
-void complete_blocked_syscalls ( void )
-{
-   Int      fd, i, res, syscall_no;
-   ThreadId tid;
-   Char     msg_buf[100];
-
-   /* Inspect all the outstanding fds we know about. */
-
-   for (i = 0; i < VG_N_WAITING_FDS; i++) {
-      if (vg_waiting_fds[i].fd == -1 /* not in use */) 
-         continue;
-      if (! vg_waiting_fds[i].ready)
-         continue;
-
-      fd  = vg_waiting_fds[i].fd;
-      tid = vg_waiting_fds[i].tid;
-      vg_assert(VG_(is_valid_tid)(tid));
-
-      /* The thread actually has to be waiting for the I/O event it
-         requested before we can deliver the result! */
-      if (VG_(threads)[tid].status != VgTs_WaitFD)
-         continue;
-
-      /* Ok, actually do it!  We can safely use %EAX as the syscall
-         number, because the speculative call made by
-         sched_do_syscall() doesn't change %EAX in the case where the
-         call would have blocked. */
-      syscall_no = vg_waiting_fds[i].syscall_no;
-      vg_assert(syscall_no == VG_(threads)[tid].m_eax);
-
-      /* In a rare case pertaining to writing into a pipe, write()
-         will block when asked to write > 4096 bytes even though the
-         kernel claims, when asked via select(), that blocking will
-         not occur for a write on that fd.  This can cause deadlocks.
-         An easy answer is to limit the size of the write to 4096
-         anyway and hope that the client program's logic can handle
-         the short write.  That sounds dubious to me, so we don't do
-         it by default. */
-      if (syscall_no == __NR_write 
-          && VG_(threads)[tid].m_edx /* arg3, count */ > 4096
-          && VG_(strstr)(VG_(clo_weird_hacks), "truncate-writes") != NULL) {
-         /* VG_(printf)("truncate write from %d to 4096\n", 
-            VG_(threads)[tid].m_edx ); */
-         VG_(threads)[tid].m_edx = 4096;
-      }
-
-      KERNEL_DO_SYSCALL(tid,res);
-      VG_(check_known_blocking_syscall)(tid, syscall_no, &res /* POST */);
-
-      /* Reschedule. */
-      VG_(threads)[tid].status = VgTs_Runnable;
-      /* Mark slot as no longer in use. */
-      vg_waiting_fds[i].fd = -1;
-      /* pp_sched_status(); */
-      if (VG_(clo_trace_sched)) {
-         VG_(sprintf)(msg_buf,"resume due to I/O completion on fd %d", fd);
-	 print_sched_event(tid, msg_buf);
-      }
-   }
-}
-
-
-/* Time out any thread that is blocked on a condition variable
-   (WaitCV) with a deadline that has now passed.  An awaken_at of
-   0xFFFFFFFF means the wait has no timeout. */
-static
-void check_for_pthread_cond_timedwait ( void )
-{
-   Int t, now;
-   for (t = 1; t < VG_N_THREADS; t++) {
-      if (VG_(threads)[t].status == VgTs_WaitCV
-          && VG_(threads)[t].awaken_at != 0xFFFFFFFF /* no timeout */) {
-         /* The clock is re-read for each candidate thread. */
-         now = VG_(read_millisecond_timer)();
-         if (now >= VG_(threads)[t].awaken_at)
-            do_pthread_cond_timedwait_TIMEOUT(t);
-      }
-   }
-}
-
-
-/* Sleep for 20 milliseconds, or less if a signal interrupts the
-   sleep.  Any other outcome of the nanosleep call fails an
-   assertion. */
-static
-void nanosleep_for_a_while ( void )
-{
-   struct vki_timespec wanted;
-   struct vki_timespec unslept;
-   Int                 rc;
-
-   wanted.tv_nsec = 20 * 1000 * 1000;
-   wanted.tv_sec  = 0;
-
-   rc = VG_(nanosleep)( &wanted, &unslept );
-   vg_assert(rc == 0 /* ok */ || rc == 1 /* interrupted by signal */);
-}
-
-
-/* ---------------------------------------------------------------------
-   The scheduler proper.
-   ------------------------------------------------------------------ */
-
-/* Run user-space threads until one of the following happens:
-   * Deadlock occurs: no thread is runnable and none is in a bounded
-     wait.  Returns VgSrc_Deadlock.
-   * One thread asks to shut down Valgrind: the client does
-     __NR_exit, or issues VG_USERREQ__LIBC_FREERES_DONE at the end of
-     the __libc_freeres() run.  Returns VgSrc_ExitSyscall.
-   * The specified number of basic blocks has gone by, that is,
-     VG_(bbs_to_go) has reached zero.  Returns VgSrc_BbsDone after
-     printing the translation of the last block.
-*/
-VgSchedReturnCode VG_(scheduler) ( void )
-{
-   ThreadId tid, tid_next;
-   UInt     trc;
-   UInt     dispatch_ctr_SAVED;
-   Int      done_this_time, n_in_bounded_wait;
-   Addr     trans_addr;
-   Bool     sigs_delivered;
-
-   /* For the LRU structures, records when the epoch began. */
-   ULong lru_epoch_started_at = 0;
-
-   /* Start with the root thread.  tid in general indicates the
-      currently runnable/just-finished-running thread. */
-   VG_(last_run_tid) = tid = 1;
-
-   /* This is the top level scheduler loop.  It falls into three
-      phases, preceded by a sanity-checking step (Phase 0). */
-   while (True) {
-
-      /* ======================= Phase 0 of 3 =======================
-         Be paranoid.  Always a good idea.  Note that the stage1
-         label sits here, so every "goto stage1" below re-runs these
-         sanity checks before entering Phase 1. */
-     stage1:
-      scheduler_sanity();
-      VG_(do_sanity_checks)( False );
-
-      /* ======================= Phase 1 of 3 =======================
-         Handle I/O completions and signals.  This may change the
-         status of various threads.  Then select a new thread to run,
-         or declare deadlock, or sleep if there are no runnable
-         threads but some are in a bounded wait.  */
-
-      /* Age the LRU structures if an epoch has been completed. */
-      if (VG_(bbs_done) - lru_epoch_started_at >= VG_BBS_PER_EPOCH) {
-         lru_epoch_started_at = VG_(bbs_done);
-         increment_epoch();
-      }
-
-      /* Was a debug-stop requested? */
-      if (VG_(bbs_to_go) == 0) 
-         goto debug_stop;
-
-      /* Do the following loop until a runnable thread is found, or
-         deadlock is detected. */
-      while (True) {
-
-         /* For stats purposes only. */
-         VG_(num_scheduling_events_MAJOR) ++;
-
-         /* See if any I/O operations which we were waiting for have
-            completed, and, if so, make runnable the relevant waiting
-            threads.  Also time out any expired timed waits on
-            condition variables. */
-         poll_for_ready_fds();
-         complete_blocked_syscalls();
-         check_for_pthread_cond_timedwait();
-
-         /* See if there are any signals which need to be delivered.  If
-            so, choose thread(s) to deliver them to, and build signal
-            delivery frames on those thread(s) stacks. */
-
-	 /* Be careful about delivering signals to a thread waiting
-            for a mutex.  In particular, when the handler is running,
-            that thread is temporarily apparently-not-waiting for the
-            mutex, so if it is unlocked by another thread whilst the
-            handler is running, this thread is not informed.  When the
-            handler returns, the thread resumes waiting on the mutex,
-            even if, as a result, it has missed the unlocking of it.
-            Potential deadlock.  This sounds all very strange, but the
-            POSIX standard appears to require this behaviour.  */
-         sigs_delivered = VG_(deliver_signals)();
-	 if (sigs_delivered)
-            VG_(do_sanity_checks)( False );
-
-         /* Try and find a thread (tid) to run.  The search is
-            round-robin: it starts at the slot after tid and wraps
-            from VG_N_THREADS-1 back to 1.  Along the way, count the
-            threads in a bounded wait, that is, ones which may become
-            runnable just through the passage of time. */
-         tid_next = tid;
-         n_in_bounded_wait = 0;
-         while (True) {
-            tid_next++;
-            if (tid_next >= VG_N_THREADS) tid_next = 1;
-            if (VG_(threads)[tid_next].status == VgTs_WaitFD
-                || VG_(threads)[tid_next].status == VgTs_Sleeping
-                || VG_(threads)[tid_next].status == VgTs_WaitSIG
-                || (VG_(threads)[tid_next].status == VgTs_WaitCV 
-                    && VG_(threads)[tid_next].awaken_at != 0xFFFFFFFF))
-               n_in_bounded_wait ++;
-            if (VG_(threads)[tid_next].status == VgTs_Runnable) 
-               break; /* We can run this one. */
-            if (tid_next == tid) 
-               break; /* been all the way round */
-         }
-         tid = tid_next;
-       
-         if (VG_(threads)[tid].status == VgTs_Runnable) {
-            /* Found a suitable candidate.  Fall out of this loop, so
-               we can advance to Phase 2 of the scheduler: actually
-               running the thread. */
-            break;
-	 }
-
-         /* We didn't find a runnable thread.  Now what? */
-         if (n_in_bounded_wait == 0) {
-            /* No runnable threads and no prospect of any appearing
-               even if we wait for an arbitrary length of time.  In
-               short, we have a deadlock. */
-	    VG_(pp_sched_status)();
-            return VgSrc_Deadlock;
-         }
-
-         /* At least one thread is in a bounded wait (fd-wait,
-            sleeping, signal-wait or a timed condition-variable
-            wait).  Delay for a while, and go round again, in the
-            hope that eventually a thread becomes runnable. */
-         nanosleep_for_a_while();
-	 /* pp_sched_status(); */
-	 /* VG_(printf)("."); */
-      }
-
-
-      /* ======================= Phase 2 of 3 =======================
-         Wahey!  We've finally decided that thread tid is runnable, so
-         we now do that.  Run it for as much of a quantum as possible.
-         Trivial requests are handled and the thread continues.  The
-         aim is not to do too many of Phase 1 since it is expensive.  */
-
-      if (0)
-         VG_(printf)("SCHED: tid %d\n", tid);
-
-      /* Figure out how many bbs to ask vg_run_innerloop to do.  Note
-         that it decrements the counter before testing it for zero, so
-         that if VG_(dispatch_ctr) is set to N you get at most N-1
-         iterations.  Also this means that VG_(dispatch_ctr) must
-         exceed zero before entering the innerloop.  Also also, the
-         decrement is done before the bb is actually run, so you
-         always get at least one decrement even if nothing happens.
-      */
-      if (VG_(bbs_to_go) >= VG_SCHEDULING_QUANTUM)
-         VG_(dispatch_ctr) = VG_SCHEDULING_QUANTUM + 1;
-      else
-         VG_(dispatch_ctr) = (UInt)VG_(bbs_to_go) + 1;
-
-      /* ... and remember what we asked for. */
-      dispatch_ctr_SAVED = VG_(dispatch_ctr);
-
-      /* paranoia ... */
-      vg_assert(VG_(threads)[tid].tid == tid);
-
-      /* Actually run thread tid. */
-      while (True) {
-
-         VG_(last_run_tid) = tid;
-
-         /* For stats purposes only. */
-         VG_(num_scheduling_events_MINOR) ++;
-
-         if (0)
-            VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs", 
-                                      tid, VG_(dispatch_ctr) - 1 );
-#        if 0
-         if (VG_(bbs_done) > 31700000 + 0) {
-            dispatch_ctr_SAVED = VG_(dispatch_ctr) = 2;
-            VG_(translate)(&VG_(threads)[tid], VG_(threads)[tid].m_eip,
-                           NULL,NULL,NULL);
-         }
-         vg_assert(VG_(threads)[tid].m_eip != 0);
-#        endif
-
-         trc = run_thread_for_a_while ( tid );
-
-#        if 0
-         if (0 == VG_(threads)[tid].m_eip) {
-            VG_(printf)("tid = %d,  dc = %llu\n", tid, VG_(bbs_done));
-            vg_assert(0 != VG_(threads)[tid].m_eip);
-         }
-#        endif
-
-         /* Deal quickly with trivial scheduling events, and resume the
-            thread. */
-
-         if (trc == VG_TRC_INNER_FASTMISS) {
-            vg_assert(VG_(dispatch_ctr) > 0);
-
-            /* Trivial event.  Miss in the fast-cache.  Do a full
-               lookup for it. */
-            trans_addr 
-               = VG_(search_transtab) ( VG_(threads)[tid].m_eip );
-            if (trans_addr == (Addr)0) {
-               /* Not found; we need to request a translation. */
-               create_translation_for( tid, VG_(threads)[tid].m_eip ); 
-               trans_addr = VG_(search_transtab) ( VG_(threads)[tid].m_eip ); 
-               if (trans_addr == (Addr)0)
-                  VG_(panic)("VG_TRC_INNER_FASTMISS: missing tt_fast entry");
-            }
-            continue; /* with this thread */
-         }
-
-         if (trc == VG_TRC_EBP_JMP_CLIENTREQ) {
-            /* The request number is the first word of the block that
-               the thread's %EAX points at. */
-            UInt reqno = *(UInt*)(VG_(threads)[tid].m_eax);
-            /* VG_(printf)("request 0x%x\n", reqno); */
-
-            /* Are we really absolutely totally quitting? */
-            if (reqno == VG_USERREQ__LIBC_FREERES_DONE) {
-               if (0 || VG_(clo_trace_syscalls) || VG_(clo_trace_sched)) {
-                  VG_(message)(Vg_DebugMsg, 
-                     "__libc_freeres() done; really quitting!");
-               }
-               return VgSrc_ExitSyscall;
-            }
-
-            do_client_request(tid);
-            /* Following the request, we try and continue with the
-               same thread if still runnable.  If not, or if the
-               request was a PTHREAD_YIELD, go back to stage1 to
-               select a new thread to run. */
-            if (VG_(threads)[tid].status == VgTs_Runnable
-                && reqno != VG_USERREQ__PTHREAD_YIELD)
-               continue; /* with this thread */
-            else
-               goto stage1;
-	 }
-
-         if (trc == VG_TRC_EBP_JMP_SYSCALL) {
-            /* Do a syscall for the vthread tid.  This could cause it
-               to become non-runnable.  One special case: spot the
-               client doing calls to exit() and take this as the cue
-               to exit. */
-#           if 0
-            { UInt* esp; Int i;
-              esp=(UInt*)VG_(threads)[tid].m_esp;
-              VG_(printf)("\nBEFORE\n");
-              for (i = 10; i >= -10; i--)
-                 VG_(printf)("%2d  %p  =  0x%x\n", i, &esp[i], esp[i]);
-            }
-#           endif
-
-            /* Deal with calling __libc_freeres() at exit.  When the
-               client does __NR_exit, it's exiting for good.  So we
-               then run VG_(__libc_freeres_wrapper).  That quits by
-               doing VG_USERREQ__LIBC_FREERES_DONE, and at that point
-               we really exit.  To be safe we nuke all other threads
-               currently running. 
-
-               If not valgrinding (cachegrinding, etc) don't do this.
-               __libc_freeres does some invalid frees which crash
-               the unprotected malloc/free system. */
-            if (VG_(threads)[tid].m_eax == __NR_exit 
-                && !VG_(clo_instrument)) {
-               if (VG_(clo_trace_syscalls) || VG_(clo_trace_sched)) {
-                  VG_(message)(Vg_DebugMsg, 
-                     "Caught __NR_exit; quitting");
-               }
-               return VgSrc_ExitSyscall;
-            }
-
-            if (VG_(threads)[tid].m_eax == __NR_exit) {
-               vg_assert(VG_(clo_instrument));
-               if (0 || VG_(clo_trace_syscalls) || VG_(clo_trace_sched)) {
-                  VG_(message)(Vg_DebugMsg, 
-                     "Caught __NR_exit; running __libc_freeres()");
-               }
-               VG_(nuke_all_threads_except) ( tid );
-               VG_(threads)[tid].m_eip = (UInt)(&VG_(__libc_freeres_wrapper));
-	       vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-               goto stage1; /* party on, dudes (but not for much longer :) */
-            }
-
-            /* Trap syscalls to __NR_sched_yield and just have this
-               thread yield instead.  Not essential, just an
-               optimisation. */
-	    if (VG_(threads)[tid].m_eax == __NR_sched_yield) {
-               SET_EAX(tid, 0); /* syscall returns with success */
-               goto stage1; /* find a new thread to run */
-	    }
-
-            sched_do_syscall(tid);
-
-#           if 0
-            { UInt* esp; Int i;
-              esp=(UInt*)VG_(threads)[tid].m_esp;
-              VG_(printf)("AFTER\n");
-              for (i = 10; i >= -10; i--)
-                 VG_(printf)("%2d  %p  =  0x%x\n", i, &esp[i], esp[i]);
-            }
-#           endif
-
-            if (VG_(threads)[tid].status == VgTs_Runnable) {
-               /* Better do a signal check, since if in a tight loop
-                  with a slow syscall it may be a very long time
-                  before we get back to the main signal check in
-                  Phase 1. */
-               sigs_delivered = VG_(deliver_signals)();
-               if (sigs_delivered)
-                  VG_(do_sanity_checks)( False );
-               continue; /* with this thread */
-            } else {
-               goto stage1;
-            }
-	 }
-
-	 /* It's an event we can't quickly deal with.  Give up running
-            this thread and handle things the expensive way. */
-	 break;
-      }
-
-      /* ======================= Phase 3 of 3 =======================
-         Handle non-trivial thread requests, mostly pthread stuff. */
-
-      /* Ok, we've fallen out of the dispatcher for a
-         non-completely-trivial reason. First, update basic-block
-         counters.  The number of bbs done is what we asked for, minus
-         what is left in VG_(dispatch_ctr), minus the one extra that
-         was added when the counter was set up in Phase 2.
-         NOTE(review): the "goto stage1" exits from Phase 2 bypass
-         this update, and Phase 2 then resets VG_(dispatch_ctr), so
-         bbs run in such a slice do not appear to be added to
-         VG_(bbs_done) -- confirm that this is intended. */
-
-      done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 1;
-      vg_assert(done_this_time >= 0);
-      VG_(bbs_to_go)   -= (ULong)done_this_time;
-      VG_(bbs_done)    += (ULong)done_this_time;
-
-      if (0 && trc != VG_TRC_INNER_FASTMISS)
-         VG_(message)(Vg_DebugMsg, "thread %d:   completed %d bbs, trc %d", 
-                                   tid, done_this_time, (Int)trc );
-
-      if (0 && trc != VG_TRC_INNER_FASTMISS)
-         VG_(message)(Vg_DebugMsg, "thread %d:  %ld bbs, event %s", 
-                                   tid, VG_(bbs_done),
-                                   name_of_sched_event(trc) );
-
-      /* Examine the thread's return code to figure out why it
-         stopped. */
-
-      switch (trc) {
-
-         case VG_TRC_INNER_COUNTERZERO:
-            /* Timeslice is out.  Let a new thread be scheduled,
-               simply by doing nothing, causing us to arrive back at
-               Phase 1.  If the overall bb budget is also used up,
-               stop altogether instead. */
-            if (VG_(bbs_to_go) == 0) {
-               goto debug_stop;
-            }
-            vg_assert(VG_(dispatch_ctr) == 0);
-            break;
-
-         case VG_TRC_UNRESUMABLE_SIGNAL:
-            /* It got a SIGSEGV/SIGBUS, which we need to deliver right
-               away.  Again, do nothing, so we wind up back at Phase
-               1, whereupon the signal will be "delivered". */
-	    break;
-
-         default: 
-            VG_(printf)("\ntrc = %d\n", trc);
-            VG_(panic)("VG_(scheduler), phase 3: "
-                       "unexpected thread return code");
-            /* NOTREACHED */
-            break;
-
-      } /* switch (trc) */
-
-      /* That completes Phase 3 of 3.  Return now to the top of the
-	 main scheduler loop, to Phase 0 and then Phase 1 of 3. */
-
-   } /* top-level scheduler loop */
-
-
-   /* NOTREACHED */
-   VG_(panic)("scheduler: post-main-loop ?!");
-   /* NOTREACHED */
-
-  debug_stop:
-   /* If we exited because of a debug stop, print the translation 
-      of the last block executed -- by translating it again, and 
-      throwing away the result. */
-   VG_(printf)(
-      "======vvvvvvvv====== LAST TRANSLATION ======vvvvvvvv======\n");
-   VG_(translate)( &VG_(threads)[tid], 
-                   VG_(threads)[tid].m_eip, NULL, NULL, NULL );
-   VG_(printf)("\n");
-   VG_(printf)(
-      "======^^^^^^^^====== LAST TRANSLATION ======^^^^^^^^======\n");
-
-   return VgSrc_BbsDone;
-}
-
-
-/* ---------------------------------------------------------------------
-   The pthread implementation.
-   ------------------------------------------------------------------ */
-
-#include <pthread.h>
-#include <errno.h>
-
-#define VG_PTHREAD_STACK_MIN \
-   (VG_PTHREAD_STACK_SIZE - VG_AR_CLIENT_STACKBASE_REDZONE_SZB)
-
-/*  /usr/include/bits/pthreadtypes.h:
-    typedef unsigned long int pthread_t;
-*/
-
-
-/* -----------------------------------------------------------
-   Thread CREATION, JOINAGE and CANCELLATION: HELPER FNS
-   -------------------------------------------------------- */
-
-/* Action a cancellation on tid.  The thread is arranged to resume at
-   its pending cancellation handler (cancel_pend, which is really
-   thread_exit_wrapper() in vg_libpthread.c) with PTHREAD_CANCELED
-   pushed on its stack as the argument.  The thread is made runnable
-   and its cancellation state is switched off. */
-static
-void make_thread_jump_to_cancelhdlr ( ThreadId tid )
-{
-   Char msg_buf[100];
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(threads)[tid].cancel_pend != NULL);
-
-   /* Push PTHREAD_CANCELED: move %esp down one word, store there. */
-   VG_(threads)[tid].m_esp -= 4;
-   * (UInt*)(VG_(threads)[tid].m_esp) = (UInt)PTHREAD_CANCELED;
-
-   /* Continue execution in the handler. */
-   VG_(threads)[tid].m_eip  = (UInt)VG_(threads)[tid].cancel_pend;
-   VG_(threads)[tid].status = VgTs_Runnable;
-
-   /* Whilst this cancellation is being handled, the thread must not
-      be cancelled a second time. */
-   VG_(threads)[tid].cancel_st = False;
-
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, 
-         "jump to cancellation handler (hdlr = %p)", 
-         VG_(threads)[tid].cancel_pend);
-      print_sched_event(tid, msg_buf);
-   }
-}
-
-
-
-/* Final tidy-up for a thread slot which has just been marked
-   VgTs_Empty: make its stack inaccessible (when instrumenting),
-   refresh the signal state, and release any waiting_fd table entries
-   which belonged to it. */
-static
-void cleanup_after_thread_exited ( ThreadId tid )
-{
-   vki_ksigset_t irrelevant_sigmask;
-   Int           slot;
-   vg_assert(VG_(is_valid_or_empty_tid)(tid));
-   vg_assert(VG_(threads)[tid].status == VgTs_Empty);
-
-   /* The stack of thread 1 is left alone; any other thread's stack
-      becomes no-access. */
-   if (tid != 1 && VG_(clo_instrument))
-      VGM_(make_noaccess)( VG_(threads)[tid].stack_base,
-                           VG_(threads)[tid].stack_size );
-
-   /* Forget about any pending signals directed specifically at this
-      thread, and get rid of signal handlers specifically arranged for
-      this thread.
-      NOTE(review): the mask saved in irrelevant_sigmask is never used
-      to restore anything in this function -- confirm that leaving the
-      host signals as set by VG_(block_all_host_signals) is intended. */
-   VG_(block_all_host_signals)( &irrelevant_sigmask );
-   VG_(handle_SCSS_change)( False /* lazy update */ );
-
-   /* Release every waiting_fd slot owned by this thread. */
-   for (slot = 0; slot < VG_N_WAITING_FDS; slot++) {
-      if (vg_waiting_fds[slot].tid != tid)
-         continue;
-      vg_waiting_fds[slot].fd = -1; /* not in use */
-   }
-}
-
-
-/* Pair up threads waiting to join (VgTs_WaitJoinee) with the threads
-   they are waiting for, once the latter have exited and are waiting
-   to be joined (VgTs_WaitJoiner).  For each such pair the joinee's
-   return value is copied to where the joiner asked for it, the
-   joinee's slot is freed, and the joiner is made runnable with a
-   success result (0 in %EDX). */
-static
-void maybe_rendezvous_joiners_and_joinees ( void )
-{
-   Char     msg_buf[100];
-   void**   retval_dest;
-   ThreadId jnr, jee;
-
-   for (jnr = 1; jnr < VG_N_THREADS; jnr++) {
-
-      /* Is jnr a joiner with a known target? */
-      if (VG_(threads)[jnr].status != VgTs_WaitJoinee)
-         continue;
-      jee = VG_(threads)[jnr].joiner_jee_tid;
-      if (jee == VG_INVALID_THREADID) 
-         continue;
-      vg_assert(VG_(is_valid_tid)(jee));
-
-      /* Has the target got as far as waiting to be joined? */
-      if (VG_(threads)[jee].status != VgTs_WaitJoiner)
-         continue;
-
-      /* Yes: jnr wants jee, and jee is ready to be collected by
-         whoever comes along.  Hand over the return value, if the
-         joiner supplied somewhere to put it. */
-      retval_dest = VG_(threads)[jnr].joiner_thread_return;
-      if (retval_dest != NULL) {
-         if (VG_(clo_instrument)) {
-            /* CHECK the destination is writable.  The store below
-               happens regardless; this only records an error. */
-            if (!VGM_(check_writable)( (Addr)retval_dest, 
-                                       sizeof(void*), NULL))
-               VG_(record_pthread_err)( jnr, 
-                  "pthread_join: thread_return points to invalid location");
-         }
-
-         *retval_dest = VG_(threads)[jee].joinee_retval;
-
-         /* Not really right, since it makes the thread's return value
-            appear to be defined even if it isn't. */
-         if (VG_(clo_instrument))
-            VGM_(make_readable)( (Addr)retval_dest, sizeof(void*) );
-      }
-
-      /* The joinee is finished with. */
-      VG_(threads)[jee].status = VgTs_Empty; /* bye! */
-      cleanup_after_thread_exited ( jee );
-      if (VG_(clo_trace_sched)) {
-         VG_(sprintf)(msg_buf,
-            "rendezvous with joinee %d.  %d resumes, %d exits.",
-            jee, jnr, jee );
-         print_sched_event(jnr, msg_buf);
-      }
-
-      /* The joiner carries on, its join having succeeded. */
-      VG_(threads)[jnr].status = VgTs_Runnable;
-      SET_EDX(jnr, 0);
-   }
-}
-
-
-/* Nuke all threads other than me: each non-empty slot is marked
-   VgTs_Empty and cleaned up.  POSIX requires this to happen in
-   __NR_exec, and in the child after a __NR_fork().  The scheduler
-   also does it when the client calls __NR_exit, before running
-   __libc_freeres(). */
-void VG_(nuke_all_threads_except) ( ThreadId me )
-{
-   ThreadId victim;
-   for (victim = 1; victim < VG_N_THREADS; victim++) {
-      if (victim != me
-          && VG_(threads)[victim].status != VgTs_Empty) {
-         if (0)
-            VG_(printf)(
-               "VG_(nuke_all_threads_except): nuking tid %d\n", victim);
-         VG_(threads)[victim].status = VgTs_Empty;
-         cleanup_after_thread_exited( victim );
-      }
-   }
-}
-
-
-/* -----------------------------------------------------------
-   Thread CREATION, JOINAGE and CANCELLATION: REQUESTS
-   -------------------------------------------------------- */
-
-/* Push the cleanup entry *cu onto tid's cleanup stack.  Panics if
-   all VG_N_CLEANUPSTACK slots are already in use; otherwise reports
-   success by setting tid's %EDX to 0. */
-static
-void do__cleanup_push ( ThreadId tid, CleanupEntry* cu )
-{
-   Char msg_buf[100];
-   Int  sp;
-   vg_assert(VG_(is_valid_tid)(tid));
-
-   /* sp is the number of slots in use, hence also the index of the
-      first free slot. */
-   sp = VG_(threads)[tid].custack_used;
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, 
-         "cleanup_push (fn %p, arg %p) -> slot %d", 
-         cu->fn, cu->arg, sp);
-      print_sched_event(tid, msg_buf);
-   }
-   vg_assert(sp >= 0 && sp <= VG_N_CLEANUPSTACK);
-   if (sp == VG_N_CLEANUPSTACK)
-      VG_(panic)("do__cleanup_push: VG_N_CLEANUPSTACK is too small."
-                 "  Increase and recompile.");
-
-   /* Fill the free slot and bump the fill level. */
-   VG_(threads)[tid].custack[sp]  = *cu;
-   VG_(threads)[tid].custack_used = sp + 1;
-   SET_EDX(tid, 0);
-}
-
-
-/* Pop the most recently pushed entry off tid's cleanup stack into
-   *cu and set tid's %EDX to 0.  If the stack is empty, *cu is left
-   untouched and %EDX is set to -1 instead. */
-static
-void do__cleanup_pop ( ThreadId tid, CleanupEntry* cu )
-{
-   Char msg_buf[100];
-   Int  sp;
-   vg_assert(VG_(is_valid_tid)(tid));
-   sp = VG_(threads)[tid].custack_used;
-   if (VG_(clo_trace_sched)) {
-      /* This is traced before the emptiness check, so for an empty
-         stack the slot shown is -1. */
-      VG_(sprintf)(msg_buf, 
-         "cleanup_pop from slot %d", sp-1);
-      print_sched_event(tid, msg_buf);
-   }
-   vg_assert(sp >= 0 && sp <= VG_N_CLEANUPSTACK);
-   if (sp != 0) {
-      *cu = VG_(threads)[tid].custack[sp-1];
-      /* The caller's CleanupEntry has just been written by us. */
-      if (VG_(clo_instrument))
-         VGM_(make_readable)( (Addr)cu, sizeof(CleanupEntry) );
-      VG_(threads)[tid].custack_used = sp-1;
-      SET_EDX(tid, 0);
-   } else {
-      SET_EDX(tid, -1);
-   }
-}
-
-
-/* Handle a yield request from tid: trace it, if asked to, and give
-   the request a result of 0.  No rescheduling is done here; the
-   scheduler itself goes off to pick a new thread after a
-   VG_USERREQ__PTHREAD_YIELD request. */
-static
-void do_pthread_yield ( ThreadId tid )
-{
-   Char msg_buf[100];
-
-   vg_assert(VG_(is_valid_tid)(tid));
-
-   if (VG_(clo_trace_sched))
-   {
-      VG_(sprintf)(msg_buf, "yield");
-      print_sched_event(tid, msg_buf);
-   }
-
-   SET_EDX(tid, 0);
-}
-
-
-/* Cancellation point for tid.  If a cancellation is pending on the
-   thread and the thread currently accepts cancellations, divert it
-   to its cancellation handler.  Otherwise the thread just carries
-   on, with 0 in %EDX. */
-static
-void do__testcancel ( ThreadId tid )
-{
-   Char msg_buf[100];
-   Bool act_now;
-   vg_assert(VG_(is_valid_tid)(tid));
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, "testcancel");
-      print_sched_event(tid, msg_buf);
-   }
-
-   act_now = /* is there a cancellation pending on this thread? */
-             VG_(threads)[tid].cancel_pend != NULL
-             && /* is this thread accepting cancellations? */
-                VG_(threads)[tid].cancel_st;
-
-   if (!act_now) {
-      /* No, we keep going. */
-      SET_EDX(tid, 0);
-      return;
-   }
-
-   /* Ok, let's do the cancellation. */
-   make_thread_jump_to_cancelhdlr ( tid );
-}
-
-
-/* Set tid's cancellation state to PTHREAD_CANCEL_ENABLE or
-   PTHREAD_CANCEL_DISABLE, and hand the previous state back in %EDX,
-   expressed as one of those same two constants.  Any other value of
-   state is a panic. */
-static
-void do__set_cancelstate ( ThreadId tid, Int state )
-{
-   Bool old_st;
-   Char msg_buf[100];
-   vg_assert(VG_(is_valid_tid)(tid));
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, "set_cancelstate to %d (%s)", state, 
-         state==PTHREAD_CANCEL_ENABLE 
-            ? "ENABLE" 
-            : (state==PTHREAD_CANCEL_DISABLE ? "DISABLE" : "???"));
-      print_sched_event(tid, msg_buf);
-   }
-
-   /* Note the old state before overwriting it. */
-   old_st = VG_(threads)[tid].cancel_st;
-
-   switch (state) {
-      case PTHREAD_CANCEL_ENABLE:
-         VG_(threads)[tid].cancel_st = True;
-         break;
-      case PTHREAD_CANCEL_DISABLE:
-         VG_(threads)[tid].cancel_st = False;
-         break;
-      default:
-         VG_(panic)("do__set_cancelstate");
-   }
-
-   SET_EDX(tid, old_st ? PTHREAD_CANCEL_ENABLE 
-                       : PTHREAD_CANCEL_DISABLE);
-}
-
-
-/* Set tid's cancellation type to PTHREAD_CANCEL_ASYNCHRONOUS or
-   PTHREAD_CANCEL_DEFERRED, and hand the previous type back in %EDX,
-   expressed as one of those same two constants.  The type is kept in
-   cancel_ty, where True means deferred.  Any other value of type is
-   a panic. */
-static
-void do__set_canceltype ( ThreadId tid, Int type )
-{
-   Bool old_ty;
-   Char msg_buf[100];
-   vg_assert(VG_(is_valid_tid)(tid));
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, "set_canceltype to %d (%s)", type, 
-         type==PTHREAD_CANCEL_ASYNCHRONOUS 
-            ? "ASYNCHRONOUS" 
-            : (type==PTHREAD_CANCEL_DEFERRED ? "DEFERRED" : "???"));
-      print_sched_event(tid, msg_buf);
-   }
-
-   /* Note the old type before overwriting it. */
-   old_ty = VG_(threads)[tid].cancel_ty;
-
-   switch (type) {
-      case PTHREAD_CANCEL_ASYNCHRONOUS:
-         VG_(threads)[tid].cancel_ty = False;
-         break;
-      case PTHREAD_CANCEL_DEFERRED:
-         VG_(threads)[tid].cancel_ty = True;
-         break;
-      default:
-         VG_(panic)("do__set_canceltype");
-   }
-
-   SET_EDX(tid, old_ty ? PTHREAD_CANCEL_DEFERRED 
-                       : PTHREAD_CANCEL_ASYNCHRONOUS);
-}
-
-
-/* Set or get the detach state for thread det, on behalf of tid.
-   what == 2: fetch; tid's %EDX gets 1 if det is detached, else 0.
-   what == 1: mark det detached -- unless some thread is already in
-              a join-wait for det, in which case det is left alone.
-   what == 0: mark det not detached.
-   Both set operations put 0 in tid's %EDX.  If det is not a valid
-   thread, %EDX is set to -1 and nothing else happens.  Any other
-   value of what is a panic. */
-static
-void do__set_or_get_detach ( ThreadId tid, 
-                             Int what, ThreadId det )
-{
-   ThreadId i;
-   Char     msg_buf[100];
-   /* VG_(printf)("do__set_or_get_detach tid %d what %d det %d\n", 
-      tid, what, det); */
-   vg_assert(VG_(is_valid_tid)(tid));
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, "set_or_get_detach %d (%s) for tid %d", what,
-         what==0 ? "not-detached" : (
-         what==1 ? "detached" : (
-         what==2 ? "fetch old value" : "???")), 
-         det );
-      print_sched_event(tid, msg_buf);
-   }
-
-   if (!VG_(is_valid_tid)(det)) {
-      SET_EDX(tid, -1);
-      return;
-   }
-
-   switch (what) {
-      case 2: /* get */
-         SET_EDX(tid, VG_(threads)[det].detached ? 1 : 0);
-         return;
-      case 1: /* set detached.  If someone is in a join-wait for det,
-                 do not detach. */
-         for (i = 1; i < VG_N_THREADS; i++) {
-            if (VG_(threads)[i].status == VgTs_WaitJoinee
-                && VG_(threads)[i].joiner_jee_tid == det) {
-               SET_EDX(tid, 0);
-               if (VG_(clo_trace_sched)) {
-                  /* Exactly two conversions for the two arguments:
-                     the would-be detachee, and the thread which is
-                     waiting to join it. */
-                  VG_(sprintf)(msg_buf,
-                     "tid %d not detached because %d in join-wait for it",
-                     det, i);
-                  print_sched_event(tid, msg_buf);
-               }
-               return;
-            }
-         }
-         VG_(threads)[det].detached = True;
-         SET_EDX(tid, 0); 
-         return;
-      case 0: /* set not detached */
-         VG_(threads)[det].detached = False;
-         SET_EDX(tid, 0);
-         return;
-      default:
-         VG_(panic)("do__set_or_get_detach");
-   }
-}
-
-
-/* Thread tid asks for thread cee to be cancelled, with
-   cancelpend_hdlr as the function cee should end up running.  If cee
-   is not a valid thread, an error is recorded and tid gets
-   -VKI_ESRCH in %EDX.  Otherwise the handler is recorded as pending
-   on cee, tid gets 0, and, if cee currently accepts cancellations,
-   cee is diverted to the handler straight away. */
-static
-void do__set_cancelpend ( ThreadId tid, 
-                          ThreadId cee,
-			  void (*cancelpend_hdlr)(void*) )
-{
-   Char msg_buf[100];
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (!VG_(is_valid_tid)(cee)) {
-      if (VG_(clo_trace_sched)) {
-         VG_(sprintf)(msg_buf, 
-            "set_cancelpend for invalid tid %d", cee);
-         print_sched_event(tid, msg_buf);
-      }
-      VG_(record_pthread_err)( tid, 
-         "pthread_cancel: target thread does not exist, or invalid");
-      SET_EDX(tid, -VKI_ESRCH);
-      return;
-   }
-
-   VG_(threads)[cee].cancel_pend = cancelpend_hdlr;
-
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, 
-         "set_cancelpend (hdlr = %p, set by tid %d)", 
-         cancelpend_hdlr, tid);
-      print_sched_event(cee, msg_buf);
-   }
-
-   /* Thread doing the cancelling returns with success. */
-   SET_EDX(tid, 0);
-
-   /* Perhaps we can nuke the cancellee right now?  This is the same
-      test as do__testcancel makes, but done inline: do__testcancel's
-      "keep going" branch writes a result into the %EDX of the thread
-      it is given, which is only right for a thread which itself made
-      the request.  Here cee has made no request, so its %EDX must be
-      left alone when the cancellation cannot be actioned yet. */
-   if (VG_(threads)[cee].cancel_pend != NULL
-       && VG_(threads)[cee].cancel_st)
-      make_thread_jump_to_cancelhdlr ( cee );
-}
-
-
-/* Thread tid wants to join with thread jee, receiving jee's return
-   value at *thread_return if that is non-NULL.  A bad request
-   (joining to self, a nonexistent target, or a target which some
-   other thread is already waiting to join) records an error and
-   returns an error code to tid in %EDX.  Otherwise tid is put into
-   VgTs_WaitJoinee; if jee has already exited, the rendezvous happens
-   immediately. */
-static
-void do_pthread_join ( ThreadId tid, 
-                       ThreadId jee, void** thread_return )
-{
-   Char     msg_buf[100];
-   ThreadId other;
-   /* jee, the joinee, is the thread specified as an arg in thread
-      tid's call to pthread_join.  So tid is the join-er. */
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-
-   /* A thread cannot wait for itself. */
-   if (jee == tid) {
-      VG_(record_pthread_err)( tid, 
-         "pthread_join: attempt to join to self");
-      SET_EDX(tid, EDEADLK); /* libc constant, not a kernel one */
-      VG_(threads)[tid].status = VgTs_Runnable;
-      return;
-   }
-
-   /* Flush any completed pairs, so as to make sure what we're looking
-      at is up-to-date. */
-   maybe_rendezvous_joiners_and_joinees();
-
-   /* Is this a sane request?  The target must be an in-range slot
-      which is currently in use. */
-   if (jee < 0 
-       || jee >= VG_N_THREADS
-       || VG_(threads)[jee].status == VgTs_Empty) {
-      VG_(record_pthread_err)( tid, 
-         "pthread_join: target thread does not exist, or invalid");
-      SET_EDX(tid, EINVAL);
-      VG_(threads)[tid].status = VgTs_Runnable;
-      return;
-   }
-
-   /* Is anyone else already in a join-wait for jee?  Only one joiner
-      per joinee is allowed. */
-   for (other = 1; other < VG_N_THREADS; other++) {
-      if (other != tid
-          && VG_(threads)[other].status == VgTs_WaitJoinee
-          && VG_(threads)[other].joiner_jee_tid == jee) {
-         VG_(record_pthread_err)( tid, 
-            "pthread_join: another thread already "
-            "in join-wait for target thread");
-         SET_EDX(tid, EINVAL);
-         VG_(threads)[tid].status = VgTs_Runnable;
-         return;
-      }
-   }
-
-   /* Mark this thread as waiting for the joinee. */
-   VG_(threads)[tid].status = VgTs_WaitJoinee;
-   VG_(threads)[tid].joiner_thread_return = thread_return;
-   VG_(threads)[tid].joiner_jee_tid = jee;
-
-   /* Look for matching joiners and joinees and do the right thing. */
-   maybe_rendezvous_joiners_and_joinees();
-
-   /* Return value is irrelevant since this thread becomes
-      non-runnable.  maybe_rendezvous_joiners_and_joinees() will cause
-      it to return the right value when it resumes. */
-
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, 
-         "wait for joinee %d (may already be ready)", jee);
-      print_sched_event(tid, msg_buf);
-   }
-}
-
-
-/* ( void* ): the calling thread has finished and is not detached.
-   It parks in VgTs_WaitJoiner, with retval recorded as the value to
-   hand to whichever thread joins it; if a joiner is already waiting,
-   the rendezvous happens straight away.  This is one of two ways in
-   which a thread can finally exit -- the other is do__quit. */
-static
-void do__wait_joiner ( ThreadId tid, void* retval )
-{
-   Char msg_buf[100];
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (VG_(clo_trace_sched))
-   {
-      VG_(sprintf)(msg_buf, 
-         "do__wait_joiner(retval = %p) (non-detached thread exit)", retval);
-      print_sched_event(tid, msg_buf);
-   }
-
-   /* Park the thread, then see if anybody is already waiting to
-      join with it. */
-   VG_(threads)[tid].status = VgTs_WaitJoiner;
-   VG_(threads)[tid].joinee_retval = retval;
-   maybe_rendezvous_joiners_and_joinees();
-}
-
-
-/* ( no-args ): the calling thread disappears from the system
-   forever.  Its slot is marked VgTs_Empty and its resources are
-   reclaimed.  The trace message describes this as the exit of a
-   detached thread. */
-static
-void do__quit ( ThreadId tid )
-{
-   Char msg_buf[100];
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(VG_(threads)[tid].status == VgTs_Runnable);
-
-   VG_(threads)[tid].status = VgTs_Empty; /* bye! */
-   cleanup_after_thread_exited ( tid );
-
-   if (VG_(clo_trace_sched))
-   {
-      VG_(sprintf)(msg_buf, "do__quit (detached thread exit)");
-      print_sched_event(tid, msg_buf);
-   }
-
-   /* No result is set for the request: the thread will never be
-      scheduled again to see one. */
-}
-
-
-/* Should never be entered.  If it is, will be on the simulated
-   CPU.  The address of this function is what
-   do__apply_in_new_thread pushes on a new thread's stack as its
-   (bogus) return address, so arriving here means that the thread's
-   start function returned, which it must never do.  All that can be
-   done is to panic. */
-static 
-void do__apply_in_new_thread_bogusRA ( void )
-{
-   VG_(panic)("do__apply_in_new_thread_bogusRA");
-}
-
-/* (Fn, Arg): Create a new thread and run Fn applied to Arg in it.  Fn
-   MUST NOT return -- ever.  Eventually it will do either __QUIT or
-   __WAIT_JOINER.  Return the child tid to the parent.
-
-   parent_tid: the thread making the request; the child's tid is
-               handed back to it in %edx via SET_EDX.
-   fn:         address at which the child starts executing (it
-               becomes the child's m_eip).
-   arg:        pushed onto the child's stack as fn's argument.
-
-   The child starts with a copy of the parent's CPU state and signal
-   mask, a stack of at least VG_PTHREAD_STACK_MIN bytes, and
-   do__apply_in_new_thread_bogusRA as its return address, so that an
-   fn which does return ends up in a function that panics. */
-static
-void do__apply_in_new_thread ( ThreadId parent_tid,
-                               void* (*fn)(void *), 
-                               void* arg )
-{
-   Addr     new_stack;
-   UInt     new_stk_szb;
-   ThreadId tid;
-   Char     msg_buf[100];
-
-   /* Paranoia ... */
-   vg_assert(sizeof(pthread_t) == sizeof(UInt));
-
-   vg_assert(VG_(threads)[parent_tid].status != VgTs_Empty);
-
-   tid = vg_alloc_ThreadState();
-
-   /* If we've created the main thread's tid, we're in deep trouble :) */
-   vg_assert(tid != 1);
-   vg_assert(VG_(is_valid_or_empty_tid)(tid));
-
-   /* Copy the parent's CPU state into the child's, in a roundabout
-      way (via baseBlock). */
-   VG_(load_thread_state)(parent_tid);
-   VG_(save_thread_state)(tid);
-
-   /* Consider allocating the child a stack, if the one it already has
-      is inadequate. */
-   new_stk_szb = VG_PTHREAD_STACK_MIN;
-
-   if (new_stk_szb > VG_(threads)[tid].stack_size) {
-      /* Again, for good measure :) We definitely don't want to be
-         allocating a stack for the main thread. */
-      vg_assert(tid != 1);
-      /* for now, we don't handle the case of anything other than
-         assigning it for the first time. */
-      vg_assert(VG_(threads)[tid].stack_size == 0);
-      vg_assert(VG_(threads)[tid].stack_base == (Addr)NULL);
-      new_stack = (Addr)VG_(get_memory_from_mmap)( new_stk_szb, 
-                                                   "new thread stack" );
-      VG_(threads)[tid].stack_base = new_stack;
-      VG_(threads)[tid].stack_size = new_stk_szb;
-      VG_(threads)[tid].stack_highest_word
-         = new_stack + new_stk_szb 
-                     - VG_AR_CLIENT_STACKBASE_REDZONE_SZB; /* -4  ??? */;
-   }
-
-   /* Initial %esp: top of the stack area, less the redzone.  This is
-      recomputed from stack_base/stack_size so that it is also right
-      when the slot's existing stack is being reused. */
-   VG_(threads)[tid].m_esp 
-      = VG_(threads)[tid].stack_base 
-        + VG_(threads)[tid].stack_size
-        - VG_AR_CLIENT_STACKBASE_REDZONE_SZB;
-
-   /* When instrumenting, mark the redzone above %esp as off-limits. */
-   if (VG_(clo_instrument))
-      VGM_(make_noaccess)( VG_(threads)[tid].m_esp, 
-                           VG_AR_CLIENT_STACKBASE_REDZONE_SZB );
-   
-   /* push arg */
-   VG_(threads)[tid].m_esp -= 4;
-   * (UInt*)(VG_(threads)[tid].m_esp) = (UInt)arg;
-
-   /* push (bogus) return address */
-   VG_(threads)[tid].m_esp -= 4;
-   * (UInt*)(VG_(threads)[tid].m_esp) 
-      = (UInt)&do__apply_in_new_thread_bogusRA;
-
-   /* The two words just pushed (arg + return address) hold defined
-      values, so tell the memory checker. */
-   if (VG_(clo_instrument))
-      VGM_(make_readable)( VG_(threads)[tid].m_esp, 2 * 4 );
-
-   /* this is where we start */
-   VG_(threads)[tid].m_eip = (UInt)fn;
-
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf,
-         "new thread, created by %d", parent_tid );
-      print_sched_event(tid, msg_buf);
-   }
-
-   /* Create new thread with default attrs:
-      deferred cancellation, not detached 
-   */
-   /* NOTE(review): mostly_clear_thread_record is defined elsewhere;
-      presumably it leaves the register and stack fields set above
-      untouched -- confirm before reordering anything here. */
-   mostly_clear_thread_record(tid);
-   VG_(threads)[tid].status = VgTs_Runnable;
-
-   /* We inherit our parent's signal mask. */
-   VG_(threads)[tid].sig_mask = VG_(threads)[parent_tid].sig_mask;
-   VG_(ksigemptyset)(&VG_(threads)[tid].sigs_waited_for);
-
-   /* return child's tid to parent */
-   SET_EDX(parent_tid, tid); /* success */
-}
-
-
-/* -----------------------------------------------------------
-   MUTEXes
-   -------------------------------------------------------- */
-
-/* pthread_mutex_t is a struct with at least 5 words:
-      typedef struct
-      {
-        int __m_reserved;         -- Reserved for future use
-        int __m_count;            -- Depth of recursive locking
-        _pthread_descr __m_owner; -- Owner thread (if recursive or errcheck)
-        int __m_kind;      -- Mutex kind: fast, recursive or errcheck
-        struct _pthread_fastlock __m_lock;  -- Underlying fast lock
-      } pthread_mutex_t;
-
-   #define PTHREAD_MUTEX_INITIALIZER \
-     {0, 0, 0, PTHREAD_MUTEX_TIMED_NP, __LOCK_INITIALIZER}
-   # define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP \
-     {0, 0, 0, PTHREAD_MUTEX_RECURSIVE_NP, __LOCK_INITIALIZER}
-   # define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP \
-     {0, 0, 0, PTHREAD_MUTEX_ERRORCHECK_NP, __LOCK_INITIALIZER}
-   # define PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP \
-     {0, 0, 0, PTHREAD_MUTEX_ADAPTIVE_NP, __LOCK_INITIALIZER}
-
-   How we use it:
-
-   __m_kind  never changes and indicates whether or not it is recursive.
-
-   __m_count indicates the lock count; if 0, the mutex is not owned by 
-             anybody.  
-
-   __m_owner has a ThreadId value stuffed into it.  We carefully arrange 
-             that ThreadId == 0 is invalid (VG_INVALID_THREADID), so that
-             statically initialised mutexes correctly appear 
-             to belong to nobody.
-
-   In summary, a not-in-use mutex is distinguished by having __m_owner
-   == 0 (VG_INVALID_THREADID) and __m_count == 0 too.  If one of those
-   conditions holds, the other should too.
-
-   There is no linked list of threads waiting for this mutex.  Instead
-   a thread in WaitMX state points at the mutex with its associated_mx
-   field.  This makes _unlock() inefficient, but makes it simple to
-   implement the right semantics vis-a-vis signals.
-
-   We don't have to deal with mutex initialisation; the client side
-   deals with that for us.  
-*/
-
-/* Helper fns ... */
-
-/* Hand mutex on to some arbitrary thread that is blocked on it
-   (status VgTs_WaitMX with associated_mx == mutex), making that
-   thread runnable.  If there is no such thread, mark the mutex as
-   not held.  caller is only used to label the trace output. */
-static
-void release_one_thread_waiting_on_mutex ( pthread_mutex_t* mutex, 
-                                           Char* caller )
-{
-   Char trace_buf[100];
-   Int  i = 1;
-
-   /* Scan for the first waiter.  Empty slots can never match, since
-      their status is not VgTs_WaitMX. */
-   while (i < VG_N_THREADS
-          && !(VG_(threads)[i].status == VgTs_WaitMX
-               && VG_(threads)[i].associated_mx == mutex))
-      i++;
-
-   vg_assert(i <= VG_N_THREADS);
-
-   if (i == VG_N_THREADS) {
-      /* Nobody else is waiting on it. */
-      mutex->__m_count = 0;
-      mutex->__m_owner = VG_INVALID_THREADID;
-      return;
-   }
-
-   /* Notionally transfer the hold to thread i, whose
-      pthread_mutex_lock() call now returns with 0 (success).  The
-      .count is already == 1, and m_edx already holds the
-      pth_mx_lock() success value (0). */
-   vg_assert(VG_(threads)[i].associated_mx == mutex);
-   mutex->__m_owner = (_pthread_descr)i;
-   VG_(threads)[i].status        = VgTs_Runnable;
-   VG_(threads)[i].associated_mx = NULL;
-
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(trace_buf, "%s       mx %p: RESUME", 
-                              caller, mutex );
-      print_pthread_event(i, trace_buf);
-   }
-}
-
-
-/* Handle pthread_mutex_lock / pthread_mutex_trylock for thread tid.
-
-   tid:        the requesting thread; must be valid and Runnable.
-   is_trylock: True for trylock (never blocks), False for lock.
-   mutex:      the client's mutex.
-
-   The result is delivered in the thread's %edx via SET_EDX:
-      0        acquired -- or, for a blocking lock on a mutex held by
-               another thread, pre-loaded so it is in place when the
-               thread is eventually resumed as the owner
-      EINVAL   mutex is NULL, of unrecognised kind, or has a negative
-               lock count
-      EBUSY    trylock on a mutex that is already held
-      EDEADLK  blocking lock on a non-recursive mutex we already hold
-   When a blocking lock finds the mutex held by another thread, tid
-   is put into VgTs_WaitMX with associated_mx pointing at the mutex. */
-static
-void do_pthread_mutex_lock( ThreadId tid, 
-                            Bool is_trylock, 
-                            pthread_mutex_t* mutex )
-{
-   Char  msg_buf[100];
-   /* Label for trace output; the two strings are padded to the same
-      width. */
-   Char* caller
-      = is_trylock ? "pthread_mutex_trylock"
-                   : "pthread_mutex_lock   ";
-
-   if (VG_(clo_trace_pthread_level) >= 2) {
-      VG_(sprintf)(msg_buf, "%s    mx %p ...", caller, mutex );
-      print_pthread_event(tid, msg_buf);
-   }
-
-   /* Paranoia ... */
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   /* POSIX doesn't mandate this, but for sanity ... */
-   if (mutex == NULL) {
-      VG_(record_pthread_err)( tid, 
-         "pthread_mutex_lock/trylock: mutex is NULL");
-      SET_EDX(tid, EINVAL);
-      return;
-   }
-
-   /* More paranoia ... */
-   /* Accept only the mutex kinds known for the glibc version being
-      built against, and only with a non-negative lock count;
-      anything else falls through to the "invalid" case. */
-   switch (mutex->__m_kind) {
-#     ifndef GLIBC_2_1
-      case PTHREAD_MUTEX_TIMED_NP:
-      case PTHREAD_MUTEX_ADAPTIVE_NP:
-#     endif
-#     ifdef GLIBC_2_1
-      case PTHREAD_MUTEX_FAST_NP:
-#     endif
-      case PTHREAD_MUTEX_RECURSIVE_NP:
-      case PTHREAD_MUTEX_ERRORCHECK_NP:
-         if (mutex->__m_count >= 0) break;
-         /* else fall thru */
-      default:
-         VG_(record_pthread_err)( tid, 
-            "pthread_mutex_lock/trylock: mutex is invalid");
-         SET_EDX(tid, EINVAL);
-         return;
-   }
-
-   if (mutex->__m_count > 0) {
-
-      vg_assert(VG_(is_valid_tid)((ThreadId)mutex->__m_owner));
-
-      /* Someone has it already. */
-      if ((ThreadId)mutex->__m_owner == tid) {
-         /* It's locked -- by me! */
-         if (mutex->__m_kind == PTHREAD_MUTEX_RECURSIVE_NP) {
-            /* return 0 (success). */
-            mutex->__m_count++;
-            SET_EDX(tid, 0);
-            if (0)
-               VG_(printf)("!!!!!! tid %d, mx %p -> locked %d\n", 
-                           tid, mutex, mutex->__m_count);
-            return;
-         } else {
-            /* Non-recursive relock by the owner: never block, just
-               report the error appropriate to the call. */
-            if (is_trylock)
-               SET_EDX(tid, EBUSY);
-            else
-               SET_EDX(tid, EDEADLK);
-            return;
-         }
-      } else {
-         /* Someone else has it; we have to wait.  Mark ourselves
-            thusly. */
-         /* GUARD: __m_count > 0 && __m_owner is valid */
-         if (is_trylock) {
-            /* caller is polling; so return immediately. */
-            SET_EDX(tid, EBUSY);
-         } else {
-            VG_(threads)[tid].status        = VgTs_WaitMX;
-            VG_(threads)[tid].associated_mx = mutex;
-            SET_EDX(tid, 0); /* pth_mx_lock success value */
-            if (VG_(clo_trace_pthread_level) >= 1) {
-               VG_(sprintf)(msg_buf, "%s    mx %p: BLOCK", 
-                                     caller, mutex );
-               print_pthread_event(tid, msg_buf);
-            }
-	 }
-         return;
-      }
-
-   } else {
-      /* Nobody owns it.  Sanity check ... */
-      vg_assert(mutex->__m_owner == VG_INVALID_THREADID);
-      /* We get it! [for the first time]. */
-      mutex->__m_count = 1;
-      mutex->__m_owner = (_pthread_descr)tid;
-      /* return 0 (success). */
-      SET_EDX(tid, 0);
-   }
-
-}
-
-
-/* Handle pthread_mutex_unlock for thread tid.
-
-   tid:   the requesting thread; must be valid and Runnable.
-   mutex: the client's mutex.
-
-   The result is delivered in the thread's %edx via SET_EDX:
-      0       unlocked (or, for a multiply-locked recursive mutex,
-              the lock count was decremented)
-      EINVAL  mutex is NULL, of unrecognised kind, or has a negative
-              lock count
-      EPERM   mutex is not locked, or is locked by another thread
-   On the final unlock the mutex is handed straight to one waiting
-   thread if there is one; otherwise it is marked as not held. */
-static
-void do_pthread_mutex_unlock ( ThreadId tid,
-                               pthread_mutex_t* mutex )
-{
-   Char msg_buf[100];
-
-   if (VG_(clo_trace_pthread_level) >= 2) {
-      VG_(sprintf)(msg_buf, "pthread_mutex_unlock     mx %p ...", mutex );
-      print_pthread_event(tid, msg_buf);
-   }
-
-   /* Paranoia ... */
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (mutex == NULL) {
-      VG_(record_pthread_err)( tid, 
-         "pthread_mutex_unlock: mutex is NULL");
-      SET_EDX(tid, EINVAL);
-      return;
-   }
-
-   /* More paranoia ... */
-   /* Same validity check as in do_pthread_mutex_lock: known kind and
-      non-negative lock count, else fall through to "invalid". */
-   switch (mutex->__m_kind) {
-#     ifndef GLIBC_2_1    
-      case PTHREAD_MUTEX_TIMED_NP:
-      case PTHREAD_MUTEX_ADAPTIVE_NP:
-#     endif
-#     ifdef GLIBC_2_1
-      case PTHREAD_MUTEX_FAST_NP:
-#     endif
-      case PTHREAD_MUTEX_RECURSIVE_NP:
-      case PTHREAD_MUTEX_ERRORCHECK_NP:
-         if (mutex->__m_count >= 0) break;
-         /* else fall thru */
-      default:
-         VG_(record_pthread_err)( tid, 
-            "pthread_mutex_unlock: mutex is invalid");
-         SET_EDX(tid, EINVAL);
-         return;
-   }
-
-   /* Barf if we don't currently hold the mutex. */
-   if (mutex->__m_count == 0) {
-      /* nobody holds it */
-      VG_(record_pthread_err)( tid, 
-         "pthread_mutex_unlock: mutex is not locked");
-      SET_EDX(tid, EPERM);
-      return;
-   }
-
-   if ((ThreadId)mutex->__m_owner != tid) {
-      /* we don't hold it */
-      VG_(record_pthread_err)( tid, 
-         "pthread_mutex_unlock: mutex is locked by a different thread");
-      SET_EDX(tid, EPERM);
-      return;
-   }
-
-   /* If it's a multiply-locked recursive mutex, just decrement the
-      lock count and return. */
-   if (mutex->__m_count > 1) {
-      vg_assert(mutex->__m_kind == PTHREAD_MUTEX_RECURSIVE_NP);
-      mutex->__m_count --;
-      SET_EDX(tid, 0); /* success */
-      return;
-   }
-
-   /* Now we're sure it is locked exactly once, and by the thread who
-      is now doing an unlock on it.  */
-   vg_assert(mutex->__m_count == 1);
-   vg_assert((ThreadId)mutex->__m_owner == tid);
-
-   /* Release at max one thread waiting on this mutex. */
-   /* NOTE(review): the label passed here is "pthread_mutex_lock",
-      not "pthread_mutex_unlock".  It is only used in the RESUME trace
-      line printed for the woken thread, so this may be deliberate
-      (that thread's pthread_mutex_lock is what resumes) -- confirm. */
-   release_one_thread_waiting_on_mutex ( mutex, "pthread_mutex_lock" );
-
-   /* Our (tid's) pth_unlock() returns with 0 (success). */
-   SET_EDX(tid, 0); /* Success. */
-}
-
-
-/* -----------------------------------------------------------
-   CONDITION VARIABLES
-   -------------------------------------------------------- */
-
-/* The relevant native types are as follows:
-   (copied from /usr/include/bits/pthreadtypes.h)
-
-   -- Conditions (not abstract because of PTHREAD_COND_INITIALIZER
-   typedef struct
-   {
-     struct _pthread_fastlock __c_lock; -- Protect against concurrent access
-     _pthread_descr __c_waiting;        -- Threads waiting on this condition
-   } pthread_cond_t;
-
-   -- Attribute for conditionally variables.
-   typedef struct
-   {
-     int __dummy;
-   } pthread_condattr_t;
-
-   #define PTHREAD_COND_INITIALIZER {__LOCK_INITIALIZER, 0}
-
-   We don't use any fields of pthread_cond_t for anything at all.
-   Only the identity of the CVs is important.
-
-   Linux pthreads supports no attributes on condition variables, so we
-   don't need to think too hard there.  */
-
-
-/* Thread tid's timed wait on a condition variable has timed out.
-   tid must be in VgTs_WaitCV with a finite awaken_at.  Its wait
-   returns ETIMEDOUT, but only once it holds the associated mutex
-   again: if the mutex is free tid takes it and becomes runnable,
-   otherwise tid is moved to VgTs_WaitMX on that mutex. */
-static 
-void do_pthread_cond_timedwait_TIMEOUT ( ThreadId tid )
-{
-   Char             trace_buf[100];
-   pthread_mutex_t* mx;
-   pthread_cond_t*  cv;
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_WaitCV
-             && VG_(threads)[tid].awaken_at != 0xFFFFFFFF);
-   mx = VG_(threads)[tid].associated_mx;
-   vg_assert(mx != NULL);
-   cv = VG_(threads)[tid].associated_cv;
-   vg_assert(cv != NULL);
-
-   if (mx->__m_owner != VG_INVALID_THREADID) {
-      /* Currently held.  Make thread tid be blocked on it. */
-      vg_assert(mx->__m_count > 0);
-      VG_(threads)[tid].status        = VgTs_WaitMX;
-      SET_EDX(tid, ETIMEDOUT);      /* pthread_cond_wait return value */
-      VG_(threads)[tid].associated_cv = NULL;
-      VG_(threads)[tid].associated_mx = mx;
-      if (VG_(clo_trace_pthread_level) >= 1) {
-         VG_(sprintf)(trace_buf, 
-            "pthread_cond_timedwai cv %p: TIMEOUT -> BLOCK for mx %p", 
-            cv, mx );
-         print_pthread_event(tid, trace_buf);
-      }
-      return;
-   }
-
-   /* Currently unheld; hand it out to thread tid. */
-   vg_assert(mx->__m_count == 0);
-   VG_(threads)[tid].status        = VgTs_Runnable;
-   SET_EDX(tid, ETIMEDOUT);         /* pthread_cond_wait return value */
-   VG_(threads)[tid].associated_cv = NULL;
-   VG_(threads)[tid].associated_mx = NULL;
-   mx->__m_owner = (_pthread_descr)tid;
-   mx->__m_count = 1;
-
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(trace_buf, 
-         "pthread_cond_timedwai cv %p: TIMEOUT with mx %p", 
-         cv, mx );
-      print_pthread_event(tid, trace_buf);
-   }
-}
-
-
-/* Wake up to n_to_release threads that are waiting (VgTs_WaitCV) on
-   cond, stopping early if no waiters remain.  A woken thread must
-   hold its associated mutex before it can run: if the mutex is free
-   the thread takes it and becomes runnable, otherwise the thread is
-   moved to VgTs_WaitMX on that mutex.  caller is only used to label
-   the trace output. */
-static
-void release_N_threads_waiting_on_cond ( pthread_cond_t* cond, 
-                                         Int n_to_release, 
-                                         Char* caller )
-{
-   Int              i;
-   Char             trace_buf[100];
-   pthread_mutex_t* mx;
-
-   for ( ; n_to_release != 0; n_to_release--) {
-
-      /* Find a thread waiting on this CV.  Empty slots can never
-         match, since their status is not VgTs_WaitCV. */
-      i = 1;
-      while (i < VG_N_THREADS
-             && !(VG_(threads)[i].status == VgTs_WaitCV
-                  && VG_(threads)[i].associated_cv == cond))
-         i++;
-      vg_assert(i <= VG_N_THREADS);
-
-      /* Nobody else is waiting on it. */
-      if (i == VG_N_THREADS)
-         return;
-
-      mx = VG_(threads)[i].associated_mx;
-      vg_assert(mx != NULL);
-
-      if (mx->__m_owner != VG_INVALID_THREADID) {
-         /* Currently held.  Make thread i be blocked on it. */
-         vg_assert(mx->__m_count > 0);
-         VG_(threads)[i].status        = VgTs_WaitMX;
-         VG_(threads)[i].associated_cv = NULL;
-         VG_(threads)[i].associated_mx = mx;
-         SET_EDX(i, 0); /* pth_cond_wait success value */
-
-         if (VG_(clo_trace_pthread_level) >= 1) {
-            VG_(sprintf)(trace_buf, "%s   cv %p: BLOCK for mx %p", 
-                                    caller, cond, mx );
-            print_pthread_event(i, trace_buf);
-         }
-         continue;
-      }
-
-      /* Currently unheld; hand it out to thread i.  (.m_edx already
-         holds the pth_cond_wait success value, 0.) */
-      vg_assert(mx->__m_count == 0);
-      VG_(threads)[i].status        = VgTs_Runnable;
-      VG_(threads)[i].associated_cv = NULL;
-      VG_(threads)[i].associated_mx = NULL;
-      mx->__m_owner = (_pthread_descr)i;
-      mx->__m_count = 1;
-
-      if (VG_(clo_trace_pthread_level) >= 1) {
-         VG_(sprintf)(trace_buf, "%s   cv %p: RESUME with mx %p", 
-                                 caller, cond, mx );
-         print_pthread_event(i, trace_buf);
-      }
-   }
-}
-
-
-/* Handle pthread_cond_wait / pthread_cond_timedwait for thread tid.
-
-   tid:    the requesting thread; must be valid and Runnable.
-   cond:   the condition variable to wait on.
-   mutex:  the mutex protecting it; must be valid and held by tid.
-   ms_end: 0xFFFFFFFF to wait forever, otherwise the millisecond
-           time at which the wait times out.
-
-   On error, EINVAL is delivered in %edx via SET_EDX (NULL cond or
-   mutex, invalid mutex, or mutex not held by tid) and the thread
-   stays runnable.  Otherwise the thread is put into VgTs_WaitCV and
-   the mutex is released; no return value is set here on that path. */
-static
-void do_pthread_cond_wait ( ThreadId tid,
-                            pthread_cond_t *cond, 
-                            pthread_mutex_t *mutex,
-			    UInt ms_end )
-{
-   Char msg_buf[100];
-
-   /* If ms_end == 0xFFFFFFFF, wait forever (no timeout).  Otherwise,
-      ms_end is the ending millisecond. */
-
-   /* pre: mutex should be a valid mutex and owned by tid. */
-   if (VG_(clo_trace_pthread_level) >= 2) {
-      VG_(sprintf)(msg_buf, "pthread_cond_wait        cv %p, mx %p, end %d ...", 
-                            cond, mutex, ms_end );
-      print_pthread_event(tid, msg_buf);
-   }
-
-   /* Paranoia ... */
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (mutex == NULL || cond == NULL) {
-      VG_(record_pthread_err)( tid, 
-         "pthread_cond_wait/timedwait: cond or mutex is NULL");
-      SET_EDX(tid, EINVAL);
-      return;
-   }
-
-   /* More paranoia ... */
-   /* Same validity check as in the mutex lock/unlock handlers: known
-      kind and non-negative lock count, else "invalid". */
-   switch (mutex->__m_kind) {
-#     ifndef GLIBC_2_1    
-      case PTHREAD_MUTEX_TIMED_NP:
-      case PTHREAD_MUTEX_ADAPTIVE_NP:
-#     endif
-#     ifdef GLIBC_2_1
-      case PTHREAD_MUTEX_FAST_NP:
-#     endif
-      case PTHREAD_MUTEX_RECURSIVE_NP:
-      case PTHREAD_MUTEX_ERRORCHECK_NP:
-         if (mutex->__m_count >= 0) break;
-         /* else fall thru */
-      default:
-         VG_(record_pthread_err)( tid, 
-            "pthread_cond_wait/timedwait: mutex is invalid");
-         SET_EDX(tid, EINVAL);
-         return;
-   }
-
-   /* Barf if we don't currently hold the mutex. */
-   if (mutex->__m_count == 0 /* nobody holds it */
-       || (ThreadId)mutex->__m_owner != tid /* we don't hold it */) {
-         VG_(record_pthread_err)( tid, 
-            "pthread_cond_wait/timedwait: mutex is unlocked "
-            "or is locked but not owned by thread");
-      SET_EDX(tid, EINVAL);
-      return;
-   }
-
-   /* Queue ourselves on the condition. */
-   VG_(threads)[tid].status        = VgTs_WaitCV;
-   VG_(threads)[tid].associated_cv = cond;
-   VG_(threads)[tid].associated_mx = mutex;
-   VG_(threads)[tid].awaken_at     = ms_end;
-
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(msg_buf, 
-                   "pthread_cond_wait        cv %p, mx %p: BLOCK", 
-                   cond, mutex );
-      print_pthread_event(tid, msg_buf);
-   }
-
-   /* Release the mutex. */
-   /* tid is already in VgTs_WaitCV at this point, so the helper's
-      scan for VgTs_WaitMX threads cannot hand the mutex back to tid
-      itself. */
-   release_one_thread_waiting_on_mutex ( mutex, "pthread_cond_wait " );
-}
-
-
-/* Handle pthread_cond_signal (broadcast == False) or
-   pthread_cond_broadcast (broadcast == True) for thread tid.  A
-   signal wakes at most one waiter on cond; a broadcast wakes up to
-   VG_N_THREADS of them.  Result in %edx via SET_EDX: 0 on success,
-   EINVAL if cond is NULL. */
-static
-void do_pthread_cond_signal_or_broadcast ( ThreadId tid, 
-                                           Bool broadcast,
-                                           pthread_cond_t *cond )
-{
-   Char  trace_buf[100];
-   Char* who;
-   Int   n_wake;
-
-   /* The two labels are padded to the same width for the trace. */
-   if (broadcast) {
-      who    = "pthread_cond_broadcast";
-      n_wake = VG_N_THREADS;
-   } else {
-      who    = "pthread_cond_signal   ";
-      n_wake = 1;
-   }
-
-   if (VG_(clo_trace_pthread_level) >= 2) {
-      VG_(sprintf)(trace_buf, "%s   cv %p ...", who, cond );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   /* Paranoia ... */
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (cond == NULL) {
-      VG_(record_pthread_err)( tid, 
-         "pthread_cond_signal/broadcast: cond is NULL");
-      SET_EDX(tid, EINVAL);
-      return;
-   }
-
-   release_N_threads_waiting_on_cond ( cond, n_wake, who );
-
-   SET_EDX(tid, 0); /* success */
-}
-
-
-/* -----------------------------------------------------------
-   THREAD SPECIFIC DATA
-   -------------------------------------------------------- */
-
-/* True iff k is within the key table and that entry is in use. */
-static __inline__
-Bool is_valid_key ( ThreadKey k )
-{
-   /* k is unsigned, so only the upper bound needs checking. */
-   return (k < VG_N_THREAD_KEYS && vg_thread_keys[k].inuse)
-             ? True : False;
-}
-
-/* Handle pthread_key_create for thread tid: claim the first free
-   entry in vg_thread_keys, remember destructor for it, store the
-   entry's index through key, and return 0 in %edx via SET_EDX.
-   Running out of entries is a panic, not an EAGAIN return. */
-static
-void do_pthread_key_create ( ThreadId tid,
-                             pthread_key_t* key,
-                             void (*destructor)(void*) )
-{
-   Int  slot;
-   Char trace_buf[100];
-
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(trace_buf, "pthread_key_create      *key %p, destr %p", 
-                              key, destructor );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(sizeof(pthread_key_t) == sizeof(ThreadKey));
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   /* Look for the first unused entry. */
-   slot = 0;
-   while (slot < VG_N_THREAD_KEYS && vg_thread_keys[slot].inuse)
-      slot++;
-
-   if (slot == VG_N_THREAD_KEYS) {
-      /* SET_EDX(tid, EAGAIN); 
-         return; 
-      */
-      VG_(panic)("pthread_key_create: VG_N_THREAD_KEYS is too low;"
-                 " increase and recompile");
-   }
-
-   vg_thread_keys[slot].destructor = destructor;
-   vg_thread_keys[slot].inuse      = True;
-
-   /* When instrumenting, complain if *key is not writable.  The
-      store below is done regardless. */
-   if (VG_(clo_instrument)
-       && !VGM_(check_writable)( (Addr)key, 
-                                 sizeof(pthread_key_t), NULL))
-      VG_(record_pthread_err)( tid, 
-         "pthread_key_create: key points to invalid location");
-
-   *key = slot;
-
-   /* *key now holds a defined value. */
-   if (VG_(clo_instrument))
-      VGM_(make_readable)( (Addr)key, sizeof(pthread_key_t) );
-
-   SET_EDX(tid, 0);
-}
-
-
-/* Handle pthread_key_delete for thread tid: mark key's entry in
-   vg_thread_keys as unused and clear every live thread's value for
-   that key.
-
-   Result in %edx via SET_EDX: 0 on success, EINVAL if key is out of
-   range or not currently in use. */
-static
-void do_pthread_key_delete ( ThreadId tid, pthread_key_t key )
-{
-   ThreadId t;
-   Char     msg_buf[100];
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(msg_buf, "pthread_key_delete       key %d", 
-                            key );
-      print_pthread_event(tid, msg_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-   
-   if (!is_valid_key(key)) {
-      VG_(record_pthread_err)( tid, 
-         "pthread_key_delete: key is invalid");
-      SET_EDX(tid, EINVAL);
-      return;
-   }
-
-   vg_thread_keys[key].inuse = False;
-
-   /* Optional.  We're not required to do this, although it shouldn't
-      make any difference to programs which use the key/specifics
-      functions correctly.  */
-   /* Uses its own index rather than reusing tid, so that the
-      requesting thread's id is still available below. */
-#  if 1
-   for (t = 1; t < VG_N_THREADS; t++) {
-      if (VG_(threads)[t].status != VgTs_Empty)
-         VG_(threads)[t].specifics[key] = NULL;
-   }
-#  endif
-
-   /* Report success explicitly, as the other request handlers do,
-      rather than relying on whatever %edx happens to hold. */
-   SET_EDX(tid, 0);
-}
-
-
-/* Handle pthread_getspecific for thread tid.  The thread's value for
-   key is returned in %edx via SET_EDX; an invalid key is recorded as
-   an error and yields NULL. */
-static 
-void do_pthread_getspecific ( ThreadId tid, pthread_key_t key )
-{
-   Char trace_buf[100];
-
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(trace_buf, "pthread_getspecific      key %d", 
-                              key );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (is_valid_key(key)) {
-      SET_EDX(tid, (UInt)VG_(threads)[tid].specifics[key]);
-      return;
-   }
-
-   VG_(record_pthread_err)( tid, 
-      "pthread_getspecific: key is invalid");
-   SET_EDX(tid, (UInt)NULL);
-}
-
-
-/* Handle pthread_setspecific for thread tid: store pointer as the
-   thread's value for key.  Result in %edx via SET_EDX: 0 on success,
-   EINVAL if key is invalid. */
-static
-void do_pthread_setspecific ( ThreadId tid, 
-                              pthread_key_t key, 
-                              void *pointer )
-{
-   Char trace_buf[100];
-
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(trace_buf, "pthread_setspecific      key %d, ptr %p", 
-                              key, pointer );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (is_valid_key(key)) {
-      VG_(threads)[tid].specifics[key] = pointer;
-      SET_EDX(tid, 0);
-      return;
-   }
-
-   VG_(record_pthread_err)( tid, 
-      "pthread_setspecific: key is invalid");
-   SET_EDX(tid, EINVAL);
-}
-
-
-/* Helper for calling destructors at thread exit.  key must be within
-   range (asserted).  If it is in use, cu->fn receives the *key*'s
-   destructor, cu->arg receives thread tid's value for that key, and
-   0 is returned in %edx.  If it is not in use, -1 is returned in
-   %edx and cu is left alone. */
-static
-void do__get_key_destr_and_spec ( ThreadId tid, 
-                                  pthread_key_t key,
-                                  CleanupEntry* cu )
-{
-   Char trace_buf[100];
-
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(trace_buf, 
-         "get_key_destr_and_arg (key = %d)", key );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   vg_assert(key >= 0 && key < VG_N_THREAD_KEYS);
-
-   if (vg_thread_keys[key].inuse) {
-      cu->fn  = vg_thread_keys[key].destructor;
-      cu->arg = VG_(threads)[tid].specifics[key];
-      /* Both fields of *cu now hold defined values. */
-      if (VG_(clo_instrument))
-         VGM_(make_readable)( (Addr)cu, sizeof(CleanupEntry) );
-      SET_EDX(tid, 0);
-   } else {
-      SET_EDX(tid, -1);
-   }
-}
-
-
-/* ---------------------------------------------------
-   SIGNALS
-   ------------------------------------------------ */
-
-/* Handle pthread_sigmask for thread tid.  See the comment in
-   vg_libpthread.c:pthread_sigmask() regarding deliberate confusion
-   of types sigset_t and vki_sigset_t.
-
-   When instrumenting, an unreadable newmask or unwritable oldmask is
-   recorded as an error, but the update is carried out regardless and
-   this handler always returns 0 in %edx. */
-static
-void do_pthread_sigmask ( ThreadId tid,
-                          Int vki_how,
-                          vki_ksigset_t* newmask, 
-                          vki_ksigset_t* oldmask )
-{
-   Char trace_buf[100];
-
-   if (VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(trace_buf, 
-         "pthread_sigmask          vki_how %d, newmask %p, oldmask %p",
-         vki_how, newmask, oldmask );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   /* check newmask is addressible/defined */
-   if (VG_(clo_instrument)
-       && newmask
-       && !VGM_(check_readable)( (Addr)newmask, 
-                                 sizeof(vki_ksigset_t), NULL))
-      VG_(record_pthread_err)( tid, 
-         "pthread_sigmask: newmask contains "
-         "unaddressible or undefined bytes");
-
-   /* check oldmask is addressible */
-   if (VG_(clo_instrument)
-       && oldmask
-       && !VGM_(check_writable)( (Addr)oldmask, 
-                                 sizeof(vki_ksigset_t), NULL))
-      VG_(record_pthread_err)( tid, 
-         "pthread_sigmask: oldmask contains "
-         "unaddressible bytes");
-
-   VG_(do_pthread_sigmask_SCSS_upd) ( tid, vki_how, newmask, oldmask );
-
-   /* If an old mask was requested, mark it as defined. */
-   if (oldmask && VG_(clo_instrument))
-      VGM_(make_readable)( (Addr)oldmask, sizeof(vki_ksigset_t) );
-
-   /* Success. */
-   SET_EDX(tid, 0);
-}
-
-
-/* Handle sigwait for thread tid: record *set as the signals the
-   thread is waiting for, put it into VgTs_WaitSIG, and have the
-   signal machinery pick up the change.  sig is used here only in the
-   trace message; no return value is set. */
-static
-void do_sigwait ( ThreadId tid,
-                  vki_ksigset_t* set, 
-                  Int* sig )
-{
-   Char          trace_buf[100];
-   vki_ksigset_t saved_host_mask;   /* written, but never read here */
-
-   if (VG_(clo_trace_signals) || VG_(clo_trace_sched)) {
-      VG_(sprintf)(trace_buf, 
-         "suspend due to sigwait(): set %p, sig %p",
-         set, sig );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   /* Change SCSS */
-   VG_(threads)[tid].sigs_waited_for = *set;
-   VG_(threads)[tid].status          = VgTs_WaitSIG;
-
-   VG_(block_all_host_signals)( &saved_host_mask );
-   VG_(handle_SCSS_change)( False /* lazy update */ );
-}
-
-
-/* Handle pthread_kill: thread tid asks for signal sig to be sent to
-   thread `thread'.  Result in %edx via SET_EDX: 0 on success,
-   -VKI_ESRCH if the target is not a valid thread, -VKI_EINVAL if sig
-   is outside 1 .. VKI_KNSIG.  Note the error values are negated
-   here, unlike in the other pthread request handlers. */
-static
-void do_pthread_kill ( ThreadId tid, /* me */
-                       ThreadId thread, /* thread to signal */
-                       Int sig )
-{
-   Char trace_buf[100];
-
-   if (VG_(clo_trace_signals) || VG_(clo_trace_pthread_level) >= 1) {
-      VG_(sprintf)(trace_buf, 
-         "pthread_kill            thread %d, signo %d",
-         thread, sig );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (!VG_(is_valid_tid)(thread)) {
-      /* The target is checked first, so a bad thread wins over a bad
-         signal number. */
-      VG_(record_pthread_err)( tid, 
-         "pthread_kill: invalid target thread");
-      SET_EDX(tid, -VKI_ESRCH);
-   } else if (sig < 1 || sig > VKI_KNSIG) {
-      SET_EDX(tid, -VKI_EINVAL);
-   } else {
-      VG_(send_signal_to_thread)( thread, sig );
-      SET_EDX(tid, 0);
-   }
-}
-
-
-/* -----------------------------------------------------------
-   FORK HANDLERS.
-   -------------------------------------------------------- */
-
-/* Set the number of fork-handler stack entries in use to n.  Result
-   in %edx via SET_EDX: 0 on success, -1 if n is outside
-   0 .. VG_N_FORKHANDLERSTACK-1 (in which case nothing changes). */
-static 
-void do__set_fhstack_used ( ThreadId tid, Int n )
-{
-   Char trace_buf[100];
-
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(trace_buf, "set_fhstack_used to %d", n );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (n < 0 || n >= VG_N_FORKHANDLERSTACK) {
-      SET_EDX(tid, -1);
-      return;
-   }
-
-   vg_fhstack_used = n;
-   SET_EDX(tid, 0);
-}
-
-
-/* Return, in %edx via SET_EDX, the number of fork-handler stack
-   entries currently in use. */
-static
-void do__get_fhstack_used ( ThreadId tid )
-{
-   Char trace_buf[100];
-   Int  n;
-
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(trace_buf, "get_fhstack_used" );
-      print_pthread_event(tid, trace_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   /* Sanity-check the stored count before handing it out. */
-   n = vg_fhstack_used;
-   vg_assert(n >= 0 && n < VG_N_FORKHANDLERSTACK);
-   SET_EDX(tid, n);
-}
-
-/* Copy *fh into entry n of the fork-handler stack (vg_fhstack).
-
-   Result in %edx via SET_EDX: 0 on success, -1 if n is outside
-   0 .. VG_N_FORKHANDLERSTACK-1, in which case nothing is stored.
-   When instrumenting, an unreadable *fh is recorded as an error but
-   the copy is still attempted. */
-static
-void do__set_fhstack_entry ( ThreadId tid, Int n, ForkHandlerEntry* fh )
-{
-   Char msg_buf[100];
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, "set_fhstack_entry %d to %p", n, fh );
-      print_pthread_event(tid, msg_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (VG_(clo_instrument)) {
-      /* check fh is addressible/defined */
-      if (!VGM_(check_readable)( (Addr)fh,
-                                 sizeof(ForkHandlerEntry), NULL)) {
-         VG_(record_pthread_err)( tid, 
-            "pthread_atfork: prepare/parent/child contains "
-            "unaddressible or undefined bytes");
-      }
-   }
-
-   /* Reject an index that is out of range on either side.  (This
-      must be ||: with && the test could never be true and a bad n
-      would index outside vg_fhstack.) */
-   if (n < 0 || n >= VG_N_FORKHANDLERSTACK) {
-      SET_EDX(tid, -1);
-      return;
-   } 
-
-   vg_fhstack[n] = *fh;
-   SET_EDX(tid, 0);
-}
-
-
-/* Copy entry n of the fork-handler stack (vg_fhstack) into *fh.
-
-   Result in %edx via SET_EDX: 0 on success, -1 if n is outside
-   0 .. VG_N_FORKHANDLERSTACK-1, in which case *fh is not written.
-   When instrumenting, an unwritable *fh is recorded as an error but
-   the copy is still attempted, and *fh is marked readable
-   afterwards. */
-static
-void do__get_fhstack_entry ( ThreadId tid, Int n, /*OUT*/
-                                                  ForkHandlerEntry* fh )
-{
-   Char msg_buf[100];
-   if (VG_(clo_trace_sched)) {
-      VG_(sprintf)(msg_buf, "get_fhstack_entry %d", n );
-      print_pthread_event(tid, msg_buf);
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid) 
-             && VG_(threads)[tid].status == VgTs_Runnable);
-
-   if (VG_(clo_instrument)) {
-      /* check fh is addressible/defined */
-      if (!VGM_(check_writable)( (Addr)fh,
-                                 sizeof(ForkHandlerEntry), NULL)) {
-         VG_(record_pthread_err)( tid, 
-            "fork: prepare/parent/child contains "
-            "unaddressible bytes");
-      }
-   }
-
-   /* Reject an index that is out of range on either side.  (This
-      must be ||: with && the test could never be true and a bad n
-      would index outside vg_fhstack.) */
-   if (n < 0 || n >= VG_N_FORKHANDLERSTACK) {
-      SET_EDX(tid, -1);
-      return;
-   } 
-
-   *fh = vg_fhstack[n];
-   SET_EDX(tid, 0);
-
-   /* *fh now holds defined data. */
-   if (VG_(clo_instrument)) {
-      VGM_(make_readable)( (Addr)fh, sizeof(ForkHandlerEntry) );
-   }
-}
-
-
-/* ---------------------------------------------------------------------
-   Handle client requests.
-   ------------------------------------------------------------------ */
-
-/* Do a client request for the thread tid.  After the request, tid may
-   or may not still be runnable; if not, the scheduler will have to
-   choose a new thread to run.  
-*/
-static
-void do_client_request ( ThreadId tid )
-{
-   /* Deliver vvv as the request's result: store it in the thread's
-      m_edx and set sh_edx to VGM_WORD_VALID.  Cases that do not use
-      this macro either call SET_EDX themselves or delegate to a
-      do_* helper. */
-#  define RETURN_WITH(vvv)                        \
-       { tst->m_edx = (vvv);                      \
-         tst->sh_edx = VGM_WORD_VALID;            \
-       }
-
-   /* The thread's m_eax holds the address of the request block:
-      arg[0] is the request number, arg[1..] are its operands. */
-   ThreadState* tst    = &VG_(threads)[tid];
-   UInt*        arg    = (UInt*)(VG_(threads)[tid].m_eax);
-   UInt         req_no = arg[0];
-
-   /* VG_(printf)("req no = 0x%x\n", req_no); */
-   switch (req_no) {
-
-      /* --- Allocation / deallocation requests --- */
-
-      case VG_USERREQ__MALLOC:
-         RETURN_WITH(
-            (UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocMalloc ) 
-         );
-         break;
-
-      case VG_USERREQ__BUILTIN_NEW:
-         RETURN_WITH(
-            (UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocNew )
-         );
-         break;
-
-      case VG_USERREQ__BUILTIN_VEC_NEW:
-         RETURN_WITH(
-            (UInt)VG_(client_malloc) ( tst, arg[1], Vg_AllocNewVec )
-         );
-         break;
-
-      case VG_USERREQ__FREE:
-         VG_(client_free) ( tst, (void*)arg[1], Vg_AllocMalloc );
-	 RETURN_WITH(0); /* irrelevant */
-         break;
-
-      case VG_USERREQ__BUILTIN_DELETE:
-         VG_(client_free) ( tst, (void*)arg[1], Vg_AllocNew );
-	 RETURN_WITH(0); /* irrelevant */
-         break;
-
-      case VG_USERREQ__BUILTIN_VEC_DELETE:
-         VG_(client_free) ( tst, (void*)arg[1], Vg_AllocNewVec );
-	 RETURN_WITH(0); /* irrelevant */
-         break;
-
-      case VG_USERREQ__CALLOC:
-         RETURN_WITH(
-            (UInt)VG_(client_calloc) ( tst, arg[1], arg[2] )
-         );
-         break;
-
-      case VG_USERREQ__REALLOC:
-         RETURN_WITH(
-            (UInt)VG_(client_realloc) ( tst, (void*)arg[1], arg[2] )
-         );
-         break;
-
-      case VG_USERREQ__MEMALIGN:
-         RETURN_WITH(
-            (UInt)VG_(client_memalign) ( tst, arg[1], arg[2] )
-         );
-         break;
-
-      /* --- Simple queries answered directly --- */
-
-      case VG_USERREQ__PTHREAD_GET_THREADID:
-         RETURN_WITH(tid);
-         break;
-
-      case VG_USERREQ__RUNNING_ON_VALGRIND:
-         RETURN_WITH(1);
-         break;
-
-      case VG_USERREQ__GET_PTHREAD_TRACE_LEVEL:
-         RETURN_WITH(VG_(clo_trace_pthread_level));
-         break;
-
-      case VG_USERREQ__READ_MILLISECOND_TIMER:
-         RETURN_WITH(VG_(read_millisecond_timer)());
-         break;
-
-      /* Some of these may make thread tid non-runnable, but the
-         scheduler checks for that on return from this function. */
-      case VG_USERREQ__PTHREAD_MUTEX_LOCK:
-         do_pthread_mutex_lock( tid, False, (void *)(arg[1]) );
-         break;
-
-      case VG_USERREQ__PTHREAD_MUTEX_TRYLOCK:
-         do_pthread_mutex_lock( tid, True, (void *)(arg[1]) );
-         break;
-
-      case VG_USERREQ__PTHREAD_MUTEX_UNLOCK:
-         do_pthread_mutex_unlock( tid, (void *)(arg[1]) );
-         break;
-
-      case VG_USERREQ__PTHREAD_GETSPECIFIC:
- 	 do_pthread_getspecific ( tid, (UInt)(arg[1]) );
-         break;
-
-      case VG_USERREQ__SET_CANCELTYPE:
-         do__set_canceltype ( tid, arg[1] );
-         break;
-
-      case VG_USERREQ__CLEANUP_PUSH:
-         do__cleanup_push ( tid, (CleanupEntry*)(arg[1]) );
-         break;
-
-      case VG_USERREQ__CLEANUP_POP:
-         do__cleanup_pop ( tid, (CleanupEntry*)(arg[1]) );
-         break;
-
-      case VG_USERREQ__TESTCANCEL:
-         do__testcancel ( tid );
-         break;
-
-      case VG_USERREQ__GET_N_SIGS_RETURNED:
-         RETURN_WITH(VG_(threads)[tid].n_signals_returned);
-         break;
-
-      case VG_USERREQ__PTHREAD_JOIN:
-         do_pthread_join( tid, arg[1], (void**)(arg[2]) );
-         break;
-
-      /* Plain and timed condition waits share one helper; the plain
-         wait passes 0xFFFFFFFF as its timeout argument. */
-      case VG_USERREQ__PTHREAD_COND_WAIT:
-         do_pthread_cond_wait( tid, 
-                               (pthread_cond_t *)(arg[1]),
-                               (pthread_mutex_t *)(arg[2]),
-                               0xFFFFFFFF /* no timeout */ );
-         break;
-
-      case VG_USERREQ__PTHREAD_COND_TIMEDWAIT:
-         do_pthread_cond_wait( tid, 
-                               (pthread_cond_t *)(arg[1]),
-                               (pthread_mutex_t *)(arg[2]),
-                               arg[3] /* timeout millisecond point */ );
-         break;
-
-      case VG_USERREQ__PTHREAD_COND_SIGNAL:
-         do_pthread_cond_signal_or_broadcast( 
-            tid, 
-	    False, /* signal, not broadcast */
-            (pthread_cond_t *)(arg[1]) );
-         break;
-
-      case VG_USERREQ__PTHREAD_COND_BROADCAST:
-         do_pthread_cond_signal_or_broadcast( 
-            tid, 
-	    True, /* broadcast, not signal */
-            (pthread_cond_t *)(arg[1]) );
-         break;
-
-      case VG_USERREQ__PTHREAD_KEY_CREATE:
- 	 do_pthread_key_create ( tid, 
-                                 (pthread_key_t*)(arg[1]),
-                                 (void(*)(void*))(arg[2]) );
-	 break;
-
-      case VG_USERREQ__PTHREAD_KEY_DELETE:
- 	 do_pthread_key_delete ( tid, 
-                                 (pthread_key_t)(arg[1]) );
- 	 break;
-
-      case VG_USERREQ__PTHREAD_SETSPECIFIC:
- 	 do_pthread_setspecific ( tid, 
-                                  (pthread_key_t)(arg[1]),
-				  (void*)(arg[2]) );
- 	 break;
-
-      case VG_USERREQ__PTHREAD_SIGMASK:
-         do_pthread_sigmask ( tid,
-                              arg[1],
-                              (vki_ksigset_t*)(arg[2]),
-                              (vki_ksigset_t*)(arg[3]) );
-	 break;
-
-      case VG_USERREQ__SIGWAIT:
-         do_sigwait ( tid,
-                      (vki_ksigset_t*)(arg[1]),
-                      (Int*)(arg[2]) );
-	 break;
-
-      case VG_USERREQ__PTHREAD_KILL:
-         do_pthread_kill ( tid, arg[1], arg[2] );
-	 break;
-
-      case VG_USERREQ__PTHREAD_YIELD:
-         do_pthread_yield ( tid );
-         /* On return from do_client_request(), the scheduler will
-            select a new thread to run. */
-	 break;
-
-      case VG_USERREQ__SET_CANCELSTATE:
-         do__set_cancelstate ( tid, arg[1] );
-         break;
-
-      case VG_USERREQ__SET_OR_GET_DETACH:
-         do__set_or_get_detach ( tid, arg[1], arg[2] );
-         break;
-
-      case VG_USERREQ__SET_CANCELPEND:
-         do__set_cancelpend ( tid, arg[1], (void(*)(void*))arg[2] );
-         break;
-
-      case VG_USERREQ__WAIT_JOINER:
-         do__wait_joiner ( tid, (void*)arg[1] );
-         break;
-
-      case VG_USERREQ__QUIT:
-         do__quit ( tid );
-         break;
-
-      case VG_USERREQ__APPLY_IN_NEW_THREAD:
-         do__apply_in_new_thread ( tid, (void*(*)(void*))arg[1], 
-                                        (void*)arg[2] );
-         break;
-
-      case VG_USERREQ__GET_KEY_D_AND_S:
-         do__get_key_destr_and_spec ( tid, 
-                                      (pthread_key_t)arg[1],
-                                      (CleanupEntry*)arg[2] );
-         break;
-
-      case VG_USERREQ__NUKE_OTHER_THREADS:
-         VG_(nuke_all_threads_except) ( tid );
-         SET_EDX(tid, 0);
-         break;
-
-      case VG_USERREQ__PTHREAD_ERROR:
-         VG_(record_pthread_err)( tid, (Char*)(arg[1]) );
-         SET_EDX(tid, 0);
-         break;
-
-      /* --- Fork-handler stack accessors --- */
-
-      case VG_USERREQ__SET_FHSTACK_USED:
-         do__set_fhstack_used( tid, (Int)(arg[1]) );
-         break;
-
-      case VG_USERREQ__GET_FHSTACK_USED:
-         do__get_fhstack_used( tid );
-         break;
-
-      case VG_USERREQ__SET_FHSTACK_ENTRY:
-         do__set_fhstack_entry( tid, (Int)(arg[1]),
-                                     (ForkHandlerEntry*)(arg[2]) );
-         break;
-
-      case VG_USERREQ__GET_FHSTACK_ENTRY:
-         do__get_fhstack_entry( tid, (Int)(arg[1]),
-                                     (ForkHandlerEntry*)(arg[2]) );
-         break;
-
-      /* These requests are not handled here: the whole request block
-         is forwarded to VG_(handle_client_request) and its return
-         value becomes the client's result. */
-      case VG_USERREQ__MAKE_NOACCESS:
-      case VG_USERREQ__MAKE_WRITABLE:
-      case VG_USERREQ__MAKE_READABLE:
-      case VG_USERREQ__DISCARD:
-      case VG_USERREQ__CHECK_WRITABLE:
-      case VG_USERREQ__CHECK_READABLE:
-      case VG_USERREQ__MAKE_NOACCESS_STACK:
-      case VG_USERREQ__DO_LEAK_CHECK:
-      case VG_USERREQ__DISCARD_TRANSLATIONS:
-         SET_EDX(
-            tid, 
-            VG_(handle_client_request) ( &VG_(threads)[tid], arg )
-         );
-	 break;
-
-      case VG_USERREQ__SIGNAL_RETURNS: 
-         handle_signal_return(tid);
-	 break;
-
-      /* An unrecognised request number is treated as fatal. */
-      default:
-         VG_(printf)("panic'd on client request = 0x%x\n", arg[0] );
-         VG_(panic)("do_client_request: "
-                    "unknown request");
-         /*NOTREACHED*/
-         break;
-   }
-
-#  undef RETURN_WITH
-}
-
-
-/* ---------------------------------------------------------------------
-   Sanity checking.
-   ------------------------------------------------------------------ */
-
-/* Internal consistency checks on the sched/pthread structures. */
-static
-void scheduler_sanity ( void )
-{
-   pthread_mutex_t* mx;
-   pthread_cond_t*  cv;
-   Int              i;
-
-   /* VG_(printf)("scheduler_sanity\n"); */
-   /* Pass 1: per-thread checks over slots 1 .. VG_N_THREADS-1. */
-   for (i = 1; i < VG_N_THREADS; i++) {
-      mx = VG_(threads)[i].associated_mx;
-      cv = VG_(threads)[i].associated_cv;
-      if (VG_(threads)[i].status == VgTs_WaitMX) {
-	 /* If we're waiting on a MX: (1) the mx is not null, (2, 3)
-            it's actually held by someone, since otherwise this thread
-            is deadlocked, (4) the mutex's owner is not us, since
-            otherwise this thread is also deadlocked.  The logic in
-            do_pthread_mutex_lock rejects attempts by a thread to lock
-            a (non-recursive) mutex which it already owns.
-
-            (2) has been seen to fail sometimes.  I don't know why.
-            Possibly to do with signals. */
-         vg_assert(cv == NULL);
-         /* 1 */ vg_assert(mx != NULL);
-	 /* 2 */ vg_assert(mx->__m_count > 0);
-         /* 3 */ vg_assert(VG_(is_valid_tid)((ThreadId)mx->__m_owner));
-         /* 4 */ vg_assert(i != (ThreadId)mx->__m_owner); 
-      } else 
-      if (VG_(threads)[i].status == VgTs_WaitCV) {
-         /* A thread waiting on a condition variable must have both
-            the cv and its associated mutex recorded. */
-         vg_assert(cv != NULL);
-         vg_assert(mx != NULL);
-      } else {
-         /* Unfortunately these don't hold true when a sighandler is
-            running.  To be fixed. */
-         /* vg_assert(cv == NULL); */
-         /* vg_assert(mx == NULL); */
-      }
-
-      if (VG_(threads)[i].status != VgTs_Empty) {
-         /* Stack usage is measured as the distance from the thread's
-            recorded highest stack word down to its current %esp. */
-         Int
-         stack_used = (Addr)VG_(threads)[i].stack_highest_word 
-                      - (Addr)VG_(threads)[i].m_esp;
-         /* NOTE(review): the limit tested and printed here is
-            VG_PTHREAD_STACK_MIN, but the advice below names
-            VG_PTHREAD_STACK_SIZE -- confirm which constant the user
-            should really change. */
-         if (i > 1 /* not the root thread */ 
-             && stack_used 
-                >= (VG_PTHREAD_STACK_MIN - 1000 /* paranoia */)) {
-            VG_(message)(Vg_UserMsg,
-               "Warning: STACK OVERFLOW: "
-               "thread %d: stack used %d, available %d", 
-               i, stack_used, VG_PTHREAD_STACK_MIN );
-            VG_(message)(Vg_UserMsg,
-               "Terminating Valgrind.  If thread(s) "
-               "really need more stack, increase");
-            VG_(message)(Vg_UserMsg,
-               "VG_PTHREAD_STACK_SIZE in vg_include.h and recompile.");
-            VG_(exit)(1);
-	 }
-
-         /* sigs_waited_for must be non-empty exactly when the thread
-            is in VgTs_WaitSIG. */
-         if (VG_(threads)[i].status == VgTs_WaitSIG) {
-            vg_assert( ! VG_(kisemptysigset)(
-                            & VG_(threads)[i].sigs_waited_for) );
-	 } else {
-            vg_assert( VG_(kisemptysigset)(
-                          & VG_(threads)[i].sigs_waited_for) );
-	 }
-
-      }
-   }
-
-   /* Pass 2: a thread-specific-data key that is not in use must not
-      have a destructor recorded. */
-   for (i = 0; i < VG_N_THREAD_KEYS; i++) {
-      if (!vg_thread_keys[i].inuse)
-         vg_assert(vg_thread_keys[i].destructor == NULL);
-   }
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                           vg_scheduler.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_signals.c b/coregrind/vg_signals.c
deleted file mode 100644
index f58ec11e96..0000000000
--- a/coregrind/vg_signals.c
+++ /dev/null
@@ -1,1531 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Implementation of POSIX signals.                             ---*/
-/*---                                                 vg_signals.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-
-#include "vg_include.h"
-#include "vg_constants.h"
-#include "vg_unsafe.h"
-#include "valgrind.h"  /* for VALGRIND_MAGIC_SEQUENCE */
-
-/* Define to give more sanity checking for signals. */
-#define DEBUG_SIGNALS
-
-
-/* KNOWN BUGS 24 May 02:
-
-   - A signal is not masked in its own handler.  Neither are the
-     signals in the signal's blocking mask.
-
-   - There is only one pending set for the entire process, whereas
-     POSIX seems to require each thread have its own pending set.
-     This means that a signal can only be pending for one thread at
-     a time.
-
-   - The following causes an infinite loop: start Hugs, Feb 2001 
-     version, and do Control-C at the prompt.  There is an infinite
-     series of sigints delivered (to the client); but also seemingly
-     to valgrind, which is very strange.  I don't know why.
-
-   Probably a lot more bugs which I haven't discovered yet.
-*/
-
-
-/* ---------------------------------------------------------------------
-   Forwards decls.
-   ------------------------------------------------------------------ */
-
-static void vg_oursignalhandler ( Int sigNo );
-
-
-/* ---------------------------------------------------------------------
-   HIGH LEVEL STUFF TO DO WITH SIGNALS: POLICY (MOSTLY)
-   ------------------------------------------------------------------ */
-
-/* ---------------------------------------------------------------------
-   Signal state for this process.
-   ------------------------------------------------------------------ */
-
-
-/* Base-ment of these arrays[VKI_KNSIG].
-
-   Valid signal numbers are 1 .. VKI_KNSIG inclusive.
-   Rather than subtracting 1 for indexing these arrays, which
-   is tedious and error-prone, they are simply dimensioned 1 larger,
-   and entry [0] is not used. 
- */
-
-
-/* -----------------------------------------------------
-   Static client signal state (SCSS).  This is the state
-   that the client thinks it has the kernel in.  
-   SCSS records verbatim the client's settings.  These 
-   are mashed around only when SKSS is calculated from it.
-   -------------------------------------------------- */
-
-/* What the client last asked for, for one signal.  The four fields
-   are copied verbatim from / to the client's sigaction structure by
-   VG_(do__NR_sigaction). */
-typedef 
-   struct {
-      void* scss_handler;  /* VKI_SIG_DFL or VKI_SIG_IGN or ptr to
-                              client's handler */
-      UInt  scss_flags;    /* client's ksa_flags, unmodified */
-      vki_ksigset_t scss_mask;  /* client's ksa_mask, unmodified */
-      void* scss_restorer; /* god knows; we ignore it. */
-   }
-   SCSS_Per_Signal;
-
-/* The complete static client signal state for the process. */
-typedef 
-   struct {
-      /* per-signal info; indexed directly by signal number, so entry
-         [0] is unused */
-      SCSS_Per_Signal scss_per_sig[1+VKI_KNSIG];
-
-      /* Signal delivery stack, if any.  A ss_size of zero is reported
-         to the client as VKI_SS_DISABLE (see sas_ss_flags). */
-      vki_kstack_t altstack;
-
-      /* Additional elements to SCSS not stored here:
-         - for each thread, the thread's blocking mask
-         - for each thread in WaitSIG, the set of waited-on sigs
-      */
-      } 
-      SCSS;
-
-static SCSS vg_scss;
-
-
-/* -----------------------------------------------------
-   Static kernel signal state (SKSS).  This is the state
-   that we have the kernel in.  It is computed from SCSS.
-   -------------------------------------------------- */
-
-/* Let's do: 
-     sigprocmask assigns to all thread masks
-     so that at least everything is always consistent
-   Flags, as handled when SKSS is computed from SCSS:
-     SA_NOCLDSTOP -- passed to kernel
-     SA_ONESHOT or SA_RESETHAND -- client setting is ignored
-     SA_RESTART -- we observe this but set our handlers always to restart
-     SA_NOMASK or SA_NODEFER -- client setting is ignored
-     SA_ONSTACK -- client setting is irrelevant to SKSS; we always
-                   request it for our own handler
-*/
-
-
-/* What we have actually installed in the kernel for one signal.
-   VG_(handle_SCSS_change) copies these two fields into the
-   ksa_handler and ksa_flags of the sigaction it hands to the
-   kernel. */
-typedef 
-   struct {
-      void* skss_handler;  /* VKI_SIG_DFL or VKI_SIG_IGN 
-                              or ptr to our handler */
-      UInt skss_flags;
-      /* There is no skss_mask, since we know that we will always ask
-         for all signals to be blocked in our one-and-only
-         sighandler. */
-      /* Also there is no skss_restorer. */
-   }
-   SKSS_Per_Signal;
-
-/* The complete static kernel signal state: one entry per signal
-   (indexed directly by signal number, entry [0] unused) plus the
-   process-wide blocked mask. */
-typedef 
-   struct {
-      SKSS_Per_Signal skss_per_sig[1+VKI_KNSIG];
-      vki_ksigset_t skss_sigmask; /* process' blocked signal mask */   
-   } 
-   SKSS;
-
-static SKSS vg_skss;
-
-
-/* -----------------------------------------------------
-   Dynamic client signal state (DCSS).  This holds transient
-   information about state of client signals.
-   -------------------------------------------------- */
-
-/* Pending-signal bookkeeping.  Both arrays are indexed directly by
-   signal number, so entry [0] is unused. */
-typedef 
-   struct {
-      /* True iff a signal has been received but not yet passed to
-         client. */
-      Bool dcss_sigpending[1+VKI_KNSIG];
-      /* If sigpending[] is True, has meaning: 
-         VG_INVALID_THREADID -- to be passed to any suitable thread 
-         other -- to be passed only to the specified thread. */
-      ThreadId dcss_destthread[1+VKI_KNSIG];
-   } 
-   DCSS;
-
-static DCSS vg_dcss;
-
-
-/* ---------------------------------------------------------------------
-   Compute the SKSS required by the current SCSS.
-   ------------------------------------------------------------------ */
-
-/* Debugging aid: print the current SKSS (vg_skss) -- one line per
-   signal showing its handler and flags, followed by the process-wide
-   blocked mask with the high word first. */
-static 
-void pp_SKSS ( void )
-{
-   Int              signo = 1;
-   SKSS_Per_Signal* entry;
-
-   VG_(printf)("\n\nSKSS:\n");
-
-   while (signo <= VKI_KNSIG) {
-      entry = &vg_skss.skss_per_sig[signo];
-      VG_(printf)("sig %d:  handler 0x%x,  flags 0x%x\n", 
-                  signo, entry->skss_handler, entry->skss_flags );
-      signo++;
-   }
-
-   VG_(printf)("Global sigmask (63 .. 0) = 0x%x 0x%x\n",
-               vg_skss.skss_sigmask.ws[1], vg_skss.skss_sigmask.ws[0] );
-}
-
-/* True iff some thread slot (1 .. VG_N_THREADS-1) is in state
-   VgTs_WaitSIG and has sig in its sigs_waited_for set. */
-static __inline__
-Bool is_WaitSIGd_by_any_thread ( Int sig )
-{
-   ThreadId t = 1;
-
-   while (t < VG_N_THREADS) {
-      /* Only threads actually blocked in a signal wait are of
-         interest; for those, test membership of sig. */
-      if (VG_(threads)[t].status == VgTs_WaitSIG
-          && VG_(ksigismember)( &VG_(threads)[t].sigs_waited_for, sig ))
-         return True;
-      t++;
-   }
-
-   return False;
-}
-
-/* True iff every non-empty thread slot (1 .. VG_N_THREADS-1) has sig
-   in its sig_mask.  Trivially True when no slot is occupied. */
-static __inline__
-Bool is_blocked_by_all_threads ( Int sig )
-{
-   ThreadId t;
-
-   for (t = 1; t < VG_N_THREADS; t++) {
-      /* A single live thread that does not block sig is enough to
-         give the answer. */
-      if (VG_(threads)[t].status != VgTs_Empty
-          && !VG_(ksigismember)( &VG_(threads)[t].sig_mask, sig ))
-         return False;
-   }
-
-   return True;
-}
-
-
-/* This is the core, clever bit.  Computation is as follows:
-
-   For each signal
-      handler = if client has a handler, then our handler
-                else if is WaitSIG'd by any thread, then our handler
-                else if client is DFL, then DFL
-                else (client must be IGN) IGN
-
-      blocked = if is blocked by all threads and not WaitSIG'd by
-                   any thread
-                then BLOCKED 
-                else UNBLOCKED
-*/
-static
-void calculate_SKSS_from_SCSS ( SKSS* dst )
-{
-   Int   sig;
-   void* skss_handler;
-   void* scss_handler;
-   Bool  iz_WaitSIGd_by_any_thread;
-   Bool  iz_blocked_by_all_threads;
-   Bool  skss_blocked;
-   UInt  scss_flags;
-   UInt  skss_flags;
-
-   /* Start from an empty blocked mask; signals are added to it one by
-      one in the loop below. */
-   VG_(ksigemptyset)( &dst->skss_sigmask );
-
-   for (sig = 1; sig <= VKI_KNSIG; sig++) {
-
-      /* Calculate kernel handler and blockedness for sig, as per rules
-         in above comment. */
-
-      iz_WaitSIGd_by_any_thread = is_WaitSIGd_by_any_thread(sig);
-      iz_blocked_by_all_threads = is_blocked_by_all_threads(sig);
-  
-      scss_handler = vg_scss.scss_per_sig[sig].scss_handler;
-      scss_flags   = vg_scss.scss_per_sig[sig].scss_flags;
-
-      /* Restorer */
-      /* 
-      Doesn't seem like we can spin this one.
-      if (vg_scss.scss_per_sig[sig].scss_restorer != NULL)
-         VG_(unimplemented)
-            ("sigactions with non-NULL .sa_restorer field");
-      */
-
-      /* Handler */
-
-      /* Our own handler is installed whenever the client has a real
-         handler or some thread is waiting for the signal; otherwise
-         the client's DFL/IGN choice is mirrored. */
-      if (scss_handler != VKI_SIG_DFL && scss_handler != VKI_SIG_IGN) {
-         skss_handler = &vg_oursignalhandler;
-      } else
-      if (iz_WaitSIGd_by_any_thread) {
-         skss_handler = &vg_oursignalhandler;
-      } else
-      if (scss_handler == VKI_SIG_DFL) {
-         skss_handler = VKI_SIG_DFL;
-      }
-      else {
-         vg_assert(scss_handler == VKI_SIG_IGN);
-         skss_handler = VKI_SIG_IGN;
-      }
-
-      /* Blockfulness */
-
-      skss_blocked
-         = iz_blocked_by_all_threads && !iz_WaitSIGd_by_any_thread;
-
-      /* Flags */
-
-      /* The kernel flags are built from scratch: only SA_NOCLDSTOP is
-         taken from the client; SA_RESTART and SA_ONSTACK are always
-         added. */
-      skss_flags = 0;
-      /* SA_NOCLDSTOP: pass to kernel */
-      if (scss_flags & VKI_SA_NOCLDSTOP)
-         skss_flags |= VKI_SA_NOCLDSTOP;
-      /* SA_ONESHOT: ignore client setting */
-      /*
-      if (!(scss_flags & VKI_SA_ONESHOT))
-         VG_(unimplemented)
-            ("sigactions without SA_ONESHOT");
-      vg_assert(scss_flags & VKI_SA_ONESHOT);
-      skss_flags |= VKI_SA_ONESHOT;
-      */
-      /* SA_RESTART: ignore client setting and set for us */
-      skss_flags |= VKI_SA_RESTART;
-      /* SA_NOMASK: not allowed */
-      /*
-      .. well, ignore it. 
-      if (scss_flags & VKI_SA_NOMASK)
-         VG_(unimplemented)
-            ("sigactions with SA_NOMASK");
-      vg_assert(!(scss_flags & VKI_SA_NOMASK));
-      */
-      /* SA_ONSTACK: client setting is irrelevant here */
-      /*
-      if (scss_flags & VKI_SA_ONSTACK)
-         VG_(unimplemented)
-            ("signals on an alternative stack (SA_ONSTACK)");
-      vg_assert(!(scss_flags & VKI_SA_ONSTACK));
-      */
-      /* ... but WE ask for on-stack ourselves ... */
-      skss_flags |= VKI_SA_ONSTACK;
-
-      /* Create SKSS entry for this signal. */
-
-      /* SIGKILL and SIGSTOP are special-cased: they are never put in
-         the blocked mask and their handler is always recorded as
-         DFL. */
-      if (skss_blocked
-          && sig != VKI_SIGKILL && sig != VKI_SIGSTOP)
-         VG_(ksigaddset)( &dst->skss_sigmask, sig );
-
-      if (sig != VKI_SIGKILL && sig != VKI_SIGSTOP)
-         dst->skss_per_sig[sig].skss_handler = skss_handler;
-      else
-         dst->skss_per_sig[sig].skss_handler = VKI_SIG_DFL;
-
-      dst->skss_per_sig[sig].skss_flags   = skss_flags;
-   }
-
-   /* Sanity checks. */
-   vg_assert(dst->skss_per_sig[VKI_SIGKILL].skss_handler 
-             == VKI_SIG_DFL);
-   vg_assert(dst->skss_per_sig[VKI_SIGSTOP].skss_handler 
-             == VKI_SIG_DFL);
-   vg_assert(!VG_(ksigismember)( &dst->skss_sigmask, VKI_SIGKILL ));
-   vg_assert(!VG_(ksigismember)( &dst->skss_sigmask, VKI_SIGSTOP ));
-
-   /* Debug dump, normally disabled.  Note that pp_SKSS prints the
-      global vg_skss, not *dst. */
-   if (0)
-      pp_SKSS();
-}
-
-
-/* ---------------------------------------------------------------------
-   After a possible SCSS change, update SKSS and the kernel itself.
-   ------------------------------------------------------------------ */
-
-/* IMPORTANT NOTE: to avoid race conditions, we must always enter here
-   with ALL KERNEL SIGNALS BLOCKED ! 
-*/
-/* Recompute vg_skss from the current SCSS and push the differences to
-   the kernel.  If force_update is True, every signal's sigaction is
-   re-installed regardless of whether it changed; otherwise only
-   entries whose handler or flags differ from the previous SKSS are
-   touched, and the old kernel state is cross-checked against it.
-   Finishes by installing the new blocked mask via
-   VG_(restore_all_host_signals). */
-void VG_(handle_SCSS_change) ( Bool force_update )
-{
-   Int            res, sig;
-   SKSS           skss_old;
-   vki_ksigaction ksa, ksa_old;
-
-   /* Debug-only check of the entry condition: the kernel's current
-      mask must have every signal blocked. */
-#  ifdef DEBUG_SIGNALS
-   vki_ksigset_t  test_sigmask;
-   res = VG_(ksigprocmask)( VKI_SIG_SETMASK /*irrelevant*/, 
-                            NULL, &test_sigmask );
-   vg_assert(res == 0);
-   /* The kernel never says that SIGKILL or SIGSTOP are masked. It is
-      correct! So we fake it here for the purposes only of
-      assertion. */
-   VG_(ksigaddset)( &test_sigmask, VKI_SIGKILL );
-   VG_(ksigaddset)( &test_sigmask, VKI_SIGSTOP );
-   vg_assert(VG_(kisfullsigset)( &test_sigmask ));
-#  endif
-
-   /* Remember old SKSS and calculate new one. */
-   skss_old = vg_skss;
-   calculate_SKSS_from_SCSS ( &vg_skss );
-
-   /* Compare the new SKSS entries vs the old ones, and update kernel
-      where they differ. */
-   for (sig = 1; sig <= VKI_KNSIG; sig++) {
-
-      /* Trying to do anything with SIGKILL is pointless; just ignore
-         it. */
-      if (sig == VKI_SIGKILL || sig == VKI_SIGSTOP)
-         continue;
-
-      /* Aside: take the opportunity to clean up DCSS: forget about any
-         pending signals directed at dead threads. */
-      if (vg_dcss.dcss_sigpending[sig] 
-          && vg_dcss.dcss_destthread[sig] != VG_INVALID_THREADID) {
-         ThreadId tid = vg_dcss.dcss_destthread[sig];
-         vg_assert(VG_(is_valid_or_empty_tid)(tid));
-         if (VG_(threads)[tid].status == VgTs_Empty) {
-            vg_dcss.dcss_sigpending[sig] = False;
-            vg_dcss.dcss_destthread[sig] = VG_INVALID_THREADID;
-            if (VG_(clo_trace_signals)) 
-               VG_(message)(Vg_DebugMsg, 
-                   "discarding pending signal %d due to thread %d exiting",
-                   sig, tid );
-         }
-      }
-
-      /* End of the Aside.  Now the Main Business. */
-
-      if (!force_update) {
-         if ((skss_old.skss_per_sig[sig].skss_handler
-              == vg_skss.skss_per_sig[sig].skss_handler)
-             && (skss_old.skss_per_sig[sig].skss_flags
-                 == vg_skss.skss_per_sig[sig].skss_flags))
-            /* no difference */
-            continue;
-      }
-
-      /* Build the sigaction to install: handler and flags come from
-         the new SKSS; the handler's mask is every signal except
-         SIGKILL and SIGSTOP; no restorer. */
-      ksa.ksa_handler = vg_skss.skss_per_sig[sig].skss_handler;
-      ksa.ksa_flags   = vg_skss.skss_per_sig[sig].skss_flags;
-      vg_assert(ksa.ksa_flags & VKI_SA_ONSTACK);
-      VG_(ksigfillset)( &ksa.ksa_mask );
-      VG_(ksigdelset)( &ksa.ksa_mask, VKI_SIGKILL );
-      VG_(ksigdelset)( &ksa.ksa_mask, VKI_SIGSTOP );
-      ksa.ksa_restorer = NULL;
-
-      if (VG_(clo_trace_signals)) 
-         VG_(message)(Vg_DebugMsg, 
-            "setting ksig %d to: hdlr 0x%x, flags 0x%x, "
-            "mask(63..0) 0x%x 0x%x",
-            sig, ksa.ksa_handler,
-            ksa.ksa_flags,
-            ksa.ksa_mask.ws[1], 
-            ksa.ksa_mask.ws[0] 
-         );
-
-      res = VG_(ksigaction)( sig, &ksa, &ksa_old );
-      vg_assert(res == 0);
-
-      /* Since we got the old sigaction more or less for free, might
-         as well extract the maximum sanity-check value from it. */
-      /* (Skipped under force_update, where skss_old is not assumed to
-         describe what the kernel previously had.) */
-      if (!force_update) {
-         vg_assert(ksa_old.ksa_handler 
-                   == skss_old.skss_per_sig[sig].skss_handler);
-         vg_assert(ksa_old.ksa_flags 
-                   == skss_old.skss_per_sig[sig].skss_flags);
-         vg_assert(ksa_old.ksa_restorer 
-                   == NULL);
-         VG_(ksigaddset)( &ksa_old.ksa_mask, VKI_SIGKILL );
-         VG_(ksigaddset)( &ksa_old.ksa_mask, VKI_SIGSTOP );
-         vg_assert(VG_(kisfullsigset)( &ksa_old.ksa_mask ));
-      }
-   }
-
-   /* Just set the new sigmask, even if it's no different from the
-      old, since we have to do this anyway, to unblock the host
-      signals. */
-   if (VG_(clo_trace_signals)) 
-      VG_(message)(Vg_DebugMsg, 
-         "setting kmask(63..0) to 0x%x 0x%x",
-         vg_skss.skss_sigmask.ws[1], 
-         vg_skss.skss_sigmask.ws[0] 
-      );
-
-   VG_(restore_all_host_signals)( &vg_skss.skss_sigmask );
-}
-
-
-/* ---------------------------------------------------------------------
-   Update/query SCSS in accordance with client requests.
-   ------------------------------------------------------------------ */
-
-/* Logic for this alt-stack stuff copied directly from do_sigaltstack
-   in kernel/signal.[ch] */
-
-/* True if we are on the alternate signal stack.  */
-static Int on_sig_stack ( Addr m_esp )
-{
-   /* Offset of m_esp from the base of the recorded alternate stack.
-      The subtraction is done in Addr, exactly as a single range test
-      against the stack's size requires. */
-   Addr offset_in_stack = m_esp - (Addr)vg_scss.altstack.ss_sp;
-
-   return offset_in_stack < vg_scss.altstack.ss_size;
-}
-
-/* Compute the ss_flags value to report for the alternate stack, given
-   the stack pointer m_esp: VKI_SS_DISABLE if no stack is recorded
-   (size zero), VKI_SS_ONSTACK if m_esp lies on it, otherwise 0. */
-static Int sas_ss_flags ( Addr m_esp )
-{
-   if (vg_scss.altstack.ss_size == 0)
-      return VKI_SS_DISABLE;
-
-   if (on_sig_stack(m_esp))
-      return VKI_SS_ONSTACK;
-
-   return 0;
-}
-
-
-/* Emulate the sigaltstack syscall for thread tid, using the single
-   alternate-stack record vg_scss.altstack.
-
-   The new-stack pointer (ss) is taken from the thread's m_ebx and the
-   old-stack pointer (oss) from m_ecx; either may be NULL.  The result
-   is delivered with SET_EAX:
-       0            success
-      -VKI_EPERM    ss given while the thread is running on the
-                    alternate stack
-      -VKI_EINVAL   ss->ss_flags is not one of VKI_SS_DISABLE,
-                    VKI_SS_ONSTACK or 0
-      -VKI_ENOMEM   new stack is smaller than VKI_MINSIGSTKSZ
-
-   Note that *oss is filled in before ss is validated, so it is
-   written even if the call then fails. */
-void VG_(do__NR_sigaltstack) ( ThreadId tid )
-{
-   vki_kstack_t* ss;
-   vki_kstack_t* oss;
-   Addr          m_esp;
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   ss    = (vki_kstack_t*)(VG_(threads)[tid].m_ebx);
-   oss   = (vki_kstack_t*)(VG_(threads)[tid].m_ecx);
-   m_esp = VG_(threads)[tid].m_esp;
-
-   if (VG_(clo_trace_signals))
-      VG_(message)(Vg_DebugExtraMsg, 
-         "__NR_sigaltstack: tid %d, "
-         "ss 0x%x, oss 0x%x (current %%esp %p)",
-         tid, (UInt)ss, (UInt)oss, (UInt)m_esp );
-
-   /* Report the current settings first, before anything is changed. */
-   if (oss != NULL) {
-      oss->ss_sp    = vg_scss.altstack.ss_sp;
-      oss->ss_size  = vg_scss.altstack.ss_size;
-      oss->ss_flags = sas_ss_flags(m_esp);
-   }
-
-   if (ss != NULL) {
-      /* The stack cannot be changed while we are running on it. */
-      if (on_sig_stack(m_esp)) {
-         SET_EAX(tid, -VKI_EPERM);
-         return;
-      }
-      if (ss->ss_flags != VKI_SS_DISABLE 
-          && ss->ss_flags != VKI_SS_ONSTACK 
-          && ss->ss_flags != 0) {
-         SET_EAX(tid, -VKI_EINVAL);
-         return;
-      }
-      if (ss->ss_flags == VKI_SS_DISABLE) {
-         /* Disabling: record "no stack".  The client's ss_sp/ss_size
-            must NOT be stored in this case, otherwise a nonzero
-            ss_size would leave the stack looking enabled (ss_size == 0
-            is what sas_ss_flags treats as disabled). */
-         vg_scss.altstack.ss_size = 0;
-         vg_scss.altstack.ss_sp = NULL;
-      } else {
-         if (ss->ss_size < VKI_MINSIGSTKSZ) {
-            SET_EAX(tid, -VKI_ENOMEM);
-            return;
-         }
-         vg_scss.altstack.ss_sp   = ss->ss_sp;
-         vg_scss.altstack.ss_size = ss->ss_size;
-      }
-   }
-   SET_EAX(tid, 0);
-}
-
-
-/* Emulate the sigaction syscall for thread tid.  The signal number is
-   taken from the thread's m_ebx, the new action pointer from m_ecx
-   and the old action pointer from m_edx; either pointer may be NULL.
-   Only SCSS is read/written directly; the kernel is updated (lazily)
-   through VG_(handle_SCSS_change) when a new action was supplied.
-   The result is delivered with SET_EAX: 0 on success, -VKI_EINVAL for
-   an out-of-range signal or an attempt to give SIGKILL/SIGSTOP a
-   non-default handler. */
-void VG_(do__NR_sigaction) ( ThreadId tid )
-{
-   Int              signo;
-   vki_ksigaction*  new_act;
-   vki_ksigaction*  old_act;
-   vki_ksigset_t    irrelevant_sigmask;
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   signo     = VG_(threads)[tid].m_ebx; /* int sigNo */
-   new_act   = (vki_ksigaction*)(VG_(threads)[tid].m_ecx);
-   old_act   = (vki_ksigaction*)(VG_(threads)[tid].m_edx);
-
-   if (VG_(clo_trace_signals))
-      VG_(message)(Vg_DebugExtraMsg, 
-         "__NR_sigaction: tid %d, sigNo %d, "
-         "new 0x%x, old 0x%x, new flags 0x%x",
-         tid, signo, (UInt)new_act, (UInt)old_act,
-         (UInt)(new_act ? new_act->ksa_flags : 0) );
-
-   /* Rule out various error conditions.  The aim is to ensure that if
-      when the call is passed to the kernel it will definitely
-      succeed. */
-
-   /* Reject out-of-range signal numbers. */
-   if (signo < 1 || signo > VKI_KNSIG) goto bad_signo;
-
-   /* Reject attempts to set a handler (or set ignore) for SIGKILL or
-      SIGSTOP.  Querying them, or setting them to SIG_DFL, is
-      allowed. */
-   if ( (signo == VKI_SIGKILL || signo == VKI_SIGSTOP)
-       && new_act
-       && new_act->ksa_handler != VKI_SIG_DFL)
-      goto bad_sigkill_or_sigstop;
-
-   /* If the client supplied non-NULL old_act, copy the relevant SCSS
-      entry into it. */
-   /* This is done before the new action is stored, so old_act receives
-      the previous settings. */
-   if (old_act) {
-      old_act->ksa_handler  = vg_scss.scss_per_sig[signo].scss_handler;
-      old_act->ksa_flags    = vg_scss.scss_per_sig[signo].scss_flags;
-      old_act->ksa_mask     = vg_scss.scss_per_sig[signo].scss_mask;
-      old_act->ksa_restorer = vg_scss.scss_per_sig[signo].scss_restorer;
-   }
-
-   /* And now copy new SCSS entry from new_act. */
-   if (new_act) {
-      vg_scss.scss_per_sig[signo].scss_handler  = new_act->ksa_handler;
-      vg_scss.scss_per_sig[signo].scss_flags    = new_act->ksa_flags;
-      vg_scss.scss_per_sig[signo].scss_mask     = new_act->ksa_mask;
-      vg_scss.scss_per_sig[signo].scss_restorer = new_act->ksa_restorer;
-   }
-
-   /* All happy bunnies ... */
-   /* VG_(handle_SCSS_change) must be entered with all kernel signals
-      blocked, hence the block_all_host_signals call; it sets the
-      final mask itself, so the saved mask is not needed. */
-   if (new_act) {
-      VG_(block_all_host_signals)( &irrelevant_sigmask );
-      VG_(handle_SCSS_change)( False /* lazy update */ );
-   }
-   SET_EAX(tid, 0);
-   return;
-
-  bad_signo:
-   VG_(message)(Vg_UserMsg,
-                "Warning: bad signal number %d in __NR_sigaction.", 
-                signo);
-   SET_EAX(tid, -VKI_EINVAL);
-   return;
-
-  bad_sigkill_or_sigstop:
-   VG_(message)(Vg_UserMsg,
-      "Warning: attempt to set %s handler in __NR_sigaction.", 
-      signo == VKI_SIGKILL ? "SIGKILL" : "SIGSTOP" );
-
-   SET_EAX(tid, -VKI_EINVAL);
-   return;
-}
-
-
-/* Combine `modifier' into `*orig_set' as directed by `vki_how':
-
-     VKI_SIG_BLOCK    hand both sets to VG_(ksigaddset_from_set)
-     VKI_SIG_UNBLOCK  hand both sets to VG_(ksigdelset_from_set)
-     VKI_SIG_SETMASK  overwrite *orig_set with a copy of *modifier
-
-   Any other value of `vki_how' is a caller error and panics. */
-static
-void do_sigprocmask_bitops ( Int vki_how,
-                             vki_ksigset_t* orig_set,
-                             vki_ksigset_t* modifier )
-{
-   if (vki_how == VKI_SIG_BLOCK) {
-      VG_(ksigaddset_from_set)( orig_set, modifier );
-   } else if (vki_how == VKI_SIG_UNBLOCK) {
-      VG_(ksigdelset_from_set)( orig_set, modifier );
-   } else if (vki_how == VKI_SIG_SETMASK) {
-      *orig_set = *modifier;
-   } else {
-      /* Callers are expected to have validated `how' already. */
-      VG_(panic)("do_sigprocmask_bitops");
-   }
-}
-
-/* Shared worker for reading and/or changing signal blocking masks,
-   used for both the per-thread and the whole-process flavours.
-
-   tid == VG_INVALID_THREADID selects the __NR_sigprocmask behaviour:
-   `newset' is applied (according to `how') to every non-empty thread
-   slot, and `oldset' receives the mask of the root thread (1).  Any
-   other tid must be valid; only that thread's mask is read/updated.
-
-   Either of `newset' / `oldset' may be NULL, meaning "don't change" /
-   "don't report".  The old mask is always captured before the new one
-   is applied.
-
-   The per-thread masks are an implicit part of SCSS, which is why
-   this routine may modify them, and why a change is followed by
-   VG_(handle_SCSS_change).
-*/
-static
-void do_setmask ( ThreadId tid,
-                  Int how,
-                  vki_ksigset_t* newset,
-                  vki_ksigset_t* oldset )
-{
-   vki_ksigset_t irrelevant_sigmask;
-
-   if (VG_(clo_trace_signals)) {
-      VG_(message)(Vg_DebugExtraMsg, 
-         "do_setmask: tid = %d (%d means ALL), how = %d (%s), set = %p", 
-         tid, VG_INVALID_THREADID, how,
-         how==VKI_SIG_BLOCK 
-            ? "SIG_BLOCK" 
-            : (how==VKI_SIG_UNBLOCK 
-                  ? "SIG_UNBLOCK" 
-                  : (how==VKI_SIG_SETMASK ? "SIG_SETMASK" : "???")),
-         newset );
-   }
-
-   if (tid != VG_INVALID_THREADID) {
-
-      /* Per-thread request: touch only tid's mask. */
-      vg_assert(VG_(is_valid_tid)(tid));
-      if (oldset)
-         *oldset = VG_(threads)[tid].sig_mask;
-      if (newset)
-         do_sigprocmask_bitops ( 
-            how, &VG_(threads)[tid].sig_mask, newset );
-
-   } else {
-
-      /* Whole-process request, as for __NR_sigprocmask. */
-      if (oldset) {
-         /* The root thread's mask stands in for "the process mask".
-            A bit fragile; relies on thread 1 still being alive. */
-         vg_assert(VG_(threads)[1].status != VgTs_Empty);
-         *oldset = VG_(threads)[1].sig_mask;
-      }
-      if (newset) {
-         ThreadId t;
-         for (t = 1; t < VG_N_THREADS; t++) {
-            if (VG_(threads)[t].status != VgTs_Empty)
-               do_sigprocmask_bitops ( 
-                  how, &VG_(threads)[t].sig_mask, newset );
-         }
-      }
-
-   }
-
-   /* A pure query changes nothing, so there is nothing to propagate. */
-   if (!newset)
-      return;
-
-   VG_(block_all_host_signals)( &irrelevant_sigmask );
-   VG_(handle_SCSS_change)( False /* lazy update */ );
-}
-
-
-/* Simulate the sigprocmask syscall on behalf of thread `tid'.  A
-   recognised `how' is applied to all threads via do_setmask and the
-   thread's %EAX is set to 0; an unrecognised `how' is reported and
-   -VKI_EINVAL is placed in %EAX instead. */
-void VG_(do__NR_sigprocmask) ( ThreadId tid,
-                               Int how, 
-                               vki_ksigset_t* set,
-                               vki_ksigset_t* oldset )
-{
-   Bool how_ok = how == VKI_SIG_BLOCK 
-                 || how == VKI_SIG_UNBLOCK 
-                 || how == VKI_SIG_SETMASK;
-
-   if (!how_ok) {
-      VG_(message)(Vg_DebugMsg, 
-                  "sigprocmask: unknown `how' field %d", how);
-      SET_EAX(tid, -VKI_EINVAL);
-      return;
-   }
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   /* VG_INVALID_THREADID == "apply to the process as a whole". */
-   do_setmask ( VG_INVALID_THREADID, how, set, oldset );
-   /* The syscall reports success (0) to the calling thread. */
-   SET_EAX(tid, 0);
-}
-
-
-/* Per-thread counterpart of VG_(do__NR_sigprocmask): read and/or
-   update the blocking mask of thread `tid' only, by forwarding to
-   do_setmask with a real thread id.  `set' and `oldset' are passed
-   straight through; do_setmask treats a NULL pointer as "skip".
-   Both `how' and `tid' are asserted rather than reported, so the
-   caller must validate them first. */
-void VG_(do_pthread_sigmask_SCSS_upd) ( ThreadId tid,
-                                        Int how, 
-                                        vki_ksigset_t* set,
-                                        vki_ksigset_t* oldset )
-{
-   /* Assume that how has been validated by caller. */
-   vg_assert(how == VKI_SIG_BLOCK || how == VKI_SIG_UNBLOCK 
-                                  || how == VKI_SIG_SETMASK);
-   vg_assert(VG_(is_valid_tid)(tid));
-   do_setmask ( tid, how, set, oldset );
-   /* The request return code is set in do_pthread_sigmask */
-}
-
-
-/* Direct signal `sig' at thread `thread'.  What happens depends on the
-   handler currently recorded for `sig' in the SCSS:
-
-     VKI_SIG_IGN  nothing, apart from an optional trace message.
-     VKI_SIG_DFL  the signal is sent to our own process with
-                  VG_(kill), so the kernel applies the default action.
-     otherwise    the signal is marked pending in the DCSS with
-                  `thread' as its destination.  If it was already
-                  pending, only the destination is (re)pointed at
-                  `thread'; no second instance is queued.
-*/
-void VG_(send_signal_to_thread) ( ThreadId thread, Int sig )
-{
-   Int  res;
-   UInt handler;
-
-   vg_assert(VG_(is_valid_tid)(thread));
-   vg_assert(sig >= 1 && sig <= VKI_KNSIG);
-
-   handler = (UInt)(vg_scss.scss_per_sig[sig].scss_handler);
-
-   if (handler == (UInt)VKI_SIG_IGN) {
-      if (VG_(clo_trace_signals)) 
-         VG_(message)(Vg_DebugMsg, 
-            "send_signal %d to_thread %d: IGN, ignored", sig, thread );
-      return;
-   }
-
-   if (handler == (UInt)VKI_SIG_DFL) {
-      /* We don't simulate default actions ourselves.  Instead, knock
-         on the front door: signal the real process and let the
-         kernel do whatever the default action is. */
-      res = VG_(kill)( VG_(getpid)(), sig );
-      vg_assert(res == 0);
-      return;
-   }
-
-   /* The client has a real handler installed. */
-
-   if (!vg_dcss.dcss_sigpending[sig]) {
-      /* Not yet pending: queue it for this thread. */
-      vg_dcss.dcss_sigpending[sig] = True;
-      vg_dcss.dcss_destthread[sig] = thread;
-      if (VG_(clo_trace_signals)) 
-         VG_(message)(Vg_DebugMsg, 
-            "send_signal %d to_thread %d: now pending", sig, thread );
-      return;
-   }
-
-   if (vg_dcss.dcss_destthread[sig] == thread) {
-      /* Already pending for exactly this thread: drop the duplicate. */
-      if (VG_(clo_trace_signals)) 
-         VG_(message)(Vg_DebugMsg, 
-            "send_signal %d to_thread %d: already pending ... "
-            "discarded", sig, thread );
-      return;
-   }
-
-   /* Already pending, but for somebody else: retarget it. */
-   if (VG_(clo_trace_signals)) 
-      VG_(message)(Vg_DebugMsg, 
-         "send_signal %d to_thread %d: was pending for %d, "
-         "now pending for %d",
-         sig, thread, vg_dcss.dcss_destthread[sig], thread );
-   vg_dcss.dcss_destthread[sig] = thread;
-}
-
-
-/* Fill *set with the signals that are pending from the point of view
-   of thread `tid': signals which have been raised, could be given to
-   this thread, but are held back because the thread's own mask blocks
-   them.  *set is emptied first.
-
-   A signal counts as raised if either the kernel reports it pending
-   for the process (VG_(sigpending)), or it is parked in the DCSS and
-   is not directed at some other thread.  It is then left out if the
-   thread is sigwaiting for it, or if the thread's mask does not block
-   it. */
-void VG_(do_sigpending) ( ThreadId tid, vki_ksigset_t* set )
-{
-   vki_ksigset_t host_pending;
-   Int           sig, res;
-   Bool          at_host, at_valgrind;
-
-   /* What does the kernel say is pending for the whole process? */
-   res = VG_(sigpending)( &host_pending );
-   vg_assert(res == 0);
-
-   VG_(ksigemptyset)(set);
-
-   for (sig = 1; sig <= VKI_KNSIG; sig++) {
-
-      /* Source 1: pending at the kernel level, i.e. not yet seen by
-         valgrind's own handler. */
-      at_host = VG_(ksigismember)( &host_pending, sig ) ? True : False;
-
-      /* Source 2: already collected by our handler and waiting inside
-         valgrind, and not earmarked for a different thread. */
-      at_valgrind 
-         = vg_dcss.dcss_sigpending[sig]
-           && (vg_dcss.dcss_destthread[sig] == VG_INVALID_THREADID
-               || vg_dcss.dcss_destthread[sig] == tid);
-
-      if (!at_host && !at_valgrind)
-         continue; /* not raised at all, as far as tid is concerned */
-
-      /* A signal tid is sigwaiting for will be consumed by the
-         sigwait and never offered to a handler. */
-      if (VG_(ksigismember)( &VG_(threads)[tid].sigs_waited_for, sig ))
-         continue;
-
-      /* Only signals that tid actually blocks are "pending" for it. */
-      if (VG_(ksigismember)( &VG_(threads)[tid].sig_mask, sig ))
-         VG_(ksigaddset)( set, sig );
-   }
-}
-
-
-/* ---------------------------------------------------------------------
-   LOW LEVEL STUFF TO DO WITH SIGNALS: IMPLEMENTATION
-   ------------------------------------------------------------------ */
-
-/* ---------------------------------------------------------------------
-   Handy utilities to block/restore all host signals.
-   ------------------------------------------------------------------ */
-
-/* Install a completely full blocking mask on the real process, and
-   hand the mask that was previously in force back in *saved_mask so
-   that the caller can later undo this with
-   VG_(restore_all_host_signals). */
-void VG_(block_all_host_signals) ( /* OUT */ vki_ksigset_t* saved_mask )
-{
-   vki_ksigset_t everything;
-   Int           ret;
-
-   VG_(ksigfillset)(&everything);
-   ret = VG_(ksigprocmask)(VKI_SIG_SETMASK, &everything, saved_mask);
-   vg_assert(ret == 0);
-}
-
-/* Counterpart of VG_(block_all_host_signals): make *saved_mask the
-   real process's blocking mask again.  The mask being replaced is
-   not reported back. */
-void VG_(restore_all_host_signals) ( /* IN */ vki_ksigset_t* saved_mask )
-{
-   Int ret = VG_(ksigprocmask)(VKI_SIG_SETMASK, saved_mask, NULL);
-   vg_assert(ret == 0);
-}
-
-
-/* ---------------------------------------------------------------------
-   The signal simulation proper.  A simplified version of what the 
-   Linux kernel does.
-   ------------------------------------------------------------------ */
-
-/* A structure in which to save the application's registers
-   during the execution of signal handlers.
-
-   vg_push_signal_frame writes one of these onto the client's stack
-   (or alt stack) with the first field at the new %ESP, and
-   vg_pop_signal_frame reads it back.  The field order is therefore
-   part of the contract: the first four words are what the handler
-   sees at 0(%ESP)..12(%ESP), and magicE must remain the last field. */
-
-typedef
-   struct {
-      /* These are parameters to the signal handler. */
-      UInt retaddr;   /* Sig handler's (bogus) return address */
-      Int  sigNo;     /* The arg to the sig handler.  */
-      Addr psigInfo;  /* ptr to siginfo_t; NULL for now. */
-      Addr puContext; /* ptr to ucontext; NULL for now. */
-      /* Sanity check word.  Written as 0x31415927 on push and
-         asserted on pop. */
-      UInt magicPI;
-      /* Saved processor state. */
-      UInt fpustate[VG_SIZE_OF_FPUSTATE_W];
-      UInt eax;
-      UInt ecx;
-      UInt edx;
-      UInt ebx;
-      UInt ebp;
-      UInt esp;
-      UInt esi;
-      UInt edi;
-      Addr eip;
-      UInt eflags;
-      /* Scheduler-private stuff: what was the thread's status prior to
-         delivering this signal? */
-      ThreadStatus status;
-      /* Sanity check word.  Is the highest-addressed word; do not
-         move!  Written as 0x27182818 on push and asserted on pop;
-         vg_push_signal_frame also asserts that it ends exactly at the
-         top of the frame. */
-      UInt magicE;
-   }
-   VgSigFrame;
-
-
-
-/* Set up a stack frame (VgSigFrame) for the client's signal
-   handler.  This includes the signal number and a bogus return
-   address.
-
-   The frame is placed directly below either the top of the alternate
-   stack recorded in vg_scss.altstack (when the handler was registered
-   with VKI_SA_ONSTACK and sas_ss_flags(%esp) is zero) or the thread's
-   current simulated %ESP.  The thread's simulated integer registers,
-   FPU state and scheduler status are saved in the frame, and then
-   m_esp / m_eip are redirected to the frame and to the handler.
-
-   tid must be a valid thread and sigNo must be in 1 .. VKI_KNSIG; both
-   are asserted.  Marking the thread runnable is left to the caller. */
-static
-void vg_push_signal_frame ( ThreadId tid, int sigNo )
-{
-   Int          i;
-   Addr         esp, esp_top_of_frame;
-   VgSigFrame*  frame;
-   ThreadState* tst;
-
-   vg_assert(sigNo >= 1 && sigNo <= VKI_KNSIG);
-   vg_assert(VG_(is_valid_tid)(tid));
-   tst = & VG_(threads)[tid];
-
-   /* Decide where the top of the new frame goes. */
-   if (/* this signal asked to run on an alt stack */
-       (vg_scss.scss_per_sig[sigNo].scss_flags & VKI_SA_ONSTACK)
-       && /* there is a defined and enabled alt stack, which we're not
-             already using.  Logic from get_sigframe in
-             arch/i386/kernel/signal.c. */
-          sas_ss_flags(tst->m_esp) == 0
-      ) {
-      /* Start at the high end of the alt stack (base + size). */
-      esp_top_of_frame 
-         = (Addr)(vg_scss.altstack.ss_sp) + vg_scss.altstack.ss_size;
-      if (VG_(clo_trace_signals))
-         VG_(message)(Vg_DebugMsg,
-            "delivering signal %d to thread %d: on ALT STACK", 
-            sigNo, tid );
-   } else {
-      /* Ordinary case: build the frame on the thread's current stack. */
-      esp_top_of_frame = tst->m_esp;
-   }
-
-   /* Carve out room for exactly one VgSigFrame below that point. */
-   esp = esp_top_of_frame;
-   esp -= sizeof(VgSigFrame);
-   frame = (VgSigFrame*)esp;
-   /* Assert that the frame is placed correctly: its size is a whole
-      number of 32-bit words, and its last field (magicE) ends exactly
-      at esp_top_of_frame. */
-   vg_assert( (sizeof(VgSigFrame) & 0x3) == 0 );
-   vg_assert( ((Char*)(&frame->magicE)) + sizeof(UInt) 
-              == ((Char*)(esp_top_of_frame)) );
-
-   /* The words the handler itself will see: return address and
-      arguments. */
-   frame->retaddr    = (UInt)(&VG_(signalreturn_bogusRA));
-   frame->sigNo      = sigNo;
-   frame->psigInfo   = (Addr)NULL;
-   frame->puContext  = (Addr)NULL;
-   frame->magicPI    = 0x31415927;
-
-   /* Snapshot the simulated machine state so that
-      vg_pop_signal_frame can put it back afterwards. */
-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      frame->fpustate[i] = tst->m_fpu[i];
-
-   frame->eax        = tst->m_eax;
-   frame->ecx        = tst->m_ecx;
-   frame->edx        = tst->m_edx;
-   frame->ebx        = tst->m_ebx;
-   frame->ebp        = tst->m_ebp;
-   frame->esp        = tst->m_esp;
-   frame->esi        = tst->m_esi;
-   frame->edi        = tst->m_edi;
-   frame->eip        = tst->m_eip;
-   frame->eflags     = tst->m_eflags;
-
-   frame->status     = tst->status;
-
-   frame->magicE     = 0x27182818;
-
-   /* Set the thread so it will next run the handler. */
-   tst->m_esp  = esp;
-   tst->m_eip  = (Addr)vg_scss.scss_per_sig[sigNo].scss_handler;
-   /* This thread needs to be marked runnable, but we leave that to
-      the caller to do. */
-
-   /* Make retaddr, sigNo, psigInfo, puContext fields readable -- at
-      0(%ESP) .. 12(%ESP).  The rest of the frame is not marked here. */
-   if (VG_(clo_instrument)) {
-      VGM_(make_readable) ( ((Addr)esp)+0,  4 );
-      VGM_(make_readable) ( ((Addr)esp)+4,  4 );
-      VGM_(make_readable) ( ((Addr)esp)+8,  4 );
-      VGM_(make_readable) ( ((Addr)esp)+12, 4 );
-   }
-
-   /* Disabled debug output.  NOTE(review): the second value printed
-      is tst->m_eip, although the format string labels it %EBP.
-   VG_(printf)("pushed signal frame; %%ESP now = %p, next %%EBP = %p\n", 
-               esp, tst->m_eip);
-   */
-}
-
-
-/* Clear the signal frame created by vg_push_signal_frame, restore the
-   simulated machine state, and return the signal number that the
-   frame was for.
-
-   The frame is located relative to the thread's current simulated
-   %ESP, and both magic words are asserted before anything is
-   restored.  Besides the registers and FPU state, the thread's
-   scheduler status is also reset to the value saved at push time.
-   tid must be a valid thread (asserted). */
-static
-Int vg_pop_signal_frame ( ThreadId tid )
-{
-   Addr          esp;
-   Int           sigNo, i;
-   VgSigFrame*   frame;
-   ThreadState*  tst;
-
-   vg_assert(VG_(is_valid_tid)(tid));
-   tst = & VG_(threads)[tid];
-
-   /* Correctly reestablish the frame base address. */
-   esp   = tst->m_esp;
-   frame = (VgSigFrame*)
-              (esp -4 /* because the handler's RET pops the RA */
-                  +20 /* because signalreturn_bogusRA pushes 5 words */);
-
-   /* If either magic word is wrong, we are not looking at a frame
-      that vg_push_signal_frame built (or it has been overwritten). */
-   vg_assert(frame->magicPI == 0x31415927);
-   vg_assert(frame->magicE  == 0x27182818);
-   if (VG_(clo_trace_signals))
-      VG_(message)(Vg_DebugMsg, 
-         "vg_pop_signal_frame (thread %d): valid magic", tid);
-
-   /* restore machine state: FPU words first */
-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      tst->m_fpu[i] = frame->fpustate[i];
-
-   /* Mark the frame structure as nonaccessible.
-      NOTE(review): the frame is still read below, after this call.
-      Presumably that is fine because make_noaccess only changes the
-      client's addressability map and not the memory contents --
-      confirm against VGM_(make_noaccess). */
-   if (VG_(clo_instrument))
-      VGM_(make_noaccess)( (Addr)frame, sizeof(VgSigFrame) );
-
-   /* Restore machine state from the saved context. */
-   tst->m_eax     = frame->eax;
-   tst->m_ecx     = frame->ecx;
-   tst->m_edx     = frame->edx;
-   tst->m_ebx     = frame->ebx;
-   tst->m_ebp     = frame->ebp;
-   tst->m_esp     = frame->esp;
-   tst->m_esi     = frame->esi;
-   tst->m_edi     = frame->edi;
-   tst->m_eflags  = frame->eflags;
-   tst->m_eip     = frame->eip;
-   sigNo          = frame->sigNo;
-
-   /* And restore the thread's status to what it was before the signal
-      was delivered. */
-   tst->status    = frame->status;
-
-   return sigNo;
-}
-
-
-/* Called when a client signal handler returns.  With all host signals
-   blocked, the stacked VgSigFrame is popped, which puts the thread's
-   simulated machine state and scheduler status back to what they were
-   before delivery.
-
-   The result tells the scheduler whether a syscall interrupted by the
-   signal should be restarted: True iff the SCSS entry for the signal
-   has VKI_SA_RESTART set. */
-
-Bool VG_(signal_returns) ( ThreadId tid )
-{
-   vki_ksigset_t  host_mask;
-   Int            sigNo;
-
-   /* Keep our own handler out while the frame is being unpicked. */
-   VG_(block_all_host_signals)( &host_mask );
-
-   sigNo = vg_pop_signal_frame(tid);
-   vg_assert(sigNo >= 1 && sigNo <= VKI_KNSIG);
-
-   VG_(restore_all_host_signals)( &host_mask );
-
-   /* The scheduler may now resume this thread, or some other one. */
-   if (vg_scss.scss_per_sig[sigNo].scss_flags & VKI_SA_RESTART)
-      return True;
-   return False;
-}
-
-
-/* Deliver all pending signals, by building stack frames for their
-   handlers.
-
-   For each signal marked pending in the DCSS, in increasing signal
-   number order:
-     - if some thread is in sigwait() for it (and the signal is not
-       directed at a different thread), that thread is released and
-       the signal is consumed;
-     - else if the client no longer has a real handler for it, the
-       signal is discarded;
-     - else if it is directed at a thread which no longer exists, the
-       signal is discarded;
-     - else if it is undirected and every live thread blocks it, it is
-       left pending for a later call;
-     - otherwise a handler frame is pushed onto the chosen thread and
-       that thread is made runnable.
-
-   Returns False if the initial, unlocked scan finds nothing pending.
-   Otherwise returns True -- note that this is so even if every
-   pending signal ended up discarded or left queued. */
-Bool VG_(deliver_signals) ( void )
-{
-   vki_ksigset_t  saved_procmask;
-   Int            sigNo;
-   Bool           found, scss_changed;
-   ThreadState*   tst;
-   ThreadId       tid;
-
-   /* A cheap check.  We don't need to have exclusive access to the
-      pending array, because in the worst case, vg_oursignalhandler
-      will add signals, causing us to return, thinking there are no
-      signals to deliver, when in fact there are some.  A subsequent
-      call here will handle the signal(s) we missed.  */
-   found = False;
-   for (sigNo = 1; sigNo <= VKI_KNSIG; sigNo++) {
-      if (vg_dcss.dcss_sigpending[sigNo]) {
-         found = True;
-         break; /* one is enough to know we have work to do */
-      }
-   }
-
-   if (!found) return False;
-
-   /* Now we have to do it properly.  Get exclusive access by
-      blocking all the host's signals.  That means vg_oursignalhandler
-      can't run whilst we are messing with stuff.
-   */
-   scss_changed = False;
-   VG_(block_all_host_signals)( &saved_procmask );
-
-   /* Look for signals to deliver ... */
-   for (sigNo = 1; sigNo <= VKI_KNSIG; sigNo++) {
-
-      if (!vg_dcss.dcss_sigpending[sigNo])
-         continue;
-
-      /* sigNo is pending.  Try to find a suitable thread to deliver
-         it to. */
-      /* First off, are any threads in sigwait() for the signal? 
-         If so just give to one of them and have done. */
-      for (tid = 1; tid < VG_N_THREADS; tid++) {
-         tst = & VG_(threads)[tid];
-         /* Is tid waiting for a signal?  If not, ignore. */
-         if (tst->status != VgTs_WaitSIG)
-            continue;
-         /* Is the signal directed at a specific thread other than
-            this one?  If yes, ignore. */
-         if (vg_dcss.dcss_destthread[sigNo] != VG_INVALID_THREADID
-             && vg_dcss.dcss_destthread[sigNo] != tid)
-            continue;
-         /* Is tid waiting for the signal?  If not, ignore. */
-         if (VG_(ksigismember)(&(tst->sigs_waited_for), sigNo))
-            break;
-      }
-      if (tid < VG_N_THREADS) {
-         /* Found a sigwaiter.  Hand it the signal number through the
-            pointer in its argument block (if non-NULL), report
-            success in %EDX, and wake it up. */
-         UInt* sigwait_args;
-         tst = & VG_(threads)[tid];
-         if (VG_(clo_trace_signals) || VG_(clo_trace_sched))
-            VG_(message)(Vg_DebugMsg,
-               "releasing thread %d from sigwait() due to signal %d",
-               tid, sigNo );
-         sigwait_args = (UInt*)(tst->m_eax);
-         if (NULL != (UInt*)(sigwait_args[2])) {
-            *(Int*)(sigwait_args[2]) = sigNo;
-            if (VG_(clo_instrument))
-               VGM_(make_readable)( (Addr)(sigwait_args[2]), 
-                                    sizeof(UInt));
-         }
-         SET_EDX(tid, 0);
-         tst->status = VgTs_Runnable;
-         VG_(ksigemptyset)(&tst->sigs_waited_for);
-         /* The set of waited-for signals changed, so the kernel-side
-            state must be recomputed before we return. */
-         scss_changed = True;
-         vg_dcss.dcss_sigpending[sigNo] = False;
-         vg_dcss.dcss_destthread[sigNo] = VG_INVALID_THREADID; 
-                                          /*paranoia*/
-         continue; /* for (sigNo = 1; ...) loop */
-      }
-
-      /* Well, nobody appears to be sigwaiting for it.  So we really
-         are delivering the signal in the usual way.  And that the
-         client really has a handler for this thread! */
-      vg_assert(vg_dcss.dcss_sigpending[sigNo]);
-
-      /* A recent addition, so as to stop seriously weird progs dying
-         at the following assertion (which this renders redundant,
-         btw). */
-      if (vg_scss.scss_per_sig[sigNo].scss_handler == VKI_SIG_IGN
-          || vg_scss.scss_per_sig[sigNo].scss_handler == VKI_SIG_DFL) {
-         /* Strange; perhaps the handler disappeared before we could
-            deliver the signal. */
-         /* Report the thread the signal was actually directed at.
-            `tid' cannot be used here: the sigwait search above ran to
-            completion, so it always equals VG_N_THREADS.  For an
-            undirected signal this prints VG_INVALID_THREADID. */
-         VG_(message)(Vg_DebugMsg,
-            "discarding signal %d for thread %d because handler missing",
-            sigNo, vg_dcss.dcss_destthread[sigNo] );
-         vg_dcss.dcss_sigpending[sigNo] = False;
-         vg_dcss.dcss_destthread[sigNo] = VG_INVALID_THREADID;
-         continue; /* for (sigNo = 1; ...) loop */
-      }
-
-      vg_assert(vg_scss.scss_per_sig[sigNo].scss_handler != VKI_SIG_IGN
-                && vg_scss.scss_per_sig[sigNo].scss_handler != VKI_SIG_DFL);
-
-      tid = vg_dcss.dcss_destthread[sigNo];
-      vg_assert(tid == VG_INVALID_THREADID 
-                || VG_(is_valid_tid)(tid));
-
-      if (tid != VG_INVALID_THREADID) {
-         /* directed to a specific thread; ensure it actually still
-            exists ... */
-         /* NOTE(review): the target's own sig_mask is not consulted
-            in this branch, unlike the undirected case below. */
-         tst = & VG_(threads)[tid];
-         if (tst->status == VgTs_Empty) {
-            /* dead, for whatever reason; ignore this signal */
-            if (VG_(clo_trace_signals))
-               VG_(message)(Vg_DebugMsg,
-                  "discarding signal %d for nonexistent thread %d",
-                  sigNo, tid );
-            vg_dcss.dcss_sigpending[sigNo] = False;
-            vg_dcss.dcss_destthread[sigNo] = VG_INVALID_THREADID;
-            continue; /* for (sigNo = 1; ...) loop */
-         }
-      } else {
-         /* not directed to a specific thread, so search for a
-            suitable candidate: the lowest-numbered live thread which
-            does not block the signal. */
-         for (tid = 1; tid < VG_N_THREADS; tid++) {
-            tst = & VG_(threads)[tid];
-            if (tst->status != VgTs_Empty
-                && !VG_(ksigismember)(&(tst->sig_mask), sigNo))
-               break;
-         }
-         if (tid == VG_N_THREADS) 
-            /* All threads have this signal blocked, so we can't
-               deliver it just now */
-            continue; /* for (sigNo = 1; ...) loop */
-      }
-
-      /* Ok, we can deliver signal sigNo to thread tid. */
-
-      if (VG_(clo_trace_signals))
-         VG_(message)(Vg_DebugMsg,"delivering signal %d to thread %d", 
-                                  sigNo, tid );
-
-      /* Create a signal delivery frame, and set the client's %ESP and
-         %EIP so that when execution continues, we will enter the
-         signal handler with the frame on top of the client's stack,
-         as it expects. */
-      vg_assert(VG_(is_valid_tid)(tid));
-      vg_push_signal_frame ( tid, sigNo );
-      VG_(threads)[tid].status = VgTs_Runnable;
-      
-      /* Signify that the signal has been delivered. */
-      vg_dcss.dcss_sigpending[sigNo] = False;
-      vg_dcss.dcss_destthread[sigNo] = VG_INVALID_THREADID;
-
-      if (vg_scss.scss_per_sig[sigNo].scss_flags & VKI_SA_ONESHOT) {
-         /* Do the ONESHOT thing: the handler is only good for one
-            delivery, after which the signal reverts to SIG_DFL. */
-         vg_scss.scss_per_sig[sigNo].scss_handler = VKI_SIG_DFL;
-         scss_changed = True;
-      }
-   }
-
-   /* Unlock and return. */
-   if (scss_changed) {
-      /* handle_SCSS_change computes a new kernel blocking mask and
-         applies that. */
-      VG_(handle_SCSS_change)( False /* lazy update */ );
-   } else {
-      /* No SCSS change, so just restore the existing blocking
-         mask. */
-      VG_(restore_all_host_signals)( &saved_procmask );
-   }
-
-   return True;
-}
-
-
-/* Receive a signal from the host, and either discard it or park it in
-   the queue of pending signals.  All other signals will be blocked
-   when this handler runs.  Runs with all host signals blocked, so as
-   to have mutual exclusion when adding stuff to the queue.
-
-   In outline:
-     1. check that we are running on VG_(sigstack);
-     2. check that the signal is one we expected -- either the client
-        has a real handler for it, or some thread is sigwaiting on it;
-        panic otherwise;
-     3. mark it pending in the DCSS, unless it already is;
-     4. for SIGSEGV/SIGBUS/SIGFPE/SIGILL arriving while the
-        scheduler's jmpbuf is valid, longjmp back to the scheduler
-        rather than returning. */
-
-static 
-void vg_oursignalhandler ( Int sigNo )
-{
-   static UInt   segv_warns = 0;   /* how many SIGSEGV warnings so far */
-   ThreadId      tid;
-   Int           dummy_local;      /* only its address is used, as a
-                                      probe for the current stack */
-   Bool          sane;
-   vki_ksigset_t saved_procmask;
-
-   /*
-   if (sigNo == VKI_SIGUSR1) {
-      VG_(printf)("YOWZA!  SIGUSR1\n\n");
-      VG_(clo_trace_pthread_level) = 2;
-      VG_(clo_trace_sched) = True;
-      VG_(clo_trace_syscalls) = True;
-      VG_(clo_trace_signals) = True;
-      return;
-   }
-   */
-
-   /* When tracing, a message is started here and completed by exactly
-      one of the add_to_msg/end_msg pairs further down. */
-   if (VG_(clo_trace_signals)) {
-      VG_(start_msg)(Vg_DebugMsg);
-      VG_(add_to_msg)("signal %d arrived ... ", sigNo );
-   }
-   vg_assert(sigNo >= 1 && sigNo <= VKI_KNSIG);
-
-   /* Sanity check.  Ensure we're really running on the signal stack
-      we asked for.  The upper bound of 10000 elements matches the
-      size registered in VG_(sigstartup_actions). */
-   if ( !(
-            ((Char*)(&(VG_(sigstack)[0])) <= (Char*)(&dummy_local))
-            &&
-            ((Char*)(&dummy_local) < (Char*)(&(VG_(sigstack)[10000])))
-         )
-        ) {
-     VG_(message)(Vg_DebugMsg, 
-        "FATAL: signal delivered on the wrong stack?!");
-     VG_(message)(Vg_DebugMsg, 
-        "A possible workaround follows.  Please tell me");
-     VG_(message)(Vg_DebugMsg, 
-        "(jseward@acm.org) if the suggested workaround doesn't help.");
-     VG_(unimplemented)
-        ("support for progs compiled with -p/-pg; "
-         "rebuild your prog without -p/-pg");
-   }
-
-   /* Same two conditions again, this time as hard assertions. */
-   vg_assert((Char*)(&(VG_(sigstack)[0])) <= (Char*)(&dummy_local));
-   vg_assert((Char*)(&dummy_local) < (Char*)(&(VG_(sigstack)[10000])));
-
-   VG_(block_all_host_signals)( &saved_procmask );
-
-   /* This is a sanity check.  Either a signal has arrived because the
-      client set a handler for it, or because some thread sigwaited on
-      it.  Establish that at least one of these is the case. */
-   sane = False;
-   if (vg_scss.scss_per_sig[sigNo].scss_handler != VKI_SIG_DFL
-       && vg_scss.scss_per_sig[sigNo].scss_handler != VKI_SIG_IGN) {
-      sane = True;
-   } else {
-      for (tid = 1; tid < VG_N_THREADS; tid++) {
-         if (VG_(threads)[tid].status != VgTs_WaitSIG) 
-            continue;
-         if (VG_(ksigismember)(&VG_(threads)[tid].sigs_waited_for, sigNo))
-            sane = True;
-      }
-   }
-   if (!sane) {
-      if (VG_(clo_trace_signals)) {
-         VG_(add_to_msg)("unexpected!");
-         VG_(end_msg)();
-      }
-      /* Note: we panic with all signals blocked here.  Don't think
-         that matters. */
-      VG_(panic)("vg_oursignalhandler: unexpected signal");
-   }
-   /* End of the sanity check. */
-
-   /* Decide what to do with it. */
-   if (vg_dcss.dcss_sigpending[sigNo]) {
-      /* pending; ignore it.  There is one pending flag per signal
-         number, so a second instance cannot be queued. */
-      if (VG_(clo_trace_signals)) {
-         VG_(add_to_msg)("already pending; discarded" );
-         VG_(end_msg)();
-      }
-   } else {
-      /* Ok, we'd better deliver it to the client. */
-      /* Queue it up for delivery at some point in the future.  It is
-         not directed at any particular thread. */
-      vg_dcss.dcss_sigpending[sigNo] = True;
-      vg_dcss.dcss_destthread[sigNo] = VG_INVALID_THREADID;
-      if (VG_(clo_trace_signals)) {
-         VG_(add_to_msg)("queued" );
-         VG_(end_msg)();
-      }
-   }
-
-   /* We've finished messing with the queue, so re-enable host
-      signals. */
-   VG_(restore_all_host_signals)( &saved_procmask );
-
-   if ( (sigNo == VKI_SIGSEGV || sigNo == VKI_SIGBUS 
-         || sigNo == VKI_SIGFPE || sigNo == VKI_SIGILL)
-        &&
-        VG_(scheduler_jmpbuf_valid)
-      ) {
-      /* Can't continue; must longjmp back to the scheduler and thus
-         enter the sighandler immediately.  The signal number is left
-         in VG_(longjmpd_on_signal) for the code at the longjmp
-         target. */
-      VG_(longjmpd_on_signal) = sigNo;
-      __builtin_longjmp(VG_(scheduler_jmpbuf),1);
-   }
-
-   /* A SIGSEGV while the jmpbuf is not valid: warn, at most 3 times,
-      and return normally. */
-   if (sigNo == VKI_SIGSEGV && !VG_(scheduler_jmpbuf_valid)) {
-      if (++segv_warns <= 3) {
-	VG_(message)(Vg_UserMsg, 
-           "Warning: SIGSEGV not in user code; either from syscall kill()" );
-	VG_(message)(Vg_UserMsg, 
-           "   or possible Valgrind bug.  "
-           "This message is only shown 3 times." );
-      }
-   }
-}
-
-
-/* Remove exactly one signal, `sigNo', from the real process's
-   blocking mask, leaving every other signal's blocked/unblocked state
-   alone.  The outer insn loop calls this to reenable a host signal
-   after vg_oursignalhandler has left via longjmp.
-*/
-void VG_(unblock_host_signal) ( Int sigNo )
-{
-   vki_ksigset_t just_this_one;
-   Int           ret;
-
-   /* Build the singleton set { sigNo } ... */
-   VG_(ksigemptyset)(&just_this_one);
-   ret = VG_(ksigaddset)(&just_this_one,sigNo);
-   vg_assert(ret == 0);
-
-   /* ... and subtract it from the current mask. */
-   ret = VG_(ksigprocmask)(VKI_SIG_UNBLOCK,&just_this_one,NULL);
-   vg_assert(ret == 0);
-}
-
-
-/* Debugging aid (normally unused): print the handler, flags and
-   restorer of *sa on one line, then on a second line the numbers of
-   all signals 1 .. VKI_KNSIG which are members of its mask. */
-static __attribute((unused))
-void pp_vg_ksigaction ( vki_ksigaction* sa )
-{
-   Int i;
-   VG_(printf)("vg_ksigaction: handler %p, flags 0x%x, restorer %p\n", 
-               sa->ksa_handler, (UInt)sa->ksa_flags, sa->ksa_restorer);
-   VG_(printf)("vg_ksigaction: { ");
-   for (i = 1; i <= VKI_KNSIG; i++)
-      /* Only the function name goes inside VG_(); the argument list
-         follows the closing paren, as everywhere else in this file. */
-      if (VG_(ksigismember)(&(sa->ksa_mask),i))
-         VG_(printf)("%d ", i);
-   VG_(printf)("}\n");
-}
-
-
-/* At startup, copy the process' real signal state to the SCSS.
-   Whilst doing this, block all real signals.  Then calculate SKSS and
-   set the kernel to that.  Also initialise DCSS. 
-
-   Also registers VG_(sigstack) with the kernel as the alternate
-   signal stack, after saving the process' own alt-stack settings in
-   vg_scss.altstack.  Thread 1 must already be Runnable (asserted); it
-   inherits the process' signal mask.
-*/
-void VG_(sigstartup_actions) ( void )
-{
-   Int i, ret;
-
-   vki_ksigset_t  saved_procmask;
-   vki_kstack_t   altstack_info;
-   vki_ksigaction sa;
-
-   /* VG_(printf)("SIGSTARTUP\n"); */
-   /* Block all signals.  
-      saved_procmask remembers the previous mask. */
-   VG_(block_all_host_signals)( &saved_procmask );
-
-   /* Copy per-signal settings to SCSS. */
-   for (i = 1; i <= VKI_KNSIG; i++) {
-
-      /* Get the old host action.  Passing NULL as the new action
-         makes this a pure query. */
-      ret = VG_(ksigaction)(i, NULL, &sa);
-      vg_assert(ret == 0);
-
-      if (VG_(clo_trace_signals))
-         VG_(printf)("snaffling handler 0x%x for signal %d\n", 
-                     (Addr)(sa.ksa_handler), i );
-
-      vg_scss.scss_per_sig[i].scss_handler  = sa.ksa_handler;
-      vg_scss.scss_per_sig[i].scss_flags    = sa.ksa_flags;
-      vg_scss.scss_per_sig[i].scss_mask     = sa.ksa_mask;
-      vg_scss.scss_per_sig[i].scss_restorer = sa.ksa_restorer;
-   }
-
-   /* Copy the alt stack, if any.  Again a pure query (new == NULL). */
-   ret = VG_(ksigaltstack)(NULL, &vg_scss.altstack);
-   vg_assert(ret == 0);
-
-   /* Copy the process' signal mask into the root thread.  This is
-      the mask that was in force before we blocked everything above. */
-   vg_assert(VG_(threads)[1].status == VgTs_Runnable);
-   VG_(threads)[1].sig_mask = saved_procmask;
-
-   /* Initialise DCSS: nothing pending, nothing directed anywhere. */
-   for (i = 1; i <= VKI_KNSIG; i++) {
-      vg_dcss.dcss_sigpending[i] = False;
-      vg_dcss.dcss_destthread[i] = VG_INVALID_THREADID;
-   }
-
-   /* Register an alternative stack for our own signal handler to run
-      on.  The size given here (10000 UInts) is the same extent that
-      vg_oursignalhandler checks its stack address against. */
-   altstack_info.ss_sp = &(VG_(sigstack)[0]);
-   altstack_info.ss_size = 10000 * sizeof(UInt);
-   altstack_info.ss_flags = 0;
-   ret = VG_(ksigaltstack)(&altstack_info, NULL);
-   if (ret != 0) {
-      VG_(panic)(
-         "vg_sigstartup_actions: couldn't install alternative sigstack");
-   }
-   if (VG_(clo_trace_signals)) {
-      VG_(message)(Vg_DebugExtraMsg, 
-         "vg_sigstartup_actions: sigstack installed ok");
-   }
-
-   /* DEBUGGING HACK */
-   /* VG_(ksignal)(VKI_SIGUSR1, &VG_(oursignalhandler)); */
-
-   /* Calculate SKSS and apply it.  This also sets the initial kernel
-      mask we need to run with. */
-   VG_(handle_SCSS_change)( True /* forced update */ );
-}
-
-
-/* Copy the process' sim signal state to the real state,
-   for when we transfer from the simulated to real CPU.
-   PROBLEM: what if we're running a signal handler when we
-   get here?  Hmm.
-   I guess we wind up in vg_signalreturn_bogusRA, *or* the
-   handler has done/will do a longjmp, in which case we're ok.
-
-   It is important (see vg_startup.S) that this proc does not
-   change the state of the real FPU, since it is called when
-   running the program on the real CPU.
-
-   Concretely: every SCSS per-signal entry is installed with
-   VG_(ksigaction), the client's alt stack from vg_scss.altstack is
-   re-registered, and the kernel mask is set to thread 1's mask.
-   Thread 1 must still exist (asserted).
-*/
-void VG_(sigshutdown_actions) ( void )
-{
-   Int i, ret;
-
-   vki_ksigset_t  saved_procmask;
-   vki_ksigaction sa;
-
-   /* Keep signals out while handlers are being swapped.  The mask
-      saved here is not used again; see the end of the function. */
-   VG_(block_all_host_signals)( &saved_procmask );
-
-   /* Copy per-signal settings from SCSS. */
-   for (i = 1; i <= VKI_KNSIG; i++) {
-
-      sa.ksa_handler  = vg_scss.scss_per_sig[i].scss_handler;
-      sa.ksa_flags    = vg_scss.scss_per_sig[i].scss_flags;
-      sa.ksa_mask     = vg_scss.scss_per_sig[i].scss_mask;
-      sa.ksa_restorer = vg_scss.scss_per_sig[i].scss_restorer;
-
-      if (VG_(clo_trace_signals))
-         VG_(printf)("restoring handler 0x%x for signal %d\n", 
-                     (Addr)(sa.ksa_handler), i );
-
-      /* Set the old host action.  A failure is tolerated only for
-         SIGKILL and SIGSTOP. */
-      ret = VG_(ksigaction)(i, &sa, NULL);
-      if (i != VKI_SIGKILL && i != VKI_SIGSTOP) 
-         vg_assert(ret == 0);
-   }
-
-   /* Restore the sig alt stack. */
-   ret = VG_(ksigaltstack)(&vg_scss.altstack, NULL);
-   vg_assert(ret == 0);
-
-   /* A bit of a kludge -- set the sigmask to that of the root
-      thread. */
-   vg_assert(VG_(threads)[1].status != VgTs_Empty);
-   VG_(restore_all_host_signals)( &VG_(threads)[1].sig_mask );
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                             vg_signals.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_startup.S b/coregrind/vg_startup.S
deleted file mode 100644
index 63ee590153..0000000000
--- a/coregrind/vg_startup.S
+++ /dev/null
@@ -1,233 +0,0 @@
-
-##--------------------------------------------------------------------##
-##--- Startup and shutdown code for Valgrind.                      ---##
-##---                                                 vg_startup.S ---##
-##--------------------------------------------------------------------##
-
-/*
-  This file is part of Valgrind, an x86 protected-mode emulator 
-  designed for debugging and profiling binaries on x86-Unixes.
-
-  Copyright (C) 2000-2002 Julian Seward 
-     jseward@acm.org
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of the
-  License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-  02111-1307, USA.
-
-  The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_constants.h"
-
-
-#---------------------------------------------------------------------
-#
-# Startup and shutdown code for Valgrind.  Particularly hairy.
-#
-# The dynamic linker, ld.so, will run the contents of the .init
-# section, once it has located, mmap-d and linked the shared
-# libraries needed by the program.  Valgrind is itself a shared
-# library.  ld.so then runs code in the .init sections of each
-# library in turn, in order to give them a chance to initialise
-# themselves.  We hijack this mechanism.  Our startup routine
-# does return -- and execution continues -- except on the
-# synthetic CPU, not the real one.  But ld.so, and the program
-# it is starting, cant tell the difference.
-#
-# The management apologise for the lack of apostrophes in these
-# comments.  GNU as seems to object to them, for some reason.
-
-
-# Hook Valgrind into the per-library startup/shutdown sequence run
-# by ld.so: .init calls VG_(startup) and .fini calls VG_(shutdown).
-.section .init
-	call VG_(startup)
-.section .fini
-	call VG_(shutdown)
-
-# Set to 1 the first time VG_(startup) runs, so that later calls
-# return immediately.  The flag is read with cmpl and written with
-# movl, both of which touch 32 bits, so it has to be a full .long.
-# A 16-bit .word would let the store spill into whatever follows
-# it in .data.
-.section .data
-valgrind_already_initted:
-	.long	0
-	
-.section .text
-	
-
-.global VG_(startup)
-VG_(startup):
-	cmpl	$0, valgrind_already_initted	# has startup already run?
-	je	really_start_up
-	ret					# yes: nothing more to do
-
-really_start_up:
-	movl	$1, valgrind_already_initted	# latch, so later calls return at once
-	
-        # Record %esp as it was when we got here.  This is because argv/c
-	# and envp[] are passed as args to this function, and we need to see
-	# envp so we can get at the env var VG_ARGS without help from libc.
-	# The stack layout at this point depends on the version of glibc in
-	# use.  See process_cmd_line_options() in vg_main.c for details.
-        movl    %esp, VG_(esp_at_startup)
-        
-	# We have control!  Save the state of the machine in
-	# the simulators state, and switch stacks.
-	# Except ... we cant copy the machines registers into their
-	# final places in vg_baseBlock, because the offsets to them
-	# have not yet been set up.  Instead, they are copied to a
-	# temporary place (m_state_static).  In vg_main.c, once the
-	# baseBlock offsets are set up, values are copied into baseBlock.
-	movl	%eax, VG_(m_state_static)+0
-	movl	%ecx, VG_(m_state_static)+4
-	movl	%edx, VG_(m_state_static)+8
-	movl	%ebx, VG_(m_state_static)+12
-	movl	%esp, VG_(m_state_static)+16	# %esp as on entry; stacks not switched yet
-	movl	%ebp, VG_(m_state_static)+20
-	movl	%esi, VG_(m_state_static)+24
-	movl	%edi, VG_(m_state_static)+28
-	pushfl					# eflags go via the stack into %eax ...
-	popl	%eax				# ... whose own value is already saved at +0
-	movl	%eax, VG_(m_state_static)+32
-	fwait
-	fnsave	VG_(m_state_static)+40		# FPU image starts at +40
-	frstor	VG_(m_state_static)+40		# reload it, since fnsave also resets the FPU
-
-	# keep the first and last 10 words free to check for overruns	
-	movl	$VG_(stack)+39996 -40, %esp	# from here on we run on VG_(stack)
-
-	# Now some real magic.  We need this procedure to return,
-	# since thats what ld.so expects, but running on the
-	# simulator.  So vg_main starts the simulator running at
-	# the insn labelled first_insn_to_simulate.
-
-	movl	$first_insn_to_simulate, VG_(m_state_static)+36	# slot +36: where simulation begins
-	jmp	VG_(main)			# a jump, not a call: no return address is pushed
-first_insn_to_simulate:
-	# Nothing else to do -- just return in the "normal" way.
-	ret
-
-
-
-VG_(shutdown):					# reached from the call in .section .fini
-	# Just return, and ignore any attempt by ld.so to call
-	# valgrind.sos exit function.  We just run the client all
-	# the way to the final exit() syscall.  This sidesteps
-	# problems caused by ld.so calling the finalisation code
-	# of other .sos *after* it shuts down valgrind, which
-	# was causing big problems with threads.
-	ret					# deliberately does nothing else
-
-	
-	
-.global	VG_(switch_to_real_CPU)
-VG_(switch_to_real_CPU):
-	# Once Valgrind has decided it needs to exit,
-	# because the specified number of insns have been completed
-	# during a debugging run, it jumps here, which copies the
-	# simulators state into the real machine state.  Execution
-	# of the rest of the program continues on the real CPU,
-	# and there is no way for the simulator to regain control
-	# after this point.
-	frstor	VG_(m_state_static)+40		# FPU image held at +40
-	movl	VG_(m_state_static)+32, %eax	# saved eflags ...
-	pushl	%eax				# ... loaded into the flags register
-	popfl					#     by way of the current stack
-	movl	VG_(m_state_static)+0, %eax
-	movl	VG_(m_state_static)+4, %ecx
-	movl	VG_(m_state_static)+8, %edx
-	movl	VG_(m_state_static)+12, %ebx
-	movl	VG_(m_state_static)+16, %esp	# stack pointer now comes from slot +16
-	movl	VG_(m_state_static)+20, %ebp
-	movl	VG_(m_state_static)+24, %esi
-	movl	VG_(m_state_static)+28, %edi
-
-	pushal					# keep the registers just loaded ...
-	pushfl					# ... and the flags across the call below
-	# We hope that vg_sigshutdown_actions does not alter
-	# the FPU state.
-	call	 VG_(sigshutdown_actions)
-	popfl
-	popal
-	# re-restore the FPU state anyway ...
-	frstor	VG_(m_state_static)+40	
-	jmp	*VG_(m_state_static)+36		# indirect jump to the address stored in slot +36
-
-
-
-/*------------------------------------------------------------*/
-/*--- A function to temporarily copy %ESP/%EBP into        ---*/
-/*--- %esp/%ebp and then start up GDB.                     ---*/
-/*------------------------------------------------------------*/
-
-/*
-extern void VG_(swizzle_esp_then_start_GDB) ( Addr m_eip_at_error,
-                                              Addr m_esp_at_error,
-                                              Addr m_ebp_at_error );
-*/
-
-/*--- This is clearly not re-entrant! ---*/
-.data
-vg_ebp_saved_over_GDB_start:
-	.long	0				# simulator %ebp, parked while GDB runs
-vg_esp_saved_over_GDB_start:
-	.long	0				# simulator %esp, parked while GDB runs
-.text
-	
-.global VG_(swizzle_esp_then_start_GDB)	
-VG_(swizzle_esp_then_start_GDB):
-	pushal					# 8 registers = 32 bytes, so the args start at 36(%esp)
-
-	# remember the simulators current stack/frame pointers
-	movl	%ebp, vg_ebp_saved_over_GDB_start
-	movl	%esp, vg_esp_saved_over_GDB_start
-
-	# get args into regs
-	movl	44(%esp), %eax		# client %EBP
-	movl	40(%esp), %ebx		# client %ESP
-	movl	36(%esp), %ecx		# client %EIP
-
-	# Now that we dont need to refer to simulators stack any more,
-	# put %ESP into %esp
-	movl	%ebx, %esp
-
-	### %esp now refers to clients stack
-	### mess with the clients stack to make it look as if it
-	### called this procedure, since otherwise it will look to gdb
-	### as if the top (currently executing) stack frame of the
-	### client is missing.
-	
-	# push %EIP.  This is a faked-up return address.
-	pushl	%ecx
-
-	# push %EBP.  This is a faked %ebp-chain pointer.
-	pushl	%eax
-
-	movl	%esp, %ebp			# frame pointer for the faked frame
-	
-	call	VG_(start_GDB_whilst_on_client_stack)
-
-	# restore the simulators stack/frame pointer
-	movl	vg_ebp_saved_over_GDB_start, %ebp
-	movl	vg_esp_saved_over_GDB_start, %esp
-	
-	popal					# matches the pushal at entry
-	ret
-
-# gcc puts this construction at the end of every function.  I think it
-# allows the linker to figure out the size of the function.  So we do
-# the same, in the vague hope that it might help GDBs navigation.
-.Lend_of_swizzle:
-	.size	VG_(swizzle_esp_then_start_GDB), .Lend_of_swizzle-VG_(swizzle_esp_then_start_GDB)
-
-
-##--------------------------------------------------------------------##
-##--- end                                             vg_startup.S ---##
-##--------------------------------------------------------------------##
diff --git a/coregrind/vg_symtab2.c b/coregrind/vg_symtab2.c
deleted file mode 100644
index 8330794ee3..0000000000
--- a/coregrind/vg_symtab2.c
+++ /dev/null
@@ -1,2079 +0,0 @@
-/*--------------------------------------------------------------------*/
-/*--- Management of symbols and debugging information.             ---*/
-/*---                                                 vg_symtab2.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-#include <elf.h>          /* ELF defns                      */
-#include <a.out.h>        /* stabs defns                    */
-
-
-/* Majorly rewritten Sun 3 Feb 02 to enable loading symbols from
-   dlopen()ed libraries, which is something that KDE3 does a lot.
-
-   Stabs reader greatly improved by Nick Nethercote, Apr 02.
-
-   16 May 02: when notified about munmap, return a Bool indicating
-   whether or not the area being munmapped had executable permissions.
-   This is then used to determine whether or not
-   VG_(invalid_translations) should be called for that area.  In order
-   that this work even if --instrument=no, in this case we still keep
-   track of the mapped executable segments, but do not load any debug
-   info or symbols.
-*/
-
-/*------------------------------------------------------------*/
-/*--- Structs n stuff                                      ---*/
-/*------------------------------------------------------------*/
-
-/* A structure to hold an ELF symbol (very crudely).  Entries are
-   stored in a SegInfo's symtab array, and nmoff indexes that same
-   SegInfo's strtab. */
-typedef 
-   struct { 
-      Addr addr;   /* lowest address of entity */
-      UInt size;   /* size in bytes; addSym drops symbols where this is 0 */
-      Int  nmoff;  /* offset of name in this SegInfo's str tab */
-   }
-   RiSym;
-
-/* Line count at which overflow happens, due to line numbers being stored as
- * shorts in `struct nlist' in a.out.h. */
-#define LINENO_OVERFLOW (1 << (sizeof(short) * 8))
-
-#define LINENO_BITS     20
-#define LOC_SIZE_BITS  (32 - LINENO_BITS)
-#define MAX_LINENO     ((1 << LINENO_BITS) - 1)
-
-/* Unlikely to have any lines with instruction ranges > 4096 bytes */
-#define MAX_LOC_SIZE   ((1 << LOC_SIZE_BITS) - 1)
-
-/* Number used to detect line number overflows;  if one line is 60000-odd
- * smaller than the previous, it was probably an overflow.  
- */
-#define OVERFLOW_DIFFERENCE     (LINENO_OVERFLOW - 5000)
-
-/* A structure to hold addr-to-source info for a single line.  There can be a
- * lot of these, hence the dense packing: size and lineno are bit-fields
- * whose widths (LOC_SIZE_BITS and LINENO_BITS) add up to 32. */
-typedef
-   struct {
-      /* Word 1 */
-      Addr   addr;                  /* lowest address for this line */
-      /* Word 2 */
-      UShort size:LOC_SIZE_BITS;    /* byte size; we catch overflows of this
-                                       (addLineInfo falls back to 1 above
-                                       MAX_LOC_SIZE) */
-      UInt   lineno:LINENO_BITS;    /* source line number, or zero;
-                                       at most MAX_LINENO */
-      /* Word 3 */
-      UInt   fnmoff;                /* source filename; offset in this 
-                                       SegInfo's str tab */
-   }
-   RiLoc;
-
-
-/* A structure which contains information pertaining to one mapped
-   text segment.  Each of the three tables below is a heap array
-   with _used entries filled out of _size allocated; freeSegInfo
-   releases them along with the filename. */
-typedef
-   struct _SegInfo {
-      struct _SegInfo* next;
-      /* Description of the mapped segment. */
-      Addr   start;
-      UInt   size;
-      UChar* filename; /* in mallocville */
-      UInt   foffset;
-      /* An expandable array of symbols (grown by addSym). */
-      RiSym* symtab;
-      UInt   symtab_used;
-      UInt   symtab_size;
-      /* An expandable array of locations (grown by addLoc). */
-      RiLoc* loctab;
-      UInt   loctab_used;
-      UInt   loctab_size;
-      /* An expandable array of characters -- the string table
-         (grown by addStr; sizes are in bytes). */
-      Char*  strtab;
-      UInt   strtab_used;
-      UInt   strtab_size;
-      /* offset    is what we need to add to symbol table entries
-                   to get the real location of that symbol in memory.
-                   For executables, offset is zero.  
-                   For .so's, offset == base_addr.
-                   This seems like a giant kludge to me.
-      */
-      UInt   offset;
-   } 
-   SegInfo;
-
-
-/* -- debug helper: print the identifying fields of a SegInfo -- */
-static void ppSegInfo ( SegInfo* si )
-{
-   /* Show the literal text NULL when no filename is recorded. */
-   UChar* shown_name = (UChar*)"NULL";
-   if (si->filename)
-      shown_name = si->filename;
-
-   VG_(printf)("name: %s\n"
-               "start %p, size %d, foffset %d\n",
-               shown_name,
-               si->start, si->size, si->foffset );
-}
-
-/* Release a SegInfo together with whichever of its heap-allocated
-   parts (filename, symtab, loctab, strtab) are present. */
-static void freeSegInfo ( SegInfo* si )
-{
-   vg_assert(si != NULL);
-
-   if (si->filename != NULL) {
-      VG_(free)(VG_AR_SYMTAB, si->filename);
-   }
-   if (si->symtab != NULL) {
-      VG_(free)(VG_AR_SYMTAB, si->symtab);
-   }
-   if (si->loctab != NULL) {
-      VG_(free)(VG_AR_SYMTAB, si->loctab);
-   }
-   if (si->strtab != NULL) {
-      VG_(free)(VG_AR_SYMTAB, si->strtab);
-   }
-
-   /* Finally the record itself. */
-   VG_(free)(VG_AR_SYMTAB, si);
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Adding stuff                                         ---*/
-/*------------------------------------------------------------*/
-
-/* Add a str to the string table, including terminating zero, and
-   return offset of the string in si's strtab.
-
-   When the table is too small it is reallocated.  The new size is
-   found by doubling (starting from 5000 bytes for an empty table)
-   until the string fits.  A single doubling is not always enough:
-   a string longer than the current table would otherwise be copied
-   past the end of the new allocation. */
-
-static __inline__
-Int addStr ( SegInfo* si, Char* str )
-{
-   Char* new_tab;
-   Int   new_sz, i, space_needed;
-   
-   space_needed = 1 + VG_(strlen)(str);
-   if (si->strtab_used + space_needed > si->strtab_size) {
-      new_sz = si->strtab_size;
-      do {
-         new_sz = 2 * new_sz;
-         if (new_sz == 0) new_sz = 5000;
-         /* Guard against the doubling wrapping around. */
-         vg_assert(new_sz > 0);
-      } while (si->strtab_used + space_needed > (UInt)new_sz);
-      new_tab = VG_(malloc)(VG_AR_SYMTAB, new_sz);
-      if (si->strtab != NULL) {
-         /* Carry the existing contents over to the bigger table. */
-         for (i = 0; i < si->strtab_used; i++)
-            new_tab[i] = si->strtab[i];
-         VG_(free)(VG_AR_SYMTAB, si->strtab);
-      }
-      si->strtab      = new_tab;
-      si->strtab_size = new_sz;
-   }
-
-   /* Copy the string, terminating zero included. */
-   for (i = 0; i < space_needed; i++)
-      si->strtab[si->strtab_used+i] = str[i];
-
-   si->strtab_used += space_needed;
-   vg_assert(si->strtab_used <= si->strtab_size);
-   return si->strtab_used - space_needed;
-}
-
-/* Append a copy of *sym to si's symbol table, enlarging the table
-   first when it is full.  Symbols of size zero are silently
-   dropped. */
-
-static __inline__
-void addSym ( SegInfo* si, RiSym* sym )
-{
-   Int    k, capacity;
-   RiSym* bigger;
-
-   if (sym->size != 0) {
-
-      if (si->symtab_used == si->symtab_size) {
-         /* Full: double the capacity, or start out at 500 entries. */
-         capacity = 2 * si->symtab_size;
-         if (capacity == 0) capacity = 500;
-         bigger = VG_(malloc)(VG_AR_SYMTAB, capacity * sizeof(RiSym) );
-         if (si->symtab != NULL) {
-            k = 0;
-            while (k < si->symtab_used) {
-               bigger[k] = si->symtab[k];
-               k++;
-            }
-            VG_(free)(VG_AR_SYMTAB, si->symtab);
-         }
-         si->symtab      = bigger;
-         si->symtab_size = capacity;
-      }
-
-      si->symtab[si->symtab_used] = *sym;
-      si->symtab_used++;
-      vg_assert(si->symtab_used <= si->symtab_size);
-   }
-}
-
-/* Append a copy of *loc to si's location table.  A full table is
-   replaced by one twice as large (or by a 500-entry one if there
-   was none yet) before the new entry is stored. */
-
-static __inline__
-void addLoc ( SegInfo* si, RiLoc* loc )
-{
-   RiLoc* grown;
-   Int    n, wanted;
-
-   /* Callers are expected to have filtered out zero-sized locs. */
-   vg_assert(loc->size > 0);
-
-   if (si->loctab_used == si->loctab_size) {
-      wanted = 2 * si->loctab_size;
-      if (wanted == 0) {
-         wanted = 500;
-      }
-      grown = VG_(malloc)(VG_AR_SYMTAB, wanted * sizeof(RiLoc) );
-      if (si->loctab != NULL) {
-         /* Move the existing entries across, then drop the old table. */
-         n = 0;
-         while (n < si->loctab_used) {
-            grown[n] = si->loctab[n];
-            n++;
-         }
-         VG_(free)(VG_AR_SYMTAB, si->loctab);
-      }
-      si->loctab_size = wanted;
-      si->loctab      = grown;
-   }
-
-   si->loctab[si->loctab_used] = *loc;
-   si->loctab_used++;
-   vg_assert(si->loctab_used <= si->loctab_size);
-}
-
-
-/* Top-level place to call to add a source-location mapping entry.
-
-   si      segment whose location table receives the entry
-   fnmoff  source filename, stored in the entry's fnmoff field
-   this    first address covered by the line
-   next    address just past the covered range; next == this is
-           ignored
-   lineno  source line; must be >= 0, and an entry whose lineno
-           exceeds MAX_LINENO is rejected with a warning
-   entry   only used in the warning messages
-
-   A range that is reversed (this > next) or longer than
-   MAX_LOC_SIZE is recorded with size 1.  A range lying wholly
-   outside [si->start, si->start + si->size) is dropped. */
-
-static __inline__
-void addLineInfo ( SegInfo* si,
-                   Int      fnmoff,
-                   Addr     this,
-                   Addr     next,
-                   Int      lineno,
-                   Int      entry /* only needed for debug printing */
-                 )
-{
-   RiLoc loc;
-   Int size = next - this;
-
-   /* Ignore zero-sized locs */
-   if (this == next) return;
-
-   /* Maximum sanity checking.  Some versions of GNU as do a shabby
-    * job with stabs entries; if anything looks suspicious, revert to
-    * a size of 1.  This should catch the instruction of interest
-    * (since if using asm-level debug info, one instruction will
-    * correspond to one line, unlike with C-level debug info where
-    * multiple instructions can map to the one line), but avoid
-    * catching any other instructions bogusly. */
-   if (this > next) {
-       VG_(message)(Vg_DebugMsg, 
-                    "warning: line info addresses out of order "
-                    "at entry %d: 0x%x 0x%x", entry, this, next);
-       size = 1;
-   }
-
-   if (size > MAX_LOC_SIZE) {
-       if (0)   /* warning deliberately switched off */
-       VG_(message)(Vg_DebugMsg, 
-                    "warning: line info address range too large "
-                    "at entry %d: %d", entry, size);
-       size = 1;
-   }
-
-   /* vg_assert(this < si->start + si->size && next-1 >= si->start); */
-   if (this >= si->start + si->size || next-1 < si->start) {
-       if (0)   /* warning deliberately switched off */
-       VG_(message)(Vg_DebugMsg, 
-                    "warning: ignoring line info entry falling "
-                    "outside current SegInfo: %p %p %p %p",
-                    si->start, si->start + si->size, 
-                    this, next-1);
-       return;
-   }
-
-   vg_assert(lineno >= 0);
-   if (lineno > MAX_LINENO) {
-       VG_(message)(Vg_UserMsg, 
-                    "warning: ignoring line info entry with "
-                    "huge line number (%d)", lineno);
-       VG_(message)(Vg_UserMsg, 
-                    "         Can't handle line numbers "
-                    "greater than %d, sorry", MAX_LINENO);
-       return;
-   }
-
-   loc.addr      = this;
-   loc.size      = (UShort)size;   /* at most MAX_LOC_SIZE by now */
-   loc.lineno    = lineno;
-   loc.fnmoff    = fnmoff;
-   addLoc ( si, &loc );
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Helpers                                              ---*/
-/*------------------------------------------------------------*/
-
-/* Report a non-fatal problem met while reading symbols; the message
-   is shown only when verbosity exceeds 1.  Use vg_panic if
-   terminal. */
-static 
-void vg_symerr ( Char* msg )
-{
-   if (VG_(clo_verbosity) <= 1)
-      return;
-   VG_(message)(Vg_UserMsg,"%s", msg );
-}
-
-
-/* Print entry i of si's symbol table: its index, first and last
-   address, size and name. */
-static
-void printSym ( SegInfo* si, Int i )
-{
-   RiSym* sym = &si->symtab[i];
-
-   VG_(printf)( "%5d:  %8p .. %8p (%d)      %s\n",
-                i,
-                sym->addr,
-                sym->addr + sym->size - 1,
-                sym->size,
-                &si->strtab[sym->nmoff] );
-}
-
-
-#if 0
-/* Print the entire sym tab.  Compiled out, and stale: it reads
-   vg_symtab_used and calls printSym with a single argument,
-   whereas printSym takes (SegInfo*, Int).  Note also that the
-   closing banner says BEGIN rather than END. */
-static __attribute__ ((unused))
-void printSymtab ( void )
-{
-   Int i;
-   VG_(printf)("\n------ BEGIN vg_symtab ------\n");
-   for (i = 0; i < vg_symtab_used; i++)
-      printSym(i);
-   VG_(printf)("------ BEGIN vg_symtab ------\n");
-}
-#endif
-
-#if 0
-/* Paranoid strcat (compiled out).  Appends src to the
-   zero-terminated string in dst without ever touching an index
-   >= maxlen.  If the limit is reached, it returns without writing
-   a terminating zero. */
-static
-void safeCopy ( UChar* dst, UInt maxlen, UChar* src )
-{
-   UInt i = 0, j = 0;
-   while (True) {   /* find the end of dst */
-      if (i >= maxlen) return;
-      if (dst[i] == 0) break;
-      i++;
-   }
-   while (True) {   /* copy src, terminator included, while room remains */
-      if (i >= maxlen) return;
-      dst[i] = src[j];
-      if (src[j] == 0) return;
-      i++; j++;
-   }
-}
-#endif
-
-
-/*------------------------------------------------------------*/
-/*--- Canonicalisers                                       ---*/
-/*------------------------------------------------------------*/
-
-/* Sort the symtab by starting address, and emit warnings if any
-   symbols have overlapping address ranges.  We use that old chestnut,
-   shellsort.  Mash the table around so as to establish the property
-   that addresses are in order and the ranges do not overlap.  This
-   facilitates using binary search to map addresses to symbols when we
-   come to query the table.
-
-   Phases: (1) shellsort on .addr; (2) repeatedly merge adjacent
-   entries with identical addr and size, keeping the longer name;
-   (3) truncate entries that overlap their successor, and go back
-   to (2) if anything was truncated; (4) assert the postconditions.
-   The overlap messages are printed only under
-   VG_(clo_trace_symtab).
-*/
-static 
-void canonicaliseSymtab ( SegInfo* si )
-{
-   /* Shellsort increments.  Magic numbers due to Janet Incerpi and
-      Robert Sedgewick. */
-   Int   incs[16] = { 1, 3, 7, 21, 48, 112, 336, 861, 1968,
-                      4592, 13776, 33936, 86961, 198768, 
-                      463792, 1391376 };
-   Int   lo = 0;
-   Int   hi = si->symtab_used-1;
-   Int   i, j, h, bigN, hp, n_merged, n_truncated;
-   RiSym v;
-   Addr  s1, s2, e1, e2;
-
-#  define SWAP(ty,aa,bb) \
-      do { ty tt = (aa); (aa) = (bb); (bb) = tt; } while (0)
-
-   bigN = hi - lo + 1; if (bigN < 2) return;   /* 0 or 1 entries: nothing to do */
-   hp = 0; while (hp < 16 && incs[hp] < bigN) hp++; hp--;   /* largest increment below bigN */
-   vg_assert(0 <= hp && hp < 16);
-
-   for (; hp >= 0; hp--) {
-      h = incs[hp];
-      i = lo + h;
-      while (1) {
-         if (i > hi) break;
-         v = si->symtab[i];
-         j = i;
-         while (si->symtab[j-h].addr > v.addr) {
-            si->symtab[j] = si->symtab[j-h];
-            j = j - h;
-            if (j <= (lo + h - 1)) break;
-         }
-         si->symtab[j] = v;
-         i++;
-      }
-   }
-
-  cleanup_more:
- 
-   /* If two symbols have identical address ranges, favour the
-      one with the longer name. 
-   */
-   do {
-      n_merged = 0;
-      j = si->symtab_used;   /* j: entry count before this pass */
-      si->symtab_used = 0;   /* rebuilt in place as survivors are copied down */
-      for (i = 0; i < j; i++) {
-         if (i < j-1
-             && si->symtab[i].addr   == si->symtab[i+1].addr
-             && si->symtab[i].size   == si->symtab[i+1].size) {
-            n_merged++;
-            /* merge the two into one */
-            if (VG_(strlen)(&si->strtab[si->symtab[i].nmoff]) 
-                > VG_(strlen)(&si->strtab[si->symtab[i+1].nmoff])) {
-               si->symtab[si->symtab_used++] = si->symtab[i];
-            } else {
-               si->symtab[si->symtab_used++] = si->symtab[i+1];
-            }
-            i++;   /* skip the partner just consumed */
-         } else {
-            si->symtab[si->symtab_used++] = si->symtab[i];
-         }
-      }
-      if (VG_(clo_trace_symtab))
-         VG_(printf)( "%d merged\n", n_merged);
-   }
-   while (n_merged > 0);
-
-   /* Detect and "fix" overlapping address ranges. */
-   n_truncated = 0;
-
-   for (i = 0; i < si->symtab_used-1; i++) {
-
-      vg_assert(si->symtab[i].addr <= si->symtab[i+1].addr);
-
-      /* Check for common (no overlap) case. */ 
-      if (si->symtab[i].addr + si->symtab[i].size 
-          <= si->symtab[i+1].addr)
-         continue;
-
-      /* There's an overlap.  Truncate one or the other. */
-      if (VG_(clo_trace_symtab)) {
-         VG_(printf)("overlapping address ranges in symbol table\n\t");
-         printSym(si,i);
-         VG_(printf)("\t");
-         printSym(si,i+1);
-         VG_(printf)("\n");
-      }
-
-      /* Truncate one or the other.  s1/e1 and s2/e2 are the first
-         and last addresses of entries i and i+1. */
-      s1 = si->symtab[i].addr;
-      s2 = si->symtab[i+1].addr;
-      e1 = s1 + si->symtab[i].size - 1;
-      e2 = s2 + si->symtab[i+1].size - 1;
-      if (s1 < s2) {
-         e1 = s2-1;   /* entry i now stops just short of entry i+1 */
-      } else {
-         vg_assert(s1 == s2);
-         if (e1 > e2) { 
-            s1 = e2+1; SWAP(Addr,s1,s2); SWAP(Addr,e1,e2); 
-         } else 
-         if (e1 < e2) {
-            s2 = e1+1;
-         } else {
-	   /* e1 == e2.  Identical addr ranges.  We'll eventually wind
-              up back at cleanup_more, which will take care of it. */
-	 }
-      }
-      si->symtab[i].addr   = s1;
-      si->symtab[i+1].addr = s2;
-      si->symtab[i].size   = e1 - s1 + 1;
-      si->symtab[i+1].size = e2 - s2 + 1;
-      vg_assert(s1 <= s2);
-      vg_assert(si->symtab[i].size > 0);
-      vg_assert(si->symtab[i+1].size > 0);
-      /* It may be that the i+1 entry now needs to be moved further
-         along to maintain the address order requirement. */
-      j = i+1;
-      while (j < si->symtab_used-1 
-             && si->symtab[j].addr > si->symtab[j+1].addr) {
-         SWAP(RiSym,si->symtab[j],si->symtab[j+1]);
-         j++;
-      }
-      n_truncated++;
-   }
-
-   if (n_truncated > 0) goto cleanup_more;   /* redo the merge and overlap passes */
-
-   /* Ensure relevant postconditions hold. */
-   for (i = 0; i < si->symtab_used-1; i++) {
-      /* No zero-sized symbols. */
-      vg_assert(si->symtab[i].size > 0);
-      /* In order. */
-      vg_assert(si->symtab[i].addr < si->symtab[i+1].addr);
-      /* No overlaps. */
-      vg_assert(si->symtab[i].addr + si->symtab[i].size - 1
-                < si->symtab[i+1].addr);
-   }
-#  undef SWAP
-}
-
-
-
-/* Sort the location table by starting address.  Mash the table around
-   so as to establish the property that addresses are in order and the
-   ranges do not overlap.  This facilitates using binary search to map
-   addresses to locations when we come to query the table.  
-
-   Phases: shellsort on .addr; shorten any entry that runs into its
-   successor; drop entries whose size became zero; finally assert
-   the ordering and no-overlap postconditions.
-*/
-static 
-void canonicaliseLoctab ( SegInfo* si )
-{
-   /* Shellsort increments.  Magic numbers due to Janet Incerpi and
-      Robert Sedgewick. */
-   Int   incs[16] = { 1, 3, 7, 21, 48, 112, 336, 861, 1968,
-                      4592, 13776, 33936, 86961, 198768, 
-                      463792, 1391376 };
-   Int   lo = 0;
-   Int   hi = si->loctab_used-1;
-   Int   i, j, h, bigN, hp;
-   RiLoc v;
-
-   /* Sort by start address. */
-
-   bigN = hi - lo + 1; if (bigN < 2) return;
-   hp = 0; while (hp < 16 && incs[hp] < bigN) hp++; hp--;
-   vg_assert(0 <= hp && hp < 16);
-
-   for (; hp >= 0; hp--) {
-      h = incs[hp];
-      i = lo + h;
-      while (1) {
-         if (i > hi) break;
-         v = si->loctab[i];
-         j = i;
-         while (si->loctab[j-h].addr > v.addr) {
-            si->loctab[j] = si->loctab[j-h];
-            j = j - h;
-            if (j <= (lo + h - 1)) break;
-         }
-         si->loctab[j] = v;
-         i++;
-      }
-   }
-
-   /* If two adjacent entries overlap, truncate the first. */
-   for (i = 0; i < si->loctab_used-1; i++) {
-      /* .size is a LOC_SIZE_BITS-wide bit-field, so this is its
-         largest possible value. */
-      vg_assert(si->loctab[i].size <= MAX_LOC_SIZE);
-      if (si->loctab[i].addr + si->loctab[i].size > si->loctab[i+1].addr) {
-         /* Do this in signed int32 because the actual .size field
-            is a narrow unsigned bit-field. */
-         Int new_size = si->loctab[i+1].addr - si->loctab[i].addr;
-         if (new_size < 0) {
-            si->loctab[i].size = 0;
-         } else
-         if (new_size > MAX_LOC_SIZE) {
-            /* Clamp to what the bit-field can hold rather than
-               letting the store truncate silently. */
-            si->loctab[i].size = MAX_LOC_SIZE;
-         } else {
-            si->loctab[i].size = (UShort)new_size;
-         }
-      }
-   }
-
-   /* Zap any zero-sized entries resulting from the truncation
-      process. */
-   j = 0;
-   for (i = 0; i < si->loctab_used; i++) {
-      if (si->loctab[i].size > 0) {
-         si->loctab[j] = si->loctab[i];
-         j++;
-      }
-   }
-   si->loctab_used = j;
-
-   /* Ensure relevant postconditions hold. */
-   for (i = 0; i < si->loctab_used-1; i++) {
-      /* No zero-sized symbols. */
-      vg_assert(si->loctab[i].size > 0);
-      /* In order. */
-      vg_assert(si->loctab[i].addr < si->loctab[i+1].addr);
-      /* No overlaps. */
-      vg_assert(si->loctab[i].addr + si->loctab[i].size - 1
-                < si->loctab[i+1].addr);
-   }
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Read STABS format debug info.                        ---*/
-/*------------------------------------------------------------*/
-
-/* Stabs entry types, from:
- *   The "stabs" debug format
- *   Menapace, Kingdon and MacKenzie
- *   Cygnus Support
- *
- * These are the values that read_debuginfo_stabs matches against
- * the n_type field of each stab entry.
- */
-typedef enum { N_GSYM  = 32,    /* Global symbol                    */
-               N_FUN   = 36,    /* Function start or end            */
-               N_STSYM = 38,    /* Data segment file-scope variable */
-               N_LCSYM = 40,    /* BSS segment file-scope variable  */
-               N_RSYM  = 64,    /* Register variable                */
-               N_SLINE = 68,    /* Source line number               */
-               N_SO    = 100,   /* Source file path and name        */
-               N_LSYM  = 128,   /* Stack variable or type           */
-               N_SOL   = 132,   /* Include file name                */
-               N_LBRAC = 192,   /* Start of lexical block           */
-               N_RBRAC = 224    /* End   of lexical block           */
-             } stab_types;
-      
-
-/* Read stabs-format debug info.  This is all rather horrible because
-   stabs is an underspecified, kludgy hack.
-*/
-static
-void read_debuginfo_stabs ( SegInfo* si,
-                            UChar* stabC,   Int stab_sz, 
-                            UChar* stabstr, Int stabstr_sz )
-{
-   Int    i;
-   Int    curr_filenmoff;
-   Addr   curr_fn_stabs_addr = (Addr)NULL;
-   Addr   curr_fnbaseaddr    = (Addr)NULL;
-   Char  *curr_file_name, *curr_fn_name;
-   Int    n_stab_entries;
-   Int    prev_lineno = 0, lineno = 0;
-   Int    lineno_overflows = 0;
-   Bool   same_file = True;
-   struct nlist* stab = (struct nlist*)stabC;
-
-   /* Ok.  It all looks plausible.  Go on and read debug data. 
-         stab kinds: 100   N_SO     a source file name
-                      68   N_SLINE  a source line number
-                      36   N_FUN    start of a function
-
-      In this loop, we maintain a current file name, updated as 
-      N_SO/N_SOLs appear, and a current function base address, 
-      updated as N_FUNs appear.  Based on that, address ranges for 
-      N_SLINEs are calculated, and stuffed into the line info table.
-
-      Finding the instruction address range covered by an N_SLINE is
-      complicated;  see the N_SLINE case below.
-   */
-   curr_filenmoff     = addStr(si,"???");
-   curr_file_name     = curr_fn_name = (Char*)NULL;
-
-   n_stab_entries = stab_sz/(int)sizeof(struct nlist);
-
-   for (i = 0; i < n_stab_entries; i++) {
-#     if 0
-      VG_(printf) ( "   %2d  ", i );
-      VG_(printf) ( "type=0x%x   othr=%d   desc=%d   value=0x%x   strx=%d  %s",
-                    stab[i].n_type, stab[i].n_other, stab[i].n_desc, 
-                    (int)stab[i].n_value,
-                    (int)stab[i].n_un.n_strx, 
-                    stabstr + stab[i].n_un.n_strx );
-      VG_(printf)("\n");
-#     endif
-
-      Char *no_fn_name = "???";
-
-      switch (stab[i].n_type) {
-         UInt next_addr;
-
-         /* Two complicated things here:
-	  *
-          * 1. the n_desc field in 'struct n_list' in a.out.h is only
-          *    16-bits, which gives a maximum of 65535 lines.  We handle
-          *    files bigger than this by detecting heuristically
-          *    overflows -- if the line count goes from 65000-odd to
-          *    0-odd within the same file, we assume it's an overflow.
-          *    Once we switch files, we zero the overflow count.
-          *
-          * 2. To compute the instr address range covered by a single
-          *    line, find the address of the next thing and compute the
-          *    difference.  The approach used depends on what kind of
-          *    entry/entries follow...
-          */
-         case N_SLINE: {
-            Int this_addr = (UInt)stab[i].n_value;
-
-            /* Although stored as a short, neg values really are >
-             * 32768, hence the UShort cast.  Then we use an Int to
-             * handle overflows. */
-            prev_lineno = lineno;
-            lineno      = (Int)((UShort)stab[i].n_desc);
-
-            if (prev_lineno > lineno + OVERFLOW_DIFFERENCE && same_file) {
-               VG_(message)(Vg_DebugMsg, 
-                  "Line number overflow detected (%d --> %d) in %s", 
-                  prev_lineno, lineno, curr_file_name);
-               lineno_overflows++;
-            }
-            same_file = True;
-
-            LOOP:
-            if (i+1 >= n_stab_entries) {
-               /* If it's the last entry, just guess the range is
-                * four; can't do any better */
-               next_addr = this_addr + 4;
-            } else {    
-               switch (stab[i+1].n_type) {
-                  /* Easy, common case: use address of next entry */
-                  case N_SLINE: case N_SO:
-                     next_addr = (UInt)stab[i+1].n_value;
-                     break;
-
-                  /* Boring one: skip, look for something more
-                     useful. */
-                  case N_RSYM: case N_LSYM: case N_LBRAC: case N_RBRAC: 
-                  case N_STSYM: case N_LCSYM: case N_GSYM:
-                     i++;
-                     goto LOOP;
-                     
-                  /* If end-of-this-fun entry, use its address.
-                   * If start-of-next-fun entry, find difference between start
-                   *   of current function and start of next function to work
-                   *   it out.
-                   */
-                  case N_FUN: 
-                     if ('\0' == * (stabstr + stab[i+1].n_un.n_strx) ) {
-                        next_addr = (UInt)stab[i+1].n_value;
-                     } else {
-                        next_addr = 
-                            (UInt)stab[i+1].n_value - curr_fn_stabs_addr;
-                     }
-                     break;
-
-                  /* N_SOL should be followed by an N_SLINE which can
-                     be used */
-                  case N_SOL:
-                     if (i+2 < n_stab_entries && N_SLINE == stab[i+2].n_type) {
-                        next_addr = (UInt)stab[i+2].n_value;
-                        break;
-                     } else {
-                        VG_(printf)("unhandled N_SOL stabs case: %d %d %d", 
-                                    stab[i+1].n_type, i, n_stab_entries);
-                        VG_(panic)("unhandled N_SOL stabs case");
-                     }
-
-                  default:
-                     VG_(printf)("unhandled (other) stabs case: %d %d", 
-                                 stab[i+1].n_type,i);
-                     /* VG_(panic)("unhandled (other) stabs case"); */
-                     next_addr = this_addr + 4;
-                     break;
-               }
-            }
-            
-            addLineInfo ( si, curr_filenmoff, curr_fnbaseaddr + this_addr, 
-                          curr_fnbaseaddr + next_addr,
-                          lineno + lineno_overflows * LINENO_OVERFLOW, i);
-            break;
-         }
-
-         case N_FUN: {
-            if ('\0' != (stabstr + stab[i].n_un.n_strx)[0] ) {
-               /* N_FUN with a name -- indicates the start of a fn.  */
-               curr_fn_stabs_addr = (Addr)stab[i].n_value;
-               curr_fnbaseaddr = si->offset + curr_fn_stabs_addr;
-               curr_fn_name = stabstr + stab[i].n_un.n_strx;
-            } else {
-               curr_fn_name = no_fn_name;
-            }
-            break;
-         }
-
-         case N_SOL:
-            if (lineno_overflows != 0) {
-               VG_(message)(Vg_UserMsg, 
-                            "Warning: file %s is very big (> 65535 lines) "
-                            "Line numbers and annotation for this file might "
-                            "be wrong.  Sorry",
-                            curr_file_name);
-            }
-            /* fall through! */
-         case N_SO: 
-            lineno_overflows = 0;
-
-         /* seems to give lots of locations in header files */
-         /* case 130: */ /* BINCL */
-         { 
-            UChar* nm = stabstr + stab[i].n_un.n_strx;
-            UInt len = VG_(strlen)(nm);
-            
-            if (len > 0 && nm[len-1] != '/') {
-               curr_filenmoff = addStr ( si, nm );
-               curr_file_name = stabstr + stab[i].n_un.n_strx;
-            }
-            else
-               if (len == 0)
-                  curr_filenmoff = addStr ( si, "?1\0" );
-
-            break;
-         }
-
-#        if 0
-         case 162: /* EINCL */
-            curr_filenmoff = addStr ( si, "?2\0" );
-            break;
-#        endif
-
-         default:
-            break;
-      }
-   } /* for (i = 0; i < stab_sz/(int)sizeof(struct nlist); i++) */
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Read DWARF2 format debug info.                       ---*/
-/*------------------------------------------------------------*/
-
-/* Structure found in the .debug_line section.  This is the header of
-   one block of line number info, laid out as raw byte arrays so it
-   can be overlaid directly on the section image;
-   read_debuginfo_dwarf2 decodes it into a DWARF2_Internal_LineInfo.
-
-   li_length           byte count of the rest of the block, not
-                       counting this field itself
-   li_version          format version; the reader only accepts 2
-   li_prologue_length  decoded by the reader but not otherwise used
-   li_min_insn_length  multiplier applied to address advances
-   li_default_is_stmt  initial value of the is_stmt register
-   li_line_base        signed base for special-opcode line advances
-   li_line_range       divisor used to split a special opcode into
-                       an address advance and a line advance
-   li_opcode_base      number of the first special opcode; the
-                       standard-opcode length table which follows the
-                       header has (li_opcode_base - 1) entries  */
-typedef struct
-{
-  UChar li_length          [4];
-  UChar li_version         [2];
-  UChar li_prologue_length [4];
-  UChar li_min_insn_length [1];
-  UChar li_default_is_stmt [1];
-  UChar li_line_base       [1];
-  UChar li_line_range      [1];
-  UChar li_opcode_base     [1];
-}
-DWARF2_External_LineInfo;
-
-/* Decoded, natively-typed copy of a DWARF2_External_LineInfo header,
-   filled in field by field by read_debuginfo_dwarf2.  li_line_base
-   is an Int (not a UChar) because the reader sign-extends the single
-   byte found in the file.  */
-typedef struct
-{
-  UInt   li_length;
-  UShort li_version;
-  UInt   li_prologue_length;
-  UChar  li_min_insn_length;
-  UChar  li_default_is_stmt;
-  Int    li_line_base;
-  UChar  li_line_range;
-  UChar  li_opcode_base;
-}
-DWARF2_Internal_LineInfo;
-
-/* Line number opcodes.  These are the standard opcodes of the line
-   number program; any opcode byte >= li_opcode_base is instead a
-   "special" opcode which advances both address and line.  As decoded
-   by read_debuginfo_dwarf2:
-     - extended_op is followed by a length-prefixed extended opcode
-       (see enum dwarf_line_number_x_ops);
-     - advance_pc, set_file, set_column and set_isa take one unsigned
-       LEB128 operand, advance_line takes one signed LEB128 operand;
-     - fixed_advance_pc takes a 2-byte operand;
-     - the remaining opcodes take no operand.  */
-enum dwarf_line_number_ops
-  {
-    DW_LNS_extended_op = 0,
-    DW_LNS_copy = 1,
-    DW_LNS_advance_pc = 2,
-    DW_LNS_advance_line = 3,
-    DW_LNS_set_file = 4,
-    DW_LNS_set_column = 5,
-    DW_LNS_negate_stmt = 6,
-    DW_LNS_set_basic_block = 7,
-    DW_LNS_const_add_pc = 8,
-    DW_LNS_fixed_advance_pc = 9,
-    /* DWARF 3.  */
-    DW_LNS_set_prologue_end = 10,
-    DW_LNS_set_epilogue_begin = 11,
-    DW_LNS_set_isa = 12
-  };
-
-/* Line number extended opcodes.  These follow a DW_LNS_extended_op
-   byte and a LEB128 length, and are handled by
-   process_extended_line_op:
-     end_sequence  emits a final line record and resets the state
-                   machine;
-     set_address   loads the address register from an Addr-sized
-                   operand;
-     define_file   appends a file name (NUL-terminated string followed
-                   by three LEB128 values) to the file name table.  */
-enum dwarf_line_number_x_ops
-  {
-    DW_LNE_end_sequence = 1,
-    DW_LNE_set_address = 2,
-    DW_LNE_define_file = 3
-  };
-
-/* The registers of the line number state machine which the DWARF2
-   reader steps through while interpreting a line number program.
-   reset_state_machine gives them their initial values.  */
-typedef struct State_Machine_Registers
-{
-  Addr  address;
-  UInt  file;
-  UInt  line;
-  UInt  column;
-  Int   is_stmt;
-  Int   basic_block;
-  Int   end_sequence;
-  /* This variable holds the number of the last entry seen
-     in the File Table.  It is also used as the index at which the
-     next file name is stored.  */
-  UInt  last_file_entry;
-} SMR;
-
-
-/* Decode one LEB128-encoded number starting at data.
-
-   data           points at the first byte of the encoding.
-   length_return  if non-NULL, receives the number of bytes consumed.
-   sign           nonzero to decode a signed LEB128 (the result is
-                  sign-extended to 32 bits), zero for unsigned.
-
-   Returns the decoded value as a UInt.  Encodings which carry more
-   than 32 bits of payload are consumed in full, but the bits which do
-   not fit in the result are dropped rather than being shifted by an
-   out-of-range amount, which is undefined behaviour in C.  */
-static 
-UInt read_leb128 ( UChar* data, Int* length_return, Int sign )
-{
-  UInt   result = 0;
-  UInt   num_read = 0;
-  Int    shift = 0;
-  UChar  byte;
-
-  do
-    {
-      byte = * data ++;
-      num_read ++;
-
-      /* Do the shift on an unsigned value, and only while it is still
-         within the width of the result. */
-      if (shift < 32)
-        result |= ((UInt)(byte & 0x7f)) << shift;
-
-      shift += 7;
-
-    }
-  while (byte & 0x80);
-
-  if (length_return != NULL)
-    * length_return = num_read;
-
-  /* Sign-extend from the top payload bit of the last byte.  ~0 << n
-     on an unsigned operand yields the same bit pattern the original
-     (-1 << n) was after, without left-shifting a negative value. */
-  if (sign && (shift < 32) && (byte & 0x40))
-    result |= (~(UInt)0) << shift;
-
-  return result;
-}
-
-
-/* The single set of state machine registers used by the DWARF2 line
-   number reader. */
-static SMR state_machine_regs;
-
-/* Return every state machine register to its starting value.
-   is_stmt supplies the initial value of the is_stmt register; all
-   other registers get fixed defaults (file and line start at 1,
-   everything else at 0). */
-static 
-void reset_state_machine ( Int is_stmt )
-{
-  SMR* regs = &state_machine_regs;
-
-  if (0) VG_(printf)("smr.a := %p (reset)\n", 0 );
-
-  /* Position registers. */
-  regs->address         = 0;
-  regs->file            = 1;
-  regs->line            = 1;
-  regs->column          = 0;
-
-  /* Flags. */
-  regs->is_stmt         = is_stmt;
-  regs->basic_block     = 0;
-  regs->end_sequence    = 0;
-
-  /* File table bookkeeping. */
-  regs->last_file_entry = 0;
-}
-
-/* Handle one extended line op.
-
-   si            the SegInfo which receives line info and file names.
-   fnames        address of the caller's file name table (an array of
-                 string-table offsets returned by addStr, indexed by
-                 file number); it is allocated or grown here when a
-                 DW_LNE_define_file op is seen.
-   data          points just after the DW_LNS_extended_op byte, i.e.
-                 at the LEB128 length of the extended op.
-   is_stmt       value handed to reset_state_machine when the
-                 sequence ends.
-   pointer_size  size of a target address; only 4 is supported.
-
-   Returns the number of bytes the caller must add to data to step
-   over the whole op: the size of the length field plus the length it
-   encodes.  If the encoded length is zero the op is reported as
-   malformed and only the size of the length field is returned.  Note
-   that the return value is a byte count, not an end-of-sequence
-   flag.  */
-static 
-int process_extended_line_op( SegInfo *si, UInt** fnames, 
-                              UChar* data, Int is_stmt, Int pointer_size)
-{
-  UChar   op_code;
-  Int     bytes_read;
-  UInt    len;
-  UChar * name;
-  Addr    adr;
-
-  len = read_leb128 (data, & bytes_read, 0);
-  data += bytes_read;
-
-  if (len == 0)
-    {
-      VG_(message)(Vg_UserMsg,
-         "badly formed extended line op encountered!\n");
-      return bytes_read;
-    }
-
-  /* From here on len is the total size of the op, length field
-     included. */
-  len += bytes_read;
-  op_code = * data ++;
-
-
-  switch (op_code)
-    {
-    case DW_LNE_end_sequence:
-      if (0) VG_(printf)("1001: si->o %p, smr.a %p\n", 
-                         si->offset, state_machine_regs.address );
-      state_machine_regs.end_sequence = 1; /* JRS: added for compliance
-         with spec; is pointless due to reset_state_machine below 
-      */
-      /* The file name table does not exist until the first file name
-         has been seen; without it there is nothing to index, so skip
-         the record instead of dereferencing a NULL table. */
-      if (*fnames != NULL)
-        addLineInfo (si, (*fnames)[state_machine_regs.file], 
-                         si->offset + (state_machine_regs.address - 1), 
-                         si->offset + (state_machine_regs.address), 
-                         0, 0);
-      reset_state_machine (is_stmt);
-      break;
-
-    case DW_LNE_set_address:
-      /* XXX: Pointer size could be 8 */
-      vg_assert(pointer_size == 4);
-      adr = *((Addr *)data);
-      if (0) VG_(printf)("smr.a := %p\n", adr );
-      state_machine_regs.address = adr;
-      break;
-
-    case DW_LNE_define_file:
-      /* NOTE(review): reset_state_machine zeroes last_file_entry, so
-         a define_file arriving after an end_sequence resizes the
-         table down to two entries -- confirm this is intended. */
-      ++ state_machine_regs.last_file_entry;
-      name = data;
-      if (*fnames == NULL)
-        *fnames = VG_(malloc)(VG_AR_SYMTAB, sizeof (UInt) * 2);
-      else
-        *fnames = VG_(realloc)(
-                     VG_AR_SYMTAB, *fnames, 
-                     sizeof(UInt) 
-                        * (state_machine_regs.last_file_entry + 1));
-      (*fnames)[state_machine_regs.last_file_entry] = addStr (si,name);
-      /* Step over the name and the three LEB128 values which follow
-         it; their values are not used. */
-      data += VG_(strlen) ((char *) data) + 1;
-      read_leb128 (data, & bytes_read, 0);
-      data += bytes_read;
-      read_leb128 (data, & bytes_read, 0);
-      data += bytes_read;
-      read_leb128 (data, & bytes_read, 0);
-      break;
-
-    default:
-      /* Unknown extended op: the length prefix lets the caller skip
-         it. */
-      break;
-    }
-
-  return len;
-}
-
-
-/* Read the DWARF2 line number info held in the .debug_line section
-   image dwarf2[0 .. dwarf2_sz-1] and record it in si via addLineInfo.
-
-   The section is processed as a sequence of blocks.  Each block is a
-   DWARF2_External_LineInfo header, then the standard-opcode length
-   table, the directory table, the file name table, and finally the
-   line number program, which is interpreted with the state machine
-   in state_machine_regs.  Reading stops at the first block which is
-   unsupported (64-bit DWARF, version other than 2) or malformed;
-   anything recorded before that point is kept.  */
-static
-void read_debuginfo_dwarf2 ( SegInfo* si, UChar* dwarf2, Int dwarf2_sz )
-{
-  DWARF2_External_LineInfo * external;
-  DWARF2_Internal_LineInfo   info;
-  UChar *            standard_opcodes;
-  UChar *            data = dwarf2;
-  UChar *            end  = dwarf2 + dwarf2_sz;
-  UChar *            end_of_sequence;
-  UInt  *            fnames = NULL;
-
-  /* Fails due to gcc padding ...
-  vg_assert(sizeof(DWARF2_External_LineInfo)
-            == sizeof(DWARF2_Internal_LineInfo));
-  */
-
-  while (data < end)
-    {
-      /* Don't decode a header which does not fit in what is left of
-         the section. */
-      if ((UInt)(end - data) < sizeof (DWARF2_External_LineInfo))
-       {
-         vg_symerr("DWARF line info appears to be corrupt "
-                   "- the section is too small");
-         return;
-       }
-
-      external = (DWARF2_External_LineInfo *) data;
-
-      /* Check the length of the block.  */
-      info.li_length = * ((UInt *)(external->li_length));
-
-      if (info.li_length == 0xffffffff)
-       {
-         vg_symerr("64-bit DWARF line info is not supported yet.");
-         break;
-       }
-
-      /* The block must fit in the bytes remaining from data, not
-         merely in the section as a whole; otherwise a bad length in
-         a later block would go unnoticed.  Written as a subtraction
-         so that a huge li_length cannot wrap around. */
-      if (info.li_length 
-          > (UInt)(end - data) - sizeof (external->li_length))
-       {
-         vg_symerr("DWARF line info appears to be corrupt "
-                   "- the section is too small");
-         return;
-       }
-
-      /* Check its version number.  */
-      info.li_version = * ((UShort *) (external->li_version));
-      if (info.li_version != 2)
-       {
-         vg_symerr("Only DWARF version 2 line info "
-                   "is currently supported.");
-         return;
-       }
-
-      info.li_prologue_length = * ((UInt *) (external->li_prologue_length));
-      info.li_min_insn_length = * ((UChar *)(external->li_min_insn_length));
-      info.li_default_is_stmt = * ((UChar *)(external->li_default_is_stmt));
-
-      /* JRS: changed (UInt*) to (UChar*) */
-      info.li_line_base       = * ((UChar *)(external->li_line_base));
-
-      info.li_line_range      = * ((UChar *)(external->li_line_range));
-      info.li_opcode_base     = * ((UChar *)(external->li_opcode_base));
-
-      /* li_line_range is used as a divisor when decoding special
-         opcodes and DW_LNS_const_add_pc, so zero cannot be allowed. */
-      if (info.li_line_range == 0)
-       {
-         vg_symerr("DWARF line info appears to be corrupt "
-                   "- line_range is zero");
-         return;
-       }
-
-      /* Sign extend the line base field.  */
-      info.li_line_base <<= 24;
-      info.li_line_base >>= 24;
-
-      end_of_sequence = data + info.li_length 
-                             + sizeof (external->li_length);
-
-      reset_state_machine (info.li_default_is_stmt);
-
-      /* Read the contents of the Opcodes table.  */
-      standard_opcodes = data + sizeof (* external);
-
-      /* Read the contents of the Directory table.  */
-      data = standard_opcodes + info.li_opcode_base - 1;
-
-      /* We ignore the directory table, since gcc gives the entire
-         path as part of the filename.  The loop can only finish on
-         the NUL which terminates the table. */
-      while (* data != 0)
-        {
-          data += VG_(strlen) ((char *) data) + 1;
-        }
-
-      /* Skip the NUL at the end of the table.  */
-      data ++;
-
-      /* Read the contents of the File Name table.  Each entry is a
-         NUL-terminated name followed by three LEB128 values, which
-         are skipped.  As above, the loop finishes on the table's
-         terminating NUL. */
-      while (* data != 0)
-        {
-          UChar * name;
-          Int bytes_read;
-
-          ++ state_machine_regs.last_file_entry;
-          name = data;
-          /* Since we don't have realloc (0, ....) == malloc (...)
-             semantics, we need to malloc the first time. */
-
-          if (fnames == NULL)
-            fnames = VG_(malloc)(VG_AR_SYMTAB, sizeof (UInt) * 2);
-          else
-            fnames = VG_(realloc)(VG_AR_SYMTAB, fnames, 
-                        sizeof(UInt) 
-                           * (state_machine_regs.last_file_entry + 1));
-          data += VG_(strlen) ((Char *) data) + 1;
-          fnames[state_machine_regs.last_file_entry] = addStr (si,name);
-
-          read_leb128 (data, & bytes_read, 0);
-          data += bytes_read;
-          read_leb128 (data, & bytes_read, 0);
-          data += bytes_read;
-          read_leb128 (data, & bytes_read, 0);
-          data += bytes_read;
-        }
-
-      /* Skip the NUL at the end of the table.  */
-      data ++;
-
-      /* Now interpret the line number program.  */
-
-      while (data < end_of_sequence)
-       {
-         UChar op_code;
-         Int           adv;
-         Int           bytes_read;
-
-         op_code = * data ++;
-
-         if (op_code >= info.li_opcode_base)
-           {
-             /* Special opcode: one byte encodes both an address
-                advance and a line advance. */
-             Int advAddr;
-             op_code -= info.li_opcode_base;
-             adv      = (op_code / info.li_line_range) 
-                           * info.li_min_insn_length;
-             advAddr = adv;
-             state_machine_regs.address += adv;
-             if (0) VG_(printf)("smr.a += %p\n", adv );
-             adv = (op_code % info.li_line_range) + info.li_line_base;
-             if (0) VG_(printf)("1002: si->o %p, smr.a %p\n", 
-                                si->offset, state_machine_regs.address );
-             /* No file name table means no name to attach; skip the
-                record rather than index a NULL table. */
-             if (fnames != NULL)
-               addLineInfo (si, fnames[state_machine_regs.file], 
-                                si->offset + (state_machine_regs.address 
-                                              - advAddr), 
-                                si->offset + (state_machine_regs.address), 
-                                state_machine_regs.line, 0);
-             state_machine_regs.line += adv;
-           }
-         else switch (op_code)
-           {
-           case DW_LNS_extended_op:
-             data += process_extended_line_op (
-                        si, &fnames, data, 
-                        info.li_default_is_stmt, sizeof (Addr));
-             break;
-
-           case DW_LNS_copy:
-             if (0) VG_(printf)("1002: si->o %p, smr.a %p\n", 
-                                si->offset, state_machine_regs.address );
-             if (fnames != NULL)
-               addLineInfo (si, fnames[state_machine_regs.file], 
-                                si->offset + state_machine_regs.address, 
-                                si->offset + (state_machine_regs.address + 1),
-                                state_machine_regs.line , 0);
-             state_machine_regs.basic_block = 0; /* JRS added */
-             break;
-
-           case DW_LNS_advance_pc:
-             adv = info.li_min_insn_length 
-                      * read_leb128 (data, & bytes_read, 0);
-             data += bytes_read;
-             state_machine_regs.address += adv;
-             if (0) VG_(printf)("smr.a += %p\n", adv );
-             break;
-
-           case DW_LNS_advance_line:
-             adv = read_leb128 (data, & bytes_read, 1);
-             data += bytes_read;
-             state_machine_regs.line += adv;
-             break;
-
-           case DW_LNS_set_file:
-             adv = read_leb128 (data, & bytes_read, 0);
-             data += bytes_read;
-             state_machine_regs.file = adv;
-             break;
-
-           case DW_LNS_set_column:
-             adv = read_leb128 (data, & bytes_read, 0);
-             data += bytes_read;
-             state_machine_regs.column = adv;
-             break;
-
-           case DW_LNS_negate_stmt:
-             adv = state_machine_regs.is_stmt;
-             adv = ! adv;
-             state_machine_regs.is_stmt = adv;
-             break;
-
-           case DW_LNS_set_basic_block:
-             state_machine_regs.basic_block = 1;
-             break;
-
-           case DW_LNS_const_add_pc:
-             /* Same address advance as special opcode 255 would
-                give. */
-             adv = (((255 - info.li_opcode_base) / info.li_line_range)
-                    * info.li_min_insn_length);
-             state_machine_regs.address += adv;
-             if (0) VG_(printf)("smr.a += %p\n", adv );
-             break;
-
-           case DW_LNS_fixed_advance_pc:
-             /* XXX: Need something to get 2 bytes */
-             adv = *((UShort *)data);
-             data += 2;
-             state_machine_regs.address += adv;
-             if (0) VG_(printf)("smr.a += %p\n", adv );
-             break;
-
-           case DW_LNS_set_prologue_end:
-             break;
-
-           case DW_LNS_set_epilogue_begin:
-             break;
-
-           case DW_LNS_set_isa:
-             adv = read_leb128 (data, & bytes_read, 0);
-             data += bytes_read;
-             break;
-
-           default:
-             {
-               /* Unknown standard opcode: the opcode length table
-                  says how many LEB128 operands to skip. */
-               int j;
-               for (j = standard_opcodes[op_code - 1]; j > 0 ; --j)
-                 {
-                   read_leb128 (data, &bytes_read, 0);
-                   data += bytes_read;
-                 }
-             }
-             break;
-           }
-       }
-      /* The file name table belongs to this block only. */
-      VG_(free)(VG_AR_SYMTAB, fnames);
-      fnames = NULL;
-    }
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Read info from a .so/exe file.                       ---*/
-/*------------------------------------------------------------*/
-
-/* Read the symbols from the object/exe specified by the SegInfo into
-   the tables within the supplied SegInfo.
-
-   The file named by si->filename is mmap'd read-only, checked to be
-   a 32-bit little-endian x86 ELF executable or shared object, and
-   then mined in two passes: first the function symbols from
-   .symtab/.strtab (added with addSym), then line number info from
-   .stab/.stabstr and/or .debug_line.  The mapping is released before
-   returning, on every path.  Problems are reported through vg_symerr
-   and abandon the read; whatever was recorded up to that point stays
-   in si.  */
-static
-void vg_read_lib_symbols ( SegInfo* si )
-{
-   Elf32_Ehdr*   ehdr;       /* The ELF header                          */
-   Elf32_Shdr*   shdr;       /* The section table                       */
-   UChar*        sh_strtab;  /* The section table's string table        */
-   UChar*        stab;       /* The .stab table                         */
-   UChar*        stabstr;    /* The .stab string table                  */
-   UChar*        dwarf2;     /* The DWARF2 location info table          */
-   Int           stab_sz;    /* Size in bytes of the .stab table        */
-   Int           stabstr_sz; /* Size in bytes of the .stab string table */
-   Int           dwarf2_sz;  /* Size in bytes of the DWARF2 srcloc table*/
-   Int           fd;
-   Int           i;
-   Bool          ok;
-   Addr          oimage;
-   Int           n_oimage;
-   struct vki_stat stat_buf;
-
-   oimage = (Addr)NULL;
-   if (VG_(clo_verbosity) > 1)
-      VG_(message)(Vg_UserMsg, "Reading syms from %s", si->filename );
-
-   /* mmap the object image aboard, so that we can read symbols and
-      line number info out of it.  It will be munmapped immediately
-      thereafter; it is only aboard transiently. */
-
-   i = VG_(stat)(si->filename, &stat_buf);
-   if (i != 0) {
-      vg_symerr("Can't stat .so/.exe (to determine its size)?!");
-      return;
-   }
-   n_oimage = stat_buf.st_size;
-
-   fd = VG_(open_read)(si->filename);
-   if (fd == -1) {
-      vg_symerr("Can't open .so/.exe to read symbols?!");
-      return;
-   }
-
-   oimage = (Addr)VG_(mmap)( NULL, n_oimage, 
-                             VKI_PROT_READ, VKI_MAP_PRIVATE, fd, 0 );
-   if (oimage == ((Addr)(-1))) {
-      VG_(message)(Vg_UserMsg,
-                   "mmap failed on %s", si->filename );
-      VG_(close)(fd);
-      return;
-   }
-
-   VG_(close)(fd);
-
-   /* Ok, the object image is safely in oimage[0 .. n_oimage-1]. 
-      Now verify that it is a valid ELF .so or executable image.
-   */
-   ok = (n_oimage >= sizeof(Elf32_Ehdr));
-   ehdr = (Elf32_Ehdr*)oimage;
-
-   if (ok) {
-      ok &= (ehdr->e_ident[EI_MAG0] == 0x7F
-             && ehdr->e_ident[EI_MAG1] == 'E'
-             && ehdr->e_ident[EI_MAG2] == 'L'
-             && ehdr->e_ident[EI_MAG3] == 'F');
-      ok &= (ehdr->e_ident[EI_CLASS] == ELFCLASS32
-             && ehdr->e_ident[EI_DATA] == ELFDATA2LSB
-             && ehdr->e_ident[EI_VERSION] == EV_CURRENT);
-      ok &= (ehdr->e_type == ET_EXEC || ehdr->e_type == ET_DYN);
-      ok &= (ehdr->e_machine == EM_386);
-      ok &= (ehdr->e_version == EV_CURRENT);
-      ok &= (ehdr->e_shstrndx != SHN_UNDEF);
-      ok &= (ehdr->e_shoff != 0 && ehdr->e_shnum != 0);
-      /* e_shstrndx is used below to index the section table, so it
-         has to name one of its entries. */
-      ok &= (ehdr->e_shstrndx < ehdr->e_shnum);
-   }
-
-   if (!ok) {
-      vg_symerr("Invalid ELF header, or missing stringtab/sectiontab.");
-      VG_(munmap) ( (void*)oimage, n_oimage );
-      return;
-   }
-
-   if (VG_(clo_trace_symtab))
-      VG_(printf)( 
-          "shoff = %d,  shnum = %d,  size = %d,  n_vg_oimage = %d\n",
-          ehdr->e_shoff, ehdr->e_shnum, sizeof(Elf32_Shdr), n_oimage );
-
-   if (ehdr->e_shoff + ehdr->e_shnum*sizeof(Elf32_Shdr) > n_oimage) {
-      vg_symerr("ELF section header is beyond image end?!");
-      VG_(munmap) ( (void*)oimage, n_oimage );
-      return;
-   }
-
-   shdr = (Elf32_Shdr*)(oimage + ehdr->e_shoff);
-
-   /* The section name string table has to start inside the image
-      before any section name can be looked at. */
-   if (shdr[ehdr->e_shstrndx].sh_offset >= (UInt)n_oimage) {
-      vg_symerr("ELF section name table is beyond image end?!");
-      VG_(munmap) ( (void*)oimage, n_oimage );
-      return;
-   }
-   sh_strtab = (UChar*)(oimage + shdr[ehdr->e_shstrndx].sh_offset);
-
-   /* try and read the object's symbol table */
-   {
-      UChar*     o_strtab    = NULL;
-      Elf32_Sym* o_symtab    = NULL;
-      UInt       o_strtab_sz = 0;
-      UInt       o_symtab_sz = 0;
-
-      UChar*     o_got = NULL;
-      UChar*     o_plt = NULL;
-      UInt       o_got_sz = 0;
-      UInt       o_plt_sz = 0;
-
-      Bool       snaffle_it;
-      Addr       sym_addr;
-
-      /* find the .symtab, .strtab, .got and .plt sections */
-      for (i = 0; i < ehdr->e_shnum; i++) {
-         if (0 == VG_(strcmp)(".symtab",sh_strtab + shdr[i].sh_name)) {
-            /* Check for image overrun; written so that the sum of
-               offset and size cannot wrap around. */
-            if (shdr[i].sh_offset > (UInt)n_oimage
-                || shdr[i].sh_size 
-                   > (UInt)n_oimage - shdr[i].sh_offset) {
-               vg_symerr("   ELF .symtab is beyond image end?!");
-               VG_(munmap) ( (void*)oimage, n_oimage );
-               return;
-            }
-            o_symtab    = (Elf32_Sym*)(oimage + shdr[i].sh_offset);
-            o_symtab_sz = shdr[i].sh_size;
-            vg_assert((o_symtab_sz % sizeof(Elf32_Sym)) == 0);
-         }
-         if (0 == VG_(strcmp)(".strtab",sh_strtab + shdr[i].sh_name)) {
-            if (shdr[i].sh_offset > (UInt)n_oimage
-                || shdr[i].sh_size 
-                   > (UInt)n_oimage - shdr[i].sh_offset) {
-               vg_symerr("   ELF .strtab is beyond image end?!");
-               VG_(munmap) ( (void*)oimage, n_oimage );
-               return;
-            }
-            o_strtab    = (UChar*)(oimage + shdr[i].sh_offset);
-            o_strtab_sz = shdr[i].sh_size;
-         }
-
-         /* find out where the .got and .plt sections will be in the
-            executable image, not in the object image transiently loaded.
-            These addresses are only compared against, never read
-            through, so there is no image overrun to check for.
-         */
-         if (0 == VG_(strcmp)(".got",sh_strtab + shdr[i].sh_name)) {
-            o_got    = (UChar*)(si->offset
-                                + shdr[i].sh_offset);
-            o_got_sz = shdr[i].sh_size;
-         }
-         if (0 == VG_(strcmp)(".plt",sh_strtab + shdr[i].sh_name)) {
-            o_plt    = (UChar*)(si->offset
-                                + shdr[i].sh_offset);
-            o_plt_sz = shdr[i].sh_size;
-         }
-
-      }
-
-      if (VG_(clo_trace_symtab)) {
-         if (o_plt) VG_(printf)( "PLT: %p .. %p\n",
-                                 o_plt, o_plt + o_plt_sz - 1 );
-         if (o_got) VG_(printf)( "GOT: %p .. %p\n",
-                                 o_got, o_got + o_got_sz - 1 );
-      }
-
-      if (o_strtab == NULL || o_symtab == NULL) {
-         vg_symerr("   object doesn't have a symbol table");
-      } else {
-         /* Start at i = 1; ELF docs suggest that entry 0 always
-            denotes `unknown symbol'. */
-         for (i = 1; i < o_symtab_sz/sizeof(Elf32_Sym); i++){
-#           if 0
-            VG_(printf)("raw symbol: ");
-            switch (ELF32_ST_BIND(o_symtab[i].st_info)) {
-               case STB_LOCAL:  VG_(printf)("LOC "); break;
-               case STB_GLOBAL: VG_(printf)("GLO "); break;
-               case STB_WEAK:   VG_(printf)("WEA "); break;
-               case STB_LOPROC: VG_(printf)("lop "); break;
-               case STB_HIPROC: VG_(printf)("hip "); break;
-               default:         VG_(printf)("??? "); break;
-            }
-            switch (ELF32_ST_TYPE(o_symtab[i].st_info)) {
-               case STT_NOTYPE:  VG_(printf)("NOT "); break;
-               case STT_OBJECT:  VG_(printf)("OBJ "); break;
-               case STT_FUNC:    VG_(printf)("FUN "); break;
-               case STT_SECTION: VG_(printf)("SEC "); break;
-               case STT_FILE:    VG_(printf)("FIL "); break;
-               case STT_LOPROC:  VG_(printf)("lop "); break;
-               case STT_HIPROC:  VG_(printf)("hip "); break;
-               default:          VG_(printf)("??? "); break;
-            }
-            VG_(printf)(
-                ": value %p, size %d, name %s\n",
-                si->offset+(UChar*)o_symtab[i].st_value,
-                o_symtab[i].st_size,
-                o_symtab[i].st_name 
-                   ? ((Char*)o_strtab+o_symtab[i].st_name) 
-                   : (Char*)"NONAME");                
-#           endif
-
-            /* A name offset outside the string table cannot be
-               looked at safely; ignore such a symbol altogether. */
-            if (o_symtab[i].st_name >= o_strtab_sz)
-               continue;
-
-            /* Figure out if we're interested in the symbol.
-               Firstly, is it of the right flavour? 
-            */
-            snaffle_it
-               =  ( (ELF32_ST_BIND(o_symtab[i].st_info) == STB_GLOBAL ||
-                     ELF32_ST_BIND(o_symtab[i].st_info) == STB_LOCAL /* ||
-                     ELF32_ST_BIND(o_symtab[i].st_info) == STB_WEAK */)
-                    &&
-                    (ELF32_ST_TYPE(o_symtab[i].st_info) == STT_FUNC /*||
-                     ELF32_ST_TYPE(o_symtab[i].st_info) == STT_OBJECT*/)
-                  );
-
-            /* Secondly, if it's apparently in a GOT or PLT, it's really
-               a reference to a symbol defined elsewhere, so ignore it. 
-            */
-            sym_addr = si->offset
-                       + (UInt)o_symtab[i].st_value;
-            if (o_got != NULL
-                && sym_addr >= (Addr)o_got 
-                && sym_addr < (Addr)(o_got+o_got_sz)) {
-               snaffle_it = False;
-               if (VG_(clo_trace_symtab)) {
-                  VG_(printf)( "in GOT: %s\n", 
-                               o_strtab+o_symtab[i].st_name);
-               }
-            }
-            if (o_plt != NULL
-                && sym_addr >= (Addr)o_plt 
-                && sym_addr < (Addr)(o_plt+o_plt_sz)) {
-               snaffle_it = False;
-               if (VG_(clo_trace_symtab)) {
-                  VG_(printf)( "in PLT: %s\n", 
-                               o_strtab+o_symtab[i].st_name);
-               }
-            }
-
-            /* Don't bother if nameless, or zero-sized. */
-            if (snaffle_it
-                && (o_symtab[i].st_name == (Elf32_Word)NULL
-                    || /* VG_(strlen)(o_strtab+o_symtab[i].st_name) == 0 */
-                       /* equivalent but cheaper ... */
-                       * ((UChar*)(o_strtab+o_symtab[i].st_name)) == 0
-                    || o_symtab[i].st_size == 0)) {
-               snaffle_it = False;
-               if (VG_(clo_trace_symtab)) {
-                  VG_(printf)( "size=0: %s\n", 
-                               o_strtab+o_symtab[i].st_name);
-               }
-            }
-
-#           if 0
-            /* Avoid _dl_ junk.  (Why?) */
-            /* 01-02-24: disabled until I find out if it really helps. */
-            if (snaffle_it
-                && (VG_(strncmp)("_dl_", o_strtab+o_symtab[i].st_name, 4) == 0
-                    || VG_(strncmp)("_r_debug", 
-                                   o_strtab+o_symtab[i].st_name, 8) == 0)) {
-               snaffle_it = False;
-               if (VG_(clo_trace_symtab)) {
-                  VG_(printf)( "_dl_ junk: %s\n", 
-                               o_strtab+o_symtab[i].st_name);
-               }
-            }
-#           endif
-
-            /* This seems to significantly reduce the number of junk
-               symbols, and particularly reduces the number of
-               overlapping address ranges.  Don't ask me why ... */
-            if (snaffle_it && (Int)o_symtab[i].st_value == 0) {
-               snaffle_it = False;
-               if (VG_(clo_trace_symtab)) {
-                  VG_(printf)( "valu=0: %s\n", 
-                               o_strtab+o_symtab[i].st_name);
-               }
-            }
-
-            /* If no part of the symbol falls within the mapped range,
-               ignore it. */
-            if (sym_addr+o_symtab[i].st_size <= si->start
-                || sym_addr >= si->start+si->size) {
-               snaffle_it = False;
-            }
-
-            if (snaffle_it) {
-               /* it's an interesting symbol; record ("snaffle") it. */
-               RiSym sym;
-               Char* t0 = o_symtab[i].st_name 
-                             ? (Char*)(o_strtab+o_symtab[i].st_name) 
-                             : (Char*)"NONAME";
-               Int nmoff = addStr ( si, t0 );
-               vg_assert(nmoff >= 0 
-                         /* && 0==VG_(strcmp)(t0,&vg_strtab[nmoff]) */ );
-               vg_assert( (Int)o_symtab[i].st_value >= 0);
-               /* VG_(printf)("%p + %d:   %s\n", si->addr, 
-                              (Int)o_symtab[i].st_value, t0 ); */
-               sym.addr  = sym_addr;
-               sym.size  = o_symtab[i].st_size;
-               sym.nmoff = nmoff;
-               addSym ( si, &sym );
-            }
-         }
-      }
-   }
-
-   /* Reading of the stabs and/or dwarf2 debug format information, if
-      any. */
-   stabstr    = NULL;
-   stab       = NULL;
-   dwarf2     = NULL;
-   stabstr_sz = 0;
-   stab_sz    = 0;
-   dwarf2_sz  = 0;
-
-   /* find the .stabstr / .stab / .debug_line sections */
-   for (i = 0; i < ehdr->e_shnum; i++) {
-      if (0 == VG_(strcmp)(".stab",sh_strtab + shdr[i].sh_name)) {
-         stab = (UChar*)(oimage + shdr[i].sh_offset);
-         stab_sz = shdr[i].sh_size;
-      }
-      if (0 == VG_(strcmp)(".stabstr",sh_strtab + shdr[i].sh_name)) {
-         stabstr = (UChar*)(oimage + shdr[i].sh_offset);
-         stabstr_sz = shdr[i].sh_size;
-      }
-      if (0 == VG_(strcmp)(".debug_line",sh_strtab + shdr[i].sh_name)) {
-         dwarf2 = (UChar *)(oimage + shdr[i].sh_offset);
-         dwarf2_sz = shdr[i].sh_size;
-      }
-   }
-
-   /* Stabs needs both of its sections to be usable; DWARF2 needs
-      just .debug_line. */
-   if ((stab == NULL || stabstr == NULL) && dwarf2 == NULL) {
-      vg_symerr("   object doesn't have any debug info");
-      VG_(munmap) ( (void*)oimage, n_oimage );
-      return;
-   }
-
-   if ( stab_sz + (UChar*)stab > n_oimage + (UChar*)oimage
-        || stabstr_sz + (UChar*)stabstr 
-           > n_oimage + (UChar*)oimage ) {
-      vg_symerr("   ELF (stabs) debug data is beyond image end?!");
-      VG_(munmap) ( (void*)oimage, n_oimage );
-      return;
-   }
-
-   if ( dwarf2_sz + (UChar*)dwarf2 > n_oimage + (UChar*)oimage ) {
-      vg_symerr("   ELF (dwarf2) debug data is beyond image end?!");
-      VG_(munmap) ( (void*)oimage, n_oimage );
-      return;
-   }
-
-   /* Looks plausible.  Go on and read debug data. */
-   if (stab != NULL && stabstr != NULL) {
-      read_debuginfo_stabs ( si, stab, stab_sz, stabstr, stabstr_sz );
-   }
-
-   if (dwarf2 != NULL) {
-      read_debuginfo_dwarf2 ( si, dwarf2, dwarf2_sz );
-   }
-
-   /* Last, but not least, heave the oimage back overboard. */
-   VG_(munmap) ( (void*)oimage, n_oimage );
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Main entry point for symbols table reading.          ---*/
-/*------------------------------------------------------------*/
-
-/* The root structure for the entire symbol table system.  It is a
-   linked list of SegInfos.  Note that this entire mechanism assumes
-   that what we read from /proc/self/maps doesn't contain overlapping
-   address ranges, and as a result the SegInfos in this list describe
-   disjoint address ranges. 
-*/
-static SegInfo* segInfo = NULL;
-
-
-static
-void read_symtab_callback ( 
-        Addr start, UInt size, 
-        Char rr, Char ww, Char xx, 
-        UInt foffset, UChar* filename )
-{
-   /* Called once per mapping by VG_(read_procselfmaps).  Records a
-      new SegInfo at the head of the segInfo list for each executable,
-      file-backed mapping that is not already present.  rr and ww are
-      accepted but not consulted. */
-   SegInfo* seg;
-   SegInfo* known;
-
-   /* Nothing to do for empty mappings, mappings without a backing
-      file, non-executable mappings, or /dev/zero. */
-   if (size == 0)
-      return;
-   if (filename == NULL || xx != 'x')
-      return;
-   if (0 == VG_(strcmp)(filename, "/dev/zero"))
-      return;
-
-   /* Skip mappings already on the list.  Identity is (start address,
-      file name) only: the observed size of a mapping can change, so
-      size is deliberately left out of the comparison. */
-   known = segInfo;
-   while (known != NULL) {
-      if (known->start == start
-          && 0 == VG_(strcmp)(known->filename, filename))
-         return;
-      known = known->next;
-   }
-
-   /* Build the new record with all tables empty. */
-   seg = VG_(malloc)(VG_AR_SYMTAB, sizeof(SegInfo));
-
-   seg->filename = VG_(malloc)(VG_AR_SYMTAB, 1 + VG_(strlen)(filename));
-   VG_(strcpy)(seg->filename, filename);
-   seg->start   = start;
-   seg->size    = size;
-   seg->foffset = foffset;
-
-   seg->symtab      = NULL;
-   seg->symtab_size = 0;
-   seg->symtab_used = 0;
-   seg->loctab      = NULL;
-   seg->loctab_size = 0;
-   seg->loctab_used = 0;
-   seg->strtab      = NULL;
-   seg->strtab_size = 0;
-   seg->strtab_used = 0;
-
-   /* Kludge: a segment loaded at the assumed executable base gets a
-      zero offset; anything else is offset by its start address. */
-   if (seg->start == VG_ASSUMED_EXE_BASE)
-      seg->offset = 0;
-   else
-      seg->offset = seg->start;
-
-   /* Publish it at the head of the list before reading any symbols. */
-   seg->next = segInfo;
-   segInfo   = seg;
-
-   /* Symbols are only loaded when instrumenting or cache-simulating. */
-   if (VG_(clo_instrument) || VG_(clo_cachesim)) {
-      vg_read_lib_symbols ( seg );
-      canonicaliseSymtab ( seg );
-      canonicaliseLoctab ( seg );
-   }
-}
-
-
-/* This one really is the Head Honcho.  Update the symbol tables to
-   reflect the current state of /proc/self/maps.  Rather than re-read
-   everything, just read the entries which are not already in segInfo.
-   So we can call here repeatedly, after every mmap of a non-anonymous
-   segment with execute permissions, for example, to pick up new
-   libraries as they are dlopen'd.  Conversely, when the client does
-   munmap(), vg_symtab_notify_munmap() throws away any symbol tables
-   which happen to correspond to the munmap()d area.  */
-void VG_(read_symbols) ( void )
-{
-   SegInfo *si, *si2;
-
-   /* Pick up any mappings we have not seen before. */
-   VG_(read_procselfmaps) ( read_symtab_callback );
-
-   /* Do a sanity check on the symbol tables: ensure that the address
-      space pieces they cover do not overlap (otherwise we are severely
-      hosed).  This is a quadratic algorithm, but there shouldn't be
-      many of them.  
-   */
-   for (si = segInfo; si != NULL; si = si->next) {
-      /* Check no overlap between *si and those in the rest of the
-         list. */
-      for (si2 = si->next; si2 != NULL; si2 = si2->next) {
-         /* Both ranges are closed intervals [lo, hi]. */
-         Addr lo  = si->start;
-         Addr hi  = si->start + si->size - 1;
-         Addr lo2 = si2->start;
-         Addr hi2 = si2->start + si2->size - 1;
-         Bool overlap;
-
-         /* A one-byte segment has lo == hi and is perfectly valid;
-            only an empty or wrapped-around range is rejected. */
-         vg_assert(lo <= hi);
-         vg_assert(lo2 <= hi2);
-
-         /* the main assertion: two closed intervals intersect exactly
-            when each one starts no later than the other one ends.
-            Testing only whether an endpoint of si2 lies inside si
-            would miss the case where si2 strictly encloses si. */
-         overlap = (lo <= hi2) && (lo2 <= hi);
-         if (overlap) {
-            VG_(printf)("\n\nOVERLAPPING SEGMENTS\n" );
-            ppSegInfo ( si );
-            ppSegInfo ( si2 );
-            VG_(printf)("\n\n"); 
-            vg_assert(! overlap);
-         }
-      }
-   }
-}
-
-
-/* When an munmap() call happens, check to see whether it corresponds
-   to a segment for a .so, and if so discard the relevant SegInfo.
-   This might not be a very clever idea from the point of view of
-   accuracy of error messages, but we need to do it in order to
-   maintain the no-overlapping invariant.
-
-   16 May 02: Returns a Bool indicating whether or not the discarded
-   range falls inside a known executable segment.  See comment at top
-   of file for why.
-*/
-Bool VG_(symtab_notify_munmap) ( Addr start, UInt length )
-{
-   /* victim is the SegInfo to discard; before is its predecessor in
-      the list, or NULL when victim is the list head.  Matching is on
-      the exact start address only; length is not consulted. */
-   SegInfo* victim;
-   SegInfo* before = NULL;
-
-   for (victim = segInfo; victim != NULL; victim = victim->next) {
-      if (victim->start == start)
-         break;
-      before = victim;
-   }
-
-   /* No segment begins at this address: nothing to throw away. */
-   if (victim == NULL) 
-      return False;
-
-   VG_(message)(Vg_UserMsg, 
-                "discard syms in %s due to munmap()", 
-                victim->filename ? victim->filename : (UChar*)"???");
-
-   vg_assert(before == NULL || before->next == victim);
-
-   /* Unlink victim, then release it. */
-   if (before != NULL) {
-      before->next = victim->next;
-   } else {
-      segInfo = victim->next;
-   }
-
-   freeSegInfo(victim);
-   return True;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Use of symbol table & location info to create        ---*/
-/*--- plausible-looking stack dumps.                       ---*/
-/*------------------------------------------------------------*/
-
-/* Find a symbol-table index containing the specified pointer, or -1
-   if not found.  Binary search.  */
-
-static Int search_one_symtab ( SegInfo* si, Addr ptr )
-{
-   /* Binary search over si->symtab[0 .. symtab_used-1].  Each entry
-      covers the closed address range [addr, addr+size-1].  Returns
-      the index of the entry containing ptr, or -1 if there is none. */
-   Int first = 0;
-   Int last  = si->symtab_used-1;
-
-   /* Any match must lie in [first, last], inclusive. */
-   while (first <= last) {
-      Int  probe  = (first + last) / 2;
-      Addr sym_lo = si->symtab[probe].addr;
-      Addr sym_hi = ((Addr)si->symtab[probe].addr) 
-                    + si->symtab[probe].size - 1;
-
-      if (ptr < sym_lo)
-         last = probe-1;
-      else if (ptr > sym_hi)
-         first = probe+1;
-      else
-         return probe;
-   }
-   return -1; /* not found */
-}
-
-
-/* Search all symtabs that we know about to locate ptr.  If found, set
-   *psi to the relevant SegInfo, and *symno to the symtab entry number
-   within that.  If not found, *psi is set to NULL.  */
-
-static void search_all_symtabs ( Addr ptr, SegInfo** psi, Int* symno )
-{
-   SegInfo* seg = segInfo;
-
-   /* Walk to the first segment whose address range contains ptr. */
-   while (seg != NULL 
-          && !(seg->start <= ptr && ptr < seg->start+seg->size))
-      seg = seg->next;
-
-   /* Only that one segment is consulted; if its symbol table has no
-      entry for ptr the search fails without trying later segments. */
-   if (seg != NULL) {
-      Int idx = search_one_symtab ( seg, ptr );
-      if (idx != -1) {
-         *symno = idx;
-         *psi   = seg;
-         return;
-      }
-   }
-
-   /* Not found: *symno is left untouched. */
-   *psi = NULL;
-}
-
-
-/* Find a location-table index containing the specified pointer, or -1
-   if not found.  Binary search.  */
-
-static Int search_one_loctab ( SegInfo* si, Addr ptr )
-{
-   /* Binary search over si->loctab[0 .. loctab_used-1].  Each entry
-      covers the closed address range [addr, addr+size-1].  Returns
-      the index of the entry containing ptr, or -1 if there is none. */
-   Int first = 0;
-   Int last  = si->loctab_used-1;
-
-   /* Any match must lie in [first, last], inclusive. */
-   while (first <= last) {
-      Int  probe  = (first + last) / 2;
-      Addr loc_lo = si->loctab[probe].addr;
-      Addr loc_hi = ((Addr)si->loctab[probe].addr) 
-                    + si->loctab[probe].size - 1;
-
-      if (ptr < loc_lo)
-         last = probe-1;
-      else if (ptr > loc_hi)
-         first = probe+1;
-      else
-         return probe;
-   }
-   return -1; /* not found */
-}
-
-
-/* Search all loctabs that we know about to locate ptr.  If found, set
-   *psi to the relevant SegInfo, and *locno to the loctab entry number
-   within that.  If not found, *psi is set to NULL.
-*/
-static void search_all_loctabs ( Addr ptr, SegInfo** psi, Int* locno )
-{
-   SegInfo* seg = segInfo;
-
-   /* Walk to the first segment whose address range contains ptr. */
-   while (seg != NULL 
-          && !(seg->start <= ptr && ptr < seg->start+seg->size))
-      seg = seg->next;
-
-   /* Only that one segment is consulted; if its location table has
-      no entry for ptr the search fails without trying later ones. */
-   if (seg != NULL) {
-      Int idx = search_one_loctab ( seg, ptr );
-      if (idx != -1) {
-         *locno = idx;
-         *psi   = seg;
-         return;
-      }
-   }
-
-   /* Not found: *locno is left untouched. */
-   *psi = NULL;
-}
-
-
-/* The whole point of this whole big deal: map a code address to a
-   plausible symbol name.  Returns False if no idea; otherwise True.
-   Caller supplies buf and nbuf.  If no_demangle is True, don't do
-   demangling, regardless of vg_clo_demangle -- probably because the
-   call has come from vg_what_fn_or_object_is_this. */
-Bool VG_(what_fn_is_this) ( Bool no_demangle, Addr a, 
-                            Char* buf, Int nbuf )
-{
-   SegInfo* seg;
-   Int      idx;
-
-   /* Locate the symbol covering a; seg comes back NULL on failure,
-      in which case buf is not written. */
-   search_all_symtabs ( a, &seg, &idx );
-   if (seg == NULL) 
-      return False;
-
-   /* The symbol name lives in the segment string table at the offset
-      recorded in the symbol entry.  Either demangle it into buf or
-      copy it across verbatim. */
-   if (!no_demangle) {
-      VG_(demangle) ( & seg->strtab[seg->symtab[idx].nmoff], buf, nbuf );
-   } else {
-      VG_(strncpy_safely) 
-         ( buf, & seg->strtab[seg->symtab[idx].nmoff], nbuf );
-   }
-   return True;
-}
-
-
-/* Map a code address to the name of a shared object file.  Returns
-   False if no idea; otherwise True.  Caller supplies buf and
-   nbuf. */
-static
-Bool vg_what_object_is_this ( Addr a, Char* buf, Int nbuf )
-{
-   SegInfo* seg = segInfo;
-
-   /* The first segment whose range [start, start+size) holds a
-      supplies the file name; buf is untouched if none does. */
-   while (seg != NULL) {
-      Bool covers = seg->start <= a && a < seg->start+seg->size;
-      if (covers) {
-         VG_(strncpy_safely)(buf, seg->filename, nbuf);
-         return True;
-      }
-      seg = seg->next;
-   }
-   return False;
-}
-
-/* Return the name of an erring fn in a way which is useful
-   for comparing against the contents of a suppressions file. 
-   Always writes something to buf.  Also, doesn't demangle the
-   name, because we want to refer to mangled names in the 
-   suppressions file.
-*/
-void VG_(what_obj_and_fun_is_this) ( Addr a,
-                                     Char* obj_buf, Int n_obj_buf,
-                                     Char* fun_buf, Int n_fun_buf )
-{
-   /* Neither lookup writes to its buffer when it fails, so terminate
-      the buffer here in that case.  That way the caller always gets a
-      valid (possibly empty) string in both obj_buf and fun_buf, which
-      is what this function promises.  Zero-sized buffers are left
-      alone. */
-   if (!vg_what_object_is_this ( a, obj_buf, n_obj_buf ) 
-       && n_obj_buf > 0)
-      obj_buf[0] = 0;
-
-   /* no_demangle == True: suppressions refer to mangled names. */
-   if (!VG_(what_fn_is_this) ( True, a, fun_buf, n_fun_buf ) 
-       && n_fun_buf > 0)
-      fun_buf[0] = 0;
-}
-
-
-/* Map a code address to a (filename, line number) pair.  
-   Returns True if successful.
-*/
-Bool VG_(what_line_is_this)( Addr a, 
-                             UChar* filename, Int n_filename, 
-                             UInt* lineno )
-{
-   SegInfo* seg;
-   Int      idx;
-
-   /* seg comes back NULL when no location entry covers a; neither
-      output is written in that case. */
-   search_all_loctabs ( a, &seg, &idx );
-   if (seg != NULL) {
-      /* The file name is stored in the segment string table at the
-         offset held in the location entry. */
-      VG_(strncpy_safely)(filename, 
-                          & seg->strtab[seg->loctab[idx].fnmoff], 
-                          n_filename);
-      *lineno = seg->loctab[idx].lineno;
-      return True;
-   }
-   return False;
-}
-
-
-/* Print a mini stack dump, showing the current location. */
-void VG_(mini_stack_dump) ( ExeContext* ec )
-{
-
-/* Append the NUL-terminated string str to buf at position n, never
-   writing past M_VG_ERRTXT-1 characters, and keep buf terminated. */
-#define APPEND(str)                                              \
-   { UChar* sss;                                                 \
-     for (sss = str; n < M_VG_ERRTXT-1 && *sss != 0; n++,sss++)  \
-        buf[n] = *sss;                                           \
-     buf[n] = 0;                                                 \
-   }
-
-   Bool   know_fnname;
-   Bool   know_objname;
-   Bool   know_srcloc;
-   UInt   lineno; 
-   UChar  ibuf[20];
-   UInt   i, n;
-
-   UChar  buf[M_VG_ERRTXT];
-   UChar  buf_fn[M_VG_ERRTXT];
-   UChar  buf_obj[M_VG_ERRTXT];
-   UChar  buf_srcloc[M_VG_ERRTXT];
-
-   Int stop_at = VG_(clo_backtrace_size);
-
-   /* One output line per frame.  Frame 0 is always printed and is
-      introduced with "at"; later frames are introduced with "by" and
-      printing stops at the backtrace limit or at the first zero
-      address, whichever comes first. */
-   for (i = 0; i == 0 || (i < stop_at && ec->eips[i] != 0); i++) {
-
-      /* Gather whatever we know about this address. */
-      know_fnname  = VG_(what_fn_is_this)(False, ec->eips[i], 
-                                          buf_fn, M_VG_ERRTXT);
-      know_objname = vg_what_object_is_this(ec->eips[i], 
-                                            buf_obj, M_VG_ERRTXT);
-      know_srcloc  = VG_(what_line_is_this)(ec->eips[i], 
-                                            buf_srcloc, M_VG_ERRTXT, 
-                                            &lineno);
-
-      /* Start a fresh line in buf. */
-      n = 0;
-      if (i == 0) {
-         APPEND("   at ");
-      } else {
-         APPEND("   by ");
-      }
-      VG_(sprintf)(ibuf,"0x%x: ", ec->eips[i]);
-      APPEND(ibuf);
-
-      /* Function name if known; the object name is only shown when
-         there is no source location to print instead. */
-      if (know_fnname) { 
-         APPEND(buf_fn);
-         if (know_objname && !know_srcloc) {
-            APPEND(" (in ");
-            APPEND(buf_obj);
-            APPEND(")");
-         }
-      } else if (know_objname && !know_srcloc) {
-         APPEND("(within ");
-         APPEND(buf_obj);
-         APPEND(")");
-      } else {
-         APPEND("???");
-      }
-
-      /* Trailing " (file:line)" when a source location is known. */
-      if (know_srcloc) {
-         APPEND(" (");
-         APPEND(buf_srcloc);
-         APPEND(":");
-         VG_(sprintf)(ibuf,"%d",lineno);
-         APPEND(ibuf);
-         APPEND(")");
-      }
-
-      VG_(message)(Vg_UserMsg, "%s", buf);
-   }
-}
-
-#undef APPEND
-
-/*--------------------------------------------------------------------*/
-/*--- end                                             vg_symtab2.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_syscall.S b/coregrind/vg_syscall.S
deleted file mode 100644
index adabbedbbe..0000000000
--- a/coregrind/vg_syscall.S
+++ /dev/null
@@ -1,104 +0,0 @@
-
-##--------------------------------------------------------------------##
-##--- Support for doing system calls.                              ---##
-##---                                                 vg_syscall.S ---##
-##--------------------------------------------------------------------##
-
-/*
-  This file is part of Valgrind, an x86 protected-mode emulator 
-  designed for debugging and profiling binaries on x86-Unixes.
-
-  Copyright (C) 2000-2002 Julian Seward 
-     jseward@acm.org
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of the
-  License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-  02111-1307, USA.
-
-  The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_constants.h"
-
-
-# Entry point: run one Linux system call (int 0x80) using the
-# register, flag and FPU values held in m_state_static, then write
-# the resulting values back into m_state_static.
-.globl	VG_(do_syscall)
-
-# NOTE that this routine expects the simulated machines state
-# to be in m_state_static.  Therefore it needs to be wrapped by
-# code which copies from baseBlock before the call, into
-# m_state_static, and back afterwards.
-#
-# Byte offsets into m_state_static, as used below:
-#    +0 eax   +4 ecx   +8 edx  +12 ebx
-#   +16 esp  +20 ebp  +24 esi  +28 edi
-#   +32 eflags        +40 FPU state image
-	
-VG_(do_syscall):
-	# Save all the int registers of the real machines state on the
-	# simulators stack.
-	pushal
-
-	# and save the real FPU state too
-	# (fnsave reinitialises the FPU once it has stored the state,
-	# so the image is reloaded straight away with frstor)
-	fwait
-	fnsave	VG_(real_fpu_state_saved_over_syscall)
-	frstor	VG_(real_fpu_state_saved_over_syscall)
-
-	# remember what the simulators stack pointer is
-	movl	%esp, VG_(esp_saved_over_syscall)
-	
-	# Now copy the simulated machines state into the real one
-	# esp still refers to the simulators stack
-	# The flags are loaded first, going via eax and the stack; the
-	# movl instructions which follow leave the flags untouched.
-	frstor	VG_(m_state_static)+40
-	movl	VG_(m_state_static)+32, %eax
-	pushl	%eax
-	popfl
-	movl	VG_(m_state_static)+0, %eax
-	movl	VG_(m_state_static)+4, %ecx
-	movl	VG_(m_state_static)+8, %edx
-	movl	VG_(m_state_static)+12, %ebx
-	movl	VG_(m_state_static)+16, %esp
-	movl	VG_(m_state_static)+20, %ebp
-	movl	VG_(m_state_static)+24, %esi
-	movl	VG_(m_state_static)+28, %edi
-
-	# esp now refers to the simulatees stack
-	# Do the actual system call
-	int	$0x80
-
-	# restore stack as soon as possible
-	# esp refers to simulatees stack
-	movl	%esp, VG_(m_state_static)+16
-	movl	VG_(esp_saved_over_syscall), %esp
-	# esp refers to simulators stack
-
-	# ... and undo everything else.  
-	# Copy real state back to simulated state.	
-	# (esp was already stored above; eax is stored before it gets
-	# reused as scratch for the flags)
-	movl	%eax, VG_(m_state_static)+0
-	movl	%ecx, VG_(m_state_static)+4
-	movl	%edx, VG_(m_state_static)+8
-	movl	%ebx, VG_(m_state_static)+12
-	movl	%ebp, VG_(m_state_static)+20
-	movl	%esi, VG_(m_state_static)+24
-	movl	%edi, VG_(m_state_static)+28
-	pushfl
-	popl	%eax
-	movl	%eax, VG_(m_state_static)+32
-	fwait
-	fnsave	VG_(m_state_static)+40
-	frstor	VG_(m_state_static)+40
-
-	# Restore the state of the simulator
-	# (FPU image first, then the integer registers pushed at entry)
-	frstor	VG_(real_fpu_state_saved_over_syscall)
-	popal
-
-	ret
-
-##--------------------------------------------------------------------##
-##--- end                                             vg_syscall.S ---##
-##--------------------------------------------------------------------##
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
deleted file mode 100644
index 179c0592f5..0000000000
--- a/coregrind/vg_to_ucode.c
+++ /dev/null
@@ -1,4674 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- The JITter: translate x86 code to ucode.                     ---*/
-/*---                                                vg_to_ucode.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-
-/*------------------------------------------------------------*/
-/*--- Renamings of frequently-used global functions.       ---*/
-/*------------------------------------------------------------*/
-
-/* Short local aliases: each name on the left expands to the
-   VG_()-wrapped global name on the right, so the code below can use
-   the terse spelling. */
-#define uInstr0   VG_(newUInstr0)
-#define uInstr1   VG_(newUInstr1)
-#define uInstr2   VG_(newUInstr2)
-#define uInstr3   VG_(newUInstr3)
-#define dis       VG_(disassemble)
-#define nameIReg  VG_(nameOfIntReg)
-#define nameISize VG_(nameOfIntSize)
-#define newTemp   VG_(getNewTemp)
-#define uLiteral  VG_(setLiteralField)
-
-
-/*------------------------------------------------------------*/
-/*--- Here so it can be inlined everywhere.                ---*/
-/*------------------------------------------------------------*/
-
-/* Allocate a new temp reg number. */
-__inline__ Int VG_(getNewTemp) ( UCodeBlock* cb )
-{
-   /* Hand out the current counter value and advance the counter by
-      two.  (Presumably the skipped number belongs to the shadow of
-      the temp -- see VG_(getNewShadow) -- TODO confirm.) */
-   Int fresh = cb->nextTemp;
-   cb->nextTemp = fresh + 2;
-   return fresh;
-}
-
-Int VG_(getNewShadow) ( UCodeBlock* cb )
-{
-   /* Same allocation step as VG_(getNewTemp), but the caller gets
-      SHADOW() of the newly allocated number rather than the number
-      itself. */
-   Int fresh = cb->nextTemp;
-   cb->nextTemp = fresh + 2;
-   return SHADOW(fresh);
-}
-
-/* Conditional markers rather than predicates: each macro sets the
-   smc_check field of the most recently added uinstr in cb to True
-   when the VG_(clo_smc_check) setting calls for it, and otherwise
-   does nothing.  Both are wrapped in do { } while (0) so they behave
-   as a single statement. */
-
-/* Mark when the setting is VG_CLO_SMC_SOME or anything above it. */
-#define SMC_IF_SOME(cb)                              \
-   do {                                              \
-      if (VG_(clo_smc_check) >= VG_CLO_SMC_SOME) {   \
-           LAST_UINSTR((cb)).smc_check = True;       \
-      }                                              \
-   } while (0)
-
-/* Mark only when the setting is exactly VG_CLO_SMC_ALL. */
-#define SMC_IF_ALL(cb)                               \
-   do {                                              \
-      if (VG_(clo_smc_check) == VG_CLO_SMC_ALL) {    \
-         LAST_UINSTR((cb)).smc_check = True;         \
-      }                                              \
-   } while (0)
-
-
-/*------------------------------------------------------------*/
-/*--- Helper bits and pieces for deconstructing the        ---*/
-/*--- x86 insn stream.                                     ---*/
-/*------------------------------------------------------------*/
-
-static Char* nameGrp1 ( Int opc_aux )
-{
-   /* Mnemonic table indexed by opc_aux; every slot 0..7 is valid.
-      Anything outside that range is a panic. */
-   static Char* mnemonics[8] = { 
-      "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" 
-   };
-   if (!(opc_aux >= 0 && opc_aux <= 7)) 
-      VG_(panic)("nameGrp1");
-   return mnemonics[opc_aux];
-}
-
-static Char* nameGrp2 ( Int opc_aux )
-{
-   /* Mnemonic table indexed by opc_aux; every slot 0..7 is valid
-      (slots 4 and 6 both name "shl").  Anything outside that range
-      is a panic. */
-   static Char* mnemonics[8] = { 
-      "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" 
-   };
-   if (!(opc_aux >= 0 && opc_aux <= 7)) 
-      VG_(panic)("nameGrp2");
-   return mnemonics[opc_aux];
-}
-
-static Char* nameGrp4 ( Int opc_aux )
-{
-   /* Mnemonic table indexed by opc_aux.  Only slots 0 and 1 carry
-      real names; any other value is a panic. */
-   static Char* mnemonics[8] = { 
-      "inc", "dec", "???", "???", "???", "???", "???", "???" 
-   };
-   if (!(opc_aux >= 0 && opc_aux <= 1)) 
-      VG_(panic)("nameGrp4");
-   return mnemonics[opc_aux];
-}
-
-static Char* nameGrp5 ( Int opc_aux )
-{
-   /* Mnemonic table indexed by opc_aux.  Slots 0..6 carry real
-      names; any other value is a panic. */
-   static Char* mnemonics[8] = { 
-      "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" 
-   };
-   if (!(opc_aux >= 0 && opc_aux <= 6)) 
-      VG_(panic)("nameGrp5");
-   return mnemonics[opc_aux];
-}
-
-static Char* nameGrp8 ( Int opc_aux )
-{
-   /* Mnemonic table indexed by opc_aux.  Slots 4..7 carry real
-      names; any other value is a panic. */
-   static Char* mnemonics[8] = { 
-      "???", "???", "???", "???", "bt", "bts", "btr", "btc" 
-   };
-   if (!(opc_aux >= 4 && opc_aux <= 7)) 
-      VG_(panic)("nameGrp8");
-   return mnemonics[opc_aux];
-}
-
-Char* VG_(nameOfIntReg) ( Int size, Int reg )
-{
-   /* One name table per operand size (4, 2 or 1 bytes), each indexed
-      by register number 0..7.  Any other size or register number is
-      a panic. */
-   static Char* names_4[8] = { 
-      "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" 
-   };
-   static Char* names_2[8] = { 
-      "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" 
-   };
-   static Char* names_1[8] = { 
-      "%al", "%cl", "%dl", "%bl", 
-      "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" 
-   };
-
-   if (reg >= 0 && reg <= 7) {
-      if (size == 4) return names_4[reg];
-      if (size == 2) return names_2[reg];
-      if (size == 1) return names_1[reg];
-   }
-
-   VG_(panic)("nameOfIntReg");
-   return NULL; /*notreached*/
-}
-
-Char VG_(nameOfIntSize) ( Int size )
-{
-   /* Map an operand size in bytes (4, 2 or 1) to the matching
-      instruction suffix letter.  Any other size is a panic. */
-   switch (size) {
-      case 4: return 'l';
-      case 2: return 'w';
-      case 1: return 'b';
-      default: VG_(panic)("nameOfIntSize");
-   }
-   /* Give every path through this non-void function a return value,
-      as the other helpers in this file do after a panic. */
-   return 0; /*notreached*/
-}
-
-__inline__ UInt VG_(extend_s_8to32) ( UInt x )
-{
-   /* Sign-extend the low 8 bits of x to 32 bits.  The left shift is
-      done on the unsigned value, so it cannot overflow a signed int
-      when bit 7 of x is set; only the right shift is done on the
-      signed value, to replicate the sign bit downwards. */
-   return (UInt)(((Int)(x << 24)) >> 24);
-}
-
-__inline__ static UInt extend_s_16to32 ( UInt x )
-{
-   /* Sign-extend the low 16 bits of x to 32 bits.  The left shift is
-      done on the unsigned value, so it cannot overflow a signed int
-      when bit 15 of x is set; only the right shift is done on the
-      signed value, to replicate the sign bit downwards. */
-   return (UInt)(((Int)(x << 16)) >> 16);
-}
-
-
-/* Get a byte value out of the insn stream and sign-extend to 32
-   bits. */
-__inline__ static UInt getSDisp8 ( Addr eip0 )
-{
-   /* Read the single byte at eip0 and sign-extend it. */
-   UChar b = *((UChar*)eip0);
-   return VG_(extend_s_8to32)( (UInt)b );
-}
-
-__inline__ static UInt getSDisp16 ( Addr eip0 )
-{
-   /* Assemble a 16-bit value from two bytes, low byte first, then
-      sign-extend it to 32 bits. */
-   UChar* p    = (UChar*)eip0;
-   UInt   lo8  = p[0];
-   UInt   hi8  = p[1];
-   return extend_s_16to32(lo8 | (hi8 << 8));
-}
-
-/* Get a 32-bit value out of the insn stream. */
-__inline__ static UInt getUDisp32 ( Addr eip0 )
-{
-   /* Assemble four bytes, lowest-addressed byte least significant.
-      Each byte is widened to UInt before it is shifted. */
-   UChar* p = (UChar*)eip0;
-   return   ((UInt)p[0])
-          | ((UInt)p[1] << 8)
-          | ((UInt)p[2] << 16)
-          | ((UInt)p[3] << 24);
-}
-
-__inline__ static UInt getUDisp16 ( Addr eip0 )
-{
-   /* Assemble two bytes, lowest-addressed byte least significant;
-      no sign extension. */
-   UChar* p = (UChar*)eip0;
-   return ((UInt)p[0]) | ((UInt)p[1] << 8);
-}
-
-__inline__ static UChar getUChar ( Addr eip0 )
-{
-   /* Fetch the single byte stored at address eip0. */
-   return *((UChar*)eip0);
-}
-
-__inline__ static UInt LOW24 ( UInt x )
-{
-   /* Keep bits 0..23 of x and clear everything above them. */
-   UInt low_mask = 0x00FFFFFF;
-   return x & low_mask;
-}
-
-__inline__ static UInt HI8 ( UInt x )
-{
-   /* Move bits 24 and upwards of x down to bit 0. */
-   UInt top = x >> 24;
-   return top;
-}
-
-__inline__ static UInt getUDisp ( Int size, Addr eip )
-{
-   /* Fetch an unsigned value of the given size (4, 2 or 1 bytes)
-      from address eip.  Any other size is a panic. */
-   if (size == 4) return getUDisp32(eip);
-   if (size == 2) return getUDisp16(eip);
-   if (size == 1) return getUChar(eip);
-
-   VG_(panic)("getUDisp");
-   return 0; /*notreached*/
-}
-
-__inline__ static UInt getSDisp ( Int size, Addr eip )
-{
-   /* Fetch a value of the given size (4, 2 or 1 bytes) from address
-      eip, sign-extended to 32 bits.  A 4-byte value already fills
-      the result, so it is read with the unsigned fetcher.  Any other
-      size is a panic. */
-   switch (size) {
-      case 4: return getUDisp32(eip);
-      case 2: return getSDisp16(eip);
-      case 1: return getSDisp8(eip);
-      /* Name this function, not getUDisp, so a panic points at the
-         right place. */
-      default: VG_(panic)("getSDisp");
-  }
-  return 0; /*notreached*/
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Flag-related helpers.                                ---*/
-/*------------------------------------------------------------*/
-
-/* For the last uinsn inserted into cb, set the read, written and
-   undefined flags.  Undefined flags are counted as written, but it
-   seems worthwhile to distinguish them. 
-*/
-static __inline__ void uFlagsRWU ( UCodeBlock* cb,
-                                   FlagSet rr, FlagSet ww, FlagSet uu )
-{
-   /* Undefined flags are folded into the written set before the
-      read and written sets are recorded on the last uinstr of cb. */
-   FlagSet written = VG_UNION_FLAG_SETS(ww,uu);
-   VG_(setFlagRW)( &LAST_UINSTR(cb), rr, written );
-}
-
-
-static void setFlagsFromUOpcode ( UCodeBlock* cb, Int uopc )
-{
-   /* Record on the last uinstr of cb the flags that opcode uopc
-      reads, writes and leaves undefined.  The arguments to uFlagsRWU
-      are, in order: flags read, flags written, flags undefined.
-      Opcodes not listed here cause a panic. */
-   switch (uopc) {
-
-      /* Read nothing. */
-      case NOT:
-         uFlagsRWU(cb, FlagsEmpty, FlagsEmpty,  FlagsEmpty); 
-         break;
-      case ROL: case ROR:
-         uFlagsRWU(cb, FlagsEmpty, FlagsOC,     FlagsEmpty); 
-         break;
-      case INC: case DEC:
-         uFlagsRWU(cb, FlagsEmpty, FlagsOSZAP,  FlagsEmpty); 
-         break;
-      case ADD: case SUB: case NEG: 
-         uFlagsRWU(cb, FlagsEmpty, FlagsOSZACP, FlagsEmpty); 
-         break;
-
-      /* Logical ops and shifts share one flag profile. */
-      case XOR: case OR:  case AND:
-      case SHR: case SAR: case SHL:
-         uFlagsRWU(cb, FlagsEmpty, FlagsOSZCP,  FlagA); 
-         break;
-
-      /* Read the carry flag. */
-      case RCR: case RCL: 
-         uFlagsRWU(cb, FlagC,      FlagsOC,     FlagsEmpty); 
-         break;
-      case ADC: case SBB: 
-         uFlagsRWU(cb, FlagC,      FlagsOSZACP, FlagsEmpty); 
-         break;
-
-      default: 
-         VG_(printf)("unhandled case is %s\n", 
-                     VG_(nameUOpcode)(True, uopc));
-         VG_(panic)("setFlagsFromUOpcode: unhandled case");
-   }
-}
-
-/* Set the cond field of the uinstr most recently added to cb. */
-static __inline__ void uCond ( UCodeBlock* cb, Condcode cond )
-{
-   LAST_UINSTR(cb).cond = cond;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Disassembling addressing modes                       ---*/
-/*------------------------------------------------------------*/
-
-/* Generate ucode to calculate an address indicated by a ModRM and
-   following SIB bytes, getting the value in a new temporary.  The
-   temporary, and the number of bytes in the address mode, are
-   returned, as a pair (length << 24) | temp.  Note that this fn should
-   not be called if the R/M part of the address denotes a register
-   instead of memory.  If buf is non-NULL, text of the addressing mode
-   is placed therein. */
-
-static UInt disAMode ( UCodeBlock* cb, Addr eip0, UChar* buf )
-{
-   UChar* eip        = (UChar*)eip0;
-   UChar  mod_reg_rm = *eip++;
-   Int    tmp        = newTemp(cb);
-
-   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
-      jump table seems a bit excessive. 
-   */
-   mod_reg_rm &= 0xC7;               /* is now XX000YYY */
-   mod_reg_rm |= (mod_reg_rm >> 3);  /* is now XX0XXYYY */
-   mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
-   switch (mod_reg_rm) {
-
-      /* (%eax) .. (%edi), not including (%esp) or (%ebp).
-         --> GET %reg, t 
-      */
-      case 0x00: case 0x01: case 0x02: case 0x03: 
-      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
-         { UChar rm  = mod_reg_rm;
-           uInstr2(cb, GET, 4, ArchReg, rm,  TempReg, tmp);
-           if (buf) VG_(sprintf)(buf,"(%s)", nameIReg(4,rm));
-           return (1<<24 | tmp);
-         }
-
-      /* d8(%eax) ... d8(%edi), not including d8(%esp) 
-         --> GET %reg, t ; ADDL d8, t
-      */
-      case 0x08: case 0x09: case 0x0A: case 0x0B: 
-      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
-         { UChar rm  = mod_reg_rm & 7;
-           Int   tmq = newTemp(cb);
-           UInt  d   = getSDisp8((Addr)eip); eip++;
-           uInstr2(cb, GET,  4, ArchReg, rm,  TempReg, tmq);
-           uInstr2(cb, LEA1, 4, TempReg, tmq, TempReg, tmp);
-           LAST_UINSTR(cb).lit32 = d;
-           if (buf) VG_(sprintf)(buf,"%d(%s)", d, nameIReg(4,rm));
-           return (2<<24 | tmp);
-         }
-
-      /* d32(%eax) ... d32(%edi), not including d32(%esp)
-         --> GET %reg, t ; ADDL d32, t
-      */
-      case 0x10: case 0x11: case 0x12: case 0x13: 
-      /* ! 14 */ case 0x15: case 0x16: case 0x17:
-         { UChar rm  = mod_reg_rm & 7;
-           Int   tmq = newTemp(cb);
-           UInt  d   = getUDisp32((Addr)eip); eip += 4;
-           uInstr2(cb, GET,  4, ArchReg, rm,  TempReg, tmq);
-           uInstr2(cb, LEA1, 4, TempReg, tmq, TempReg, tmp);
-           LAST_UINSTR(cb).lit32 = d;
-           if (buf) VG_(sprintf)(buf,"0x%x(%s)", d, nameIReg(4,rm));
-           return (5<<24 | tmp);
-         }
-
-      /* a register, %eax .. %edi.  This shouldn't happen. */
-      case 0x18: case 0x19: case 0x1A: case 0x1B:
-      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
-         VG_(panic)("disAMode: not an addr!");
-
-      /* a 32-bit literal address
-         --> MOV d32, tmp 
-      */
-      case 0x05: 
-         { UInt d = getUDisp32((Addr)eip); eip += 4;
-           uInstr2(cb, MOV, 4, Literal, 0, TempReg, tmp);
-           uLiteral(cb, d);
-           if (buf) VG_(sprintf)(buf,"(0x%x)", d);
-           return (5<<24 | tmp);
-         }
-
-      case 0x04: {
-         /* SIB, with no displacement.  Special cases:
-            -- %esp cannot act as an index value.  
-               If index_r indicates %esp, zero is used for the index.
-            -- when mod is zero and base indicates EBP, base is instead
-               a 32-bit literal.
-            It's all madness, I tell you.  Extract %index, %base and 
-            scale from the SIB byte.  The value denoted is then:
-               | %index == %ESP && %base == %EBP
-               = d32 following SIB byte
-               | %index == %ESP && %base != %EBP
-               = %base
-               | %index != %ESP && %base == %EBP
-               = d32 following SIB byte + (%index << scale)
-               | %index != %ESP && %base != %EBP
-               = %base + (%index << scale)
-
-            What happens to the souls of CPU architects who dream up such
-            horrendous schemes, do you suppose?  
-         */
-         UChar sib     = *eip++;
-         UChar scale   = (sib >> 6) & 3;
-         UChar index_r = (sib >> 3) & 7;
-         UChar base_r  = sib & 7;
-
-         if (index_r != R_ESP && base_r != R_EBP) {
-            Int index_tmp = newTemp(cb);
-            Int base_tmp  = newTemp(cb);
-            uInstr2(cb, GET,  4, ArchReg, index_r,  TempReg, index_tmp);
-            uInstr2(cb, GET,  4, ArchReg, base_r,   TempReg, base_tmp);
-            uInstr3(cb, LEA2, 4, TempReg, base_tmp, TempReg, index_tmp, 
-                                 TempReg, tmp);
-            LAST_UINSTR(cb).lit32   = 0;
-            LAST_UINSTR(cb).extra4b = 1 << scale;
-            if (buf) VG_(sprintf)(buf,"(%s,%s,%d)", nameIReg(4,base_r),
-                                  nameIReg(4,index_r),1<<scale);
-            return (2<<24 | tmp);
-         }
-
-         if (index_r != R_ESP && base_r == R_EBP) {
-            Int index_tmp = newTemp(cb);
-            UInt d = getUDisp32((Addr)eip); eip += 4;
-            uInstr2(cb, GET,  4, ArchReg, index_r,  TempReg, index_tmp);
-            uInstr2(cb, MOV,  4, Literal, 0,        TempReg, tmp);
-            uLiteral(cb, 0);
-            uInstr3(cb, LEA2, 4, TempReg, tmp,      TempReg, index_tmp, 
-                                 TempReg, tmp);
-            LAST_UINSTR(cb).lit32   = d;
-            LAST_UINSTR(cb).extra4b = 1 << scale;
-            if (buf) VG_(sprintf)(buf,"0x%x(,%s,%d)", d, 
-                                  nameIReg(4,index_r),1<<scale);
-            return (6<<24 | tmp);
-         }
-
-         if (index_r == R_ESP && base_r != R_EBP) {
-            uInstr2(cb, GET, 4, ArchReg, base_r, TempReg, tmp);
-            if (buf) VG_(sprintf)(buf,"(%s,,)", nameIReg(4,base_r));
-            return (2<<24 | tmp);
-         }
-
-         if (index_r == R_ESP && base_r == R_EBP) {
-            UInt d = getUDisp32((Addr)eip); eip += 4;
-            uInstr2(cb, MOV, 4, Literal, 0, TempReg, tmp);
-	    uLiteral(cb, d);
-            if (buf) VG_(sprintf)(buf,"0x%x()", d);
-            return (6<<24 | tmp);
-         }
-
-         vg_assert(0);
-      }
-
-      /* SIB, with 8-bit displacement.  Special cases:
-         -- %esp cannot act as an index value.  
-            If index_r indicates %esp, zero is used for the index.
-         Denoted value is:
-            | %index == %ESP
-            = d8 + %base
-            | %index != %ESP
-            = d8 + %base + (%index << scale)
-      */
-      case 0x0C: {
-         UChar sib     = *eip++;
-         UChar scale   = (sib >> 6) & 3;
-         UChar index_r = (sib >> 3) & 7;
-         UChar base_r  = sib & 7;
-         UInt d        = getSDisp8((Addr)eip); eip++;
-
-         if (index_r == R_ESP) {
-            Int tmq = newTemp(cb);
-            uInstr2(cb, GET,  4, ArchReg, base_r,  TempReg, tmq);
-            uInstr2(cb, LEA1, 4, TempReg, tmq, TempReg, tmp);
-            LAST_UINSTR(cb).lit32 = d;
-            if (buf) VG_(sprintf)(buf,"%d(%s,,)", d, nameIReg(4,base_r));
-            return (3<<24 | tmp);
-         } else {
-            Int index_tmp = newTemp(cb);
-            Int base_tmp  = newTemp(cb);
-            uInstr2(cb, GET, 4,  ArchReg, index_r,  TempReg, index_tmp);
-            uInstr2(cb, GET, 4,  ArchReg, base_r,   TempReg, base_tmp);
-            uInstr3(cb, LEA2, 4, TempReg, base_tmp, TempReg, index_tmp, 
-                                 TempReg, tmp);
-            LAST_UINSTR(cb).lit32   = d;
-            LAST_UINSTR(cb).extra4b = 1 << scale;
-            if (buf) VG_(sprintf)(buf,"%d(%s,%s,%d)", d, nameIReg(4,base_r), 
-                                  nameIReg(4,index_r), 1<<scale);
-            return (3<<24 | tmp);
-         }
-         vg_assert(0);
-      }
-
-      /* SIB, with 32-bit displacement.  Special cases:
-         -- %esp cannot act as an index value.  
-            If index_r indicates %esp, zero is used for the index.
-         Denoted value is:
-            | %index == %ESP
-            = d32 + %base
-            | %index != %ESP
-            = d32 + %base + (%index << scale)
-      */
-      case 0x14: {
-         UChar sib     = *eip++;
-         UChar scale   = (sib >> 6) & 3;
-         UChar index_r = (sib >> 3) & 7;
-         UChar base_r  = sib & 7;
-         UInt d        = getUDisp32((Addr)eip); eip += 4;
-
-         if (index_r == R_ESP) {
-            Int tmq = newTemp(cb);
-            uInstr2(cb, GET,  4, ArchReg, base_r,  TempReg, tmq);
-            uInstr2(cb, LEA1, 4, TempReg, tmq, TempReg, tmp);
-            LAST_UINSTR(cb).lit32 = d;
-            if (buf) VG_(sprintf)(buf,"%d(%s,,)", d, nameIReg(4,base_r));
-            return (6<<24 | tmp);
-         } else {
-            Int index_tmp = newTemp(cb);
-            Int base_tmp = newTemp(cb);
-            uInstr2(cb, GET,  4, ArchReg, index_r, TempReg, index_tmp);
-            uInstr2(cb, GET,  4, ArchReg, base_r, TempReg, base_tmp);
-            uInstr3(cb, LEA2, 4, TempReg, base_tmp, TempReg, index_tmp, 
-                                 TempReg, tmp);
-            LAST_UINSTR(cb).lit32   = d;
-            LAST_UINSTR(cb).extra4b = 1 << scale;
-            if (buf) VG_(sprintf)(buf,"%d(%s,%s,%d)", d, nameIReg(4,base_r), 
-                                  nameIReg(4,index_r), 1<<scale);
-            return (6<<24 | tmp);
-         }
-         vg_assert(0);
-      }
-
-      default:
-         VG_(panic)("disAMode");
-         return 0; /*notreached*/
-   }
-}
-
-
-/* Figure out the number of (insn-stream) bytes constituting the amode
-   beginning at eip0: the modRM byte itself, plus the SIB byte and any
-   displacement bytes that follow it.  Is useful for getting hold of
-   literals beyond the end of the amode before it has been
-   disassembled.  The lengths returned here should agree with the
-   lengths disAMode packs into the top 8 bits of its result.
-
-   Panics if the squeezed modRM value matches no case, which cannot
-   happen since all 32 possible values are listed below.  */
-
-static UInt lengthAMode ( Addr eip0 )
-{
-   UChar* eip        = (UChar*)eip0;
-   UChar  mod_reg_rm = *eip++;
-
-   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
-      jump table seems a bit excessive. 
-   */
-   mod_reg_rm &= 0xC7;               /* is now XX000YYY */
-   mod_reg_rm |= (mod_reg_rm >> 3);  /* is now XX0XXYYY */
-   mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
-   switch (mod_reg_rm) {
-
-      /* (%eax) .. (%edi), not including (%esp) or (%ebp). 
-         Just the modRM byte. */
-      case 0x00: case 0x01: case 0x02: case 0x03: 
-      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
-         return 1;
-
-      /* d8(%eax) ... d8(%edi), not including d8(%esp). 
-         modRM + d8. */ 
-      case 0x08: case 0x09: case 0x0A: case 0x0B: 
-      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
-         return 2;
-
-      /* d32(%eax) ... d32(%edi), not including d32(%esp). 
-         modRM + d32. */
-      case 0x10: case 0x11: case 0x12: case 0x13: 
-      /* ! 14 */ case 0x15: case 0x16: case 0x17:
-         return 5;
-
-      /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
-      case 0x18: case 0x19: case 0x1A: case 0x1B:
-      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
-         return 1;
-
-      /* a 32-bit literal address: modRM + d32. */
-      case 0x05: return 5;
-
-      /* SIB, no displacement -- except that a base field naming
-         %ebp means a d32 follows the SIB byte instead of a base
-         register, giving modRM + SIB + d32. */
-      case 0x04: {
-         UChar sib     = *eip;
-         UChar base_r  = sib & 7;
-         if (base_r == R_EBP) return 6; else return 2;
-      }
-      /* SIB, with 8-bit displacement: modRM + SIB + d8.  */
-      case 0x0C: return 3;
-
-      /* SIB, with 32-bit displacement: modRM + SIB + d32.  */
-      case 0x14: return 6;
-
-      default:
-         /* Name this function in the panic, so that a failure here is
-            not mistaken for one somewhere else. */
-         VG_(panic)("lengthAMode");
-         return 0; /*notreached*/
-   }
-}
-
-
-/* Return the 'reg' field -- bits 5..3 -- of a modRM byte. */
-static __inline__ Int gregOfRM ( UChar mod_reg_rm )
-{
-   UChar reg_field = (mod_reg_rm >> 3) & 7;
-   return (Int)reg_field;
-}
-
-/* Decide whether the E operand described by a modRM byte is a
-   register rather than memory.  That is the case exactly when the
-   mod field (the top two bits) is 11, so that the byte has the form
-   11XXXYYY, where YYY is the register number. */
-static __inline__ Bool epartIsReg ( UChar mod_reg_rm )
-{
-   return ((mod_reg_rm >> 6) == 3);
-}
-
-/* Return the 'rm' field -- the low three bits -- of a modRM byte.
-   When epartIsReg holds for the byte, this is the number of the
-   register that E denotes. */
-static __inline__ Int eregOfRM ( UChar mod_reg_rm )
-{
-   Int rm_field = mod_reg_rm & 0x7;
-   return rm_field;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Disassembling common idioms                          ---*/
-/*------------------------------------------------------------*/
-
-/* Emit ucode for "xor %reg,%reg" without ever reading %reg: the
-   literal zero is moved into the caller-supplied TempReg tmp, tmp is
-   XORed with itself (so that the flags are still set from an XOR),
-   and the result is PUT to the ArchReg ge_reg.  size is the operand
-   size.  When 'dis' is set the instruction is also printed. */
-static
-void codegen_XOR_reg_with_itself ( UCodeBlock* cb, Int size, 
-                                   Int ge_reg, Int tmp )
-{
-   if (dis) 
-      VG_(printf)("xor%c %s, %s\n", nameISize(size),
-                  nameIReg(size,ge_reg), nameIReg(size,ge_reg) );
-   uInstr2(cb, MOV, size, Literal, 0, TempReg, tmp);
-   /* supplies the literal value (zero) for the MOV just emitted */
-   uLiteral(cb, 0);
-   uInstr2(cb, XOR, size, TempReg, tmp, TempReg, tmp);
-   setFlagsFromUOpcode(cb, XOR);
-   uInstr2(cb, PUT, size, TempReg, tmp, ArchReg, ge_reg);
-}
-
-
-/* Handle binary integer instructions of the form
-      op E, G  meaning
-      op reg-or-mem, reg
-   Is passed a ptr to the modRM byte (eip0), the actual operation
-   (opc), and the data size.  The result is written back to G only
-   if keep is set; the flags are set from opc either way.  t_x86opc
-   is the x86 mnemonic, used only when printing the disassembly.
-   Returns the address advanced completely over this instruction.
-
-   E(src) is reg-or-mem
-   G(dst) is reg.
-
-   If E is reg, -->    GET %G,  tmp
-                       OP %E,   tmp
-                       PUT tmp, %G
- 
-   If E is mem and OP is not reversible, 
-                -->    (getAddr E) -> tmpa
-                       LD (tmpa), tmpa
-                       GET %G, tmp2
-                       OP tmpa, tmp2
-                       PUT tmp2, %G
-
-   If E is mem and OP is reversible
-                -->    (getAddr E) -> tmpa
-                       LD (tmpa), tmpa
-                       OP %G, tmpa
-                       PUT tmpa, %G
-
-   For AND and OR the register operand is first fetched into a
-   further temporary instead of being named directly as an ArchReg.
-   NOTE(review): the reason for that is not visible in this function
-   -- confirm before merging the two arms.
-*/
-static
-Addr dis_op2_E_G ( UCodeBlock* cb, 
-                   Opcode      opc, 
-                   Bool        keep,
-                   Int         size, 
-                   Addr        eip0,
-                   Char*       t_x86opc )
-{
-   Bool  reversible;
-   UChar rm = getUChar(eip0);
-   UChar dis_buf[50];
-
-   if (epartIsReg(rm)) {
-      Int tmp = newTemp(cb);
-
-      /* Specially handle XOR reg,reg, because that doesn't really
-         depend on reg, and doing the obvious thing potentially
-         generates a spurious value check failure due to the bogus
-         dependency. */
-      if (opc == XOR && gregOfRM(rm) == eregOfRM(rm)) {
-         codegen_XOR_reg_with_itself ( cb, size, gregOfRM(rm), tmp );
-         return 1+eip0;
-      }
-
-      uInstr2(cb, GET, size, ArchReg, gregOfRM(rm), TempReg, tmp);
-      if (opc == AND || opc == OR) {
-         Int tao = newTemp(cb);
-         uInstr2(cb, GET, size, ArchReg, eregOfRM(rm), TempReg, tao); 
-         uInstr2(cb, opc, size, TempReg, tao, TempReg, tmp);
-         setFlagsFromUOpcode(cb, opc);
-      } else {
-         uInstr2(cb, opc, size, ArchReg, eregOfRM(rm), TempReg, tmp);
-         setFlagsFromUOpcode(cb, opc);
-      }
-      if (keep)
-         uInstr2(cb, PUT, size, TempReg, tmp, ArchReg, gregOfRM(rm));
-      if (dis) VG_(printf)("%s%c %s,%s\n", t_x86opc, nameISize(size), 
-                           nameIReg(size,eregOfRM(rm)),
-                           nameIReg(size,gregOfRM(rm)));
-      /* a register E is described by the modRM byte alone */
-      return 1+eip0;
-   }
-
-   /* E refers to memory */    
-   /* For these ops the operand order does not matter, so the loaded
-      value's temp can serve as the destination and no second temp
-      is needed for %G. */
-   reversible
-      = (opc == ADD || opc == OR || opc == AND || opc == XOR || opc == ADC)
-           ? True : False;
-   if (reversible) {
-      /* pair holds the amode's length in its top 8 bits and, in the
-         low 24, the TempReg which holds the address. */
-      UInt pair = disAMode ( cb, eip0, dis?dis_buf:NULL);
-      Int  tmpa = LOW24(pair);
-      /* tmpa is reused: the address is overwritten by the value */
-      uInstr2(cb, LOAD, size, TempReg, tmpa, TempReg, tmpa);
-
-      if (opc == AND || opc == OR) {
-         Int tao = newTemp(cb);
-         uInstr2(cb, GET, size, ArchReg, gregOfRM(rm), TempReg, tao); 
-         uInstr2(cb, opc, size, TempReg, tao, TempReg, tmpa);
-         setFlagsFromUOpcode(cb, opc);
-      } else {
-         uInstr2(cb, opc,  size, ArchReg, gregOfRM(rm), TempReg, tmpa);
-         setFlagsFromUOpcode(cb, opc);
-      }
-      if (keep)
-         uInstr2(cb, PUT,  size, TempReg, tmpa, ArchReg, gregOfRM(rm));
-      if (dis) VG_(printf)("%s%c %s,%s\n", t_x86opc, nameISize(size), 
-                           dis_buf,nameIReg(size,gregOfRM(rm)));
-      return HI8(pair)+eip0;
-   } else {
-      /* Operand order matters here, so %G is fetched into its own
-         temp, which is the destination of the operation. */
-      UInt pair = disAMode ( cb, eip0, dis?dis_buf:NULL);
-      Int  tmpa = LOW24(pair);
-      Int  tmp2 = newTemp(cb);
-      uInstr2(cb, LOAD, size, TempReg, tmpa, TempReg, tmpa);
-      uInstr2(cb, GET,  size, ArchReg, gregOfRM(rm), TempReg, tmp2);
-      uInstr2(cb, opc,  size, TempReg, tmpa, TempReg, tmp2);
-      setFlagsFromUOpcode(cb, opc);
-      if (keep)
-         uInstr2(cb, PUT,  size, TempReg, tmp2, ArchReg, gregOfRM(rm));
-      if (dis) VG_(printf)("%s%c %s,%s\n", t_x86opc, nameISize(size), 
-                           dis_buf,nameIReg(size,gregOfRM(rm)));
-      return HI8(pair)+eip0;
-   }
-}
-
-
-
-/* Handle binary integer instructions of the form
-      op G, E  meaning
-      op reg, reg-or-mem
-   Is passed a ptr to the modRM byte (eip0), the actual operation
-   (opc), and the data size.  The result is written back to E only
-   if keep is set; the flags are set from opc either way.  t_x86opc
-   is the x86 mnemonic, used only when printing the disassembly.
-   Returns the address advanced completely over this instruction.
-
-   G(src) is reg.
-   E(dst) is reg-or-mem
-
-   If E is reg, -->    GET %E,  tmp
-                       OP %G,   tmp
-                       PUT tmp, %E
- 
-   If E is mem, -->    (getAddr E) -> tmpa
-                       LD (tmpa), tmpv
-                       OP %G, tmpv
-                       ST tmpv, (tmpa)
-
-   For AND and OR, %G is first fetched into a further temporary
-   instead of being named directly as an ArchReg.  NOTE(review): the
-   reason for that is not visible in this function.
-*/
-static
-Addr dis_op2_G_E ( UCodeBlock* cb, 
-                   Opcode      opc, 
-                   Bool        keep,
-                   Int         size, 
-                   Addr        eip0,
-                   Char*       t_x86opc )
-{
-   UChar rm = getUChar(eip0);
-   UChar dis_buf[50];
-
-   if (epartIsReg(rm)) {
-      Int tmp = newTemp(cb);
-
-      /* Specially handle XOR reg,reg, because that doesn't really
-         depend on reg, and doing the obvious thing potentially
-         generates a spurious value check failure due to the bogus
-         dependency. */
-      if (opc == XOR && gregOfRM(rm) == eregOfRM(rm)) {
-         codegen_XOR_reg_with_itself ( cb, size, gregOfRM(rm), tmp );
-         return 1+eip0;
-      }
-
-      uInstr2(cb, GET, size, ArchReg, eregOfRM(rm), TempReg, tmp);
-
-      if (opc == AND || opc == OR) {
-         Int tao = newTemp(cb);
-         uInstr2(cb, GET, size, ArchReg, gregOfRM(rm), TempReg, tao); 
-         uInstr2(cb, opc, size, TempReg, tao, TempReg, tmp);
-         setFlagsFromUOpcode(cb, opc);
-      } else {
-         uInstr2(cb, opc, size, ArchReg, gregOfRM(rm), TempReg, tmp);
-         setFlagsFromUOpcode(cb, opc);
-      }
-      if (keep)
-         uInstr2(cb, PUT, size, TempReg, tmp, ArchReg, eregOfRM(rm));
-      if (dis) VG_(printf)("%s%c %s,%s\n", t_x86opc, nameISize(size), 
-                           nameIReg(size,gregOfRM(rm)),
-                           nameIReg(size,eregOfRM(rm)));
-      /* a register E is described by the modRM byte alone */
-      return 1+eip0;
-   }
-
-   /* E refers to memory */    
-   {
-      /* pair holds the amode's length in its top 8 bits and, in the
-         low 24, the TempReg which holds the address. */
-      UInt pair = disAMode ( cb, eip0, dis?dis_buf:NULL);
-      Int  tmpa = LOW24(pair);
-      /* The value gets its own temp, since the address in tmpa is
-         needed again for the store. */
-      Int  tmpv = newTemp(cb);
-      uInstr2(cb, LOAD,  size, TempReg, tmpa, TempReg, tmpv);
-
-      if (opc == AND || opc == OR) {
-         Int tao = newTemp(cb);
-         uInstr2(cb, GET, size, ArchReg, gregOfRM(rm), TempReg, tao); 
-         uInstr2(cb, opc, size, TempReg, tao, TempReg, tmpv);
-         setFlagsFromUOpcode(cb, opc);
-      } else {
-         uInstr2(cb, opc, size, ArchReg, gregOfRM(rm), TempReg, tmpv);
-         setFlagsFromUOpcode(cb, opc);
-      }
-      if (keep) {
-         uInstr2(cb, STORE, size, TempReg, tmpv, TempReg, tmpa);
-         /* NOTE(review): SMC_IF_ALL is defined elsewhere; presumably
-            it tags the STORE for self-modifying-code checks. */
-         SMC_IF_ALL(cb);
-      }
-      if (dis) VG_(printf)("%s%c %s,%s\n", t_x86opc, nameISize(size), 
-                           nameIReg(size,gregOfRM(rm)), dis_buf);
-      return HI8(pair)+eip0;
-   }
-}
-
-
-/* Disassemble a move of the form
-      mov E, G   that is,   mov reg-or-mem, reg
-   eip0 points at the modRM byte and size is the data size.  The
-   return value is the address just past the modRM byte and whatever
-   amode bytes follow it.
-
-   E, the source, is a register or memory; G, the destination, is
-   always a register.
-
-   E is a register:   GET %E,  tmpv
-                      PUT tmpv, %G
-
-   E is memory:       (getAddr E) -> tmpa
-                      LD (tmpa), tmpb
-                      PUT tmpb, %G
-*/
-static
-Addr dis_mov_E_G ( UCodeBlock* cb, 
-                   Int         size, 
-                   Addr        eip0 )
-{
-   UChar modrm = getUChar(eip0);
-   UChar amode_txt[50];
-
-   if (!epartIsReg(modrm)) {
-      /* The source is memory: work out the address, then load
-         through it into a fresh temp. */
-      UInt am_pair = disAMode ( cb, eip0, dis?amode_txt:NULL);
-      Int  t_addr  = LOW24(am_pair);
-      Int  t_value = newTemp(cb);
-      uInstr2(cb, LOAD, size, TempReg, t_addr,  TempReg, t_value);
-      uInstr2(cb, PUT,  size, TempReg, t_value, ArchReg, gregOfRM(modrm));
-      if (dis) 
-         VG_(printf)("mov%c %s,%s\n", nameISize(size), 
-                     amode_txt, nameIReg(size,gregOfRM(modrm)));
-      return HI8(am_pair)+eip0;
-   }
-
-   /* The source is a register, so only the modRM byte is consumed. */
-   {
-      Int t_value = newTemp(cb);
-      uInstr2(cb, GET, size, ArchReg, eregOfRM(modrm), TempReg, t_value);
-      uInstr2(cb, PUT, size, TempReg, t_value, ArchReg, gregOfRM(modrm));
-      if (dis) 
-         VG_(printf)("mov%c %s,%s\n", nameISize(size), 
-                     nameIReg(size,eregOfRM(modrm)),
-                     nameIReg(size,gregOfRM(modrm)));
-      return 1+eip0;
-   }
-}
-
-
-/* Disassemble a move of the form
-      mov G, E   that is,   mov reg, reg-or-mem
-   eip0 points at the modRM byte and size is the data size.  The
-   return value is the address just past the modRM byte and whatever
-   amode bytes follow it.
-
-   G, the source, is always a register; E, the destination, is a
-   register or memory.
-
-   E is a register:   GET %G,  tmp
-                      PUT tmp, %E
-
-   E is memory:       (getAddr E) -> tmpa
-                      GET %G, tmpv
-                      ST tmpv, (tmpa) 
-*/
-static
-Addr dis_mov_G_E ( UCodeBlock* cb, 
-                   Int         size, 
-                   Addr        eip0 )
-{
-   UChar modrm = getUChar(eip0);
-   UChar amode_txt[50];
-
-   if (!epartIsReg(modrm)) {
-      /* The destination is memory: work out the address, fetch %G
-         into a fresh temp and store it there. */
-      UInt am_pair = disAMode ( cb, eip0, dis?amode_txt:NULL);
-      Int  t_addr  = LOW24(am_pair);
-      Int  t_src   = newTemp(cb);
-      uInstr2(cb, GET,   size, ArchReg, gregOfRM(modrm), TempReg, t_src);
-      uInstr2(cb, STORE, size, TempReg, t_src, TempReg, t_addr);
-      SMC_IF_SOME(cb);
-      if (dis) 
-         VG_(printf)("mov%c %s,%s\n", nameISize(size), 
-                     nameIReg(size,gregOfRM(modrm)), amode_txt);
-      return HI8(am_pair)+eip0;
-   }
-
-   /* Register to register, so only the modRM byte is consumed. */
-   {
-      Int t_src = newTemp(cb);
-      uInstr2(cb, GET, size, ArchReg, gregOfRM(modrm), TempReg, t_src);
-      uInstr2(cb, PUT, size, TempReg, t_src, ArchReg, eregOfRM(modrm));
-      if (dis) 
-         VG_(printf)("mov%c %s,%s\n", nameISize(size), 
-                     nameIReg(size,gregOfRM(modrm)),
-                     nameIReg(size,eregOfRM(modrm)));
-      return 1+eip0;
-   }
-}
-
-
-/* Handle  op $immediate, AL/AX/EAX.  eip points at the immediate,
-   which occupies 'size' bytes of the insn stream; the address just
-   past it is returned.  The result is written back to the
-   accumulator only if keep is set, and t_x86opc is the mnemonic
-   shown when printing the disassembly. */
-static
-Addr dis_op_imm_A ( UCodeBlock* cb, 
-                    Int         size,
-                    Opcode      opc,
-                    Bool        keep,
-                    Addr        eip,
-                    Char*       t_x86opc )
-{
-   Int  t_acc = newTemp(cb);
-   UInt imm   = getUDisp(size,eip);
-
-   uInstr2(cb, GET, size, ArchReg, R_EAX, TempReg, t_acc);
-
-   if (opc != AND && opc != OR) {
-      /* The literal is attached directly to the operation. */
-      uInstr2(cb, opc, size, Literal, 0, TempReg, t_acc);
-      uLiteral(cb, imm);
-   } else {
-      /* AND/OR: put the literal into a temp of its own first. */
-      Int t_imm = newTemp(cb);
-      uInstr2(cb, MOV, size, Literal, 0, TempReg, t_imm);
-      uLiteral(cb, imm);
-      uInstr2(cb, opc, size, TempReg, t_imm, TempReg, t_acc);
-   }
-   /* Either way, the operation is the uinstr emitted last. */
-   setFlagsFromUOpcode(cb, opc);
-
-   if (keep) {
-      uInstr2(cb, PUT, size, TempReg, t_acc, ArchReg, R_EAX);
-   }
-   if (dis) {
-      VG_(printf)("%s%c $0x%x, %s\n", t_x86opc, nameISize(size), 
-                  imm, nameIReg(size,R_EAX));
-   }
-   return eip+size;
-}
-
-
-/* Sign- and zero-extending moves from E (reg-or-mem, of size szs)
-   into the register G (of size szd).  eip points at the modRM byte;
-   the address just past the modRM byte and any amode bytes is
-   returned.  sign_extend selects sign- rather than zero-extension. */
-static
-Addr dis_movx_E_G ( UCodeBlock* cb, 
-                    Addr eip, Int szs, Int szd, Bool sign_extend )
-{
-   UChar amode_txt[50];
-   UChar modrm = getUChar(eip);
-
-   if (!epartIsReg(modrm)) {
-      /* E refers to memory.  The temp holding the address is reused
-         for the loaded, and then widened, value. */
-      UInt am_pair = disAMode ( cb, eip, dis?amode_txt:NULL);
-      Int  t_val   = LOW24(am_pair);
-      uInstr2(cb, LOAD, szs, TempReg, t_val, TempReg, t_val);
-      uInstr1(cb, WIDEN, szd, TempReg, t_val);
-      LAST_UINSTR(cb).extra4b = szs;
-      LAST_UINSTR(cb).signed_widen = sign_extend;
-      uInstr2(cb, PUT, szd, TempReg, t_val, ArchReg, gregOfRM(modrm));
-      if (dis) 
-         VG_(printf)("mov%c%c%c %s,%s\n", 
-                     sign_extend ? 's' : 'z',
-                     nameISize(szs), nameISize(szd),
-                     amode_txt,
-                     nameIReg(szd,gregOfRM(modrm)));
-      return HI8(am_pair)+eip;
-   }
-
-   /* E is a register, so only the modRM byte is consumed. */
-   {
-      Int t_val = newTemp(cb);
-      uInstr2(cb, GET, szs, ArchReg, eregOfRM(modrm), TempReg, t_val);
-      uInstr1(cb, WIDEN, szd, TempReg, t_val);
-      /* the WIDEN just emitted records the source size and the
-         kind of extension wanted */
-      LAST_UINSTR(cb).extra4b = szs;
-      LAST_UINSTR(cb).signed_widen = sign_extend;
-      uInstr2(cb, PUT, szd, TempReg, t_val, ArchReg, gregOfRM(modrm));
-      if (dis) 
-         VG_(printf)("mov%c%c%c %s,%s\n", 
-                     sign_extend ? 's' : 'z',
-                     nameISize(szs), nameISize(szd),
-                     nameIReg(szs,eregOfRM(modrm)),
-                     nameIReg(szd,gregOfRM(modrm)));
-      return 1+eip;
-   }
-}
-
-
-/* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
-   16 / 8 bit quantity in the given TempReg.  
-
-   sz is the size of the divisor (4, 2 or 1; anything else panics), t
-   is the TempReg holding it, and signed_divide selects the idiv
-   helper rather than the div one.  The division itself is done by an
-   out-of-line helper called via CALLM: operands are PUSHed, the
-   helper is called, and the results are POPped back into the
-   ArchRegs.  Note that t is overwritten by the POPs.  */
-static
-void codegen_div ( UCodeBlock* cb, Int sz, Int t, Bool signed_divide )
-{
-   Int  helper;
-   Int  ta = newTemp(cb);
-   Int  td = newTemp(cb);
-
-   /* Pick the helper matching the operand size and signedness. */
-   switch (sz) {
-      case 4: helper = (signed_divide ? VGOFF_(helper_idiv_64_32) 
-                                      : VGOFF_(helper_div_64_32));
-              break;
-      case 2: helper = (signed_divide ? VGOFF_(helper_idiv_32_16) 
-                                      : VGOFF_(helper_div_32_16));
-              break;
-      case 1: helper = (signed_divide ? VGOFF_(helper_idiv_16_8)
-                                      : VGOFF_(helper_div_16_8));
-              break;
-      default: VG_(panic)("codegen_div");
-   }
-   uInstr0(cb, CALLM_S, 0);
-   if (sz == 4 || sz == 2) {
-      /* Push divisor, then (E)AX, then (E)DX; afterwards pop into
-         (E)DX first and (E)AX second.  NOTE(review): that the final
-         CLEAR discards the slot still left from the three pushes is
-         an assumption -- the helpers are not visible here. */
-      uInstr1(cb, PUSH,  sz, TempReg, t);
-      uInstr2(cb, GET,   sz, ArchReg, R_EAX,  TempReg, ta);
-      uInstr1(cb, PUSH,  sz, TempReg, ta);
-      uInstr2(cb, GET,   sz, ArchReg, R_EDX,  TempReg, td);
-      uInstr1(cb, PUSH,  sz, TempReg, td);
-      uInstr1(cb, CALLM,  0, Lit16,   helper);
-      uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsOSZACP);
-      uInstr1(cb, POP,   sz, TempReg, t);
-      uInstr2(cb, PUT,   sz, TempReg, t,      ArchReg, R_EDX);
-      uInstr1(cb, POP,   sz, TempReg, t);
-      uInstr2(cb, PUT,   sz, TempReg, t,      ArchReg, R_EAX);
-      uInstr1(cb, CLEAR,  0, Lit16,   4);
-   } else {
-      /* 8-bit divide: the dividend is all of AX, and a literal zero
-         is pushed in the place where the wider forms push (E)DX.  The
-         two values popped go to AL and then AH. */
-      uInstr1(cb, PUSH,  1, TempReg, t);
-      uInstr2(cb, GET,   2, ArchReg, R_EAX,  TempReg, ta);
-      uInstr1(cb, PUSH,  2, TempReg, ta);
-      uInstr2(cb, MOV,   1, Literal, 0,      TempReg, td);
-      uLiteral(cb, 0);
-      uInstr1(cb, PUSH,  1, TempReg, td);
-      uInstr1(cb, CALLM, 0, Lit16,   helper);
-      uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsOSZACP);
-      uInstr1(cb, POP,   1, TempReg, t);
-      uInstr2(cb, PUT,   1, TempReg, t,      ArchReg, R_AL);
-      uInstr1(cb, POP,   1, TempReg, t);
-      uInstr2(cb, PUT,   1, TempReg, t,      ArchReg, R_AH);
-      uInstr1(cb, CLEAR, 0, Lit16,   4);
-   }
-   uInstr0(cb, CALLM_E, 0);
-}
-
-
-/* Group 1 extended opcodes:  op $imm, E  where the operation is
-   chosen by the reg field of the modRM byte (0 .. 7 = ADD, OR, ADC,
-   SBB, AND, SUB, XOR, and SUB again).  The last differs from plain
-   SUB (5) only in that the result is not written back to E.
-
-   eip points at the modRM byte, which is also passed as modrm.
-   am_sz is the length of the amode, which must be 1 when E is a
-   register; d_sz is the number of insn-stream bytes taken by the
-   immediate; sz is the operand size; d32 is the immediate's value.
-   Returns the address just past the amode and the immediate. */
-static 
-Addr dis_Grp1 ( UCodeBlock* cb, Addr eip, UChar modrm, 
-                Int am_sz, Int d_sz, Int sz, UInt d32 )
-{
-   Int   t1, t2, uopc;
-   UInt  pair;
-   UChar dis_buf[50];
-   if (epartIsReg(modrm)) {
-      vg_assert(am_sz == 1);
-      t1  = newTemp(cb);
-      uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-      switch (gregOfRM(modrm)) {
-         case 0: uopc = ADD; break;  case 1: uopc = OR;  break;
-         case 2: uopc = ADC; break;  case 3: uopc = SBB; break;
-         case 4: uopc = AND; break;  case 5: uopc = SUB; break;
-         case 6: uopc = XOR; break;  case 7: uopc = SUB; break;
-         default: VG_(panic)("dis_Grp1(Reg): unhandled case");
-      }
-      /* AND and OR take the immediate via a temp; everything else
-         has it attached to the operation as a literal. */
-      if (uopc == AND || uopc == OR) {
-         Int tao = newTemp(cb);
-         uInstr2(cb, MOV, sz, Literal, 0, TempReg, tao);
-         uLiteral(cb, d32);
-         uInstr2(cb, uopc, sz, TempReg, tao, TempReg, t1);
-         setFlagsFromUOpcode(cb, uopc);
-      } else {
-         uInstr2(cb, uopc, sz, Literal, 0, TempReg, t1);
-         uLiteral(cb, d32);
-         setFlagsFromUOpcode(cb, uopc);
-      }
-      /* case 7 only sets the flags; the register is left alone */
-      if (gregOfRM(modrm) < 7)
-         uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, eregOfRM(modrm));
-      eip += (am_sz + d_sz);
-      if (dis)
-         VG_(printf)("%s%c $0x%x, %s\n",
-                     nameGrp1(gregOfRM(modrm)), nameISize(sz), d32, 
-                     nameIReg(sz,eregOfRM(modrm)));
-   } else {
-      /* E is memory: t1 holds the address and t2 the value.  The
-         amode length comes from disAMode here, not from am_sz. */
-      pair = disAMode ( cb, eip, dis?dis_buf:NULL);
-      t1   = LOW24(pair);
-      t2   = newTemp(cb);
-      eip  += HI8(pair);
-      eip  += d_sz;
-      uInstr2(cb, LOAD, sz, TempReg, t1, TempReg, t2);
-      switch (gregOfRM(modrm)) {
-         case 0: uopc = ADD; break;  case 1: uopc = OR;  break;
-         case 2: uopc = ADC; break;  case 3: uopc = SBB; break;
-         case 4: uopc = AND; break;  case 5: uopc = SUB; break;
-         case 6: uopc = XOR; break;  case 7: uopc = SUB; break;
-         default: VG_(panic)("dis_Grp1(Mem): unhandled case");
-      }
-      if (uopc == AND || uopc == OR) {
-         Int tao = newTemp(cb);
-         uInstr2(cb, MOV, sz, Literal, 0, TempReg, tao);
-         uLiteral(cb, d32);
-         uInstr2(cb, uopc, sz, TempReg, tao, TempReg, t2);
-         setFlagsFromUOpcode(cb, uopc);
-      } else {
-         uInstr2(cb, uopc, sz, Literal, 0, TempReg, t2);
-         uLiteral(cb, d32);
-         setFlagsFromUOpcode(cb, uopc);
-      }
-      /* case 7 only sets the flags; memory is left alone */
-      if (gregOfRM(modrm) < 7) {
-         uInstr2(cb, STORE, sz, TempReg, t2, TempReg, t1);
-         SMC_IF_ALL(cb);
-      }
-      if (dis)
-         VG_(printf)("%s%c $0x%x, %s\n",
-                     nameGrp1(gregOfRM(modrm)), nameISize(sz), d32, 
-                     dis_buf);
-   }
-   return eip;
-}
-
-
-/* Group 2 extended opcodes: shifts and rotates of E, where the
-   operation is chosen by the reg field of the modRM byte (0 .. 5 =
-   ROL, ROR, RCL, RCR, SHL, SHR, and 7 = SAR; 6 is not handled and
-   panics).
-
-   eip points at the modRM byte, which is also passed as modrm.
-   am_sz is the length of the amode, which must be 1 when E is a
-   register; d_sz is the number of insn-stream bytes taken by the
-   shift amount, if any; sz is the operand size.  orig_src_tag and
-   orig_src_val give the shift amount, as described below.  Returns
-   the address just past the amode and the d_sz bytes after it. */
-static
-Addr dis_Grp2 ( UCodeBlock* cb, Addr eip, UChar modrm,
-                Int am_sz, Int d_sz, Int sz, 
-                Tag orig_src_tag, UInt orig_src_val )
-{
-   /* orig_src_tag and orig_src_val denote either ArchReg(%CL) or a
-      Literal.  And eip on entry points at the modrm byte. */
-   Int   t1, t2, uopc;
-   UInt  pair;
-   UChar dis_buf[50];
-   UInt  src_val;
-   Tag   src_tag;
-
-   /* Get the amount to be shifted by into src_tag/src_val.  An
-      ArchReg amount is first fetched into a temp. */
-   if (orig_src_tag == ArchReg) {
-      src_val = newTemp(cb);
-      src_tag = TempReg;
-      uInstr2(cb, GET, 1, orig_src_tag, orig_src_val, TempReg, src_val);
-   } else {
-      src_val = orig_src_val;
-      src_tag = Literal;
-   }
-
-   if (epartIsReg(modrm)) {
-      vg_assert(am_sz == 1);
-      t1  = newTemp(cb);
-      uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-      switch (gregOfRM(modrm)) {
-         case 0: uopc = ROL; break;  case 1: uopc = ROR; break;
-         case 2: uopc = RCL; break;  case 3: uopc = RCR; break;
-         case 4: uopc = SHL; break;  case 5: uopc = SHR; break;
-         case 7: uopc = SAR; break;
-         default: VG_(panic)("dis_Grp2(Reg): unhandled case");
-      }
-      if (src_tag == Literal) {
-         uInstr2(cb, uopc, sz, Literal, 0, TempReg, t1);
-         uLiteral(cb, src_val);
-      } else {
-         uInstr2(cb, uopc, sz, src_tag, src_val, TempReg, t1);
-      }
-      setFlagsFromUOpcode(cb, uopc);
-      uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, eregOfRM(modrm));
-      eip += (am_sz + d_sz);
-      if (dis) {
-         if (orig_src_tag == Literal)
-            VG_(printf)("%s%c $0x%x, %s\n",
-                        nameGrp2(gregOfRM(modrm)), nameISize(sz), 
-                        orig_src_val, nameIReg(sz,eregOfRM(modrm)));
-         else
-            VG_(printf)("%s%c %s, %s\n",
-                        nameGrp2(gregOfRM(modrm)), nameISize(sz),
-                        nameIReg(1,orig_src_val),
-                        nameIReg(sz,eregOfRM(modrm)));
-      }
-   } else {
-      /* E is memory: t1 holds the address and t2 the value.  The
-         amode length comes from disAMode here, not from am_sz. */
-      pair = disAMode ( cb, eip, dis?dis_buf:NULL);
-      t1   = LOW24(pair);
-      t2   = newTemp(cb);
-      eip  += HI8(pair);
-      eip  += d_sz;
-      uInstr2(cb, LOAD, sz, TempReg, t1, TempReg, t2);
-      switch (gregOfRM(modrm)) {
-         case 0: uopc = ROL; break;  case 1: uopc = ROR; break;
-         case 2: uopc = RCL; break;  case 3: uopc = RCR; break;
-         case 4: uopc = SHL; break;  case 5: uopc = SHR; break;
-         case 7: uopc = SAR; break;
-         /* Say (Mem), so that this panic can be told apart from the
-            one in the register arm above. */
-         default: VG_(panic)("dis_Grp2(Mem): unhandled case");
-      }
-      if (src_tag == Literal) {
-         uInstr2(cb, uopc, sz, Literal, 0, TempReg, t2);
-         uLiteral(cb, src_val);
-      } else {
-         uInstr2(cb, uopc, sz, src_tag, src_val, TempReg, t2);
-      }
-      setFlagsFromUOpcode(cb, uopc);
-      uInstr2(cb, STORE, sz, TempReg, t2, TempReg, t1);
-      SMC_IF_ALL(cb);
-      if (dis) {
-         if (orig_src_tag == Literal)
-            VG_(printf)("%s%c $0x%x, %s\n",
-                        nameGrp2(gregOfRM(modrm)), nameISize(sz), 
-                        orig_src_val, dis_buf);
-         else 
-            VG_(printf)("%s%c %s, %s\n",
-                        nameGrp2(gregOfRM(modrm)), nameISize(sz), 
-                        nameIReg(1,orig_src_val),
-                        dis_buf);
-      }
-   }
-   return eip;
-}
-
-
-
-/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only).
-
-   Handles  op $d8, E  where the reg field of the modRM byte selects
-   the operation: 4 = BT, 5 = BTS, 6 = BTR, 7 = BTC.  Any other value
-   panics.  eip points at the modRM byte, which is also passed as
-   modrm; am_sz is the amode length, which must be 1 when E is a
-   register; sz is the operand size (2 or 4); src_val is the bit
-   offset.  Returns the address just past the amode and the one-byte
-   immediate.
-
-   The macro below emits the code shared by the register and memory
-   forms.  It expects t2, t_fetched, t_mask, v_mask, src_val, sz,
-   modrm and cb to be in scope at the point of use.
-   NOTE(review): its first MOV copies t2 with size 4 even when sz is
-   2, whereas all its other uinstrs use sz -- confirm that is
-   intended. */
-static
-Addr dis_Grp8_BT ( UCodeBlock* cb, Addr eip, UChar modrm,
-                   Int am_sz, Int sz, UInt src_val )
-{
-#  define MODIFY_t2_AND_SET_CARRY_FLAG					\
-      /* t2 is the value to be op'd on.  Copy to t_fetched, then	\
-         modify t2, if non-BT. */					\
-      uInstr2(cb, MOV,   4,  TempReg, t2, TempReg, t_fetched);		\
-      uInstr2(cb, MOV,  sz,  Literal, 0,  TempReg, t_mask);		\
-      uLiteral(cb, v_mask);						\
-      switch (gregOfRM(modrm)) {					\
-         case 4: /* BT */  break;					\
-         case 5: /* BTS */ 						\
-            uInstr2(cb, OR, sz, TempReg, t_mask, TempReg, t2); break;	\
-         case 6: /* BTR */						\
-            uInstr2(cb, AND, sz, TempReg, t_mask, TempReg, t2); break;	\
-         case 7: /* BTC */ 						\
-            uInstr2(cb, XOR, sz, TempReg, t_mask, TempReg, t2); break;	\
-      }									\
-      /* Copy relevant bit from t_fetched into carry flag. */		\
-      uInstr2(cb, SHR, sz, Literal, 0, TempReg, t_fetched);		\
-      uLiteral(cb, src_val);						\
-      uInstr2(cb, MOV, sz, Literal, 0, TempReg, t_mask);		\
-      uLiteral(cb, 1);							\
-      uInstr2(cb, AND, sz, TempReg, t_mask, TempReg, t_fetched);	\
-      uInstr1(cb, NEG, sz, TempReg, t_fetched);				\
-      setFlagsFromUOpcode(cb, NEG);
-
-
-   /* src_val denotes a d8.
-      And eip on entry points at the modrm byte. */
-   Int   t1, t2, t_fetched, t_mask;
-   UInt  pair;
-   UChar dis_buf[50];
-   UInt  v_mask;
-
-   /* There is no 1-byte form of this instruction, AFAICS. */
-   vg_assert(sz == 2 || sz == 4);
-
-   /* Limit src_val -- the bit offset -- to something within a word.
-      The Intel docs say that literal offsets larger than a word are
-      masked in this way. */
-   switch (sz) {
-      case 2: src_val &= 15; break;
-      case 4: src_val &= 31; break;
-      default: VG_(panic)("dis_Grp8_BT: invalid size");
-   }
-
-   /* Invent a mask suitable for the operation. */
-   /* BTS ORs and BTC XORs the single bit in; BTR ANDs with its
-      complement.  BT never applies its mask, so zero will do. */
-
-   switch (gregOfRM(modrm)) {
-      case 4: /* BT */  v_mask = 0; break;
-      case 5: /* BTS */ v_mask = 1 << src_val; break;
-      case 6: /* BTR */ v_mask = ~(1 << src_val); break;
-      case 7: /* BTC */ v_mask = 1 << src_val; break;
-         /* If this needs to be extended, probably simplest to make a
-            new function to handle the other cases (0 .. 3).  The
-            Intel docs do however not indicate any use for 0 .. 3, so
-            we don't expect this to happen. */
-      default: VG_(panic)("dis_Grp8_BT");
-   }
-   /* Probably excessively paranoid. */
-   if (sz == 2)
-      v_mask &= 0x0000FFFF;
-
-   /* t1 is only given a real value in the memory case below. */
-   t1        = INVALID_TEMPREG;
-   t_fetched = newTemp(cb);
-   t_mask    = newTemp(cb);
-
-   if (epartIsReg(modrm)) {
-      vg_assert(am_sz == 1);
-      t2 = newTemp(cb);
-
-      /* Fetch the value to be tested and modified. */
-      uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t2);
-      /* Do it! */
-      MODIFY_t2_AND_SET_CARRY_FLAG;
-      /* Dump the result back, if non-BT. */
-      if (gregOfRM(modrm) != 4 /* BT */)
-         uInstr2(cb, PUT, sz, TempReg, t2, ArchReg, eregOfRM(modrm));
-
-      /* skip the amode and the d8 */
-      eip += (am_sz + 1);
-      if (dis)
-         VG_(printf)("%s%c $0x%x, %s\n",
-                     nameGrp8(gregOfRM(modrm)), nameISize(sz),
-                     src_val,
-                     nameIReg(sz,eregOfRM(modrm)));
-   } else {
-      /* E is memory: t1 holds the address and t2 the value.  The
-         amode length comes from disAMode here, not from am_sz. */
-      pair = disAMode ( cb, eip, dis?dis_buf:NULL);
-      t1   = LOW24(pair);
-      t2   = newTemp(cb);
-      eip  += HI8(pair);
-      /* ... and skip the d8 */
-      eip  += 1;
-
-      /* Fetch the value to be tested and modified. */
-      uInstr2(cb, LOAD,  sz, TempReg, t1, TempReg, t2);
-      /* Do it! */
-      MODIFY_t2_AND_SET_CARRY_FLAG;
-      /* Dump the result back, if non-BT. */
-      if (gregOfRM(modrm) != 4 /* BT */) {
-         uInstr2(cb, STORE, sz, TempReg, t2, TempReg, t1);
-         SMC_IF_ALL(cb);
-      }
-      if (dis)
-            VG_(printf)("%s%c $0x%x, %s\n",
-                        nameGrp8(gregOfRM(modrm)), nameISize(sz), src_val, 
-                        dis_buf);
-   }
-   return eip;
-
-#  undef MODIFY_t2_AND_SET_CARRY_FLAG
-}
-
-
-
-/* Emit ucode for a one-operand MUL/IMUL whose explicit operand is the
-   register named by the ereg field of modrm.  The implicit operand
-   is fetched from R_EAX at width sz.  For sz > 1 two sz-sized values
-   are popped after the helper call and written to R_EDX and then
-   R_EAX; otherwise a single 2-byte value is popped and written to
-   R_EAX (i.e. AX).  signed_multiply selects the imul helpers rather
-   than the mul ones.  If dis is set, the insn is also printed. */
-static void codegen_mul_A_D_Reg ( UCodeBlock* cb, Int sz,
-                                  UChar modrm, Bool signed_multiply )
-{
-   Int helper, t_src, t_acc;
-
-   /* Choose the helper by width; any sz other than 1 or 2 gets the
-      32_64 variant. */
-   if (signed_multiply) {
-      if (sz == 1)
-         helper = VGOFF_(helper_imul_8_16);
-      else if (sz == 2)
-         helper = VGOFF_(helper_imul_16_32);
-      else
-         helper = VGOFF_(helper_imul_32_64);
-   } else {
-      if (sz == 1)
-         helper = VGOFF_(helper_mul_8_16);
-      else if (sz == 2)
-         helper = VGOFF_(helper_mul_16_32);
-      else
-         helper = VGOFF_(helper_mul_32_64);
-   }
-
-   t_src = newTemp(cb);
-   t_acc = newTemp(cb);
-
-   /* Push the explicit operand first, then the accumulator. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, GET,   sz, ArchReg, eregOfRM(modrm), TempReg, t_src);
-   uInstr1(cb, PUSH,  sz, TempReg, t_src);
-   uInstr2(cb, GET,   sz, ArchReg, R_EAX, TempReg, t_acc);
-   uInstr1(cb, PUSH,  sz, TempReg, t_acc);
-   uInstr1(cb, CALLM, 0,  Lit16,   helper);
-   uFlagsRWU(cb, FlagsEmpty, FlagsOC, FlagsSZAP);
-
-   if (sz <= 1) {
-      /* Byte multiply: CLEAR 4, then one 2-byte result for AX. */
-      uInstr1(cb, CLEAR, 0, Lit16,   4);
-      uInstr1(cb, POP,   2, TempReg, t_src);
-      uInstr2(cb, PUT,   2, TempReg, t_src, ArchReg, R_EAX);
-   } else {
-      /* First value popped goes to EDX, the second to EAX. */
-      uInstr1(cb, POP, sz, TempReg, t_src);
-      uInstr2(cb, PUT, sz, TempReg, t_src, ArchReg, R_EDX);
-      uInstr1(cb, POP, sz, TempReg, t_src);
-      uInstr2(cb, PUT, sz, TempReg, t_src, ArchReg, R_EAX);
-   }
-   uInstr0(cb, CALLM_E, 0);
-
-   if (dis)
-      VG_(printf)("%s%c %s\n", signed_multiply ? "imul" : "mul",
-                  nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
-}
-
-
-/* Emit ucode for a one-operand MUL/IMUL whose explicit operand is
-   already held in TempReg temp.  The implicit operand is fetched
-   from R_EAX at width sz.  For sz > 1 two sz-sized values are popped
-   after the helper call and written to R_EDX and then R_EAX;
-   otherwise a single 2-byte value is popped and written to R_EAX
-   (i.e. AX).  signed_multiply selects the imul helpers.  dis_buf is
-   only read when dis is set, to print the operand. */
-static void codegen_mul_A_D_Temp ( UCodeBlock* cb, Int sz,
-                                   Int temp, Bool signed_multiply,
-                                   UChar* dis_buf )
-{
-   Int helper, t_res;
-
-   /* Choose the helper by width; any sz other than 1 or 2 gets the
-      32_64 variant. */
-   if (signed_multiply) {
-      if (sz == 1)
-         helper = VGOFF_(helper_imul_8_16);
-      else if (sz == 2)
-         helper = VGOFF_(helper_imul_16_32);
-      else
-         helper = VGOFF_(helper_imul_32_64);
-   } else {
-      if (sz == 1)
-         helper = VGOFF_(helper_mul_8_16);
-      else if (sz == 2)
-         helper = VGOFF_(helper_mul_16_32);
-      else
-         helper = VGOFF_(helper_mul_32_64);
-   }
-
-   t_res = newTemp(cb);
-
-   /* Push the caller's operand first, then the accumulator. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr1(cb, PUSH,  sz, TempReg, temp);
-   uInstr2(cb, GET,   sz, ArchReg, R_EAX,  TempReg, t_res);
-   uInstr1(cb, PUSH,  sz, TempReg, t_res);
-   uInstr1(cb, CALLM, 0,  Lit16,   helper);
-   uFlagsRWU(cb, FlagsEmpty, FlagsOC, FlagsSZAP);
-
-   if (sz <= 1) {
-      /* Byte multiply: CLEAR 4, then one 2-byte result for AX. */
-      uInstr1(cb, CLEAR, 0, Lit16,   4);
-      uInstr1(cb, POP,   2, TempReg, t_res);
-      uInstr2(cb, PUT,   2, TempReg, t_res, ArchReg, R_EAX);
-   } else {
-      /* First value popped goes to EDX, the second to EAX. */
-      uInstr1(cb, POP, sz, TempReg, t_res);
-      uInstr2(cb, PUT, sz, TempReg, t_res, ArchReg, R_EDX);
-      uInstr1(cb, POP, sz, TempReg, t_res);
-      uInstr2(cb, PUT, sz, TempReg, t_res, ArchReg, R_EAX);
-   }
-   uInstr0(cb, CALLM_E, 0);
-
-   if (dis)
-      VG_(printf)("%s%c %s\n", signed_multiply ? "imul" : "mul",
-                  nameISize(sz), dis_buf);
-}
-
-
-/* Group 3 extended opcodes.  On entry eip points at the modR/M byte.
-   The greg field of that byte selects the operation: 0 TEST (followed
-   by an sz-sized immediate), 2 NOT, 3 NEG, 4 MUL, 5 IMUL, 6 DIV and
-   7 IDIV; any other value prints a message and panics.  The operand
-   is the register named by the ereg field or, failing that, the
-   memory location whose address disAMode leaves in a temp.  NOT and
-   NEG write their result back to the operand; TEST only sets flags.
-   Returns eip advanced past the bytes consumed here: 1 for a
-   register operand or HI8 of the disAMode result for a memory
-   operand, plus sz more for TEST's immediate. */
-static 
-Addr dis_Grp3 ( UCodeBlock* cb, Int sz, Addr eip )
-{
-   Int   t1, t2;
-   UInt  pair, d32;
-   UChar modrm;
-   UChar dis_buf[50];
-   t1 = t2 = INVALID_TEMPREG;
-   modrm = getUChar(eip);
-   if (epartIsReg(modrm)) {
-      t1 = newTemp(cb);
-      switch (gregOfRM(modrm)) {
-         case 0: { /* TEST */
-            Int tao = newTemp(cb);
-            eip++; d32 = getUDisp(sz, eip); eip += sz; /* step past modR/M, then read and skip the immediate */
-            uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-	    uInstr2(cb, MOV, sz, Literal, 0, TempReg, tao);
-	    uLiteral(cb, d32);
-            uInstr2(cb, AND, sz, TempReg, tao, TempReg, t1);
-            setFlagsFromUOpcode(cb, AND); /* no PUT follows: TEST keeps only the flags */
-            if (dis)
-               VG_(printf)("test%c $0x%x, %s\n",
-                   nameISize(sz), d32, nameIReg(sz, eregOfRM(modrm)));
-            break;
-         }
-         case 2: /* NOT */
-            eip++;
-            uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-            uInstr1(cb, NOT, sz, TempReg, t1);
-            setFlagsFromUOpcode(cb, NOT);
-            uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, eregOfRM(modrm));
-            if (dis)
-               VG_(printf)("not%c %s\n",
-                   nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
-            break;
-         case 3: /* NEG */
-            eip++;
-            uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-            uInstr1(cb, NEG, sz, TempReg, t1);
-            setFlagsFromUOpcode(cb, NEG);
-            uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, eregOfRM(modrm));
-            if (dis)
-               VG_(printf)("neg%c %s\n",
-                   nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
-            break;
-         case 4: /* MUL */
-            eip++;
-            codegen_mul_A_D_Reg ( cb, sz, modrm, False ); /* t1 unused: the callee GETs the register itself */
-            break;
-         case 5: /* IMUL */
-            eip++;
-            codegen_mul_A_D_Reg ( cb, sz, modrm, True ); /* t1 unused here as well */
-            break;
-         case 6: /* DIV */
-            eip++;
-            uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-            codegen_div ( cb, sz, t1, False );
-            if (dis)
-               VG_(printf)("div%c %s\n", nameISize(sz), 
-                           nameIReg(sz, eregOfRM(modrm)));
-            break;
-         case 7: /* IDIV */
-            eip++;
-            uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-            codegen_div ( cb, sz, t1, True );
-            if (dis)
-               VG_(printf)("idiv%c %s\n", nameISize(sz), 
-                           nameIReg(sz, eregOfRM(modrm)));
-            break;
-         default: 
-            VG_(printf)(
-               "unhandled Grp3(R) case %d\n", (UInt)gregOfRM(modrm));
-            VG_(panic)("Grp3");
-      }
-   } else {
-      pair = disAMode ( cb, eip, dis?dis_buf:NULL );
-      t2   = LOW24(pair); /* temp holding the operand's address */
-      t1   = newTemp(cb);
-      eip  += HI8(pair); /* presumably the length of the addressing-mode bytes */
-      uInstr2(cb, LOAD, sz, TempReg, t2, TempReg, t1); /* operand is loaded once, ahead of the dispatch */
-      switch (gregOfRM(modrm)) {
-         case 0: { /* TEST */
-            Int tao = newTemp(cb);
-            d32 = getUDisp(sz, eip); eip += sz; /* immediate follows the addressing mode */
-            uInstr2(cb, MOV, sz, Literal, 0, TempReg, tao);
-            uLiteral(cb, d32);
-            uInstr2(cb, AND, sz, TempReg, tao, TempReg, t1);
-            setFlagsFromUOpcode(cb, AND); /* no STORE follows: TEST keeps only the flags */
-            if (dis)
-               VG_(printf)("test%c $0x%x, %s\n", 
-                           nameISize(sz), d32, dis_buf);
-            break;
-         }
-         case 2: /* NOT */
-            uInstr1(cb, NOT, sz, TempReg, t1);
-            setFlagsFromUOpcode(cb, NOT);
-            uInstr2(cb, STORE, sz, TempReg, t1, TempReg, t2);
-            SMC_IF_ALL(cb);
-            if (dis)
-               VG_(printf)("not%c %s\n", nameISize(sz), dis_buf);
-            break;
-         case 3: /* NEG */
-            uInstr1(cb, NEG, sz, TempReg, t1);
-            setFlagsFromUOpcode(cb, NEG);
-            uInstr2(cb, STORE, sz, TempReg, t1, TempReg, t2);
-            SMC_IF_ALL(cb);
-            if (dis)
-               VG_(printf)("neg%c %s\n", nameISize(sz), dis_buf);
-            break;
-         case 4: /* MUL */
-            codegen_mul_A_D_Temp ( cb, sz, t1, False, 
-                                   dis?dis_buf:NULL );
-            break;
-         case 5: /* IMUL */
-            codegen_mul_A_D_Temp ( cb, sz, t1, True, dis?dis_buf:NULL );
-            break;
-         case 6: /* DIV */
-            codegen_div ( cb, sz, t1, False );
-            if (dis)
-               VG_(printf)("div%c %s\n", nameISize(sz), dis_buf);
-            break;
-         case 7: /* IDIV */
-            codegen_div ( cb, sz, t1, True );
-            if (dis)
-               VG_(printf)("idiv%c %s\n", nameISize(sz), dis_buf);
-            break;
-         default: 
-            VG_(printf)(
-               "unhandled Grp3(M) case %d\n", (UInt)gregOfRM(modrm));
-            VG_(panic)("Grp3");
-      }
-   }
-   return eip;
-}
-
-
-/* Group 4 extended opcodes: byte-sized INC (greg field 0) and DEC
-   (greg field 1) applied to the operand named by the modR/M byte at
-   eip.  Any other greg value prints a message and panics.  The
-   result is written back to the register or memory operand.
-   Returns eip + 1 for a register operand, or eip plus HI8 of the
-   disAMode result for a memory operand. */
-static
-Addr dis_Grp4 ( UCodeBlock* cb, Addr eip )
-{
-   UChar dis_buf[50];
-   UChar modrm;
-   UInt  amode;
-   Int   t_val  = INVALID_TEMPREG;
-   Int   t_addr = INVALID_TEMPREG;
-
-   modrm = getUChar(eip);
-
-   if (!epartIsReg(modrm)) {
-      /* Memory operand: load it, modify it, store it back. */
-      amode  = disAMode ( cb, eip, dis?dis_buf:NULL );
-      t_addr = LOW24(amode);
-      t_val  = newTemp(cb);
-      uInstr2(cb, LOAD, 1, TempReg, t_addr, TempReg, t_val);
-      switch (gregOfRM(modrm)) {
-         case 0: /* INC */
-            uInstr1(cb, INC, 1, TempReg, t_val);
-            setFlagsFromUOpcode(cb, INC);
-            uInstr2(cb, STORE, 1, TempReg, t_val, TempReg, t_addr);
-            SMC_IF_ALL(cb);
-            break;
-         case 1: /* DEC */
-            uInstr1(cb, DEC, 1, TempReg, t_val);
-            setFlagsFromUOpcode(cb, DEC);
-            uInstr2(cb, STORE, 1, TempReg, t_val, TempReg, t_addr);
-            SMC_IF_ALL(cb);
-            break;
-         default:
-            VG_(printf)(
-               "unhandled Grp4(M) case %d\n", (UInt)gregOfRM(modrm));
-            VG_(panic)("Grp4");
-      }
-      if (dis)
-         VG_(printf)("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
-      return eip + HI8(amode);
-   }
-
-   /* Register operand: get it, modify it, put it back. */
-   t_val = newTemp(cb);
-   uInstr2(cb, GET, 1, ArchReg, eregOfRM(modrm), TempReg, t_val);
-   switch (gregOfRM(modrm)) {
-      case 0: /* INC */
-         uInstr1(cb, INC, 1, TempReg, t_val);
-         setFlagsFromUOpcode(cb, INC);
-         uInstr2(cb, PUT, 1, TempReg, t_val, ArchReg, eregOfRM(modrm));
-         break;
-      case 1: /* DEC */
-         uInstr1(cb, DEC, 1, TempReg, t_val);
-         setFlagsFromUOpcode(cb, DEC);
-         uInstr2(cb, PUT, 1, TempReg, t_val, ArchReg, eregOfRM(modrm));
-         break;
-      default:
-         VG_(printf)(
-            "unhandled Grp4(R) case %d\n", (UInt)gregOfRM(modrm));
-         VG_(panic)("Grp4");
-   }
-   if (dis)
-      VG_(printf)("%sb %s\n", nameGrp4(gregOfRM(modrm)),
-                  nameIReg(1, eregOfRM(modrm)));
-   return eip + 1;
-}
-
-
-/* Group 5 extended opcodes.  On entry eip points at the modR/M byte.
-   The greg field of that byte selects the operation: 0 INC, 1 DEC,
-   2 call Ev, 4 jmp Ev and, for memory operands only, 6 PUSH Ev; any
-   other value prints a message and panics.  INC and DEC write their
-   result back to the operand.  call Ev lowers ESP by 4, stores the
-   address of the following insn there and jumps to the operand's
-   value; jmp Ev just jumps to it.  Both set *isEnd to True; the
-   other cases leave *isEnd untouched.  Returns eip advanced by 1 for
-   a register operand, or by HI8 of the disAMode result for a memory
-   operand. */
-static
-Addr dis_Grp5 ( UCodeBlock* cb, Int sz, Addr eip, Bool* isEnd )
-{
-   Int   t1, t2, t3, t4;
-   UInt  pair;
-   UChar modrm;
-   UChar dis_buf[50];
-   t1 = t2 = t3 = t4 = INVALID_TEMPREG;
-
-   modrm = getUChar(eip);
-   if (epartIsReg(modrm)) {
-      t1 = newTemp(cb);
-      uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-      switch (gregOfRM(modrm)) {
-         case 0: /* INC */
-            uInstr1(cb, INC, sz, TempReg, t1);
-            setFlagsFromUOpcode(cb, INC);
-            uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, eregOfRM(modrm));
-            break;
-         case 1: /* DEC */
-            uInstr1(cb, DEC, sz, TempReg, t1);
-            setFlagsFromUOpcode(cb, DEC);
-            uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, eregOfRM(modrm));
-            break;
-         case 2: /* call Ev */
-            t3 = newTemp(cb); t4 = newTemp(cb);
-            uInstr2(cb, GET,   4, ArchReg, R_ESP, TempReg, t3);
-            uInstr2(cb, SUB,   4, Literal, 0,     TempReg, t3);
-	    uLiteral(cb, 4);
-            uInstr2(cb, PUT,   4, TempReg, t3,    ArchReg, R_ESP);
-            uInstr2(cb, MOV,   4, Literal, 0,     TempReg, t4);
-	    uLiteral(cb, eip+1); /* return address: eip is still at the modR/M byte here */
-            uInstr2(cb, STORE, 4, TempReg, t4,    TempReg, t3);
-            SMC_IF_ALL(cb);
-            uInstr1(cb, JMP,   0, TempReg, t1);
-            uCond(cb, CondAlways);
-            LAST_UINSTR(cb).jmpkind = JmpCall; /* tag the jump just emitted as a call */
-            *isEnd = True;
-            break;
-         case 4: /* jmp Ev */
-            uInstr1(cb, JMP, 0, TempReg, t1);
-            uCond(cb, CondAlways);
-            *isEnd = True;
-            break;
-         default: 
-            VG_(printf)(
-               "unhandled Grp5(R) case %d\n", (UInt)gregOfRM(modrm));
-            VG_(panic)("Grp5");
-      }
-      eip++;
-      if (dis)
-         VG_(printf)("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
-                     nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
-   } else {
-      pair = disAMode ( cb, eip, dis?dis_buf:NULL );
-      t2   = LOW24(pair); /* temp holding the operand's address */
-      t1   = newTemp(cb);
-      uInstr2(cb, LOAD, sz, TempReg, t2, TempReg, t1); /* operand is loaded once, ahead of the dispatch */
-      switch (gregOfRM(modrm)) {
-         case 0: /* INC */ 
-            uInstr1(cb, INC, sz, TempReg, t1);
-            setFlagsFromUOpcode(cb, INC);
-            uInstr2(cb, STORE, sz, TempReg, t1, TempReg, t2);
-            SMC_IF_ALL(cb);
-            break;
-         case 1: /* DEC */
-            uInstr1(cb, DEC, sz, TempReg, t1);
-            setFlagsFromUOpcode(cb, DEC);
-            uInstr2(cb, STORE, sz, TempReg, t1, TempReg, t2);
-            SMC_IF_ALL(cb);
-            break;
-         case 2: /* call Ev */
-            t3 = newTemp(cb); t4 = newTemp(cb);
-            uInstr2(cb, GET,   4, ArchReg, R_ESP, TempReg, t3);
-            uInstr2(cb, SUB,   4, Literal, 0,     TempReg, t3);
-            uLiteral(cb, 4);
-            uInstr2(cb, PUT,   4, TempReg, t3,    ArchReg, R_ESP);
-            uInstr2(cb, MOV,   4, Literal, 0,     TempReg, t4);
-	         uLiteral(cb, eip+HI8(pair)); /* return address: eip has not been advanced yet */
-            uInstr2(cb, STORE, 4, TempReg, t4,    TempReg, t3);
-            SMC_IF_ALL(cb);
-            uInstr1(cb, JMP,   0, TempReg, t1);
-            uCond(cb, CondAlways);
-            LAST_UINSTR(cb).jmpkind = JmpCall; /* tag the jump just emitted as a call */
-            *isEnd = True;
-            break;
-         case 4: /* JMP Ev */
-            uInstr1(cb, JMP, 0, TempReg, t1);
-            uCond(cb, CondAlways);
-            *isEnd = True;
-            break;
-         case 6: /* PUSH Ev */
-            t3 = newTemp(cb);
-            uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t3);
-            uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t3);
-	    uLiteral(cb, sz); /* ESP drops by the operand size */
-            uInstr2(cb, PUT,    4, TempReg, t3,    ArchReg, R_ESP);
-            uInstr2(cb, STORE, sz, TempReg, t1,    TempReg, t3);
-            SMC_IF_ALL(cb);
-            break;
-         default: 
-            VG_(printf)(
-               "unhandled Grp5(M) case %d\n", (UInt)gregOfRM(modrm));
-            VG_(panic)("Grp5");
-      }
-      eip += HI8(pair); /* presumably the length of the addressing-mode bytes */
-      if (dis)
-         VG_(printf)("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
-                     nameISize(sz), dis_buf);
-   }
-   return eip;
-}
-
-
-/* Template for REPE CMPS<sz>.  Emits a single iteration: if ECX is
-   zero, jump to eip_next; otherwise write back ECX-1, compare the
-   sz-sized values addressed by ESI and EDI, step both pointers, and
-   jump back to eip if the comparison left Z set, else to eip_next.
-   Because it always finishes with a jump, this insn is assumed to be
-   the last one in the basic block. */
-static
-void codegen_REPE_CMPS ( UCodeBlock* cb, Int sz, Addr eip, Addr eip_next )
-{
-   Int t_dval = newTemp(cb);   /* (EDI); reused later for the step */
-   Int t_sval = newTemp(cb);   /* (ESI) */
-   Int t_edi  = newTemp(cb);
-   Int t_esi  = newTemp(cb);
-   Int t_ecx  = newTemp(cb);
-
-   /* Count exhausted?  Then leave for the next insn.  Otherwise
-      store the decremented count. */
-   uInstr2(cb, GET,  4, ArchReg, R_ECX, TempReg, t_ecx);
-   uInstr2(cb, JIFZ, 4, TempReg, t_ecx, Literal, 0);
-   uLiteral(cb, eip_next);
-   uInstr1(cb, DEC,  4, TempReg, t_ecx);
-   uInstr2(cb, PUT,  4, TempReg, t_ecx, ArchReg, R_ECX);
-
-   /* Fetch both pointers and the values they address. */
-   uInstr2(cb, GET,  4,  ArchReg, R_EDI, TempReg, t_edi);
-   uInstr2(cb, GET,  4,  ArchReg, R_ESI, TempReg, t_esi);
-   uInstr2(cb, LOAD, sz, TempReg, t_edi, TempReg, t_dval);
-   uInstr2(cb, LOAD, sz, TempReg, t_esi, TempReg, t_sval);
-
-   /* The comparison proper; t_sval is not read again, only the
-      flags are wanted. */
-   uInstr2(cb, SUB, sz, TempReg, t_dval, TempReg, t_sval);
-   setFlagsFromUOpcode(cb, SUB);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_dval. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_dval);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_dval);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_dval);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_dval);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back both pointers. */
-   uInstr2(cb, ADD, 4, TempReg, t_dval, TempReg, t_edi);
-   uInstr2(cb, ADD, 4, TempReg, t_dval, TempReg, t_esi);
-   uInstr2(cb, PUT, 4, TempReg, t_edi,  ArchReg, R_EDI);
-   uInstr2(cb, PUT, 4, TempReg, t_esi,  ArchReg, R_ESI);
-
-   /* Go round again while Z holds, otherwise carry on. */
-   uInstr1(cb, JMP, 0, Literal, 0);
-   uLiteral(cb, eip);
-   uCond(cb, CondZ);
-   uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, JMP, 0, Literal, 0);
-   uLiteral(cb, eip_next);
-   uCond(cb, CondAlways);
-}
-
-
-/* Template for REPNE SCAS<sz>.  Emits a single iteration: if ECX is
-   zero, jump to eip_next; otherwise write back ECX-1, compare the
-   sz-sized accumulator with the value addressed by EDI, step EDI,
-   and jump back to eip if the comparison left Z clear, else to
-   eip_next.  Because it always finishes with a jump, this insn is
-   assumed to be the last one in the basic block. */
-static
-void codegen_REPNE_SCAS ( UCodeBlock* cb, Int sz, Addr eip, Addr eip_next )
-{
-   Int t_acc = newTemp(cb);   /* EAX/AX/AL */
-   Int t_ecx = newTemp(cb);
-   Int t_val = newTemp(cb);   /* (EDI); reused later for the step */
-   Int t_edi = newTemp(cb);
-
-   /* Count exhausted?  Then leave for the next insn.  Otherwise
-      store the decremented count. */
-   uInstr2(cb, GET,  4, ArchReg, R_ECX, TempReg, t_ecx);
-   uInstr2(cb, JIFZ, 4, TempReg, t_ecx, Literal, 0);
-   uLiteral(cb, eip_next);
-   uInstr1(cb, DEC,  4, TempReg, t_ecx);
-   uInstr2(cb, PUT,  4, TempReg, t_ecx, ArchReg, R_ECX);
-
-   /* Compare the accumulator against (EDI).  The SUB overwrites
-      t_acc, which is fine since it is not read again. */
-   uInstr2(cb, GET,  sz, ArchReg, R_EAX, TempReg, t_acc);
-   uInstr2(cb, GET,  4,  ArchReg, R_EDI, TempReg, t_edi);
-   uInstr2(cb, LOAD, sz, TempReg, t_edi, TempReg, t_val);
-   uInstr2(cb, SUB,  sz, TempReg, t_val, TempReg, t_acc);
-   setFlagsFromUOpcode(cb, SUB);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_val. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_val);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_val);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_val);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_val);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back EDI. */
-   uInstr2(cb, ADD, 4, TempReg, t_val, TempReg, t_edi);
-   uInstr2(cb, PUT, 4, TempReg, t_edi, ArchReg, R_EDI);
-
-   /* Go round again while Z is clear, otherwise carry on. */
-   uInstr1(cb, JMP, 0, Literal, 0);
-   uLiteral(cb, eip);
-   uCond(cb, CondNZ);
-   uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, JMP, 0, Literal, 0);
-   uLiteral(cb, eip_next);
-   uCond(cb, CondAlways);
-}
-
-
-/* Template for REPE MOVS<sz>.  Emits a single iteration: if ECX is
-   zero, jump to eip_next; otherwise write back ECX-1, copy one
-   sz-sized value from the address in ESI to the address in EDI, step
-   both pointers, and jump unconditionally back to eip.  Because it
-   always finishes with a jump, this insn is assumed to be the last
-   one in the basic block. */
-static
-void codegen_REPE_MOVS ( UCodeBlock* cb, Int sz, Addr eip, Addr eip_next )
-{
-   Int t_ecx = newTemp(cb);
-   Int t_edi = newTemp(cb);
-   Int t_esi = newTemp(cb);
-   Int t_val = newTemp(cb);   /* value copied; reused later for the step */
-
-   /* Count exhausted?  Then leave for the next insn.  Otherwise
-      store the decremented count. */
-   uInstr2(cb, GET,  4, ArchReg, R_ECX, TempReg, t_ecx);
-   uInstr2(cb, JIFZ, 4, TempReg, t_ecx, Literal, 0);
-   uLiteral(cb, eip_next);
-   uInstr1(cb, DEC,  4, TempReg, t_ecx);
-   uInstr2(cb, PUT,  4, TempReg, t_ecx, ArchReg, R_ECX);
-
-   /* Copy one element from (ESI) to (EDI). */
-   uInstr2(cb, GET,   4,  ArchReg, R_EDI, TempReg, t_edi);
-   uInstr2(cb, GET,   4,  ArchReg, R_ESI, TempReg, t_esi);
-   uInstr2(cb, LOAD,  sz, TempReg, t_esi, TempReg, t_val);
-   uInstr2(cb, STORE, sz, TempReg, t_val, TempReg, t_edi);
-   SMC_IF_SOME(cb);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_val. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_val);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_val);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_val);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_val);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back both pointers. */
-   uInstr2(cb, ADD, 4, TempReg, t_val, TempReg, t_edi);
-   uInstr2(cb, ADD, 4, TempReg, t_val, TempReg, t_esi);
-   uInstr2(cb, PUT, 4, TempReg, t_edi, ArchReg, R_EDI);
-   uInstr2(cb, PUT, 4, TempReg, t_esi, ArchReg, R_ESI);
-
-   /* Back to the start of this insn for the next iteration. */
-   uInstr1(cb, JMP, 0, Literal, 0);
-   uLiteral(cb, eip);
-   uCond(cb, CondAlways);
-}
-
-
-/* Template for REPE STOS<sz>.  Emits a single iteration: if ECX is
-   zero, jump to eip_next; otherwise write back ECX-1, store the
-   sz-sized accumulator at the address in EDI, step EDI, and jump
-   unconditionally back to eip.  Because it always finishes with a
-   jump, this insn is assumed to be the last one in the basic
-   block. */
-static
-void codegen_REPE_STOS ( UCodeBlock* cb, Int sz, Addr eip, Addr eip_next )
-{
-   Int t_acc = newTemp(cb);   /* EAX/AX/AL; reused later for the step */
-   Int t_ecx = newTemp(cb);
-   Int t_edi = newTemp(cb);
-
-   /* Count exhausted?  Then leave for the next insn.  Otherwise
-      store the decremented count. */
-   uInstr2(cb, GET,  4, ArchReg, R_ECX, TempReg, t_ecx);
-   uInstr2(cb, JIFZ, 4, TempReg, t_ecx, Literal, 0);
-   uLiteral(cb, eip_next);
-   uInstr1(cb, DEC,  4, TempReg, t_ecx);
-   uInstr2(cb, PUT,  4, TempReg, t_ecx, ArchReg, R_ECX);
-
-   /* Store the accumulator at (EDI). */
-   uInstr2(cb, GET,   sz, ArchReg, R_EAX, TempReg, t_acc);
-   uInstr2(cb, GET,   4,  ArchReg, R_EDI, TempReg, t_edi);
-   uInstr2(cb, STORE, sz, TempReg, t_acc, TempReg, t_edi);
-   SMC_IF_SOME(cb);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_acc. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_acc);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_acc);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_acc);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_acc);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back EDI. */
-   uInstr2(cb, ADD, 4, TempReg, t_acc, TempReg, t_edi);
-   uInstr2(cb, PUT, 4, TempReg, t_edi, ArchReg, R_EDI);
-
-   /* Back to the start of this insn for the next iteration. */
-   uInstr1(cb, JMP, 0, Literal, 0);
-   uLiteral(cb, eip);
-   uCond(cb, CondAlways);
-}
-
-
-/* Template for CMPS<sz>, _not_ preceded by a REP prefix.  Compares
-   the sz-sized values addressed by ESI and EDI, setting the flags
-   from the subtraction, then steps both pointers by the scaled value
-   obtained from helper_get_dirflag. */
-static
-void codegen_CMPS ( UCodeBlock* cb, Int sz )
-{
-   Int t_dval = newTemp(cb);   /* (EDI); reused later for the step */
-   Int t_sval = newTemp(cb);   /* (ESI) */
-   Int t_edi  = newTemp(cb);
-   Int t_esi  = newTemp(cb);
-
-   /* Fetch both pointers and the values they address. */
-   uInstr2(cb, GET,  4,  ArchReg, R_EDI, TempReg, t_edi);
-   uInstr2(cb, GET,  4,  ArchReg, R_ESI, TempReg, t_esi);
-   uInstr2(cb, LOAD, sz, TempReg, t_edi, TempReg, t_dval);
-   uInstr2(cb, LOAD, sz, TempReg, t_esi, TempReg, t_sval);
-
-   /* The comparison proper; t_sval is not read again, only the
-      flags are wanted. */
-   uInstr2(cb, SUB, sz, TempReg, t_dval, TempReg, t_sval);
-   setFlagsFromUOpcode(cb, SUB);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_dval. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_dval);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_dval);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_dval);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_dval);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back both pointers. */
-   uInstr2(cb, ADD, 4, TempReg, t_dval, TempReg, t_edi);
-   uInstr2(cb, ADD, 4, TempReg, t_dval, TempReg, t_esi);
-   uInstr2(cb, PUT, 4, TempReg, t_edi,  ArchReg, R_EDI);
-   uInstr2(cb, PUT, 4, TempReg, t_esi,  ArchReg, R_ESI);
-}
-
-
-/* Template for MOVS<sz>, _not_ preceded by a REP prefix.  Copies one
-   sz-sized value from the address in ESI to the address in EDI, then
-   steps both pointers by the scaled value obtained from
-   helper_get_dirflag. */
-static
-void codegen_MOVS ( UCodeBlock* cb, Int sz )
-{
-   Int t_val = newTemp(cb);   /* value copied; reused later for the step */
-   Int t_edi = newTemp(cb);
-   Int t_esi = newTemp(cb);
-
-   /* Copy one element from (ESI) to (EDI). */
-   uInstr2(cb, GET,   4,  ArchReg, R_EDI, TempReg, t_edi);
-   uInstr2(cb, GET,   4,  ArchReg, R_ESI, TempReg, t_esi);
-   uInstr2(cb, LOAD,  sz, TempReg, t_esi, TempReg, t_val);
-   uInstr2(cb, STORE, sz, TempReg, t_val, TempReg, t_edi);
-   SMC_IF_SOME(cb);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_val. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_val);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_val);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_val);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_val);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back both pointers. */
-   uInstr2(cb, ADD, 4, TempReg, t_val, TempReg, t_edi);
-   uInstr2(cb, ADD, 4, TempReg, t_val, TempReg, t_esi);
-   uInstr2(cb, PUT, 4, TempReg, t_edi, ArchReg, R_EDI);
-   uInstr2(cb, PUT, 4, TempReg, t_esi, ArchReg, R_ESI);
-}
-
-
-/* Template for STOS<sz>, _not_ preceded by a REP prefix.  Stores the
-   sz-sized accumulator at the address in EDI, then steps EDI by the
-   scaled value obtained from helper_get_dirflag. */
-static
-void codegen_STOS ( UCodeBlock* cb, Int sz )
-{
-   Int t_acc = newTemp(cb);   /* EAX/AX/AL; reused later for the step */
-   Int t_edi = newTemp(cb);
-
-   /* Store the accumulator at (EDI). */
-   uInstr2(cb, GET,   sz, ArchReg, R_EAX, TempReg, t_acc);
-   uInstr2(cb, GET,   4,  ArchReg, R_EDI, TempReg, t_edi);
-   uInstr2(cb, STORE, sz, TempReg, t_acc, TempReg, t_edi);
-   SMC_IF_SOME(cb);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_acc. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_acc);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_acc);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_acc);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_acc);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back EDI. */
-   uInstr2(cb, ADD, 4, TempReg, t_acc, TempReg, t_edi);
-   uInstr2(cb, PUT, 4, TempReg, t_edi, ArchReg, R_EDI);
-}
-
-
-/* Template for LODS<sz>, _not_ preceded by a REP prefix.  Loads the
-   sz-sized value at the address in ESI into the accumulator, then
-   steps ESI by the scaled value obtained from helper_get_dirflag. */
-static
-void codegen_LODS ( UCodeBlock* cb, Int sz )
-{
-   Int t_acc = newTemp(cb);   /* value loaded; reused later for the step */
-   Int t_esi = newTemp(cb);
-
-   /* Load (ESI) into EAX/AX/AL. */
-   uInstr2(cb, GET,  4,  ArchReg, R_ESI, TempReg, t_esi);
-   uInstr2(cb, LOAD, sz, TempReg, t_esi, TempReg, t_acc);
-   uInstr2(cb, PUT,  sz, TempReg, t_acc, ArchReg, R_EAX);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_acc. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_acc);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_acc);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_acc);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_acc);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back ESI. */
-   uInstr2(cb, ADD, 4, TempReg, t_acc, TempReg, t_esi);
-   uInstr2(cb, PUT, 4, TempReg, t_esi, ArchReg, R_ESI);
-}
-
-
-/* Template for SCAS<sz>, _not_ preceded by a REP prefix.  Compares
-   the sz-sized accumulator with the value addressed by EDI, setting
-   the flags from the subtraction, then steps EDI by the scaled value
-   obtained from helper_get_dirflag. */
-static
-void codegen_SCAS ( UCodeBlock* cb, Int sz )
-{
-   Int t_acc = newTemp(cb);   /* EAX/AX/AL */
-   Int t_val = newTemp(cb);   /* (EDI); reused later for the step */
-   Int t_edi = newTemp(cb);
-
-   /* Compare the accumulator against (EDI).  The SUB overwrites
-      t_acc, which is fine since it is not read again. */
-   uInstr2(cb, GET,  sz, ArchReg, R_EAX, TempReg, t_acc);
-   uInstr2(cb, GET,  4,  ArchReg, R_EDI, TempReg, t_edi);
-   uInstr2(cb, LOAD, sz, TempReg, t_edi, TempReg, t_val);
-   uInstr2(cb, SUB,  sz, TempReg, t_val, TempReg, t_acc);
-   setFlagsFromUOpcode(cb, SUB);
-
-   /* Ask helper_get_dirflag for the pointer step: push a zero word,
-      call, pop the answer into t_val. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_val);
-   uLiteral(cb, 0);
-   uInstr1(cb, PUSH, 4, TempReg, t_val);
-   uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_get_dirflag));
-   uFlagsRWU(cb, FlagD, FlagsEmpty, FlagsEmpty);
-   uInstr1(cb, POP,  4, TempReg, t_val);
-   uInstr0(cb, CALLM_E, 0);
-
-   /* Scale the step to the operand size (shift by 1 or 2). */
-   if (sz == 2 || sz == 4) {
-      uInstr2(cb, SHL, 4, Literal, 0, TempReg, t_val);
-      uLiteral(cb, sz/2);
-   }
-
-   /* Advance and write back EDI. */
-   uInstr2(cb, ADD, 4, TempReg, t_val, TempReg, t_edi);
-   uInstr2(cb, PUT, 4, TempReg, t_edi, ArchReg, R_EDI);
-}
-
-
-/* (I)MUL E, G.  Supplied eip0 points to the modR/M byte.  The E
-   operand (a register, or memory addressed via disAMode) and the G
-   register are pushed, the helper matching size and signed_multiply
-   is called, and one size-sized result is popped and written back to
-   the G register.  A size other than 1, 2 or 4 panics.  Returns
-   eip0 + 1 for a register E operand, or eip0 plus HI8 of the
-   disAMode result for a memory one. */
-static
-Addr dis_mul_E_G ( UCodeBlock* cb,
-                   Int         size,
-                   Addr        eip0,
-                   Bool        signed_multiply )
-{
-   UChar dis_buf[50];
-   UInt  amode;
-   UInt  len;
-   Int   helper;
-   UChar rm     = getUChar(eip0);
-   Int   t_addr = INVALID_TEMPREG;
-   Int   t_e    = newTemp(cb);
-   Int   t_g    = newTemp(cb);
-
-   if (size == 4)
-      helper = signed_multiply ? VGOFF_(helper_imul_32_64)
-                               : VGOFF_(helper_mul_32_64);
-   else if (size == 2)
-      helper = signed_multiply ? VGOFF_(helper_imul_16_32)
-                               : VGOFF_(helper_mul_16_32);
-   else if (size == 1)
-      helper = signed_multiply ? VGOFF_(helper_imul_8_16)
-                               : VGOFF_(helper_mul_8_16);
-   else
-      VG_(panic)("dis_mul_E_G");
-
-   uInstr0(cb, CALLM_S, 0);
-
-   /* Bring the E operand into t_e; this is the only part that
-      differs between the register and memory forms. */
-   if (epartIsReg(rm)) {
-      uInstr2(cb, GET,  size, ArchReg, eregOfRM(rm), TempReg, t_e);
-      len = 1;
-   } else {
-      amode  = disAMode ( cb, eip0, dis?dis_buf:NULL);
-      t_addr = LOW24(amode);
-      uInstr2(cb, LOAD, size, TempReg, t_addr, TempReg, t_e);
-      len = HI8(amode);
-   }
-
-   /* Push E then G, call the helper, and keep one result for G. */
-   uInstr2(cb, GET,   size, ArchReg, gregOfRM(rm), TempReg, t_g);
-   uInstr1(cb, PUSH,  size, TempReg, t_e);
-   uInstr1(cb, PUSH,  size, TempReg, t_g);
-   uInstr1(cb, CALLM, 0,    Lit16,   helper);
-   uFlagsRWU(cb, FlagsEmpty, FlagsOC, FlagsSZAP);
-   uInstr1(cb, CLEAR, 0,    Lit16,   4);
-   uInstr1(cb, POP,   size, TempReg, t_g);
-   uInstr2(cb, PUT,   size, TempReg, t_g, ArchReg, gregOfRM(rm));
-   uInstr0(cb, CALLM_E, 0);
-
-   if (dis) {
-      if (epartIsReg(rm))
-         VG_(printf)("%smul%c %s, %s\n",
-                     signed_multiply ? "i" : "",
-                     nameISize(size),
-                     nameIReg(size,eregOfRM(rm)),
-                     nameIReg(size,gregOfRM(rm)));
-      else
-         VG_(printf)("%smul%c %s, %s\n",
-                     signed_multiply ? "i" : "",
-                     nameISize(size),
-                     dis_buf,nameIReg(size,gregOfRM(rm)));
-   }
-
-   return len + eip0;
-}
-
-
-/* IMUL I * E -> G.  Supplied eip points to the modR/M byte.  The E
-   operand (a register, or memory addressed via disAMode) is pushed,
-   followed by the litsize-sized signed immediate that comes after
-   the operand bytes.  The signed helper matching size is called and
-   one size-sized result is popped and written to the G register.  A
-   size other than 1, 2 or 4 panics.  Returns the address just past
-   the immediate. */
-static
-Addr dis_imul_I_E_G ( UCodeBlock* cb,
-                      Int         size,
-                      Addr        eip,
-                      Int         litsize )
-{
-   UChar dis_buf[50];
-   UInt  amode;
-   Int   helper, imm;
-   UChar rm     = getUChar(eip);
-   Int   t_addr = INVALID_TEMPREG;
-   Int   t_e    = newTemp(cb);
-   Int   t_imm  = newTemp(cb);
-
-   if (size == 4)
-      helper = VGOFF_(helper_imul_32_64);
-   else if (size == 2)
-      helper = VGOFF_(helper_imul_16_32);
-   else if (size == 1)
-      helper = VGOFF_(helper_imul_8_16);
-   else
-      VG_(panic)("dis_imul_I_E_G");
-
-   uInstr0(cb, CALLM_S, 0);
-
-   /* Bring the E operand into t_e and step eip past its encoding. */
-   if (epartIsReg(rm)) {
-      uInstr2(cb, GET,  size, ArchReg, eregOfRM(rm), TempReg, t_e);
-      eip++;
-   } else {
-      amode  = disAMode ( cb, eip, dis?dis_buf:NULL);
-      t_addr = LOW24(amode);
-      uInstr2(cb, LOAD, size, TempReg, t_addr, TempReg, t_e);
-      eip += HI8(amode);
-   }
-   uInstr1(cb, PUSH, size, TempReg, t_e);
-
-   /* The immediate sits right after the operand bytes. */
-   imm  = getSDisp(litsize,eip);
-   eip += litsize;
-
-   /* Push the immediate, call the helper, keep one result for G. */
-   uInstr2(cb, MOV,   size, Literal, 0,   TempReg, t_imm);
-   uLiteral(cb, imm);
-   uInstr1(cb, PUSH,  size, TempReg, t_imm);
-   uInstr1(cb, CALLM, 0,    Lit16, helper);
-   uFlagsRWU(cb, FlagsEmpty, FlagsOC, FlagsSZAP);
-   uInstr1(cb, CLEAR, 0,    Lit16,   4);
-   uInstr1(cb, POP,   size, TempReg, t_e);
-   uInstr2(cb, PUT,   size, TempReg, t_e, ArchReg, gregOfRM(rm));
-   uInstr0(cb, CALLM_E, 0);
-
-   if (dis) {
-      if (epartIsReg(rm))
-         VG_(printf)("imul %d, %s, %s\n", imm,
-                     nameIReg(size,eregOfRM(rm)),
-                     nameIReg(size,gregOfRM(rm)));
-      else
-         VG_(printf)("imul %d, %s, %s\n", imm, dis_buf,
-                     nameIReg(size,gregOfRM(rm)));
-   }
-
-   return eip;
-}
-
-
-/* Translate an FPU insn that has a memory operand.  On entry eip
-   points at the second byte of the insn (the one following the
-   D8 .. DF byte).  size is the number of bytes transferred and
-   is_write selects FPU_W rather than FPU_R.  Returns the address of
-   the byte following the insn's addressing mode. */
-static 
-Addr dis_fpu_mem ( UCodeBlock* cb, Int size, Bool is_write, 
-                   Addr eip, UChar first_byte )
-{
-   UChar dis_buf[50];
-   UChar subop;
-   UInt  amode;
-   Int   t_addr;
-
-   subop = getUChar(eip);
-   vg_assert(subop < 0xC0);
-   /* Only bits 3..5 of the second byte are kept in the emitted
-      literal; the address itself is passed in t_addr. */
-   subop = subop & 0x38;
-
-   amode  = disAMode ( cb, eip, dis?dis_buf:NULL );
-   t_addr = LOW24(amode);
-   eip    = eip + HI8(amode);
-
-   if (is_write) {
-      uInstr2(cb, FPU_W, size,
-                  Lit16, 
-                  (((UShort)first_byte) << 8) | ((UShort)subop),
-                  TempReg, t_addr);
-      SMC_IF_ALL(cb);
-      if (dis)
-         VG_(printf)("fpu_w_%d 0x%x:0x%x, %s\n",
-                     size, (UInt)first_byte, (UInt)subop, dis_buf );
-   } else {
-      uInstr2(cb, FPU_R, size,
-                  Lit16, 
-                  (((UShort)first_byte) << 8) | ((UShort)subop),
-                  TempReg, t_addr);
-      if (dis)
-         VG_(printf)("fpu_r_%d %s, 0x%x:0x%x\n",
-                     size, dis_buf, (UInt)first_byte, (UInt)subop );
-   }
-
-   return eip;
-}
-
-
-/* Translate an FPU insn that does not reference memory.  On entry
-   eip points at the second byte of the insn (the one following the
-   D8 .. DF byte); the returned address is one past that byte.  The
-   insn is emitted as a single FPU uinstr, annotated with the Z, C
-   and P flags it reads or writes, if any. */
-static 
-Addr dis_fpu_no_mem ( UCodeBlock* cb, Addr eip, UChar first_byte )
-{
-   UChar second_byte = getUChar(eip);
-   Bool  is_DA, is_DB, is_DF;
-   Bool  sets_ZCP, uses_ZCP;
-
-   eip++;
-   vg_assert(second_byte >= 0xC0);
-
-   is_DA = first_byte == 0xDA ? True : False;
-   is_DB = first_byte == 0xDB ? True : False;
-   is_DF = first_byte == 0xDF ? True : False;
-
-   /* Insns which write the integer condition codes (%EFLAGS):
-         DB F0..F7  FCOMI        DF F0..F7  FCOMIP
-         DB E8..EF  FUCOMI       DF E8..EF  FUCOMIP
-      The two second-byte ranges are adjacent, so E8..F7 covers
-      both. */
-   sets_ZCP = ((is_DB || is_DF)
-               && second_byte >= 0xE8 && second_byte <= 0xF7)
-              ? True : False;
-
-   /* Insns which read the integer condition codes (%EFLAGS):
-         DA C0..DF  FCMOVB,  FCMOVE,  FCMOVBE,  FCMOVU   %st(n), %st(0)
-         DB C0..DF  FCMOVNB, FCMOVNE, FCMOVNBE, FCMOVNU  %st(n), %st(0)
-   */
-   uses_ZCP = ((is_DA || is_DB)
-               && second_byte >= 0xC0 && second_byte <= 0xDF)
-              ? True : False;
-
-   uInstr1(cb, FPU, 0,
-               Lit16,
-               (((UShort)first_byte) << 8) | ((UShort)second_byte)
-          );
-
-   /* Attach the flag annotation to the FPU uinstr just emitted.  An
-      insn is never expected to both read and write the flags. */
-   if (uses_ZCP) {
-      uFlagsRWU(cb, FlagsZCP, FlagsEmpty, FlagsEmpty);
-      vg_assert(!sets_ZCP);
-   }
-   if (sets_ZCP) {
-      uFlagsRWU(cb, FlagsEmpty, FlagsZCP, FlagsEmpty);
-      vg_assert(!uses_ZCP);
-   }
-
-   if (dis) {
-      VG_(printf)("fpu 0x%x:0x%x%s%s\n",
-                  (UInt)first_byte, (UInt)second_byte,
-                  uses_ZCP ? " -rZCP" : "",
-                  sets_ZCP ? " -wZCP" : "" );
-   }
-   return eip;
-}
-
-
-/* Top-level decoder for FPU insns.  On entry eip points at the second
-   byte of the insn.  FSTSW %ax is translated here; insns whose second
-   byte is >= 0xC0 go to dis_fpu_no_mem; everything else references
-   memory and is passed to dis_fpu_mem together with its transfer size
-   and direction.  Unrecognised memory forms cause a panic. */
-static
-Addr dis_fpu ( UCodeBlock* cb, UChar first_byte, Addr eip )
-{
-   UChar second_byte = getUChar(eip);
-   UInt  subop       = (second_byte >> 3) & 7;
-   Int   size        = 0;      /* stays 0 if the insn is not recognised */
-   Bool  is_write    = False;
-
-   /* Handle FSTSW %ax specially. */
-   if (first_byte == 0xDF && second_byte == 0xE0) {
-      Int t_sw = newTemp(cb);
-      uInstr0(cb, CALLM_S, 0);
-      uInstr2(cb, MOV,   4, Literal, 0,  TempReg, t_sw);
-      uLiteral(cb, 0);
-      uInstr1(cb, PUSH,  4, TempReg, t_sw);
-      uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_fstsw_AX) );
-      uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
-      uInstr1(cb, POP,   2,  TempReg, t_sw);
-      uInstr2(cb, PUT,   2,  TempReg, t_sw, ArchReg, R_EAX);
-      uInstr0(cb, CALLM_E, 0);
-      if (dis) VG_(printf)("fstsw %%ax\n");
-      return eip + 1;
-   }
-
-   /* Handle all non-memory FPU ops simply. */
-   if (second_byte >= 0xC0)
-      return dis_fpu_no_mem ( cb, eip, first_byte );
-
-   /* The insn references memory.  Work out the transfer size and
-      whether it is a read (the default) or a write, keyed on the
-      first byte and on bits 3..5 of the second byte. */
-   switch (first_byte) {
-
-      case 0xD8: /* FADDs FMULs FCOMs FCOMPs FSUBs FSUBRs FDIVs FDIVRs */
-      case 0xDA: /* FIADD FIMUL FICOM FICOMP FISUB FISUBR FIDIV FIDIVR */
-         size = 4;
-         break;
-
-      case 0xDC: /* FADD FMUL FCOM FCOMP FSUB FSUBR FDIV FDIVR,
-                    double-real */
-         size = 8;
-         break;
-
-      case 0xD9:
-         switch (subop) {
-            case 0: /* FLDs */
-               size = 4;  break;
-            case 2: /* FSTs */
-            case 3: /* FSTPs */
-               size = 4;  is_write = True; break;
-            case 4: /* FLDENV */
-               size = 28; break;
-            case 5: /* FLDCW */
-               size = 2;  break;
-            case 6: /* FNSTENV */
-               size = 28; is_write = True; break;
-            case 7: /* FSTCW.  An earlier hack pretended this wrote 4
-                       bytes, to quieten complaints in glibc's
-                       __floor(); it is disabled, and the real size
-                       of 2 is used. */
-               size = 2;  is_write = True; break;
-            default:
-               break;
-         }
-         break;
-
-      case 0xDB:
-         switch (subop) {
-            case 0: /* FILD dword-integer */
-               size = 4;  break;
-            case 2: /* FIST dword-integer */
-            case 3: /* FISTPl */
-               size = 4;  is_write = True; break;
-            case 5: /* FLD extended-real */
-               size = 10; break;
-            case 7: /* FSTP extended-real */
-               size = 10; is_write = True; break;
-            default:
-               break;
-         }
-         break;
-
-      case 0xDD:
-         switch (subop) {
-            case 0: /* FLD double-real */
-               size = 8; break;
-            case 2: /* FST double-real */
-            case 3: /* FSTP double-real */
-               size = 8; is_write = True; break;
-            default:
-               break;
-         }
-         break;
-
-      case 0xDF:
-         switch (subop) {
-            case 0: /* FILD word-integer */
-               size = 2; break;
-            case 2: /* FIST word-integer */
-            case 3: /* FISTP word-integer */
-               size = 2; is_write = True; break;
-            case 5: /* FILD qword-integer */
-               size = 8; break;
-            case 7: /* FISTP qword-integer */
-               size = 8; is_write = True; break;
-            default:
-               break;
-         }
-         break;
-
-      default:
-         break;
-   }
-
-   if (size != 0)
-      return dis_fpu_mem(cb, size, is_write, eip, first_byte);
-
-   /* Not recognised. */
-   VG_(printf)("dis_fpu: unhandled memory case 0x%2x:0x%2x(%d)\n",
-               (UInt)first_byte, (UInt)second_byte, subop );
-   VG_(panic)("dis_fpu: unhandled opcodes");
-}
-
-
-/* Double length shifts: SHLD when left_shift is True, SHRD when it is
-   False.  Apparently only required in v-size (no b- variant).
-
-   eip on entry points at the modrm byte, whose value is also passed
-   in modrm.  amt_tag and amt_val denote the shift amount: either
-   ArchReg(%CL) or a Literal.  For a Literal, the immediate byte is
-   assumed to follow the E operand and is stepped over here.  Returns
-   the address following the insn. */
-static
-Addr dis_SHLRD_Gv_Ev ( UCodeBlock* cb, Addr eip, UChar modrm,
-                       Int sz, 
-                       Tag amt_tag, UInt amt_val,
-                       Bool left_shift )
-{
-   Int   t, t1, t2, ta, helper;
-   UInt  pair;
-   UChar dis_buf[50];
-
-   vg_assert(sz == 2 || sz == 4);
-
-   helper = left_shift 
-               ? (sz==4 ? VGOFF_(helper_shldl) 
-                        : VGOFF_(helper_shldw))
-               : (sz==4 ? VGOFF_(helper_shrdl) 
-                        : VGOFF_(helper_shrdw));
-
-   /* Get the amount to be shifted by onto the stack. */
-   t = newTemp(cb);
-   t1 = newTemp(cb);
-   t2 = newTemp(cb);
-   if (amt_tag == ArchReg) {
-      vg_assert(amt_val == R_CL);
-      uInstr2(cb, GET, 1, ArchReg, amt_val, TempReg, t);
-   } else {
-      uInstr2(cb, MOV, 1, Literal, 0, TempReg, t);
-      uLiteral(cb, amt_val);
-   }
-
-   uInstr0(cb, CALLM_S, 0);
-   uInstr1(cb, PUSH, 1, TempReg, t);
-
-   /* The E-part is the destination; this is shifted.  The G-part
-      supplies bits to be shifted into the E-part, but is not
-      changed. */
-
-   uInstr2(cb, GET,  sz, ArchReg, gregOfRM(modrm), TempReg, t1);
-   uInstr1(cb, PUSH, sz, TempReg, t1);
-
-   if (epartIsReg(modrm)) {
-      eip++;
-      uInstr2(cb, GET,   sz, ArchReg, eregOfRM(modrm), TempReg, t2);
-      uInstr1(cb, PUSH,  sz, TempReg, t2);
-      uInstr1(cb, CALLM, 0,  Lit16,   helper);
-      uFlagsRWU(cb, FlagsEmpty, FlagsOSZACP, FlagsEmpty);
-      uInstr1(cb, POP,   sz, TempReg, t);
-      uInstr2(cb, PUT,   sz, TempReg, t, ArchReg, eregOfRM(modrm));
-      if (dis) {
-         /* Print the real mnemonic (shld/shrd) and the real shift
-            amount operand, rather than always "shld ... %cl". */
-         VG_(printf)("sh%cd%c ", left_shift ? 'l' : 'r', nameISize(sz));
-         if (amt_tag == ArchReg)
-            VG_(printf)("%%cl");
-         else
-            VG_(printf)("$%d", amt_val);
-         VG_(printf)(", %s, %s\n",
-                     nameIReg(sz, gregOfRM(modrm)), 
-                     nameIReg(sz, eregOfRM(modrm)));
-      }
-   } else {
-      pair = disAMode ( cb, eip, dis?dis_buf:NULL );
-      ta   = LOW24(pair);
-      eip  += HI8(pair);
-      uInstr2(cb, LOAD,  sz, TempReg, ta,     TempReg, t2);
-      uInstr1(cb, PUSH,  sz, TempReg, t2);
-      uInstr1(cb, CALLM, 0,  Lit16,   helper);
-      uFlagsRWU(cb, FlagsEmpty, FlagsOSZACP, FlagsEmpty);
-      uInstr1(cb, POP,   sz, TempReg, t);
-      uInstr2(cb, STORE, sz, TempReg, t,      TempReg, ta);
-      SMC_IF_ALL(cb);
-      if (dis) {
-         /* As above: real mnemonic and real shift amount operand. */
-         VG_(printf)("sh%cd%c ", left_shift ? 'l' : 'r', nameISize(sz));
-         if (amt_tag == ArchReg)
-            VG_(printf)("%%cl");
-         else
-            VG_(printf)("$%d", amt_val);
-         VG_(printf)(", %s, %s\n",
-                     nameIReg(sz, gregOfRM(modrm)), 
-                     dis_buf);
-      }
-   }
-  
-   /* Step over the immediate shift-amount byte, if there is one. */
-   if (amt_tag == Literal) eip++;
-   /* Drop the two values still pushed for the helper call. */
-   uInstr1(cb, CLEAR, 0, Lit16, 8);
-
-   uInstr0(cb, CALLM_E, 0);
-   return eip;
-}
-
-
-/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
-   required. */
-
-typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
-
-/* Return the mnemonic suffix for a bit-test operation: "" for plain
-   BT, and "s", "r" or "c" for BTS, BTR and BTC.  Panics on any other
-   value. */
-static Char* nameBtOp ( BtOp op )
-{
-   if (op == BtOpNone)  return "";
-   if (op == BtOpSet)   return "s";
-   if (op == BtOpReset) return "r";
-   if (op == BtOpComp)  return "c";
-   VG_(panic)("nameBtOp");
-}
-
-
-/* Translate BT/BTS/BTR/BTC G, E.  On entry eip points at the modrm
-   byte; op selects which of the four insns this is (BtOpNone is plain
-   BT).  Only sz == 4 is accepted.  Returns the address following the
-   insn.
-
-   The bit is always accessed through a byte in memory.  When E is a
-   register, its value is first stored on the client's stack, operated
-   on there, and then loaded back into the register. */
-static
-Addr dis_bt_G_E ( UCodeBlock* cb, Int sz, Addr eip, BtOp op )
-{
-   UInt  pair;
-   UChar dis_buf[50];
-   UChar modrm;
-
-   Int t_addr, t_bitno, t_mask, t_fetched, t_esp, temp, lit;
-
-   /* 2 and 4 are actually possible. */
-   vg_assert(sz == 2 || sz == 4);
-   /* We only handle 4. */
-   vg_assert(sz == 4);
-
-   t_addr = t_bitno = t_mask 
-          = t_fetched = t_esp = temp = INVALID_TEMPREG;
-
-   t_fetched = newTemp(cb);
-   t_bitno   = newTemp(cb);
-   temp      = newTemp(cb);
-   lit       = newTemp(cb);
-
-   modrm = getUChar(eip);
-
-   /* The G register supplies the bit number. */
-   uInstr2(cb, GET,  sz, ArchReg, gregOfRM(modrm), TempReg, t_bitno);
-
-   if (epartIsReg(modrm)) {
-      eip++;
-      /* Get it onto the client's stack. */
-      t_esp = newTemp(cb);
-      t_addr = newTemp(cb);
-      uInstr2(cb, GET,   4, ArchReg,  R_ESP, TempReg, t_esp);
-      uInstr2(cb, SUB,  sz, Literal,  0,     TempReg, t_esp);
-      uLiteral(cb, sz);
-      uInstr2(cb, PUT,   4, TempReg,  t_esp, ArchReg, R_ESP);
-      uInstr2(cb, GET,   sz, ArchReg, eregOfRM(modrm), TempReg, temp);
-      uInstr2(cb, STORE, sz, TempReg, temp, TempReg, t_esp);
-      /* Make t_addr point at it. */
-      uInstr2(cb, MOV,   4,  TempReg, t_esp, TempReg, t_addr);
-      /* Mask out upper bits of the shift amount, since we're doing a
-         reg. */
-      uInstr2(cb, MOV, 4, Literal, 0, TempReg, lit);
-      uLiteral(cb, sz == 4 ? 31 : 15);
-      uInstr2(cb, AND, 4, TempReg, lit, TempReg, t_bitno);
-   } else {
-      pair   = disAMode ( cb, eip, dis?dis_buf:NULL );
-      t_addr = LOW24(pair);
-      eip   += HI8(pair);
-   }
-  
-   /* At this point: t_addr points to the address being operated on.
-      If it was a reg, we will have pushed it onto the client's stack.
-      t_bitno is the bit number, suitably masked in the case of a
-      reg.  */
-   
-   /* Now the main sequence. */
-
-   /* Add the byte offset (bit number, arithmetically shifted right
-      by 3) to the base address. */
-   uInstr2(cb, MOV, 4, TempReg, t_bitno, TempReg, temp);
-   uInstr2(cb, SAR, 4, Literal, 0, TempReg, temp);
-   uLiteral(cb, 3);
-   uInstr2(cb, ADD, 4, TempReg, temp, TempReg, t_addr);
-   /* t_addr now holds effective address */
-
-   uInstr2(cb, MOV, 4, Literal, 0, TempReg, lit);
-   uLiteral(cb, 7);
-   uInstr2(cb, AND, 4, TempReg, lit, TempReg, t_bitno);
-   /* t_bitno contains offset of bit within byte */
-
-   /* A mask is only needed by the variants which modify the bit. */
-   if (op != BtOpNone) {
-      t_mask = newTemp(cb);
-      uInstr2(cb, MOV, 4, Literal, 0, TempReg, t_mask);
-      uLiteral(cb, 1);
-      uInstr2(cb, SHL, 4, TempReg, t_bitno, TempReg, t_mask);
-   }
-   /* t_mask is now a suitable byte mask */
-
-   /* Fetch the byte holding the bit; for BTS/BTC/BTR also write the
-      modified byte back.  t_fetched keeps the original byte. */
-   uInstr2(cb, LOAD, 1, TempReg, t_addr, TempReg, t_fetched);
-   if (op != BtOpNone) {
-      uInstr2(cb, MOV, 4, TempReg, t_fetched, TempReg, temp);
-      switch (op) {
-         case BtOpSet: 
-            uInstr2(cb, OR, 4, TempReg, t_mask, TempReg, temp); 
-            break;
-         case BtOpComp: 
-            uInstr2(cb, XOR, 4, TempReg, t_mask, TempReg, temp); 
-            break;
-         case BtOpReset: 
-            uInstr1(cb, NOT, 4, TempReg, t_mask);
-            uInstr2(cb, AND, 4, TempReg, t_mask, TempReg, temp); 
-            break;
-         default: 
-            VG_(panic)("dis_bt_G_E");
-      }
-      uInstr2(cb, STORE, 1, TempReg, temp, TempReg, t_addr);
-   }
-
-   /* Side effect done; now get selected bit into Carry flag */
-
-   uInstr2(cb, SHR, 4, TempReg, t_bitno, TempReg, t_fetched);
-   /* at bit 0 of fetched */
-
-   uInstr2(cb, MOV, 4, Literal, 0, TempReg, lit);
-   uLiteral(cb, 1);
-   uInstr2(cb, AND, 4, TempReg, lit, TempReg, t_fetched);
-   /* fetched is now 1 or 0 */
-
-   /* NEG is a handy way to convert zero/nonzero into the carry
-      flag. */
-   uInstr1(cb, NEG, 4, TempReg, t_fetched);
-   setFlagsFromUOpcode(cb, NEG);
-   /* fetched is now in carry flag */
-
-   /* Move reg operand from stack back to reg */
-   if (epartIsReg(modrm)) {
-      /* t_esp still points at it. */
-      uInstr2(cb, LOAD, sz, TempReg, t_esp, TempReg, temp);
-      uInstr2(cb, PUT,  sz, TempReg, temp, ArchReg, eregOfRM(modrm));
-      /* Restore the client's stack pointer. */
-      uInstr2(cb, ADD,  sz, Literal, 0, TempReg, t_esp);
-      uLiteral(cb, sz);
-      uInstr2(cb, PUT,  4,  TempReg, t_esp, ArchReg, R_ESP);
-   }
-
-   /* Debug printing only; no ucode is generated below this point. */
-   if (epartIsReg(modrm)) {
-      if (dis)
-         VG_(printf)("bt%s%c %s, %s\n",
-                     nameBtOp(op),
-                     nameISize(sz), nameIReg(sz, gregOfRM(modrm)), 
-                     nameIReg(sz, eregOfRM(modrm)));
-   } else {
-      if (dis)
-         VG_(printf)("bt%s%c %s, %s\n",
-                     nameBtOp(op),
-                     nameISize(sz), nameIReg(sz, gregOfRM(modrm)), 
-                     dis_buf);
-   }
- 
-   return eip;
-}
-
-
-
-
-/* Translate BSF (fwds == True) or BSR (fwds == False), E -> G.  On
-   entry eip points at the modrm byte.  Only sz == 4 is accepted.
-   The old G value and the E operand are pushed for the helper, and
-   the second word popped afterwards is stored in G.  Returns the
-   address following the insn. */
-static
-Addr dis_bs_E_G ( UCodeBlock* cb, Int sz, Addr eip, Bool fwds )
-{
-   UChar dis_buf[50];
-   UChar modrm;
-   UInt  amode;
-   Int   t_g, t_e, t_addr, helper;
-
-   vg_assert(sz == 2 || sz == 4);
-   vg_assert(sz==4);
-
-   if (fwds)
-      helper = VGOFF_(helper_bsf);
-   else
-      helper = VGOFF_(helper_bsr);
-
-   modrm = getUChar(eip);
-   t_g   = newTemp(cb);
-   t_e   = newTemp(cb);
-
-   /* First argument: the current value of G. */
-   uInstr0(cb, CALLM_S, 0);
-   uInstr2(cb, GET,  sz, ArchReg, gregOfRM(modrm), TempReg, t_g);
-   uInstr1(cb, PUSH, sz, TempReg, t_g);
-
-   /* Second argument: the E operand, from a register or memory. */
-   if (!epartIsReg(modrm)) {
-      amode  = disAMode ( cb, eip, dis?dis_buf:NULL );
-      t_addr = LOW24(amode);
-      eip    = eip + HI8(amode);
-      uInstr2(cb, LOAD, sz, TempReg, t_addr, TempReg, t_e);
-      if (dis)
-         VG_(printf)("bs%c%c %s, %s\n",
-                     fwds ? 'f' : 'r',
-                     nameISize(sz), dis_buf,
-                     nameIReg(sz, gregOfRM(modrm)));
-   } else {
-      eip = eip + 1;
-      uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t_e);
-      if (dis)
-         VG_(printf)("bs%c%c %s, %s\n",
-                     fwds ? 'f' : 'r',
-                     nameISize(sz), nameIReg(sz, eregOfRM(modrm)), 
-                     nameIReg(sz, gregOfRM(modrm)));
-   }
-
-   uInstr1(cb, PUSH,  sz,  TempReg, t_e);
-   uInstr1(cb, CALLM, 0,   Lit16, helper);
-   uFlagsRWU(cb, FlagsEmpty, FlagZ, FlagsOSACP);
-   /* Two pops into the same temp: the first value is overwritten by
-      the second, which is what ends up in G. */
-   uInstr1(cb, POP,   sz,  TempReg, t_e);
-   uInstr1(cb, POP,   sz,  TempReg, t_e);
-   uInstr2(cb, PUT,   sz,  TempReg, t_e, ArchReg, gregOfRM(modrm));
-   uInstr0(cb, CALLM_E, 0);
-
-   return eip;
-}
-
-
-/* Generate ucode exchanging %eAX with the integer register reg, at
-   operand size sz (2 or 4).  Both values are read before either
-   register is written. */
-static 
-void codegen_xchg_eAX_Reg ( UCodeBlock* cb, Int sz, Int reg )
-{
-   Int t_acc, t_other;
-
-   vg_assert(sz == 2 || sz == 4);
-
-   t_acc   = newTemp(cb);
-   t_other = newTemp(cb);
-
-   /* Read both ... */
-   uInstr2(cb, GET, sz, ArchReg, R_EAX, TempReg, t_acc);
-   uInstr2(cb, GET, sz, ArchReg, reg,   TempReg, t_other);
-   /* ... then write them back crossed over. */
-   uInstr2(cb, PUT, sz, TempReg, t_other, ArchReg, R_EAX);
-   uInstr2(cb, PUT, sz, TempReg, t_acc,   ArchReg, reg);
-
-   if (dis) {
-      VG_(printf)("xchg%c %s, %s\n", nameISize(sz),
-                  nameIReg(sz, R_EAX), nameIReg(sz, reg));
-   }
-}
-
-
-/* Generate ucode for SAHF: pass %EAX, masked down to its %AH bits,
-   to helper_SAHF, which is marked as writing the S, Z, A, C and P
-   flags. */
-static 
-void codegen_SAHF ( UCodeBlock* cb )
-{
-   Int t_eax, t_mask;
-
-   t_eax  = newTemp(cb);
-   t_mask = newTemp(cb);
-
-   uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t_eax);
-
-   /* Zero every bit of t_eax outside %AH.  Without this the
-      instrumenter would check all 32 bits at the PUSH below and
-      could complain, incorrectly, about undefined bits which SAHF
-      never looks at.  Discovered by Daniel Veillard
-      <veillard@redhat.com>. */
-   uInstr2(cb, MOV, 4, Literal, 0, TempReg, t_mask);
-   uLiteral(cb, 0xFF00);
-   uInstr2(cb, AND, 4, TempReg, t_mask, TempReg, t_eax);
-   /* No flags annotation on the AND: it is internal to Valgrind and
-      must not disturb the client's condition codes. */
-
-   uInstr0(cb, CALLM_S, 0);
-   uInstr1(cb, PUSH,  4, TempReg, t_eax);
-   uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_SAHF));
-   uFlagsRWU(cb, FlagsEmpty, FlagsSZACP, FlagsEmpty);
-   uInstr1(cb, CLEAR, 0, Lit16, 4);
-   uInstr0(cb, CALLM_E, 0);
-}
-
-
-/* Translate CMPXCHG G, E.  On entry eip0 points at the modrm byte;
-   the address following the insn is returned.
-
-   The accumulator (%eAX) is compared with E by a flag-setting SUB
-   into a scratch temp.  If they are equal (Z), E receives G;
-   otherwise the accumulator receives E.  Both the accumulator and E
-   are then written back unconditionally. */
-static
-Addr dis_cmpxchg_G_E ( UCodeBlock* cb, 
-                       Int         size, 
-                       Addr        eip0 )
-{
-   Int   ta, junk, dest, src, acc;
-   UChar dis_buf[50];
-   UChar rm;
-
-   rm   = getUChar(eip0);
-   acc  = newTemp(cb);
-   src  = newTemp(cb);
-   dest = newTemp(cb);
-   junk = newTemp(cb);
-   /* Only needed to get gcc's dataflow analyser off my back. */
-   ta   = INVALID_TEMPREG;
-
-   /* Fetch the current value of E into dest. */
-   if (epartIsReg(rm)) {
-     uInstr2(cb, GET, size, ArchReg, eregOfRM(rm), TempReg, dest);
-     eip0++;
-     if (dis) VG_(printf)("cmpxchg%c %s,%s\n", 
-                          nameISize(size),
-                          nameIReg(size,gregOfRM(rm)),
-                          nameIReg(size,eregOfRM(rm)) );
-     /* A stray nameIReg() call whose result was discarded used to
-        follow here; it has been removed. */
-   } else {
-      UInt pair = disAMode ( cb, eip0, dis?dis_buf:NULL );
-      ta        = LOW24(pair);
-      uInstr2(cb, LOAD, size, TempReg, ta, TempReg, dest);
-      eip0 += HI8(pair);
-      if (dis) VG_(printf)("cmpxchg%c %s,%s\n",  nameISize(size), 
-                           nameIReg(size,gregOfRM(rm)), dis_buf);
-   }
-
-   /* junk := acc - dest, purely to set the flags. */
-   uInstr2(cb, GET, size, ArchReg, gregOfRM(rm), TempReg, src);
-   uInstr2(cb, GET, size, ArchReg, R_EAX,        TempReg, acc);
-   uInstr2(cb, MOV, size, TempReg, acc,          TempReg, junk);
-   uInstr2(cb, SUB, size, TempReg, dest,         TempReg, junk);
-   setFlagsFromUOpcode(cb, SUB);
-
-   /* Equal: dest := src.  Not equal: acc := dest. */
-   uInstr2(cb, CMOV, 4, TempReg, src,  TempReg, dest);
-   uCond(cb, CondZ);
-   uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-   uInstr2(cb, CMOV, 4, TempReg, dest, TempReg, acc);
-   uCond(cb, CondNZ);
-   uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-
-   uInstr2(cb, PUT, size, TempReg, acc, ArchReg, R_EAX);
-   if (epartIsReg(rm)) {
-     uInstr2(cb, PUT,   size, TempReg, dest, ArchReg, eregOfRM(rm));
-   } else {
-     /* NOTE(review): no SMC_IF_* check follows this STORE, unlike
-        the STORE in dis_xadd_G_E -- confirm that is intentional. */
-     uInstr2(cb, STORE, size, TempReg, dest, TempReg, ta);
-   }
-
-   return eip0;
-}
-
-
-/* Translate a conditional move
-      cmovcc E(reg-or-mem), G(reg)
-   where E is the source and G the destination.  On entry eip0 points
-   at the modrm byte; the address following the insn is returned.
-
-   Generated ucode:
-
-      E is reg:   GET %E, tmps            E is mem:   (getAddr E) -> tmpa
-                                                      LD (tmpa), tmps
-   then, in both cases:
-                  GET %G, tmpd
-                  CMOVcc tmps, tmpd
-                  PUT tmpd, %G
-*/
-static
-Addr dis_cmov_E_G ( UCodeBlock* cb, 
-                    Int         size, 
-                    Condcode    cond,
-                    Addr        eip0 )
-{
-   UChar dis_buf[50];
-   UChar rm       = getUChar(eip0);
-   Int   t_src    = newTemp(cb);
-   Int   t_dst    = newTemp(cb);   
-   Bool  e_is_reg = epartIsReg(rm) ? True : False;
-   UInt  len;     /* number of bytes occupied by the E operand */
-
-   /* Fetch the source operand. */
-   if (e_is_reg) {
-      uInstr2(cb, GET,  size, ArchReg, eregOfRM(rm), TempReg, t_src);
-      len = 1;
-   } else {
-      UInt amode  = disAMode ( cb, eip0, dis?dis_buf:NULL);
-      Int  t_addr = LOW24(amode);
-      uInstr2(cb, LOAD, size, TempReg, t_addr, TempReg, t_src);
-      len = HI8(amode);
-   }
-
-   /* Conditionally overwrite the destination's current value, then
-      write it back regardless. */
-   uInstr2(cb, GET,  size, ArchReg, gregOfRM(rm), TempReg, t_dst);
-   uInstr2(cb, CMOV,    4, TempReg, t_src, TempReg, t_dst);
-   uCond(cb, cond);
-   uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-   uInstr2(cb, PUT, size, TempReg, t_dst, ArchReg, gregOfRM(rm));
-
-   if (dis && e_is_reg)
-      VG_(printf)("cmov%c%s %s,%s\n", 
-                  nameISize(size), 
-                  VG_(nameCondcode)(cond),
-                  nameIReg(size,eregOfRM(rm)),
-                  nameIReg(size,gregOfRM(rm)));
-   if (dis && !e_is_reg)
-      VG_(printf)("cmov%c%s %s,%s\n", 
-                  nameISize(size), 
-                  VG_(nameCondcode)(cond),
-                  dis_buf,
-                  nameIReg(size,gregOfRM(rm)));
-
-   return len+eip0;
-}
-
-
-/* Translate XADD G, E:  E receives E + G, and G receives the old
-   value of E.  Flags are set as for ADD.  On entry eip0 points at the
-   modrm byte; the address following the insn is returned. */
-static
-Addr dis_xadd_G_E ( UCodeBlock* cb, 
-                    Int         sz, 
-                    Addr        eip0 )
-{
-   UChar rm  = getUChar(eip0);
-   UChar dis_buf[50];
-
-   Int tmpd = newTemp(cb);   /* old value of E (the destination) */
-   Int tmpt = newTemp(cb);   /* G, then the sum E + G */
-
-   if (epartIsReg(rm)) {
-      uInstr2(cb, GET, sz, ArchReg, eregOfRM(rm), TempReg, tmpd);
-      uInstr2(cb, GET, sz, ArchReg, gregOfRM(rm), TempReg, tmpt);
-      uInstr2(cb, ADD, sz, TempReg, tmpd, TempReg, tmpt);
-      setFlagsFromUOpcode(cb, ADD);
-      /* Write G (the old E value) first and E (the sum) last.  The
-         architectural order is SRC := DEST; DEST := TEMP, so when E
-         and G name the same register the sum must be what remains.
-         Writing E first would leave the old value there instead. */
-      uInstr2(cb, PUT, sz, TempReg, tmpd, ArchReg, gregOfRM(rm));
-      uInstr2(cb, PUT, sz, TempReg, tmpt, ArchReg, eregOfRM(rm));
-      if (dis)
-         VG_(printf)("xadd%c %s, %s\n", nameISize(sz), 
-                     nameIReg(sz,gregOfRM(rm)), 
-                     nameIReg(sz,eregOfRM(rm)));
-      return 1+eip0;
-   } else {
-      UInt pair = disAMode ( cb, eip0, dis?dis_buf:NULL);
-      Int  tmpa  = LOW24(pair);
-      uInstr2(cb, LOAD, sz, TempReg, tmpa,          TempReg, tmpd);
-      uInstr2(cb, GET,  sz, ArchReg, gregOfRM(rm),  TempReg, tmpt);
-      uInstr2(cb,  ADD, sz, TempReg, tmpd, TempReg, tmpt);
-      setFlagsFromUOpcode(cb, ADD);
-      /* The sum goes to memory, the old memory value to G. */
-      uInstr2(cb, STORE, sz, TempReg, tmpt, TempReg, tmpa);
-      SMC_IF_SOME(cb);
-      uInstr2(cb, PUT, sz, TempReg, tmpd, ArchReg, gregOfRM(rm));
-      if (dis)
-         VG_(printf)("xadd%c %s, %s\n", nameISize(sz), 
-                     nameIReg(sz,gregOfRM(rm)), 
-                     dis_buf);
-      return HI8(pair)+eip0;
-   }
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Disassembling entire basic blocks                    ---*/
-/*------------------------------------------------------------*/
-
-/* Disassemble a single instruction into ucode, returning the update
-   eip, and setting *isEnd to True if this is the last insn in a basic
-   block.  Also do debug printing if (dis). */
-
-static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd )
-{
-   UChar opc, modrm, abyte;
-   UInt  d32, pair;
-   Int   t1, t2, t3, t4;
-   UChar dis_buf[50];
-   Int   am_sz, d_sz;
-
-   Int   sz           = 4;
-   Int   first_uinstr = cb->used;
-   *isEnd = False;
-   t1 = t2 = t3 = t4 = INVALID_TEMPREG;
-
-   if (dis) VG_(printf)("\t0x%x:  ", eip);
-
-   /* Spot the client-request magic sequence. */
-   {
-      UChar* myeip = (UChar*)eip;
-      /* Spot this:
-         C1C01D                roll $29, %eax
-         C1C003                roll $3,  %eax
-         C1C81B                rorl $27, %eax
-         C1C805                rorl $5,  %eax
-         C1C00D                roll $13, %eax
-         C1C013                roll $19, %eax      
-      */
-      if (myeip[ 0] == 0xC1 && myeip[ 1] == 0xC0 && myeip[ 2] == 0x1D &&
-          myeip[ 3] == 0xC1 && myeip[ 4] == 0xC0 && myeip[ 5] == 0x03 &&
-          myeip[ 6] == 0xC1 && myeip[ 7] == 0xC8 && myeip[ 8] == 0x1B &&
-          myeip[ 9] == 0xC1 && myeip[10] == 0xC8 && myeip[11] == 0x05 &&
-          myeip[12] == 0xC1 && myeip[13] == 0xC0 && myeip[14] == 0x0D &&
-          myeip[15] == 0xC1 && myeip[16] == 0xC0 && myeip[17] == 0x13
-         ) {
-         eip += 18;
-         uInstr1(cb, JMP,  0, Literal, 0);
-         uLiteral(cb, eip);
-         uCond(cb, CondAlways);
-         LAST_UINSTR(cb).jmpkind = JmpClientReq;
-         *isEnd = True;
-         if (dis) 
-            VG_(printf)("%%edx = client_request ( %%eax )\n");
-         return eip;
-      }
-   }
-
-   /* Skip a LOCK prefix. */
-   if (getUChar(eip) == 0xF0) { 
-      /* VG_(printf)("LOCK LOCK LOCK LOCK LOCK \n"); */
-      eip++;
-   }
-
-   /* Crap out if we see a segment override prefix. */
-   if (getUChar(eip) == 0x65) {
-      VG_(message)(Vg_DebugMsg, "");
-      VG_(message)(Vg_DebugMsg, "Possible workaround for the following abort: do not use special");
-      VG_(message)(Vg_DebugMsg, "PII/PIII-specific pthreads library (possibly in /lib/i686/*.so).");
-      VG_(message)(Vg_DebugMsg, "You might be able to kludge around this by renaming /lib/i686 to");
-      VG_(message)(Vg_DebugMsg, "/lib/i686-HIDDEN.  On RedHat 7.2 this causes ld.so to fall back");
-      VG_(message)(Vg_DebugMsg, "to using the less specialised versions in /lib instead, which");
-      VG_(message)(Vg_DebugMsg, "valgrind might be able to better deal with.");
-      VG_(message)(Vg_DebugMsg, "");
-      VG_(message)(Vg_DebugMsg, "WARNING. WARNING. WARNING. WARNING. WARNING. WARNING. WARNING.");
-      VG_(message)(Vg_DebugMsg, "WARNING: The suggested kludge may also render your system unbootable");
-      VG_(message)(Vg_DebugMsg, "WARNING: or otherwise totally screw it up.  Only try this if you");
-      VG_(message)(Vg_DebugMsg, "WARNING: know what you are doing, and are prepared to take risks.");
-      VG_(message)(Vg_DebugMsg, "YOU HAVE BEEN WARNED. YOU HAVE BEEN WARNED. YOU HAVE BEEN WARNED.");
-      VG_(message)(Vg_DebugMsg, "");
-      VG_(message)(Vg_DebugMsg, "Another consideration is that this may well mean your application");
-      VG_(message)(Vg_DebugMsg, "uses threads, which valgrind doesn't currently support, so even if");
-      VG_(message)(Vg_DebugMsg, "you work around this problem, valgrind may abort later if it sees");
-      VG_(message)(Vg_DebugMsg, "a clone() system call.");
-      VG_(unimplemented)("x86 segment override (SEG=GS) prefix; see above for details");
-   }
-
-   /* Detect operand-size overrides. */
-   if (getUChar(eip) == 0x66) { sz = 2; eip++; };
-
-   opc = getUChar(eip); eip++;
-
-   switch (opc) {
-
-   /* ------------------------ Control flow --------------- */
-
-   case 0xC2: /* RET imm16 */
-      d32 = getUDisp16(eip); eip += 2;
-      goto do_Ret;
-   case 0xC3: /* RET */
-      d32 = 0;
-      goto do_Ret;
-   do_Ret:
-      t1 = newTemp(cb); t2 = newTemp(cb);
-      uInstr2(cb, GET,  4, ArchReg, R_ESP, TempReg, t1);
-      uInstr2(cb, LOAD, 4, TempReg, t1,    TempReg, t2);
-      uInstr2(cb, ADD,  4, Literal, 0,     TempReg, t1);
-      uLiteral(cb, 4+d32);
-      uInstr2(cb, PUT,  4, TempReg, t1,    ArchReg, R_ESP);
-      uInstr1(cb, JMP,  0, TempReg, t2);
-      uCond(cb, CondAlways);
-      LAST_UINSTR(cb).jmpkind = JmpRet;
-
-      *isEnd = True;
-      if (dis) {
-         if (d32 == 0) VG_(printf)("ret\n"); 
-                  else VG_(printf)("ret %d\n", d32);
-      }
-      break;
-      
-   case 0xE8: /* CALL J4 */
-      d32 = getUDisp32(eip); eip += 4;
-      d32 += eip; /* eip now holds return-to addr, d32 is call-to addr */
-      if (d32 == eip && getUChar(eip) >= 0x58 
-                     && getUChar(eip) <= 0x5F) {
-         /* Specially treat the position-independent-code idiom 
-                 call X
-              X: popl %reg
-            as 
-                 movl %eip, %reg.
-            since this generates better code, but for no other reason. */
-         Int archReg = getUChar(eip) - 0x58;
-         /* VG_(printf)("-- fPIC thingy\n"); */
-         t1 = newTemp(cb);
-         uInstr2(cb, MOV, 4, Literal, 0, TempReg, t1);
-         uLiteral(cb, eip);
-         uInstr2(cb, PUT, 4, TempReg, t1,  ArchReg, archReg);
-         eip++; /* Step over the POP */
-         if (dis) 
-            VG_(printf)("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
-      } else {
-         /* The normal sequence for a call. */
-         t1 = newTemp(cb); t2 = newTemp(cb); t3 = newTemp(cb);
-         uInstr2(cb, GET,   4, ArchReg, R_ESP, TempReg, t3);
-         uInstr2(cb, MOV,   4, TempReg, t3,    TempReg, t1);
-         uInstr2(cb, SUB,   4, Literal, 0,     TempReg, t1);
-	 uLiteral(cb, 4);
-         uInstr2(cb, PUT,   4, TempReg, t1,    ArchReg, R_ESP);
-         uInstr2(cb, MOV,   4, Literal, 0,     TempReg, t2);
-	 uLiteral(cb, eip);
-         uInstr2(cb, STORE, 4, TempReg, t2,    TempReg, t1);
-         SMC_IF_ALL(cb);
-         uInstr1(cb, JMP,   0, Literal, 0);
-	 uLiteral(cb, d32);
-         uCond(cb, CondAlways);
-         LAST_UINSTR(cb).jmpkind = JmpCall;
-         *isEnd = True;
-         if (dis) VG_(printf)("call 0x%x\n",d32);
-      }
-      break;
-
-   case 0xC9: /* LEAVE */
-      /* Copy %ebp into %esp, reload %ebp from the word at that
-         address, then advance %esp by 4 to step past that word. */
-      t1 = newTemp(cb); t2 = newTemp(cb);
-      uInstr2(cb, GET,  4, ArchReg, R_EBP, TempReg, t1);
-      uInstr2(cb, PUT,  4, TempReg, t1, ArchReg, R_ESP);
-      uInstr2(cb, LOAD, 4, TempReg, t1, TempReg, t2);
-      uInstr2(cb, PUT,  4, TempReg, t2, ArchReg, R_EBP);
-      uInstr2(cb, ADD,  4, Literal, 0, TempReg, t1);
-      uLiteral(cb, 4);
-      uInstr2(cb, PUT,  4, TempReg, t1, ArchReg, R_ESP);
-      /* Newline-terminate the trace line, as the other insns do. */
-      if (dis) VG_(printf)("leave\n");
-      break;
-
-   /* ---------------- Misc weird-ass insns --------------- */
-
-   case 0x27: /* DAA */
-   case 0x2F: /* DAS */
-      t1 = newTemp(cb);
-      uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t1);
-      /* Widen %AL to 32 bits, so it's all defined when we push it. */
-      uInstr1(cb, WIDEN, 4, TempReg, t1);
-      LAST_UINSTR(cb).extra4b = 1;
-      LAST_UINSTR(cb).signed_widen = False;
-      uInstr0(cb, CALLM_S, 0);
-      uInstr1(cb, PUSH, 4, TempReg, t1);
-      uInstr1(cb, CALLM, 0, Lit16, 
-                  opc == 0x27 ? VGOFF_(helper_DAA) : VGOFF_(helper_DAS) );
-      uFlagsRWU(cb, FlagsAC, FlagsOSZACP, FlagsEmpty);
-      uInstr1(cb, POP, 4, TempReg, t1);
-      uInstr0(cb, CALLM_E, 0);
-      uInstr2(cb, PUT, 1, TempReg, t1, ArchReg, R_AL);
-      if (dis) VG_(printf)(opc == 0x27 ? "daa\n" : "das\n");
-      break;
-
-   /* ------------------------ CWD/CDQ -------------------- */
-
-   case 0x98: /* CBW (16-bit operand) / CWDE (32-bit operand) */
-      /* Sign-extend the low half of the accumulator into the full
-         sz-wide accumulator: %al -> %ax, or %ax -> %eax. */
-      t1 = newTemp(cb);
-      if (sz == 4) {
-         uInstr2(cb, GET,   2, ArchReg, R_EAX, TempReg, t1);
-         uInstr1(cb, WIDEN, 4, TempReg, t1); /* 4 == dst size */
-         LAST_UINSTR(cb).extra4b = 2; /* the source size */
-         LAST_UINSTR(cb).signed_widen = True;
-         uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, R_EAX);
-         /* The 32-bit form of opcode 0x98 is CWDE; "cwd" is the name
-            of a different instruction (opcode 0x99). */
-         if (dis) VG_(printf)("cwde\n");
-      } else {
-         vg_assert(sz == 2);
-         uInstr2(cb, GET,   1, ArchReg, R_EAX, TempReg, t1);
-         uInstr1(cb, WIDEN, 2, TempReg, t1); /* 2 == dst size */
-         LAST_UINSTR(cb).extra4b = 1; /* the source size */
-         LAST_UINSTR(cb).signed_widen = True;
-         uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
-         if (dis) VG_(printf)("cbw\n");
-      }
-      break;
-
-   case 0x99: /* CWD/CDQ */
-      t1 = newTemp(cb);
-      uInstr2(cb, GET, sz, ArchReg, R_EAX, TempReg, t1);
-      uInstr2(cb, SAR, sz, Literal, 0,     TempReg, t1);
-      uLiteral(cb, sz == 2 ? 15  : 31);
-      uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, R_EDX);
-      if (dis) VG_(printf)(sz == 2 ? "cwdq\n" : "cdqq\n");
-      break;
-
-   /* ------------------------ FPU ops -------------------- */
-
-   case 0x9E: /* SAHF */
-      codegen_SAHF ( cb );
-      if (dis) VG_(printf)("sahf\n");
-      break;
-
-   case 0x9B: /* FWAIT */
-      /* ignore? */
-      if (dis) VG_(printf)("fwait\n");
-      break;
-
-   case 0xD8:
-   case 0xD9:
-   case 0xDA:
-   case 0xDB:
-   case 0xDC:
-   case 0xDD:
-   case 0xDE:
-   case 0xDF:
-      eip = dis_fpu ( cb, opc, eip );
-      break;
-
-   /* ------------------------ INC & DEC ------------------ */
-
-   case 0x40: /* INC eAX */
-   case 0x41: /* INC eCX */
-   case 0x42: /* INC eDX */
-   case 0x43: /* INC eBX */
-   case 0x45: /* INC eBP */
-   case 0x46: /* INC eSI */
-   case 0x47: /* INC eDI */
-      t1 = newTemp(cb);
-      uInstr2(cb, GET, sz, ArchReg, (UInt)(opc - 0x40),
-                             TempReg, t1);
-      uInstr1(cb, INC, sz, TempReg, t1);
-      setFlagsFromUOpcode(cb, INC);
-      uInstr2(cb, PUT, sz, TempReg, t1, ArchReg,
-                             (UInt)(opc - 0x40));
-      if (dis)
-         VG_(printf)("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
-      break;
-
-   case 0x48: /* DEC eAX */
-   case 0x49: /* DEC eCX */
-   case 0x4A: /* DEC eDX */
-   case 0x4B: /* DEC eBX */
-   case 0x4D: /* DEC eBP */
-   case 0x4E: /* DEC eSI */
-   case 0x4F: /* DEC eDI */
-      t1 = newTemp(cb);
-      uInstr2(cb, GET, sz, ArchReg, (UInt)(opc - 0x48),
-                             TempReg, t1);
-      uInstr1(cb, DEC, sz, TempReg, t1);
-      setFlagsFromUOpcode(cb, DEC);
-      uInstr2(cb, PUT, sz, TempReg, t1, ArchReg,
-                             (UInt)(opc - 0x48));
-      if (dis)
-         VG_(printf)("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
-      break;
-
-   /* ------------------------ INT ------------------------ */
-
-   case 0xCD: /* INT imm8 */
-      d32 = getUChar(eip); eip++;
-      if (d32 != 0x80) VG_(panic)("disInstr: INT but not 0x80 !");
-      /* It's important that all ArchRegs carry their up-to-date value
-         at this point.  So we declare an end-of-block here, which
-         forces any TempRegs caching ArchRegs to be flushed. */
-      uInstr1(cb, JMP,  0, Literal, 0);
-      uLiteral(cb, eip);
-      uCond(cb, CondAlways);
-      LAST_UINSTR(cb).jmpkind = JmpSyscall;
-      *isEnd = True;
-      if (dis) VG_(printf)("int $0x80\n");
-      break;
-
-   /* ------------------------ Jcond, byte offset --------- */
-
-   case 0xEB: /* Jb (jump, byte offset) */
-      d32 = (eip+1) + getSDisp8(eip); eip++;
-      uInstr1(cb, JMP, 0, Literal, 0);
-      uLiteral(cb, d32);
-      uCond(cb, CondAlways);
-      *isEnd = True;
-      if (dis)
-         VG_(printf)("jmp-8 0x%x\n", d32);
-      break;
-
-   case 0xE9: /* Jv (jump, 16/32 offset) */
-      d32 = (eip+sz) + getSDisp(sz,eip); eip += sz;
-      uInstr1(cb, JMP, 0, Literal, 0);
-      uLiteral(cb, d32);
-      uCond(cb, CondAlways);
-      *isEnd = True;
-      if (dis)
-        VG_(printf)("jmp 0x%x\n", d32);
-      break;
-
-   /* Conditional jump with a signed 8-bit displacement.  The condition
-      code is taken directly from the opcode as (opc - 0x70). */
-   case 0x70: /* JOb (jump overflow) */
-   case 0x71: /* JNOb (jump not overflow) */
-   case 0x72: /* JBb/JNAEb (jump below) */
-   case 0x73: /* JNBb/JAEb (jump not below) */
-   case 0x74: /* JZb/JEb (jump zero) */
-   case 0x75: /* JNZb/JNEb (jump not zero) */
-   case 0x76: /* JBEb/JNAb (jump below or equal) */
-   case 0x77: /* JNBEb/JAb (jump not below or equal) */
-   case 0x78: /* JSb (jump negative) */
-   case 0x79: /* JNSb (jump not negative) */
-   case 0x7A: /* JP (jump parity even) */
-   case 0x7B: /* JNP/JPO (jump parity odd) */
-   case 0x7C: /* JLb/JNGEb (jump less) */
-   case 0x7D: /* JGEb/JNLb (jump greater or equal) */
-   case 0x7E: /* JLEb/JNGb (jump less or equal) */
-   case 0x7F: /* JGb/JNLEb (jump greater) */
-      /* Target = address of the next insn (eip+1) plus the signed
-         displacement byte; eip is then stepped over that byte. */
-      d32 = (eip+1) + getSDisp8(eip); eip++;
-      uInstr1(cb, JMP, 0, Literal, 0);
-      uLiteral(cb, d32);
-      uCond(cb, (Condcode)(opc - 0x70));
-      uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-      /* It's actually acceptable not to end this basic block at a
-         control transfer, reducing the number of jumps through
-         vg_dispatch, at the expense of possibly translating the insns
-         following this jump twice.  This does give faster code, but
-         on the whole I don't think the effort is worth it. */
-      /* Unconditional jump to the fall-through address, which ends
-         the basic block here. */
-      uInstr1(cb, JMP, 0, Literal, 0);
-      uLiteral(cb, eip);
-      uCond(cb, CondAlways);
-      *isEnd = True;
-      /* The above 3 lines would be removed if the bb was not to end
-         here. */
-      if (dis)
-         VG_(printf)("j%s-8 0x%x\n", VG_(nameCondcode)(opc - 0x70), d32);
-      break;
-
-   case 0xE3: /* JECXZ or perhaps JCXZ, depending on OSO ?  Intel
-                 manual says it depends on address size override,
-                 which doesn't sound right to me. */
-      d32 = (eip+1) + getSDisp8(eip); eip++;
-      t1 = newTemp(cb);
-      uInstr2(cb, GET,  4,  ArchReg, R_ECX, TempReg, t1);
-      uInstr2(cb, JIFZ, 4,  TempReg, t1,    Literal, 0);
-      uLiteral(cb, d32);
-      if (dis)
-         VG_(printf)("j%sz 0x%x\n", nameIReg(sz, R_ECX), d32);
-      break;
-
-   case 0xE2: /* LOOP disp8 */
-      /* Again, the docs say this uses ECX/CX as a count depending on
-         the address size override, not the operand one.  Since we
-         don't handle address size overrides, I guess that means
-         ECX. */
-      d32 = (eip+1) + getSDisp8(eip); eip++;
-      t1 = newTemp(cb);
-      uInstr2(cb, GET,  4, ArchReg, R_ECX, TempReg, t1);
-      uInstr1(cb, DEC,  4, TempReg, t1);
-      uInstr2(cb, PUT,  4, TempReg, t1,    ArchReg, R_ECX);
-      uInstr2(cb, JIFZ, 4, TempReg, t1,    Literal, 0);
-      uLiteral(cb, eip);
-      uInstr1(cb, JMP,  0, Literal, 0);
-      uLiteral(cb, d32);
-      uCond(cb, CondAlways);
-      *isEnd = True;
-      if (dis)
-         VG_(printf)("loop 0x%x\n", d32);
-      break;
-
-   /* ------------------------ IMUL ----------------------- */
-
-   case 0x69: /* IMUL Iv, Ev, Gv */
-      eip = dis_imul_I_E_G ( cb, sz, eip, sz );
-      break;
-   case 0x6B: /* IMUL Ib, Ev, Gv */
-      eip = dis_imul_I_E_G ( cb, sz, eip, 1 );
-      break;
-
-   /* ------------------------ MOV ------------------------ */
-
-   case 0x88: /* MOV Gb,Eb */
-      eip = dis_mov_G_E(cb, 1, eip);
-      break;
-
-   case 0x89: /* MOV Gv,Ev */
-      eip = dis_mov_G_E(cb, sz, eip);
-      break;
-
-   case 0x8A: /* MOV Eb,Gb */
-      eip = dis_mov_E_G(cb, 1, eip);
-      break;
- 
-   case 0x8B: /* MOV Ev,Gv */
-      eip = dis_mov_E_G(cb, sz, eip);
-      break;
- 
-   case 0x8D: /* LEA M,Gv */
-      modrm = getUChar(eip);
-      if (epartIsReg(modrm)) 
-         VG_(panic)("LEA M,Gv: modRM refers to register");
-      pair = disAMode ( cb, eip, dis?dis_buf:NULL );
-      eip  += HI8(pair);
-      t1   = LOW24(pair);
-      uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, gregOfRM(modrm));
-      if (dis)
-         VG_(printf)("lea%c %s, %s\n", nameISize(sz), dis_buf, 
-                                       nameIReg(sz,gregOfRM(modrm)));
-      break;
-
-   case 0xA0: /* MOV Ob,AL */
-      sz = 1;
-      /* Fall through ... */
-   case 0xA1: /* MOV Ov,eAX */
-      d32 = getUDisp32(eip); eip += 4;
-      t1 = newTemp(cb); t2 = newTemp(cb);
-      uInstr2(cb, MOV,   4, Literal, 0,   TempReg, t2);
-      uLiteral(cb, d32);
-      uInstr2(cb, LOAD, sz, TempReg, t2,  TempReg, t1);
-      uInstr2(cb, PUT,  sz, TempReg, t1,  ArchReg, R_EAX);
-      if (dis) VG_(printf)("mov%c 0x%x,%s\n", nameISize(sz), 
-                           d32, nameIReg(sz,R_EAX));
-      break;
-
-   case 0xA2: /* MOV AL,Ob */
-      sz = 1;
-      /* Fall through ... */
-   case 0xA3: /* MOV eAX,Ov */
-      d32 = getUDisp32(eip); eip += 4;
-      t1 = newTemp(cb); t2 = newTemp(cb);
-      uInstr2(cb, GET,   sz, ArchReg, R_EAX, TempReg, t1);
-      uInstr2(cb, MOV,    4, Literal, 0,     TempReg, t2);
-      uLiteral(cb, d32);
-      uInstr2(cb, STORE, sz, TempReg, t1,    TempReg, t2);
-      SMC_IF_SOME(cb);
-      if (dis) VG_(printf)("mov%c %s,0x%x\n", nameISize(sz), 
-                           nameIReg(sz,R_EAX), d32);
-      break;
-
-   case 0xB0: /* MOV imm,AL */
-   case 0xB1: /* MOV imm,CL */
-   case 0xB2: /* MOV imm,DL */
-   case 0xB3: /* MOV imm,BL */
-   case 0xB4: /* MOV imm,AH */
-   case 0xB5: /* MOV imm,CH */
-   case 0xB6: /* MOV imm,DH */
-   case 0xB7: /* MOV imm,BH */
-      d32 = getUChar(eip); eip += 1;
-      t1 = newTemp(cb);
-      uInstr2(cb, MOV, 1, Literal, 0,  TempReg, t1);
-      uLiteral(cb, d32);
-      uInstr2(cb, PUT, 1, TempReg, t1, ArchReg, opc-0xB0);
-      if (dis) VG_(printf)("movb $0x%x,%s\n", d32,
-                           nameIReg(1,opc-0xB0));
-      break;
-
-   case 0xB8: /* MOV imm,eAX */
-   case 0xB9: /* MOV imm,eCX */
-   case 0xBA: /* MOV imm,eDX */
-   case 0xBB: /* MOV imm,eBX */
-   case 0xBD: /* MOV imm,eBP */
-   case 0xBE: /* MOV imm,eSI */
-   case 0xBF: /* MOV imm,eDI */
-      d32 = getUDisp(sz,eip); eip += sz;
-      t1 = newTemp(cb);
-      uInstr2(cb, MOV, sz, Literal, 0,  TempReg, t1);
-      uLiteral(cb, d32);
-      uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, opc-0xB8);
-      if (dis) VG_(printf)("mov%c $0x%x,%s\n", nameISize(sz), d32,
-                           nameIReg(sz,opc-0xB8));
-      break;
-
-   case 0xC6: /* MOV Ib,Eb */
-      sz = 1;
-      goto do_Mov_I_E;
-   case 0xC7: /* MOV Iv,Ev */
-      goto do_Mov_I_E;
-
-   do_Mov_I_E:
-      modrm = getUChar(eip);
-      if (epartIsReg(modrm)) {
-         d32 = getUDisp(sz,eip); eip += sz;
-         t1 = newTemp(cb);
-         uInstr2(cb, MOV, sz, Literal, 0,  TempReg, t1);
-	 uLiteral(cb, d32);
-         uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, eregOfRM(modrm));
-         if (dis) VG_(printf)("mov%c $0x%x, %s\n", nameISize(sz), d32, 
-                              nameIReg(sz,eregOfRM(modrm)));
-      } else {
-         pair = disAMode ( cb, eip, dis?dis_buf:NULL );
-         eip += HI8(pair);
-         d32 = getUDisp(sz,eip); eip += sz;
-         t1 = newTemp(cb);
-         t2 = LOW24(pair);
-         uInstr2(cb, MOV, sz, Literal, 0, TempReg, t1);
-	 uLiteral(cb, d32);
-         uInstr2(cb, STORE, sz, TempReg, t1, TempReg, t2);
-         SMC_IF_SOME(cb);
-         if (dis) VG_(printf)("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
-      }
-      break;
-
-   /* ------------------------ opl imm, A ----------------- */
-
-   case 0x04: /* ADD Ib, AL */
-      eip = dis_op_imm_A(cb, 1, ADD, True, eip, "add" );
-      break;
-   case 0x05: /* ADD Iv, eAX */
-      eip = dis_op_imm_A(cb, sz, ADD, True, eip, "add" );
-      break;
-
-   case 0x0C: /* OR Ib, AL */
-      eip = dis_op_imm_A(cb, 1, OR, True, eip, "or" );
-      break;
-   case 0x0D: /* OR Iv, eAX */
-      eip = dis_op_imm_A(cb, sz, OR, True, eip, "or" );
-      break;
-
-   case 0x1C: /* SBB Ib, AL */
-      eip = dis_op_imm_A(cb, 1, SBB, True, eip, "sbb" );
-      break;
-
-   case 0x24: /* AND Ib, AL */
-      eip = dis_op_imm_A(cb, 1, AND, True, eip, "and" );
-      break;
-   case 0x25: /* AND Iv, eAX */
-      eip = dis_op_imm_A(cb, sz, AND, True, eip, "and" );
-      break;
-
-   case 0x2C: /* SUB Ib, AL */
-      eip = dis_op_imm_A(cb, 1, SUB, True, eip, "sub" );
-      break;
-   case 0x2D: /* SUB Iv, eAX */
-      eip = dis_op_imm_A(cb, sz, SUB, True, eip, "sub" );
-      break;
-
-   case 0x34: /* XOR Ib, AL */
-      eip = dis_op_imm_A(cb, 1, XOR, True, eip, "xor" );
-      break;
-   case 0x35: /* XOR Iv, eAX */
-      eip = dis_op_imm_A(cb, sz, XOR, True, eip, "xor" );
-      break;
-
-   case 0x3C: /* CMP Ib, AL */
-      eip = dis_op_imm_A(cb, 1, SUB, False, eip, "cmp" );
-      break;
-   case 0x3D: /* CMP Iv, eAX */
-      eip = dis_op_imm_A(cb, sz, SUB, False, eip, "cmp" );
-      break;
-
-   case 0xA8: /* TEST Ib, AL */
-      eip = dis_op_imm_A(cb, 1, AND, False, eip, "test" );
-      break;
-   case 0xA9: /* TEST Iv, eAX */
-      eip = dis_op_imm_A(cb, sz, AND, False, eip, "test" );
-      break;
-
-   /* ------------------------ opl Ev, Gv ----------------- */
-
-   case 0x02: /* ADD Eb,Gb */
-      eip = dis_op2_E_G ( cb, ADD, True, 1, eip, "add" );
-      break;
-   case 0x03: /* ADD Ev,Gv */
-      eip = dis_op2_E_G ( cb, ADD, True, sz, eip, "add" );
-      break;
-
-   case 0x0A: /* OR Eb,Gb */
-      eip = dis_op2_E_G ( cb, OR, True, 1, eip, "or" );
-      break;
-   case 0x0B: /* OR Ev,Gv */
-      eip = dis_op2_E_G ( cb, OR, True, sz, eip, "or" );
-      break;
-
-   case 0x12: /* ADC Eb,Gb */
-      eip = dis_op2_E_G ( cb, ADC, True, 1, eip, "adc" );
-      break;
-   case 0x13: /* ADC Ev,Gv */
-      eip = dis_op2_E_G ( cb, ADC, True, sz, eip, "adc" );
-      break;
-
-   case 0x1B: /* SBB Ev,Gv */
-      eip = dis_op2_E_G ( cb, SBB, True, sz, eip, "sbb" );
-      break;
-
-   case 0x22: /* AND Eb,Gb */
-      eip = dis_op2_E_G ( cb, AND, True, 1, eip, "and" );
-      break;
-   case 0x23: /* AND Ev,Gv */
-      eip = dis_op2_E_G ( cb, AND, True, sz, eip, "and" );
-      break;
-
-   case 0x2A: /* SUB Eb,Gb */
-      eip = dis_op2_E_G ( cb, SUB, True, 1, eip, "sub" );
-      break;
-   case 0x2B: /* SUB Ev,Gv */
-      eip = dis_op2_E_G ( cb, SUB, True, sz, eip, "sub" );
-      break;
-
-   case 0x32: /* XOR Eb,Gb */
-      eip = dis_op2_E_G ( cb, XOR, True, 1, eip, "xor" );
-      break;
-   case 0x33: /* XOR Ev,Gv */
-      eip = dis_op2_E_G ( cb, XOR, True, sz, eip, "xor" );
-      break;
-
-   case 0x3A: /* CMP Eb,Gb */
-      eip = dis_op2_E_G ( cb, SUB, False, 1, eip, "cmp" );
-      break;
-   case 0x3B: /* CMP Ev,Gv */
-      eip = dis_op2_E_G ( cb, SUB, False, sz, eip, "cmp" );
-      break;
-
-   case 0x84: /* TEST Eb,Gb */
-      eip = dis_op2_E_G ( cb, AND, False, 1, eip, "test" );
-      break;
-   case 0x85: /* TEST Ev,Gv */
-      eip = dis_op2_E_G ( cb, AND, False, sz, eip, "test" );
-      break;
-
-   /* ------------------------ opl Gv, Ev ----------------- */
-
-   case 0x00: /* ADD Gb,Eb */
-      eip = dis_op2_G_E ( cb, ADD, True, 1, eip, "add" );
-      break;
-   case 0x01: /* ADD Gv,Ev */
-      eip = dis_op2_G_E ( cb, ADD, True, sz, eip, "add" );
-      break;
-
-   case 0x08: /* OR Gb,Eb */
-      eip = dis_op2_G_E ( cb, OR, True, 1, eip, "or" );
-      break;
-   case 0x09: /* OR Gv,Ev */
-      eip = dis_op2_G_E ( cb, OR, True, sz, eip, "or" );
-      break;
-
-   case 0x11: /* ADC Gv,Ev */
-      eip = dis_op2_G_E ( cb, ADC, True, sz, eip, "adc" );
-      break;
-
-   case 0x19: /* SBB Gv,Ev */
-      eip = dis_op2_G_E ( cb, SBB, True, sz, eip, "sbb" );
-      break;
-
-   case 0x20: /* AND Gb,Eb */
-      eip = dis_op2_G_E ( cb, AND, True, 1, eip, "and" );
-      break;
-   case 0x21: /* AND Gv,Ev */
-      eip = dis_op2_G_E ( cb, AND, True, sz, eip, "and" );
-      break;
-
-   case 0x28: /* SUB Gb,Eb */
-      eip = dis_op2_G_E ( cb, SUB, True, 1, eip, "sub" );
-      break;
-   case 0x29: /* SUB Gv,Ev */
-      eip = dis_op2_G_E ( cb, SUB, True, sz, eip, "sub" );
-      break;
-
-   case 0x30: /* XOR Gb,Eb */
-      eip = dis_op2_G_E ( cb, XOR, True, 1, eip, "xor" );
-      break;
-   case 0x31: /* XOR Gv,Ev */
-      eip = dis_op2_G_E ( cb, XOR, True, sz, eip, "xor" );
-      break;
-
-   case 0x38: /* CMP Gb,Eb */
-      eip = dis_op2_G_E ( cb, SUB, False, 1, eip, "cmp" );
-      break;
-   case 0x39: /* CMP Gv,Ev */
-      eip = dis_op2_G_E ( cb, SUB, False, sz, eip, "cmp" );
-      break;
-
-   /* ------------------------ POP ------------------------ */
-
-   case 0x58: /* POP eAX */
-   case 0x59: /* POP eCX */
-   case 0x5A: /* POP eDX */
-   case 0x5B: /* POP eBX */
-   case 0x5D: /* POP eBP */
-   case 0x5E: /* POP eSI */
-   case 0x5F: /* POP eDI */
-    { Int   n_pops;
-      Addr  eipS, eipE;
-      UChar ch;
-      if (sz != 4)           goto normal_pop_case;
-      if (VG_(clo_cachesim)) goto normal_pop_case;
-      /* eip points at first pop insn + 1.  Make eipS and eipE
-         bracket the sequence. */
-      eipE = eipS = eip - 1;
-      while (True) { 
-         ch = getUChar(eipE+1);
-         if (ch < 0x58 || ch > 0x5F || ch == 0x5C) break;
-         eipE++;
-      }
-      n_pops = eipE - eipS + 1;
-      if (0 && n_pops > 1) VG_(printf)("%d pops\n", n_pops);
-      t1 = newTemp(cb); t3 = newTemp(cb);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP,    TempReg, t1);
-      for (; eipS <= eipE; eipS++) {
-         ch = getUChar(eipS);
-	 uInstr2(cb, LOAD, 4, TempReg, t1, TempReg, t3);
-         uInstr2(cb, PUT,  4, TempReg, t3, ArchReg, ch-0x58);
-         uInstr2(cb, ADD,  4, Literal, 0,        TempReg, t1);
-         uLiteral(cb, 4);
-         SMC_IF_ALL(cb);
-         if (dis) 
-            VG_(printf)("popl %s\n", nameIReg(4,ch-0x58));
-      }
-      uInstr2(cb, PUT,    4, TempReg, t1,       ArchReg, R_ESP);
-      eip = eipE + 1;
-      break;
-    }
-
-   case 0x5C: /* POP eSP */
-   normal_pop_case:
-      t1 = newTemp(cb); t2 = newTemp(cb);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP,    TempReg, t2);
-      uInstr2(cb, LOAD,  sz, TempReg, t2,       TempReg, t1);
-      uInstr2(cb, ADD,    4, Literal, 0,        TempReg, t2);
-      uLiteral(cb, sz);
-      uInstr2(cb, PUT,    4, TempReg, t2,       ArchReg, R_ESP);
-      uInstr2(cb, PUT,   sz, TempReg, t1,       ArchReg, opc-0x58);
-      if (dis) 
-         VG_(printf)("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
-      break;
-
-   case 0x9D: /* POPF */
-      vg_assert(sz == 2 || sz == 4);
-      t1 = newTemp(cb); t2 = newTemp(cb);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP,    TempReg, t2);
-      uInstr2(cb, LOAD,  sz, TempReg, t2,       TempReg, t1);
-      uInstr2(cb, ADD,    4, Literal, 0,        TempReg, t2);
-      uLiteral(cb, sz);
-      uInstr2(cb, PUT,    4, TempReg, t2,       ArchReg, R_ESP);
-      uInstr1(cb, PUTF,  sz, TempReg, t1);
-      /* PUTF writes all the flags we are interested in */
-      uFlagsRWU(cb, FlagsEmpty, FlagsALL, FlagsEmpty);
-      if (dis) 
-         VG_(printf)("popf%c\n", nameISize(sz));
-      break;
-
-   case 0x61: /* POPA */
-    { Int reg;
-      /* Just to keep things sane, we assert for a size 4.  It's
-         probably OK for size 2 as well, but I'd like to find a test
-         case; ie, have the assertion fail, before committing to it.
-         If it fails for you, uncomment the sz == 2 bit, try again,
-         and let me know whether or not it works.  (jseward@acm.org).  */
-      vg_assert(sz == 4 /* || sz == 2 */);
-
-      /* Eight values are popped, one per register, but the value of
-         %esp on the stack is ignored and instead incremented (in one
-         hit at the end) for each of the values. */
-      t1 = newTemp(cb); t2 = newTemp(cb); t3 = newTemp(cb);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
-      uInstr2(cb, MOV,    4, TempReg, t2,    TempReg, t3);
-
-      /* Do %edi, %esi, %ebp */
-      for (reg = 7; reg >= 5; reg--) {
-          uInstr2(cb, LOAD,  sz, TempReg, t2, TempReg, t1);
-          uInstr2(cb, ADD,    4, Literal, 0,  TempReg, t2);
-          uLiteral(cb, sz);
-          uInstr2(cb, PUT,   sz, TempReg, t1, ArchReg, reg);
-      }
-      /* Ignore (skip) value of %esp on stack. */
-      uInstr2(cb, ADD,    4, Literal, 0,  TempReg, t2);
-      uLiteral(cb, sz);
-      /* Do %ebx, %edx, %ecx, %eax */
-      for (reg = 3; reg >= 0; reg--) {
-          uInstr2(cb, LOAD,  sz, TempReg, t2, TempReg, t1);
-          uInstr2(cb, ADD,    4, Literal, 0,  TempReg, t2);
-          uLiteral(cb, sz);
-          uInstr2(cb, PUT,   sz, TempReg, t1, ArchReg, reg);
-      }
-      uInstr2(cb, ADD,    4, Literal, 0,  TempReg, t3);
-      uLiteral(cb, sz * 8);             /* One 'sz' per register */
-      uInstr2(cb, PUT,    4, TempReg, t3, ArchReg, R_ESP);
-      if (dis)
-         VG_(printf)("popa%c\n", nameISize(sz));
-      break;
-    }
-
-   case 0x8F: /* POPL/POPW m32 */
-     { UInt pair1;
-       Int  tmpa;
-       UChar rm = getUChar(eip);
-
-       /* make sure this instruction is correct POP */
-       vg_assert(!epartIsReg(rm) && (gregOfRM(rm) == 0));
-       /* and has correct size */
-       vg_assert(sz == 4);      
-       
-       t1 = newTemp(cb); t3 = newTemp(cb);
-       /* set t1 to ESP: t1 = ESP */
-       uInstr2(cb, GET,  4, ArchReg, R_ESP,    TempReg, t1);
-       /* load M[ESP] to virtual register t3: t3 = M[t1] */
-       uInstr2(cb, LOAD, 4, TempReg, t1, TempReg, t3);
-       /* resolve MODR/M */
-       pair1 = disAMode ( cb, eip, dis?dis_buf:NULL);              
-       
-       tmpa = LOW24(pair1);
-       /*  uInstr2(cb, LOAD, sz, TempReg, tmpa, TempReg, tmpa); */
-       /* store value from stack in memory, M[m32] = t3 */       
-       uInstr2(cb, STORE, 4, TempReg, t3, TempReg, tmpa);
-
-       /* increase ESP */
-       uInstr2(cb, ADD,    4, Literal, 0,        TempReg, t1);
-       uLiteral(cb, sz);
-       uInstr2(cb, PUT,    4, TempReg, t1,       ArchReg, R_ESP);
-
-       if (dis) 
-          VG_(printf)("popl %s\n", dis_buf);
-
-       eip += HI8(pair1);
-       break;
-     }
-
-   /* ------------------------ PUSH ----------------------- */
-
-   case 0x50: /* PUSH eAX */
-   case 0x51: /* PUSH eCX */
-   case 0x52: /* PUSH eDX */
-   case 0x53: /* PUSH eBX */
-   case 0x55: /* PUSH eBP */
-   case 0x56: /* PUSH eSI */
-   case 0x57: /* PUSH eDI */
-    { Int   n_pushes;
-      Addr  eipS, eipE;
-      UChar ch;
-      if (sz != 4)           goto normal_push_case;
-      if (VG_(clo_cachesim)) goto normal_push_case;
-      /* eip points at first push insn + 1.  Make eipS and eipE
-         bracket the sequence. */
-      eipE = eipS = eip - 1;
-      while (True) { 
-         ch = getUChar(eipE+1);
-         if (ch < 0x50 || ch > 0x57 || ch == 0x54) break;
-         eipE++;
-      }
-      n_pushes = eipE - eipS + 1;
-      if (0 && n_pushes > 1) VG_(printf)("%d pushes\n", n_pushes);
-      t1 = newTemp(cb); t2 = newTemp(cb); t3 = newTemp(cb);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP,    TempReg, t1);
-      uInstr2(cb, MOV,    4, TempReg, t1,       TempReg, t2);
-      uInstr2(cb, SUB,    4, Literal, 0,        TempReg, t2);
-      uLiteral(cb, 4 * n_pushes);
-      uInstr2(cb, PUT,    4, TempReg, t2,       ArchReg, R_ESP);
-      for (; eipS <= eipE; eipS++) {
-         ch = getUChar(eipS);
-         uInstr2(cb, SUB,    4, Literal, 0,        TempReg, t1);
-         uLiteral(cb, 4);
-         uInstr2(cb, GET, 4, ArchReg, ch-0x50, TempReg, t3);
-	 uInstr2(cb, STORE, 4, TempReg, t3, TempReg, t1);
-         SMC_IF_ALL(cb);
-         if (dis) 
-            VG_(printf)("pushl %s\n", nameIReg(4,ch-0x50));
-      }
-      eip = eipE + 1;
-      break;
-    }
-
-   case 0x54: /* PUSH eSP */
-   normal_push_case:
-      /* This is the Right Way, in that the value to be pushed is
-         established before %esp is changed, so that pushl %esp
-         correctly pushes the old value. */
-      t1 = newTemp(cb); t2 = newTemp(cb); t3 = newTemp(cb);
-      uInstr2(cb, GET,   sz, ArchReg, opc-0x50, TempReg, t1);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP,    TempReg, t3);
-      uInstr2(cb, MOV,    4, TempReg, t3,       TempReg, t2);
-      uInstr2(cb, SUB,    4, Literal, 0,        TempReg, t2);
-      uLiteral(cb, sz);
-      uInstr2(cb, PUT,    4, TempReg, t2,       ArchReg, R_ESP);
-      uInstr2(cb, STORE, sz, TempReg, t1,       TempReg, t2);
-      SMC_IF_ALL(cb);
-      if (dis) 
-         VG_(printf)("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
-      break;
-
-   case 0x68: /* PUSH Iv */
-      d32 = getUDisp(sz,eip); eip += sz;
-      goto do_push_I;
-   case 0x6A: /* PUSH Ib, sign-extended to sz */
-      d32 = getSDisp8(eip); eip += 1;
-      goto do_push_I;
-   do_push_I:
-      t1 = newTemp(cb); t2 = newTemp(cb);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t1);
-      uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t1);
-      uLiteral(cb, sz);
-      uInstr2(cb, PUT,    4, TempReg, t1,    ArchReg, R_ESP);
-      uInstr2(cb, MOV,   sz, Literal, 0,     TempReg, t2);
-      uLiteral(cb, d32);
-      uInstr2(cb, STORE, sz, TempReg, t2,    TempReg, t1);
-      SMC_IF_ALL(cb);
-      if (dis) 
-         VG_(printf)("push%c $0x%x\n", nameISize(sz), d32);
-      break;
-
-   case 0x9C: /* PUSHF */
-      vg_assert(sz == 2 || sz == 4);
-      t1 = newTemp(cb); t2 = newTemp(cb); t3 = newTemp(cb);
-      uInstr1(cb, GETF,  sz, TempReg, t1);
-      /* GETF reads all the flags we are interested in */
-      uFlagsRWU(cb, FlagsALL, FlagsEmpty, FlagsEmpty);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP,    TempReg, t3);
-      uInstr2(cb, MOV,    4, TempReg, t3,       TempReg, t2);
-      uInstr2(cb, SUB,    4, Literal, 0,        TempReg, t2);
-      uLiteral(cb, sz);
-      uInstr2(cb, PUT,    4, TempReg, t2,       ArchReg, R_ESP);
-      uInstr2(cb, STORE, sz, TempReg, t1,       TempReg, t2);
-      SMC_IF_ALL(cb);
-      if (dis) 
-         VG_(printf)("pushf%c\n", nameISize(sz));
-      break;
-
-   case 0x60: /* PUSHA */
-    { Int reg;
-      /* Just to keep things sane, we assert for a size 4.  It's
-         probably OK for size 2 as well, but I'd like to find a test
-         case; ie, have the assertion fail, before committing to it.
-         If it fails for you, uncomment the sz == 2 bit, try again,
-         and let me know whether or not it works.  (jseward@acm.org).  */
-      vg_assert(sz == 4 /* || sz == 2 */);
-
-      /* This is the Right Way, in that the value to be pushed is
-         established before %esp is changed, so that pusha
-         correctly pushes the old %esp value.  New value of %esp is
-         pushed at start. */
-      t1 = newTemp(cb); t2 = newTemp(cb); t3 = newTemp(cb);
-      t4 = newTemp(cb);
-      uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t3);
-      uInstr2(cb, MOV,    4, TempReg, t3,    TempReg, t2);
-      uInstr2(cb, MOV,    4, TempReg, t3,    TempReg, t4);
-      uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t4);
-      uLiteral(cb, sz * 8);             /* One 'sz' per register. */
-      uInstr2(cb, PUT,    4, TempReg, t4,  ArchReg, R_ESP);
-      /* Do %eax, %ecx, %edx, %ebx */
-      for (reg = 0; reg <= 3; reg++) {
-         uInstr2(cb, GET,   sz, ArchReg, reg, TempReg, t1);
-         uInstr2(cb, SUB,    4, Literal,   0, TempReg, t2);
-         uLiteral(cb, sz);
-         uInstr2(cb, STORE, sz, TempReg,  t1, TempReg, t2);
-         SMC_IF_ALL(cb);
-      }
-      /* Push old value of %esp */
-      uInstr2(cb, SUB,    4, Literal,   0, TempReg, t2);
-      uLiteral(cb, sz);
-      uInstr2(cb, STORE, sz, TempReg,  t3, TempReg, t2);
-      SMC_IF_ALL(cb);
-      /* Do %ebp, %esi, %edi */
-      for (reg = 5; reg <= 7; reg++) {
-         uInstr2(cb, GET,   sz, ArchReg, reg, TempReg, t1);
-         uInstr2(cb, SUB,    4, Literal,   0, TempReg, t2);
-         uLiteral(cb, sz);
-         uInstr2(cb, STORE, sz, TempReg,  t1, TempReg, t2);
-         SMC_IF_ALL(cb);
-      }
-      if (dis)
-         VG_(printf)("pusha%c\n", nameISize(sz));
-      break;
-    }
-
-   /* ------------------------ SCAS et al ----------------- */
-
-   case 0xA4: /* MOVSb, no REP prefix */
-      codegen_MOVS ( cb, 1 );
-      if (dis) VG_(printf)("movsb\n");
-      break;
-   case 0xA5: /* MOVSv, no REP prefix */
-      codegen_MOVS ( cb, sz );
-      if (dis) VG_(printf)("movs%c\n", nameISize(sz));
-      break;
-
-   case 0xA6: /* CMPSb, no REP prefix */
-      codegen_CMPS ( cb, 1 );
-      if (dis) VG_(printf)("cmpsb\n");
-      break;
-
-   case 0xAA: /* STOSb, no REP prefix */
-      codegen_STOS ( cb, 1 );
-      if (dis) VG_(printf)("stosb\n");
-      break;
-   case 0xAB: /* STOSv, no REP prefix */
-      codegen_STOS ( cb, sz );
-      if (dis) VG_(printf)("stos%c\n", nameISize(sz));
-      break;
-
-   case 0xAC: /* LODSb, no REP prefix */
-      codegen_LODS ( cb, 1 );
-      if (dis) VG_(printf)("lodsb\n");
-      break;
-   case 0xAD: /* LODSv, no REP prefix */
-      codegen_LODS ( cb, sz );
-      if (dis) VG_(printf)("lods%c\n", nameISize(sz));
-      break;
-
-   case 0xAE: /* SCASb, no REP prefix */
-      codegen_SCAS ( cb, 1 );
-      if (dis) VG_(printf)("scasb\n");
-      break;
-
-   case 0xFC: /* CLD */
-      uInstr0(cb, CALLM_S, 0);
-      uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_CLD));
-      uFlagsRWU(cb, FlagsEmpty, FlagD, FlagsEmpty);
-      uInstr0(cb, CALLM_E, 0);
-      if (dis) VG_(printf)("cld\n");
-      break;
-
-   case 0xFD: /* STD */
-      uInstr0(cb, CALLM_S, 0);
-      uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_STD));
-      uFlagsRWU(cb, FlagsEmpty, FlagD, FlagsEmpty);
-      uInstr0(cb, CALLM_E, 0);
-      if (dis) VG_(printf)("std\n");
-      break;
-
-   case 0xF8: /* CLC */
-      uInstr0(cb, CALLM_S, 0);
-      uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_CLC));
-      uFlagsRWU(cb, FlagsEmpty, FlagC, FlagsOSZAP);
-      uInstr0(cb, CALLM_E, 0);
-      if (dis) VG_(printf)("clc\n");
-      break;
-
-   case 0xF9: /* STC */
-      /* Implemented as a call to helper_STC, bracketed by
-         CALLM_S/CALLM_E, with FlagC recorded as written. */
-      uInstr0(cb, CALLM_S, 0);
-      uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_STC));
-      /* NOTE(review): FlagC is in the written set and also inside the
-         third argument (FlagsOSZCP), whereas the CLC case passes
-         FlagsOSZAP there.  Possibly a typo for FlagsOSZAP -- confirm. */
-      uFlagsRWU(cb, FlagsEmpty, FlagC, FlagsOSZCP);
-      uInstr0(cb, CALLM_E, 0);
-      if (dis) VG_(printf)("stc\n");
-      break;
-
-   case 0xF2: { /* REPNE prefix insn */
-      /* eip_orig is the address of the prefix byte itself; it is
-         passed to the codegen routine together with the address
-         just past the whole insn. */
-      Addr eip_orig = eip - 1;
-      abyte = getUChar(eip); eip++;
-      /* An operand-size override may sit between the prefix and the
-         opcode; if so, switch to 16-bit and fetch the real opcode. */
-      if (abyte == 0x66) { sz = 2; abyte = getUChar(eip); eip++; }
-
-      /* Both sides of the || must compare against abyte.  A bare
-         "|| 0xAF" is always true, which would translate every
-         REPNE-prefixed opcode as SCAS and make the panic below
-         unreachable. */
-      if (abyte == 0xAE || abyte == 0xAF) { /* REPNE SCAS<sz> */
-         if (abyte == 0xAE) sz = 1;
-         codegen_REPNE_SCAS ( cb, sz, eip_orig, eip );
-         *isEnd = True;
-         if (dis) VG_(printf)("repne scas%c\n", nameISize(sz));
-      }
-      else {
-         VG_(printf)("REPNE then 0x%x\n", (UInt)abyte);
-         VG_(panic)("Unhandled REPNE case");
-      }
-      break;
-   }
-
-   case 0xF3: { /* REPE prefix insn */
-      Addr eip_orig = eip - 1;
-      abyte = getUChar(eip); eip++;
-      if (abyte == 0x66) { sz = 2; abyte = getUChar(eip); eip++; }
-
-      if (abyte == 0xA4 || abyte == 0xA5) { /* REPE MOV<sz> */
-         if (abyte == 0xA4) sz = 1;
-         codegen_REPE_MOVS ( cb, sz, eip_orig, eip );
-         *isEnd = True;
-         if (dis) VG_(printf)("repe mov%c\n", nameISize(sz));
-      }
-      else 
-      if (abyte == 0xA6 || abyte == 0xA7) { /* REPE CMP<sz> */
-         if (abyte == 0xA6) sz = 1;
-         codegen_REPE_CMPS ( cb, sz, eip_orig, eip );
-         *isEnd = True;
-         if (dis) VG_(printf)("repe cmps%c\n", nameISize(sz));
-      } 
-      else
-      if (abyte == 0xAA || abyte == 0xAB) { /* REPE STOS<sz> */
-         if (abyte == 0xAA) sz = 1;
-         codegen_REPE_STOS ( cb, sz, eip_orig, eip );
-         *isEnd = True;
-         if (dis) VG_(printf)("repe stos%c\n", nameISize(sz));
-      }
-      else
-      if (abyte == 0x90) { /* REPE NOP (PAUSE) */
-         if (dis) VG_(printf)("repe nop (P4 pause)\n");
-         /* do nothing; apparently a hint to the P4 re spin-wait loop */
-      } else {
-         VG_(printf)("REPE then 0x%x\n", (UInt)abyte);
-         VG_(panic)("Unhandled REPE case");
-      }
-      break;
-   }
-
-   /* ------------------------ XCHG ----------------------- */
-
-   case 0x86: /* XCHG Gb,Eb */
-      sz = 1;
-      /* Fall through ... */
-   case 0x87: /* XCHG Gv,Ev */
-      modrm = getUChar(eip);
-      t1 = newTemp(cb); t2 = newTemp(cb);
-      if (epartIsReg(modrm)) {
-         uInstr2(cb, GET, sz, ArchReg, eregOfRM(modrm), TempReg, t1);
-         uInstr2(cb, GET, sz, ArchReg, gregOfRM(modrm), TempReg, t2);
-         uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, gregOfRM(modrm));
-         uInstr2(cb, PUT, sz, TempReg, t2, ArchReg, eregOfRM(modrm));
-         eip++;
-         if (dis)
-            VG_(printf)("xchg%c %s, %s\n", nameISize(sz), 
-                        nameIReg(sz,gregOfRM(modrm)), 
-                        nameIReg(sz,eregOfRM(modrm)));
-      } else {
-         pair = disAMode ( cb, eip, dis?dis_buf:NULL);
-         t3   = LOW24(pair);
-         uInstr2(cb, LOAD, sz, TempReg, t3, TempReg, t1);
-         uInstr2(cb, GET, sz, ArchReg, gregOfRM(modrm), TempReg, t2);
-         uInstr2(cb, STORE, sz, TempReg, t2, TempReg, t3);
-         SMC_IF_SOME(cb);
-         uInstr2(cb, PUT, sz, TempReg, t1, ArchReg, gregOfRM(modrm));
-         eip += HI8(pair);
-         if (dis)
-            VG_(printf)("xchg%c %s, %s\n", nameISize(sz), 
-                        nameIReg(sz,gregOfRM(modrm)), 
-                        dis_buf);
-      }
-      break;
-
-   case 0x90: /* XCHG eAX,eAX */
-      if (dis) VG_(printf)("nop\n");
-      break;
-   case 0x91: /* XCHG eCX,eSI */
-   case 0x96: /* XCHG eAX,eSI */
-   case 0x97: /* XCHG eAX,eDI */
-      codegen_xchg_eAX_Reg ( cb, sz, opc - 0x90 );
-      break;
-
-   /* ------------------------ (Grp1 extensions) ---------- */
-
-   case 0x80: /* Grp1 Ib,Eb */
-      modrm = getUChar(eip);
-      am_sz = lengthAMode(eip);
-      sz    = 1;
-      d_sz  = 1;
-      d32   = getSDisp8(eip + am_sz);
-      eip   = dis_Grp1 ( cb, eip, modrm, am_sz, d_sz, sz, d32 );
-      break;
-
-   case 0x81: /* Grp1 Iv,Ev */
-      modrm = getUChar(eip);
-      am_sz = lengthAMode(eip);
-      d_sz  = sz;
-      d32   = getUDisp(d_sz, eip + am_sz);
-      eip   = dis_Grp1 ( cb, eip, modrm, am_sz, d_sz, sz, d32 );
-      break;
-
-   case 0x83: /* Grp1 Ib,Ev */
-      modrm = getUChar(eip);
-      am_sz = lengthAMode(eip);
-      d_sz  = 1;
-      d32   = getSDisp8(eip + am_sz);
-      eip   = dis_Grp1 ( cb, eip, modrm, am_sz, d_sz, sz, d32 );
-      break;
-
-   /* ------------------------ (Grp2 extensions) ---------- */
-
-   case 0xC0: /* Grp2 Ib,Eb */
-      modrm = getUChar(eip);
-      am_sz = lengthAMode(eip);
-      d_sz  = 1;
-      d32   = getSDisp8(eip + am_sz);
-      sz    = 1;
-      eip   = dis_Grp2 ( cb, eip, modrm, am_sz, d_sz, sz, Literal, d32 );
-      break;
-
-   case 0xC1: /* Grp2 Ib,Ev */
-      modrm = getUChar(eip);
-      am_sz = lengthAMode(eip);
-      d_sz  = 1;
-      d32   = getSDisp8(eip + am_sz);
-      eip   = dis_Grp2 ( cb, eip, modrm, am_sz, d_sz, sz, Literal, d32 );
-      break;
-
-   case 0xD0: /* Grp2 1,Eb */
-      modrm = getUChar(eip);
-      am_sz = lengthAMode(eip);
-      d_sz  = 0;
-      d32   = 1;
-      sz    = 1;
-      eip   = dis_Grp2 ( cb, eip, modrm, am_sz, d_sz, sz, Literal, d32 );
-      break;
-
-   case 0xD1: /* Grp2 1,Ev */
-      modrm = getUChar(eip);
-      am_sz = lengthAMode(eip);
-      d_sz  = 0;
-      d32   = 1;
-      eip   = dis_Grp2 ( cb, eip, modrm, am_sz, d_sz, sz, Literal, d32 );
-      break;
-
-   case 0xD3: /* Grp2 CL,Ev */
-      modrm = getUChar(eip);
-      am_sz = lengthAMode(eip);
-      d_sz  = 0;
-      eip   = dis_Grp2 ( cb, eip, modrm, am_sz, d_sz, sz, ArchReg, R_ECX );
-      break;
-
-   /* ------------------------ (Grp3 extensions) ---------- */
-
-   case 0xF6: /* Grp3 Eb */
-      eip = dis_Grp3 ( cb, 1, eip );
-      break;
-   case 0xF7: /* Grp3 Ev */
-      eip = dis_Grp3 ( cb, sz, eip );
-      break;
-
-   /* ------------------------ (Grp4 extensions) ---------- */
-
-   case 0xFE: /* Grp4 Eb */
-      eip = dis_Grp4 ( cb, eip );
-      break;
-
-   /* ------------------------ (Grp5 extensions) ---------- */
-
-   case 0xFF: /* Grp5 Ev */
-      eip = dis_Grp5 ( cb, sz, eip, isEnd );
-      break;
-
-   /* ------------------------ Escapes to 2-byte opcodes -- */
-
-   case 0x0F: {
-      opc = getUChar(eip); eip++;
-      switch (opc) {
-
-      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
-
-      case 0xBA: /* Grp8 Ib,Ev */
-         modrm = getUChar(eip);
-         am_sz = lengthAMode(eip);
-         d32   = getSDisp8(eip + am_sz);
-         eip = dis_Grp8_BT ( cb, eip, modrm, am_sz, sz, d32 );
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
-
-      case 0xBC: /* BSF Gv,Ev */
-         eip = dis_bs_E_G ( cb, sz, eip, True );
-         break;
-      case 0xBD: /* BSR Gv,Ev */
-         eip = dis_bs_E_G ( cb, sz, eip, False );
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
-
-      case 0xC8: /* BSWAP %eax */
-      case 0xC9:
-      case 0xCA:
-      case 0xCB:
-      case 0xCC:
-      case 0xCD:
-      case 0xCE:
-      case 0xCF: /* BSWAP %edi */
-         /* AFAICS from the Intel docs, this only exists at size 4. */
-         vg_assert(sz == 4);
-         t1 = newTemp(cb);
-         uInstr2(cb, GET,   4, ArchReg, opc-0xC8, TempReg, t1);
-	 uInstr1(cb, BSWAP, 4, TempReg, t1);
-         uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, opc-0xC8);
-         if (dis) VG_(printf)("bswapl %s\n", nameIReg(4, opc-0xC8));
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
-
-      case 0xA3: /* BT Gv,Ev */
-         eip = dis_bt_G_E ( cb, sz, eip, BtOpNone );
-         break;
-      case 0xB3: /* BTR Gv,Ev */
-         eip = dis_bt_G_E ( cb, sz, eip, BtOpReset );
-         break;
-      case 0xAB: /* BTS Gv,Ev */
-         eip = dis_bt_G_E ( cb, sz, eip, BtOpSet );
-         break;
-      case 0xBB: /* BTC Gv,Ev */
-         eip = dis_bt_G_E ( cb, sz, eip, BtOpComp );
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
-
-      case 0x40:
-      case 0x41:
-      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
-      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
-      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
-      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
-      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
-      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
-      case 0x48: /* CMOVSb (cmov negative) */
-      case 0x49: /* CMOVSb (cmov not negative) */
-      case 0x4A: /* CMOVP (cmov parity even) */
-      case 0x4B: /* CMOVNP (cmov parity odd) */
-      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
-      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
-      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
-      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
-         eip = dis_cmov_E_G(cb, sz, (Condcode)(opc - 0x40), eip);
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
-
-      case 0xB1: /* CMPXCHG Gv,Ev */
-         eip = dis_cmpxchg_G_E ( cb, sz, eip );
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
-
-      case 0xA2: /* CPUID */
-         t1 = newTemp(cb);
-         t2 = newTemp(cb);
-         t3 = newTemp(cb);
-         t4 = newTemp(cb);
-         uInstr0(cb, CALLM_S, 0);
-
-         uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
-         uInstr1(cb, PUSH,  4, TempReg, t1);
-
-         uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
-         uLiteral(cb, 0);
-         uInstr1(cb, PUSH,  4, TempReg, t2);
-
-         uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
-         uLiteral(cb, 0);
-         uInstr1(cb, PUSH,  4, TempReg, t3);
-
-         uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
-         uLiteral(cb, 0);
-         uInstr1(cb, PUSH,  4, TempReg, t4);
-
-         uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
-         uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
-
-         uInstr1(cb, POP,   4, TempReg, t4);
-         uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
-
-         uInstr1(cb, POP,   4, TempReg, t3);
-         uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
-
-         uInstr1(cb, POP,   4, TempReg, t2);
-         uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
-
-         uInstr1(cb, POP,   4, TempReg, t1);
-         uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
-
-         uInstr0(cb, CALLM_E, 0);
-         if (dis) VG_(printf)("cpuid\n");
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
-
-      case 0xB6: /* MOVZXb Eb,Gv */
-         eip = dis_movx_E_G ( cb, eip, 1, 4, False );
-         break;
-      case 0xB7: /* MOVZXw Ew,Gv */
-         eip = dis_movx_E_G ( cb, eip, 2, 4, False );
-         break;
-
-      case 0xBE: /* MOVSXb Eb,Gv */
-         eip = dis_movx_E_G ( cb, eip, 1, 4, True );
-         break;
-      case 0xBF: /* MOVSXw Ew,Gv */
-         eip = dis_movx_E_G ( cb, eip, 2, 4, True );
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
-
-      case 0xAF: /* IMUL Ev, Gv */
-         eip = dis_mul_E_G ( cb, sz, eip, True );
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
-      case 0x80:
-      case 0x81:
-      case 0x82: /* JBb/JNAEb (jump below) */
-      case 0x83: /* JNBb/JAEb (jump not below) */
-      case 0x84: /* JZb/JEb (jump zero) */
-      case 0x85: /* JNZb/JNEb (jump not zero) */
-      case 0x86: /* JBEb/JNAb (jump below or equal) */
-      case 0x87: /* JNBEb/JAb (jump not below or equal) */
-      case 0x88: /* JSb (jump negative) */
-      case 0x89: /* JSb (jump not negative) */
-      case 0x8A: /* JP (jump parity even) */
-      case 0x8B: /* JNP/JPO (jump parity odd) */
-      case 0x8C: /* JLb/JNGEb (jump less) */
-      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
-      case 0x8E: /* JLEb/JNGb (jump less or equal) */
-      case 0x8F: /* JGb/JNLEb (jump greater) */
-         d32 = (eip+4) + getUDisp32(eip); eip += 4;
-         uInstr1(cb, JMP, 0, Literal, 0);
-	 uLiteral(cb, d32);
-         uCond(cb, (Condcode)(opc - 0x80));
-         uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-         uInstr1(cb, JMP, 0, Literal, 0);
-	 uLiteral(cb, eip);
-         uCond(cb, CondAlways);
-         *isEnd = True;
-         if (dis)
-            VG_(printf)("j%s-32 0x%x\n", 
-                        VG_(nameCondcode)(opc - 0x80), d32);
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
-
-      case 0x31: /* RDTSC */
-         t1 = newTemp(cb);
-         t2 = newTemp(cb);
-         t3 = newTemp(cb);
-         uInstr0(cb, CALLM_S, 0);
-         uInstr2(cb, MOV,   4, Literal, 0, TempReg, t1);
-         uLiteral(cb, 0);
-         uInstr1(cb, PUSH,  4, TempReg, t1);
-         uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
-         uLiteral(cb, 0);
-         uInstr1(cb, PUSH,  4, TempReg, t2);
-         uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_RDTSC));
-         uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
-         uInstr1(cb, POP,   4, TempReg, t3);
-         uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_EDX);
-         uInstr1(cb, POP,   4, TempReg, t3);
-         uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_EAX);
-         uInstr0(cb, CALLM_E, 0);
-         if (dis) VG_(printf)("rdtsc\n");
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
-      case 0x90:
-      case 0x91:
-      case 0x92: /* set-Bb/set-NAEb (jump below) */
-      case 0x93: /* set-NBb/set-AEb (jump not below) */
-      case 0x94: /* set-Zb/set-Eb (jump zero) */
-      case 0x95: /* set-NZb/set-NEb (jump not zero) */
-      case 0x96: /* set-BEb/set-NAb (jump below or equal) */
-      case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
-      case 0x98: /* set-Sb (jump negative) */
-      case 0x99: /* set-Sb (jump not negative) */
-      case 0x9A: /* set-P (jump parity even) */
-      case 0x9B: /* set-NP (jump parity odd) */
-      case 0x9C: /* set-Lb/set-NGEb (jump less) */
-      case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
-      case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
-      case 0x9F: /* set-Gb/set-NLEb (jump greater) */
-         modrm = getUChar(eip);
-         t1 = newTemp(cb);
-         if (epartIsReg(modrm)) {
-            eip++;
-            uInstr1(cb, CC2VAL, 1, TempReg, t1);
-            uCond(cb, (Condcode)(opc-0x90));
-            uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-            uInstr2(cb, PUT, 1, TempReg, t1, ArchReg, eregOfRM(modrm));
-            if (dis) VG_(printf)("set%s %s\n", 
-                                 VG_(nameCondcode)(opc-0x90), 
-                                 nameIReg(1,eregOfRM(modrm)));
-         } else {
-            pair = disAMode ( cb, eip, dis?dis_buf:NULL );
-            t2 = LOW24(pair);
-            eip += HI8(pair);
-            uInstr1(cb, CC2VAL, 1, TempReg, t1);
-            uCond(cb, (Condcode)(opc-0x90));
-            uFlagsRWU(cb, FlagsOSZACP, FlagsEmpty, FlagsEmpty);
-            uInstr2(cb, STORE, 1, TempReg, t1, TempReg, t2);
-            SMC_IF_ALL(cb);
-            if (dis) VG_(printf)("set%s %s\n", 
-                                 VG_(nameCondcode)(opc-0x90), 
-                                 dis_buf);
-         }
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
-
-      case 0xA4: /* SHLDv imm8,Gv,Ev */
-         modrm = getUChar(eip);
-         eip = dis_SHLRD_Gv_Ev ( 
-                  cb, eip, modrm, sz, 
-                  Literal, getUChar(eip + lengthAMode(eip)),
-                  True );
-         break;
-      case 0xA5: /* SHLDv %cl,Gv,Ev */
-         modrm = getUChar(eip);
-         eip = dis_SHLRD_Gv_Ev ( 
-                  cb, eip, modrm, sz, ArchReg, R_CL, True );
-         break;
-
-      case 0xAC: /* SHRDv imm8,Gv,Ev */
-         modrm = getUChar(eip);
-         eip = dis_SHLRD_Gv_Ev ( 
-                  cb, eip, modrm, sz, 
-                  Literal, getUChar(eip + lengthAMode(eip)),
-                  False );
-         break;
-      case 0xAD: /* SHRDv %cl,Gv,Ev */
-         modrm = getUChar(eip);
-         eip = dis_SHLRD_Gv_Ev ( 
-                  cb, eip, modrm, sz, ArchReg, R_CL, False );
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
-
-      case 0xC1: /* XADD Gv,Ev */
-         eip = dis_xadd_G_E ( cb, sz, eip );
-         break;
-
-      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
-
-      default:
-         VG_(printf)("disInstr: unhandled 2-byte opcode 0x%x\n", 
-                     (UInt)opc);
-	 VG_(printf)("This _might_ be the result of executing an "
-                     "MMX, SSE, SSE2 or 3DNow!\n" );
-	 VG_(printf)("instruction.  Valgrind does not currently "
-                     "support such instructions.  Sorry.\n" );
-         VG_(unimplemented)("unhandled x86 0x0F 2-byte opcode");
-      }
-
-      break;
-   }
-
-   /* ------------------------ ??? ------------------------ */
-
-   default:
-      VG_(printf)("disInstr: unhandled opcode 0x%x then 0x%x\n", 
-                  (UInt)opc, (UInt)getUChar(eip));
-      if (opc == 0x8C)
-         VG_(nvidia_moan)();
-      VG_(panic)("unhandled x86 opcode");
-   }
-
-   if (dis)
-      VG_(printf)("\n");
-   for (; first_uinstr < cb->used; first_uinstr++) {
-      Bool sane = VG_(saneUInstr)(True, &cb->instrs[first_uinstr]);
-      if (dis || !sane) 
-         VG_(ppUInstr)(sane ? first_uinstr : -1,
-                       &cb->instrs[first_uinstr]);
-      vg_assert(sane);
-   }
-
-   return eip;
-}
-
-
-/* Private helper for VG_(disBB): append an unconditional JMP to the
-   literal address `target' at the end of cb, and echo the new uinstr
-   when disassembly printing is enabled. */
-static void disBB_emit_jmp_always ( UCodeBlock* cb, Addr target )
-{
-   uInstr1(cb, JMP, 0, Literal, 0);
-   uLiteral(cb, target);
-   uCond(cb, CondAlways);
-   if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
-}
-
-
-/* Translate the basic block of x86 code beginning at eip0 into ucode,
-   appending the result to cb.  The value returned is the number of
-   x86 code bytes that were consumed.
-
-   In single-step mode exactly one x86 instruction is translated and
-   the block is closed with a JMP to the following instruction if it
-   does not already end in one.  Otherwise instructions are translated
-   until disInstr reports the end of the block, or until the block
-   has grown past 2000 bytes, at which point it is cut short with a
-   JMP to the next instruction. */
-
-Int VG_(disBB) ( UCodeBlock* cb, Addr eip0 )
-{
-   Addr eip   = eip0;
-   Bool isEnd = False;
-   Bool block_sane;
-   /* Bytes of x86 code translated since %EIP was last brought up to
-      date by an INCEIP. */
-   Int  delta = 0;
-
-   /* How far the simulated %EIP may trail behind before an INCEIP is
-      emitted.
-
-      Under cache simulation misses have to be charged to the right
-      line, so %EIP must always be exact.  Two mechanisms give that:
-
-      a) A lag of zero, so that an INCEIP follows every instruction
-         except the one ending the basic block.
-
-      b) For the block-ending instruction, the size of the original
-         x86 instr is patched into the `extra4b' field of a JMP:
-            - Jcond followed by Juncond:  the Jcond is patched
-            - Juncond alone:              the Juncond is patched
-
-      See vg_cachesim_instrument() for how this is used. */
-   Int  max_lag = VG_(clo_cachesim) ? 0 : 4;
-
-   if (dis) VG_(printf)("\n");
-
-   if (!VG_(clo_single_step)) {
-
-      while (!isEnd) {
-         Addr next_eip = disInstr ( cb, eip, &isEnd );
-         delta += (next_eip - eip);
-         eip = next_eip;
-
-         if (!isEnd) {
-            if (eip - eip0 > 2000) {
-               /* Split up giant basic blocks into pieces, so the
-                  translations fall within 64k. */
-               if (VG_(clo_verbosity) > 2)
-                  VG_(message)(Vg_DebugMsg,
-                     "Warning: splitting giant basic block into pieces");
-               disBB_emit_jmp_always(cb, eip);
-               isEnd = True;
-            } else if (delta > max_lag) {
-               /* %EIP has fallen too far behind; catch it up. */
-               uInstr1(cb, INCEIP, 0, Lit16, delta);
-               if (dis) VG_(ppUInstr)(cb->used-1, &cb->instrs[cb->used-1]);
-               delta = 0;
-            }
-         }
-         if (dis) VG_(printf)("\n");
-      }
-
-   } else {
-
-      eip = disInstr ( cb, eip, &isEnd );
-
-      /* Close the one-instruction block with a JMP to the next x86
-         instruction unless it already finishes with a JMP.  An empty
-         block (the x86 instr was a nop) needs the JMP too. */
-      if (cb->used == 0 || LAST_UINSTR(cb).opcode != JMP)
-         disBB_emit_jmp_always(cb, eip);
-
-      delta = eip - eip0;
-   }
-
-   if (VG_(clo_cachesim)) {
-      /* Patch instruction size into earliest JMP: the second-to-last
-         uinstr if that is a JMP, else the last uinstr. */
-      UInstr* patchee
-         = (cb->used >= 2 && JMP == cb->instrs[cb->used - 2].opcode)
-              ? &cb->instrs[cb->used - 2]
-              : &LAST_UINSTR(cb);
-      patchee->extra4b = delta;
-   }
-
-   block_sane = VG_(saneUCodeBlock)(cb);
-   if (!block_sane) {
-      /* Dump the offending block before the assertion fires. */
-      VG_(ppUCodeBlock)(cb, "block failing sanity check");
-      vg_assert(block_sane);
-   }
-
-   return eip - eip0;
-}
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                            vg_to_ucode.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
deleted file mode 100644
index 27a02754cf..0000000000
--- a/coregrind/vg_translate.c
+++ /dev/null
@@ -1,3172 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- The JITter proper: register allocation & code improvement    ---*/
-/*---                                               vg_translate.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-
-/*------------------------------------------------------------*/
-/*--- Renamings of frequently-used global functions.       ---*/
-/*------------------------------------------------------------*/
-
-/* Short local spellings for VG_()-prefixed globals.  Each macro
-   expands to exactly the name on its right, so for example
-   uInstr2(...) is a call to VG_(newUInstr2)(...) and `dis' reads
-   VG_(disassemble). */
-#define uInstr1   VG_(newUInstr1)
-#define uInstr2   VG_(newUInstr2)
-#define uInstr3   VG_(newUInstr3)
-#define dis       VG_(disassemble)
-#define nameIReg  VG_(nameOfIntReg)
-#define nameISize VG_(nameOfIntSize)
-#define uLiteral  VG_(setLiteralField)
-#define newTemp   VG_(getNewTemp)
-#define newShadow VG_(getNewShadow)
-
-
-/*------------------------------------------------------------*/
-/*--- Memory management for the translator.                ---*/
-/*------------------------------------------------------------*/
-
-/* Number of fixed-size scratch blocks kept in jitstorage. */
-#define N_JITBLOCKS    4
-/* Size, in bytes, of each scratch block.  VG_(jitmalloc) sends any
-   request larger than this to VG_(malloc) instead. */
-#define N_JITBLOCK_SZ  5000
-
-/* The scratch blocks themselves: one row per slot. */
-static UChar jitstorage[N_JITBLOCKS][N_JITBLOCK_SZ];
-/* jitstorage_inuse[i] is True while slot i has been handed out by
-   VG_(jitmalloc) and not yet returned through VG_(jitfree). */
-static Bool  jitstorage_inuse[N_JITBLOCKS];
-/* Becomes True once jitstorage_initialise has cleared the in-use
-   flags, so that the clearing is done only once. */
-static Bool  jitstorage_initdone = False;
-
-/* One-time setup of the scratch-block allocator: on the first call
-   every slot is marked free; every later call does nothing. */
-static __inline__ void jitstorage_initialise ( void )
-{
-   if (!jitstorage_initdone) {
-      Int slot = 0;
-      jitstorage_initdone = True;
-      while (slot < N_JITBLOCKS) {
-         jitstorage_inuse[slot] = False;
-         slot++;
-      }
-   }
-}
-
-/* Hand out scratch memory of at least nbytes bytes.
-
-   A request that fits in N_JITBLOCK_SZ bytes is served from the
-   lowest-numbered free slot of jitstorage; anything bigger comes
-   from VG_(malloc) in the VG_AR_PRIVATE arena.  If the request fits
-   but every slot is already taken, this panics.  Memory obtained
-   here is given back with VG_(jitfree). */
-void* VG_(jitmalloc) ( Int nbytes )
-{
-   Int slot = 0;
-
-   jitstorage_initialise();
-
-   /* Too big for a fixed-size slot: use the general allocator. */
-   if (nbytes > N_JITBLOCK_SZ)
-      return VG_(malloc)(VG_AR_PRIVATE, nbytes);
-
-   /* Find the first slot that is not in use. */
-   while (slot < N_JITBLOCKS && jitstorage_inuse[slot])
-      slot++;
-
-   if (slot < N_JITBLOCKS) {
-      jitstorage_inuse[slot] = True;
-      return jitstorage[slot];
-   }
-
-   VG_(panic)("out of slots in vg_jitmalloc\n");
-   return VG_(malloc)(VG_AR_PRIVATE, nbytes);
-}
-
-/* Give back memory obtained from VG_(jitmalloc).
-
-   If ptr is the start of one of the jitstorage slots, that slot is
-   marked free again (it is asserted to have been in use).  Any
-   other pointer is passed to VG_(free) in the VG_AR_PRIVATE arena. */
-void VG_(jitfree) ( void* ptr )
-{
-   Int i = 0;
-
-   jitstorage_initialise();
-
-   /* Look for a slot whose first byte is at ptr. */
-   while (i < N_JITBLOCKS && ptr != & jitstorage[i][0])
-      i++;
-
-   if (i == N_JITBLOCKS) {
-      /* Not one of ours, so it came from the general allocator. */
-      VG_(free)(VG_AR_PRIVATE, ptr);
-      return;
-   }
-
-   vg_assert(jitstorage_inuse[i]);
-   jitstorage_inuse[i] = False;
-}
-
-/*------------------------------------------------------------*/
-/*--- Basics                                               ---*/
-/*------------------------------------------------------------*/
-
-/* Create a new, empty ucode block in the VG_AR_PRIVATE arena: no
-   instruction array yet, zero capacity, zero instructions used, and
-   the temp counter starting at zero. */
-UCodeBlock* VG_(allocCodeBlock) ( void )
-{
-   UCodeBlock* block = VG_(malloc)(VG_AR_PRIVATE, sizeof(UCodeBlock));
-   block->instrs   = NULL;
-   block->size     = 0;
-   block->used     = 0;
-   block->nextTemp = 0;
-   return block;
-}
-
-
-/* Release a ucode block: first its instruction array, if one was
-   ever allocated, then the block header itself. */
-void VG_(freeCodeBlock) ( UCodeBlock* cb )
-{
-   if (cb->instrs != NULL) {
-      VG_(free)(VG_AR_PRIVATE, cb->instrs);
-   }
-   VG_(free)(VG_AR_PRIVATE, cb);
-}
-
-
-/* Guarantee that cb has room for one more uinstr.  A block with no
-   array yet is given space for 8 uinstrs; a full block has its
-   array replaced by one of twice the capacity, with the existing
-   uinstrs copied across. */
-static __inline__
-void ensureUInstr ( UCodeBlock* cb )
-{
-   if (cb->used == cb->size) {
-      if (cb->instrs != NULL) {
-         /* Full: move everything into an array twice as large. */
-         Int     i = 0;
-         UInstr* grown = VG_(malloc)(VG_AR_PRIVATE, 
-                                     2 * sizeof(UInstr) * cb->size);
-         while (i < cb->used) {
-            grown[i] = cb->instrs[i];
-            i++;
-         }
-         VG_(free)(VG_AR_PRIVATE, cb->instrs);
-         cb->instrs = grown;
-         cb->size  *= 2;
-      } else {
-         /* Nothing allocated so far: start with a small array. */
-         vg_assert(cb->size == 0);
-         vg_assert(cb->used == 0);
-         cb->instrs = VG_(malloc)(VG_AR_PRIVATE, 8 * sizeof(UInstr));
-         cb->size   = 8;
-      }
-   }
-
-   vg_assert(cb->used < cb->size);
-}
-
-
-/* Reset every field of *u to its blank value: operand values zero,
-   operand tags NoValue, no flags read or written, a boring jump
-   kind, both boolean markers off, and the remaining scalar fields
-   zero. */
-__inline__ 
-void VG_(emptyUInstr) ( UInstr* u )
-{
-   /* Operands. */
-   u->tag1 = NoValue;   u->val1 = 0;
-   u->tag2 = NoValue;   u->val2 = 0;
-   u->tag3 = NoValue;   u->val3 = 0;
-   u->lit32 = 0;
-
-   /* Condition-code usage. */
-   u->flags_r = FlagsEmpty;
-   u->flags_w = FlagsEmpty;
-
-   /* Everything else. */
-   u->opcode       = 0;
-   u->size         = 0;
-   u->cond         = 0;
-   u->extra4b      = 0;
-   u->jmpkind      = JmpBoring;
-   u->smc_check    = False;
-   u->signed_widen = False;
-}
-
-
-/* Append a three-operand uinstr to cb.  Space is made if necessary,
-   the new entry is blanked with VG_(emptyUInstr), and then the
-   opcode, size and the three (tag, value) operand pairs are filled
-   in.  Nothing is returned; the uinstr just added is the last one in
-   cb.  An operand tagged TempReg must not carry INVALID_TEMPREG. */
-__inline__
-void VG_(newUInstr3) ( UCodeBlock* cb, Opcode opcode, Int sz,
-                       Tag tag1, UInt val1,
-                       Tag tag2, UInt val2,
-                       Tag tag3, UInt val3 )
-{
-   UInstr* slot;
-
-   ensureUInstr(cb);
-   slot = & cb->instrs[cb->used++];
-   VG_(emptyUInstr)(slot);
-
-   slot->opcode = opcode;
-   slot->size   = sz;
-   slot->tag1   = tag1;   slot->val1 = val1;
-   slot->tag2   = tag2;   slot->val2 = val2;
-   slot->tag3   = tag3;   slot->val3 = val3;
-
-   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
-   if (tag2 == TempReg) vg_assert(val2 != INVALID_TEMPREG);
-   if (tag3 == TempReg) vg_assert(val3 != INVALID_TEMPREG);
-}
-
-
-/* Append a two-operand uinstr to cb.  The new entry is blanked and
-   then given the opcode, size and the two (tag, value) operand
-   pairs; the third operand stays at its blank value.  An operand
-   tagged TempReg must not carry INVALID_TEMPREG. */
-__inline__
-void VG_(newUInstr2) ( UCodeBlock* cb, Opcode opcode, Int sz,
-                       Tag tag1, UInt val1,
-                       Tag tag2, UInt val2 )
-{
-   UInstr* slot;
-
-   ensureUInstr(cb);
-   slot = & cb->instrs[cb->used++];
-   VG_(emptyUInstr)(slot);
-
-   slot->opcode = opcode;
-   slot->size   = sz;
-   slot->tag1   = tag1;   slot->val1 = val1;
-   slot->tag2   = tag2;   slot->val2 = val2;
-
-   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
-   if (tag2 == TempReg) vg_assert(val2 != INVALID_TEMPREG);
-}
-
-
-/* Append a one-operand uinstr to cb.  The new entry is blanked and
-   then given the opcode, size and the single (tag, value) operand
-   pair.  An operand tagged TempReg must not carry INVALID_TEMPREG. */
-__inline__
-void VG_(newUInstr1) ( UCodeBlock* cb, Opcode opcode, Int sz,
-                       Tag tag1, UInt val1 )
-{
-   UInstr* slot;
-
-   ensureUInstr(cb);
-   slot = & cb->instrs[cb->used++];
-   VG_(emptyUInstr)(slot);
-
-   slot->opcode = opcode;
-   slot->size   = sz;
-   slot->tag1   = tag1;   slot->val1 = val1;
-
-   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
-}
-
-
-/* Append an operand-less uinstr to cb: the new entry is blanked and
-   then given only the opcode and size. */
-__inline__
-void VG_(newUInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz )
-{
-   UInstr* slot;
-
-   ensureUInstr(cb);
-   slot = & cb->instrs[cb->used++];
-   VG_(emptyUInstr)(slot);
-
-   slot->opcode = opcode;
-   slot->size   = sz;
-}
-
-/* Append a copy of *instr, all fields included, to the end of cb,
-   making room first if the block is full. */
-__inline__ 
-void VG_(copyUInstr) ( UCodeBlock* cb, UInstr* instr )
-{
-   UInstr* slot;
-
-   ensureUInstr(cb);
-   slot = & cb->instrs[cb->used];
-   *slot = *instr;
-   cb->used++;
-}
-
-/* Carry the non-operand attributes of *src over to *dst: the flag
-   read/write sets, condition, jump kind, extra4b, and the smc_check
-   and signed_widen markers.  Opcode, size, tags, values and lit32
-   are left as they are in *dst. */
-static __inline__ 
-void copyAuxInfoFromTo ( UInstr* src, UInstr* dst )
-{
-   dst->flags_r       = src->flags_r;
-   dst->flags_w       = src->flags_w;
-   dst->cond          = src->cond;
-   dst->jmpkind       = src->jmpkind;
-   dst->extra4b       = src->extra4b;
-   dst->smc_check     = src->smc_check;
-   dst->signed_widen  = src->signed_widen;
-}
-
-
-/* Record on *u which condition flags it reads (fr) and which it
-   writes (fw).  Both sets are asserted to contain nothing outside
-   FlagsALL. */
-void VG_(setFlagRW) ( UInstr* u, FlagSet fr, FlagSet fw )
-{
-   vg_assert(fr == (fr & FlagsALL));
-   vg_assert(fw == (fw & FlagsALL));
-
-   u->flags_w = fw;
-   u->flags_r = fr;
-}
-
-
-/* Store lit32 in the lit32 field of the uinstr most recently added
-   to cb (the one LAST_UINSTR selects). */
-void VG_(setLiteralField) ( UCodeBlock* cb, UInt lit32 )
-{
-   UInstr* newest = & LAST_UINSTR(cb);
-   newest->lit32 = lit32;
-}
-
-
-/* Does *u touch the condition flags at all?  True if its read set
-   or its write set is non-empty. */
-Bool VG_(anyFlagUse) ( UInstr* u )
-{
-   if (u->flags_r != FlagsEmpty)
-      return True;
-   return u->flags_w != FlagsEmpty;
-}
-
-
-
-
-/* Convert a rank in the range 0 .. VG_MAX_REALREGS-1 into an Intel
-   register number.  This effectively defines the order in which real
-   registers are allocated.  %ebp is excluded since it is permanently
-   reserved for pointing at VG_(baseBlock).  %edi is a general spare
-   temp used for Left4 and various misc tag ops.
-
-   Only ranks 0 .. 4 are mapped here; any other value panics.
-   (Presumably VG_MAX_REALREGS is 5 -- confirm against its
-   definition.)
-
-   Important!  If you change the set of allocatable registers from
-   %eax, %ebx, %ecx, %edx, %esi you must change the
-   save/restore sequences in various places to match!
-*/
-__inline__ Int VG_(rankToRealRegNo) ( Int rank )
-{
-   switch (rank) {
-      /* Compile-time choice between two orderings of the same five
-         registers; the `#if 1' selects the first one. */
-#     if 1
-      /* Probably the best allocation ordering. */
-      case 0: return R_EAX;
-      case 1: return R_EBX;
-      case 2: return R_ECX;
-      case 3: return R_EDX;
-      case 4: return R_ESI;
-#     else
-      /* Contrary; probably the worst.  Helpful for debugging, tho. */
-      case 4: return R_EAX;
-      case 3: return R_EBX;
-      case 2: return R_ECX;
-      case 1: return R_EDX;
-      case 0: return R_ESI;
-#     endif
-      /* A rank with no register assigned is a caller error. */
-      default: VG_(panic)("rankToRealRegNo");
-   }
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Sanity checking uinstrs.                             ---*/
-/*------------------------------------------------------------*/
-
-/* This seems as good a place as any to record some important stuff
-   about ucode semantics.
-
-   * TempRegs are 32 bits wide.  LOADs of 8/16 bit values into a
-     TempReg are defined to zero-extend the loaded value to 32 bits.
-     This is needed to make the translation of movzbl et al work
-     properly.
-
-   * Similarly, GETs of 8/16 bit ArchRegs are zero-extended.
-
-   * Arithmetic on TempRegs is at the specified size.  For example,
-     SUBW t1, t2 has to result in a real 16 bit x86 subtraction 
-     being emitted -- not a 32 bit one.
-
-   * On some insns we allow the cc bit to be set.  If so, the
-     intention is that the simulated machine's %eflags register
-     is copied into that of the real machine before the insn,
-     and copied back again afterwards.  This means that the 
-     code generated for that insn must be very careful only to
-     update %eflags in the intended way.  This is particularly
-     important for the routines referenced by CALL insns.
-*/
-
-/* Meaning of operand kinds is as follows:
-
-   ArchReg  is a register of the simulated CPU, stored in memory,
-            in vg_m_state.m_eax .. m_edi.  These values are stored
-            using the Intel register encoding.
-
-   RealReg  is a register of the real CPU.  There are VG_MAX_REALREGS
-            available for allocation.  As with ArchRegs, these values
-            are stored using the Intel register encoding.
-
-   TempReg  is a temporary register used to express the results of
-            disassembly.  There is an unlimited supply of them -- 
-            register allocation and spilling eventually assigns them 
-            to RealRegs.
-
-   SpillNo  is a spill slot number.  The number of required spill
-            slots is VG_MAX_PSEUDOS, in general.  Only allowed 
-            as the ArchReg operand of GET and PUT.
-
-   Lit16    is a signed 16-bit literal value.
-
-   Literal  is a 32-bit literal value.  Each uinstr can only hold
-            one of these.
-
-   The disassembled code is expressed purely in terms of ArchReg,
-   TempReg and Literal operands.  Eventually, register allocation
-   removes all the TempRegs, giving a result using ArchRegs, RealRegs,
-   and Literals.  New x86 code can easily be synthesised from this.
-   There are carefully designed restrictions on which insns can have
-   which operands, intended to make it possible to generate x86 code
-   from the result of register allocation on the ucode efficiently and
-   without need of any further RealRegs.
-
-   Restrictions on insns (as generated by the disassembler) are as
-   follows:
-
-      A=ArchReg   S=SpillNo   T=TempReg   L=Literal   R=RealReg
-      N=NoValue
-
-         GETF       T       N       N
-         PUTF       T       N       N
-
-         GET        A,S     T       N
-         PUT        T       A,S     N
-         LOAD       T       T       N
-         STORE      T       T       N
-         MOV        T,L     T       N
-         CMOV       T       T       N
-         WIDEN      T       N       N
-         JMP        T,L     N       N
-         CALLM      L       N       N
-         CALLM_S    N       N       N
-         CALLM_E    N       N       N
-         PUSH,POP   T       N       N
-         CLEAR      L       N       N
-
-         AND, OR
-                    T       T       N
-
-         ADD, ADC, XOR, SUB, SBB
-                    A,L,T   T       N
-
-         SHL, SHR, SAR, ROL, ROR, RCL, RCR
-                    L,T     T       N
-
-         NOT, NEG, INC, DEC, CC2VAL, BSWAP
-                    T       N       N
-
-         JIFZ       T       L       N
-
-         FPU_R      L       T       N
-         FPU_W      L       T       N
-         FPU        L       N       N
-
-         LEA1       T       T   (const in a separate field)
-         LEA2       T       T       T   (const & shift ditto)
-
-         INCEIP     L       N       N
- 
-   and for instrumentation insns:
-
-         LOADV      T       T       N
-         STOREV     T,L     T       N
-         GETV       A       T       N
-         PUTV       T,L     A       N
-         GETVF      T       N       N
-         PUTVF      T       N       N
-         WIDENV     T       N       N
-         TESTV      A,T     N       N
-         SETV       A,T     N       N
-         TAG1       T       N       L
-         TAG2       T       T       L
-
-   Before register allocation, S operands should not appear anywhere.
-   After register allocation, all T operands should have been
-   converted into Rs, and S operands are allowed in GET and PUT --
-   denoting spill saves/restores.  
-
-   The size field should be 0 for insns for which it is meaningless,
-   ie those which do not directly move/operate on data.
-*/
-/* Check one uinstr against the per-opcode operand, size and flag
-   restrictions tabulated in the comment above.  beforeRA says which
-   stage the uinstr is at: when True, register operands must be
-   tagged TempReg and SpillNo operands are rejected; when False,
-   register operands must be tagged RealReg and SpillNo is accepted
-   wherever an AS* test is used.  Returns True iff the uinstr
-   passes; panics on an opcode that has no rule here. */
-Bool VG_(saneUInstr) ( Bool beforeRA, UInstr* u )
-{
-   /* TRn: operand n is a register of the kind expected at this
-      stage (TempReg before allocation, RealReg after). */
-#  define TR1 (beforeRA ? (u->tag1 == TempReg) : (u->tag1 == RealReg))
-#  define TR2 (beforeRA ? (u->tag2 == TempReg) : (u->tag2 == RealReg))
-#  define TR3 (beforeRA ? (u->tag3 == TempReg) : (u->tag3 == RealReg))
-   /* An: operand n is an ArchReg.  ASn: an ArchReg, or (only after
-      allocation) a SpillNo. */
-#  define A1  (u->tag1 == ArchReg)
-#  define A2  (u->tag2 == ArchReg)
-#  define AS1 ((u->tag1 == ArchReg) || ((!beforeRA && (u->tag1 == SpillNo))))
-#  define AS2 ((u->tag2 == ArchReg) || ((!beforeRA && (u->tag2 == SpillNo))))
-#  define AS3 ((u->tag3 == ArchReg) || ((!beforeRA && (u->tag3 == SpillNo))))
-   /* Ln: operand n is tagged Literal and its valN field is zero.
-      Lsn: operand n is tagged Lit16.  Nn: operand n is NoValue. */
-#  define L1  (u->tag1 == Literal && u->val1 == 0)
-#  define L2  (u->tag2 == Literal && u->val2 == 0)
-#  define Ls1 (u->tag1 == Lit16)
-#  define Ls3 (u->tag3 == Lit16)
-#  define N1  (u->tag1 == NoValue)
-#  define N2  (u->tag2 == NoValue)
-#  define N3  (u->tag3 == NoValue)
-   /* SZn: the size field equals n. */
-#  define SZ4 (u->size == 4)
-#  define SZ2 (u->size == 2)
-#  define SZ1 (u->size == 1)
-#  define SZ0 (u->size == 0)
-   /* CC0: no flags read or written.  CC1 is simply its negation.
-      FLG_RD / FLG_WR: all flags read (resp. written) and none
-      written (resp. read).  FLG_RD_WR_MAYBE: nothing at all, or
-      exactly the FlagsZCP set either read or written. */
-#  define CC0 (u->flags_r == FlagsEmpty && u->flags_w == FlagsEmpty)
-#  define FLG_RD (u->flags_r == FlagsALL && u->flags_w == FlagsEmpty)
-#  define FLG_WR (u->flags_r == FlagsEmpty && u->flags_w == FlagsALL)
-#  define FLG_RD_WR_MAYBE                                         \
-       ((u->flags_r == FlagsEmpty && u->flags_w == FlagsEmpty)    \
-        || (u->flags_r == FlagsEmpty && u->flags_w == FlagsZCP)   \
-        || (u->flags_r == FlagsZCP && u->flags_w == FlagsEmpty))
-#  define CC1 (!(CC0))
-   /* Size must be 4 when operand 1 is a register (of either kind);
-      no size constraint otherwise. */
-#  define SZ4_IF_TR1 ((u->tag1 == TempReg || u->tag1 == RealReg) \
-                      ? (u->size == 4) : True)
-
-   /* Regardless of opcode, at most one operand may be tagged
-      Literal. */
-   Int n_lits = 0;
-   if (u->tag1 == Literal) n_lits++;
-   if (u->tag2 == Literal) n_lits++;
-   if (u->tag3 == Literal) n_lits++;
-   if (n_lits > 1) 
-      return False;
-
-   /* Per-opcode rules.  Every known opcode returns directly from
-      its case. */
-   switch (u->opcode) {
-      case GETF:
-         return (SZ2 || SZ4) && TR1 && N2 && N3 && FLG_RD;
-      case PUTF:
-         return (SZ2 || SZ4) && TR1 && N2 && N3 && FLG_WR;
-      case CALLM_S: case CALLM_E:
-         return SZ0 && N1 && N2 && N3;
-      case INCEIP:
-         return SZ0 && CC0 && Ls1 && N2 && N3;
-      case LEA1:
-         return CC0 && TR1 && TR2 && N3 && SZ4;
-      case LEA2:
-         return CC0 && TR1 && TR2 && TR3 && SZ4;
-      case NOP: 
-         return SZ0 && CC0 && N1 && N2 && N3;
-      case GET: 
-         return CC0 && AS1 && TR2 && N3;
-      case PUT: 
-         return CC0 && TR1 && AS2 && N3;
-      case LOAD: case STORE: 
-         return CC0 && TR1 && TR2 && N3;
-      case MOV:
-         return CC0 && (TR1 || L1) && TR2 && N3 && SZ4_IF_TR1;
-      case CMOV:
-         return CC1 && TR1 && TR2 && N3 && SZ4;
-      case JMP: 
-         /* An unconditional jump must not touch flags; a conditional
-            one must. */
-         return (u->cond==CondAlways ? CC0 : CC1)
-                && (TR1 || L1) && N2 && SZ0 && N3;
-      case CLEAR:
-         return CC0 && Ls1 && N2 && SZ0 && N3;
-      case CALLM:
-         return SZ0 && Ls1 && N2 && N3;
-      case PUSH: case POP:
-         return CC0 && TR1 && N2 && N3;
-      case AND: case OR:
-         return TR1 && TR2 && N3;
-      case ADD: case ADC: case XOR: case SUB: case SBB:
-         return (A1 || TR1 || L1) && TR2 && N3;
-      case SHL: case SHR: case SAR: case ROL: case ROR: case RCL: case RCR:
-         return       (TR1 || L1) && TR2 && N3;
-      case NOT: case NEG: case INC: case DEC:
-         return        TR1 && N2 && N3;
-      case BSWAP:
-         return TR1 && N2 && N3 && CC0 && SZ4;
-      case CC2VAL: 
-         return CC1 && SZ1 && TR1 && N2 && N3;
-      case JIFZ:
-         return CC0 && SZ4 && TR1 && L2 && N3;
-      case FPU_R:  case FPU_W: 
-         return CC0 && Ls1 && TR2 && N3;
-      case FPU: 
-         return SZ0 && FLG_RD_WR_MAYBE && Ls1 && N2 && N3;
-      case LOADV:
-         return CC0 && TR1 && TR2 && N3;
-      case STOREV:
-         return CC0 && (TR1 || L1) && TR2 && N3;
-      case GETV: 
-         return CC0 && A1 && TR2 && N3;
-      case PUTV: 
-         return CC0 && (TR1 || L1) && A2 && N3;
-      case GETVF: 
-         return CC0 && TR1 && N2 && N3 && SZ0;
-      case PUTVF: 
-         return CC0 && TR1 && N2 && N3 && SZ0;
-      case WIDEN:
-         return CC0 && TR1 && N2 && N3;
-      case TESTV: 
-         return CC0 && (A1 || TR1) && N2 && N3;
-      case SETV:
-         return CC0 && (A1 || TR1) && N2 && N3;
-      case TAG1:
-         return CC0 && TR1 && N2 && Ls3 && SZ0;
-      case TAG2:
-         return CC0 && TR1 && TR2 && Ls3 && SZ0;
-      default: 
-         VG_(panic)("vg_saneUInstr: unhandled opcode");
-   }
-   /* The helper macros are local to this function. */
-#  undef SZ4_IF_TR1
-#  undef CC0
-#  undef CC1
-#  undef SZ4
-#  undef SZ2
-#  undef SZ1
-#  undef SZ0
-#  undef TR1
-#  undef TR2
-#  undef TR3
-#  undef A1
-#  undef A2
-#  undef AS1
-#  undef AS2
-#  undef AS3
-#  undef L1
-#  undef Ls1
-#  undef L2
-#  undef Ls3
-#  undef N1
-#  undef N2
-#  undef N3
-#  undef FLG_RD
-#  undef FLG_WR
-#  undef FLG_RD_WR_MAYBE 
-}
-
-
-/* Sanity checks to do with CALLMs in UCodeBlocks.  Returns True iff
-   cb passes both of the following checks:
-
-   1. Bracketing: CALLM, PUSH, POP and CLEAR appear only between a
-      CALLM_S and the following CALLM_E, sections do not nest, the
-      block does not end inside a section, and the total numbers of
-      CALLM, CALLM_S and CALLM_E uinstrs are all equal.
-
-   2. Within each CALLM_S .. CALLM interval, no two PUSH uinstrs
-      have the same val1. */
-Bool VG_(saneUCodeBlock) ( UCodeBlock* cb )
-{
-   Int  callm = 0;
-   Int  callm_s = 0;
-   Int  callm_e = 0;
-   Int  callm_ptr, calls_ptr;
-   Int  i, j, t;
-   Bool incall = False;
-
-   /* Ensure the number of CALLM, CALLM_S and CALLM_E are the same,
-      and that the call-related uinstrs are properly bracketed. */
-
-   for (i = 0; i < cb->used; i++) {
-      switch (cb->instrs[i].opcode) {
-         case CALLM:
-            if (!incall) return False;
-            callm++; 
-            break;
-         case CALLM_S: 
-            if (incall) return False;
-            incall = True;
-            callm_s++; 
-            break;
-         case CALLM_E: 
-            if (!incall) return False;
-            incall = False;
-            callm_e++; 
-            break;
-         case PUSH: case POP: case CLEAR:
-            if (!incall) return False;
-            break;
-         default:
-            break;
-      }
-   }
-   if (incall) return False;
-   if (callm != callm_s || callm != callm_e) return False;
-
-   /* Check the sections between CALLM_S and CALLM's.  Ensure that no
-      PUSH uinsn pushes any TempReg that any other PUSH in the same
-      section pushes.  Ie, check that the TempReg args to PUSHes in
-      the section are unique.  If not, the instrumenter generates
-      incorrect code for CALLM insns. */
-
-   callm_ptr = 0;
-
- find_next_CALLM:
-   /* Search for the next interval, making calls_ptr .. callm_ptr
-      bracket it. */
-   while (callm_ptr < cb->used 
-          && cb->instrs[callm_ptr].opcode != CALLM)
-      callm_ptr++;
-   if (callm_ptr == cb->used)
-      return True;
-   vg_assert(cb->instrs[callm_ptr].opcode == CALLM);
-
-   /* Walk backwards to the CALLM_S which opens this section.  The
-      first pass guarantees there is one, but bound the scan anyway
-      and check the index *before* using it, so that a violated
-      invariant trips the assertion rather than reading in front of
-      the instrs array. */
-   calls_ptr = callm_ptr - 1;
-   while (calls_ptr >= 0 
-          && cb->instrs[calls_ptr].opcode != CALLM_S)
-      calls_ptr--;
-   vg_assert(calls_ptr >= 0);
-   vg_assert(cb->instrs[calls_ptr].opcode == CALLM_S);
-
-   /* VG_(printf)("interval from %d to %d\n", calls_ptr, callm_ptr ); */
-
-   /* For each PUSH insn in the interval ... */
-   for (i = calls_ptr + 1; i < callm_ptr; i++) {
-      if (cb->instrs[i].opcode != PUSH) continue;
-      t = cb->instrs[i].val1;
-      /* Ensure no later PUSH insns up to callm_ptr push the same
-         TempReg.  Return False if any such are found. */
-      for (j = i+1; j < callm_ptr; j++) {
-         if (cb->instrs[j].opcode == PUSH &&
-             cb->instrs[j].val1 == t)
-            return False;
-      }
-   }
-
-   /* This interval is clean.  Keep going ... */
-   callm_ptr++;
-   goto find_next_CALLM;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Printing uinstrs.                                    ---*/
-/*------------------------------------------------------------*/
-
-/* Return the printable suffix for a condition code, e.g. "nz" for
-   CondNZ.  CondAlways yields "MP", which VG_(ppUInstr) appends to
-   the JMP opcode name "J".  Panics on any other value. */
-Char* VG_(nameCondcode) ( Condcode cond )
-{
-   switch (cond) {
-      case CondO:      return "o";
-      case CondNO:     return "no";
-      case CondB:      return "b";
-      case CondNB:     return "nb";
-      case CondZ:      return "z";
-      case CondNZ:     return "nz";
-      case CondBE:     return "be";
-      case CondNBE:    return "nbe";
-      case CondS:      return "s";
-      /* NOTE(review): spelled ConsNS, not CondNS -- presumably this
-         matches the spelling in the Condcode declaration; confirm
-         before "correcting" it. */
-      case ConsNS:     return "ns";
-      case CondP:      return "p";
-      case CondNP:     return "np";
-      case CondL:      return "l";
-      case CondNL:     return "nl";
-      case CondLE:     return "le";
-      case CondNLE:    return "nle";
-      case CondAlways: return "MP"; /* hack! */
-      default: VG_(panic)("nameCondcode");
-   }
-}
-
-
-/* Print prefix, followed by one letter for each flag present in
-   set.  Letters always come out in the order D O S Z A C P. */
-static void vg_ppFlagSet ( Char* prefix, FlagSet set )
-{
-   /* Emit the given letter iff the given flag is in set. */
-#  define PP_IF_SET(flag,letter)                   \
-      do {                                         \
-         if (set & (flag)) VG_(printf)(letter);    \
-      } while (0)
-
-   VG_(printf)("%s", prefix);
-   PP_IF_SET(FlagD, "D");
-   PP_IF_SET(FlagO, "O");
-   PP_IF_SET(FlagS, "S");
-   PP_IF_SET(FlagZ, "Z");
-   PP_IF_SET(FlagA, "A");
-   PP_IF_SET(FlagC, "C");
-   PP_IF_SET(FlagP, "P");
-
-#  undef PP_IF_SET
-}
-
-
-/* Print a TempReg number.  Even numbers are shown as "t<n>"; an odd
-   number n is shown as "q<n-1>", ie using the number of the even
-   temp just below it. */
-static void ppTempReg ( Int tt )
-{
-   Bool is_odd = (tt & 1) != 0;
-   if (is_odd) {
-      VG_(printf)("q%d", tt-1);
-      return;
-   }
-   VG_(printf)("t%d", tt);
-}
-
-
-/* Print operand number operandNo (1, 2 or 3) of u, optionally
-   wrapped in parentheses.  sz is the register size used when naming
-   RealRegs and ArchRegs; for RealRegs a size of 0 is treated as 4.
-   Panics if operandNo is out of range or the operand's tag is not
-   one of the known kinds. */
-static void ppUOperand ( UInstr* u, Int operandNo, Int sz, Bool parens )
-{
-   UInt tag, val;
-
-   /* Pick out the (tag, value) pair for the requested operand. */
-   if (operandNo == 1) {
-      tag = u->tag1; 
-      val = u->val1;
-   } else if (operandNo == 2) {
-      tag = u->tag2; 
-      val = u->val2;
-   } else if (operandNo == 3) {
-      tag = u->tag3; 
-      val = u->val3;
-   } else {
-      VG_(panic)("ppUOperand(1)");
-   }
-
-   /* A Literal's value lives in the uinstr's lit32 field, not in
-      the per-operand value. */
-   if (tag == Literal) 
-      val = u->lit32;
-
-   if (parens) 
-      VG_(printf)("(");
-
-   switch (tag) {
-      case TempReg: 
-         ppTempReg(val); 
-         break;
-      case RealReg: 
-         VG_(printf)("%s",nameIReg(sz==0 ? 4 : sz,val)); 
-         break;
-      case Literal: 
-         VG_(printf)("$0x%x", val); 
-         break;
-      case Lit16:   
-         VG_(printf)("$0x%x", val); 
-         break;
-      case NoValue: 
-         VG_(printf)("NoValue"); 
-         break;
-      case ArchReg: 
-         VG_(printf)("%S",nameIReg(sz,val)); 
-         break;
-      case SpillNo: 
-         VG_(printf)("spill%d", val); 
-         break;
-      default: 
-         VG_(panic)("ppUOperand(2)");
-   }
-
-   if (parens) 
-      VG_(printf)(")");
-}
-
-
-/* Return the printable name of a uinstr opcode.  The opcodes in the
-   first switch have both an upper-case and a lower-case name,
-   selected by upper.  All remaining opcodes have an upper-case name
-   only: asking for one of them with upper == False panics, as does
-   an opcode which appears in neither switch. */
-Char* VG_(nameUOpcode) ( Bool upper, Opcode opc )
-{
-   /* Opcodes available in either case. */
-   switch (opc) {
-      case ADD:   return (upper ? "ADD" : "add");
-      case ADC:   return (upper ? "ADC" : "adc");
-      case AND:   return (upper ? "AND" : "and");
-      case OR:    return (upper ? "OR"  : "or");
-      case XOR:   return (upper ? "XOR" : "xor");
-      case SUB:   return (upper ? "SUB" : "sub");
-      case SBB:   return (upper ? "SBB" : "sbb");
-      case SHL:   return (upper ? "SHL" : "shl");
-      case SHR:   return (upper ? "SHR" : "shr");
-      case SAR:   return (upper ? "SAR" : "sar");
-      case ROL:   return (upper ? "ROL" : "rol");
-      case ROR:   return (upper ? "ROR" : "ror");
-      case RCL:   return (upper ? "RCL" : "rcl");
-      case RCR:   return (upper ? "RCR" : "rcr");
-      case NOT:   return (upper ? "NOT" : "not");
-      case NEG:   return (upper ? "NEG" : "neg");
-      case INC:   return (upper ? "INC" : "inc");
-      case DEC:   return (upper ? "DEC" : "dec");
-      case BSWAP: return (upper ? "BSWAP" : "bswap");
-      default:    break;
-   }
-   /* Everything from here on only has an upper-case name. */
-   if (!upper) VG_(panic)("vg_nameUOpcode: invalid !upper");
-   switch (opc) {
-      case GETVF:   return "GETVF";
-      case PUTVF:   return "PUTVF";
-      case TAG1:    return "TAG1";
-      case TAG2:    return "TAG2";
-      case CALLM_S: return "CALLM_S";
-      case CALLM_E: return "CALLM_E";
-      case INCEIP:  return "INCEIP";
-      case LEA1:    return "LEA1";
-      case LEA2:    return "LEA2";
-      case NOP:     return "NOP";
-      case GET:     return "GET";
-      case PUT:     return "PUT";
-      case GETF:    return "GETF";
-      case PUTF:    return "PUTF";
-      case LOAD:    return "LD" ;
-      case STORE:   return "ST" ;
-      case MOV:     return "MOV";
-      case CMOV:    return "CMOV";
-      case WIDEN:   return "WIDEN";
-      case JMP:     return "J"    ;
-      case JIFZ:    return "JIFZ" ;
-      case CALLM:   return "CALLM";
-      case PUSH:    return "PUSH" ;
-      case POP:     return "POP"  ;
-      case CLEAR:   return "CLEAR";
-      case CC2VAL:  return "CC2VAL";
-      case FPU_R:   return "FPU_R";
-      case FPU_W:   return "FPU_W";
-      case FPU:     return "FPU"  ;
-      case LOADV:   return "LOADV";
-      case STOREV:  return "STOREV";
-      case GETV:    return "GETV";
-      case PUTV:    return "PUTV";
-      case TESTV:   return "TESTV";
-      case SETV:    return "SETV";
-      default:      VG_(panic)("nameUOpcode: unhandled case");
-   }
-}
-
-
-/* Print one uinstr on a line of its own.  The line consists of: a
-   tab and instrNo (width 4); the upper-case opcode name; for JMP
-   and CC2VAL, the condition code name; a one-character size suffix;
-   the operands, laid out according to the opcode; and finally, if
-   the uinstr reads or writes any flags, the flag sets in
-   parentheses.  Panics on an opcode with no layout here. */
-void VG_(ppUInstr) ( Int instrNo, UInstr* u )
-{
-   VG_(printf)("\t%4d: %s", instrNo, 
-                            VG_(nameUOpcode)(True, u->opcode));
-   /* NOTE(review): the argument list is written inside VG_( ... )
-      here, unlike the other VG_ calls in this file; presumably it
-      still expands to the intended call -- confirm against the
-      definition of VG_. */
-   if (u->opcode == JMP || u->opcode == CC2VAL)
-      VG_(printf)("%s", VG_(nameCondcode(u->cond)));
-
-   /* Size suffix: o for size 0, B/W/L/Q for 1/2/4/8, otherwise the
-      size in decimal. */
-   switch (u->size) {
-      case 0:  VG_(printf)("o"); break;
-      case 1:  VG_(printf)("B"); break;
-      case 2:  VG_(printf)("W"); break;
-      case 4:  VG_(printf)("L"); break;
-      case 8:  VG_(printf)("Q"); break;
-      default: VG_(printf)("%d", (Int)u->size); break;
-   }
-
-   /* Operands. */
-   switch (u->opcode) {
-
-      /* Shown as "op1 = tagop ( op1 )"; the tag op is in val3. */
-      case TAG1:
-         VG_(printf)("\t");
-         ppUOperand(u, 1, 4, False);
-         VG_(printf)(" = %s ( ", VG_(nameOfTagOp)( u->val3 ));
-         ppUOperand(u, 1, 4, False);
-         VG_(printf)(" )");
-         break;
-
-      /* Shown as "op2 = tagop ( op1, op2 )". */
-      case TAG2:
-         VG_(printf)("\t");
-         ppUOperand(u, 2, 4, False);
-         VG_(printf)(" = %s ( ", VG_(nameOfTagOp)( u->val3 ));
-         ppUOperand(u, 1, 4, False);
-         VG_(printf)(", ");
-         ppUOperand(u, 2, 4, False);
-         VG_(printf)(" )");
-         break;
-
-      case CALLM_S: case CALLM_E:
-         break;
-
-      case INCEIP:
-         VG_(printf)("\t$%d", u->val1);
-         break;
-
-      /* Shown as "lit32(op1,op2,extra4b), op3". */
-      case LEA2:
-         VG_(printf)("\t%d(" , u->lit32);
-         ppUOperand(u, 1, 4, False);
-         VG_(printf)(",");
-         ppUOperand(u, 2, 4, False);
-         VG_(printf)(",%d), ", (Int)u->extra4b);
-         ppUOperand(u, 3, 4, False);
-         break;
-
-      /* Shown as "lit32(op1), op2". */
-      case LEA1:
-         VG_(printf)("\t%d" , u->lit32);
-         ppUOperand(u, 1, 4, True);
-         VG_(printf)(", ");
-         ppUOperand(u, 2, 4, False);
-         break;
-
-      case NOP:
-         break;
-
-      /* For the FPU uinstrs, val1 is printed as two bytes, high
-         byte first. */
-      case FPU_W:
-         VG_(printf)("\t0x%x:0x%x, ",
-                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
-         ppUOperand(u, 2, 4, True);
-         break;
-
-      case FPU_R:
-         VG_(printf)("\t");
-         ppUOperand(u, 2, 4, True);
-         VG_(printf)(", 0x%x:0x%x",
-                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
-         break;
-
-      case FPU:
-         VG_(printf)("\t0x%x:0x%x",
-                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
-         break;
-
-      /* Two operands; the address operand of a load (op1) or of a
-         store (op2) is parenthesised. */
-      case STOREV: case LOADV:
-      case GET: case PUT: case MOV: case LOAD: case STORE: case CMOV:
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->size, u->opcode==LOAD || u->opcode==LOADV); 
-         VG_(printf)(", ");
-         ppUOperand(u, 2, u->size, u->opcode==STORE || u->opcode==STOREV);
-         break;
-
-      case GETF: case PUTF:
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->size, False);
-         break;
-
-      /* One operand; a JMP additionally gets a suffix describing
-         its jump kind, if it is one of the four listed. */
-      case JMP: case CC2VAL:
-      case PUSH: case POP: case CLEAR: case CALLM:
-         if (u->opcode == JMP) {
-            switch (u->jmpkind) {
-               case JmpCall:      VG_(printf)("-c"); break;
-               case JmpRet:       VG_(printf)("-r"); break;
-               case JmpSyscall:   VG_(printf)("-sys"); break;
-               case JmpClientReq: VG_(printf)("-cli"); break;
-               default: break;
-            }
-         }
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->size, False);
-         break;
-
-      case JIFZ:
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->size, False);
-         VG_(printf)(", ");
-         ppUOperand(u, 2, u->size, False);
-         break;
-
-      case PUTVF: case GETVF:
-         VG_(printf)("\t");
-         ppUOperand(u, 1, 0, False); 
-         break;
-
-      case NOT: case NEG: case INC: case DEC: case BSWAP:
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->size, False); 
-         break;
-
-      case ADD: case ADC: case AND: case OR:  
-      case XOR: case SUB: case SBB:   
-      case SHL: case SHR: case SAR: 
-      case ROL: case ROR: case RCL: case RCR:   
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->size, False); 
-         VG_(printf)(", ");
-         ppUOperand(u, 2, u->size, False);
-         break;
-
-      /* Operand 1 of PUTV and operand 2 of GETV are always printed
-         at size 4; the other operand is printed at the uinstr's own
-         size. */
-      case GETV: case PUTV:
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->opcode==PUTV ? 4 : u->size, False);
-         VG_(printf)(", ");
-         ppUOperand(u, 2, u->opcode==GETV ? 4 : u->size, False);
-         break;
-
-      /* The opcode name gets a further "_<size><s|z>" suffix, built
-         from extra4b and signed_widen. */
-      case WIDEN:
-         VG_(printf)("_%c%c", VG_(toupper)(nameISize(u->extra4b)),
-                              u->signed_widen?'s':'z');
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->size, False);
-         break;
-
-      case TESTV: case SETV:
-         VG_(printf)("\t");
-         ppUOperand(u, 1, u->size, False);
-         break;
-
-      default: VG_(panic)("ppUInstr: unhandled opcode");
-   }
-
-   /* Flags read (-r) and written (-w), if any. */
-   if (u->flags_r != FlagsEmpty || u->flags_w != FlagsEmpty) {
-      VG_(printf)("  (");
-      if (u->flags_r != FlagsEmpty) 
-         vg_ppFlagSet("-r", u->flags_r);
-      if (u->flags_w != FlagsEmpty) 
-         vg_ppFlagSet("-w", u->flags_w);
-      VG_(printf)(")");
-   }
-   VG_(printf)("\n");
-}
-
-
-/* Print title on a line of its own, preceded by a blank line, then
-   every uinstr in cb which is not a NOP, then a trailing blank
-   line.  Each uinstr is labelled with its index in cb. */
-void VG_(ppUCodeBlock) ( UCodeBlock* cb, Char* title )
-{
-   Int ix = 0;
-   VG_(printf)("\n%s\n", title);
-   while (ix < cb->used) {
-      /* NOPs are just noise in the listing; skip them. */
-      if (cb->instrs[ix].opcode != NOP) {
-         VG_(ppUInstr) ( ix, &cb->instrs[ix] );
-      }
-      ix++;
-   }
-   VG_(printf)("\n");
-}
-
-
-/*------------------------------------------------------------*/
-/*--- uinstr helpers for register allocation               ---*/
-/*--- and code improvement.                                ---*/
-/*------------------------------------------------------------*/
-
-/* A structure for communicating temp uses, and for indicating
-   temp->real register mappings for patchUInstr.  getTempUsage fills
-   in tempNo and isWrite only; patchUInstr consults tempNo and
-   realNo only. */
-typedef
-   struct {
-      Int   realNo;   /* real register that tempNo is mapped to */
-      Int   tempNo;   /* number of the TempReg concerned */
-      Bool  isWrite;  /* True if this use writes the temp, False if
-                         it reads it */
-   }
-   TempUse;
-
-
-/* Get the temp use of a uinstr, parking them in an array supplied by
-   the caller, which is assumed to be big enough.  Return the number
-   of entries.  Insns which read _and_ write a register wind up
-   mentioning it twice.  Entries are placed in the array in program
-   order, so that if a reg is read-modified-written, it appears first
-   as a read and then as a write.  
-
-   Only operands tagged TempReg produce an entry, and only the
-   tempNo and isWrite fields of each entry are set.  No opcode
-   handled here produces more than three entries.  Panics on an
-   opcode which is not handled.
-*/
-static __inline__ 
-Int getTempUsage ( UInstr* u, TempUse* arr )
-{
-
-   /* RD(n) / WR(n): if operand n is a TempReg, append a read
-      (resp. write) entry for it to arr. */
-#  define RD(ono)                                  \
-      if (mycat(u->tag,ono) == TempReg)            \
-         { arr[n].tempNo  = mycat(u->val,ono);     \
-           arr[n].isWrite = False; n++; }
-#  define WR(ono)                                  \
-      if (mycat(u->tag,ono) == TempReg)            \
-         { arr[n].tempNo  = mycat(u->val,ono);     \
-           arr[n].isWrite = True; n++; }
-
-   Int n = 0;
-   switch (u->opcode) {
-      case LEA1: RD(1); WR(2); break;
-      case LEA2: RD(1); RD(2); WR(3); break;
-
-      case NOP: case FPU: case INCEIP: case CALLM_S: case CALLM_E: break;
-      case FPU_R: case FPU_W: RD(2); break;
-
-      case GETF:  WR(1); break;
-      case PUTF:  RD(1); break;
-
-      case GET:   WR(2); break;
-      case PUT:   RD(1); break;
-      case LOAD:  RD(1); WR(2); break;
-      case STORE: RD(1); RD(2); break;
-      case MOV:   RD(1); WR(2); break;
-
-      case JMP:   RD(1); break;
-      case CLEAR: case CALLM: break;
-
-      case PUSH: RD(1); break;
-      case POP:  WR(1); break;
-
-      /* Two-operand insns which read both operands and write the
-         second. */
-      case TAG2:
-      case CMOV:
-      case ADD: case ADC: case AND: case OR:  
-      case XOR: case SUB: case SBB:   
-         RD(1); RD(2); WR(2); break;
-
-      case SHL: case SHR: case SAR: 
-      case ROL: case ROR: case RCL: case RCR:
-         RD(1); RD(2); WR(2); break;
-
-      /* One-operand insns which modify their operand in place. */
-      case NOT: case NEG: case INC: case DEC: case TAG1: case BSWAP:
-         RD(1); WR(1); break;
-
-      case WIDEN: RD(1); WR(1); break;
-
-      case CC2VAL: WR(1); break;
-      case JIFZ: RD(1); break;
-
-      /* These sizes are only ever consulted when the instrumentation
-         code is being added, so the following can return
-         manifestly-bogus sizes. */
-      /* NOTE(review): the remark above talks about sizes, but
-         nothing in this function deals with sizes; it looks as if it
-         was carried over from elsewhere -- confirm.  The cases below
-         are the instrumentation uinstrs. */
-      case LOADV:   RD(1); WR(2); break;
-      case STOREV:  RD(1); RD(2); break;
-      case GETV:    WR(2); break;
-      case PUTV:    RD(1); break;
-      case TESTV:   RD(1); break;
-      case SETV:    WR(1); break;
-      case PUTVF:   RD(1); break;
-      case GETVF:   WR(1); break;
-
-      default: VG_(panic)("getTempUsage: unhandled opcode");
-   }
-   return n;
-
-#  undef RD
-#  undef WR
-}
-
-
-/* Rewrite every TempReg operand of u as a RealReg operand.  tmap
-   holds n_tmap (tempNo, realNo) pairs; for each TempReg operand the
-   first pair with a matching tempNo supplies the real register.
-   Panics if an operand's temp has no entry in tmap. */
-static __inline__ 
-void patchUInstr ( UInstr* u, TempUse* tmap, Int n_tmap )
-{
-   Int k;
-
-   /* Patch one operand, given the names of its tag and value
-      fields and the message to panic with if the lookup fails. */
-#  define PATCH_OPERAND(tagN,valN,panicMsg)                 \
-      if (u->tagN == TempReg) {                             \
-         for (k = 0; k < n_tmap; k++)                       \
-            if (tmap[k].tempNo == u->valN) break;           \
-         if (k == n_tmap) VG_(panic)(panicMsg);             \
-         u->tagN = RealReg;                                 \
-         u->valN = tmap[k].realNo;                          \
-      }
-
-   PATCH_OPERAND(tag1, val1, "patchUInstr(1)")
-   PATCH_OPERAND(tag2, val2, "patchUInstr(2)")
-   PATCH_OPERAND(tag3, val3, "patchUInstr(3)")
-
-#  undef PATCH_OPERAND
-}
-
-
-/* Map a (reg size, reg no) pair to the number of the 32-bit
-   register which contains it.  This is an x86-specific wart: for
-   sizes 4 and 2 the number is already right, but for size 1 the
-   numbers 4 .. 7 (%ah .. %bh) have to be folded back onto 0 .. 3
-   (%eax .. %ebx).  Panics on any other size. */
-static __inline__ 
-Int containingArchRegOf ( Int sz, Int aregno )
-{
-   if (sz == 4 || sz == 2)
-      return aregno;
-
-   if (sz == 1)
-      return (aregno < 4) ? aregno : aregno-4;
-
-   VG_(panic)("containingArchRegOf");
-}
-
-
-/* If u reads an ArchReg, return the number of the containing arch
-   reg.  Otherwise return -1.  Used in redundant-PUT elimination. 
-
-   Only GET and the two-operand arithmetic/logic/shift insns can
-   give a result other than -1, and then only when their first
-   operand is tagged ArchReg.  The instrumentation opcodes (LOADV,
-   STOREV and so on) are not listed, so they end up in the default
-   case, which prints the uinstr and panics. */
-static __inline__ 
-Int maybe_uinstrReadsArchReg ( UInstr* u )
-{
-   switch (u->opcode) {
-      /* Insns whose first operand may be an ArchReg which is read. */
-      case GET:
-      case ADD: case ADC: case AND: case OR:  
-      case XOR: case SUB: case SBB:   
-      case SHL: case SHR: case SAR: case ROL: 
-      case ROR: case RCL: case RCR:
-         if (u->tag1 == ArchReg) 
-            return containingArchRegOf ( u->size, u->val1 ); 
-         else
-            return -1;
-
-      /* Insns treated as never reading an ArchReg. */
-      case GETF: case PUTF:
-      case CALLM_S: case CALLM_E:
-      case INCEIP:
-      case LEA1:
-      case LEA2:
-      case NOP:
-      case PUT:
-      case LOAD:
-      case STORE:
-      case MOV:
-      case CMOV:
-      case JMP:
-      case CALLM: case CLEAR: case PUSH: case POP:
-      case NOT: case NEG: case INC: case DEC: case BSWAP:
-      case CC2VAL:
-      case JIFZ:
-      case FPU: case FPU_R: case FPU_W:
-      case WIDEN:
-         return -1;
-
-      default: 
-         VG_(ppUInstr)(0,u);
-         VG_(panic)("maybe_uinstrReadsArchReg: unhandled opcode");
-   }
-}
-
-/* Return True iff u reads or writes the TempReg numbered tempreg,
-   as reported by getTempUsage. */
-static __inline__
-Bool uInstrMentionsTempReg ( UInstr* u, Int tempreg )
-{
-   TempUse uses[3];
-   Int     n_uses = getTempUsage ( u, &uses[0] );
-   Int     ix     = 0;
-
-   while (ix < n_uses) {
-      if (uses[ix].tempNo == tempreg)
-         return True;
-      ix++;
-   }
-   return False;
-}
-
-
-/*------------------------------------------------------------*/
-/*--- ucode improvement.                                   ---*/
-/*------------------------------------------------------------*/
-
-/* Improve the code in cb by doing
-   -- Redundant ArchReg-fetch elimination
-   -- Redundant PUT elimination
-   -- Redundant cond-code restore/save elimination
-   The overall effect of these is to allow target registers to be
-   cached in host registers over multiple target insns.  
-*/
-static void vg_improve ( UCodeBlock* cb )
-{
-   Int     i, j, k, m, n, ar, tr, told, actual_areg;
-   Int     areg_map[8];
-   Bool    annul_put[8];
-   TempUse tempUse[3];
-   UInstr* u;
-   Bool    wr;
-   Int*    last_live_before;
-   FlagSet future_dead_flags;
-
-   if (cb->nextTemp > 0)
-      last_live_before = VG_(jitmalloc) ( cb->nextTemp * sizeof(Int) );
-   else
-      last_live_before = NULL;
-
-   
-   /* PASS 1: redundant GET elimination.  (Actually, more general than
-      that -- eliminates redundant fetches of ArchRegs). */
-
-   /* Find the live-range-ends for all temporaries.  Duplicates code
-      in the register allocator :-( */
-
-   for (i = 0; i < cb->nextTemp; i++) last_live_before[i] = -1;
-
-   for (i = cb->used-1; i >= 0; i--) {
-      u = &cb->instrs[i];
-
-      k = getTempUsage(u, &tempUse[0]);
-
-      /* For each temp usage ... bwds in program order. */
-      for (j = k-1; j >= 0; j--) {
-         tr = tempUse[j].tempNo;
-         wr = tempUse[j].isWrite;
-         if (last_live_before[tr] == -1) {
-            vg_assert(tr >= 0 && tr < cb->nextTemp);
-            last_live_before[tr] = wr ? (i+1) : i;
-         }
-      }
-
-   }
-
-#  define BIND_ARCH_TO_TEMP(archreg,tempreg)\
-   { Int q;                                           \
-     /* Invalidate any old binding(s) to tempreg. */  \
-     for (q = 0; q < 8; q++)                          \
-        if (areg_map[q] == tempreg) areg_map[q] = -1; \
-     /* Add the new binding. */                       \
-     areg_map[archreg] = (tempreg);                   \
-   }
-
-   /* Set up the A-reg map. */
-   for (i = 0; i < 8; i++) areg_map[i] = -1;
-
-   /* Scan insns. */
-   for (i = 0; i < cb->used; i++) {
-      u = &cb->instrs[i];
-      if (u->opcode == GET && u->size == 4) {
-         /* GET; see if it can be annulled. */
-         vg_assert(u->tag1 == ArchReg);
-         vg_assert(u->tag2 == TempReg);
-         ar   = u->val1;
-         tr   = u->val2;
-         told = areg_map[ar];
-         if (told != -1 && last_live_before[told] <= i) {
-            /* ar already has an old mapping to told, but that runs
-               out here.  Annul this GET, rename tr to told for the
-               rest of the block, and extend told's live range to that
-               of tr.  */
-            u->opcode = NOP;
-            u->tag1 = u->tag2 = NoValue;
-            n = last_live_before[tr] + 1;
-            if (n > cb->used) n = cb->used;
-            last_live_before[told] = last_live_before[tr];
-            last_live_before[tr] = i-1;
-            if (VG_(disassemble))
-               VG_(printf)(
-                  "at %d: delete GET, rename t%d to t%d in (%d .. %d)\n", 
-                  i, tr, told,i+1, n-1);
-            for (m = i+1; m < n; m++) {
-               if (cb->instrs[m].tag1 == TempReg 
-                   && cb->instrs[m].val1 == tr) 
-                 cb->instrs[m].val1 = told;
-               if (cb->instrs[m].tag2 == TempReg 
-                   && cb->instrs[m].val2 == tr) 
-                 cb->instrs[m].val2 = told;
-            }
-            BIND_ARCH_TO_TEMP(ar,told);
-         }
-         else
-            BIND_ARCH_TO_TEMP(ar,tr);
-      }
-      else if (u->opcode == GET && u->size != 4) {
-         /* Invalidate any mapping for this archreg.  */
-         actual_areg = containingArchRegOf ( u->size, u->val1 );
-         areg_map[actual_areg] = -1;
-      } 
-      else if (u->opcode == PUT && u->size == 4) {
-         /* PUT; re-establish t -> a binding */
-         vg_assert(u->tag1 == TempReg);
-         vg_assert(u->tag2 == ArchReg);
-         BIND_ARCH_TO_TEMP(u->val2, u->val1);
-      }
-      else if (u->opcode == PUT && u->size != 4) {
-         /* Invalidate any mapping for this archreg. */
-         actual_areg = containingArchRegOf ( u->size, u->val2 );
-         areg_map[actual_areg] = -1;
-      } else {
-
-         /* see if insn has an archreg as a read operand; if so try to
-            map it. */
-         if (u->tag1 == ArchReg && u->size == 4 
-                                && areg_map[u->val1] != -1) {
-            switch (u->opcode) {
-               case ADD: case SUB: case AND: case OR: case XOR:
-               case ADC: case SBB:
-               case SHL: case SHR: case SAR: case ROL: case ROR:
-               case RCL: case RCR:
-                  if (VG_(disassemble)) 
-                     VG_(printf)(
-                        "at %d: change ArchReg %S to TempReg t%d\n", 
-                        i, nameIReg(4,u->val1), areg_map[u->val1]);
-                  u->tag1 = TempReg;
-                  u->val1 = areg_map[u->val1];
-                  /* Remember to extend the live range of the TempReg,
-                     if necessary. */
-                  if (last_live_before[u->val1] < i)
-                     last_live_before[u->val1] = i;
-                  break;
-               default: 
-                  break;
-            }
-         }
-
-         /* boring insn; invalidate any mappings to temps it writes */
-         k = getTempUsage(u, &tempUse[0]);
-
-         for (j = 0; j < k; j++) {
-            wr  = tempUse[j].isWrite;
-            if (!wr) continue;
-            tr = tempUse[j].tempNo;
-            for (m = 0; m < 8; m++)
-               if (areg_map[m] == tr) areg_map[m] = -1;
-         }
-      }
-         
-   }
-
-#  undef BIND_ARCH_TO_TEMP
-
-   /* PASS 2: redundant PUT elimination.  Don't annul (delay) puts of
-      %ESP, since the memory check machinery always requires the
-      in-memory value of %ESP to be up to date.  Although this isn't
-      actually required by other analyses (cache simulation), it's
-      simplest to be consistent for all end-uses. */
-   for (j = 0; j < 8; j++)
-      annul_put[j] = False;
-
-   for (i = cb->used-1; i >= 0; i--) {
-      u = &cb->instrs[i];
-      if (u->opcode == NOP) continue;
-
-      if (u->opcode == PUT && u->size == 4) {
-         vg_assert(u->tag2 == ArchReg);
-         actual_areg = containingArchRegOf ( 4, u->val2 );
-         if (annul_put[actual_areg]) {
-            vg_assert(actual_areg != R_ESP);
-            u->opcode = NOP;
-            u->tag1 = u->tag2 = NoValue;
-            if (VG_(disassemble)) 
-               VG_(printf)("at %d: delete PUT\n", i );
-         } else {
-            if (actual_areg != R_ESP)
-               annul_put[actual_areg] = True;
-         }
-      } 
-      else if (u->opcode == PUT && u->size != 4) { 
-         actual_areg = containingArchRegOf ( u->size, u->val2 );
-         annul_put[actual_areg] = False;
-      } 
-      else if (u->opcode == JMP || u->opcode == JIFZ
-               || u->opcode == CALLM) {
-         for (j = 0; j < 8; j++)
-            annul_put[j] = False;
-      }
-      else {
-         /* If an instruction reads an ArchReg, the immediately
-            preceding PUT cannot be annulled. */
-         actual_areg = maybe_uinstrReadsArchReg ( u );
-         if (actual_areg != -1)      
-            annul_put[actual_areg] = False;
-      }
-   }
-
-   /* PASS 2a: redundant-move elimination.  Given MOV t1, t2 and t1 is
-      dead after this point, annul the MOV insn and rename t2 to t1.
-      Further modifies the last_live_before map. */
-
-#  if 0
-   VG_(ppUCodeBlock)(cb, "Before MOV elimination" );
-   for (i = 0; i < cb->nextTemp; i++)
-     VG_(printf)("llb[t%d]=%d   ", i, last_live_before[i]);
-   VG_(printf)("\n");
-#  endif
-
-   for (i = 0; i < cb->used-1; i++) {
-      u = &cb->instrs[i];
-      if (u->opcode != MOV) continue;
-      if (u->tag1 == Literal) continue;
-      vg_assert(u->tag1 == TempReg);
-      vg_assert(u->tag2 == TempReg);
-      if (last_live_before[u->val1] == i) {
-         if (VG_(disassemble))
-            VG_(printf)(
-               "at %d: delete MOV, rename t%d to t%d in (%d .. %d)\n",
-               i, u->val2, u->val1, i+1, last_live_before[u->val2] );
-         for (j = i+1; j <= last_live_before[u->val2]; j++) {
-            if (cb->instrs[j].tag1 == TempReg 
-                && cb->instrs[j].val1 == u->val2)
-               cb->instrs[j].val1 = u->val1;
-            if (cb->instrs[j].tag2 == TempReg 
-                && cb->instrs[j].val2 == u->val2)
-               cb->instrs[j].val2 = u->val1;
-         }
-         last_live_before[u->val1] = last_live_before[u->val2];
-         last_live_before[u->val2] = i-1;
-         u->opcode = NOP;
-         u->tag1 = u->tag2 = NoValue;
-      }
-   }
-
-   /* PASS 3: redundant condition-code restore/save elimination.
-      Scan backwards from the end.  future_dead_flags records the set
-      of flags which are dead at this point, that is, will be written
-      before they are next read.  Earlier uinsns which write flags
-      already in future_dead_flags can have their writes annulled.  
-   */
-   future_dead_flags = FlagsEmpty;
-
-   for (i = cb->used-1; i >= 0; i--) {
-      u = &cb->instrs[i];
-
-      /* We might never make it to insns beyond this one, so be
-         conservative. */
-      if (u->opcode == JIFZ || u->opcode == JMP) {
-         future_dead_flags = FlagsEmpty;
-         continue;
-      } 
-
-      /* PUTF modifies the %EFLAGS in essentially unpredictable ways.
-         For example people try to mess with bit 21 to see if CPUID
-         works.  The setting may or may not actually take hold.  So we
-         play safe here. */
-      if (u->opcode == PUTF) {
-         future_dead_flags = FlagsEmpty;
-         continue;
-      } 
-
-      /* We can annul the flags written by this insn if it writes a
-         subset (or eq) of the set of flags known to be dead after
-         this insn.  If not, just record the flags also written by
-         this insn.*/
-      if (u->flags_w != FlagsEmpty
-          && VG_IS_FLAG_SUBSET(u->flags_w, future_dead_flags)) {
-         if (VG_(disassemble)) {
-            VG_(printf)("at %d: annul flag write ", i);
-            vg_ppFlagSet("", u->flags_w);
-            VG_(printf)(" due to later ");
-            vg_ppFlagSet("", future_dead_flags);
-            VG_(printf)("\n");
-         }
-         u->flags_w = FlagsEmpty;
-      } else {
-        future_dead_flags 
-           = VG_UNION_FLAG_SETS ( u->flags_w, future_dead_flags );
-      }
-
-      /* If this insn also reads flags, empty out future_dead_flags so
-         as to force preceding writes not to be annulled. */
-      if (u->flags_r != FlagsEmpty)
-         future_dead_flags = FlagsEmpty;
-   }
-
-   if (last_live_before) 
-      VG_(jitfree) ( last_live_before );
-}
-
-
-/*------------------------------------------------------------*/
-/*--- The new register allocator.                          ---*/
-/*------------------------------------------------------------*/
-
-typedef
-   struct {
-      /* Per-TempReg bookkeeping used by vg_do_register_allocation.
-         live_after: index of the first insn which writes this
-         TempReg; it is live from just after that insn. */
-      Int live_after;
-      /* The TempReg is dead from just before this insn index onwards
-         (the index of its last read, or one past its last write,
-         whichever is greater). */
-      Int dead_before;
-      /* The "home" spill slot, if needed.  Never changes. */
-      Int spill_no;
-      /* Where is it?  VG_NOTHING (the allocator's private "no value"
-         marker) == in its spill slot; else the rank of the real reg
-         currently holding it. */
-      Int real_no;
-   }
-   TempInfo;
-
-
-/* Take a ucode block and allocate its TempRegs to RealRegs, or put
-   them in spill locations, and add spill code, if there are not
-   enough real regs.  The usual register allocation deal, in short.  
-
-   The input block c1 is freed before returning; the result is a
-   freshly allocated block holding the rewritten code.  Panics if no
-   free spill slot can be found among the VG_MAX_SPILLSLOTS available,
-   or if no real register can be freed up for an insn.
-
-   The work proceeds in three forward passes over c1:
-     1. establish the live range of every TempReg;
-     2. give every used TempReg a "home" spill slot;
-     3. copy each non-NOP insn to the output, assigning real regs and
-        emitting spill stores/loads around it where necessary.
-
-   Important redundancy of representation:
-
-     real_to_temp maps real reg ranks (RRRs) to TempReg nos, or
-     to VG_NOTHING if the real reg has no currently assigned TempReg.
-
-     The .real_no field of a TempInfo gives the current RRR for
-     this TempReg, or VG_NOTHING if the TempReg is currently
-     in memory, in which case it is in the spill slot denoted by
-     its .spill_no field.
-
-   These pieces of information (a fwds-bwds mapping, really) must 
-   be kept consistent!
-
-   This allocator uses the so-called Second Chance Bin Packing
-   algorithm, as described in "Quality and Speed in Linear-scan
-   Register Allocation" (Traub, Holloway and Smith, ACM PLDI98,
-   pp142-151).  It is simple and fast and remarkably good at
-   minimising the amount of spill code introduced.
-*/
-
-static
-UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
-{
-   TempInfo*    temp_info;
-   Int          real_to_temp[VG_MAX_REALREGS];
-   Bool         is_spill_cand[VG_MAX_REALREGS];
-   Int          ss_busy_until_before[VG_MAX_SPILLSLOTS];
-   Int          i, j, k, m, r, tno, max_ss_no;
-   Bool         wr, defer, isRead, spill_reqd;
-   TempUse      tempUse[3];
-   UCodeBlock*  c2;
-
-   /* Used to denote ... well, "no value" in this fn.  It marks
-      unset live-range fields, unassigned real regs and TempRegs
-      which are not currently in a real reg. */
-#  define VG_NOTHING (-2)
-
-   /* Initialise the TempReg info.  temp_info stays NULL when the
-      block has no TempRegs at all. */
-   if (c1->nextTemp > 0)
-      temp_info = VG_(jitmalloc)(c1->nextTemp * sizeof(TempInfo) );
-   else
-      temp_info = NULL;
-
-   for (i = 0; i < c1->nextTemp; i++) {
-      temp_info[i].live_after  = VG_NOTHING;
-      temp_info[i].dead_before = VG_NOTHING;
-      temp_info[i].spill_no    = VG_NOTHING;
-      /* temp_info[i].real_no is not yet relevant. */
-   }
-
-   spill_reqd = False;
-
-   /* Scan fwds to establish live ranges. */
-
-   for (i = 0; i < c1->used; i++) {
-      k = getTempUsage(&c1->instrs[i], &tempUse[0]);
-      vg_assert(k >= 0 && k <= 3);
-
-      /* For each temp usage ... fwds in program order */
-      for (j = 0; j < k; j++) {
-         tno = tempUse[j].tempNo;
-         wr  = tempUse[j].isWrite;
-         if (wr) {
-            /* Writes hold a reg live until after this insn. */
-            if (temp_info[tno].live_after == VG_NOTHING)
-               temp_info[tno].live_after = i;
-            if (temp_info[tno].dead_before < i + 1)
-               temp_info[tno].dead_before = i + 1;
-         } else {
-            /* First use of a tmp should be a write. */
-            vg_assert(temp_info[tno].live_after != VG_NOTHING);
-            /* Reads only hold it live until before this insn. */
-            if (temp_info[tno].dead_before < i)
-               temp_info[tno].dead_before = i;
-         }
-      }
-   }
-
-#  if 0
-   /* Sanity check on live ranges.  Expensive but correct. */
-   for (i = 0; i < c1->nextTemp; i++) {
-      vg_assert( (temp_info[i].live_after == VG_NOTHING 
-                  && temp_info[i].dead_before == VG_NOTHING)
-                 || (temp_info[i].live_after != VG_NOTHING 
-                     && temp_info[i].dead_before != VG_NOTHING) );
-   }
-#  endif
-
-   /* Do a rank-based allocation of TempRegs to spill slot numbers.
-      We put as few as possible values in spill slots, but
-      nevertheless need to have an assignment to them just in case.
-      ss_busy_until_before[s] is the dead_before point of the TempReg
-      most recently given slot s; the slot may be reused by any
-      TempReg whose live_after is at or beyond that point. */
-
-   max_ss_no = -1;
-
-   for (i = 0; i < VG_MAX_SPILLSLOTS; i++)
-      ss_busy_until_before[i] = 0;
-  
-   for (i = 0; i < c1->nextTemp; i++) {
-
-      /* True iff this temp is unused. */
-      if (temp_info[i].live_after == VG_NOTHING) 
-         continue;
-
-      /* Find the lowest-numbered spill slot which is available at the
-         start point of this interval, and assign the interval to
-         it. */
-      for (j = 0; j < VG_MAX_SPILLSLOTS; j++)
-         if (ss_busy_until_before[j] <= temp_info[i].live_after)
-            break;
-      if (j == VG_MAX_SPILLSLOTS) {
-         VG_(printf)("VG_MAX_SPILLSLOTS is too low; increase and recompile.\n");
-         VG_(panic)("register allocation failed -- out of spill slots");
-      }
-      ss_busy_until_before[j] = temp_info[i].dead_before;
-      temp_info[i].spill_no = j;
-      if (j > max_ss_no)
-         max_ss_no = j;
-   }
-
-   VG_(total_reg_rank) += (max_ss_no+1);
-
-   /* Show live ranges and assigned spill slot nos. */
-
-   if (VG_(disassemble)) {
-      VG_(printf)("Live Range Assignments\n");
-
-      for (i = 0; i < c1->nextTemp; i++) {
-         if (temp_info[i].live_after == VG_NOTHING) 
-            continue;
-         VG_(printf)(
-            "   LR %d is   after %d to before %d   spillno %d\n",
-            i,
-            temp_info[i].live_after,
-            temp_info[i].dead_before,
-            temp_info[i].spill_no
-         );
-      }
-   }
-
-   /* Now that we've established a spill slot number for each used
-      temporary, we can go ahead and do the core of the "Second-chance
-      binpacking" allocation algorithm. */
-
-   /* Resulting code goes here.  We generate it all in a forwards
-      pass. */
-   c2 = VG_(allocCodeBlock)();
-
-   /* At the start, no TempRegs are assigned to any real register.
-      Correspondingly, all temps claim to be currently resident in
-      their spill slots, as computed by the previous two passes. */
-   for (i = 0; i < VG_MAX_REALREGS; i++)
-      real_to_temp[i] = VG_NOTHING;
-   for (i = 0; i < c1->nextTemp; i++)
-      temp_info[i].real_no = VG_NOTHING;
-
-   if (VG_(disassemble))
-      VG_(printf)("\n");
-
-   /* Process each insn in turn.  NOPs are dropped, so they never
-      reach the output block. */
-   for (i = 0; i < c1->used; i++) {
-
-      if (c1->instrs[i].opcode == NOP) continue;
-      VG_(uinstrs_prealloc)++;
-
-#     if 0
-      /* Check map consistency.  Expensive but correct. */
-      for (r = 0; r < VG_MAX_REALREGS; r++) {
-         if (real_to_temp[r] != VG_NOTHING) {
-            tno = real_to_temp[r];
-            vg_assert(tno >= 0 && tno < c1->nextTemp);
-            vg_assert(temp_info[tno].real_no == r);
-         }
-      }
-      for (tno = 0; tno < c1->nextTemp; tno++) {
-         if (temp_info[tno].real_no != VG_NOTHING) {
-            r = temp_info[tno].real_no;
-            vg_assert(r >= 0 && r < VG_MAX_REALREGS);
-            vg_assert(real_to_temp[r] == tno);
-         }
-      }
-#     endif
-
-      if (VG_(disassemble))
-         VG_(ppUInstr)(i, &c1->instrs[i]);
-
-      /* First, free up enough real regs for this insn.  This may
-         generate spill stores since we may have to evict some TempRegs
-         currently in real regs.  Also generates spill loads. */
-
-      k = getTempUsage(&c1->instrs[i], &tempUse[0]);
-      vg_assert(k >= 0 && k <= 3);
-
-      /* For each ***different*** temp mentioned in the insn .... */
-      for (j = 0; j < k; j++) {
-
-         /* First check if the temp is mentioned again later; if so,
-            ignore this mention.  We only want to process each temp
-            used by the insn once, even if it is mentioned more than
-            once. */
-         defer = False;
-         tno = tempUse[j].tempNo;
-         for (m = j+1; m < k; m++)
-            if (tempUse[m].tempNo == tno) 
-               defer = True;
-         if (defer) 
-            continue;
-
-         /* Now we're trying to find a register for tempUse[j].tempNo.
-            First of all, if it already has a register assigned, we
-            don't need to do anything more. */
-         if (temp_info[tno].real_no != VG_NOTHING)
-            continue;
-
-         /* No luck.  The next thing to do is see if there is a
-            currently unassigned register available.  If so, bag it.
-            Note that no spill load is emitted on this path. */
-         for (r = 0; r < VG_MAX_REALREGS; r++) {
-            if (real_to_temp[r] == VG_NOTHING)
-               break;
-         }
-         if (r < VG_MAX_REALREGS) {
-            real_to_temp[r]        = tno;
-            temp_info[tno].real_no = r;
-            continue;
-         }
-
-         /* Unfortunately, that didn't pan out either.  So we'll have
-            to eject some other unfortunate TempReg into a spill slot
-            in order to free up a register.  Of course, we need to be
-            careful not to eject some other TempReg needed by this
-            insn.
-
-            Select r in 0 .. VG_MAX_REALREGS-1 such that
-            real_to_temp[r] is not mentioned in 
-            tempUse[0 .. k-1].tempNo, since it would be just plain 
-            wrong to eject some other TempReg which we need to use in 
-            this insn.
-
-            It is here that it is important to make a good choice of
-            register to spill.  */
-
-         /* First, mark those regs which are not spill candidates. */
-         for (r = 0; r < VG_MAX_REALREGS; r++) {
-            is_spill_cand[r] = True;
-            for (m = 0; m < k; m++) {
-               if (real_to_temp[r] == tempUse[m].tempNo) {
-                  is_spill_cand[r] = False;
-                  break;
-               }
-            }
-         }
-
-         /* We can choose any r satisfying is_spill_cand[r].  However,
-            try to make a good choice.  First, try and find r such
-            that the associated TempReg is already dead.  (Every real
-            reg is occupied at this point, since the search for a free
-            one above failed, so real_to_temp[r] is a valid index.) */
-         for (r = 0; r < VG_MAX_REALREGS; r++) {
-            if (is_spill_cand[r] && 
-                temp_info[real_to_temp[r]].dead_before <= i)
-               goto have_spill_cand;
-         }
-
-         /* No spill cand is mapped to a dead TempReg.  Now we really
-           _do_ have to generate spill code.  Choose r so that the
-           next use of its associated TempReg is as far ahead as
-           possible, in the hope that this will minimise the number of
-           consequent reloads required.  This is a bit expensive, but
-           we don't have to do it very often.  furthest_r stays at
-           VG_MAX_REALREGS if there is no candidate at all, which is
-           caught by the panic below. */
-         {
-            Int furthest_r = VG_MAX_REALREGS;
-            Int furthest = 0;
-            for (r = 0; r < VG_MAX_REALREGS; r++) {
-               if (!is_spill_cand[r]) continue;
-               for (m = i+1; m < c1->used; m++)
-                  if (uInstrMentionsTempReg(&c1->instrs[m], 
-                                            real_to_temp[r]))
-                     break;
-               if (m > furthest) {
-                  furthest   = m;
-                  furthest_r = r;
-               }
-            }
-            r = furthest_r;
-            goto have_spill_cand;
-         }
-
-         have_spill_cand:
-         if (r == VG_MAX_REALREGS)
-            VG_(panic)("new reg alloc: out of registers ?!");
-
-         /* Eject r.  Important refinement: don't bother emitting the
-            spill store if the associated TempReg is now dead. */
-         vg_assert(real_to_temp[r] != VG_NOTHING);
-         vg_assert(real_to_temp[r] != tno);
-         temp_info[real_to_temp[r]].real_no = VG_NOTHING;
-         if (temp_info[real_to_temp[r]].dead_before > i) {
-            uInstr2(c2, PUT, 4, 
-                        RealReg, VG_(rankToRealRegNo)(r), 
-                        SpillNo, temp_info[real_to_temp[r]].spill_no);
-            VG_(uinstrs_spill)++;
-            spill_reqd = True;
-            if (VG_(disassemble))
-               VG_(ppUInstr)(c2->used-1, &LAST_UINSTR(c2));
-         }
-
-         /* Decide if tno is read. */
-         isRead = False;
-         for (m = 0; m < k; m++)
-            if (tempUse[m].tempNo == tno && !tempUse[m].isWrite) 
-               isRead = True;
-
-         /* If so, generate a spill load. */
-         if (isRead) {
-            uInstr2(c2, GET, 4, 
-                        SpillNo, temp_info[tno].spill_no, 
-                        RealReg, VG_(rankToRealRegNo)(r) );
-            VG_(uinstrs_spill)++;
-            spill_reqd = True;
-            if (VG_(disassemble))
-               VG_(ppUInstr)(c2->used-1, &LAST_UINSTR(c2));
-         }
-
-         /* Update the forwards and backwards maps. */
-         real_to_temp[r]        = tno;
-         temp_info[tno].real_no = r;
-      }
-
-      /* By this point, all TempRegs mentioned by the insn have been
-         brought into real regs.  We now copy the insn to the output
-         and use patchUInstr to convert its TempRegs into
-         realregs. */
-      for (j = 0; j < k; j++)
-         tempUse[j].realNo 
-            = VG_(rankToRealRegNo)(temp_info[tempUse[j].tempNo].real_no);
-      VG_(copyUInstr)(c2, &c1->instrs[i]);
-      patchUInstr(&LAST_UINSTR(c2), &tempUse[0], k);
-
-      if (VG_(disassemble)) {
-         VG_(ppUInstr)(c2->used-1, &LAST_UINSTR(c2));
-         VG_(printf)("\n");
-      }
-   }
-
-   if (temp_info != NULL)
-      VG_(jitfree)(temp_info);
-
-   VG_(freeCodeBlock)(c1);
-
-   if (spill_reqd) 
-      VG_(translations_needing_spill)++;
-
-   return c2;
-
-#  undef VG_NOTHING
-
-}
-
-
-/*------------------------------------------------------------*/
-/*--- New instrumentation machinery.                       ---*/
-/*------------------------------------------------------------*/
-
-/* Select the ImproveOR_TQ tag op for an operand of size sz (4, 2 or
-   1).  Any other size is a panic. */
-static
-VgTagOp get_VgT_ImproveOR_TQ ( Int sz )
-{
-   if (sz == 4) return VgT_ImproveOR4_TQ;
-   if (sz == 2) return VgT_ImproveOR2_TQ;
-   if (sz == 1) return VgT_ImproveOR1_TQ;
-   VG_(panic)("get_VgT_ImproveOR_TQ");
-}
-
-
-/* Select the ImproveAND_TQ tag op for an operand of size sz (4, 2 or
-   1).  Any other size is a panic. */
-static
-VgTagOp get_VgT_ImproveAND_TQ ( Int sz )
-{
-   if (sz == 4) return VgT_ImproveAND4_TQ;
-   if (sz == 2) return VgT_ImproveAND2_TQ;
-   if (sz == 1) return VgT_ImproveAND1_TQ;
-   VG_(panic)("get_VgT_ImproveAND_TQ");
-}
-
-
-/* Select the Left tag op for an operand of size sz (4, 2 or 1).  Any
-   other size is a panic. */
-static
-VgTagOp get_VgT_Left ( Int sz )
-{
-   if (sz == 4) return VgT_Left4;
-   if (sz == 2) return VgT_Left2;
-   if (sz == 1) return VgT_Left1;
-   VG_(panic)("get_VgT_Left");
-}
-
-
-/* Select the UifU tag op for an operand of size sz (4, 2, 1 or 0).
-   Any other size is a panic. */
-static
-VgTagOp get_VgT_UifU ( Int sz )
-{
-   if (sz == 4) return VgT_UifU4;
-   if (sz == 2) return VgT_UifU2;
-   if (sz == 1) return VgT_UifU1;
-   if (sz == 0) return VgT_UifU0;
-   VG_(panic)("get_VgT_UifU");
-}
-
-
-/* Select the DifD tag op for an operand of size sz (4, 2 or 1).  Any
-   other size is a panic. */
-static
-VgTagOp get_VgT_DifD ( Int sz )
-{
-   if (sz == 4) return VgT_DifD4;
-   if (sz == 2) return VgT_DifD2;
-   if (sz == 1) return VgT_DifD1;
-   VG_(panic)("get_VgT_DifD");
-}
-
-
-/* Select the PCast tag op which converts a shadow of size szs into
-   one of size szd.  Only the (szs,szd) pairs listed below exist;
-   anything else is reported and then panics. */
-static 
-VgTagOp get_VgT_PCast ( Int szs, Int szd )
-{
-   switch (szs) {
-      case 0:
-         switch (szd) {
-            case 1: return VgT_PCast01;
-            case 2: return VgT_PCast02;
-            case 4: return VgT_PCast04;
-            default: break;
-         }
-         break;
-      case 1:
-         switch (szd) {
-            case 0: return VgT_PCast10;
-            case 1: return VgT_PCast11;
-            case 2: return VgT_PCast12;
-            case 4: return VgT_PCast14;
-            default: break;
-         }
-         break;
-      case 2:
-         if (szd == 0) return VgT_PCast20;
-         break;
-      case 4:
-         if (szd == 0) return VgT_PCast40;
-         break;
-      default:
-         break;
-   }
-   VG_(printf)("get_VgT_PCast(%d,%d)\n", szs, szd);
-   VG_(panic)("get_VgT_PCast");
-}
-
-
-/* Select the widening tag op for a szs -> szd transition (1->2, 1->4
-   or 2->4); syned picks the signed (SWiden) rather than the unsigned
-   (ZWiden) flavour.  Any other transition is reported and then
-   panics. */
-static 
-VgTagOp get_VgT_Widen ( Bool syned, Int szs, Int szd )
-{
-   if (szs == 1 && szd == 2)
-      return syned ? VgT_SWiden12 : VgT_ZWiden12;
-
-   if (szs == 1 && szd == 4)
-      return syned ? VgT_SWiden14 : VgT_ZWiden14;
-
-   if (szs == 2 && szd == 4)
-      return syned ? VgT_SWiden24 : VgT_ZWiden24;
-
-   VG_(printf)("get_VgT_Widen(%d,%d,%d)\n", (Int)syned, szs, szd);
-   VG_(panic)("get_VgT_Widen");
-}
-
-/* Emit a TAG1 uinstr which pessimally casts the shadow in tempreg
-   from size szs to size szd.  A 0 -> 0 cast emits nothing. */
-static 
-void create_PCast ( UCodeBlock* cb, Int szs, Int szd, Int tempreg )
-{
-   if (szs != 0 || szd != 0) {
-      VgTagOp pcast_op = get_VgT_PCast(szs, szd);
-      uInstr3(cb, TAG1, 0, TempReg, tempreg, NoValue, 0, Lit16, pcast_op);
-   }
-}
-
-
-/* Emit a TAG1 uinstr which widens the shadow in tempreg from size szs
-   to size szd, signed if signed_widen is set and unsigned otherwise.
-   Equal sizes emit nothing; the only real transitions allowed are
-   1->2, 1->4 and 2->4. */
-static 
-void create_Widen ( UCodeBlock* cb, Bool signed_widen,
-                    Int szs, Int szd, Int tempreg )
-{
-   if (szs != szd) {
-      VgTagOp widen_op = get_VgT_Widen(signed_widen, szs, szd);
-      uInstr3(cb, TAG1, 0, TempReg, tempreg, NoValue, 0, Lit16, widen_op);
-   }
-}
-
-
-/* Fetch the condition codes' shadow into a fresh shadow temp, cast it
-   up from size 0 to size sz, and return that temp. */
-static
-Int create_GETVF ( UCodeBlock* cb, Int sz )
-{
-   Int flag_shadow;
-
-   flag_shadow = newShadow(cb);
-   uInstr1(cb, GETVF, 0, TempReg, flag_shadow);
-   create_PCast(cb, 0, sz, flag_shadow);
-   return flag_shadow;
-}
-
-
-/* Store the shadow in tempreg as the condition codes' shadow.  A
-   size-0 shadow is stored directly; otherwise it is first copied to
-   a scratch shadow and cast down to size 0, leaving tempreg itself
-   untouched. */
-static
-void create_PUTVF ( UCodeBlock* cb, Int sz, Int tempreg )
-{
-   Int scratch;
-
-   if (sz == 0) {
-      uInstr1(cb, PUTVF, 0, TempReg, tempreg);
-      return;
-   }
-
-   scratch = newShadow(cb);
-   uInstr2(cb, MOV, 4, TempReg, tempreg, TempReg, scratch);
-   create_PCast(cb, sz, 0, scratch);
-   uInstr1(cb, PUTVF, 0, TempReg, scratch);
-}
-
-
-/* Emit a TAG1 uinstr applying the size-sz Left op to the shadow in
-   tempreg. */
-static 
-void create_Left ( UCodeBlock* cb, Int sz, Int tempreg )
-{
-   VgTagOp left_op = get_VgT_Left(sz);
-   uInstr3(cb, TAG1, 0, TempReg, tempreg, NoValue, 0, Lit16, left_op);
-}
-
-
-/* Emit a TAG2 uinstr computing the size-sz UifU of ts and td; the
-   result lands in td. */
-static 
-void create_UifU ( UCodeBlock* cb, Int sz, Int ts, Int td )
-{
-   VgTagOp uifu_op = get_VgT_UifU(sz);
-   uInstr3(cb, TAG2, 0, TempReg, ts, TempReg, td, Lit16, uifu_op);
-}
-
-
-/* Emit a TAG2 uinstr computing the size-sz DifD of ts and td; the
-   result lands in td. */
-static 
-void create_DifD ( UCodeBlock* cb, Int sz, Int ts, Int td )
-{
-   VgTagOp difd_op = get_VgT_DifD(sz);
-   uInstr3(cb, TAG2, 0, TempReg, ts, TempReg, td, Lit16, difd_op);
-}
-
-
-/* Emit a TAG2 uinstr applying the size-sz ImproveAND_TQ op (HelpAND)
-   to value tval and tag tqqq; the result lands in tqqq. */
-static 
-void create_ImproveAND_TQ ( UCodeBlock* cb, Int sz, Int tval, Int tqqq )
-{
-   VgTagOp improve_op = get_VgT_ImproveAND_TQ(sz);
-   uInstr3(cb, TAG2, 0, TempReg, tval, TempReg, tqqq, Lit16, improve_op);
-}
-
-
-/* Emit a TAG2 uinstr applying the size-sz ImproveOR_TQ op (HelpOR)
-   to value tval and tag tqqq; the result lands in tqqq. */
-static 
-void create_ImproveOR_TQ ( UCodeBlock* cb, Int sz, Int tval, Int tqqq )
-{
-   VgTagOp improve_op = get_VgT_ImproveOR_TQ(sz);
-   uInstr3(cb, TAG2, 0, TempReg, tval, TempReg, tqqq, Lit16, improve_op);
-}
-
-
-/* Emit code which places the shadow of the operand (tag, val) in a
-   brand new shadow temp, and return that temp.  Because the result
-   is always a private copy, the caller may modify it without
-   trashing the original shadow.  Handles TempReg, Literal and ArchReg
-   operands; any other tag is a panic. */
-static
-Int /* TempReg */ getOperandShadow ( UCodeBlock* cb, 
-                                     Int sz, Int tag, Int val )
-{
-   Int copy = newShadow(cb);
-
-   switch (tag) {
-      case TempReg:
-         /* Copy the temp's existing shadow. */
-         uInstr2(cb, MOV, 4, TempReg, SHADOW(val), TempReg, copy);
-         return copy;
-      case Literal:
-         uInstr1(cb, SETV, sz, TempReg, copy);
-         return copy;
-      case ArchReg:
-         uInstr2(cb, GETV, sz, ArchReg, val, TempReg, copy);
-         return copy;
-      default:
-         VG_(panic)("getOperandShadow");
-   }
-}
-
-
-
-/* Create and return an instrumented version of cb_in.  Free cb_in
-   before returning. */
-static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
-{
-   UCodeBlock* cb;
-   Int         i, j;
-   UInstr*     u_in;
-   Int         qs, qd, qt, qtt;
-   cb = VG_(allocCodeBlock)();
-   cb->nextTemp = cb_in->nextTemp;
-
-   for (i = 0; i < cb_in->used; i++) {
-      qs = qd = qt = qtt = INVALID_TEMPREG;
-      u_in = &cb_in->instrs[i];
-
-      /* if (i > 0) uInstr1(cb, NOP, 0, NoValue, 0); */
-
-      /* VG_(ppUInstr)(0, u_in); */
-      switch (u_in->opcode) {
-
-         case NOP:
-            break;
-
-         case INCEIP:
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* Loads and stores.  Test the V bits for the address.  24
-            Mar 02: since the address is A-checked anyway, there's not
-            really much point in doing the V-check too, unless you
-            think that you might use addresses which are undefined but
-            still addressible.  Hence the optionalisation of the V
-            check.
-
-            The LOADV/STOREV does an addressibility check for the
-            address. */
-
-         case LOAD: 
-            if (VG_(clo_check_addrVs)) {
-               uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1));
-               uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val1));
-            }
-            uInstr2(cb, LOADV, u_in->size, 
-                        TempReg, u_in->val1,
-                        TempReg, SHADOW(u_in->val2));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-         case STORE:
-            if (VG_(clo_check_addrVs)) {
-               uInstr1(cb, TESTV,  4, TempReg, SHADOW(u_in->val2));
-               uInstr1(cb, SETV,   4, TempReg, SHADOW(u_in->val2));
-            }
-            uInstr2(cb, STOREV, u_in->size,
-                        TempReg, SHADOW(u_in->val1), 
-                        TempReg, u_in->val2);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* Moving stuff around.  Make the V bits follow accordingly,
-            but don't do anything else.  */
-
-         case GET:
-            uInstr2(cb, GETV, u_in->size,
-                        ArchReg, u_in->val1,
-                        TempReg, SHADOW(u_in->val2));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-         case PUT:
-            uInstr2(cb, PUTV, u_in->size, 
-                        TempReg, SHADOW(u_in->val1),
-                        ArchReg, u_in->val2);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         case GETF:
-            /* This is not the smartest way to do it, but should work. */
-            qd = create_GETVF(cb, u_in->size);
-            uInstr2(cb, MOV, 4, TempReg, qd, TempReg, SHADOW(u_in->val1));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-         case PUTF:
-            create_PUTVF(cb, u_in->size, SHADOW(u_in->val1));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         case MOV:
-            switch (u_in->tag1) {
-               case TempReg: 
-                  uInstr2(cb, MOV, 4,
-                              TempReg, SHADOW(u_in->val1),
-                              TempReg, SHADOW(u_in->val2));
-                  break;
-               case Literal: 
-                  uInstr1(cb, SETV, u_in->size, 
-                              TempReg, SHADOW(u_in->val2));
-                  break;
-               default: 
-                  VG_(panic)("vg_instrument: MOV");
-            }
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* Special case of add, where one of the operands is a literal.
-            lea1(t) = t + some literal.
-            Therefore: lea1#(qa) = left(qa) 
-         */
-         case LEA1:
-            vg_assert(u_in->size == 4 && !VG_(anyFlagUse)(u_in));
-            qs = SHADOW(u_in->val1);
-            qd = SHADOW(u_in->val2);
-            uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qd);
-            create_Left(cb, u_in->size, qd);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* Another form of add.  
-            lea2(ts,tt,shift) = ts + (tt << shift); shift is a literal
-                                and is 0,1,2 or 3.
-            lea2#(qs,qt) = left(qs `UifU` (qt << shift)).
-            Note, subtly, that the shift puts zeroes at the bottom of qt,
-            meaning Valid, since the corresponding shift of tt puts 
-            zeroes at the bottom of tb.
-         */
-         case LEA2: {
-            Int shift;
-            vg_assert(u_in->size == 4 && !VG_(anyFlagUse)(u_in));
-            switch (u_in->extra4b) {
-               case 1: shift = 0; break;
-               case 2: shift = 1; break;
-               case 4: shift = 2; break;
-               case 8: shift = 3; break;
-               default: VG_(panic)( "vg_instrument(LEA2)" );
-            }
-            qs = SHADOW(u_in->val1);
-            qt = SHADOW(u_in->val2);
-            qd = SHADOW(u_in->val3);
-            uInstr2(cb, MOV, 4, TempReg, qt, TempReg, qd);
-            if (shift > 0) {
-               uInstr2(cb, SHL, 4, Literal, 0, TempReg, qd);
-               uLiteral(cb, shift);
-            }
-            create_UifU(cb, 4, qs, qd);
-            create_Left(cb, u_in->size, qd);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-         }
-
-         /* inc#/dec#(qd) = q `UifU` left(qd) = left(qd) */
-         case INC: case DEC:
-            qd = SHADOW(u_in->val1);
-            create_Left(cb, u_in->size, qd);
-            if (u_in->flags_w != FlagsEmpty)
-               create_PUTVF(cb, u_in->size, qd);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* This is a HACK (approximation :-) */
-         /* rcl#/rcr#(qs,qd) 
-               = let q0 = pcast-sz-0(qd) `UifU` pcast-sz-0(qs) `UifU` eflags#
-                 eflags# = q0
-                 qd =pcast-0-sz(q0)
-            Ie, cast everything down to a single bit, then back up.
-            This assumes that any bad bits infect the whole word and 
-            the eflags.
-         */
-         case RCL: case RCR:
-	    vg_assert(u_in->flags_r != FlagsEmpty);
-            /* The following assertion looks like it makes sense, but is
-               actually wrong.  Consider this:
-                  rcll    %eax
-                  imull   %eax, %eax
-               The rcll writes O and C but so does the imull, so the O and C 
-               write of the rcll is annulled by the prior improvement pass.
-               Noticed by Kevin Ryde <user42@zip.com.au>
-            */
-	    /* vg_assert(u_in->flags_w != FlagsEmpty); */
-            qs = getOperandShadow(cb, u_in->size, u_in->tag1, u_in->val1);
-            /* We can safely modify qs; cast it to 0-size. */
-            create_PCast(cb, u_in->size, 0, qs);
-            qd = SHADOW(u_in->val2);
-            create_PCast(cb, u_in->size, 0, qd);
-            /* qs is cast-to-0(shift count#), and qd is cast-to-0(value#). */
-            create_UifU(cb, 0, qs, qd);
-            /* qs is now free; reuse it for the flag definedness. */
-            qs = create_GETVF(cb, 0);
-            create_UifU(cb, 0, qs, qd);
-            create_PUTVF(cb, 0, qd);
-            create_PCast(cb, 0, u_in->size, qd);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* for OP in shl shr sar rol ror
-            (qs is shift count#, qd is value to be OP#d)
-            OP(ts,td)
-            OP#(qs,qd)
-               = pcast-1-sz(qs) `UifU` OP(ts,qd)
-            So we apply OP to the tag bits too, and then UifU with
-            the shift count# to take account of the possibility of it
-            being undefined.
-            
-            A bit subtle:
-               ROL/ROR rearrange the tag bits as per the value bits.
-               SHL/SHR shifts zeroes into the value, and corresponding 
-                  zeroes indicating Definedness into the tag.
-               SAR copies the top bit of the value downwards, and therefore
-                  SAR also copies the definedness of the top bit too.
-            So in all five cases, we just apply the same op to the tag 
-            bits as is applied to the value bits.  Neat!
-         */
-         case SHL:
-         case SHR: case SAR:
-         case ROL: case ROR: {
-            Int t_amount = INVALID_TEMPREG;
-            vg_assert(u_in->tag1 == TempReg || u_in->tag1 == Literal);
-            vg_assert(u_in->tag2 == TempReg);
-            qd = SHADOW(u_in->val2);
-
-            /* Make qs hold shift-count# and make
-               t_amount be a TempReg holding the shift count. */
-            if (u_in->tag1 == Literal) {
-               t_amount = newTemp(cb);
-               uInstr2(cb, MOV, 4, Literal, 0, TempReg, t_amount);
-               uLiteral(cb, u_in->lit32);
-               qs = SHADOW(t_amount);
-               uInstr1(cb, SETV, 1, TempReg, qs);
-            } else {
-               t_amount = u_in->val1;
-               qs = SHADOW(u_in->val1);
-            }
-
-            uInstr2(cb, u_in->opcode, 
-                        u_in->size, 
-                        TempReg, t_amount, 
-                        TempReg, qd);
-            qt = newShadow(cb);
-            uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qt);
-            create_PCast(cb, 1, u_in->size, qt);
-            create_UifU(cb, u_in->size, qt, qd);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-         }
-
-         /* One simple tag operation. */
-         case WIDEN:
-            vg_assert(u_in->tag1 == TempReg);
-            create_Widen(cb, u_in->signed_widen, u_in->extra4b, u_in->size, 
-                             SHADOW(u_in->val1));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* not#(x) = x (since bitwise independent) */
-         case NOT:
-            vg_assert(u_in->tag1 == TempReg);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* neg#(x) = left(x) (derivable from case for SUB) */
-         case NEG:
-            vg_assert(u_in->tag1 == TempReg);
-            create_Left(cb, u_in->size, SHADOW(u_in->val1));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* bswap#(x) = bswap(x) */
-         case BSWAP:
-            vg_assert(u_in->tag1 == TempReg);
-            vg_assert(u_in->size == 4);
-            qd = SHADOW(u_in->val1);
-            uInstr1(cb, BSWAP, 4, TempReg, qd);
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* cc2val#(qd) = pcast-0-to-size(eflags#) */
-         case CC2VAL:
-            vg_assert(u_in->tag1 == TempReg);
-            vg_assert(u_in->flags_r != FlagsEmpty);
-            qt = create_GETVF(cb, u_in->size);
-            uInstr2(cb, MOV, 4, TempReg, qt, TempReg, SHADOW(u_in->val1));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* cmov#(qs,qd) = cmov(qs,qd)
-            That is, do the cmov of tags using the same flags as for
-            the data (obviously).  However, first do a test on the 
-            validity of the flags.
-         */
-         case CMOV:
-            vg_assert(u_in->size == 4);
-            vg_assert(u_in->tag1 == TempReg);
-            vg_assert(u_in->tag2 == TempReg);
-            vg_assert(u_in->flags_r != FlagsEmpty);
-            vg_assert(u_in->flags_w == FlagsEmpty);
-            qs = SHADOW(u_in->val1);
-            qd = SHADOW(u_in->val2);
-            qt = create_GETVF(cb, 0);
-            uInstr1(cb, TESTV, 0, TempReg, qt);
-            /* qt should never be referred to again.  Nevertheless
-               ... */
-            uInstr1(cb, SETV, 0, TempReg, qt);
-
-            uInstr2(cb, CMOV, 4, TempReg, qs, TempReg, qd);
-            LAST_UINSTR(cb).cond    = u_in->cond;
-            LAST_UINSTR(cb).flags_r = u_in->flags_r;
-
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* add#/sub#(qs,qd) 
-               = qs `UifU` qd `UifU` left(qs) `UifU` left(qd)
-               = left(qs) `UifU` left(qd)
-               = left(qs `UifU` qd)
-            adc#/sbb#(qs,qd)
-               = left(qs `UifU` qd) `UifU` pcast(eflags#)
-            Second arg (dest) is TempReg.
-            First arg (src) is Literal or TempReg or ArchReg. 
-         */
-         case ADD: case SUB:
-         case ADC: case SBB:
-            qd = SHADOW(u_in->val2);
-            qs = getOperandShadow(cb, u_in->size, u_in->tag1, u_in->val1);
-            create_UifU(cb, u_in->size, qs, qd);
-            create_Left(cb, u_in->size, qd);
-            if (u_in->opcode == ADC || u_in->opcode == SBB) {
-               vg_assert(u_in->flags_r != FlagsEmpty);
-               qt = create_GETVF(cb, u_in->size);
-               create_UifU(cb, u_in->size, qt, qd);
-            }
-            if (u_in->flags_w != FlagsEmpty) {
-               create_PUTVF(cb, u_in->size, qd);
-            }
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* xor#(qs,qd) = qs `UifU` qd */
-         case XOR:
-            qd = SHADOW(u_in->val2);
-            qs = getOperandShadow(cb, u_in->size, u_in->tag1, u_in->val1);
-            create_UifU(cb, u_in->size, qs, qd);
-            if (u_in->flags_w != FlagsEmpty) {
-               create_PUTVF(cb, u_in->size, qd);
-            }
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* and#/or#(qs,qd) 
-               = (qs `UifU` qd) `DifD` improve(vs,qs) 
-                                `DifD` improve(vd,qd)
-            where improve is the relevant one of
-                Improve{AND,OR}_TQ
-            Use the following steps, with qt as a temp:
-               qt = improve(vd,qd)
-               qd = qs `UifU` qd
-               qd = qt `DifD` qd
-               qt = improve(vs,qs)
-               qd = qt `DifD` qd
-         */
-         case AND: case OR:
-            vg_assert(u_in->tag1 == TempReg);
-            vg_assert(u_in->tag2 == TempReg);
-            qd = SHADOW(u_in->val2);
-            qs = SHADOW(u_in->val1);
-            qt = newShadow(cb);
-
-            /* qt = improve(vd,qd) */
-            uInstr2(cb, MOV, 4, TempReg, qd, TempReg, qt);
-            if (u_in->opcode == AND)
-               create_ImproveAND_TQ(cb, u_in->size, u_in->val2, qt);
-            else
-               create_ImproveOR_TQ(cb, u_in->size, u_in->val2, qt);
-            /* qd = qs `UifU` qd */
-            create_UifU(cb, u_in->size, qs, qd);
-            /* qd = qt `DifD` qd */
-            create_DifD(cb, u_in->size, qt, qd);
-            /* qt = improve(vs,qs) */
-            uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qt);
-            if (u_in->opcode == AND)
-               create_ImproveAND_TQ(cb, u_in->size, u_in->val1, qt);
-            else
-               create_ImproveOR_TQ(cb, u_in->size, u_in->val1, qt);
-            /* qd = qt `DifD` qd */
-               create_DifD(cb, u_in->size, qt, qd);
-            /* So, finally qd is the result tag. */
-            if (u_in->flags_w != FlagsEmpty) {
-               create_PUTVF(cb, u_in->size, qd);
-            }
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* Machinery to do with supporting CALLM.  Copy the start and
-            end markers only to make the result easier to read
-            (debug); they generate no code and have no effect. 
-         */
-         case CALLM_S: case CALLM_E:
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* Copy PUSH and POP verbatim.  Arg/result absval
-            calculations are done when the associated CALL is
-            processed.  CLEAR has no effect on absval calculations but
-            needs to be copied.  
-         */
-         case PUSH: case POP: case CLEAR:
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* In short:
-               callm#(a1# ... an#) = (a1# `UifU` ... `UifU` an#)
-            We have to decide on a size to do the computation at,
-            although the choice doesn't affect correctness.  We will
-            do a pcast to the final size anyway, so the only important
-            factor is to choose a size which minimises the total
-            number of casts needed.  Valgrind: just use size 0,
-            regardless.  It may not be very good for performance
-            but does simplify matters, mainly by reducing the number
-            of different pessimising casts which have to be implemented.
-         */
-         case CALLM: {
-            UInstr* uu;
-            Bool res_used;
-
-            /* Now generate the code.  Get the final result absval
-               into qt. */
-            qt  = newShadow(cb);
-            qtt = newShadow(cb);
-            uInstr1(cb, SETV, 0, TempReg, qt);
-            for (j = i-1; cb_in->instrs[j].opcode != CALLM_S; j--) {
-               uu = & cb_in->instrs[j];
-               if (uu->opcode != PUSH) continue;
-               /* cast via a temporary */
-               uInstr2(cb, MOV, 4, TempReg, SHADOW(uu->val1),
-                                   TempReg, qtt);
-               create_PCast(cb, uu->size, 0, qtt);
-               create_UifU(cb, 0, qtt, qt);
-            }
-            /* Remembering also that flags read count as inputs. */
-            if (u_in->flags_r != FlagsEmpty) {
-               qtt = create_GETVF(cb, 0);
-               create_UifU(cb, 0, qtt, qt);
-            }
-
-            /* qt now holds the result tag.  If any results from the
-               call are used, either by fetching with POP or
-               implicitly by writing the flags, we copy the result
-               absval to the relevant location.  If not used, the call
-               must have been for its side effects, so we test qt here
-               and now.  Note that this assumes that all values
-               removed by POP continue to be live.  So dead args
-               *must* be removed with CLEAR, not by POPping them into
-               a dummy tempreg. 
-            */
-            res_used = False;
-            for (j = i+1; cb_in->instrs[j].opcode != CALLM_E; j++) {
-               uu = & cb_in->instrs[j];
-               if (uu->opcode != POP) continue;
-               /* Cast via a temp. */
-               uInstr2(cb, MOV, 4, TempReg, qt, TempReg, qtt);
-               create_PCast(cb, 0, uu->size, qtt);
-               uInstr2(cb, MOV, 4, TempReg, qtt, 
-                                   TempReg, SHADOW(uu->val1));
-               res_used = True;
-            }
-            if (u_in->flags_w != FlagsEmpty) {
-               create_PUTVF(cb, 0, qt);
-               res_used = True;
-            }
-            if (!res_used) {
-               uInstr1(cb, TESTV, 0, TempReg, qt);
-               /* qt should never be referred to again.  Nevertheless
-                  ... */
-               uInstr1(cb, SETV, 0, TempReg, qt);
-            }
-            VG_(copyUInstr)(cb, u_in);
-            break;
-         }
-         /* Whew ... */
-
-         case JMP:
-            if (u_in->tag1 == TempReg) {
-               uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1));
-               uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val1));
-            } else {
-               vg_assert(u_in->tag1 == Literal);
-            }
-            if (u_in->cond != CondAlways) {
-               vg_assert(u_in->flags_r != FlagsEmpty);
-               qt = create_GETVF(cb, 0);
-               uInstr1(cb, TESTV, 0, TempReg, qt);
-               /* qt should never be referred to again.  Nevertheless
-                  ... */
-               uInstr1(cb, SETV, 0, TempReg, qt);
-            }
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         case JIFZ:
-            uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1));
-            uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val1));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* Emit a check on the address used.  For FPU_R, the value
-            loaded into the FPU is checked at the time it is read from
-            memory (see synth_fpu_mem_check_actions).  */
-         case FPU_R: case FPU_W:
-            vg_assert(u_in->tag2 == TempReg);
-            uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val2));
-            uInstr1(cb, SETV,  4, TempReg, SHADOW(u_in->val2));
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         /* For FPU insns not referencing memory, just copy thru. */
-         case FPU: 
-            VG_(copyUInstr)(cb, u_in);
-            break;
-
-         default:
-            VG_(ppUInstr)(0, u_in);
-            VG_(panic)( "vg_instrument: unhandled case");
-
-      } /* end of switch (u_in->opcode) */
-
-   } /* end of for loop */
-
-   VG_(freeCodeBlock)(cb_in);
-   return cb;
-}
-
-/*------------------------------------------------------------*/
-/*--- Clean up mem check instrumentation.                  ---*/
-/*------------------------------------------------------------*/
-
-/* True iff `tempreg` names a shadow temp, i.e. is odd-numbered.  The
-   argument is parenthesised so that an expression argument such as
-   VGC_IS_SHADOW(a+b) groups as intended. */
-#define VGC_IS_SHADOW(tempreg) (((tempreg) % 2) == 1)
-
-/* Sentinel values stored in the def[] array of
-   vg_propagate_definedness.  Both are deliberately larger than 4, so
-   the test "def[t] <= 4" means "t is a shadow known to be fully
-   defined at size def[t]".
-      VGC_UNDEF: shadow temp, possibly undefined.
-      VGC_VALUE: value (non-shadow) temp. */
-#define VGC_UNDEF ((UChar)100)
-#define VGC_VALUE ((UChar)101)
-
-/* Turn the uinstr pointed to by `uu` into a NOP, silently. */
-#define NOP_no_msg(uu)                                         \
-   do { (uu)->opcode = NOP; } while (False)
-
-/* Turn the TAG1 uinstr pointed to by `uu` into a NOP and, when
-   disassembly output is enabled, say which tag op was deleted.
-   NB: expects an Int `i` (the index of the uinstr) to be in scope at
-   the point of use; it is only used for the message. */
-#define NOP_tag1_op(uu)                                        \
-   do { (uu)->opcode = NOP;                                    \
-        if (VG_(disassemble))                                  \
-           VG_(printf)("at %d: delete %s due to defd arg\n",   \
-                       i, VG_(nameOfTagOp)((uu)->val3));       \
-   } while (False)
-
-/* Turn the TAG1 uinstr pointed to by `uu` into a SETV of size `newsz`
-   on its first operand, clearing the second and third operand tags.
-   val3 is left alone, so the message can still name the original tag
-   op.  NB: expects an Int `i` to be in scope, as for NOP_tag1_op. */
-#define SETV_tag1_op(uu,newsz)                                 \
-   do { (uu)->opcode = SETV;                                   \
-        (uu)->size = (newsz);                                  \
-        (uu)->tag2 = (uu)->tag3 = NoValue;                     \
-        if (VG_(disassemble))                                  \
-           VG_(printf)("at %d: convert %s to SETV%d "          \
-                       "due to defd arg\n",                    \
-                       i, VG_(nameOfTagOp)((uu)->val3),        \
-                       (newsz));                               \
-   } while (False)
-
-
-
-/* Run backwards and delete SETVs on shadow temps for which the next
-   action is a write.  Needs an env saying whether or not the next
-   action is a write.  The supplied UCodeBlock is destructively
-   modified.
-
-   The env is next_is_write[], indexed by temp number: True means
-   that, looking forwards from the current position, the first thing
-   that happens to the temp is a write (or nothing at all), so any
-   value put in it here is dead.  "Deleting" a uinstr means turning it
-   into a NOP of size 0 in place; the instrs array is not compacted.
-
-   As well as SETVs, this pass also removes dead MOVs into shadow
-   temps and, only when VG_(clo_check_addrVs) is off, dead GETVs,
-   TAG1s and TAG2s.
-*/
-static void vg_delete_redundant_SETVs ( UCodeBlock* cb )
-{
-   Bool*   next_is_write;
-   Int     i, j, k, n_temps;
-   UInstr* u;
-   TempUse tempUse[3];
-
-   n_temps = cb->nextTemp;
-   if (n_temps == 0) return;
-
-   next_is_write = VG_(jitmalloc)(n_temps * sizeof(Bool));
-
-   /* At the end of the block nothing is read any more, so every temp
-      starts out as "next action is a write", ie dead. */
-   for (i = 0; i < n_temps; i++) next_is_write[i] = True;
-
-   for (i = cb->used-1; i >= 0; i--) {
-      u = &cb->instrs[i];
-
-      /* If we're not checking address V bits, there will be a lot of
-         GETVs, TAG1s and TAG2s calculating values which are never
-         used.  These first three cases get rid of them. */
-
-      /* Note that a deleted uinstr does not update next_is_write[]:
-         once it is a NOP it neither reads nor writes anything. */
-      if (u->opcode == GETV && VGC_IS_SHADOW(u->val2) 
-                            && next_is_write[u->val2]
-                            && !VG_(clo_check_addrVs)) {
-         u->opcode = NOP;
-         u->size = 0;
-         if (VG_(disassemble)) 
-            VG_(printf)("at %d: delete GETV\n", i);
-      } else
-
-      if (u->opcode == TAG1 && VGC_IS_SHADOW(u->val1) 
-                            && next_is_write[u->val1]
-                            && !VG_(clo_check_addrVs)) {
-         u->opcode = NOP;
-         u->size = 0;
-         if (VG_(disassemble)) 
-            VG_(printf)("at %d: delete TAG1\n", i);
-      } else
-
-      if (u->opcode == TAG2 && VGC_IS_SHADOW(u->val2) 
-                            && next_is_write[u->val2]
-                            && !VG_(clo_check_addrVs)) {
-         u->opcode = NOP;
-         u->size = 0;
-         if (VG_(disassemble)) 
-            VG_(printf)("at %d: delete TAG2\n", i);
-      } else
-
-      /* We do the rest of these regardless of whether or not
-         addresses are V-checked. */
-
-      if (u->opcode == MOV && VGC_IS_SHADOW(u->val2) 
-                           && next_is_write[u->val2]) {
-         /* This MOV is pointless because the target is dead at this
-            point.  Delete it. */
-         u->opcode = NOP;
-         u->size = 0;
-         if (VG_(disassemble)) 
-            VG_(printf)("at %d: delete MOV\n", i);
-      } else
-
-      if (u->opcode == SETV) {
-         /* A SETV whose operand is not a TempReg is left alone and
-            does not touch next_is_write[]. */
-         if (u->tag1 == TempReg) {
-            vg_assert(VGC_IS_SHADOW(u->val1));
-            if (next_is_write[u->val1]) {
-               /* This write is pointless, so annul it. */
-               u->opcode = NOP;
-               u->size = 0;
-               if (VG_(disassemble)) 
-                  VG_(printf)("at %d: delete SETV\n", i);
-            } else {
-               /* This write has a purpose; don't annul it, but do
-                  notice that we did it. */
-               next_is_write[u->val1] = True;
-            }
-              
-         }
-
-      } else {
-         /* Find out what this insn does to the temps. */
-         k = getTempUsage(u, &tempUse[0]);
-         vg_assert(k <= 3);
-         /* The entries are applied last-to-first, so when the same
-            temp appears more than once the lowest-indexed entry wins.
-            NOTE(review): presumably getTempUsage lists reads before
-            writes, so that a read-then-write temp ends up marked as
-            "next action is a read" -- confirm against getTempUsage. */
-         for (j = k-1; j >= 0; j--) {
-            next_is_write[ tempUse[j].tempNo ]
-                         = tempUse[j].isWrite;
-         }
-      }
-
-   }
-
-   VG_(jitfree)(next_is_write);
-}
-
-
-/* Run forwards, propagating and using the is-completely-defined
-   property.  This removes a lot of redundant tag-munging code.
-   Unfortunately it requires intimate knowledge of how each uinstr and
-   tagop modifies its arguments.  This duplicates knowledge of uinstr
-   tempreg uses embodied in getTempUsage(), which is unfortunate. 
-   The supplied UCodeBlock* is modified in-place.
-
-   For each value temp, def[] should hold VGC_VALUE.
-
-   For each shadow temp, def[] may hold 4,2,1 or 0 iff that shadow is
-   definitely known to be fully defined at that size.  In all other
-   circumstances a shadow's def[] entry is VGC_UNDEF, meaning possibly
-   undefined.  In cases of doubt, VGC_UNDEF is always safe.
-
-   Since VGC_UNDEF and VGC_VALUE are both greater than 4, the test
-   "def[t] <= 4" is used throughout to mean "t is known to be fully
-   defined".
-
-   Rewrites performed when an operand is known to be defined:
-      TESTV        -> NOP
-      PUTV, STOREV -> operand replaced by a literal
-      UifU         -> NOP (arg1 defined) or MOV (arg2 defined)
-      ImproveAND   -> MOV (arg2 defined)
-      TAG1 ops     -> NOP (size unchanged) or SETV (size changed)
-   Anything not specifically handled falls to the `unhandled' code,
-   which conservatively resets def[] for every temp the uinstr writes.
-*/
-static void vg_propagate_definedness ( UCodeBlock* cb )
-{
-   UChar*  def;
-   Int     i, j, k, t, n_temps;
-   UInstr* u;
-   TempUse tempUse[3];
-
-   n_temps = cb->nextTemp;
-   if (n_temps == 0) return;
-
-   def = VG_(jitmalloc)(n_temps * sizeof(UChar));
-   /* Initially nothing is known about any shadow temp. */
-   for (i = 0; i < n_temps; i++) 
-      def[i] = VGC_IS_SHADOW(i) ? VGC_UNDEF : VGC_VALUE;
-
-   /* Run forwards, detecting and using the all-defined property. */
-
-   for (i = 0; i < cb->used; i++) {
-      u = &cb->instrs[i];
-      switch (u->opcode) {
-
-      /* Tag-handling uinstrs. */
-
-         /* Deal with these quickly. */
-         case NOP:
-         case INCEIP:
-            break;
-
-         /* Make a tag defined. */
-         case SETV:
-            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
-            def[u->val1] = u->size;
-            break;
-
-         /* Check definedness of a tag. */
-         case TESTV:
-            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
-            if (def[u->val1] <= 4) { 
-               /* Known defined, so the test cannot fail; drop it. */
-               vg_assert(def[u->val1] == u->size); 
-               NOP_no_msg(u);
-               if (VG_(disassemble)) 
-                  VG_(printf)("at %d: delete TESTV on defd arg\n", i);
-            }
-            break;
-
-         /* Applies to both values and tags.  Propagate Definedness
-            property through copies.  Note that this isn't optional;
-            we *have* to do this to keep def[] correct. */
-         case MOV:
-            vg_assert(u->tag2 == TempReg);
-            if (u->tag1 == TempReg) {
-               if (VGC_IS_SHADOW(u->val1)) {
-                  vg_assert(VGC_IS_SHADOW(u->val2));
-                  def[u->val2] = def[u->val1];
-               }
-            }
-            break;
-
-         case PUTV:
-            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
-            if (def[u->val1] <= 4) {
-               /* The shadow being written is known to be defined, so
-                  replace the TempReg operand with a literal. */
-               vg_assert(def[u->val1] == u->size);
-               u->tag1 = Literal;
-               u->val1 = 0;
-               switch (u->size) {
-                  case 4: u->lit32 = 0x00000000; break;
-                  case 2: u->lit32 = 0xFFFF0000; break;
-                  case 1: u->lit32 = 0xFFFFFF00; break;
-                  default: VG_(panic)("vg_cleanup(PUTV)");
-               }
-               if (VG_(disassemble)) 
-                  VG_(printf)(
-                     "at %d: propagate definedness into PUTV\n", i);
-            }
-            break;
-
-         case STOREV:
-            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
-            if (def[u->val1] <= 4) {
-               /* Same literal substitution as for PUTV above. */
-               vg_assert(def[u->val1] == u->size);
-               u->tag1 = Literal;
-               u->val1 = 0;
-               switch (u->size) {
-                  case 4: u->lit32 = 0x00000000; break;
-                  case 2: u->lit32 = 0xFFFF0000; break;
-                  case 1: u->lit32 = 0xFFFFFF00; break;
-                  default: VG_(panic)("vg_cleanup(STOREV)");
-               }
-               if (VG_(disassemble)) 
-                  VG_(printf)(
-                     "at %d: propagate definedness into STandV\n", i);
-            }
-            break;
-
-         /* Nothing interesting we can do with this, I think. */
-         case PUTVF:
-            break;
-
-         /* Tag handling operations. */
-         case TAG2:
-            vg_assert(u->tag2 == TempReg && VGC_IS_SHADOW(u->val2));
-            vg_assert(u->tag3 == Lit16);
-            /* Ultra-paranoid "type" checking. */
-            switch (u->val3) {
-               case VgT_ImproveAND4_TQ: case VgT_ImproveAND2_TQ:
-               case VgT_ImproveAND1_TQ: case VgT_ImproveOR4_TQ:
-               case VgT_ImproveOR2_TQ: case VgT_ImproveOR1_TQ:
-                  vg_assert(u->tag1 == TempReg && !VGC_IS_SHADOW(u->val1));
-                  break;
-               default:
-                  vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
-                  break;
-            }
-            /* Tag ops not listed here (including ImproveAND2_TQ and
-               all the ImproveOR ops) go to `unhandled'. */
-            switch (u->val3) {
-               Int sz;
-               case VgT_UifU4: 
-                  sz = 4; goto do_UifU;
-               case VgT_UifU2: 
-                  sz = 2; goto do_UifU;
-               case VgT_UifU1:
-                  sz = 1; goto do_UifU;
-               case VgT_UifU0:
-                  sz = 0; goto do_UifU;
-               do_UifU:
-                  vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
-                  vg_assert(u->tag2 == TempReg && VGC_IS_SHADOW(u->val2));
-                  if (def[u->val1] <= 4) {
-                     /* UifU.  The first arg is defined, so result is
-                        simply second arg.  Delete this operation. */
-                     vg_assert(def[u->val1] == sz);
-                     NOP_no_msg(u);
-                     if (VG_(disassemble)) 
-                        VG_(printf)(
-                           "at %d: delete UifU%d due to defd arg1\n", 
-                           i, sz);
-                  }
-                  else 
-                  if (def[u->val2] <= 4) {
-                     /* UifU.  The second arg is defined, so result is
-                        simply first arg.  Copy to second. */
-                     vg_assert(def[u->val2] == sz);
-                     u->opcode = MOV; 
-                     u->size = 4;
-                     u->tag3 = NoValue;
-                     def[u->val2] = def[u->val1];
-                     if (VG_(disassemble)) 
-                        VG_(printf)(
-                           "at %d: change UifU%d to MOV due to defd"
-                           " arg2\n", 
-                           i, sz);
-                  }
-                  break;
-               case VgT_ImproveAND4_TQ:
-                  sz = 4; goto do_ImproveAND;
-               case VgT_ImproveAND1_TQ:
-                  sz = 1; goto do_ImproveAND;
-               do_ImproveAND:
-                  /* Implements Q = T OR Q.  So if Q is entirely defined,
-                     ie all 0s, we get MOV T, Q. */
-		  if (def[u->val2] <= 4) {
-                     vg_assert(def[u->val2] == sz);
-                     u->size = 4; /* Regardless of sz */
-                     u->opcode = MOV;
-                     u->tag3 = NoValue;
-                     /* Q now holds a copy of the value T, about which
-                        nothing is known. */
-                     def[u->val2] = VGC_UNDEF;
-                     if (VG_(disassemble)) 
-                        VG_(printf)(
-                            "at %d: change ImproveAND%d_TQ to MOV due "
-                            "to defd arg2\n", 
-                            i, sz);
-                  }
-                  break;
-               default: 
-                  goto unhandled;
-            }
-            break;
-
-         case TAG1:
-            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
-            if (def[u->val1] > 4) break;
-            /* We now know that the arg to the op is entirely defined.
-               If the op changes the size of the arg, we must replace
-               it with a SETV at the new size.  If it doesn't change
-               the size, we can delete it completely. */
-            switch (u->val3) {
-               /* Maintain the same size ... */
-               case VgT_Left4: 
-                  vg_assert(def[u->val1] == 4);
-                  NOP_tag1_op(u);
-                  break;
-               case VgT_PCast11: 
-                  vg_assert(def[u->val1] == 1);
-                  NOP_tag1_op(u);
-                  break;
-               /* Change size ... */
-               case VgT_PCast40: 
-                  vg_assert(def[u->val1] == 4);
-                  SETV_tag1_op(u,0);
-                  def[u->val1] = 0;
-                  break;
-               case VgT_PCast14: 
-                  vg_assert(def[u->val1] == 1);
-                  SETV_tag1_op(u,4);
-                  def[u->val1] = 4;
-                  break;
-               case VgT_PCast12: 
-                  vg_assert(def[u->val1] == 1);
-                  SETV_tag1_op(u,2);
-                  def[u->val1] = 2;
-                  break;
-               case VgT_PCast10: 
-                  vg_assert(def[u->val1] == 1);
-                  SETV_tag1_op(u,0);
-                  def[u->val1] = 0;
-                  break;
-               case VgT_PCast02: 
-                  vg_assert(def[u->val1] == 0);
-                  SETV_tag1_op(u,2);
-                  def[u->val1] = 2;
-                  break;
-               default: 
-                  goto unhandled;
-            }
-            /* No further message here: NOP_tag1_op / SETV_tag1_op
-               have already reported exactly what was done.  An extra
-               "delete TAG1" line would duplicate the former and
-               contradict the latter. */
-            break;
-
-         default:
-         unhandled:
-            /* We don't know how to handle this uinstr.  Be safe, and 
-               set to VGC_VALUE or VGC_UNDEF all temps written by it. */
-            k = getTempUsage(u, &tempUse[0]);
-            vg_assert(k <= 3);
-            for (j = 0; j < k; j++) {
-               t = tempUse[j].tempNo;
-               vg_assert(t >= 0 && t < n_temps);
-               if (!tempUse[j].isWrite) {
-                  /* t is read; ignore it. */
-                  /* (Debug printing, disabled by the leading 0&&.) */
-                  if (0&& VGC_IS_SHADOW(t) && def[t] <= 4)
-                     VG_(printf)("ignoring def %d at %s %s\n", 
-                                 def[t], 
-                                 VG_(nameUOpcode)(True, u->opcode),
-                                 (u->opcode == TAG1 || u->opcode == TAG2)
-                                    ? VG_(nameOfTagOp)(u->val3) 
-                                    : (Char*)"");
-               } else {
-                  /* t is written; better nullify it. */
-                  def[t] = VGC_IS_SHADOW(t) ? VGC_UNDEF : VGC_VALUE;
-               }
-            }
-      }
-   }
-
-   VG_(jitfree)(def);
-}
-
-
-/* Top level post-instrumentation cleanup function.  Runs the forward
-   definedness-propagation pass first, then the backward pass which
-   deletes redundant SETVs (and other dead writes to shadow temps).
-   Both passes modify cb in place. */
-static void vg_cleanup ( UCodeBlock* cb )
-{
-   vg_propagate_definedness ( cb );
-   vg_delete_redundant_SETVs ( cb );
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Main entry point for the JITter.                     ---*/
-/*------------------------------------------------------------*/
-
-/* Translate the basic block beginning at orig_addr, placing the
-   translation in a VG_(jitmalloc)'d block, the address and size of
-   which are returned in trans_addr and trans_size.  Length of the
-   original block is also returned in orig_size.  If any of the latter
-   three is NULL, this call is being done for debugging purposes, in
-   which case (a) throw away the translation once it is made, and (b)
-   produce a load of debugging output. 
-
-   Stages, in order: disassemble to ucode; improve (if
-   VG_(clo_optimise)); instrument and optionally clean up (if
-   VG_(clo_instrument) / VG_(clo_cleanup)); add cache simulation code
-   (if VG_(clo_cachesim)); allocate registers; emit final code.
-*/
-void VG_(translate) ( ThreadState* tst, 
-                         /* Identity of thread needing this block */
-                      Addr  orig_addr,
-                      UInt* orig_size,
-                      Addr* trans_addr,
-                      UInt* trans_size )
-{
-   Int         n_disassembled_bytes, final_code_size;
-   Bool        debugging_translation;
-   UChar*      final_code;
-   UCodeBlock* cb;
-
-   VGP_PUSHCC(VgpTranslate);
-   debugging_translation
-      = orig_size == NULL || trans_addr == NULL || trans_size == NULL;
-
-   /* Debugging output is wanted exactly when this is a debugging
-      translation. */
-   dis = debugging_translation;
-
-   /* Check if we're being asked to jump to a silly address, and if so
-      record an error message before potentially crashing the entire
-      system. */
-   if (VG_(clo_instrument) && !debugging_translation && !dis) {
-      Addr bad_addr;
-      Bool ok = VGM_(check_readable) ( orig_addr, 1, &bad_addr );
-      if (!ok) {
-         VG_(record_jump_error)(tst, bad_addr);
-      }
-   }
-
-   /* if (VG_(overall_in_count) >= 4800) dis=True; */
-   if (VG_(disassemble))
-      VG_(printf)("\n");
-   if (0 || dis 
-       || (VG_(overall_in_count) > 0 &&
-           (VG_(overall_in_count) % 1000 == 0))) {
-      /* Progress message, currently disabled by the leading 0&&.  The
-         argument list matches the four conversions in the format. */
-      if (0&& (VG_(clo_verbosity) > 1 || dis))
-         VG_(message)(Vg_UserMsg,
-              "trans# %d, bb# %lu, in %d, out %d",
-              VG_(overall_in_count), 
-              VG_(bbs_done),
-              VG_(overall_in_osize), VG_(overall_in_tsize) );
-   }
-   cb = VG_(allocCodeBlock)();
-
-   /* Disassemble this basic block into cb. */
-   /* VGP_PUSHCC(VgpToUCode); */
-   n_disassembled_bytes = VG_(disBB) ( cb, orig_addr );
-   /* VGP_POPCC; */
-   /* dis=True; */
-   /* if (0&& VG_(translations_done) < 617)  */
-   /*    dis=False; */
-   /* Try and improve the code a bit. */
-   if (VG_(clo_optimise)) {
-      /* VGP_PUSHCC(VgpImprove); */
-      vg_improve ( cb );
-      if (VG_(disassemble)) 
-         VG_(ppUCodeBlock) ( cb, "Improved code:" );
-      /* VGP_POPCC; */
-   }
-   /* dis=False; */
-   /* Add instrumentation code. */
-   if (VG_(clo_instrument)) {
-      /* VGP_PUSHCC(VgpInstrument); */
-      /* vg_instrument frees the block it is given and returns a new
-         one, hence the reassignment. */
-      cb = vg_instrument(cb);
-      /* VGP_POPCC; */
-      if (VG_(disassemble)) 
-         VG_(ppUCodeBlock) ( cb, "Instrumented code:" );
-      if (VG_(clo_cleanup)) {
-         /* VGP_PUSHCC(VgpCleanup); */
-         vg_cleanup(cb);
-         /* VGP_POPCC; */
-         if (VG_(disassemble)) 
-            VG_(ppUCodeBlock) ( cb, "Cleaned-up instrumented code:" );
-      }
-   }
-
-   //VG_(disassemble) = True;
-
-   /* Add cache simulation code. */
-   if (VG_(clo_cachesim)) {
-      /* VGP_PUSHCC(VgpCacheInstrument); */
-      cb = VG_(cachesim_instrument)(cb, orig_addr);
-      /* VGP_POPCC; */
-      if (VG_(disassemble)) 
-         VG_(ppUCodeBlock) ( cb, "Cachesim instrumented code:" );
-   }
-   
-   //VG_(disassemble) = False;
-   
-   /* Allocate registers. */
-   /* VGP_PUSHCC(VgpRegAlloc); */
-   cb = vg_do_register_allocation ( cb );
-   /* VGP_POPCC; */
-   /* dis=False; */
-   /* 
-   if (VG_(disassemble))
-      VG_(ppUCodeBlock) ( cb, "After Register Allocation:");
-   */
-
-   /* VGP_PUSHCC(VgpFromUcode); */
-   /* NB final_code is allocated with VG_(jitmalloc), not VG_(malloc)
-      and so must be VG_(jitfree)'d. */
-   final_code = VG_(emit_code)(cb, &final_code_size );
-   /* VGP_POPCC; */
-   VG_(freeCodeBlock)(cb);
-
-   if (debugging_translation) {
-      /* Only done for debugging -- throw away final result. */
-      VG_(jitfree)(final_code);
-   } else {
-      /* Doing it for real -- return values to caller. */
-      *orig_size = n_disassembled_bytes;
-      *trans_addr = (Addr)final_code;
-      *trans_size = final_code_size;
-   }
-   VGP_POPCC;
-}
-
-/*--------------------------------------------------------------------*/
-/*--- end                                           vg_translate.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_transtab.c b/coregrind/vg_transtab.c
deleted file mode 100644
index a6e15b3053..0000000000
--- a/coregrind/vg_transtab.c
+++ /dev/null
@@ -1,566 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Management of the translation table and cache.               ---*/
-/*---                                                vg_transtab.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-#include "vg_constants.h"
-
-/* #define DEBUG_TRANSTAB */
-
-
-/*------------------------------------------------------------*/
-/*--- Management of the LRU-based translation table+cache. ---*/
-/*------------------------------------------------------------*/
-
-/* These sizes were set up so as to be able to debug large KDE 3
-   applications (are there any small ones?) without excessive amounts
-   of code retranslation.  */
-
-/* Size of the translation cache, in bytes. */
-#define VG_TC_SIZE /*1000000*/ /*16000000*/ 32000000 /*40000000*/
-
-/* Do a LRU pass when the translation cache becomes this full. */
-#define VG_TC_LIMIT_PERCENT 98
-
-/* When doing an LRU pass, reduce TC fullness to this level. */
-#define VG_TC_TARGET_PERCENT 85
-
-/* Number of entries in the translation table.  This must be a prime
-   number in order to make the hashing work properly. */
-#define VG_TT_SIZE /*5281*/ /*100129*/ 200191 /*250829*/
-
-/* Do an LRU pass when the translation table becomes this full. */
-#define VG_TT_LIMIT_PERCENT /*67*/ 80
-
-/* When doing an LRU pass, reduce TT fullness to this level. */
-#define VG_TT_TARGET_PERCENT /*60*/ 70
-
-/* The number of age steps we track.  Age 0 means last used in the
-   current epoch; age N_EPOCHS-1 means N_EPOCHS-1 or more epochs ago. */
-#define VG_N_EPOCHS /*2000*/ /*4000*/ 20000
-
-/* This TT entry is empty.  There is no associated TC storage. */
-#define VG_TTE_EMPTY   ((Addr)1)
-/* This TT entry has been deleted, in the sense that it does not
-   contribute to the orig->trans mapping.  However, the ex-translation
-   it points at still occupies space in TC.  This slot cannot be
-   re-used without doing an LRU pass. */
-#define VG_TTE_DELETED ((Addr)3)
-
-/* The TC.  This used to be statically allocated, but that forces many
-   SecMap arrays to be pointlessly allocated at startup, bloating the
-   process size by about 22M and making startup slow.  So now we
-   dynamically allocate it at startup time.
-   was: static UChar vg_tc[VG_TC_SIZE];
-*/
-static UChar* vg_tc = NULL;
-
-/* Count of bytes used in the TC.  This includes those pointed to from
-   VG_TTE_DELETED entries. */
-static Int vg_tc_used = 0;
-
-/* The TT.  Like TC, for the same reason, is dynamically allocated at
-   startup. 
-   was: static TTEntry vg_tt[VG_TT_SIZE];
-*/
-static TTEntry* vg_tt = NULL;
-
-/* Count of non-empty TT entries.  This includes deleted ones. */
-static Int vg_tt_used = 0;
-
-/* Fast helper for the TT.  A direct-mapped cache which holds a
-   pointer to a TT entry which may or may not be the correct one, but
-   which we hope usually is.  This array is referred to directly from
-   vg_dispatch.S. */
-Addr VG_(tt_fast)[VG_TT_FAST_SIZE];
-
-/* For reading/writing the (possibly misaligned) TT-index word which
-   immediately precedes every translation in TC. */
-#if 0
-   /* Big sigh.  However reasonable this seems, there are those who
-      set AC in %EFLAGS (Alignment Check) to 1, causing bus errors.  A
-      proper solution is for valgrind to properly virtualise AC, like
-      the other flags (DOSZACP).  The current cheap hack simply avoids
-      all misaligned accesses, so valgrind doesn't fault even if AC is
-      set. */
-#  define VG_READ_MISALIGNED_WORD(aaa) (*((UInt*)(aaa)))
-#  define VG_WRITE_MISALIGNED_WORD(aaa,vvv) *((UInt*)(aaa)) = ((UInt)(vvv))
-#else
-  static __inline__
-  UInt VG_READ_MISALIGNED_WORD ( Addr aaa )
-  {
-     /* Byte-by-byte little-endian load, avoiding a misaligned access. */
-     UChar* bytes = (UChar*)aaa;
-     UInt   b0    = 0xFF & (UInt)bytes[0];
-     UInt   b1    = 0xFF & (UInt)bytes[1];
-     UInt   b2    = 0xFF & (UInt)bytes[2];
-     UInt   b3    = 0xFF & (UInt)bytes[3];
-     return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
-  }
-
-  static __inline__
-  void VG_WRITE_MISALIGNED_WORD ( Addr aaa, UInt vvv )
-  {
-     /* Byte-by-byte little-endian store, low byte first. */
-     UChar* dst = (UChar*)aaa;
-     Int    k;
-     for (k = 0; k < 4; k++, vvv >>= 8)
-        dst[k] = (UChar)(vvv & 0xFF);
-  }
-#endif
-
-
-/* Used for figuring out an age threshold for translations. */
-static Int vg_bytes_in_epoch[VG_N_EPOCHS];
-static Int vg_entries_in_epoch[VG_N_EPOCHS];
-
-
-/* Report the current TT and TC occupancy counters (vg_tt_used and
-   vg_tc_used), so that they need not be made globally visible. */
-void VG_(get_tt_tc_used) ( UInt* tt_used, UInt* tc_used )
-{
-   *tt_used = vg_tt_used;
-   *tc_used = vg_tc_used;
-}
-
-
-/* Do the LRU thing on TT/TC: if either is over its threshold limit,
-   discard the least recently used translations until both TT and TC
-   are back within their target limits, then compact TC. */
-void VG_(maybe_do_lru_pass) ( void )
-{
-   Int i, j, r, w, thresh, ttno;
-   TTEntry* tte;
-
-   const Int tc_limit  = (Int)(((double)VG_TC_SIZE * (double)VG_TC_LIMIT_PERCENT)
-                                / (double)100.0);
-   const Int tt_limit  = (Int)(((double)VG_TT_SIZE * (double)VG_TT_LIMIT_PERCENT)
-                                / (double)100.0);
-   const Int tc_target = (Int)(((double)VG_TC_SIZE * (double)VG_TC_TARGET_PERCENT)
-                                / (double)100.0);
-   const Int tt_target = (Int)(((double)VG_TT_SIZE * (double)VG_TT_TARGET_PERCENT)
-                                / (double)100.0);
-
-   /* Decide quickly whether we need to do an LRU pass at all. */
-   if (vg_tc_used <= tc_limit && vg_tt_used <= tt_limit)
-      return;
-
-#  ifdef DEBUG_TRANSTAB
-   VG_(sanity_check_tc_tt)();
-#  endif
-
-   VGP_PUSHCC(VgpDoLRU);
-   /*   
-   VG_(printf)(
-      "limits: tc_limit %d, tt_limit %d, tc_target %d, tt_target %d\n",
-      tc_limit, tt_limit, tc_target, tt_target);
-   */
-
-   if (VG_(clo_verbosity) > 2)
-      VG_(printf)(" pre-LRU: tc %d (target %d),  tt %d (target %d)\n",
-	          vg_tc_used, tc_target, vg_tt_used, tt_target);
-
-   /* Yes we do.  Figure out what threshold age is required in order to
-      shrink both the TC and TT occupancy below TC_TARGET_PERCENT and
-      TT_TARGET_PERCENT respectively. */
-
-   VG_(number_of_lrus)++;
-
-   /* Count the number of TC bytes and TT entries in each epoch. */
-   for (i = 0; i < VG_N_EPOCHS; i++)
-      vg_bytes_in_epoch[i] = vg_entries_in_epoch[i] = 0;
-
-   for (i = 0; i < VG_TT_SIZE; i++) {
-      if (vg_tt[i].orig_addr == VG_TTE_EMPTY 
-          || vg_tt[i].orig_addr == VG_TTE_DELETED) 
-            continue;
-      j = vg_tt[i].mru_epoch;
-      vg_assert(j <= VG_(current_epoch));
-      j = VG_(current_epoch) - j;
-      if (j >= VG_N_EPOCHS) j = VG_N_EPOCHS-1;
-      vg_assert(0 <= j && j < VG_N_EPOCHS);
-      /* j is now an age; greater j means older. */
-      vg_entries_in_epoch[j]++;
-      vg_bytes_in_epoch[j] += 4+vg_tt[i].trans_size;
-   }
-
-   /*
-   for (i = 0; i < VG_N_EPOCHS; i++)
-      VG_(printf)("epoch %d: ents %d, bytes %d\n", 
-                  i, vg_entries_in_epoch[i], vg_bytes_in_epoch[i]);
-   */
-
-   /* Cumulatise.  Make vg_{bytes,entries}_in_epoch[n] contain the
-      counts for itself and all younger epochs. */
-   for (i = 1; i < VG_N_EPOCHS; i++) {
-      vg_entries_in_epoch[i] += vg_entries_in_epoch[i-1];
-      vg_bytes_in_epoch[i] += vg_bytes_in_epoch[i-1];
-   }
-
-   for (thresh = 0; thresh < VG_N_EPOCHS; thresh++) {
-      if (vg_entries_in_epoch[thresh] > tt_target 
-          || vg_bytes_in_epoch[thresh] >= tc_target)
-         break;
-   }
-
-   if (VG_(clo_verbosity) > 2)
-      VG_(printf)(
-         "     LRU: discard translations %d or more epochs since last use\n",
-         thresh
-      );
-
-   thresh = VG_(current_epoch) - thresh;
-
-   /* thresh is now an absolute epoch number.  We will hit our targets
-      if we retain only entries last used after epoch thresh, so
-      traverse the TT and mark all other live entries as deleted. */
-   for (i = 0; i < VG_TT_SIZE; i++) {
-      if (vg_tt[i].orig_addr == VG_TTE_EMPTY 
-          || vg_tt[i].orig_addr == VG_TTE_DELETED) 
-         continue;
-      if (vg_tt[i].mru_epoch <= thresh) {
-         vg_tt[i].orig_addr = VG_TTE_DELETED;
-	 VG_(this_epoch_out_count) ++;
-	 VG_(this_epoch_out_osize) += vg_tt[i].orig_size;
-	 VG_(this_epoch_out_tsize) += vg_tt[i].trans_size;
-	 VG_(overall_out_count) ++;
-	 VG_(overall_out_osize) += vg_tt[i].orig_size;
-	 VG_(overall_out_tsize) += vg_tt[i].trans_size;
-      }
-   }
-
-   /* Now compact the TC, sliding live entries downwards to fill spaces
-      left by deleted entries.  In this loop, r is the offset in TC of
-      the current translation under consideration, and w is the next
-      allocation point. */
-   r = w = 0;
-   while (True) {
-      if (r >= vg_tc_used) break;
-      /* The first four bytes of every translation contain the index
-         of its TT entry.  The TT entry's .trans_addr field points at
-         the start of the code proper, not at this 4-byte index, so
-         that we don't constantly have to keep adding 4 in the main
-         lookup/dispatch loop. */
-
-      ttno = VG_READ_MISALIGNED_WORD((Addr)(&vg_tc[r]));
-      vg_assert(ttno >= 0 && ttno < VG_TT_SIZE);
-      tte = & vg_tt[ ttno ];
-      vg_assert(tte->orig_addr != VG_TTE_EMPTY);
-      if (tte->orig_addr != VG_TTE_DELETED) {
-         /* We want to keep this one alive. */
-         /* Sanity check the pointer back to TC. */
-         vg_assert(tte->trans_addr == (Addr)&vg_tc[r+4]);
-         for (i = 0; i < 4+tte->trans_size; i++)
-            vg_tc[w+i] = vg_tc[r+i];
-         tte->trans_addr = (Addr)&vg_tc[w+4];
-         w += 4+tte->trans_size;
-      } else {
-         tte->orig_addr = VG_TTE_EMPTY;
-         vg_tt_used--;
-      }
-      r += 4+tte->trans_size;
-   }
-   /* We should have traversed an exact number of translations, with
-      no slop at the end. */
-   vg_assert(w <= r);
-   vg_assert(r == vg_tc_used);
-   vg_assert(w <= r);
-   vg_assert(w <= tc_target);
-   vg_tc_used = w;
-
-   vg_assert(vg_tt_used >= 0);
-   vg_assert(vg_tt_used <= tt_target);
-
-   /* Invalidate the fast cache, since it is now out of date.  It will get
-      reconstructed incrementally when the client resumes. */
-   VG_(invalidate_tt_fast)();
-
-   if (VG_(clo_verbosity) > 2)
-      VG_(printf)("post-LRU: tc %d (target %d),  tt %d (target %d)\n",
-	          vg_tc_used, tc_target, vg_tt_used, tt_target);
-
-   if (VG_(clo_verbosity) > 1)
-      VG_(message)(Vg_UserMsg,   
-         "epoch %d (bb %luk): thresh %d, "
-         "out %d (%dk -> %dk), new TT %d, TC %dk",
-         VG_(current_epoch), 
-         VG_(bbs_done) / 1000,
-         VG_(current_epoch) - thresh, 
-         VG_(this_epoch_out_count),
-         VG_(this_epoch_out_osize) / 1000,
-         VG_(this_epoch_out_tsize) / 1000,
-         vg_tt_used, vg_tc_used / 1000
-      );
-
-   /* No deleted entries should remain after compaction; check TT/TC. */
-#  ifdef DEBUG_TRANSTAB
-   for (i = 0; i < VG_TT_SIZE; i++)
-      vg_assert(vg_tt[i].orig_addr != VG_TTE_DELETED);
-#  endif
-   VG_(sanity_check_tc_tt)();
-
-   VGP_POPCC;
-}
-
-
-/* Sanity check TT/TC: every non-empty TT entry must point into the used
-   part of TC at a back pointer naming it, and totals must match. */
-void VG_(sanity_check_tc_tt) ( void )
-{
-   Int      i, counted_entries, counted_bytes;
-   TTEntry* tte;
-   counted_entries = 0;
-   counted_bytes   = 0;
-   for (i = 0; i < VG_TT_SIZE; i++) {
-      tte = &vg_tt[i];
-      if (tte->orig_addr == VG_TTE_EMPTY) continue;
-      vg_assert(tte->mru_epoch >= 0);
-      vg_assert(tte->mru_epoch <= VG_(current_epoch));
-      counted_entries++;
-      counted_bytes += 4+tte->trans_size;
-      vg_assert(tte->trans_addr >= (Addr)&vg_tc[4]);
-      vg_assert(tte->trans_addr < (Addr)&vg_tc[vg_tc_used]);
-      vg_assert(VG_READ_MISALIGNED_WORD(tte->trans_addr-4) == i);
-   }
-   vg_assert(counted_entries == vg_tt_used);
-   vg_assert(counted_bytes == vg_tc_used);
-}
-
-
-/* Add this already-filled-in entry to the TT.  Assumes that the
-   relevant code chunk has been placed in TC, preceded by a 4-byte
-   hole, into which the back pointer (TT index) is written here.
-*/
-extern void VG_(add_to_trans_tab) ( TTEntry* tte )
-{
-   Int i;
-   /*
-   VG_(printf)("add_to_trans_tab(%d) %x %d %x %d\n",
-               vg_tt_used, tte->orig_addr, tte->orig_size, 
-               tte->trans_addr, tte->trans_size);
-   */
-   vg_assert(tte->orig_addr != VG_TTE_DELETED 
-             && tte->orig_addr != VG_TTE_EMPTY);
-   /* Hash to get initial probe point, then probe linearly. */
-   i = ((UInt)(tte->orig_addr)) % VG_TT_SIZE;
-   while (True) {
-      if (vg_tt[i].orig_addr == tte->orig_addr)
-         VG_(panic)("add_to_trans_tab: duplicate");
-      if (vg_tt[i].orig_addr == VG_TTE_EMPTY) {
-         /* Put it here, and set the back pointer. */
-         vg_tt[i] = *tte;
-         VG_WRITE_MISALIGNED_WORD(tte->trans_addr-4, i);
-         vg_tt_used++;
-         return;
-      }
-      i++;
-      if (i == VG_TT_SIZE) i = 0;
-   }
-}
-
-
-/* Append a new translation to TC.  Four bytes are skipped first, as a
-   hole for the back pointer to the TT entry, and the code is copied
-   in after them.  Returns the address in TC of the code proper (not
-   of the hole).  Panics if TC does not have enough free space. */
-Addr VG_(copy_to_transcache) ( Addr trans_addr, Int trans_size )
-{
-   Int    n;
-   UChar* src = (UChar*)trans_addr;
-   UChar* dst;
-   if (VG_TC_SIZE-vg_tc_used < 4+trans_size)
-      VG_(panic)("copy_to_transcache: not enough free space?!");
-   /* Step over the hole reserved for the back pointer. */
-   dst = &vg_tc[vg_tc_used + 4];
-   for (n = 0; n < trans_size; n++)
-      dst[n] = src[n];
-   vg_tc_used += 4 + trans_size;
-   return (Addr)dst;
-}
-
-
-/* Invalidate the tt_fast cache by aiming every slot at one TT entry
-   which is TTE_EMPTY; the first such entry in TT is used. */
-void VG_(invalidate_tt_fast)( void )
-{
-   Int i = 0, slot;
-   while (i < VG_TT_SIZE && vg_tt[i].orig_addr != VG_TTE_EMPTY)
-      i++;
-   /* TT must contain at least one empty entry to point at. */
-   vg_assert(i < VG_TT_SIZE && vg_tt[i].orig_addr == VG_TTE_EMPTY);
-   for (slot = 0; slot < VG_TT_FAST_SIZE; slot++)
-      VG_(tt_fast)[slot] = (Addr)&vg_tt[i];
-}
-
-
-/* Search TT to find the entry for the supplied original address,
-   or NULL if not found.  This routine is used when we miss in
-   VG_(tt_fast). 
-*/
-static __inline__ TTEntry* search_trans_table ( Addr orig_addr )
-{
-   //static Int queries = 0;
-   //static Int probes = 0;
-   Int i;
-   //   if (queries == 10000) {
-   //  VG_(printf)("%d queries, %d probes\n", queries, probes);
-   //  queries = probes = 0;
-   //}
-   //queries++;
-   /* Hash to get initial probe point, then probe linearly. */
-   i = ((UInt)orig_addr) % VG_TT_SIZE;
-   while (True) {
-      //probes++;
-      if (vg_tt[i].orig_addr == orig_addr)
-         return &vg_tt[i];
-      if (vg_tt[i].orig_addr == VG_TTE_EMPTY)
-         return NULL;
-      i++;
-      if (i == VG_TT_SIZE) i = 0;
-   }
-}
-
-
-/* Look up the translation address for a given (original) code
-   address.  On success the TT entry is also installed in VG_(tt_fast)
-   so that subsequent lookups are fast.  Returns zero if there is no
-   translation.  Called from vg_run_innerloop. */
-Addr VG_(search_transtab) ( Addr original_addr )
-{
-   UInt     cno;
-   TTEntry* tte;
-   VGP_PUSHCC(VgpSlowFindT);
-   tte = search_trans_table ( original_addr );
-   if (tte == NULL) {
-      /* No translation exists.  vg_run_innerloop will have to
-         request one. */
-      VGP_POPCC;
-      return (Addr)0;
-   }
-   /* Got one.  Install it in the fast cache slot for this address,
-      and stamp it with the current epoch to mark it as used. */
-   cno = (UInt)original_addr & VG_TT_FAST_MASK;
-   VG_(tt_fast)[cno] = (Addr)tte;
-   VG_(tt_fast_misses)++;
-   tte->mru_epoch = VG_(current_epoch);
-   VGP_POPCC;
-   return tte->trans_addr;
-}
-
-
-/* Invalidate translations of original code [start .. start + range - 1].
-   An empty range (range == 0) invalidates nothing.  This scans all of
-   TT, so is slow; you *really* don't want to call it very often. */
-void VG_(invalidate_translations) ( Addr start, UInt range )
-{
-   Addr  i_start, i_end, o_start, o_end;
-   UInt  out_count, out_osize, out_tsize;
-   Int   i;
-   if (range == 0) return; /* else i_end would wrap round to start-1 */
-#  ifdef DEBUG_TRANSTAB
-   VG_(sanity_check_tc_tt)();
-#  endif
-   i_start = start;
-   i_end   = start + range - 1;
-   out_count = out_osize = out_tsize = 0;
-
-   for (i = 0; i < VG_TT_SIZE; i++) {
-      if (vg_tt[i].orig_addr == VG_TTE_EMPTY
-          || vg_tt[i].orig_addr == VG_TTE_DELETED) continue;
-      o_start = vg_tt[i].orig_addr;
-      o_end = o_start + vg_tt[i].orig_size - 1;
-      if (o_end < i_start || o_start > i_end)
-         continue;
-      if (VG_(clo_cachesim))
-         VG_(cachesim_notify_discard)( & vg_tt[i] );
-      vg_tt[i].orig_addr = VG_TTE_DELETED;
-      VG_(this_epoch_out_count) ++;
-      VG_(this_epoch_out_osize) += vg_tt[i].orig_size;
-      VG_(this_epoch_out_tsize) += vg_tt[i].trans_size;
-      VG_(overall_out_count) ++;
-      VG_(overall_out_osize) += vg_tt[i].orig_size;
-      VG_(overall_out_tsize) += vg_tt[i].trans_size;
-      out_count ++;
-      out_osize += vg_tt[i].orig_size;
-      out_tsize += vg_tt[i].trans_size;
-   }
-
-   if (out_count > 0) {
-      VG_(invalidate_tt_fast)();
-      VG_(sanity_check_tc_tt)();
-#     ifdef DEBUG_TRANSTAB
-      { Addr aa;
-        for (aa = i_start; aa <= i_end; aa++)
-           vg_assert(search_trans_table ( aa ) == NULL);
-      }
-#     endif
-   }
-   /* Only report the discard when the user has asked for verbosity. */
-   if (VG_(clo_verbosity) > 1)
-      VG_(message)(Vg_UserMsg,   
-         "discard %d (%d -> %d) translations in range %p .. %p",
-         out_count, out_osize, out_tsize, i_start, i_end );
-}
-
-
-/*------------------------------------------------------------*/
-/*--- Initialisation.                                      ---*/
-/*------------------------------------------------------------*/
-
-void VG_(init_tt_tc) ( void )
-{
-   Int      slot;
-   TTEntry* tte;
-   /* Get storage for the translation cache; it must not exist yet. */
-   vg_assert(vg_tc == NULL);
-   vg_tc = VG_(get_memory_from_mmap) ( VG_TC_SIZE * sizeof(UChar), 
-                                       "trans-cache" );
-   vg_assert(vg_tc != NULL);
-
-   /* Likewise for the translation table. */
-   vg_assert(vg_tt == NULL);
-   vg_tt = VG_(get_memory_from_mmap) ( VG_TT_SIZE * sizeof(TTEntry),
-                                       "trans-table" );
-   vg_assert(vg_tt != NULL);
-
-   /* Nothing has been translated yet, so every TT entry is empty. */
-   vg_tt_used = 0;
-   for (tte = vg_tt; tte < vg_tt + VG_TT_SIZE; tte++)
-      tte->orig_addr = VG_TTE_EMPTY;
-
-   /* The fast cache holds nothing either.  Aim every slot at TT
-      entry zero, which was marked empty just above. */
-   for (slot = 0; slot < VG_TT_FAST_SIZE; slot++)
-      VG_(tt_fast)[slot] = (Addr)vg_tt;
-}
-
-/*--------------------------------------------------------------------*/
-/*--- end                                            vg_transtab.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_unsafe.h b/coregrind/vg_unsafe.h
deleted file mode 100644
index 0f726468d0..0000000000
--- a/coregrind/vg_unsafe.h
+++ /dev/null
@@ -1,91 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A header file for making sense of syscalls.  Unsafe in the   ---*/
-/*--- sense that we don't call any functions mentioned herein.     ---*/
-/*---                                                  vg_unsafe.h ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-
-/* These includes are only used for making sense of the args for
-   system calls. */
-#include <asm/unistd.h>   /* for system call numbers */
-#include <sys/mman.h>     /* for PROT_* */
-#include <sys/utsname.h>  /* for uname */
-#include <sys/time.h>     /* for struct timeval & struct timezone */
-#include <linux/net.h>    /* for the SYS_* constants */
-#include <sys/resource.h> /* for struct rlimit */
-#include <linux/shm.h>    /* for struct shmid_ds & struct ipc_perm */
-#include <sys/socket.h>   /* for struct msghdr */
-#include <sys/un.h>       /* for sockaddr_un */
-#include <net/if.h>       /* for struct ifreq et al */
-#include <net/if_arp.h>   /* for struct arpreq */
-#include <net/route.h>    /* for struct rtentry */
-#include <asm/ipc.h>      /* for struct ipc_kludge */
-#include <linux/msg.h>    /* for struct msgbuf */
-#include <linux/sem.h>    /* for struct sembuf */
-
-#include <linux/isdn.h>   /* for ISDN ioctls */
-#include <scsi/sg.h>      /* for the SG_* ioctls */
-#include <sched.h>        /* for struct sched_param */
-#include <linux/sysctl.h> /* for struct __sysctl_args */
-#include <linux/cdrom.h>  /* for cd-rom ioctls */
-
-#define __USE_LARGEFILE64
-#include <sys/stat.h>     /* for struct stat */
-#undef __USE_LARGEFILE64
-
-#include <asm/ioctls.h>   /* for stuff for dealing with ioctl :( */
-#include <sys/soundcard.h> /* for various soundcard ioctl constants :( */
-
-#ifndef GLIBC_2_1
-#  include <linux/rtc.h>   /* for RTC_* ioctls */
-#endif
-
-#include <termios.h>
-#include <pty.h>
-
-/* 2.2 stuff ... */
-#include <sys/uio.h>
-
-/* Both */
-#include <utime.h>
-#include <sys/times.h>    /* for struct tms */
-
-/* 2.0 at least, for gid_t and loff_t */
-#include <sys/types.h>
-
-#include <sys/statfs.h>
-
-#include <sys/sysinfo.h>
-
-#include <sys/poll.h>
-
-
-/*--------------------------------------------------------------------*/
-/*--- end                                              vg_unsafe.h ---*/
-/*--------------------------------------------------------------------*/
diff --git a/coregrind/vg_valgrinq_dummy.c b/coregrind/vg_valgrinq_dummy.c
deleted file mode 100644
index a0b14410ea..0000000000
--- a/coregrind/vg_valgrinq_dummy.c
+++ /dev/null
@@ -1,43 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Used to make a dummy valgrinq.so, which does nothing at all. ---*/
-/*---                                          vg_valgrinq_dummy.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-/* For the rationale behind this file, look at
-   VG_(mash_LD_PRELOAD_string) in vg_main.c. */
-
-/* Remember not to use a variable of this name in any program you want
-   to debug :-) */
-int dont_mess_with_the_RSCDS = 0;
-
-/* If you are bored, perhaps have a look at http://www.rscds.org. */
-
-/*--------------------------------------------------------------------*/
-/*--- end                                      vg_valgrinq_dummy.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/helgrind/Makefile.am b/helgrind/Makefile.am
deleted file mode 100644
index 60553ddac6..0000000000
--- a/helgrind/Makefile.am
+++ /dev/null
@@ -1,110 +0,0 @@
-SUBDIRS = demangle . docs tests
-# Compile flags; VG_LIBDIR is defined as the string form of $(libdir).
-CFLAGS = $(WERROR) -DVG_LIBDIR="\"$(libdir)"\" \
-		-Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-# Items declared with the val_ prefix below are installed into $(valdir).
-valdir = $(libdir)/valgrind
-
-LDFLAGS = -Wl,-z -Wl,initfirst
-
-INCLUDES = -I$(srcdir)/demangle
-
-bin_SCRIPTS = valgrind cachegrind vg_annotate
-
-SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
-
-val_DATA = $(SUPP_FILES) default.supp
-
-BUILT_SOURCES = default.supp
-
-default.supp: $(SUPP_FILES)
-# Convenience target: repack the 'make dist' tarball as a .tar.bz2.
-bzdist: dist
-	gunzip -c $(PACKAGE)-$(VERSION).tar.gz | bzip2 > $(PACKAGE)-$(VERSION).tar.bz2
-
-EXTRA_DIST = $(val_DATA) \
-	PATCHES_APPLIED ACKNOWLEDGEMENTS \
-	README_KDE3_FOLKS README_PACKAGERS \
-	README_MISSING_SYSCALL_OR_IOCTL TODO dosyms vg_libpthread.vs \
-	valgrind.spec valgrind.spec.in
-
-val_PROGRAMS = valgrind.so valgrinq.so libpthread.so
-
-libpthread_so_SOURCES = vg_libpthread.c vg_libpthread_unimp.c
-
-valgrinq_so_SOURCES = vg_valgrinq_dummy.c
-
-valgrind_so_SOURCES = \
-	vg_clientfuncs.c \
-	vg_scheduler.c \
-        vg_cachesim.c \
-	vg_clientmalloc.c \
-	vg_clientperms.c \
-	vg_demangle.c \
-	vg_dispatch.S \
-	vg_errcontext.c \
-	vg_execontext.c \
-	vg_from_ucode.c \
-	vg_helpers.S \
-	vg_main.c \
-	vg_malloc2.c \
-	vg_memory.c \
-	vg_messages.c \
-	vg_mylibc.c \
-	vg_procselfmaps.c \
-	vg_profile.c \
-	vg_signals.c \
-	vg_startup.S \
-	vg_symtab2.c \
-	vg_syscall_mem.c \
-	vg_syscall.S \
-	vg_to_ucode.c \
-	vg_translate.c \
-	vg_transtab.c \
-	vg_vtagops.c
-
-valgrind_so_LDADD = \
-	demangle/cp-demangle.o \
-	demangle/cplus-dem.o \
-	demangle/dyn-string.o \
-	demangle/safe-ctype.o
-
-include_HEADERS = valgrind.h
-
-noinst_HEADERS = \
-        vg_cachesim_gen.c       \
-        vg_cachesim_I1.c        \
-        vg_cachesim_D1.c        \
-        vg_cachesim_L2.c        \
-        vg_kerneliface.h        \
-        vg_include.h            \
-        vg_constants.h          \
-        vg_unsafe.h
-
-MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) 
-# These objects need non-default compiler flags, so get explicit rules.
-vg_memory.o: vg_memory.c $(MANUAL_DEPS)
-	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
-
-vg_clientfuncs.o: vg_clientfuncs.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-vg_libpthread.o: vg_libpthread.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-valgrind.so$(EXEEXT): $(valgrind_so_OBJECTS)
-	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o valgrind.so \
-		$(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-
-valgrinq.so$(EXEEXT): $(valgrinq_so_OBJECTS)
-	$(CC) $(CFLAGS) -shared -o valgrinq.so $(valgrinq_so_OBJECTS)
-
-libpthread.so$(EXEEXT): $(libpthread_so_OBJECTS) $(srcdir)/vg_libpthread.vs
-	$(CC) -Wall -Werror -g -O -shared -fpic -o libpthread.so \
-		$(libpthread_so_OBJECTS) \
-		-Wl,-version-script $(srcdir)/vg_libpthread.vs
-# Make the installed libpthread.so reachable as libpthread.so.0 as well.
-install-exec-hook:
-	$(mkinstalldirs) $(DESTDIR)$(valdir)
-	rm -f $(DESTDIR)$(valdir)/libpthread.so.0
-	$(LN_S) libpthread.so $(DESTDIR)$(valdir)/libpthread.so.0
diff --git a/include/valgrind.h b/include/valgrind.h
deleted file mode 100644
index 5a819c78ae..0000000000
--- a/include/valgrind.h
+++ /dev/null
@@ -1,243 +0,0 @@
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-
-#ifndef __VALGRIND_H
-#define __VALGRIND_H
-
-
-/* This file is for inclusion into client (your!) code.
-
-   You can use these macros to manipulate and query memory permissions
-   inside your own programs.
-
-   The resulting executables will still run without Valgrind, just a
-   little bit more slowly than they otherwise would, but otherwise
-   unchanged.  
-
-   When run on Valgrind with --client-perms=yes, Valgrind observes
-   these macro calls and takes appropriate action.  When run on
-   Valgrind with --client-perms=no (the default), Valgrind observes
-   these macro calls but does not take any action as a result.  */
-
-
-
-/* This defines the magic code sequence which the JITter spots and
-   handles magically.  %EAX is loaded with the address of _zzq_args
-   (request code, then the args) and %EDX with the default result.
-   Each rotate pair below totals 32 bits (29+3, 27+5, 13+19), so on
-   a real CPU %EAX is unchanged and the default in %EDX is returned.
-   Valgrind instead dumps the result value in %EDX.  Since %EAX
-   points to a block of mem, more args could be passed this way;
-   currently 4, the max that we appear to need (pthread_create).
-*/
-#define VALGRIND_MAGIC_SEQUENCE(                                        \
-        _zzq_rlval,   /* result lvalue */                               \
-        _zzq_default, /* result returned when running on real CPU */    \
-        _zzq_request, /* request code */                                \
-        _zzq_arg1,    /* request first param */                         \
-        _zzq_arg2,    /* request second param */                        \
-        _zzq_arg3,    /* request third param */                         \
-        _zzq_arg4     /* request fourth param */ )                      \
-                                                                        \
-  { volatile unsigned int _zzq_args[5];                                 \
-    _zzq_args[0] = (volatile unsigned int)(_zzq_request);               \
-    _zzq_args[1] = (volatile unsigned int)(_zzq_arg1);                  \
-    _zzq_args[2] = (volatile unsigned int)(_zzq_arg2);                  \
-    _zzq_args[3] = (volatile unsigned int)(_zzq_arg3);                  \
-    _zzq_args[4] = (volatile unsigned int)(_zzq_arg4);                  \
-    asm volatile("movl %1, %%eax\n\t"                                   \
-                 "movl %2, %%edx\n\t"                                   \
-                 "roll $29, %%eax ; roll $3, %%eax\n\t"                 \
-                 "rorl $27, %%eax ; rorl $5, %%eax\n\t"                 \
-                 "roll $13, %%eax ; roll $19, %%eax\n\t"                \
-                 "movl %%edx, %0\t"                                     \
-                 : "=r" (_zzq_rlval)                                    \
-                 : "r" (&_zzq_args[0]), "r" (_zzq_default)              \
-                 : "eax", "edx", "cc", "memory"                         \
-                );                                                      \
-  }
-
-
-/* Some request codes.  There are many more of these, but most are not
-   exposed to end-user view.  These are the public ones, all of the
-   form 0x1000 + small_number. 
-*/
-
-#define VG_USERREQ__MAKE_NOACCESS        0x1001
-#define VG_USERREQ__MAKE_WRITABLE        0x1002
-#define VG_USERREQ__MAKE_READABLE        0x1003
-#define VG_USERREQ__DISCARD              0x1004
-#define VG_USERREQ__CHECK_WRITABLE       0x1005
-#define VG_USERREQ__CHECK_READABLE       0x1006
-#define VG_USERREQ__MAKE_NOACCESS_STACK  0x1007
-#define VG_USERREQ__RUNNING_ON_VALGRIND  0x1008
-#define VG_USERREQ__DO_LEAK_CHECK        0x1009 /* untested */
-#define VG_USERREQ__DISCARD_TRANSLATIONS 0x100A
-
-
-/* Client-code macros to manipulate the state of memory. */
-
-/* Mark memory at _qzz_addr as unaddressable and undefined for
-   _qzz_len bytes.  Returns an int handle pertaining to the block
-   descriptions Valgrind will use in subsequent error messages. */
-#define VALGRIND_MAKE_NOACCESS(_qzz_addr,_qzz_len)               \
-   ({unsigned int _qzz_res;                                      \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0 /* default return */,    \
-                            VG_USERREQ__MAKE_NOACCESS,           \
-                            _qzz_addr, _qzz_len, 0, 0);          \
-    _qzz_res;                                                    \
-   })
-
-/* Similarly, mark memory at _qzz_addr as addressable but undefined
-   for _qzz_len bytes. */
-#define VALGRIND_MAKE_WRITABLE(_qzz_addr,_qzz_len)               \
-   ({unsigned int _qzz_res;                                      \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0 /* default return */,    \
-                            VG_USERREQ__MAKE_WRITABLE,           \
-                            _qzz_addr, _qzz_len, 0, 0);          \
-    _qzz_res;                                                    \
-   })
-
-/* Similarly, mark memory at _qzz_addr as addressable and defined
-   for _qzz_len bytes. */
-#define VALGRIND_MAKE_READABLE(_qzz_addr,_qzz_len)               \
-   ({unsigned int _qzz_res;                                      \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0 /* default return */,    \
-                            VG_USERREQ__MAKE_READABLE,           \
-                            _qzz_addr, _qzz_len, 0, 0);          \
-    _qzz_res;                                                    \
-   })
-
-/* Discard a block-description-handle obtained from the above three
-   macros.  After this, Valgrind will no longer be able to relate
-   addressing errors to the user-defined block associated with the
-   handle.  The permissions settings associated with the handle remain
-   in place.  Returns 1 for an invalid handle, 0 for a valid
-   handle. */
-#define VALGRIND_DISCARD(_qzz_blkindex)                          \
-   ({unsigned int _qzz_res;                                      \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0 /* default return */,    \
-                            VG_USERREQ__DISCARD,                 \
-                            0, _qzz_blkindex, 0, 0);             \
-    _qzz_res;                                                    \
-   })
-
-
-
-/* Client-code macros to check the state of memory. */
-
-/* Check that memory at _qzz_addr is addressable for _qzz_len bytes.
-   If suitable addressability is not established, Valgrind prints an
-   error message and returns the address of the first offending byte.
-   Otherwise it returns zero. */
-#define VALGRIND_CHECK_WRITABLE(_qzz_addr,_qzz_len)                \
-   ({unsigned int _qzz_res;                                        \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0,                           \
-                            VG_USERREQ__CHECK_WRITABLE,            \
-                            _qzz_addr, _qzz_len, 0, 0);            \
-    _qzz_res;                                                      \
-   })
-
-/* Check that memory at _qzz_addr is addressable and defined for
-   _qzz_len bytes.  If suitable addressability and definedness are not
-   established, Valgrind prints an error message and returns the
-   address of the first offending byte.  Otherwise it returns zero. */
-#define VALGRIND_CHECK_READABLE(_qzz_addr,_qzz_len)                \
-   ({unsigned int _qzz_res;                                        \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0,                           \
-                            VG_USERREQ__CHECK_READABLE,            \
-                            _qzz_addr, _qzz_len, 0, 0);            \
-    _qzz_res;                                                      \
-   })
-
-
-/* Use this macro to force the definedness and addressability of a
-   value to be checked.  If suitable addressability and definedness
-   are not established, Valgrind prints an error message.  The
-   result of the underlying VALGRIND_CHECK_READABLE is cast to
-   (void), so this macro itself yields no value. */
-#define VALGRIND_CHECK_DEFINED(__lvalue)                           \
-   (void)                                                          \
-   VALGRIND_CHECK_READABLE(                                        \
-      (volatile unsigned char *)&(__lvalue),                       \
-                      (unsigned int)(sizeof (__lvalue)))
-
-
-
-/* Mark memory, intended to be on the client's stack, at _qzz_addr as
-   unaddressable and undefined for _qzz_len bytes.  Does not return a
-   value.  The record associated with this setting will be
-   automatically removed by Valgrind when the containing routine
-   exits. */
-#define VALGRIND_MAKE_NOACCESS_STACK(_qzz_addr,_qzz_len)           \
-   {unsigned int _qzz_res;                                         \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0,                           \
-                            VG_USERREQ__MAKE_NOACCESS_STACK,       \
-                            _qzz_addr, _qzz_len, 0, 0);            \
-   }
-
-
-/* Returns 1 if running on Valgrind, 0 if running on the real CPU. 
-   Currently implemented but untested. */
-#define RUNNING_ON_VALGRIND                                        \
-   ({unsigned int _qzz_res;                                        \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0 /* returned if not */,     \
-                            VG_USERREQ__RUNNING_ON_VALGRIND,       \
-                            0, 0, 0, 0);                           \
-    _qzz_res;                                                      \
-   })
-
-
-/* Issue the VG_USERREQ__DO_LEAK_CHECK client request, which takes
-   no arguments.  Judging by the request name, this asks Valgrind
-   to run its memory leak check at this point (NOTE: confirm
-   against the request handler).  Does not return a value: the
-   request's result is stored in a local and discarded.
-
-   Currently implemented but untested.
-*/
-#define VALGRIND_DO_LEAK_CHECK                                     \
-   {unsigned int _qzz_res;                                         \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0,                           \
-                            VG_USERREQ__DO_LEAK_CHECK,             \
-                            0, 0, 0, 0);                           \
-   }
-
-
-/* Discard translation of code in the range [_qzz_addr .. _qzz_addr +
-   _qzz_len - 1].  Useful if you are debugging a JITter or some such,
-   since it provides a way to make sure valgrind will retranslate the
-   invalidated area.  Returns no value. */
-#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len)          \
-   {unsigned int _qzz_res;                                         \
-    VALGRIND_MAGIC_SEQUENCE(_qzz_res, 0,                           \
-                            VG_USERREQ__DISCARD_TRANSLATIONS,      \
-                            _qzz_addr, _qzz_len, 0, 0);            \
-   }
-
-
-#endif
diff --git a/include/vg_profile.c b/include/vg_profile.c
deleted file mode 100644
index 34e98d6e70..0000000000
--- a/include/vg_profile.c
+++ /dev/null
@@ -1,111 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Profiling machinery -- not for release builds!               ---*/
-/*---                                                 vg_profile.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, an x86 protected-mode emulator 
-   designed for debugging and profiling binaries on x86-Unixes.
-
-   Copyright (C) 2000-2002 Julian Seward 
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file LICENSE.
-*/
-
-#include "vg_include.h"
-
-#ifdef VG_PROFILE
-
-/* get rid of these, if possible */
-#include <signal.h>
-#include <sys/time.h>
-
-#define VGP_PAIR(enumname,str) str
-static const Char* vgp_names[VGP_M_CCS] = { VGP_LIST };
-#undef VGP_PAIR
-
-static Int   vgp_nticks;
-static Int   vgp_counts[VGP_M_CCS];
-static Int   vgp_entries[VGP_M_CCS];
-
-static Int   vgp_sp;
-static VgpCC vgp_stack[VGP_M_STACK];
-
-void VGP_(tick) ( int sigNo )   /* SIGPROF handler; sigNo is not used */
-{
-   Int cc;                      /* cost centre on top of vgp_stack */
-   vgp_nticks += 1;
-   cc = vgp_stack[vgp_sp];
-   vg_assert(cc >= 0 && cc < VGP_M_CCS);
-   vgp_counts[cc] += 1;         /* charge this tick to it */
-}
-
-void VGP_(init_profiling) ( void )
-{
-   struct itimerval timer;
-   Int cc;
-   /* Clear all counters; VgpUnc becomes the bottom stack entry. */
-   for (cc = 0; cc < VGP_M_CCS; cc++) {
-      vgp_entries[cc] = 0;
-      vgp_counts[cc]  = 0;
-   }
-   vgp_nticks = 0;
-   vgp_sp = -1;
-   VGP_(pushcc) ( VgpUnc );
-   /* Have VGP_(tick) run on SIGPROF every 10 * 1000 us of ITIMER_PROF time. */
-   timer.it_interval.tv_sec  = 0;
-   timer.it_interval.tv_usec = 10 * 1000;
-   timer.it_value            = timer.it_interval;
-   signal(SIGPROF, VGP_(tick) );
-   if (setitimer(ITIMER_PROF, &timer, NULL) != 0)
-      VG_(panic)("vgp_init_profiling");
-}
-
-void VGP_(done_profiling) ( void )
-{
-   /* Prints each cost centre's per-mille share of all ticks.  With no ticks
-      we divide by 1, so 0.0/0.0 (NaN) is never converted to Int (undefined). */
-   Int i, denom = vgp_nticks > 0 ? vgp_nticks : 1;
-   VG_(printf)("Profiling done, %d ticks\n", vgp_nticks);
-   for (i = 0; i < VGP_M_CCS; i++)
-      VG_(printf)("%2d: %4d (%3d %%%%) ticks,  %8d entries   for  %s\n",
-                  i, vgp_counts[i], (Int)((1000.0 * vgp_counts[i]) / denom),
-                  vgp_entries[i], vgp_names[i] );
-}
-
-void VGP_(pushcc) ( VgpCC cc )   /* enter cost centre cc */
-{
-   /* Refuse to grow past the last slot of vgp_stack. */
-   if (VGP_M_STACK-1 <= vgp_sp) VG_(panic)("vgp_pushcc");
-   vgp_stack[++vgp_sp] = cc;
-   vgp_entries[cc] += 1;
-}
-
-void VGP_(popcc) ( void )   /* leave the current cost centre */
-{
-   if (!(vgp_sp > 0)) VG_(panic)("vgp_popcc");   /* slot 0 is never popped */
-   vgp_sp -= 1;
-}
-
-#endif /* VG_PROFILE */
-
-/*--------------------------------------------------------------------*/
-/*--- end                                             vg_profile.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/lackey/Makefile.am b/lackey/Makefile.am
deleted file mode 100644
index 60553ddac6..0000000000
--- a/lackey/Makefile.am
+++ /dev/null
@@ -1,110 +0,0 @@
-SUBDIRS = demangle . docs tests
-
-CFLAGS = $(WERROR) -DVG_LIBDIR="\"$(libdir)"\" \
-		-Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-
-valdir = $(libdir)/valgrind
-
-LDFLAGS = -Wl,-z -Wl,initfirst
-
-INCLUDES = -I$(srcdir)/demangle
-
-bin_SCRIPTS = valgrind cachegrind vg_annotate
-
-SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
-
-val_DATA = $(SUPP_FILES) default.supp
-
-BUILT_SOURCES = default.supp
-
-default.supp: $(SUPP_FILES)
-
-bzdist: dist
-	gunzip -c $(PACKAGE)-$(VERSION).tar.gz | bzip2 > $(PACKAGE)-$(VERSION).tar.bz2
-
-EXTRA_DIST = $(val_DATA) \
-	PATCHES_APPLIED ACKNOWLEDGEMENTS \
-	README_KDE3_FOLKS README_PACKAGERS \
-	README_MISSING_SYSCALL_OR_IOCTL TODO dosyms vg_libpthread.vs \
-	valgrind.spec valgrind.spec.in
-
-val_PROGRAMS = valgrind.so valgrinq.so libpthread.so
-
-libpthread_so_SOURCES = vg_libpthread.c vg_libpthread_unimp.c
-
-valgrinq_so_SOURCES = vg_valgrinq_dummy.c
-
-valgrind_so_SOURCES = \
-	vg_clientfuncs.c \
-	vg_scheduler.c \
-        vg_cachesim.c \
-	vg_clientmalloc.c \
-	vg_clientperms.c \
-	vg_demangle.c \
-	vg_dispatch.S \
-	vg_errcontext.c \
-	vg_execontext.c \
-	vg_from_ucode.c \
-	vg_helpers.S \
-	vg_main.c \
-	vg_malloc2.c \
-	vg_memory.c \
-	vg_messages.c \
-	vg_mylibc.c \
-	vg_procselfmaps.c \
-	vg_profile.c \
-	vg_signals.c \
-	vg_startup.S \
-	vg_symtab2.c \
-	vg_syscall_mem.c \
-	vg_syscall.S \
-	vg_to_ucode.c \
-	vg_translate.c \
-	vg_transtab.c \
-	vg_vtagops.c
-
-valgrind_so_LDADD = \
-	demangle/cp-demangle.o \
-	demangle/cplus-dem.o \
-	demangle/dyn-string.o \
-	demangle/safe-ctype.o
-
-include_HEADERS = valgrind.h
-
-noinst_HEADERS = \
-        vg_cachesim_gen.c       \
-        vg_cachesim_I1.c        \
-        vg_cachesim_D1.c        \
-        vg_cachesim_L2.c        \
-        vg_kerneliface.h        \
-        vg_include.h            \
-        vg_constants.h          \
-        vg_unsafe.h
-
-MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) 
-
-vg_memory.o: vg_memory.c $(MANUAL_DEPS)
-	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
-
-vg_clientfuncs.o: vg_clientfuncs.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-vg_libpthread.o: vg_libpthread.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-# Link rules: output is $@ (tracks $(EXEEXT)); demangler .o's force a relink.
-valgrind.so$(EXEEXT): $(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o $@ \
-		$(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-
-valgrinq.so$(EXEEXT): $(valgrinq_so_OBJECTS)
-	$(CC) $(CFLAGS) -shared -o $@ $(valgrinq_so_OBJECTS)
-
-libpthread.so$(EXEEXT): $(libpthread_so_OBJECTS) $(srcdir)/vg_libpthread.vs
-	$(CC) -Wall -Werror -g -O -shared -fpic -o $@ $(libpthread_so_OBJECTS) \
-		-Wl,-version-script $(srcdir)/vg_libpthread.vs
-
-install-exec-hook:
-	$(mkinstalldirs) $(DESTDIR)$(valdir)
-	rm -f $(DESTDIR)$(valdir)/libpthread.so.0
-	$(LN_S) libpthread.so $(DESTDIR)$(valdir)/libpthread.so.0
diff --git a/memcheck/Makefile.am b/memcheck/Makefile.am
deleted file mode 100644
index 60553ddac6..0000000000
--- a/memcheck/Makefile.am
+++ /dev/null
@@ -1,110 +0,0 @@
-SUBDIRS = demangle . docs tests
-
-CFLAGS = $(WERROR) -DVG_LIBDIR="\"$(libdir)"\" \
-		-Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-
-valdir = $(libdir)/valgrind
-
-LDFLAGS = -Wl,-z -Wl,initfirst
-
-INCLUDES = -I$(srcdir)/demangle
-
-bin_SCRIPTS = valgrind cachegrind vg_annotate
-
-SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
-
-val_DATA = $(SUPP_FILES) default.supp
-
-BUILT_SOURCES = default.supp
-
-default.supp: $(SUPP_FILES)
-
-bzdist: dist
-	gunzip -c $(PACKAGE)-$(VERSION).tar.gz | bzip2 > $(PACKAGE)-$(VERSION).tar.bz2
-
-EXTRA_DIST = $(val_DATA) \
-	PATCHES_APPLIED ACKNOWLEDGEMENTS \
-	README_KDE3_FOLKS README_PACKAGERS \
-	README_MISSING_SYSCALL_OR_IOCTL TODO dosyms vg_libpthread.vs \
-	valgrind.spec valgrind.spec.in
-
-val_PROGRAMS = valgrind.so valgrinq.so libpthread.so
-
-libpthread_so_SOURCES = vg_libpthread.c vg_libpthread_unimp.c
-
-valgrinq_so_SOURCES = vg_valgrinq_dummy.c
-
-valgrind_so_SOURCES = \
-	vg_clientfuncs.c \
-	vg_scheduler.c \
-        vg_cachesim.c \
-	vg_clientmalloc.c \
-	vg_clientperms.c \
-	vg_demangle.c \
-	vg_dispatch.S \
-	vg_errcontext.c \
-	vg_execontext.c \
-	vg_from_ucode.c \
-	vg_helpers.S \
-	vg_main.c \
-	vg_malloc2.c \
-	vg_memory.c \
-	vg_messages.c \
-	vg_mylibc.c \
-	vg_procselfmaps.c \
-	vg_profile.c \
-	vg_signals.c \
-	vg_startup.S \
-	vg_symtab2.c \
-	vg_syscall_mem.c \
-	vg_syscall.S \
-	vg_to_ucode.c \
-	vg_translate.c \
-	vg_transtab.c \
-	vg_vtagops.c
-
-valgrind_so_LDADD = \
-	demangle/cp-demangle.o \
-	demangle/cplus-dem.o \
-	demangle/dyn-string.o \
-	demangle/safe-ctype.o
-
-include_HEADERS = valgrind.h
-
-noinst_HEADERS = \
-        vg_cachesim_gen.c       \
-        vg_cachesim_I1.c        \
-        vg_cachesim_D1.c        \
-        vg_cachesim_L2.c        \
-        vg_kerneliface.h        \
-        vg_include.h            \
-        vg_constants.h          \
-        vg_unsafe.h
-
-MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) 
-
-vg_memory.o: vg_memory.c $(MANUAL_DEPS)
-	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
-
-vg_clientfuncs.o: vg_clientfuncs.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-vg_libpthread.o: vg_libpthread.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-# Link rules: output is $@ (tracks $(EXEEXT)); demangler .o's force a relink.
-valgrind.so$(EXEEXT): $(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o $@ \
-		$(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-
-valgrinq.so$(EXEEXT): $(valgrinq_so_OBJECTS)
-	$(CC) $(CFLAGS) -shared -o $@ $(valgrinq_so_OBJECTS)
-
-libpthread.so$(EXEEXT): $(libpthread_so_OBJECTS) $(srcdir)/vg_libpthread.vs
-	$(CC) -Wall -Werror -g -O -shared -fpic -o $@ $(libpthread_so_OBJECTS) \
-		-Wl,-version-script $(srcdir)/vg_libpthread.vs
-
-install-exec-hook:
-	$(mkinstalldirs) $(DESTDIR)$(valdir)
-	rm -f $(DESTDIR)$(valdir)/libpthread.so.0
-	$(LN_S) libpthread.so $(DESTDIR)$(valdir)/libpthread.so.0
diff --git a/memcheck/docs/Makefile.am b/memcheck/docs/Makefile.am
deleted file mode 100644
index e8a58fa18e..0000000000
--- a/memcheck/docs/Makefile.am
+++ /dev/null
@@ -1,5 +0,0 @@
-docdir = $(datadir)/doc/valgrind
-
-doc_DATA = index.html manual.html nav.html techdocs.html
-
-EXTRA_DIST = $(doc_DATA)
diff --git a/memcheck/docs/index.html b/memcheck/docs/index.html
deleted file mode 100644
index 1111702565..0000000000
--- a/memcheck/docs/index.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<html>
-
-<head>
-  <meta http-equiv="Content-Type"     
-        content="text/html; charset=iso-8859-1">
-  <meta http-equiv="Content-Language" content="en-gb">
-  <meta name="generator" 
-        content="Mozilla/4.76 (X11; U; Linux 2.4.1-0.1.9 i586) [Netscape]">
-  <meta name="author" content="Julian Seward <jseward@acm.org>">
-  <meta name="description" content="say what this prog does">
-  <meta name="keywords" content="Valgrind, memory checker, x86, GPL">
-  <title>Valgrind's user manual</title>
-</head>
-
-<frameset cols="150,*">
-  <frame name="nav" target="main" src="nav.html">
-  <frame name="main" src="manual.html" scrolling="auto">
-  <noframes>
-    <body>
-     <p>This page uses frames, but your browser doesn't support them.</p>
-    </body>
-  </noframes>
-</frameset>
-
-</html>
diff --git a/memcheck/docs/manual.html b/memcheck/docs/manual.html
deleted file mode 100644
index b715ee3dfe..0000000000
--- a/memcheck/docs/manual.html
+++ /dev/null
@@ -1,2702 +0,0 @@
-<html>
-  <head>
-    <style type="text/css">
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-    <title>Valgrind</title>
-  </head>
-
-<body bgcolor="#ffffff">
-
-<a name="title">&nbsp;</a>
-<h1 align=center>Valgrind, version 1.0.0</h1>
-<center>This manual was last updated on 20020726</center>
-<p>
-
-<center>
-<a href="mailto:jseward@acm.org">jseward@acm.org</a><br>
-Copyright &copy; 2000-2002 Julian Seward
-<p>
-Valgrind is licensed under the GNU General Public License, 
-version 2<br>
-An open-source tool for finding memory-management problems in
-Linux-x86 executables.
-</center>
-
-<p>
-
-<hr width="100%">
-<a name="contents"></a>
-<h2>Contents of this manual</h2>
-
-<h4>1&nbsp; <a href="#intro">Introduction</a></h4>
-    1.1&nbsp; <a href="#whatfor">What Valgrind is for</a><br>
-    1.2&nbsp; <a href="#whatdoes">What it does with your program</a>
-
-<h4>2&nbsp; <a href="#howtouse">How to use it, and how to make sense 
-    of the results</a></h4>
-    2.1&nbsp; <a href="#starta">Getting started</a><br>
-    2.2&nbsp; <a href="#comment">The commentary</a><br>
-    2.3&nbsp; <a href="#report">Reporting of errors</a><br>
-    2.4&nbsp; <a href="#suppress">Suppressing errors</a><br>
-    2.5&nbsp; <a href="#flags">Command-line flags</a><br>
-    2.6&nbsp; <a href="#errormsgs">Explanation of error messages</a><br>
-    2.7&nbsp; <a href="#suppfiles">Writing suppressions files</a><br>
-    2.8&nbsp; <a href="#clientreq">The Client Request mechanism</a><br>
-    2.9&nbsp; <a href="#pthreads">Support for POSIX pthreads</a><br>
-    2.10&nbsp; <a href="#install">Building and installing</a><br>
-    2.11&nbsp; <a href="#problems">If you have problems</a><br>
-
-<h4>3&nbsp; <a href="#machine">Details of the checking machinery</a></h4>
-    3.1&nbsp; <a href="#vvalue">Valid-value (V) bits</a><br>
-    3.2&nbsp; <a href="#vaddress">Valid-address (A)&nbsp;bits</a><br>
-    3.3&nbsp; <a href="#together">Putting it all together</a><br>
-    3.4&nbsp; <a href="#signals">Signals</a><br>
-    3.5&nbsp; <a href="#leaks">Memory leak detection</a><br>
-
-<h4>4&nbsp; <a href="#limits">Limitations</a></h4>
-
-<h4>5&nbsp; <a href="#howitworks">How it works -- a rough overview</a></h4>
-    5.1&nbsp; <a href="#startb">Getting started</a><br>
-    5.2&nbsp; <a href="#engine">The translation/instrumentation engine</a><br>
-    5.3&nbsp; <a href="#track">Tracking the status of memory</a><br>
-    5.4&nbsp; <a href="#sys_calls">System calls</a><br>
-    5.5&nbsp; <a href="#sys_signals">Signals</a><br>
-
-<h4>6&nbsp; <a href="#example">An example</a></h4>
-
-<h4>7&nbsp; <a href="#cache">Cache profiling</a></h4>
-
-<h4>8&nbsp; <a href="techdocs.html">The design and implementation of Valgrind</a></h4>
-
-<hr width="100%">
-
-<a name="intro"></a>
-<h2>1&nbsp; Introduction</h2>
-
-<a name="whatfor"></a>
-<h3>1.1&nbsp; What Valgrind is for</h3>
-
-Valgrind is a tool to help you find memory-management problems in your
-programs. When a program is run under Valgrind's supervision, all
-reads and writes of memory are checked, and calls to
-malloc/new/free/delete are intercepted. As a result, Valgrind can
-detect problems such as:
-<ul>
-  <li>Use of uninitialised memory</li>
-  <li>Reading/writing memory after it has been free'd</li>
-  <li>Reading/writing off the end of malloc'd blocks</li>
-  <li>Reading/writing inappropriate areas on the stack</li>
-  <li>Memory leaks -- where pointers to malloc'd blocks are lost
-  forever</li>
-  <li>Mismatched use of malloc/new/new [] vs free/delete/delete
-  []</li>
-  <li>Some misuses of the POSIX pthreads API</li>
-</ul>
-
-Problems like these can be difficult to find by other means, often
-lying undetected for long periods, then causing occasional,
-difficult-to-diagnose crashes.
-
-<p>
-Valgrind is closely tied to details of the CPU, operating system and,
-to a lesser extent, compiler and basic C libraries. This makes it
-difficult to make it portable, so I have chosen at the outset to
-concentrate on what I believe to be a widely used platform: Linux on
-x86s.  Valgrind uses the standard Unix <code>./configure</code>,
-<code>make</code>, <code>make install</code> mechanism, and I have
-attempted to ensure that it works on machines with kernel 2.2 or 2.4
-and glibc 2.1.X or 2.2.X.  This should cover the vast majority of
-modern Linux installations.
-
-
-<p>
-Valgrind is licensed under the GNU General Public License, version
-2. Read the file LICENSE in the source distribution for details.  Some
-of the PThreads test cases, <code>test/pth_*.c</code>, are taken from
-"Pthreads Programming" by Bradford Nichols, Dick Buttlar &amp; Jacqueline
-Proulx Farrell, ISBN 1-56592-115-1, published by O'Reilly &amp;
-Associates, Inc.
-
-
-<a name="whatdoes"></a>
-<h3>1.2&nbsp; What it does with your program</h3>
-
-Valgrind is designed to be as non-intrusive as possible. It works
-directly with existing executables. You don't need to recompile,
-relink, or otherwise modify, the program to be checked. Simply place
-the word <code>valgrind</code> at the start of the command line
-normally used to run the program. So, for example, if you want to run
-the command <code>ls -l</code> on Valgrind, simply issue the
-command: <code>valgrind ls -l</code>.
-
-<p>Valgrind takes control of your program before it starts. Debugging
-information is read from the executable and associated libraries, so
-that error messages can be phrased in terms of source code
-locations. Your program is then run on a synthetic x86 CPU which
-checks every memory access. All detected errors are written to a
-log. When the program finishes, Valgrind searches for and reports on
-leaked memory.
-
-<p>You can run pretty much any dynamically linked ELF x86 executable
-using Valgrind. Programs run 25 to 50 times slower, and take a lot
-more memory, than they usually would. It works well enough to run
-large programs. For example, the Konqueror web browser from the KDE
-Desktop Environment, version 3.0, runs slowly but usably on Valgrind.
-
-<p>Valgrind simulates every single instruction your program executes.
-Because of this, it finds errors not only in your application but also
-in all supporting dynamically-linked (<code>.so</code>-format)
-libraries, including the GNU C library, the X client libraries, Qt, if
-you work with KDE, and so on. That often includes libraries, for
-example the GNU C library, which contain memory access violations, but
-which you cannot or do not want to fix.
-
-<p>Rather than swamping you with errors in which you are not
-interested, Valgrind allows you to selectively suppress errors, by
-recording them in a suppressions file which is read when Valgrind
-starts up.  The build mechanism attempts to select suppressions which
-give reasonable behaviour for the libc and XFree86 versions detected
-on your machine.
-
-
-<p><a href="#example">Section 6</a> shows an example of use.
-<p>
-<hr width="100%">
-
-<a name="howtouse"></a>
-<h2>2&nbsp; How to use it, and how to make sense of the results</h2>
-
-<a name="starta"></a>
-<h3>2.1&nbsp; Getting started</h3>
-
-First off, consider whether it might be beneficial to recompile your
-application and supporting libraries with optimisation disabled and
-debugging info enabled (the <code>-g</code> flag).  You don't have to
-do this, but doing so helps Valgrind produce more accurate and less
-confusing error reports.  Chances are you're set up like this already,
-if you intended to debug your program with GNU gdb, or some other
-debugger.  
-
-<p>
-A plausible compromise is to use <code>-g -O</code>.
-Optimisation levels above <code>-O</code> have been observed, on very
-rare occasions, to cause gcc to generate code which fools Valgrind's
-error tracking machinery into wrongly reporting uninitialised value
-errors.  <code>-O</code> gets you the vast majority of the benefits of
-higher optimisation levels anyway, so you don't lose much there.
-
-<p>
-Valgrind understands both the older "stabs" debugging format, used by
-gcc versions prior to 3.1, and the newer DWARF2 format used by gcc 3.1
-and later.
-
-<p>
-Then just run your application, but place the word
-<code>valgrind</code> in front of your usual command-line invocation.
-Note that you should run the real (machine-code) executable here.  If
-your application is started by, for example, a shell or perl script,
-you'll need to modify it to invoke Valgrind on the real executables.
-Running such scripts directly under Valgrind will result in you
-getting error reports pertaining to <code>/bin/sh</code>,
-<code>/usr/bin/perl</code>, or whatever interpreter you're using.
-This almost certainly isn't what you want and can be confusing.
-
-<a name="comment"></a>
-<h3>2.2&nbsp; The commentary</h3>
-
-Valgrind writes a commentary, detailing error reports and other
-significant events.  The commentary goes to standard error by
-default.  This may interfere with your program, so you can ask for it
-to be directed elsewhere.
-
-<p>All lines in the commentary are of the following form:<br>
-<pre>
-  ==12345== some-message-from-Valgrind
-</pre>
-<p>The <code>12345</code>  is the process ID.  This scheme makes it easy
-to distinguish program output from Valgrind commentary, and also easy
-to differentiate commentaries from different processes which have
-become merged together, for whatever reason.
-
-<p>By default, Valgrind writes only essential messages to the commentary,
-so as to avoid flooding you with information of secondary importance.
-If you want more information about what is happening, re-run, passing
-the <code>-v</code> flag to Valgrind.
-
-
-<a name="report"></a>
-<h3>2.3&nbsp; Reporting of errors</h3>
-
-When Valgrind detects something bad happening in the program, an error
-message is written to the commentary.  For example:<br>
-<pre>
-  ==25832== Invalid read of size 4
-  ==25832==    at 0x8048724: BandMatrix::ReSize(int, int, int) (bogon.cpp:45)
-  ==25832==    by 0x80487AF: main (bogon.cpp:66)
-  ==25832==    by 0x40371E5E: __libc_start_main (libc-start.c:129)
-  ==25832==    by 0x80485D1: (within /home/sewardj/newmat10/bogon)
-  ==25832==    Address 0xBFFFF74C is not stack'd, malloc'd or free'd
-</pre>
-
-<p>This message says that the program did an illegal 4-byte read of
-address 0xBFFFF74C, which, as far as it can tell, is not a valid stack
-address, nor corresponds to any currently malloc'd or free'd blocks.
-The read is happening at line 45 of <code>bogon.cpp</code>, called
-from line 66 of the same file, etc.  For errors associated with an
-identified malloc'd/free'd block, for example reading free'd memory,
-Valgrind reports not only the location where the error happened, but
-also where the associated block was malloc'd/free'd.
-
-<p>Valgrind remembers all error reports.  When an error is detected,
-it is compared against old reports, to see if it is a duplicate.  If
-so, the error is noted, but no further commentary is emitted.  This
-avoids you being swamped with bazillions of duplicate error reports.
-
-<p>If you want to know how many times each error occurred, run with
-the <code>-v</code> option.  When execution finishes, all the reports
-are printed out, along with, and sorted by, their occurrence counts.
-This makes it easy to see which errors have occurred most frequently.
-
-<p>Errors are reported before the associated operation actually
-happens.  For example, if your program decides to read from address
-zero, Valgrind will emit a message to this effect, and the program
-will then duly die with a segmentation fault.
-
-<p>In general, you should try and fix errors in the order that they
-are reported.  Not doing so can be confusing.  For example, a program
-which copies uninitialised values to several memory locations, and
-later uses them, will generate several error messages.  The first such
-error message may well give the most direct clue to the root cause of
-the problem.
-
-<p>The process of detecting duplicate errors is quite an expensive
-one and can become a significant performance overhead if your program
-generates huge quantities of errors.  To avoid serious problems here,
-Valgrind will simply stop collecting errors after 300 different errors
-have been seen, or 30000 errors in total have been seen.  In this
-situation you might as well stop your program and fix it, because
-Valgrind won't tell you anything else useful after this.  Note that
-the 300/30000 limits apply after suppressed errors are removed.  These
-limits are defined in <code>vg_include.h</code> and can be increased
-if necessary.
-
-<p>To avoid this cutoff you can use the
-<code>--error-limit=no</code> flag.  Then valgrind will always show
-errors, regardless of how many there are.  Use this flag carefully,
-since it may have a dire effect on performance.
-
-
-<a name="suppress"></a>
-<h3>2.4&nbsp; Suppressing errors</h3>
-
-Valgrind detects numerous problems in the base libraries, such as the
-GNU C library, and the XFree86 client libraries, which come
-pre-installed on your GNU/Linux system.  You can't easily fix these,
-but you don't want to see these errors (and yes, there are many!)  So
-Valgrind reads a list of errors to suppress at startup.  
-A default suppression file is cooked up by the
-<code>./configure</code> script.
-
-<p>You can modify and add to the suppressions file at your leisure,
-or, better, write your own.  Multiple suppression files are allowed.
-This is useful if part of your project contains errors you can't or
-don't want to fix, yet you don't want to continuously be reminded of
-them.
-
-<p>Each error to be suppressed is described very specifically, to
-minimise the possibility that a suppression-directive inadvertently
-suppresses a bunch of similar errors which you did want to see.  The
-suppression mechanism is designed to allow precise yet flexible
-specification of errors to suppress.
-
-<p>If you use the <code>-v</code> flag, at the end of execution, Valgrind
-prints out one line for each used suppression, giving its name and the
-number of times it got used.  Here's the suppressions used by a run of
-<code>ls -l</code>:
-<pre>
-  --27579-- supp: 1 socketcall.connect(serv_addr)/__libc_connect/__nscd_getgrgid_r
-  --27579-- supp: 1 socketcall.connect(serv_addr)/__libc_connect/__nscd_getpwuid_r
-  --27579-- supp: 6 strrchr/_dl_map_object_from_fd/_dl_map_object
-</pre>
-
-<a name="flags"></a>
-<h3>2.5&nbsp; Command-line flags</h3>
-
-You invoke Valgrind like this:
-<pre>
-  valgrind [options-for-Valgrind] your-prog [options for your-prog]
-</pre>
-
-<p>Note that Valgrind also reads options from the environment variable
-<code>$VALGRIND</code>, and processes them before the command-line
-options.
-
-<p>Valgrind's default settings succeed in giving reasonable behaviour
-in most cases.  Available options, in no particular order, are as
-follows:
-<ul>
-  <li><code>--help</code></li><br>
-
-  <li><code>--version</code><br>
-      <p>The usual deal.</li><br><p>
-
-  <li><code>-v --verbose</code><br>
-      <p>Be more verbose.  Gives extra information on various aspects
-      of your program, such as: the shared objects loaded, the
-      suppressions used, the progress of the instrumentation engine,
-      and warnings about unusual behaviour.
-      </li><br><p>
-
-  <li><code>-q --quiet</code><br>
-      <p>Run silently, and only print error messages.  Useful if you
-      are running regression tests or have some other automated test
-      machinery.
-      </li><br><p>
-
-  <li><code>--demangle=no</code><br>
-      <code>--demangle=yes</code> [the default]
-      <p>Disable/enable automatic demangling (decoding) of C++ names.
-      Enabled by default.  When enabled, Valgrind will attempt to
-      translate encoded C++ procedure names back to something
-      approaching the original.  The demangler handles symbols mangled
-      by g++ versions 2.X and 3.X.
-
-      <p>An important fact about demangling is that function
-      names mentioned in suppressions files should be in their mangled
-      form.  Valgrind does not demangle function names when searching
-      for applicable suppressions, because to do otherwise would make
-      suppressions file contents dependent on the state of Valgrind's
-      demangling machinery, and would also be slow and pointless.
-      </li><br><p>
-
-  <li><code>--num-callers=&lt;number&gt;</code> [default=4]<br>
-      <p>By default, Valgrind shows four levels of function call names
-      to help you identify program locations.  You can change that
-      number with this option.  This can help in determining the
-      program's location in deeply-nested call chains.  Note that errors
-      are commoned up using only the top three function locations (the
-      place in the current function, and that of its two immediate
-      callers).  So this doesn't affect the total number of errors
-      reported.  
-      <p>
-      The maximum value for this is 50.  Note that higher settings
-      will make Valgrind run a bit more slowly and take a bit more
-      memory, but can be useful when working with programs with
-      deeply-nested call chains.  
-      </li><br><p>
-
-  <li><code>--gdb-attach=no</code> [the default]<br>
-      <code>--gdb-attach=yes</code>
-      <p>When enabled, Valgrind will pause after every error shown,
-      and print the line
-      <br>
-      <code>---- Attach to GDB ? --- [Return/N/n/Y/y/C/c] ----</code>
-      <p>
-      Pressing <code>Ret</code>, or <code>N</code> <code>Ret</code>
-      or <code>n</code> <code>Ret</code>, causes Valgrind not to
-      start GDB for this error.
-      <p>
-      <code>Y</code> <code>Ret</code>
-      or <code>y</code> <code>Ret</code> causes Valgrind to
-      start GDB, for the program at this point.  When you have
-      finished with GDB, quit from it, and the program will continue.
-      Trying to continue from inside GDB doesn't work.
-      <p>
-      <code>C</code> <code>Ret</code>
-      or <code>c</code> <code>Ret</code> causes Valgrind not to
-      start GDB, and not to ask again.
-      <p>
-      <code>--gdb-attach=yes</code> conflicts with
-      <code>--trace-children=yes</code>.  You can't use them together.
-      Valgrind refuses to start up in this situation.  1 May 2002:
-      this is a historical relic which could be easily fixed if it
-      gets in your way.  Mail me and complain if this is a problem for
-      you.  </li><br><p>
-     
-  <li><code>--partial-loads-ok=yes</code> [the default]<br>
-      <code>--partial-loads-ok=no</code>
-      <p>Controls how Valgrind handles word (4-byte) loads from
-      addresses for which some bytes are addressable and others
-      are not.  When <code>yes</code> (the default), such loads
-      do not elicit an address error.  Instead, the loaded V bytes
-      corresponding to the illegal addresses indicate undefined, and
-      those corresponding to legal addresses are loaded from shadow 
-      memory, as usual.
-      <p>
-      When <code>no</code>, loads from partially
-      invalid addresses are treated the same as loads from completely
-      invalid addresses: an illegal-address error is issued,
-      and the resulting V bytes indicate valid data.
-      </li><br><p>
-
-  <li><code>--sloppy-malloc=no</code> [the default]<br>
-      <code>--sloppy-malloc=yes</code>
-      <p>When enabled, all requests for malloc/calloc are rounded up
-      to a whole number of machine words -- in other words, made
-      divisible by 4.  For example, a request for 17 bytes of space
-      would result in a 20-byte area being made available.  This works
-      around bugs in sloppy libraries which assume that they can
-      safely rely on malloc/calloc requests being rounded up in this
-      fashion.  Without the workaround, these libraries tend to
-      generate large numbers of errors when they access the ends of
-      these areas.  
-      <p>
-      Valgrind snapshots dated 17 Feb 2002 and later are
-      cleverer about this problem, and you should no longer need to 
-      use this flag.  To put it bluntly, if you do need to use this
-      flag, your program violates the ANSI C semantics defined for
-      <code>malloc</code> and <code>free</code>, even if it appears to
-      work correctly, and you should fix it, at least if you hope for
-      maximum portability.
-      </li><br><p>
-
-  <li><code>--alignment=&lt;number></code> [default: 4]<br> <p>By
-      default valgrind's <code>malloc</code>, <code>realloc</code>,
-      etc, return 4-byte aligned addresses.  These are suitable for
-      any accesses on x86 processors. 
-      Some programs might however assume that <code>malloc</code> et
-      al return memory aligned to 8 or more bytes.
-      These programs are broken and should be fixed, but
-      if this is impossible for whatever reason the alignment can be
-      increased using this parameter.  The supplied value must be
-      between 4 and 4096 inclusive, and must be a power of two.</li><br><p>
-
-  <li><code>--trace-children=no</code> [the default]<br>
-      <code>--trace-children=yes</code>
-      <p>When enabled, Valgrind will trace into child processes.  This
-      is confusing and usually not what you want, so is disabled by
-      default.  As of 1 May 2002, tracing into a child process from a
-      parent which uses <code>libpthread.so</code> is probably broken
-      and is likely to cause breakage.  Please report any such
-      problems to me.  </li><br><p>
-
-  <li><code>--freelist-vol=&lt;number></code> [default: 1000000]
-      <p>When the client program releases memory using free (in C) or
-      delete (C++), that memory is not immediately made available for
-      re-allocation.  Instead it is marked inaccessible and placed in
-      a queue of freed blocks.  The purpose is to delay the point at
-      which freed-up memory comes back into circulation.  This
-      increases the chance that Valgrind will be able to detect
-      invalid accesses to blocks for some significant period of time
-      after they have been freed.  
-      <p>
-      This flag specifies the maximum total size, in bytes, of the
-      blocks in the queue.  The default value is one million bytes.
-      Increasing this increases the total amount of memory used by
-      Valgrind but may detect invalid uses of freed blocks which would
-      otherwise go undetected.</li><br><p>
-
-  <li><code>--logfile-fd=&lt;number></code> [default: 2, stderr]
-      <p>Specifies the file descriptor on which Valgrind communicates
-      all of its messages.  The default, 2, is the standard error
-      channel.  This may interfere with the client's own use of
-      stderr.  To dump Valgrind's commentary in a file without using
-      stderr, something like the following works well (sh/bash
-      syntax):<br>
-      <code>&nbsp;&nbsp;
-            valgrind --logfile-fd=9 my_prog 9> logfile</code><br>
-      That is: tell Valgrind to send all output to file descriptor 9,
-      and ask the shell to route file descriptor 9 to "logfile".
-      </li><br><p>
-
-  <li><code>--suppressions=&lt;filename></code> 
-      [default: $PREFIX/lib/valgrind/default.supp]
-      <p>Specifies an extra
-      file from which to read descriptions of errors to suppress.  You
-      may use as many extra suppressions files as you
-      like.</li><br><p>
-
-  <li><code>--leak-check=no</code> [default]<br>
-      <code>--leak-check=yes</code> 
-      <p>When enabled, search for memory leaks when the client program
-      finishes.  A memory leak means a malloc'd block, which has not
-      yet been free'd, but to which no pointer can be found.  Such a
-      block can never be free'd by the program, since no pointer to it
-      exists.  Leak checking is disabled by default because it tends
-      to generate dozens of error messages.  </li><br><p>
-
-  <li><code>--show-reachable=no</code> [default]<br>
-      <code>--show-reachable=yes</code> 
-      <p>When disabled, the memory leak detector only shows blocks for
-      which it cannot find a pointer to at all, or it can only find a
-      pointer to the middle of.  These blocks are prime candidates for
-      memory leaks.  When enabled, the leak detector also reports on
-      blocks which it could find a pointer to.  Your program could, at
-      least in principle, have freed such blocks before exit.
-      Contrast this to blocks for which no pointer, or only an
-      interior pointer could be found: they are more likely to
-      indicate memory leaks, because you do not actually have a
-      pointer to the start of the block which you can hand to
-      <code>free</code>, even if you wanted to.  </li><br><p>
-
-  <li><code>--leak-resolution=low</code> [default]<br>
-      <code>--leak-resolution=med</code> <br>
-      <code>--leak-resolution=high</code>
-      <p>When doing leak checking, determines how willing Valgrind is
-      to consider different backtraces to be the same.  When set to
-      <code>low</code>, the default, only the first two entries need
-      match.  When <code>med</code>, four entries have to match.  When
-      <code>high</code>, all entries need to match.  
-      <p>
-      For hardcore leak debugging, you probably want to use
-      <code>--leak-resolution=high</code> together with 
-      <code>--num-callers=40</code> or some such large number.  Note
-      however that this can give an overwhelming amount of
-      information, which is why the defaults are 4 callers and
-      low-resolution matching.
-      <p>
-      Note that the <code>--leak-resolution=</code> setting does not
-      affect Valgrind's ability to find leaks.  It only changes how
-      the results are presented.
-      </li><br><p>
-
-  <li><code>--workaround-gcc296-bugs=no</code> [default]<br>
-      <code>--workaround-gcc296-bugs=yes</code> <p>When enabled,
-      assumes that reads and writes some small distance below the stack
-      pointer <code>%esp</code> are due to bugs in gcc 2.96, and does
-      not report them.  The "small distance" is 256 bytes by default.
-      Note that gcc 2.96 is the default compiler on some popular Linux
-      distributions (RedHat 7.X, Mandrake) and so you may well need to
-      use this flag.  Do not use it if you do not have to, as it can
-      cause real errors to be overlooked.  Another option is to use a
-      gcc/g++ which does not generate accesses below the stack
-      pointer.  2.95.3 seems to be a good choice in this respect.
-      <p>
-      Unfortunately (27 Feb 02) it looks like g++ 3.0.4 has a similar
-      bug, so you may need to issue this flag if you use 3.0.4.  A
-      while later (early Apr 02) this is confirmed as a scheduling bug
-      in g++-3.0.4.
-      </li><br><p>
-
-  <li><code>--error-limit=yes</code> [default]<br>
-      <code>--error-limit=no</code> <p>When enabled, valgrind stops
-      reporting errors after 30000 in total, or 300 different ones,
-      have been seen.  This is to stop the error tracking machinery
-      from becoming a huge performance overhead in programs with many
-      errors.  </li><br><p>
-
-  <li><code>--cachesim=no</code> [default]<br>
-      <code>--cachesim=yes</code> <p>When enabled, turns off memory
-      checking, and turns on cache profiling.  Cache profiling is
-      described in detail in <a href="#cache">Section 7</a>.
-      </li><br><p>
-
-  <li><code>--weird-hacks=hack1,hack2,...</code>
-      Pass miscellaneous hints to Valgrind which slightly modify the
-      simulated behaviour in nonstandard or dangerous ways, possibly
-      to help the simulation of strange features.  By default no hacks
-      are enabled.  Use with caution!  Currently known hacks are:
-      <p>
-      <ul>
-      <li><code>ioctl-VTIME</code> Use this if you have a program
-          which sets readable file descriptors to have a timeout by
-          doing <code>ioctl</code> on them with a
-          <code>TCSETA</code>-style command <b>and</b> a non-zero
-          <code>VTIME</code> timeout value.  This is considered
-          potentially dangerous and therefore is not engaged by
-          default, because it is (remotely) conceivable that it could
-          cause threads doing <code>read</code> to incorrectly block
-          the entire process.
-          <p>
-          You probably want to try this one if you have a program
-          which unexpectedly blocks in a <code>read</code> from a file
-          descriptor which you know to have been messed with by
-          <code>ioctl</code>.  This could happen, for example, if the
-          descriptor is used to read input from some kind of screen
-          handling library.
-          <p>
-          To find out if your program is blocking unexpectedly in the
-          <code>read</code> system call, run with the
-          <code>--trace-syscalls=yes</code> flag.
-      <p>
-      <li><code>truncate-writes</code> Use this if you have a threaded
-          program which appears to unexpectedly block whilst writing
-          into a pipe.  The effect is to modify all calls to
-          <code>write()</code> so that requests to write more than
-          4096 bytes are treated as if they only requested a write of
-          4096 bytes.  Valgrind does this by changing the
-          <code>count</code> argument of <code>write()</code>, as
-          passed to the kernel, so that it is at most 4096.  The
-          amount of data written will then be less than the client
-          program asked for, but the client should have a loop around
-          its <code>write()</code> call to check whether the requested
-          number of bytes have been written.  If not, it should issue
-          further <code>write()</code> calls until all the data is
-          written.
-          <p>
-          This all sounds pretty dodgy to me, which is why I've made
-          this behaviour only happen on request.  It is not the
-          default behaviour.  At the time of writing this (30 June
-          2002) I have only seen one example where this is necessary,
-          so either the problem is extremely rare or nobody is using
-          Valgrind :-)
-          <p>
-          On experimentation I see that <code>truncate-writes</code>
-          doesn't interact well with <code>ioctl-VTIME</code>, so you
-          probably don't want to try both at once.
-          <p>
-          As above, to find out if your program is blocking
-          unexpectedly in the <code>write()</code> system call, you
-          may find the <code>--trace-syscalls=yes
-          --trace-sched=yes</code> flags useful.
-      </ul>
-
-      </li><p>
-</ul>
-
-There are also some options for debugging Valgrind itself.  You
-shouldn't need to use them in the normal run of things.  Nevertheless:
-
-<ul>
-
-  <li><code>--single-step=no</code> [default]<br>
-      <code>--single-step=yes</code>
-      <p>When enabled, each x86 insn is translated separately into
-      instrumented code.  When disabled, translation is done on a
-      per-basic-block basis, giving much better translations.</li><br>
-      <p>
-
-  <li><code>--optimise=no</code><br>
-      <code>--optimise=yes</code> [default]
-      <p>When enabled, various improvements are applied to the
-      intermediate code, mainly aimed at allowing the simulated CPU's
-      registers to be cached in the real CPU's registers over several
-      simulated instructions.</li><br>
-      <p>
-
-  <li><code>--instrument=no</code><br>
-      <code>--instrument=yes</code> [default]
-      <p>When disabled, the translations don't actually contain any
-      instrumentation.</li><br>
-      <p>
-
-  <li><code>--cleanup=no</code><br>
-      <code>--cleanup=yes</code> [default]
-      <p>When enabled, various improvements are applied to the
-      post-instrumented intermediate code, aimed at removing redundant
-      value checks.</li><br>
-      <p>
-
-  <li><code>--trace-syscalls=no</code> [default]<br>
-      <code>--trace-syscalls=yes</code>
-      <p>Enable/disable tracing of system call intercepts.</li><br>
-      <p>
-
-  <li><code>--trace-signals=no</code> [default]<br>
-      <code>--trace-signals=yes</code>
-      <p>Enable/disable tracing of signal handling.</li><br>
-      <p>
-
-  <li><code>--trace-sched=no</code> [default]<br>
-      <code>--trace-sched=yes</code>
-      <p>Enable/disable tracing of thread scheduling events.</li><br>
-      <p>
-
-  <li><code>--trace-pthread=none</code> [default]<br>
-      <code>--trace-pthread=some</code> <br>
-      <code>--trace-pthread=all</code>
-      <p>Specifies amount of trace detail for pthread-related events.</li><br>
-      <p>
-
-  <li><code>--trace-symtab=no</code> [default]<br>
-      <code>--trace-symtab=yes</code>
-      <p>Enable/disable tracing of symbol table reading.</li><br>
-      <p>
-
-  <li><code>--trace-malloc=no</code> [default]<br>
-      <code>--trace-malloc=yes</code>
-      <p>Enable/disable tracing of malloc/free (et al) intercepts.
-      </li><br>
-      <p>
-
-  <li><code>--stop-after=&lt;number></code> 
-      [default: infinity, more or less]
-      <p>After &lt;number> basic blocks have been executed, shut down
-      Valgrind and switch back to running the client on the real CPU.
-      </li><br>
-      <p>
-
-  <li><code>--dump-error=&lt;number></code> [default: inactive]
-      <p>After the program has exited, show gory details of the
-      translation of the basic block containing the &lt;number>'th
-      error context.  When used with <code>--single-step=yes</code>,
-      can show the exact x86 instruction causing an error.  This is
-      all fairly dodgy and doesn't work at all if threads are
-      involved.</li><br>
-      <p>
-</ul>
-
-
-<a name="errormsgs"></a>
-<h3>2.6&nbsp; Explanation of error messages</h3>
-
-Despite considerable sophistication under the hood, Valgrind can only
-really detect two kinds of errors, use of illegal addresses, and use
-of undefined values.  Nevertheless, this is enough to help you
-discover all sorts of memory-management nasties in your code.  This
-section presents a quick summary of what error messages mean.  The
-precise behaviour of the error-checking machinery is described in
-<a href="#machine">Section 4</a>.
-
-
-<h4>2.6.1&nbsp; Illegal read / Illegal write errors</h4>
-For example:
-<pre>
-  Invalid read of size 4
-     at 0x40F6BBCC: (within /usr/lib/libpng.so.2.1.0.9)
-     by 0x40F6B804: (within /usr/lib/libpng.so.2.1.0.9)
-     by 0x40B07FF4: read_png_image__FP8QImageIO (kernel/qpngio.cpp:326)
-     by 0x40AC751B: QImageIO::read() (kernel/qimage.cpp:3621)
-     Address 0xBFFFF0E0 is not stack'd, malloc'd or free'd
-</pre>
-
-<p>This happens when your program reads or writes memory at a place
-which Valgrind reckons it shouldn't.  In this example, the program did
-a 4-byte read at address 0xBFFFF0E0, somewhere within the
-system-supplied library libpng.so.2.1.0.9, which was called from
-somewhere else in the same library, called from line 326 of
-qpngio.cpp, and so on.
-
-<p>Valgrind tries to establish what the illegal address might relate
-to, since that's often useful.  So, if it points into a block of
-memory which has already been freed, you'll be informed of this, and
-also where the block was free'd at.  Likewise, if it should turn out
-to be just off the end of a malloc'd block, a common result of
-off-by-one-errors in array subscripting, you'll be informed of this
-fact, and also where the block was malloc'd.
-
-<p>In this example, Valgrind can't identify the address.  Actually the
-address is on the stack, but, for some reason, this is not a valid
-stack address -- it is below the stack pointer, %esp, and that isn't
-allowed.  In this particular case it's probably caused by gcc
-generating invalid code, a known bug in various flavours of gcc.
-
-<p>Note that Valgrind only tells you that your program is about to
-access memory at an illegal address.  It can't stop the access from
-happening.  So, if your program makes an access which normally would
-result in a segmentation fault, your program will still suffer the same
-fate -- but you will get a message from Valgrind immediately prior to
-this.  In this particular example, reading junk on the stack is
-non-fatal, and the program stays alive.
-
-
-<h4>2.6.2&nbsp; Use of uninitialised values</h4>
-For example:
-<pre>
-  Conditional jump or move depends on uninitialised value(s)
-     at 0x402DFA94: _IO_vfprintf (_itoa.h:49)
-     by 0x402E8476: _IO_printf (printf.c:36)
-     by 0x8048472: main (tests/manuel1.c:8)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-</pre>
-
-<p>An uninitialised-value use error is reported when your program uses
-a value which hasn't been initialised -- in other words, is undefined.
-Here, the undefined value is used somewhere inside the printf()
-machinery of the C library.  This error was reported when running the
-following small program:
-<pre>
-  int main()
-  {
-    int x;
-    printf ("x = %d\n", x);
-  }
-</pre>
-
-<p>It is important to understand that your program can copy around
-junk (uninitialised) data to its heart's content.  Valgrind observes
-this and keeps track of the data, but does not complain.  A complaint
-is issued only when your program attempts to make use of uninitialised
-data.  In this example, x is uninitialised.  Valgrind observes the
-value being passed to _IO_printf and thence to _IO_vfprintf, but makes
-no comment.  However, _IO_vfprintf has to examine the value of x so it
-can turn it into the corresponding ASCII string, and it is at this
-point that Valgrind complains.
-
-<p>Sources of uninitialised data tend to be:
-<ul>
-  <li>Local variables in procedures which have not been initialised,
-      as in the example above.</li><br><p>
-
-  <li>The contents of malloc'd blocks, before you write something
-      there.  In C++, the new operator is a wrapper round malloc, so
-      if you create an object with new, its fields will be
-      uninitialised until you fill them in, which is only Right and
-      Proper.</li>
-</ul>
-
-
-
-<h4>2.6.3&nbsp; Illegal frees</h4>
-For example:
-<pre>
-  Invalid free()
-     at 0x4004FFDF: free (ut_clientmalloc.c:577)
-     by 0x80484C7: main (tests/doublefree.c:10)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/doublefree)
-     Address 0x3807F7B4 is 0 bytes inside a block of size 177 free'd
-     at 0x4004FFDF: free (ut_clientmalloc.c:577)
-     by 0x80484C7: main (tests/doublefree.c:10)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/doublefree)
-</pre>
-<p>Valgrind keeps track of the blocks allocated by your program with
-malloc/new, so it can know exactly whether or not the argument to
-free/delete is legitimate or not.  Here, this test program has
-freed the same block twice.  As with the illegal read/write errors,
-Valgrind attempts to make sense of the address free'd.  If, as
-here, the address is one which has previously been freed, you will
-be told that -- making duplicate frees of the same block easy to spot.
-
-
-<h4>2.6.4&nbsp; When a block is freed with an inappropriate
-deallocation function</h4>
-In the following example, a block allocated with <code>new[]</code>
-has wrongly been deallocated with <code>free</code>:
-<pre>
-  Mismatched free() / delete / delete []
-     at 0x40043249: free (vg_clientfuncs.c:171)
-     by 0x4102BB4E: QGArray::~QGArray(void) (tools/qgarray.cpp:149)
-     by 0x4C261C41: PptDoc::~PptDoc(void) (include/qmemarray.h:60)
-     by 0x4C261F0E: PptXml::~PptXml(void) (pptxml.cc:44)
-     Address 0x4BB292A8 is 0 bytes inside a block of size 64 alloc'd
-     at 0x4004318C: __builtin_vec_new (vg_clientfuncs.c:152)
-     by 0x4C21BC15: KLaola::readSBStream(int) const (klaola.cc:314)
-     by 0x4C21C155: KLaola::stream(KLaola::OLENode const *) (klaola.cc:416)
-     by 0x4C21788F: OLEFilter::convert(QCString const &amp;) (olefilter.cc:272)
-</pre>
-The following was told to me by the KDE 3 developers.  I didn't know
-any of it myself.  They also implemented the check itself.
-<p>
-In C++ it's important to deallocate memory in a way compatible with
-how it was allocated.  The deal is:
-<ul>
-<li>If allocated with <code>malloc</code>, <code>calloc</code>,
-    <code>realloc</code>, <code>valloc</code> or
-    <code>memalign</code>, you must deallocate with <code>free</code>.
-<li>If allocated with <code>new[]</code>, you must deallocate with
-    <code>delete[]</code>.
-<li>If allocated with <code>new</code>, you must deallocate with
-    <code>delete</code>.
-</ul>
-The worst thing is that on Linux apparently it doesn't matter if you
-do muddle these up, and it all seems to work ok, but the same program
-may then crash on a different platform, Solaris for example.  So it's
-best to fix it properly.  According to the KDE folks "it's amazing how
-many C++ programmers don't know this".  
-<p>
-Pascal Massimino adds the following clarification:
-<code>delete[]</code> must be paired with a
-<code>new[]</code> because the compiler stores the size of the array
-and the pointer-to-member to the destructor of the array's content
-just before the pointer actually returned.  This implies a
-variable-sized overhead in what's returned by <code>new</code> or
-<code>new[]</code>.  It is rather surprising how compilers [Ed:
-runtime-support libraries?] are robust to mismatch in
-<code>new</code>/<code>delete</code>
-<code>new[]</code>/<code>delete[]</code>.
-
-
-<h4>2.6.5&nbsp; Passing system call parameters with inadequate
-read/write permissions</h4>
-
-Valgrind checks all parameters to system calls.  If a system call
-needs to read from a buffer provided by your program, Valgrind checks
-that the entire buffer is addressible and has valid data, ie, it is
-readable.  And if the system call needs to write to a user-supplied
-buffer, Valgrind checks that the buffer is addressible.  After the
-system call, Valgrind updates its administrative information to
-precisely reflect any changes in memory permissions caused by the
-system call.
-
-<p>Here's an example of a system call with an invalid parameter:
-<pre>
-  #include &lt;stdlib.h>
-  #include &lt;unistd.h>
-  int main( void )
-  {
-    char* arr = malloc(10);
-    (void) write( 1 /* stdout */, arr, 10 );
-    return 0;
-  }
-</pre>
-
-<p>You get this complaint ...
-<pre>
-  Syscall param write(buf) contains uninitialised or unaddressable byte(s)
-     at 0x4035E072: __libc_write
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/badwrite)
-     by &lt;bogus frame pointer> ???
-     Address 0x3807E6D0 is 0 bytes inside a block of size 10 alloc'd
-     at 0x4004FEE6: malloc (ut_clientmalloc.c:539)
-     by 0x80484A0: main (tests/badwrite.c:6)
-     by 0x402A6E5E: __libc_start_main (libc-start.c:129)
-     by 0x80483B1: (within tests/badwrite)
-</pre>
-
-<p>... because the program has tried to write uninitialised junk from
-the malloc'd block to the standard output.
-
-
-<h4>2.6.6&nbsp; Warning messages you might see</h4>
-
-Most of these only appear if you run in verbose mode (enabled by
-<code>-v</code>):
-<ul>
-<li> <code>More than 50 errors detected.  Subsequent errors
-     will still be recorded, but in less detail than before.</code>
-     <br>
-     After 50 different errors have been shown, Valgrind becomes 
-     more conservative about collecting them.  It then requires only 
-     the program counters in the top two stack frames to match when
-     deciding whether or not two errors are really the same one.
-     Prior to this point, the PCs in the top four frames are required
-     to match.  This hack has the effect of slowing down the
-     appearance of new errors after the first 50.  The 50 constant can
-     be changed by recompiling Valgrind.
-<p>
-<li> <code>More than 300 errors detected.  I'm not reporting any more.
-     Final error counts may be inaccurate.  Go fix your
-     program!</code>
-     <br>
-     After 300 different errors have been detected, Valgrind ignores
-     any more.  It seems unlikely that collecting even more different
-     ones would be of practical help to anybody, and it avoids the
-     danger that Valgrind spends more and more of its time comparing
-     new errors against an ever-growing collection.  As above, the 300
-     number is a compile-time constant.
-<p>
-<li> <code>Warning: client switching stacks?</code>
-     <br>
-     Valgrind spotted such a large change in the stack pointer, %esp,
-     that it guesses the client is switching to a different stack.
-     At this point it makes a kludgey guess where the base of the new
-     stack is, and sets memory permissions accordingly.  You may get
-     many bogus error messages following this, if Valgrind guesses
-     wrong.  At the moment "large change" is defined as a change of
-     more than 2000000 in the value of the %esp (stack pointer)
-     register.
-<p>
-<li> <code>Warning: client attempted to close Valgrind's logfile fd &lt;number>
-     </code>
-     <br>
-     Valgrind doesn't allow the client
-     to close the logfile, because you'd never see any diagnostic
-     information after that point.  If you see this message,
-     you may want to use the <code>--logfile-fd=&lt;number></code> 
-     option to specify a different logfile file-descriptor number.
-<p>
-<li> <code>Warning: noted but unhandled ioctl &lt;number></code>
-     <br>
-     Valgrind observed a call to one of the vast family of
-     <code>ioctl</code> system calls, but did not modify its
-     memory status info (because I have not yet got round to it).
-     The call will still have gone through, but you may get spurious
-     errors after this as a result of the non-update of the memory info.
-<p>
-<li> <code>Warning: set address range perms: large range &lt;number></code>
-     <br> 
-     Diagnostic message, mostly for my benefit, to do with memory 
-     permissions.
-</ul>
-
-
-<a name="suppfiles"></a>
-<h3>2.7&nbsp; Writing suppressions files</h3>
-
-A suppression file describes a bunch of errors which, for one reason
-or another, you don't want Valgrind to tell you about.  Usually the
-reason is that the system libraries are buggy but unfixable, at least
-within the scope of the current debugging session.  Multiple
-suppressions files are allowed.  By default, Valgrind uses
-<code>$PREFIX/lib/valgrind/default.supp</code>.
-
-<p>
-You can ask to add suppressions from another file, by specifying
-<code>--suppressions=/path/to/file.supp</code>.
-
-<p>Each suppression has the following components:<br>
-<ul>
-
-  <li>Its name.  This merely gives a handy name to the suppression, by
-      which it is referred to in the summary of used suppressions
-      printed out when a program finishes.  It's not important what
-      the name is; any identifying string will do.
-      <p>
-
-  <li>The nature of the error to suppress.  Either: 
-      <code>Value1</code>, 
-      <code>Value2</code>,
-      <code>Value4</code> or
-      <code>Value8</code>,
-      meaning an uninitialised-value error when
-      using a value of 1, 2, 4 or 8 bytes.
-      Or
-      <code>Cond</code> (or its old name, <code>Value0</code>),
-      meaning use of an uninitialised CPU condition code.  Or: 
-      <code>Addr1</code>,
-      <code>Addr2</code>, 
-      <code>Addr4</code> or 
-      <code>Addr8</code>, meaning an invalid address during a
-      memory access of 1, 2, 4 or 8 bytes respectively.  Or 
-      <code>Param</code>,
-      meaning an invalid system call parameter error.  Or
-      <code>Free</code>, meaning an invalid or mismatching free.
-      Or <code>PThread</code>, meaning any kind of complaint to do
-      with the PThreads API.</li><br>
-      <p>
-
-  <li>The "immediate location" specification.  For Value and Addr
-      errors, this is either the name of the function in which the error
-      occurred, or, failing that, the full path to the .so file
-      containing the error location.  For Param errors, it is the name of
-      the offending system call parameter.  For Free errors, it is the
-      name of the function doing the freeing (eg, <code>free</code>,
-      <code>__builtin_vec_delete</code>, etc)</li><br>
-      <p>
-
-  <li>The caller of the above "immediate location".  Again, either a
-      function or shared-object name.</li><br>
-      <p>
-
-  <li>Optionally, one or two extra calling-function or object names,
-      for greater precision.</li>
-</ul>
-
-<p>
-Locations may be either names of shared objects or wildcards matching
-function names.  They begin <code>obj:</code> and <code>fun:</code>
-respectively.  Function and object names to match against may use the 
-wildcard characters <code>*</code> and <code>?</code>.
-
-A suppression only suppresses an error when the error matches all the
-details in the suppression.  Here's an example:
-<pre>
-  {
-    __gconv_transform_ascii_internal/__mbrtowc/mbtowc
-    Value4
-    fun:__gconv_transform_ascii_internal
-    fun:__mbr*toc
-    fun:mbtowc
-  }
-</pre>
-
-<p>What it means is: suppress a use-of-uninitialised-value error, when
-the data size is 4, when it occurs in the function
-<code>__gconv_transform_ascii_internal</code>, when that is called
-from any function of name matching <code>__mbr*toc</code>, 
-when that is called from
-<code>mbtowc</code>.  It doesn't apply under any other circumstances.
-The string by which this suppression is identified to the user is
-__gconv_transform_ascii_internal/__mbrtowc/mbtowc.
-
-<p>Another example:
-<pre>
-  {
-    libX11.so.6.2/libX11.so.6.2/libXaw.so.7.0
-    Value4
-    obj:/usr/X11R6/lib/libX11.so.6.2
-    obj:/usr/X11R6/lib/libX11.so.6.2
-    obj:/usr/X11R6/lib/libXaw.so.7.0
-  }
-</pre>
-
-<p>Suppress any size 4 uninitialised-value error which occurs anywhere
-in <code>libX11.so.6.2</code>, when called from anywhere in the same
-library, when called from anywhere in <code>libXaw.so.7.0</code>.  The
-inexact specification of locations is regrettable, but is about all
-you can hope for, given that the X11 libraries shipped with Red Hat
-7.2 have had their symbol tables removed.
-
-<p>Note -- since the above two examples did not make it clear -- that
-you can freely mix the <code>obj:</code> and <code>fun:</code>
-styles of description within a single suppression record.
-
-
-<a name="clientreq"></a>
-<h3>2.8&nbsp; The Client Request mechanism</h3>
-
-Valgrind has a trapdoor mechanism via which the client program can
-pass all manner of requests and queries to Valgrind.  Internally, this
-is used extensively to make malloc, free, signals, threads, etc, work,
-although you don't see that.
-<p>
-For your convenience, a subset of these so-called client requests is
-provided to allow you to tell Valgrind facts about the behaviour of
-your program, and conversely to make queries.  In particular, your
-program can tell Valgrind about changes in memory range permissions
-that Valgrind would not otherwise know about, and so allows clients to
-get Valgrind to do arbitrary custom checks.
-<p>
-Clients need to include the header file <code>valgrind.h</code> to
-make this work.  The macros therein have the magical property that
-they generate code in-line which Valgrind can spot.  However, the code
-does nothing when not run on Valgrind, so you are not forced to run
-your program on Valgrind just because you use the macros in this file.
-Also, you are not required to link your program with any extra
-supporting libraries.
-<p>
-A brief description of the available macros:
-<ul>
-<li><code>VALGRIND_MAKE_NOACCESS</code>,
-    <code>VALGRIND_MAKE_WRITABLE</code> and
-    <code>VALGRIND_MAKE_READABLE</code>.  These mark address
-    ranges as completely inaccessible, accessible but containing
-    undefined data, and accessible and containing defined data,
-    respectively.  Subsequent errors may have their faulting
-    addresses described in terms of these blocks.  Returns a
-    "block handle".  Returns zero when not run on Valgrind.
-<p>
-<li><code>VALGRIND_DISCARD</code>: At some point you may want
-    Valgrind to stop reporting errors in terms of the blocks
-    defined by the previous three macros.  To do this, the above
-    macros return a small-integer "block handle".  You can pass
-    this block handle to <code>VALGRIND_DISCARD</code>.  After
-    doing so, Valgrind will no longer be able to relate
-    addressing errors to the user-defined block associated with
-    the handle.  The permissions settings associated with the
-    handle remain in place; this just affects how errors are
-    reported, not whether they are reported.  Returns 1 for an
-    invalid handle and 0 for a valid handle (although passing
-    invalid handles is harmless).  Always returns 0 when not run
-    on Valgrind.
-<p>
-<li><code>VALGRIND_CHECK_NOACCESS</code>,
-    <code>VALGRIND_CHECK_WRITABLE</code> and
-    <code>VALGRIND_CHECK_READABLE</code>: check immediately
-    whether or not the given address range has the relevant
-    property, and if not, print an error message.  Also, for the
-    convenience of the client, returns zero if the relevant
-    property holds; otherwise, the returned value is the address
-    of the first byte for which the property is not true.
-    Always returns 0 when not run on Valgrind.
-<p>
-<li><code>VALGRIND_CHECK_DEFINED</code>: a quick and easy way
-    to find out whether Valgrind thinks a particular variable
-    (lvalue, to be precise) is addressible and defined.  Prints
-    an error message if not.  Returns no value.
-<p>
-<li><code>VALGRIND_MAKE_NOACCESS_STACK</code>: a highly
-    experimental feature.  Similarly to
-    <code>VALGRIND_MAKE_NOACCESS</code>, this marks an address
-    range as inaccessible, so that subsequent accesses to an
-    address in the range give an error.  However, this macro
-    does not return a block handle.  Instead, all annotations
-    created like this are reviewed at each client
-    <code>ret</code> (subroutine return) instruction, and those
-    which now define an address range below the client's stack
-    pointer register (<code>%esp</code>) are automatically
-    deleted.
-    <p>
-    In other words, this macro allows the client to tell
-    Valgrind about red-zones on its own stack.  Valgrind
-    automatically discards this information when the stack
-    retreats past such blocks.  Beware: hacky and flaky, and
-    probably interacts badly with the new pthread support.
-<p>
-<li><code>RUNNING_ON_VALGRIND</code>: returns 1 if running on
-    Valgrind, 0 if running on the real CPU.
-<p>
-<li><code>VALGRIND_DO_LEAK_CHECK</code>: run the memory leak detector
-    right now.  Returns no value.  I guess this could be used to
-    incrementally check for leaks between arbitrary places in the
-    program's execution.  Warning: not properly tested!
-<p>
-<li><code>VALGRIND_DISCARD_TRANSLATIONS</code>: discard translations
-    of code in the specified address range.  Useful if you are
-    debugging a JITter or some other dynamic code generation system.
-    After this call, attempts to execute code in the invalidated
-    address range will cause valgrind to make new translations of that
-    code, which is probably the semantics you want.  Note that this is
-    implemented naively, and involves checking all 200191 entries in
-    the translation table to see if any of them overlap the specified
-    address range.  So try not to call it often, or performance will
-    nosedive.  Note that you can be clever about this: you only need
-    to call it when an area which previously contained code is
-    overwritten with new code.  You can choose to write code into
-    fresh memory, and just call this occasionally to discard large
-    chunks of old code all at once.
-    <p>
-    Warning: minimally tested, especially for the cache simulator.
-</ul>
-<p>
-
-
-<a name="pthreads"></a>
-<h3>2.9&nbsp; Support for POSIX Pthreads</h3>
-
-As of late April 02, Valgrind supports programs which use POSIX
-pthreads.  Doing this has proved technically challenging but is now
-mostly complete.  It works well enough for significant threaded
-applications to work.
-<p>
-It works as follows: threaded apps are (dynamically) linked against
-<code>libpthread.so</code>.  Usually this is the one installed with
-your Linux distribution.  Valgrind, however, supplies its own
-<code>libpthread.so</code> and automatically connects your program to
-it instead.
-<p>
-The fake <code>libpthread.so</code> and Valgrind cooperate to
-implement a user-space pthreads package.  This approach avoids the 
-horrible implementation problems of implementing a truly
-multiprocessor version of Valgrind, but it does mean that threaded
-apps run only on one CPU, even if you have a multiprocessor machine.
-<p>
-Valgrind schedules your threads in a round-robin fashion, with all
-threads having equal priority.  It switches threads every 50000 basic
-blocks (typically around 300000 x86 instructions), which means you'll
-get a much finer interleaving of thread executions than when run
-natively.  This in itself may cause your program to behave differently
-if you have some kind of concurrency, critical race, locking, or
-similar, bugs.
-<p>
-The current (valgrind-1.0 release) state of pthread support is as
-follows:
-<ul>
-<li>Mutexes, condition variables, thread-specific data,
-    <code>pthread_once</code>, reader-writer locks, semaphores,
-    cleanup stacks, cancellation and thread detaching currently work.
-    Various attribute-like calls are handled but ignored; you get a
-    warning message.
-<p>
-<li>Currently the following syscalls are thread-safe (nonblocking):
-    <code>write</code> <code>read</code> <code>nanosleep</code>
-    <code>sleep</code> <code>select</code> <code>poll</code> 
-    <code>recvmsg</code> and
-    <code>accept</code>.
-<p>
-<li>Signals in pthreads are now handled properly(ish): 
-    <code>pthread_sigmask</code>, <code>pthread_kill</code>,
-    <code>sigwait</code> and <code>raise</code> are now implemented.
-    Each thread has its own signal mask, as POSIX requires.
-    It's a bit kludgey -- there's a system-wide pending signal set,
-    rather than one for each thread.  But hey.
-</ul>
-
-
-As of 18 May 02, the following threaded programs now work fine on my
-RedHat 7.2 box: Opera 6.0Beta2, KNode in KDE 3.0, Mozilla-0.9.2.1 and
-Galeon-0.11.3, both as supplied with RedHat 7.2.  Also Mozilla 1.0RC2.
-OpenOffice 1.0.  MySQL 3.something (the current stable release).
-
-<a name="install"></a>
-<h3>2.10&nbsp; Building and installing</h3>
-
-We now use the standard Unix <code>./configure</code>,
-<code>make</code>, <code>make install</code> mechanism, and I have
-attempted to ensure that it works on machines with kernel 2.2 or 2.4
-and glibc 2.1.X or 2.2.X.  I don't think there is much else to say.
-There are no options apart from the usual <code>--prefix</code> that
-you should give to <code>./configure</code>.
-
-<p>
-The <code>configure</code> script tests the version of the X server
-currently indicated by the current <code>$DISPLAY</code>.  This is a
-known bug.  The intention was to detect the version of the current
-XFree86 client libraries, so that correct suppressions could be
-selected for them, but instead the test checks the server version.
-This is just plain wrong.
-
-<p>
-If you are building a binary package of Valgrind for distribution,
-please read <code>README_PACKAGERS</code>.  It contains some important
-information.
-
-<p>
-Apart from that there is no excitement here.  Let me know if you have
-build problems.
-
-
-
-<a name="problems"></a>
-<h3>2.11&nbsp; If you have problems</h3>
-Mail me (<a href="mailto:jseward@acm.org">jseward@acm.org</a>).
-
-<p>See <a href="#limits">Section 4</a> for the known limitations of
-Valgrind, and for a list of programs which are known not to work on
-it.
-
-<p>The translator/instrumentor has a lot of assertions in it.  They
-are permanently enabled, and I have no plans to disable them.  If one
-of these breaks, please mail me!
-
-<p>If you get an assertion failure on the expression
-<code>chunkSane(ch)</code> in <code>vg_free()</code> in
-<code>vg_malloc.c</code>, this may have happened because your program
-wrote off the end of a malloc'd block, or before its beginning.
-Valgrind should have emitted a proper message to that effect before
-dying in this way.  This is a known problem which I should fix.
-<p>
-
-<hr width="100%">
-
-<a name="machine"></a>
-<h2>3&nbsp; Details of the checking machinery</h2>
-
-Read this section if you want to know, in detail, exactly what and how
-Valgrind is checking.
-
-<a name="vvalue"></a>
-<h3>3.1&nbsp; Valid-value (V) bits</h3>
-
-It is simplest to think of Valgrind implementing a synthetic Intel x86
-CPU which is identical to a real CPU, except for one crucial detail.
-Every bit (literally) of data processed, stored and handled by the
-real CPU has, in the synthetic CPU, an associated "valid-value" bit,
-which says whether or not the accompanying bit has a legitimate value.
-In the discussions which follow, this bit is referred to as the V
-(valid-value) bit.
-
-<p>Each byte in the system therefore has 8 V bits which follow
-it wherever it goes.  For example, when the CPU loads a word-size item
-(4 bytes) from memory, it also loads the corresponding 32 V bits from
-a bitmap which stores the V bits for the process' entire address
-space.  If the CPU should later write the whole or some part of that
-value to memory at a different address, the relevant V bits will be
-stored back in the V-bit bitmap.
-
-<p>In short, each bit in the system has an associated V bit, which
-follows it around everywhere, even inside the CPU.  Yes, the CPU's
-(integer and <code>%eflags</code>) registers have their own V bit
-vectors.
-
-<p>Copying values around does not cause Valgrind to check for, or
-report on, errors.  However, when a value is used in a way which might
-conceivably affect the outcome of your program's computation, the
-associated V bits are immediately checked.  If any of these indicate
-that the value is undefined, an error is reported.
-
-<p>Here's an (admittedly nonsensical) example:
-<pre>
-  int i, j;
-  int a[10], b[10];
-  for (i = 0; i &lt; 10; i++) {
-    j = a[i];
-    b[i] = j;
-  }
-</pre>
-
-<p>Valgrind emits no complaints about this, since it merely copies
-uninitialised values from <code>a[]</code> into <code>b[]</code>, and
-doesn't use them in any way.  However, if the loop is changed to
-<pre>
-  for (i = 0; i &lt; 10; i++) {
-    j += a[i];
-  }
-  if (j == 77) 
-     printf("hello there\n");
-</pre>
-then Valgrind will complain, at the <code>if</code>, that the
-condition depends on uninitialised values.
-
-<p>Most low level operations, such as adds, cause Valgrind to 
-use the V bits for the operands to calculate the V bits for the
-result.  Even if the result is partially or wholly undefined,
-it does not complain.
-
-<p>Checks on definedness only occur in two places: when a value is
-used to generate a memory address, and where a control flow decision
-needs to be made.  Also, when a system call is detected, valgrind
-checks definedness of parameters as required.
-
-<p>If a check should detect undefinedness, an error message is
-issued.  The resulting value is subsequently regarded as well-defined.
-To do otherwise would give long chains of error messages.  In effect,
-we say that undefined values are non-infectious.
-
-<p>This sounds overcomplicated.  Why not just check all reads from
-memory, and complain if an undefined value is loaded into a CPU register? 
-Well, that doesn't work well, because perfectly legitimate C programs routinely
-copy uninitialised values around in memory, and we don't want endless complaints
-about that.  Here's the canonical example.  Consider a struct
-like this:
-<pre>
-  struct S { int x; char c; };
-  struct S s1, s2;
-  s1.x = 42;
-  s1.c = 'z';
-  s2 = s1;
-</pre>
-
-<p>The question to ask is: how large is <code>struct S</code>, in
-bytes?  An int is 4 bytes and a char one byte, so perhaps a struct S
-occupies 5 bytes?  Wrong.  All (non-toy) compilers I know of will
-round the size of <code>struct S</code> up to a whole number of words,
-in this case 8 bytes.  Not doing this forces compilers to generate
-truly appalling code for subscripting arrays of <code>struct
-S</code>'s.
-
-<p>So s1 occupies 8 bytes, yet only 5 of them will be initialised.
-For the assignment <code>s2 = s1</code>, gcc generates code to copy
-all 8 bytes wholesale into <code>s2</code> without regard for their
-meaning.  If Valgrind simply checked values as they came out of
-memory, it would yelp every time a structure assignment like this
-happened.  So the more complicated semantics described above is
-necessary.  This allows gcc to copy <code>s1</code> into
-<code>s2</code> any way it likes, and a warning will only be emitted
-if the uninitialised values are later used.
-
-<p>One final twist to this story.  The above scheme allows garbage to
-pass through the CPU's integer registers without complaint.  It does
-this by giving the integer registers V tags, passing these around in
-the expected way.  This is complicated and computationally expensive to
-do, but is necessary.  Valgrind is more simplistic about
-floating-point loads and stores.  In particular, V bits for data read
-as a result of floating-point loads are checked at the load
-instruction.  So if your program uses the floating-point registers to
-do memory-to-memory copies, you will get complaints about
-uninitialised values.  Fortunately, I have not yet encountered a
-program which (ab)uses the floating-point registers in this way.
-
-<a name="vaddress"></a>
-<h3>3.2&nbsp; Valid-address (A) bits</h3>
-
-Notice that the previous section describes how the validity of values
-is established and maintained without having to say whether the
-program does or does not have the right to access any particular
-memory location.  We now consider the latter issue.
-
-<p>As described above, every bit in memory or in the CPU has an
-associated valid-value (V) bit.  In addition, all bytes in memory, but
-not in the CPU, have an associated valid-address (A) bit.  This
-indicates whether or not the program can legitimately read or write
-that location.  It does not give any indication of the validity of the
-data at that location -- that's the job of the V bits -- only whether
-or not the location may be accessed.
-
-<p>Every time your program reads or writes memory, Valgrind checks the
-A bits associated with the address.  If any of them indicate an
-invalid address, an error is emitted.  Note that the reads and writes
-themselves do not change the A bits, only consult them.
-
-<p>So how do the A bits get set/cleared?  Like this:
-
-<ul>
-  <li>When the program starts, all the global data areas are marked as
-      accessible.</li><br>
-      <p>
-
-  <li>When the program does malloc/new, the A bits for exactly the
-      area allocated, and not a byte more, are marked as accessible.
-      Upon freeing the area the A bits are changed to indicate
-      inaccessibility.</li><br>
-      <p>
-
-  <li>When the stack pointer register (%esp) moves up or down, A bits
-      are set.  The rule is that the area from %esp up to the base of
-      the stack is marked as accessible, and below %esp is
-      inaccessible.  (If that sounds illogical, bear in mind that the
-      stack grows down, not up, on almost all Unix systems, including
-      GNU/Linux.)  Tracking %esp like this has the useful side-effect
-      that the section of stack used by a function for local variables
-      etc is automatically marked accessible on function entry and
-      inaccessible on exit.</li><br>
-      <p>
-
-  <li>When doing system calls, A bits are changed appropriately.  For
-      example, mmap() magically makes files appear in the process's
-      address space, so the A bits must be updated if mmap()
-      succeeds.</li><br>
-      <p>
-
-  <li>Optionally, your program can tell Valgrind about such changes
-      explicitly, using the client request mechanism described above.
-</ul>
-
-
-<a name="together"></a>
-<h3>3.3&nbsp; Putting it all together</h3>
-Valgrind's checking machinery can be summarised as follows:
-
-<ul>
-  <li>Each byte in memory has 8 associated V (valid-value) bits,
-      saying whether or not the byte has a defined value, and a single
-      A (valid-address) bit, saying whether or not the program
-      currently has the right to read/write that address.</li><br>
-      <p>
-
-  <li>When memory is read or written, the relevant A bits are
-      consulted.  If they indicate an invalid address, Valgrind emits
-      an Invalid read or Invalid write error.</li><br>
-      <p>
-
-  <li>When memory is read into the CPU's integer registers, the
-      relevant V bits are fetched from memory and stored in the
-      simulated CPU.  They are not consulted.</li><br>
-      <p>
-
-  <li>When an integer register is written out to memory, the V bits
-      for that register are written back to memory too.</li><br>
-      <p>
-
-  <li>When memory is read into the CPU's floating point registers, the
-      relevant V bits are read from memory and they are immediately
-      checked.  If any are invalid, an uninitialised value error is
-      emitted.  This precludes using the floating-point registers to
-      copy possibly-uninitialised memory, but simplifies Valgrind in
-      that it does not have to track the validity status of the
-      floating-point registers.</li><br>
-      <p>
-
-  <li>As a result, when a floating-point register is written to
-      memory, the associated V bits are set to indicate a valid
-      value.</li><br>
-      <p>
-
-  <li>When values in integer CPU registers are used to generate a
-      memory address, or to determine the outcome of a conditional
-      branch, the V bits for those values are checked, and an error
-      emitted if any of them are undefined.</li><br>
-      <p>
-
-  <li>When values in integer CPU registers are used for any other
-      purpose, Valgrind computes the V bits for the result, but does
-      not check them.</li><br>
-      <p>
-
-  <li>Once the V bits for a value in the CPU have been checked, they
-      are then set to indicate validity.  This avoids long chains of
-      errors.</li><br>
-      <p>
-
-  <li>When values are loaded from memory, valgrind checks the A bits
-      for that location and issues an illegal-address warning if
-      needed.  In that case, the V bits loaded are forced to indicate
-      Valid, despite the location being invalid.
-      <p>
-      This apparently strange choice reduces the amount of confusing
-      information presented to the user.  It avoids the
-      unpleasant phenomenon in which memory is read from a place which
-      is both unaddressible and contains invalid values, and, as a
-      result, you get not only an invalid-address (read/write) error,
-      but also a potentially large set of uninitialised-value errors,
-      one for every time the value is used.
-      <p>
-      There is a hazy boundary case to do with multi-byte loads from
-      addresses which are partially valid and partially invalid.  See
-      the description of the flag <code>--partial-loads-ok</code> for details.
-      </li><br>
-</ul>
-
-Valgrind intercepts calls to malloc, calloc, realloc, valloc,
-memalign, free, new and delete.  The behaviour you get is:
-
-<ul>
-
-  <li>malloc/new: the returned memory is marked as addressible but not
-      having valid values.  This means you have to write on it before
-      you can read it.</li><br>
-      <p>
-
-  <li>calloc: returned memory is marked both addressible and valid,
-      since calloc() clears the area to zero.</li><br>
-      <p>
-
-  <li>realloc: if the new size is larger than the old, the new section
-      is addressible but invalid, as with malloc.</li><br>
-      <p>
-
-  <li>If the new size is smaller, the dropped-off section is marked as
-      unaddressible.  You may only pass to realloc a pointer
-      previously issued to you by malloc/calloc/new/realloc.</li><br>
-      <p>
-
-  <li>free/delete: you may only pass to free a pointer previously
-      issued to you by malloc/calloc/new/realloc, or the value
-      NULL. Otherwise, Valgrind complains.  If the pointer is indeed
-      valid, Valgrind marks the entire area it points at as
-      unaddressible, and places the block in the freed-blocks-queue.
-      The aim is to defer as long as possible reallocation of this
-      block.  Until that happens, all attempts to access it will
-      elicit an invalid-address error, as you would hope.</li><br>
-</ul>
-
-
-
-<a name="signals"></a>
-<h3>3.4&nbsp; Signals</h3>
-
-Valgrind provides suitable handling of signals, so, provided you stick
-to POSIX stuff, you should be ok.  Basic sigaction() and sigprocmask()
-are handled.  Signal handlers may return in the normal way or do
-longjmp(); both should work ok.  As specified by POSIX, a signal is
-blocked in its own handler.  Default actions for signals should work
-as before.  Etc, etc.
-
-<p>Under the hood, dealing with signals is a real pain, and Valgrind's
-simulation leaves much to be desired.  If your program does
-way-strange stuff with signals, bad things may happen.  If so, let me
-know.  I don't promise to fix it, but I'd at least like to be aware of
-it.
-
-
-<a name="leaks"></a>
-<h3>3.5&nbsp; Memory leak detection</h3>
-
-Valgrind keeps track of all memory blocks issued in response to calls
-to malloc/calloc/realloc/new.  So when the program exits, it knows
-which blocks are still outstanding -- have not been returned, in other
-words.  Ideally, you want your program to have no blocks still in use
-at exit.  But many programs do.
-
-<p>For each such block, Valgrind scans the entire address space of the
-process, looking for pointers to the block.  One of three situations
-may result:
-
-<ul>
-  <li>A pointer to the start of the block is found.  This usually
-      indicates programming sloppiness; since the block is still
-      pointed at, the programmer could, at least in principle, have free'd
-      it before program exit.</li><br>
-      <p>
-
-  <li>A pointer to the interior of the block is found.  The pointer
-      might originally have pointed to the start and have been moved
-      along, or it might be entirely unrelated.  Valgrind deems such a
-      block as "dubious", that is, possibly leaked,
-      because it's unclear whether or
-      not a pointer to it still exists.</li><br>
-      <p>
-
-  <li>The worst outcome is that no pointer to the block can be found.
-      The block is classified as "leaked", because the
-      programmer could not possibly have free'd it at program exit,
-      since no pointer to it exists.  This might be a symptom of
-      having lost the pointer at some earlier point in the
-      program.</li>
-</ul>
-
-Valgrind reports summaries about leaked and dubious blocks.
-For each such block, it will also tell you where the block was
-allocated.  This should help you figure out why the pointer to it has
-been lost.  In general, you should attempt to ensure your programs do
-not have any leaked or dubious blocks at exit.
-
-<p>The precise area of memory in which Valgrind searches for pointers
-is: all naturally-aligned 4-byte words for which all A bits indicate
-addressibility and all V bits indicate that the stored value is
-actually valid.
-
-<p><hr width="100%">
-
-
-<a name="limits"></a>
-<h2>4&nbsp; Limitations</h2>
-
-The following list of limitations seems depressingly long.  However,
-most programs actually work fine.
-
-<p>Valgrind will run x86-GNU/Linux ELF dynamically linked binaries, on
-a kernel 2.2.X or 2.4.X system, subject to the following constraints:
-
-<ul>
-  <li>No MMX, SSE, SSE2, 3DNow instructions.  If the translator
-      encounters these, Valgrind will simply give up.  It may be
-      possible to add support for them at a later time. Intel added a
-      few instructions such as "cmov" to the integer instruction set
-      on Pentium and later processors, and these are supported.
-      Nevertheless it's safest to think of Valgrind as implementing
-      the 486 instruction set.</li><br>
-      <p>
-
-  <li>Pthreads support is improving, but there are still significant
-      limitations in that department.  See the section above on
-      Pthreads.  Note that your program must be dynamically linked
-      against <code>libpthread.so</code>, so that Valgrind can
-      substitute its own implementation at program startup time.  If
-      you're statically linked against it, things will fail
-      badly.</li><br>
-      <p>
-
-  <li>Valgrind assumes that the floating point registers are not used
-      as intermediaries in memory-to-memory copies, so it immediately
-      checks V bits in floating-point loads/stores.  If you want to
-      write code which copies around possibly-uninitialised values,
-      you must ensure these travel through the integer registers, not
-      the FPU.</li><br>
-      <p>
-
-  <li>If your program does its own memory management, rather than
-      using malloc/new/free/delete, it should still work, but
-      Valgrind's error checking won't be so effective.</li><br>
-      <p>
-
-  <li>Valgrind's signal simulation is not as robust as it could be.
-      Basic POSIX-compliant sigaction and sigprocmask functionality is
-      supplied, but it's conceivable that things could go badly awry
-      if you do weird things with signals.  Workaround: don't.
-      Programs that do non-POSIX signal tricks are in any case
-      inherently unportable, so should be avoided if
-      possible.</li><br>
-      <p>
-
-  <li>Programs which switch stacks are not well handled.  Valgrind
-      does have support for this, but I don't have great faith in it.
-      It's difficult -- there's no cast-iron way to decide whether a
-      large change in %esp is as a result of the program switching
-      stacks, or merely allocating a large object temporarily on the
-      current stack -- yet Valgrind needs to handle the two situations
-      differently.  1 May 02: this probably interacts badly with the
-      new pthread support.  I haven't checked properly.</li><br>
-      <p>
-
-  <li>x86 instructions, and system calls, have been implemented on
-      demand.  So it's possible, although unlikely, that a program
-      will fall over with a message to that effect.  If this happens,
-      please mail me ALL the details printed out, so I can try and
-      implement the missing feature.</li><br>
-      <p>
-
-  <li>x86 floating point works correctly, but floating-point code may
-      run even more slowly than integer code, due to my simplistic
-      approach to FPU emulation.</li><br>
-      <p>
-
-  <li>You can't Valgrind-ize statically linked binaries.  Valgrind
-      relies on the dynamic-link mechanism to gain control at
-      startup.</li><br>
-      <p>
-
-  <li>Memory consumption of your program is majorly increased whilst
-      running under Valgrind.  This is due to the large amount of
-      administrative information maintained behind the scenes.  Another
-      cause is that Valgrind dynamically translates the original
-      executable.  Translated, instrumented code is 14-16 times larger
-      than the original (!) so you can easily end up with 30+ MB of
-      translations when running (eg) a web browser.
-      </li>
-</ul>
-
-Programs which are known not to work are:
-
-<ul>
-  <li>emacs starts up but immediately concludes it is out of memory
-      and aborts.  Emacs has its own memory-management scheme, but I
-      don't understand why this should interact so badly with
-      Valgrind.  Emacs works fine if you build it to use the standard
-      malloc/free routines.</li><br>
-      <p>
-</ul>
-
-Known platform-specific limitations, as of release 1.0.0:
-
-<ul>
-  <li>On Red Hat 7.3, there have been reports of link errors (at
-      program start time) for threaded programs using
-      <code>__pthread_clock_gettime</code> and
-      <code>__pthread_clock_settime</code>.  This appears to be due to
-      <code>/lib/librt-2.2.5.so</code> needing them.  Unfortunately I
-      do not understand enough about this problem to fix it properly,
-      and I can't reproduce it on my test RedHat 7.3 system.  Please
-      mail me if you have more information / understanding.  </li><br>
-      <p>
-  <li>
-      1.0.0 now partially works on Red Hat 7.3.92 ("Limbo"
-      public beta).  However, don't expect a smooth ride.
-      Basically valgrind won't work as-is with any 
-      glibc-2.3 based system.  Limbo is just a little pre glibc-2.3 
-      and it just about works.  Limbo is also gcc-3.1 based and so
-      suffers from the problems in the following point.</li><br>
-      <p>
-  <li>
-      Inlining of string functions with gcc-3.1 or above causes a
-      large number of false reports of uninitialised value uses.  I
-      know what the problem is and roughly how to fix it, but I need
-      to devise a reasonably efficient fix.  Try to reduce the
-      optimisation level, or use <code>-fno-builtin-strlen</code> in
-      the meantime.  Or use an earlier gcc.</li><br>
-      <p>
-</ul>
-
-
-<p><hr width="100%">
-
-
-<a name="howitworks"></a>
-<h2>5&nbsp; How it works -- a rough overview</h2>
-Some gory details, for those with a passion for gory details.  You
-don't need to read this section if all you want to do is use Valgrind.
-
-<a name="startb"></a>
-<h3>5.1&nbsp; Getting started</h3>
-
-Valgrind is compiled into a shared object, valgrind.so.  The shell
-script valgrind sets the LD_PRELOAD environment variable to point to
-valgrind.so.  This causes the .so to be loaded as an extra library to
-any subsequently executed dynamically-linked ELF binary, viz, the
-program you want to debug.
-
-<p>The dynamic linker allows each .so in the process image to have an
-initialisation function which is run before main().  It also allows
-each .so to have a finalisation function run after main() exits.
-
-<p>When valgrind.so's initialisation function is called by the dynamic
-linker, the synthetic CPU starts up.  The real CPU remains locked
-in valgrind.so for the entire rest of the program, but the synthetic
-CPU returns from the initialisation function.  Startup of the program
-now continues as usual -- the dynamic linker calls all the other .so's
-initialisation routines, and eventually runs main().  This all runs on
-the synthetic CPU, not the real one, but the client program cannot
-tell the difference.
-
-<p>Eventually main() exits, so the synthetic CPU calls valgrind.so's
-finalisation function.  Valgrind detects this, and uses it as its cue
-to exit.  It prints summaries of all errors detected, possibly checks
-for memory leaks, and then exits the finalisation routine, but now on
-the real CPU.  The synthetic CPU has now lost control -- permanently
--- so the program exits back to the OS on the real CPU, just as it
-would have done anyway.
-
-<p>On entry, Valgrind switches stacks, so it runs on its own stack.
-On exit, it switches back.  This means that the client program
-continues to run on its own stack, so we can switch back and forth
-between running it on the simulated and real CPUs without difficulty.
-This was an important design decision, because it makes it easy (well,
-significantly less difficult) to debug the synthetic CPU.
-
-
-<a name="engine"></a>
-<h3>5.2&nbsp; The translation/instrumentation engine</h3>
-
-Valgrind does not directly run any of the original program's code.  Only
-instrumented translations are run.  Valgrind maintains a translation
-table, which allows it to find the translation quickly for any branch
-target (code address).  If no translation has yet been made, the
-translator - a just-in-time translator - is summoned.  This makes an
-instrumented translation, which is added to the collection of
-translations.  Subsequent jumps to that address will use this
-translation.
-
-<p>Valgrind no longer directly supports detection of self-modifying
-code.  Such checking is expensive, and in practice (fortunately)
-almost no applications need it.  However, to help people who are
-debugging dynamic code generation systems, there is a Client Request 
-(basically a macro you can put in your program) which directs Valgrind
-to discard translations in a given address range.  So Valgrind can
-still work in this situation provided the client tells it when
-code has become out-of-date and needs to be retranslated.
-
-<p>The JITter translates basic blocks -- blocks of straight-line-code
--- as single entities.  To minimise the considerable difficulties of
-dealing with the x86 instruction set, x86 instructions are first
-translated to a RISC-like intermediate code, similar to sparc code,
-but with an infinite number of virtual integer registers.  Initially
-each insn is translated separately, and there is no attempt at
-instrumentation.
-
-<p>The intermediate code is improved, mostly so as to try and cache
-the simulated machine's registers in the real machine's registers over
-several simulated instructions.  This is often very effective.  Also,
-we try to remove redundant updates of the simulated machine's
-condition-code register.
-
-<p>The intermediate code is then instrumented, giving more
-intermediate code.  There are a few extra intermediate-code operations
-to support instrumentation; it is all refreshingly simple.  After
-instrumentation there is a cleanup pass to remove redundant value
-checks.
-
-<p>This gives instrumented intermediate code which mentions arbitrary
-numbers of virtual registers.  A linear-scan register allocator is
-used to assign real registers and possibly generate spill code.  All
-of this is still phrased in terms of the intermediate code.  This
-machinery is inspired by the work of Reuben Thomas (MITE).
-
-<p>Then, and only then, is the final x86 code emitted.  The
-intermediate code is carefully designed so that x86 code can be
-generated from it without need for spare registers or other
-inconveniences.
-
-<p>The translations are managed using a traditional LRU-based caching
-scheme.  The translation cache has a default size of about 14MB.
-
-<a name="track"></a>
-
-<h3>5.3&nbsp; Tracking the status of memory</h3> Each byte in the
-process' address space has nine bits associated with it: one A bit and
-eight V bits.  The A and V bits for each byte are stored using a
-sparse array, which flexibly and efficiently covers arbitrary parts of
-the 32-bit address space without imposing significant space or
-performance overheads for the parts of the address space never
-visited.  The scheme used, and speedup hacks, are described in detail
-at the top of the source file vg_memory.c, so you should read that for
-the gory details.
-
-<a name="sys_calls"></a>
-
-<h3>5.4&nbsp; System calls</h3>
-All system calls are intercepted.  The memory status map is consulted
-before and updated after each call.  It's all rather tiresome.  See
-vg_syscall_mem.c for details.
-
-<a name="sys_signals"></a>
-
-<h3>5.5&nbsp; Signals</h3>
-All system calls to sigaction() and sigprocmask() are intercepted.  If
-the client program is trying to set a signal handler, Valgrind makes a
-note of the handler address and which signal it is for.  Valgrind then
-arranges for the same signal to be delivered to its own handler.
-
-<p>When such a signal arrives, Valgrind's own handler catches it, and
-notes the fact.  At a convenient safe point in execution, Valgrind
-builds a signal delivery frame on the client's stack and runs its
-handler.  If the handler longjmp()s, there is nothing more to be said.
-If the handler returns, Valgrind notices this, zaps the delivery
-frame, and carries on where it left off before delivering the signal.
-
-<p>The purpose of this nonsense is that setting signal handlers
-essentially amounts to giving callback addresses to the Linux kernel.
-We can't allow this to happen, because if it did, signal handlers
-would run on the real CPU, not the simulated one.  This means the
-checking machinery would not operate during the handler run, and,
-worse, memory permissions maps would not be updated, which could cause
-spurious error reports once the handler had returned.
-
-<p>An even worse thing would happen if the signal handler longjmp'd
-rather than returned: Valgrind would completely lose control of the
-client program.
-
-<p>Upshot: we can't allow the client to install signal handlers
-directly.  Instead, Valgrind must catch, on behalf of the client, any
-signal the client asks to catch, and must deliver it to the client on
-the simulated CPU, not the real one.  This involves considerable
-gruesome fakery; see vg_signals.c for details.
-<p>
-
-<hr width="100%">
-
-<a name="example"></a>
-<h2>6&nbsp; Example</h2>
-This is the log for a run of a small program. The program is in fact
-correct, and the reported error is as the result of a potentially serious
-code generation bug in GNU g++ (snapshot 20010527).
-<pre>
-sewardj@phoenix:~/newmat10$
-~/Valgrind-6/valgrind -v ./bogon 
-==25832== Valgrind 0.10, a memory error detector for x86 RedHat 7.1.
-==25832== Copyright (C) 2000-2001, and GNU GPL'd, by Julian Seward.
-==25832== Startup, with flags:
-==25832== --suppressions=/home/sewardj/Valgrind/redhat71.supp
-==25832== reading syms from /lib/ld-linux.so.2
-==25832== reading syms from /lib/libc.so.6
-==25832== reading syms from /mnt/pima/jrs/Inst/lib/libgcc_s.so.0
-==25832== reading syms from /lib/libm.so.6
-==25832== reading syms from /mnt/pima/jrs/Inst/lib/libstdc++.so.3
-==25832== reading syms from /home/sewardj/Valgrind/valgrind.so
-==25832== reading syms from /proc/self/exe
-==25832== loaded 5950 symbols, 142333 line number locations
-==25832== 
-==25832== Invalid read of size 4
-==25832==    at 0x8048724: _ZN10BandMatrix6ReSizeEiii (bogon.cpp:45)
-==25832==    by 0x80487AF: main (bogon.cpp:66)
-==25832==    by 0x40371E5E: __libc_start_main (libc-start.c:129)
-==25832==    by 0x80485D1: (within /home/sewardj/newmat10/bogon)
-==25832==    Address 0xBFFFF74C is not stack'd, malloc'd or free'd
-==25832==
-==25832== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
-==25832== malloc/free: in use at exit: 0 bytes in 0 blocks.
-==25832== malloc/free: 0 allocs, 0 frees, 0 bytes allocated.
-==25832== For a detailed leak analysis, rerun with: --leak-check=yes
-==25832==
-==25832== exiting, did 1881 basic blocks, 0 misses.
-==25832== 223 translations, 3626 bytes in, 56801 bytes out.
-</pre>
-<p>The GCC folks fixed this about a week before gcc-3.0 shipped.
-<hr width="100%">
-<p>
-
-
-
-<a name="cache"></a>
-<h2>7&nbsp; Cache profiling</h2>
-As well as memory debugging, Valgrind also allows you to do cache simulations
-and annotate your source line-by-line with the number of cache misses.  In
-particular, it records:
-<ul>
-  <li>L1 instruction cache reads and misses;
-  <li>L1 data cache reads and read misses, writes and write misses;
-  <li>L2 unified cache reads and read misses, writes and write misses.
-</ul>
-On a modern x86 machine, an L1 miss will typically cost around 10 cycles,
-and an L2 miss can cost as much as 200 cycles. Detailed cache profiling can be
-very useful for improving the performance of your program.<p>
-
-Also, since one instruction cache read is performed per instruction executed,
-you can find out how many instructions are executed per line, which can be
-useful for traditional profiling and test coverage.<p>
-
-Any feedback, bug-fixes, suggestions, etc, welcome.
-
-
-<h3>7.1&nbsp; Overview</h3>
-First off, as for normal Valgrind use, you probably want to turn on debugging
-info (the <code>-g</code> flag).  But by contrast with normal Valgrind use, you
-probably <b>do</b> want to turn optimisation on, since you should profile your
-program as it will be normally run.
-
-The two steps are:
-<ol>
-  <li>Run your program with <code>cachegrind</code> in front of the
-      normal command line invocation.  When the program finishes,
-      Valgrind will print summary cache statistics. It also collects
-      line-by-line information in a file <code>cachegrind.out</code>.
-      <p>
-      This step should be done every time you want to collect
-      information about a new program, a changed program, or about the
-      same program with different input.
-  </li>
-  <p>
-  <li>Generate a function-by-function summary, and possibly annotate
-      source files with 'vg_annotate'. Source files to annotate can be
-      specified manually on the command line, or
-      "interesting" source files can be annotated automatically with
-      the <code>--auto=yes</code> option.  You can annotate C/C++
-      files or assembly language files equally easily.
-      <p>
-      This step can be performed as many times as you like for each
-      Step 1.  You may want to do multiple annotations showing
-      different information each time.<p>
-  </li>
-</ol>
-
-The steps are described in detail in the following sections.<p>
-
-
-<h3>7.2&nbsp; Cache simulation specifics</h3>
-
-Cachegrind uses a simulation for a machine with a split L1 cache and a unified
-L2 cache.  This configuration is used for all (modern) x86-based machines we
-are aware of.  Old Cyrix CPUs had a unified I and D L1 cache, but they are
-ancient history now.<p>
-
-The more specific characteristics of the simulation are as follows.
-
-<ul>
-  <li>Write-allocate: when a write miss occurs, the block written to
-      is brought into the D1 cache.  Most modern caches have this
-      property.</li><p>
-
-  <li>Bit-selection hash function: the line(s) in the cache to which a
-      memory block maps is chosen by the middle bits M--(M+N-1) of the
-      byte address, where:
-      <ul>
-        <li>&nbsp;line size = 2^M bytes&nbsp;</li>
-        <li>(cache size / line size) = 2^N bytes</li>
-      </ul> </li><p>
-
-  <li>Inclusive L2 cache: the L2 cache replicates all the entries of
-      the L1 cache.  This is standard on Pentium chips, but AMD
-      Athlons use an exclusive L2 cache that only holds blocks evicted
-      from L1.  Ditto AMD Durons and most modern VIAs.</li><p>
-</ul>
-
-The cache configuration simulated (cache size, associativity and line size) is
-determined automagically using the CPUID instruction.  If you have an old
-machine that (a) doesn't support the CPUID instruction, or (b) supports it in
-an early incarnation that doesn't give any cache information, then Cachegrind
-will fall back to using a default configuration (that of a model 3/4 Athlon).
-Cachegrind will tell you if this happens.  You can manually specify one, two or
-all three levels (I1/D1/L2) of the cache from the command line using the
-<code>--I1</code>, <code>--D1</code> and <code>--L2</code> options.<p>
-
-Other noteworthy behaviour:
-
-<ul>
-  <li>References that straddle two cache lines are treated as follows:
-  <ul>
-    <li>If both blocks hit --&gt; counted as one hit</li>
-    <li>If one block hits, the other misses --&gt; counted as one miss</li>
-    <li>If both blocks miss --&gt; counted as one miss (not two)</li>
-  </ul><p></li>
-
-  <li>Instructions that modify a memory location (eg. <code>inc</code> and
-      <code>dec</code>) are counted as doing just a read, ie. a single data
-      reference.  This may seem strange, but since the write can never cause a
-      miss (the read guarantees the block is in the cache) it's not very
-      interesting.<p>
-
-      Thus it measures not the number of times the data cache is accessed, but
-      the number of times a data cache miss could occur.<p>
-      </li>
-</ul>
-
-If you are interested in simulating a cache with different properties, it is
-not particularly hard to write your own cache simulator, or to modify the
-existing ones in <code>vg_cachesim_I1.c</code>, <code>vg_cachesim_D1.c</code>,
-<code>vg_cachesim_L2.c</code> and <code>vg_cachesim_gen.c</code>.  We'd be
-interested to hear from anyone who does.
-
-<a name="profile"></a>
-<h3>7.3&nbsp; Profiling programs</h3>
-
-Cache profiling is enabled by using the <code>--cachesim=yes</code>
-option to the <code>valgrind</code> shell script.  Alternatively, it
-is probably more convenient to use the <code>cachegrind</code> script.
-Either way automatically turns off Valgrind's memory checking functions,
-since the cache simulation is slow enough already, and you probably
-don't want to do both at once.
-<p>
-To gather cache profiling information about the program <code>ls
--l</code>, type:
-
-<blockquote><code>cachegrind ls -l</code></blockquote>
-
-The program will execute (slowly).  Upon completion, summary statistics
-that look like this will be printed:
-
-<pre>
-==31751== I   refs:      27,742,716
-==31751== I1  misses:           276
-==31751== L2  misses:           275
-==31751== I1  miss rate:        0.0%
-==31751== L2i miss rate:        0.0%
-==31751== 
-==31751== D   refs:      15,430,290  (10,955,517 rd + 4,474,773 wr)
-==31751== D1  misses:        41,185  (    21,905 rd +    19,280 wr)
-==31751== L2  misses:        23,085  (     3,987 rd +    19,098 wr)
-==31751== D1  miss rate:        0.2% (       0.1%   +       0.4%)
-==31751== L2d miss rate:        0.1% (       0.0%   +       0.4%)
-==31751== 
-==31751== L2 misses:         23,360  (     4,262 rd +    19,098 wr)
-==31751== L2 miss rate:         0.0% (       0.0%   +       0.4%)
-</pre>
-
-Cache accesses for instruction fetches are summarised first, giving the
-number of fetches made (this is the number of instructions executed, which
-can be useful to know in its own right), the number of I1 misses, and the
-number of L2 instruction (<code>L2i</code>) misses.<p>
-
-Cache accesses for data follow. The information is similar to that of the
-instruction fetches, except that the values are also shown split between reads
-and writes (note each row's <code>rd</code> and <code>wr</code> values add up
-to the row's total).<p>
-
-Combined instruction and data figures for the L2 cache follow that.<p>
-
-
-<h3>7.4&nbsp; Output file</h3>
-
-As well as printing summary information, Cachegrind also writes
-line-by-line cache profiling information to a file named
-<code>cachegrind.out</code>.  This file is human-readable, but is best
-interpreted by the accompanying program <code>vg_annotate</code>,
-described in the next section.
-<p>
-Things to note about the <code>cachegrind.out</code> file:
-<ul>
-  <li>It is written every time <code>valgrind --cachesim=yes</code> or
-      <code>cachegrind</code> is run, and will overwrite any existing
-      <code>cachegrind.out</code> in the current directory.</li>
-  <p>
-  <li>It can be huge: <code>ls -l</code> generates a file of about
-      350KB.  Browsing a few files and web pages with a Konqueror
-      built with full debugging information generates a file
-      of around 15 MB.</li>
-</ul>
-
-<a name="profileflags"></a>
-<h3>7.5&nbsp; Cachegrind options</h3>
-Cachegrind accepts all the options that Valgrind does, although some of them
-(ones related to memory checking) don't do anything when cache profiling.<p>
-
-The interesting cache-simulation specific options are:
-
-<ul>
-  <li><code>--I1=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><br>
-      <code>--D1=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><br> 
-      <code>--L2=&lt;size&gt;,&lt;associativity&gt;,&lt;line_size&gt;</code><p> 
-      [default: uses CPUID for automagic cache configuration]<p>
-
-      Manually specifies the I1/D1/L2 cache configuration, where
-      <code>size</code> and <code>line_size</code> are measured in bytes.  The
-      three items must be comma-separated, but with no spaces, eg:
-
-      <blockquote><code>cachegrind --I1=65536,2,64</code></blockquote>
-
-      You can specify one, two or three of the I1/D1/L2 caches.  Any level not
-      manually specified will be simulated using the configuration found in the
-      normal way (via the CPUID instruction, or failing that, via defaults).
-</ul>
-
-  
-<a name="annotate"></a>
-<h3>7.6&nbsp; Annotating C/C++ programs</h3>
-
-Before using <code>vg_annotate</code>, it is worth widening your
-window to be at least 120-characters wide if possible, as the output
-lines can be quite long.
-<p>
-To get a function-by-function summary, run <code>vg_annotate</code> in
-a directory containing a <code>cachegrind.out</code> file.  The output
-looks like this:
-
-<pre>
---------------------------------------------------------------------------------
-I1 cache:              65536 B, 64 B, 2-way associative
-D1 cache:              65536 B, 64 B, 2-way associative
-L2 cache:              262144 B, 64 B, 8-way associative
-Command:               concord vg_to_ucode.c
-Events recorded:       Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Events shown:          Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Event sort order:      Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Threshold:             99%
-Chosen for annotation:
-Auto-annotation:       on
-
---------------------------------------------------------------------------------
-Ir         I1mr I2mr Dr         D1mr   D2mr  Dw        D1mw   D2mw
---------------------------------------------------------------------------------
-27,742,716  276  275 10,955,517 21,905 3,987 4,474,773 19,280 19,098  PROGRAM TOTALS
-
---------------------------------------------------------------------------------
-Ir        I1mr I2mr Dr        D1mr  D2mr  Dw        D1mw   D2mw    file:function
---------------------------------------------------------------------------------
-8,821,482    5    5 2,242,702 1,621    73 1,794,230      0      0  getc.c:_IO_getc
-5,222,023    4    4 2,276,334    16    12   875,959      1      1  concord.c:get_word
-2,649,248    2    2 1,344,810 7,326 1,385         .      .      .  vg_main.c:strcmp
-2,521,927    2    2   591,215     0     0   179,398      0      0  concord.c:hash
-2,242,740    2    2 1,046,612   568    22   448,548      0      0  ctype.c:tolower
-1,496,937    4    4   630,874 9,000 1,400   279,388      0      0  concord.c:insert
-  897,991   51   51   897,831    95    30        62      1      1  ???:???
-  598,068    1    1   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__flockfile
-  598,068    0    0   299,034     0     0   149,517      0      0  ../sysdeps/generic/lockfile.c:__funlockfile
-  598,024    4    4   213,580    35    16   149,506      0      0  vg_clientmalloc.c:malloc
-  446,587    1    1   215,973 2,167   430   129,948 14,057 13,957  concord.c:add_existing
-  341,760    2    2   128,160     0     0   128,160      0      0  vg_clientmalloc.c:vg_trap_here_WRAPPER
-  320,782    4    4   150,711   276     0    56,027     53     53  concord.c:init_hash_table
-  298,998    1    1   106,785     0     0    64,071      1      1  concord.c:create
-  149,518    0    0   149,516     0     0         1      0      0  ???:tolower@@GLIBC_2.0
-  149,518    0    0   149,516     0     0         1      0      0  ???:fgetc@@GLIBC_2.0
-   95,983    4    4    38,031     0     0    34,409  3,152  3,150  concord.c:new_word_node
-   85,440    0    0    42,720     0     0    21,360      0      0  vg_clientmalloc.c:vg_bogus_epilogue
-</pre>
-
-First up is a summary of the annotation options:
-                    
-<ul>
-  <li>I1 cache, D1 cache, L2 cache: cache configuration.  So you know the
-      configuration with which these results were obtained.</li><p>
-
-  <li>Command: the command line invocation of the program under
-      examination.</li><p>
-
-  <li>Events recorded: event abbreviations are:<p>
-  <ul>
-    <li><code>Ir  </code>:  I cache reads (ie. instructions executed)</li>
-    <li><code>I1mr</code>: I1 cache read misses</li>
-    <li><code>I2mr</code>: L2 cache instruction read misses</li>
-    <li><code>Dr  </code>:  D cache reads (ie. memory reads)</li>
-    <li><code>D1mr</code>: D1 cache read misses</li>
-    <li><code>D2mr</code>: L2 cache data read misses</li>
-    <li><code>Dw  </code>:  D cache writes (ie. memory writes)</li>
-    <li><code>D1mw</code>: D1 cache write misses</li>
-    <li><code>D2mw</code>: L2 cache data write misses</li>
-  </ul><p>
-      Note that D1 total misses is given by <code>D1mr</code> +
-      <code>D1mw</code>, and that L2 total misses is given by
-      <code>I2mr</code> + <code>D2mr</code> + <code>D2mw</code>.</li><p>
-
-  <li>Events shown: the events shown (a subset of events gathered).  This can
-      be adjusted with the <code>--show</code> option.</li><p>
-
-  <li>Event sort order: the sort order in which functions are shown.  For
-      example, in this case the functions are sorted from highest
-      <code>Ir</code> counts to lowest.  If two functions have identical
-      <code>Ir</code> counts, they will then be sorted by <code>I1mr</code>
-      counts, and so on.  This order can be adjusted with the
-      <code>--sort</code> option.<p>
-
-      Note that this dictates the order the functions appear.  It is <b>not</b>
-      the order in which the columns appear;  that is dictated by the "events
-      shown" line (and can be changed with the <code>--show</code> option).
-      </li><p>
-
-  <li>Threshold: <code>vg_annotate</code> by default omits functions
-      that cause very low numbers of misses to avoid drowning you in
-      information.  In this case, vg_annotate shows summaries for the
-      functions that account for 99% of the <code>Ir</code> counts;
-      <code>Ir</code> is chosen as the threshold event since it is the
-      primary sort event.  The threshold can be adjusted with the
-      <code>--threshold</code> option.</li><p>
-
-  <li>Chosen for annotation: names of files specified manually for annotation; 
-      in this case none.</li><p>
-
-  <li>Auto-annotation: whether auto-annotation was requested via the 
-      <code>--auto=yes</code> option. In this case no.</li><p>
-</ul>
-
-Then follows summary statistics for the whole program. These are similar
-to the summary provided when running <code>cachegrind</code>.<p>
-  
-Then follows function-by-function statistics. Each function is
-identified by a <code>file_name:function_name</code> pair. If a column
-contains only a dot it means the function never performs
-that event (eg. the third row shows that <code>strcmp()</code>
-contains no instructions that write to memory). The name
-<code>???</code> is used if the file name and/or function name
-could not be determined from debugging information. If most of the
-entries have the form <code>???:???</code> the program probably wasn't
-compiled with <code>-g</code>.  If any code was invalidated (either due to
-self-modifying code or unloading of shared objects) its counts are aggregated
-into a single cost centre written as <code>(discarded):(discarded)</code>.<p>
-
-It is worth noting that functions will come from three types of source files:
-<ol>
-  <li> From the profiled program (<code>concord.c</code> in this example).</li>
-  <li>From libraries (eg. <code>getc.c</code>)</li>
-  <li>From Valgrind's implementation of some libc functions (eg.
-      <code>vg_clientmalloc.c:malloc</code>).  These are recognisable because
-      the filename begins with <code>vg_</code>, and is probably one of
-      <code>vg_main.c</code>, <code>vg_clientmalloc.c</code> or
-      <code>vg_mylibc.c</code>.
-  </li>
-</ol>
-
-There are two ways to annotate source files -- by choosing them
-manually, or with the <code>--auto=yes</code> option. To do it
-manually, just specify the filenames as arguments to
-<code>vg_annotate</code>. For example, the output from running
-<code>vg_annotate concord.c</code> for our example produces the same
-output as above followed by an annotated version of
-<code>concord.c</code>, a section of which looks like:
-
-<pre>
---------------------------------------------------------------------------------
--- User-annotated source: concord.c
---------------------------------------------------------------------------------
-Ir        I1mr I2mr Dr      D1mr  D2mr  Dw      D1mw   D2mw
-
-[snip]
-
-        .    .    .       .     .     .       .      .      .  void init_hash_table(char *file_name, Word_Node *table[])
-        3    1    1       .     .     .       1      0      0  {
-        .    .    .       .     .     .       .      .      .      FILE *file_ptr;
-        .    .    .       .     .     .       .      .      .      Word_Info *data;
-        1    0    0       .     .     .       1      1      1      int line = 1, i;
-        .    .    .       .     .     .       .      .      .
-        5    0    0       .     .     .       3      0      0      data = (Word_Info *) create(sizeof(Word_Info));
-        .    .    .       .     .     .       .      .      .
-    4,991    0    0   1,995     0     0     998      0      0      for (i = 0; i < TABLE_SIZE; i++)
-    3,988    1    1   1,994     0     0     997     53     52          table[i] = NULL;
-        .    .    .       .     .     .       .      .      .
-        .    .    .       .     .     .       .      .      .      /* Open file, check it. */
-        6    0    0       1     0     0       4      0      0      file_ptr = fopen(file_name, "r");
-        2    0    0       1     0     0       .      .      .      if (!(file_ptr)) {
-        .    .    .       .     .     .       .      .      .          fprintf(stderr, "Couldn't open '%s'.\n", file_name);
-        1    1    1       .     .     .       .      .      .          exit(EXIT_FAILURE);
-        .    .    .       .     .     .       .      .      .      }
-        .    .    .       .     .     .       .      .      .
-  165,062    1    1  73,360     0     0  91,700      0      0      while ((line = get_word(data, line, file_ptr)) != EOF)
-  146,712    0    0  73,356     0     0  73,356      0      0          insert(data->word, data->line, table);
-        .    .    .       .     .     .       .      .      .
-        4    0    0       1     0     0       2      0      0      free(data);
-        4    0    0       1     0     0       2      0      0      fclose(file_ptr);
-        3    0    0       2     0     0       .      .      .  }
-</pre>
-
-(Although column widths are automatically minimised, a wide terminal is clearly
-useful.)<p>
-  
-Each source file is clearly marked (<code>User-annotated source</code>) as
-having been chosen manually for annotation.  If the file was found in one of
-the directories specified with the <code>-I</code>/<code>--include</code>
-option, the directory and file are both given.<p>
-
-Each line is annotated with its event counts.  Events not applicable for a line
-are represented by a `.';  this is useful for distinguishing between an event
-which cannot happen, and one which can but did not.<p> 
-
-Sometimes only a small section of a source file is executed.  To minimise
-uninteresting output, Valgrind only shows annotated lines and lines within a
-small distance of annotated lines.  Gaps are marked with the line numbers so
-you know which part of a file the shown code comes from, eg:
-
-<pre>
-(figures and code for line 704)
--- line 704 ----------------------------------------
--- line 878 ----------------------------------------
-(figures and code for line 878)
-</pre>
-
-The amount of context to show around annotated lines is controlled by the
-<code>--context</code> option.<p>
-
-To get automatic annotation, run <code>vg_annotate --auto=yes</code>.
-vg_annotate will automatically annotate every source file it can find that is
-mentioned in the function-by-function summary.  Therefore, the files chosen for
-auto-annotation  are affected by the <code>--sort</code> and
-<code>--threshold</code> options.  Each source file is clearly marked
-(<code>Auto-annotated source</code>) as being chosen automatically.  Any files
-that could not be found are mentioned at the end of the output, eg:    
-
-<pre>
---------------------------------------------------------------------------------
-The following files chosen for auto-annotation could not be found:
---------------------------------------------------------------------------------
-  getc.c
-  ctype.c
-  ../sysdeps/generic/lockfile.c
-</pre>
-
-This is quite common for library files, since libraries are usually compiled
-with debugging information, but the source files are often not present on a
-system.  If a file is chosen for annotation <b>both</b> manually and
-automatically, it is marked as <code>User-annotated source</code>.
-
-Use the <code>-I/--include</code> option to tell Valgrind where to look for
-source files if the filenames found from the debugging information aren't
-specific enough.
-
-Beware that vg_annotate can take some time to digest large
-<code>cachegrind.out</code> files, eg. 30 seconds or more.  Also beware that
-auto-annotation can produce a lot of output if your program is large!
-
-
-<h3>7.7&nbsp; Annotating assembler programs</h3>
-
-Valgrind can annotate assembler programs too, or annotate the
-assembler generated for your C program.  Sometimes this is useful for
-understanding what is really happening when an interesting line of C
-code is translated into multiple instructions.<p>
-
-To do this, you just need to assemble your <code>.s</code> files with
-assembler-level debug information.  gcc doesn't do this, but you can
-use the GNU assembler with the <code>--gstabs</code> option to
-generate object files with this information, eg:
-
-<blockquote><code>as --gstabs foo.s</code></blockquote>
-
-You can then profile and annotate source files in the same way as for C/C++
-programs.
-
-
-<h3>7.8&nbsp; <code>vg_annotate</code> options</h3>
-<ul>
-  <li><code>-h, --help</code></li><p>
-  <li><code>-v, --version</code><p>
-
-      Help and version, as usual.</li>
-
-  <li><code>--sort=A,B,C</code> [default: order in 
-      <code>cachegrind.out</code>]<p>
-      Specifies the events upon which the sorting of the function-by-function
-      entries will be based.  Useful if you want to concentrate on eg. I cache
-      misses (<code>--sort=I1mr,I2mr</code>), or D cache misses
-      (<code>--sort=D1mr,D2mr</code>), or L2 misses
-      (<code>--sort=D2mr,I2mr</code>).</li><p>
-
-  <li><code>--show=A,B,C</code> [default: all, using order in
-      <code>cachegrind.out</code>]<p>
-      Specifies which events to show (and the column order). Default is to use
-      all present in the <code>cachegrind.out</code> file (and use the order in
-      the file).</li><p>
-
-  <li><code>--threshold=X</code> [default: 99%] <p>
-      Sets the threshold for the function-by-function summary.  Functions are
-      shown that account for more than X% of the primary sort event.  If
-      auto-annotating, also affects which files are annotated.
-      
-      Note: thresholds can be set for more than one of the events by appending
-      any events for the <code>--sort</code> option with a colon and a number
-      (no spaces, though).  E.g. if you want to see the functions that cover
-      99% of L2 read misses and 99% of L2 write misses, use this option:
-      
-      <blockquote><code>--sort=D2mr:99,D2mw:99</code></blockquote>
-      </li><p>
-
-  <li><code>--auto=no</code> [default]<br>
-      <code>--auto=yes</code> <p>
-      When enabled, automatically annotates every file that is mentioned in the
-      function-by-function summary that can be found.  Also gives a list of
-      those that couldn't be found.
-
-  <li><code>--context=N</code> [default: 8]<p>
-      Print N lines of context before and after each annotated line.  Avoids
-      printing large sections of source files that were not executed.  Use a 
-      large number (eg. 10,000) to show all source lines.
-      </li><p>
-
-  <li><code>-I=&lt;dir&gt;, --include=&lt;dir&gt;</code> 
-      [default: empty string]<p>
-      Adds a directory to the list in which to search for files.  Multiple
-      -I/--include options can be given to add multiple directories.
-</ul>
-  
-
-<h3>7.9&nbsp; Warnings</h3>
-There are a couple of situations in which vg_annotate issues warnings.
-
-<ul>
-  <li>If a source file is more recent than the <code>cachegrind.out</code>
-      file.  This is because the information in <code>cachegrind.out</code> is
-      only recorded with line numbers, so if the line numbers change at all in
-      the source (eg. lines added, deleted, swapped), any annotations will be 
-      incorrect.<p>
-
-  <li>If information is recorded about line numbers past the end of a file.
-      This can be caused by the above problem, ie. shortening the source file
-      while using an old <code>cachegrind.out</code> file.  If this happens,
-      the figures for the bogus lines are printed anyway (clearly marked as
-      bogus) in case they are important.</li><p>
-</ul>
-
-
-<h3>7.10&nbsp; Things to watch out for</h3>
-Some odd things that can occur during annotation:
-
-<ul>
-  <li>If annotating at the assembler level, you might see something like this:
-
-      <pre>
-      1    0    0  .    .    .  .    .    .          leal -12(%ebp),%eax
-      1    0    0  .    .    .  1    0    0          movl %eax,84(%ebx)
-      2    0    0  0    0    0  1    0    0          movl $1,-20(%ebp)
-      .    .    .  .    .    .  .    .    .          .align 4,0x90
-      1    0    0  .    .    .  .    .    .          movl $.LnrB,%eax
-      1    0    0  .    .    .  1    0    0          movl %eax,-16(%ebp)
-      </pre>
-
-      How can the third instruction be executed twice when the others are
-      executed only once?  As it turns out, it isn't.  Here's a dump of the
-      executable, using <code>objdump -d</code>:
-
-      <pre>
-      8048f25:       8d 45 f4                lea    0xfffffff4(%ebp),%eax
-      8048f28:       89 43 54                mov    %eax,0x54(%ebx)
-      8048f2b:       c7 45 ec 01 00 00 00    movl   $0x1,0xffffffec(%ebp)
-      8048f32:       89 f6                   mov    %esi,%esi
-      8048f34:       b8 08 8b 07 08          mov    $0x8078b08,%eax
-      8048f39:       89 45 f0                mov    %eax,0xfffffff0(%ebp)
-      </pre>
-
-      Notice the extra <code>mov %esi,%esi</code> instruction.  Where did this
-      come from?  The GNU assembler inserted it to serve as the two bytes of
-      padding needed to align the <code>movl $.LnrB,%eax</code> instruction on
-      a four-byte boundary, but pretended it didn't exist when adding debug
-      information.  Thus when Valgrind reads the debug info it thinks that the
-      <code>movl $0x1,0xffffffec(%ebp)</code> instruction covers the address
-      range 0x8048f2b--0x8048f33 by itself, and attributes the counts for the
-      <code>mov %esi,%esi</code> to it.<p>
-  </li>
-
-  <li>Inlined functions can cause strange results in the function-by-function
-      summary.  If a function <code>inline_me()</code> is defined in
-      <code>foo.h</code> and inlined in the functions <code>f1()</code>,
-      <code>f2()</code> and <code>f3()</code> in <code>bar.c</code>, there will
-      not be a <code>foo.h:inline_me()</code> function entry.  Instead, there
-      will be separate function entries for each inlining site, ie.
-      <code>foo.h:f1()</code>, <code>foo.h:f2()</code> and
-      <code>foo.h:f3()</code>.  To find the total counts for
-      <code>foo.h:inline_me()</code>, add up the counts from each entry.<p>
-
-      The reason for this is that although the debug info output by gcc
-      indicates the switch from <code>bar.c</code> to <code>foo.h</code>, it
-      doesn't indicate the name of the function in <code>foo.h</code>, so
-      Valgrind keeps using the old one.<p>
-
-  <li>Sometimes, the same filename might be represented with a relative name
-      and with an absolute name in different parts of the debug info, eg:
-      <code>/home/user/proj/proj.h</code> and <code>../proj.h</code>.  In this
-      case, if you use auto-annotation, the file will be annotated twice with
-      the counts split between the two.<p>
-  </li>
-
-  <li>Files with more than 65,535 lines cause difficulties for the stabs debug
-      info reader.  This is because the line number in the <code>struct
-      nlist</code> defined in <code>a.out.h</code> under Linux is only a 16-bit
-      value.  Valgrind can handle some files with more than 65,535 lines
-      correctly by making some guesses to identify line number overflows.  But
-      some cases are beyond it, in which case you'll get a warning message
-      explaining that annotations for the file might be incorrect.<p>
-  </li>
-
-  <li>If you compile some files with <code>-g</code> and some without, some
-      events that take place in a file without debug info could be attributed
-      to the last line of a file with debug info (whichever one gets placed
-      before the non-debug-info file in the executable).<p>
-  </li>
-</ul>
-
-This list looks long, but these cases should be fairly rare.<p>
-
-Note: stabs is not an easy format to read.  If you come across bizarre
-annotations that look like they might be caused by a bug in the stabs reader,
-please let us know.<p>
-
-
-<h3>7.11&nbsp; Accuracy</h3>
-Valgrind's cache profiling has a number of shortcomings:
-
-<ul>
-  <li>It doesn't account for kernel activity -- the effect of system calls on
-      the cache contents is ignored.</li><p>
-
-  <li>It doesn't account for other process activity (although this is probably
-      desirable when considering a single program).</li><p>
-
-  <li>It doesn't account for virtual-to-physical address mappings;  hence the
-      entire simulation is not a true representation of what's happening in the
-      cache.</li><p>
-
-  <li>It doesn't account for cache misses not visible at the instruction level,
-      eg. those arising from TLB misses, or speculative execution.</li><p>
-
-  <li>Valgrind's custom <code>malloc()</code> will allocate memory in different
-      ways to the standard <code>malloc()</code>, which could warp the results.
-      </li><p>
-
-  <li>Valgrind's custom threads implementation will schedule threads
-      differently to the standard one.  This too could warp the results for
-      threaded programs.
-      </li><p>
-
-  <li>The instructions <code>bts</code>, <code>btr</code> and <code>btc</code>
-      will incorrectly be counted as doing a data read if both the arguments
-      are registers, eg:
-
-      <blockquote><code>btsl %eax, %edx</code></blockquote>
-
-      This should only happen rarely.
-</ul>
-
-Another thing worth noting is that results are very sensitive.  Changing the
-size of the <code>valgrind.so</code> file, the size of the program being
-profiled, or even the length of its name can perturb the results.  Variations
-will be small, but don't expect perfectly repeatable results if your program
-changes at all.<p>
-
-While these factors mean you shouldn't trust the results to be super-accurate,
-hopefully they should be close enough to be useful.<p>
-
-
-<h3>7.12&nbsp; Todo</h3>
-<ul>
-  <li>Program start-up/shut-down calls a lot of functions that aren't
-      interesting and just complicate the output.  Would be nice to exclude
-      these somehow.</li>
-  <p>
-</ul> 
-<hr width="100%">
-</body>
-</html>
-
diff --git a/memcheck/docs/nav.html b/memcheck/docs/nav.html
deleted file mode 100644
index ad920ad443..0000000000
--- a/memcheck/docs/nav.html
+++ /dev/null
@@ -1,72 +0,0 @@
-<html>
-  <head>
-    <title>Valgrind</title>
-    <base target="main">
-    <style type="text/css">
-    <style type="text/css">
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-  </head>
-
-  <body>
-    <br>
-    <a href="manual.html#contents"><b>Contents of this manual</b></a><br>
-    <a href="manual.html#intro">1 Introduction</a><br>
-    <a href="manual.html#whatfor">1.1 What Valgrind is for</a><br>
-    <a href="manual.html#whatdoes">1.2 What it does with
-       your program</a>
-    <p>
-    <a href="manual.html#howtouse">2 <b>How to use it, and how to
-       make sense of the results</b></a><br>
-    <a href="manual.html#starta">2.1 Getting started</a><br>
-    <a href="manual.html#comment">2.2 The commentary</a><br>
-    <a href="manual.html#report">2.3 Reporting of errors</a><br>
-    <a href="manual.html#suppress">2.4 Suppressing errors</a><br>
-    <a href="manual.html#flags">2.5 Command-line flags</a><br>
-    <a href="manual.html#errormsgs">2.6 Explanation of error messages</a><br>
-    <a href="manual.html#suppfiles">2.7 Writing suppressions files</a><br>
-    <a href="manual.html#clientreq">2.8 The Client Request mechanism</a><br>
-    <a href="manual.html#pthreads">2.9 Support for POSIX pthreads</a><br>
-    <a href="manual.html#install">2.10 Building and installing</a><br>
-    <a href="manual.html#problems">2.11 If you have problems</a>
-    <p>
-    <a href="manual.html#machine">3 <b>Details of the checking machinery</b></a><br>
-    <a href="manual.html#vvalue">3.1 Valid-value (V) bits</a><br>
-    <a href="manual.html#vaddress">3.2 Valid-address (A) bits</a><br>
-    <a href="manual.html#together">3.3 Putting it all together</a><br>
-    <a href="manual.html#signals">3.4 Signals</a><br>
-    <a href="manual.html#leaks">3.5 Memory leak detection</a>
-    <p>
-    <a href="manual.html#limits">4 <b>Limitations</b></a><br>
-    <p>
-    <a href="manual.html#howitworks">5 <b>How it works -- a rough overview</b></a><br>
-    <a href="manual.html#startb">5.1 Getting started</a><br>
-    <a href="manual.html#engine">5.2 The translation/instrumentation engine</a><br>
-    <a href="manual.html#track">5.3 Tracking the status of memory</a><br>
-    <a href="manual.html#sys_calls">5.4 System calls</a><br>
-    <a href="manual.html#sys_signals">5.5 Signals</a>
-    <p>
-    <a href="manual.html#example">6 <b>An example</b></a><br>
-    <p>
-    <a href="manual.html#cache">7 <b>Cache profiling</b></a><br>
-    <p>
-    <a href="techdocs.html">8 <b>The design and implementation of Valgrind</b></a><br>
-
-</body>
-</html>
diff --git a/memcheck/docs/techdocs.html b/memcheck/docs/techdocs.html
deleted file mode 100644
index 2e1cc8b7e9..0000000000
--- a/memcheck/docs/techdocs.html
+++ /dev/null
@@ -1,2524 +0,0 @@
-<html>
-  <head>
-    <style type="text/css">
-      body      { background-color: #ffffff;
-                  color:            #000000;
-                  font-family:      Times, Helvetica, Arial;
-                  font-size:        14pt}
-      h4        { margin-bottom:    0.3em}
-      code      { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      pre       { color:            #000000;
-                  font-family:      Courier; 
-                  font-size:        13pt }
-      a:link    { color:            #0000C0;
-                  text-decoration:  none; }
-      a:visited { color:            #0000C0; 
-                  text-decoration:  none; }
-      a:active  { color:            #0000C0;
-                  text-decoration:  none; }
-    </style>
-    <title>The design and implementation of Valgrind</title>
-  </head>
-
-<body bgcolor="#ffffff">
-
-<a name="title">&nbsp;</a>
-<h1 align=center>The design and implementation of Valgrind</h1>
-
-<center>
-Detailed technical notes for hackers, maintainers and the
-overly-curious<br>
-These notes pertain to snapshot 20020306<br>
-<p>
-<a href="mailto:jseward@acm.org">jseward@acm.org</a><br>
-<a href="http://developer.kde.org/~sewardj">http://developer.kde.org/~sewardj</a><br>
-Copyright &copy; 2000-2002 Julian Seward
-<p>
-Valgrind is licensed under the GNU General Public License, 
-version 2<br>
-An open-source tool for finding memory-management problems in
-x86 GNU/Linux executables.
-</center>
-
-<p>
-
-
-
-
-<hr width="100%">
-
-<h2>Introduction</h2>
-
-This document contains a detailed, highly-technical description of the
-internals of Valgrind.  This is not the user manual; if you are an
-end-user of Valgrind, you do not want to read this.  Conversely, if
-you really are a hacker-type and want to know how it works, I assume
-that you have read the user manual thoroughly.
-<p>
-You may need to read this document several times, and carefully.  Some
-important things, I only say once.
-
-
-<h3>History</h3>
-
-Valgrind came into public view in late Feb 2002.  However, it has been
-under contemplation for a very long time, perhaps seriously for about
-five years.  Somewhat over two years ago, I started working on the x86
-code generator for the Glasgow Haskell Compiler
-(http://www.haskell.org/ghc), gaining familiarity with x86 internals
-on the way.  I then did Cacheprof (http://www.cacheprof.org), gaining
-further x86 experience.  Some time around Feb 2000 I started
-experimenting with a user-space x86 interpreter for x86-Linux.  This
-worked, but it was clear that a JIT-based scheme would be necessary to
-give reasonable performance for Valgrind.  Design work for the JITter
-started in earnest in Oct 2000, and by early 2001 I had an x86-to-x86
-dynamic translator which could run quite large programs.  This
-translator was in a sense pointless, since it did not do any
-instrumentation or checking.
-
-<p>
-Most of the rest of 2001 was taken up designing and implementing the
-instrumentation scheme.  The main difficulty, which consumed a lot
-of effort, was to design a scheme which did not generate large numbers
-of false uninitialised-value warnings.  By late 2001 a satisfactory
-scheme had been arrived at, and I started to test it on ever-larger
-programs, with an eventual eye to making it work well enough so that
-it was helpful to folks debugging the upcoming version 3 of KDE.  I've
-used KDE since before version 1.0, and wanted Valgrind to be an
-indirect contribution to the KDE 3 development effort.  At the start of
-Feb 02 the kde-core-devel crew started using it, and gave a huge
-amount of helpful feedback and patches in the space of three weeks.
-Snapshot 20020306 is the result.
-
-<p>
-In the best Unix tradition, or perhaps in the spirit of Fred Brooks'
-depressing-but-completely-accurate epitaph "build one to throw away;
-you will anyway", much of Valgrind is a second or third rendition of
-the initial idea.  The instrumentation machinery
-(<code>vg_translate.c</code>, <code>vg_memory.c</code>) and core CPU
-simulation (<code>vg_to_ucode.c</code>, <code>vg_from_ucode.c</code>)
-have had three redesigns and rewrites; the register allocator,
-low-level memory manager (<code>vg_malloc2.c</code>) and symbol table
-reader (<code>vg_symtab2.c</code>) are on the second rewrite.  In a
-sense, this document serves to record some of the knowledge gained as
-a result.
-
-
-<h3>Design overview</h3>
-
-Valgrind is compiled into a Linux shared object,
-<code>valgrind.so</code>, and also a dummy one,
-<code>valgrinq.so</code>, of which more later.  The
-<code>valgrind</code> shell script adds <code>valgrind.so</code> to
-the <code>LD_PRELOAD</code> list of extra libraries to be
-loaded with any dynamically linked library.  This is a standard trick,
-one which I assume the <code>LD_PRELOAD</code> mechanism was developed
-to support.
-
-<p>
-<code>valgrind.so</code>
-is linked with the <code>-z initfirst</code> flag, which requests that
-its initialisation code is run before that of any other object in the
-executable image.  When this happens, valgrind gains control.  The
-real CPU becomes "trapped" in <code>valgrind.so</code> and the 
-translations it generates.  The synthetic CPU provided by Valgrind
-does, however, return from this initialisation function.  So the 
-normal startup actions, orchestrated by the dynamic linker
-<code>ld.so</code>, continue as usual, except on the synthetic CPU,
-not the real one.  Eventually <code>main</code> is run and returns,
-and then the finalisation code of the shared objects is run,
-presumably in inverse order to which they were initialised.  Remember,
-this is still all happening on the simulated CPU.  Eventually
-<code>valgrind.so</code>'s own finalisation code is called.  It spots
-this event, shuts down the simulated CPU, prints any error summaries
-and/or does leak detection, and returns from the initialisation code
-on the real CPU.  At this point, in effect the real and synthetic CPUs
-have merged back into one, Valgrind has lost control of the program,
-and the program finally <code>exit()s</code> back to the kernel in the
-usual way.
-
-<p>
-The normal course of activity, once Valgrind has started up, is as
-follows.  Valgrind never runs any part of your program (usually
-referred to as the "client"), not a single byte of it, directly.
-Instead it uses function <code>VG_(translate)</code> to translate
-basic blocks (BBs, straight-line sequences of code) into instrumented
-translations, and those are run instead.  The translations are stored
-in the translation cache (TC), <code>vg_tc</code>, with the
-translation table (TT), <code>vg_tt</code> supplying the
-original-to-translation code address mapping.  Auxiliary array
-<code>VG_(tt_fast)</code> is used as a direct-map cache for fast
-lookups in TT; it usually achieves a hit rate of around 98% and
-facilitates an orig-to-trans lookup in 4 x86 insns, which is not bad.
-
-<p>
-Function <code>VG_(dispatch)</code> in <code>vg_dispatch.S</code> is
-the heart of the JIT dispatcher.  Once a translated code address has
-been found, it is executed simply by an x86 <code>call</code>
-to the translation.  At the end of the translation, the next 
-original code addr is loaded into <code>%eax</code>, and the 
-translation then does a <code>ret</code>, taking it back to the
-dispatch loop, with, interestingly, zero branch mispredictions.  
-The address requested in <code>%eax</code> is looked up first in
-<code>VG_(tt_fast)</code>, and, if not found, by calling C helper
-<code>VG_(search_transtab)</code>.  If there is still no translation 
-available, <code>VG_(dispatch)</code> exits back to the top-level
-C dispatcher <code>VG_(toploop)</code>, which arranges for 
-<code>VG_(translate)</code> to make a new translation.  All fairly
-unsurprising, really.  There are various complexities described below.
-
-<p>
-The translator, orchestrated by <code>VG_(translate)</code>, is
-complicated but entirely self-contained.  It is described in great
-detail in subsequent sections.  Translations are stored in TC, with TT
-tracking administrative information.  The translations are subject to
-an approximate LRU-based management scheme.  With the current
-settings, the TC can hold at most about 15MB of translations, and LRU
-passes prune it to about 13.5MB.  Given that the
-orig-to-translation expansion ratio is about 13:1 to 14:1, this means
-TC holds translations for more or less a megabyte of original code,
-which generally comes to about 70000 basic blocks for C++ compiled
-with optimisation on.  Generating new translations is expensive, so it
-is worth having a large TC to minimise the (capacity) miss rate.
-
-<p>
-The dispatcher, <code>VG_(dispatch)</code>, receives hints from
-the translations which allow it to cheaply spot all control 
-transfers corresponding to x86 <code>call</code> and <code>ret</code>
-instructions.  It has to do this in order to spot some special events:
-<ul>
-<li>Calls to <code>VG_(shutdown)</code>.  This is Valgrind's cue to
-    exit.  NOTE: actually this is done a different way; it should be
-    cleaned up.
-<p>
-<li>Returns of signal handlers, to the return address 
-    <code>VG_(signalreturn_bogusRA)</code>.  The signal simulator
-    needs to know when a signal handler is returning, so we spot
-    jumps (returns) to this address.
-<p>
-<li>Calls to <code>vg_trap_here</code>.  All <code>malloc</code>,
-    <code>free</code>, etc calls that the client program makes are
-    eventually routed to a call to <code>vg_trap_here</code>,
-    and Valgrind does its own special thing with these calls.
-    In effect this provides a trapdoor, by which Valgrind can
-    intercept certain calls on the simulated CPU, run the call as it
-    sees fit itself (on the real CPU), and return the result to
-    the simulated CPU, quite transparently to the client program.
-</ul>
-Valgrind intercepts the client's <code>malloc</code>,
-<code>free</code>, etc,
-calls, so that it can store additional information.  Each block 
-<code>malloc</code>'d by the client gives rise to a shadow block
-in which Valgrind stores the call stack at the time of the
-<code>malloc</code>
-call.  When the client calls <code>free</code>, Valgrind tries to
-find the shadow block corresponding to the address passed to
-<code>free</code>, and emits an error message if none can be found.
-If it is found, the block is placed on the freed blocks queue 
-<code>vg_freed_list</code>, it is marked as inaccessible, and
-its shadow block now records the call stack at the time of the
-<code>free</code> call.  Keeping <code>free</code>'d blocks in
-this queue allows Valgrind to spot all (presumably invalid) accesses
-to them.  However, once the volume of blocks in the free queue 
-exceeds <code>VG_(clo_freelist_vol)</code>, blocks are finally
-removed from the queue.
-
-<p>
-Keeping track of A and V bits (note: if you don't know what these are,
-you haven't read the user guide carefully enough) for memory is done
-in <code>vg_memory.c</code>.  This implements a sparse array structure
-which covers the entire 4G address space in a way which is reasonably
-fast and reasonably space efficient.  The 4G address space is divided
-up into 64K sections, each covering 64Kb of address space.  Given a
-32-bit address, the top 16 bits are used to select one of the 65536
-entries in <code>VG_(primary_map)</code>.  The resulting "secondary"
-(<code>SecMap</code>) holds A and V bits for the 64k of address space
-chunk corresponding to the lower 16 bits of the address.
-
-
-<h3>Design decisions</h3>
-
-Some design decisions were motivated by the need to make Valgrind
-debuggable.  Imagine you are writing a CPU simulator.  It works fairly
-well.  However, you run some large program, like Netscape, and after
-tens of millions of instructions, it crashes.  How can you figure out
-where in your simulator the bug is?
-
-<p>
-Valgrind's answer is: cheat.  Valgrind is designed so that it is
-possible to switch back to running the client program on the real
-CPU at any point.  Using the <code>--stop-after= </code> flag, you can 
-ask Valgrind to run just some number of basic blocks, and then 
-run the rest of the way on the real CPU.  If you are searching for
-a bug in the simulated CPU, you can use this to do a binary search,
-which quickly leads you to the specific basic block which is
-causing the problem.  
-
-<p>
-This is all very handy.  It does constrain the design in certain
-unimportant ways.  Firstly, the layout of memory, when viewed from the
-client's point of view, must be identical regardless of whether it is
-running on the real or simulated CPU.  This means that Valgrind can't
-do pointer swizzling -- well, no great loss -- and it can't run on 
-the same stack as the client -- again, no great loss.  
-Valgrind operates on its own stack, <code>VG_(stack)</code>, which
-it switches to at startup, temporarily switching back to the client's
-stack when doing system calls for the client.
-
-<p>
-Valgrind also receives signals on its own stack,
-<code>VG_(sigstack)</code>, but for different gruesome reasons
-discussed below.
-
-<p>
-This nice clean switch-back-to-the-real-CPU-whenever-you-like story
-is muddied by signals.  Problem is that signals arrive at arbitrary
-times and tend to slightly perturb the basic block count, with the
-result that you can get close to the basic block causing a problem but
-can't home in on it exactly.  My kludgey hack is to define
-<code>SIGNAL_SIMULATION</code> to 1 towards the bottom of 
-<code>vg_syscall_mem.c</code>, so that signal handlers are run on the
-real CPU and don't change the BB counts.
-
-<p>
-A second hole in the switch-back-to-real-CPU story is that Valgrind's
-way of delivering signals to the client is different from that of the
-kernel.  Specifically, the layout of the signal delivery frame, and
-the mechanism used to detect a sighandler returning, are different.
-So you can't expect to make the transition inside a sighandler and
-still have things working, but in practice that's not much of a
-restriction.
-
-<p>
-Valgrind's implementation of <code>malloc</code>, <code>free</code>,
-etc, (in <code>vg_clientmalloc.c</code>, not the low-level stuff in
-<code>vg_malloc2.c</code>) is somewhat complicated by the need to 
-handle switching back at arbitrary points.  It does work tho.
-
-
-
-<h3>Correctness</h3>
-
-There's only one of me, and I have a Real Life (tm) as well as hacking
-Valgrind [allegedly :-].  That means I don't have time to waste
-chasing endless bugs in Valgrind.  My emphasis is therefore on doing
-everything as simply as possible, with correctness, stability and
-robustness being the number one priority, more important than
-performance or functionality.  As a result:
-<ul>
-<li>The code is absolutely loaded with assertions, and these are
-    <b>permanently enabled.</b>  I have no plan to remove or disable
-    them later.  Over the past couple of months, as valgrind has
-    become more widely used, they have shown their worth, pulling
-    up various bugs which would otherwise have appeared as
-    hard-to-find segmentation faults.
-    <p>
-    I am of the view that it's acceptable to spend 5% of the total
-    running time of your valgrindified program doing assertion checks
-    and other internal sanity checks.
-<p>
-<li>Aside from the assertions, valgrind contains various sets of
-    internal sanity checks, which get run at varying frequencies
-    during normal operation.  <code>VG_(do_sanity_checks)</code>
-    runs every 1000 basic blocks, which means 500 to 2000 times/second 
-    for typical machines at present.  It checks that Valgrind hasn't
-    overrun its private stack, and does some simple checks on the
-    memory permissions maps.  Once every 25 calls it does some more
-    extensive checks on those maps.  Etc, etc.
-    <p>
-    The following components also have sanity check code, which can
-    be enabled to aid debugging:
-    <ul>
-    <li>The low-level memory-manager
-        (<code>VG_(mallocSanityCheckArena)</code>).  This does a 
-        complete check of all blocks and chains in an arena, which
-        is very slow.  Is not engaged by default.
-    <p>
-    <li>The symbol table reader(s): various checks to ensure
-        uniqueness of mappings; see <code>VG_(read_symbols)</code>
-        for a start.  Is permanently engaged.
-    <p>
-    <li>The A and V bit tracking stuff in <code>vg_memory.c</code>.
-        This can be compiled with cpp symbol
-        <code>VG_DEBUG_MEMORY</code> defined, which removes all the
-        fast, optimised cases, and uses simple-but-slow fallbacks
-        instead.  Not engaged by default.
-    <p>
-    <li>Ditto <code>VG_DEBUG_LEAKCHECK</code>.
-    <p>
-    <li>The JITter parses x86 basic blocks into sequences of 
-        UCode instructions.  It then sanity checks each one with
-        <code>VG_(saneUInstr)</code> and sanity checks the sequence
-        as a whole with <code>VG_(saneUCodeBlock)</code>.  This stuff
-        is engaged by default, and has caught some way-obscure bugs
-        in the simulated CPU machinery in its time.
-    <p>
-    <li>The system call wrapper does
-        <code>VG_(first_and_last_secondaries_look_plausible)</code> after
-        every syscall; this is known to pick up bugs in the syscall
-        wrappers.  Engaged by default.
-    <p>
-    <li>The main dispatch loop, in <code>VG_(dispatch)</code>, checks
-        that translations do not set <code>%ebp</code> to any value
-        different from <code>VG_EBP_DISPATCH_CHECKED</code> or
-        <code>&amp; VG_(baseBlock)</code>.  In effect this test is free,
-        and is permanently engaged.
-    <p>
-    <li>There are a couple of ifdefed-out consistency checks I
-        inserted whilst debugging the new register allocator, 
-        <code>vg_do_register_allocation</code>.
-    </ul>
-<p>
-<li>I try to avoid techniques, algorithms, mechanisms, etc, for which
-    I can supply neither a convincing argument that they are correct,
-    nor sanity-check code which might pick up bugs in my
-    implementation.  I don't always succeed in this, but I try.
-    Basically the idea is: avoid techniques which are, in practice,
-    unverifiable, in some sense.   When doing anything, always have in
-    mind: "how can I verify that this is correct?"
-</ul>
-
-<p>
-Some more specific things are:
-
-<ul>
-<li>Valgrind runs in the same namespace as the client, at least from
-    <code>ld.so</code>'s point of view, and it therefore absolutely
-    had better not export any symbol with a name which could clash
-    with that of the client or any of its libraries.  Therefore, all
-    globally visible symbols exported from <code>valgrind.so</code>
-    are defined using the <code>VG_</code> CPP macro.  As you'll see
-    from <code>vg_constants.h</code>, this appends some arbitrary
-    prefix to the symbol, in order that it be, we hope, globally
-    unique.  Currently the prefix is <code>vgPlain_</code>.  For
-    convenience there are also <code>VGM_</code>, <code>VGP_</code>
-    and <code>VGOFF_</code>.  All locally defined symbols are declared
-    <code>static</code> and do not appear in the final shared object.
-    <p>
-    To check this, I periodically do 
-    <code>nm valgrind.so | grep " T "</code>, 
-    which shows you all the globally exported text symbols.
-    They should all have an approved prefix, except for those like
-    <code>malloc</code>, <code>free</code>, etc, which we deliberately
-    want to shadow and take precedence over the same names exported
-    from <code>glibc.so</code>, so that valgrind can intercept those
-    calls easily.  Similarly, <code>nm valgrind.so | grep " D "</code>
-    allows you to find any rogue data-segment symbol names.
-<p>
-<li>Valgrind tries, and almost succeeds, in being completely
-    independent of all other shared objects, in particular of
-    <code>glibc.so</code>.  For example, we have our own low-level
-    memory manager in <code>vg_malloc2.c</code>, which is a fairly
-    standard malloc/free scheme augmented with arenas, and
-    <code>vg_mylibc.c</code> exports reimplementations of various bits
-    and pieces you'd normally get from the C library.
-    <p>
-    Why all the hassle?  Because imagine the potential chaos of both
-    the simulated and real CPUs executing in <code>glibc.so</code>.
-    It just seems simpler and cleaner to be completely self-contained,
-    so that only the simulated CPU visits <code>glibc.so</code>.  In
-    practice it's not much hassle anyway.  Also, valgrind starts up
-    before glibc has a chance to initialise itself, and who knows what
-    difficulties that could lead to.  Finally, glibc has definitions
-    for some types, specifically <code>sigset_t</code>, which conflict
-    with (are different from) the Linux kernel's idea of same.  When 
-    Valgrind wants to fiddle around with signal stuff, it wants to
-    use the kernel's definitions, not glibc's definitions.  So it's 
-    simplest just to keep glibc out of the picture entirely.
-    <p>
-    To find out which glibc symbols are used by Valgrind, reinstate
-    the link flags <code>-nostdlib -Wl,-no-undefined</code>.  This
-    causes linking to fail, but will tell you what you depend on.
-    I have mostly, but not entirely, got rid of the glibc
-    dependencies; what remains is, IMO, fairly harmless.  AFAIK the
-    current dependencies are: <code>memset</code>,
-    <code>memcmp</code>, <code>stat</code>, <code>system</code>,
-    <code>sbrk</code>, <code>setjmp</code> and <code>longjmp</code>.
-
-<p>
-<li>Similarly, valgrind should not really import any headers other
-    than the Linux kernel headers, since it knows of no API other than
-    the kernel interface to talk to.  At the moment this is really not
-    in a good state, and <code>vg_syscall_mem</code> imports, via
-    <code>vg_unsafe.h</code>, a significant number of C-library
-    headers so as to know the sizes of various structs passed across
-    the kernel boundary.  This is of course completely bogus, since
-    there is no guarantee that the C library's definitions of these
-    structs match those of the kernel.  I have started to sort this
-    out using <code>vg_kerneliface.h</code>, into which I had intended
-    to copy all kernel definitions which valgrind could need, but this
-    has not gotten very far.  At the moment it mostly contains
-    definitions for <code>sigset_t</code> and <code>struct
-    sigaction</code>, since the kernel's definition for these really
-    does clash with glibc's.  I plan to use a <code>vki_</code> prefix
-    on all these types and constants, to denote the fact that they
-    pertain to <b>V</b>algrind's <b>K</b>ernel <b>I</b>nterface.
-    <p>
-    Another advantage of having a <code>vg_kerneliface.h</code> file
-    is that it makes it simpler to interface to a different kernel.
-    One can, for example, easily imagine writing a new
-    <code>vg_kerneliface.h</code> for FreeBSD, or x86 NetBSD.
-
-</ul>
-
-<h3>Current limitations</h3>
-
-No threads.  I think fixing this is close to a research-grade problem.
-<p>
-No MMX.  Fixing this should be relatively easy, using the same giant
-trick used for x86 FPU instructions.  See below.
-<p>
-Support for weird (non-POSIX) signal stuff is patchy.  Does anybody
-care?
-<p>
-
-
-
-
-<hr width="100%">
-
-<h2>The instrumenting JITter</h2>
-
-This really is the heart of the matter.  We begin with various side
-issues.
-
-<h3>Run-time storage, and the use of host registers</h3>
-
-Valgrind translates client (original) basic blocks into instrumented
-basic blocks, which live in the translation cache TC, until either the
-client finishes or the translations are ejected from TC to make room
-for newer ones.
-<p>
-Since it generates x86 code in memory, Valgrind has complete control
-of the use of registers in the translations.  Now pay attention.  I
-shall say this only once, and it is important you understand this.  In
-what follows I will refer to registers in the host (real) cpu using
-their standard names, <code>%eax</code>, <code>%edi</code>, etc.  I
-refer to registers in the simulated CPU by capitalising them:
-<code>%EAX</code>, <code>%EDI</code>, etc.  These two sets of
-registers usually bear no direct relationship to each other; there is
-no fixed mapping between them.  This naming scheme is used fairly
-consistently in the comments in the sources.
-<p>
-Host registers, once things are up and running, are used as follows:
-<ul>
-<li><code>%esp</code>, the real stack pointer, points
-    somewhere in Valgrind's private stack area,
-    <code>VG_(stack)</code> or, transiently, into its signal delivery
-    stack, <code>VG_(sigstack)</code>.
-<p>
-<li><code>%edi</code> is used as a temporary in code generation; it
-    is almost always dead, except when used for the <code>Left</code>
-    value-tag operations.
-<p>
-<li><code>%eax</code>, <code>%ebx</code>, <code>%ecx</code>,
-    <code>%edx</code> and <code>%esi</code> are available to
-    Valgrind's register allocator.  They are dead (carry unimportant
-    values) in between translations, and are live only in
-    translations.  The one exception to this is <code>%eax</code>,
-    which, as mentioned far above, has a special significance to the
-    dispatch loop <code>VG_(dispatch)</code>: when a translation
-    returns to the dispatch loop, <code>%eax</code> is expected to
-    contain the original-code-address of the next translation to run.
-    The register allocator is so good at minimising spill code that
-    using five regs and not having to save/restore <code>%edi</code>
-    actually gives better code than allocating to <code>%edi</code>
-    as well, but then having to push/pop it around special uses.
-<p>
-<li><code>%ebp</code> points permanently at
-    <code>VG_(baseBlock)</code>.  Valgrind's translations are
-    position-independent, partly because this is convenient, but also
-    because translations get moved around in TC as part of the LRUing
-    activity.  <b>All</b> static entities which need to be referred to
-    from generated code, whether data or helper functions, are stored
-    starting at <code>VG_(baseBlock)</code> and are therefore reached
-    by indexing from <code>%ebp</code>.  There is but one exception, 
-    which is that by placing the value
-    <code>VG_EBP_DISPATCH_CHECKED</code>
-    in <code>%ebp</code> just before a return to the dispatcher, 
-    the dispatcher is informed that the next address to run, 
-    in <code>%eax</code>, requires special treatment.
-<p>
-<li>The real machine's FPU state is pretty much unimportant, for
-    reasons which will become obvious.  Ditto its <code>%eflags</code>
-    register.
-</ul>
-
-<p>
-The state of the simulated CPU is stored in memory, in
-<code>VG_(baseBlock)</code>, which is a block of 200 words IIRC.
-Recall that <code>%ebp</code> points permanently at the start of this
-block.  Function <code>vg_init_baseBlock</code> decides what the
-offsets of various entities in <code>VG_(baseBlock)</code> are to be,
-and allocates word offsets for them.  The code generator then emits
-<code>%ebp</code> relative addresses to get at those things.  The
-sequence in which entities are allocated has been carefully chosen so
-that the 32 most popular entities come first, because this means 8-bit
-offsets can be used in the generated code.
-
-<p>
-If I was clever, I could make <code>%ebp</code> point 32 words along 
-<code>VG_(baseBlock)</code>, so that I'd have another 32 words of
-short-form offsets available, but that's just complicated, and it's
-not important -- the first 32 words take 99% (or whatever) of the
-traffic.
-
-<p>
-Currently, the sequence of stuff in <code>VG_(baseBlock)</code> is as
-follows:
-<ul>
-<li>9 words, holding the simulated integer registers,
-    <code>%EAX</code> .. <code>%EDI</code>, and the simulated flags,
-    <code>%EFLAGS</code>.
-<p>
-<li>Another 9 words, holding the V bit "shadows" for the above 9 regs.
-<p>
-<li>The <b>addresses</b> of various helper routines called from
-    generated code: 
-    <code>VG_(helper_value_check4_fail)</code>,
-    <code>VG_(helper_value_check0_fail)</code>,
-    which register V-check failures,
-    <code>VG_(helperc_STOREV4)</code>,
-    <code>VG_(helperc_STOREV1)</code>,
-    <code>VG_(helperc_LOADV4)</code>,
-    <code>VG_(helperc_LOADV1)</code>,
-    which do stores and loads of V bits to/from the 
-    sparse array which keeps track of V bits in memory,
-    and
-    <code>VGM_(handle_esp_assignment)</code>, which messes with
-    memory addressability resulting from changes in <code>%ESP</code>.
-<p>
-<li>The simulated <code>%EIP</code>.
-<p>
-<li>24 spill words, for when the register allocator can't make it work
-    with 5 measly registers.
-<p>
-<li>Addresses of helpers <code>VG_(helperc_STOREV2)</code>,
-    <code>VG_(helperc_LOADV2)</code>.  These are here because 2-byte
-    loads and stores are relatively rare, so are placed above the
-    magic 32-word offset boundary.
-<p>
-<li>For similar reasons, addresses of helper functions 
-    <code>VGM_(fpu_write_check)</code> and
-    <code>VGM_(fpu_read_check)</code>, which handle the A/V maps
-    testing and changes required by FPU writes/reads.  
-<p>
-<li>Some other boring helper addresses:
-    <code>VG_(helper_value_check2_fail)</code> and
-    <code>VG_(helper_value_check1_fail)</code>.  These are probably
-    never emitted now, and should be removed.
-<p>
-<li>The entire state of the simulated FPU, which I believe to be
-    108 bytes long.
-<p>
-<li>Finally, the addresses of various other helper functions in
-    <code>vg_helpers.S</code>, which deal with rare situations which
-    are tedious or difficult to generate code in-line for.
-</ul>
-
-<p>
-As a general rule, the simulated machine's state lives permanently in
-memory at <code>VG_(baseBlock)</code>.  However, the JITter does some
-optimisations which allow the simulated integer registers to be
-cached in real registers over multiple simulated instructions within
-the same basic block.  These are always flushed back into memory at
-the end of every basic block, so that the in-memory state is
-up-to-date between basic blocks.  (This flushing is implied by the
-statement above that the real machine's allocatable registers are
-dead in between simulated blocks).
-
-
-<h3>Startup, shutdown, and system calls</h3>
-
-Getting into Valgrind (<code>VG_(startup)</code>, called from
-<code>valgrind.so</code>'s initialisation section), really means
-copying the real CPU's state into <code>VG_(baseBlock)</code>, and
-then installing our own stack pointer, etc, into the real CPU, and
-then starting up the JITter.  Exiting valgrind involves copying the
-simulated state back to the real state.
-
-<p>
-Unfortunately, there's a complication at startup time.  Problem is
-that at the point where we need to take a snapshot of the real CPU's
-state, the offsets in <code>VG_(baseBlock)</code> are not set up yet,
-because to do so would involve disrupting the real machine's state
-significantly.  The way round this is to dump the real machine's state
-into a temporary, static block of memory,
-<code>VG_(m_state_static)</code>.  We can then set up the
-<code>VG_(baseBlock)</code> offsets at our leisure, and copy into it
-from <code>VG_(m_state_static)</code> at some convenient later time.
-This copying is done by
-<code>VG_(copy_m_state_static_to_baseBlock)</code>.
-
-<p>
-On exit, the inverse transformation is (rather unnecessarily) used:
-stuff in <code>VG_(baseBlock)</code> is copied to
-<code>VG_(m_state_static)</code>, and the assembly stub then copies
-from <code>VG_(m_state_static)</code> into the real machine registers.
-
-<p>
-Doing system calls on behalf of the client (<code>vg_syscall.S</code>)
-is something of a half-way house.  We have to make the world look
-sufficiently like that which the client would normally have to make
-the syscall actually work properly, but we can't afford to lose
-control.  So the trick is to copy all of the client's state, <b>except
-its program counter</b>, into the real CPU, do the system call, and
-copy the state back out.  Note that the client's state includes its
-stack pointer register, so one effect of this partial restoration is
-to cause the system call to be run on the client's stack, as it should
-be.
-
-<p>
-As ever there are complications.  We have to save some of our own state
-somewhere when restoring the client's state into the CPU, so that we
-can keep going sensibly afterwards.  In fact the only thing which is
-important is our own stack pointer, but for paranoia reasons I save 
-and restore our own FPU state as well, even though that's probably
-pointless.
-
-<p>
-The complication on the above complication is, that for horrible
-reasons to do with signals, we may have to handle a second client
-system call whilst the client is blocked inside some other system 
-call (unbelievable!).  That means there's two sets of places to 
-dump Valgrind's stack pointer and FPU state across the syscall,
-and we decide which to use by consulting
-<code>VG_(syscall_depth)</code>, which is in turn maintained by
-<code>VG_(wrap_syscall)</code>.
-
-
-
-<h3>Introduction to UCode</h3>
-
-UCode lies at the heart of the x86-to-x86 JITter.  The basic premise
-is that dealing with the x86 instruction set head-on is just too darn
-complicated, so we do the traditional compiler-writer's trick and
-translate it into a simpler, easier-to-deal-with form.
-
-<p>
-In normal operation, translation proceeds through six stages,
-coordinated by <code>VG_(translate)</code>:
-<ol>
-<li>Parsing of an x86 basic block into a sequence of UCode
-    instructions (<code>VG_(disBB)</code>).
-<p>
-<li>UCode optimisation (<code>vg_improve</code>), with the aim of
-    caching simulated registers in real registers over multiple
-    simulated instructions, and removing redundant simulated
-    <code>%EFLAGS</code> saving/restoring.
-<p>
-<li>UCode instrumentation (<code>vg_instrument</code>), which adds
-    value and address checking code.
-<p>
-<li>Post-instrumentation cleanup (<code>vg_cleanup</code>), removing
-    redundant value-check computations.
-<p>
-<li>Register allocation (<code>vg_do_register_allocation</code>),
-    which, note, is done on UCode.
-<p>
-<li>Emission of final instrumented x86 code
-    (<code>VG_(emit_code)</code>).
-</ol>
-
-<p>
-Notice how steps 2, 3, 4 and 5 are simple UCode-to-UCode
-transformation passes, all on straight-line blocks of UCode (type
-<code>UCodeBlock</code>).  Steps 2 and 4 are optimisation passes and
-can be disabled for debugging purposes, with
-<code>--optimise=no</code> and <code>--cleanup=no</code> respectively.
-
-<p>
-Valgrind can also run in a no-instrumentation mode, given
-<code>--instrument=no</code>.  This is useful for debugging the JITter
-quickly without having to deal with the complexity of the
-instrumentation mechanism too.  In this mode, steps 3 and 4 are
-omitted.
-
-<p>
-These flags combine, so that <code>--instrument=no</code> together with 
-<code>--optimise=no</code> means only steps 1, 5 and 6 are used.
-<code>--single-step=yes</code> causes each x86 instruction to be
-treated as a single basic block.  The translations are terrible but
-this is sometimes instructive.  
-
-<p>
-The <code>--stop-after=N</code> flag switches back to the real CPU
-after <code>N</code> basic blocks.  It also re-JITs the final basic
-block executed and prints the debugging info resulting, so this
-gives you a way to get a quick snapshot of how a basic block looks as
-it passes through the six stages mentioned above.  If you want to 
-see full information for every block translated (probably not, but
-still ...) find, in <code>VG_(translate)</code>, the lines
-<br><code>   dis = True;</code>
-<br><code>   dis = debugging_translation;</code>
-<br>
-and comment out the second line.  This will spew out debugging
-junk faster than you can possibly imagine.
-
-
-
-<h3>UCode operand tags: type <code>Tag</code></h3>
-
-UCode is, more or less, a simple two-address RISC-like code.  In
-keeping with the x86 AT&amp;T assembly syntax, generally speaking the
-first operand is the source operand, and the second is the destination
-operand, which is modified when the uinstr is notionally executed.
-
-<p>
-UCode instructions have up to three operand fields, each of which has
-a corresponding <code>Tag</code> describing it.  Possible values for
-the tag are:
-
-<ul>
-<li><code>NoValue</code>: indicates that the field is not in use.
-<p>
-<li><code>Lit16</code>: the field contains a 16-bit literal.
-<p>
-<li><code>Literal</code>: the field denotes a 32-bit literal, whose
-    value is stored in the <code>lit32</code> field of the uinstr
-    itself.  Since there is only one <code>lit32</code> for the whole
-    uinstr, only one operand field may contain this tag.
-<p>
-<li><code>SpillNo</code>: the field contains a spill slot number, in
-    the range 0 to 23 inclusive, denoting one of the spill slots
-    contained inside <code>VG_(baseBlock)</code>.  Such tags only
-    exist after register allocation.
-<p>
-<li><code>RealReg</code>: the field contains a number in the range 0
-    to 7 denoting an integer x86 ("real") register on the host.  The
-    number is the Intel encoding for integer registers.  Such tags
-    only exist after register allocation.
-<p>
-<li><code>ArchReg</code>: the field contains a number in the range 0
-    to 7 denoting an integer x86 register on the simulated CPU.  In
-    reality this means a reference to one of the first 8 words of
-    <code>VG_(baseBlock)</code>.  Such tags can exist at any point in
-    the translation process.
-<p>
-<li>Last, but not least, <code>TempReg</code>.  The field contains the
-    number of one of an infinite set of virtual (integer)
-    registers. <code>TempReg</code>s are used everywhere throughout
-    the translation process; you can have as many as you want.  The
-    register allocator maps as many as it can into
-    <code>RealReg</code>s and turns the rest into
-    <code>SpillNo</code>s, so <code>TempReg</code>s should not exist
-    after the register allocation phase.
-    <p>
-    <code>TempReg</code>s are always 32 bits long, even if the data
-    they hold is logically shorter.  In that case the upper unused
-    bits are required, and, I think, generally assumed, to be zero.  
-    <code>TempReg</code>s holding V bits for quantities shorter than 
-    32 bits are expected to have ones in the unused places, since a
-    one denotes "undefined".
-</ul>
-
-
-<h3>UCode instructions: type <code>UInstr</code></h3>
-
-<p>
-UCode was carefully designed to make it possible to do register
-allocation on UCode and then translate the result into x86 code
-without needing any extra registers ... well, that was the original
-plan, anyway.  Things have gotten a little more complicated since
-then.  In what follows, UCode instructions are referred to as uinstrs,
-to distinguish them from x86 instructions.  Uinstrs of course have
-uopcodes which are (naturally) different from x86 opcodes.
-
-<p>
-A uinstr (type <code>UInstr</code>) contains
-various fields, not all of which are used by any one uopcode:
-<ul>
-<li>Three 16-bit operand fields, <code>val1</code>, <code>val2</code>
-    and <code>val3</code>.
-<p>
-<li>Three tag fields, <code>tag1</code>, <code>tag2</code>
-    and <code>tag3</code>.  Each of these has a value of type
-    <code>Tag</code>,
-    and they describe what the <code>val1</code>, <code>val2</code>
-    and <code>val3</code> fields contain.
-<p>
-<li>A 32-bit literal field.
-<p>
-<li>Two <code>FlagSet</code>s, specifying which x86 condition codes are
-    read and written by the uinstr.
-<p>
-<li>An opcode byte, containing a value of type <code>Opcode</code>.
-<p>
-<li>A size field, indicating the data transfer size (1/2/4/8/10) in
-    cases where this makes sense, or zero otherwise.
-<p>
-<li>A condition-code field, which, for jumps, holds a
-    value of type <code>Condcode</code>, indicating the condition
-    which applies.  The encoding is as it is in the x86 insn stream,
-    except we add a 17th value <code>CondAlways</code> to indicate
-    an unconditional transfer.
-<p>
-<li>Various 1-bit flags, indicating whether this insn pertains to an
-    x86 CALL or RET instruction, whether a widening is signed or not,
-    etc.
-</ul>
-
-<p>
-UOpcodes (type <code>Opcode</code>) are divided into two groups: those
-necessary merely to express the functionality of the x86 code, and
-extra uopcodes needed to express the instrumentation.  The former
-group contains:
-<ul>
-<li><code>GET</code> and <code>PUT</code>, which move values from the
-    simulated CPU's integer registers (<code>ArchReg</code>s) into
-    <code>TempReg</code>s, and back.  <code>GETF</code> and
-    <code>PUTF</code> do the corresponding thing for the simulated
-    <code>%EFLAGS</code>.  There are no corresponding insns for the
-    FPU register stack, since we don't explicitly simulate its
-    registers.
-<p>
-<li><code>LOAD</code> and <code>STORE</code>, which, in RISC-like
-    fashion, are the only uinstrs able to interact with memory.
-<p>
-<li><code>MOV</code> and <code>CMOV</code> allow unconditional and
-    conditional moves of values between <code>TempReg</code>s.
-<p>
-<li>ALU operations.  Again in RISC-like fashion, these only operate on
-    <code>TempReg</code>s (before reg-alloc) or <code>RealReg</code>s
-    (after reg-alloc).  These are: <code>ADD</code>, <code>ADC</code>,
-    <code>AND</code>, <code>OR</code>, <code>XOR</code>,
-    <code>SUB</code>, <code>SBB</code>, <code>SHL</code>,
-    <code>SHR</code>, <code>SAR</code>, <code>ROL</code>,
-    <code>ROR</code>, <code>RCL</code>, <code>RCR</code>,
-    <code>NOT</code>, <code>NEG</code>, <code>INC</code>,
-    <code>DEC</code>, <code>BSWAP</code>, <code>CC2VAL</code> and
-    <code>WIDEN</code>.  <code>WIDEN</code> does signed or unsigned
-    value widening.  <code>CC2VAL</code> is used to convert condition
-    codes into a value, zero or one.  The rest are obvious.
-    <p>
-    To allow for more efficient code generation, we bend slightly the
-    restriction at the start of the previous para: for
-    <code>ADD</code>, <code>ADC</code>, <code>XOR</code>,
-    <code>SUB</code> and <code>SBB</code>, we allow the first (source)
-    operand to also be an <code>ArchReg</code>, that is, one of the
-    simulated machine's registers.  Also, many of these ALU ops allow
-    the source operand to be a literal.  See
-    <code>VG_(saneUInstr)</code> for the final word on the allowable
-    forms of uinstrs.
-<p>
-<li><code>LEA1</code> and <code>LEA2</code> are not strictly
-    necessary, but facilitate better translations.  They
-    record the fancy x86 addressing modes in a direct way, which
-    allows those amodes to be emitted back into the final
-    instruction stream more or less verbatim.
-<p>
-<li><code>CALLM</code> calls a machine-code helper, one of the methods
-    whose address is stored at some <code>VG_(baseBlock)</code>
-    offset.  <code>PUSH</code> and <code>POP</code> move values
-    to/from <code>TempReg</code> to the real (Valgrind's) stack, and
-    <code>CLEAR</code> removes values from the stack.
-    <code>CALLM_S</code> and <code>CALLM_E</code> delimit the
-    boundaries of call setups and clearings, for the benefit of the
-    instrumentation passes.  Getting this right is critical, and so
-    <code>VG_(saneUCodeBlock)</code> makes various checks on the use
-    of these uopcodes.
-    <p>
-    It is important to understand that these uopcodes have nothing to
-    do with the x86 <code>call</code>, <code>return</code>,
-    <code>push</code> or <code>pop</code> instructions, and are not
-    used to implement them.  Those guys turn into combinations of
-    <code>GET</code>, <code>PUT</code>, <code>LOAD</code>,
-    <code>STORE</code>, <code>ADD</code>, <code>SUB</code>, and
-    <code>JMP</code>.  What these uopcodes support is calling of
-    helper functions such as <code>VG_(helper_imul_32_64)</code>,
-    which do stuff which is too difficult or tedious to emit inline.
-<p>
-<li><code>FPU</code>, <code>FPU_R</code> and <code>FPU_W</code>.
-    Valgrind doesn't attempt to simulate the internal state of the
-    FPU at all.  Consequently it only needs to be able to distinguish
-    FPU ops which read and write memory from those that don't, and
-    for those which do, it needs to know the effective address and
-    data transfer size.  This is made easier because the x86 FP
-    instruction encoding is very regular, basically consisting of
-    16 bits for a non-memory FPU insn and 11 (IIRC) bits + an address mode
-    for a memory FPU insn.  So our <code>FPU</code> uinstr carries
-    the 16 bits in its <code>val1</code> field.  And
-    <code>FPU_R</code> and <code>FPU_W</code> carry 11 bits in that
-    field, together with the identity of a <code>TempReg</code> or
-    (later) <code>RealReg</code> which contains the address.
-<p>
-<li><code>JIFZ</code> is unique, in that it allows a control-flow
-    transfer which is not deemed to end a basic block.  It causes a
-    jump to a literal (original) address if the specified argument
-    is zero.
-<p>
-<li>Finally, <code>INCEIP</code> advances the simulated
-    <code>%EIP</code> by the specified literal amount.  This supports
-    lazy <code>%EIP</code> updating, as described below.
-</ul>
-
-<p>
-Stages 1 and 2 of the 6-stage translation process mentioned above
-deal purely with these uopcodes, and no others.  They are
-sufficient to express pretty much all the x86 32-bit protected-mode 
-instruction set, at
-least everything understood by a pre-MMX original Pentium (P54C). 
-
-<p>
-Stages 3, 4, 5 and 6 also deal with the following extra
-"instrumentation" uopcodes.  They are used to express all the
-definedness-tracking and -checking machinery which valgrind does.  In
-later sections we show how to create checking code for each of the
-uopcodes above.  Note that these instrumentation uopcodes, although
-some appearing complicated, have been carefully chosen so that
-efficient x86 code can be generated for them.  GNU superopt v2.5 did a
-great job helping out here.  Anyways, the uopcodes are as follows:
-
-<ul>
-<li><code>GETV</code> and <code>PUTV</code> are analogues to
-    <code>GET</code> and <code>PUT</code> above.  They are identical
-    except that they move the V bits for the specified values back and
-    forth to <code>TempRegs</code>, rather than moving the values
-    themselves.
-<p>
-<li>Similarly, <code>LOADV</code> and <code>STOREV</code> read and
-    write V bits from the synthesised shadow memory that Valgrind
-    maintains.  In fact they do more than that, since they also do
-    address-validity checks, and emit complaints if the read/written
-    addresses are unaddressable.
-<p>
-<li><code>TESTV</code>, whose parameters are a <code>TempReg</code>
-    and a size, tests the V bits in the <code>TempReg</code>, at the
-    specified operation size (0/1/2/4 byte) and emits an error if any
-    of them indicate undefinedness.  This is the only uopcode capable
-    of doing such tests.
-<p>
-<li><code>SETV</code>, whose parameters are also <code>TempReg</code>
-    and a size, makes the V bits in the <code>TempReg</code> indicate
-    definedness, at the specified operation size.  This is usually
-    used to generate the correct V bits for a literal value, which is
-    of course fully defined.
-<p>
-<li><code>GETVF</code> and <code>PUTVF</code> are analogues to
-    <code>GETF</code> and <code>PUTF</code>.  They move the single V
-    bit used to model definedness of <code>%EFLAGS</code> between its
-    home in <code>VG_(baseBlock)</code> and the specified
-    <code>TempReg</code>.
-<p>
-<li><code>TAG1</code> denotes one of a family of unary operations on
-    <code>TempReg</code>s containing V bits.  Similarly,
-    <code>TAG2</code> denotes one in a family of binary operations on
-    V bits.
-</ul>
-
-<p>
-These 10 uopcodes are sufficient to express Valgrind's entire
-definedness-checking semantics.  In fact most of the interesting magic
-is done by the <code>TAG1</code> and <code>TAG2</code>
-suboperations.
-
-<p>
-First, however, I need to explain about V-vector operation sizes.
-There are 4 sizes: 1, 2 and 4, which operate on groups of 8, 16 and 32
-V bits at a time, supporting the usual 1, 2 and 4 byte x86 operations.
-However there is also the mysterious size 0, which really means a
-single V bit.  Single V bits are used in various circumstances; in
-particular, the definedness of <code>%EFLAGS</code> is modelled with a
-single V bit.  Now might be a good time to also point out that for
-V bits, 1 means "undefined" and 0 means "defined".  Similarly, for A
-bits, 1 means "invalid address" and 0 means "valid address".  This
-seems counterintuitive (and so it is), but testing against zero on
-x86s saves instructions compared to testing against all 1s, because
-many ALU operations set the Z flag for free, so to speak.
-
-<p>
-With that in mind, the tag ops are:
-
-<ul>
-<li><b>(UNARY) Pessimising casts</b>: <code>VgT_PCast40</code>,
-    <code>VgT_PCast20</code>, <code>VgT_PCast10</code>,
-    <code>VgT_PCast01</code>, <code>VgT_PCast02</code> and
-    <code>VgT_PCast04</code>.  A "pessimising cast" takes a V-bit
-    vector at one size, and creates a new one at another size,
-    pessimised in the sense that if any of the bits in the source
-    vector indicate undefinedness, then all the bits in the result
-    indicate undefinedness.  In this case the casts are all to or from
-    a single V bit, so for example <code>VgT_PCast40</code> is a
-    pessimising cast from 32 bits to 1, whereas
-    <code>VgT_PCast04</code> simply copies the single source V bit
-    into all 32 bit positions in the result.  Surprisingly, these ops
-    can all be implemented very efficiently.
-    <p>
-    There are also the pessimising casts <code>VgT_PCast14</code>,
-    from 8 bits to 32, <code>VgT_PCast12</code>, from 8 bits to 16,
-    and <code>VgT_PCast11</code>, from 8 bits to 8.  This last one
-    seems nonsensical, but in fact it isn't a no-op because, as
-    mentioned above, any undefined (1) bits in the source infect the
-    entire result.
-<p>
-<li><b>(UNARY) Propagating undefinedness upwards in a word</b>:
-    <code>VgT_Left4</code>, <code>VgT_Left2</code> and
-    <code>VgT_Left1</code>.  These are used to simulate the worst-case
-    effects of carry propagation in adds and subtracts.  They return a
-    V vector identical to the original, except that if the original
-    contained any undefined bits, then the lowest such bit and all bits
-    above it are marked as undefined.  Hence the Left bit in the names.
-<p>
-<li><b>(UNARY) Signed and unsigned value widening</b>:
-     <code>VgT_SWiden14</code>, <code>VgT_SWiden24</code>,
-     <code>VgT_SWiden12</code>, <code>VgT_ZWiden14</code>,
-     <code>VgT_ZWiden24</code> and <code>VgT_ZWiden12</code>.  These
-     mimic the definedness effects of standard signed and unsigned
-     integer widening.  Unsigned widening creates zero bits in the new
-     positions, so <code>VgT_ZWiden*</code> accordingly mark
-     those parts of their argument as defined.  Signed widening copies
-     the sign bit into the new positions, so <code>VgT_SWiden*</code>
-     copies the definedness of the sign bit into the new positions.
-     Because 1 means undefined and 0 means defined, these operations
-     can (fascinatingly) be done by the same operations which they
-     mimic.  Go figure.
-<p>
-<li><b>(BINARY) Undefined-if-either-Undefined,
-     Defined-if-either-Defined</b>: <code>VgT_UifU4</code>,
-     <code>VgT_UifU2</code>, <code>VgT_UifU1</code>,
-     <code>VgT_UifU0</code>, <code>VgT_DifD4</code>,
-     <code>VgT_DifD2</code>, <code>VgT_DifD1</code>.  These do simple
-     bitwise operations on pairs of V-bit vectors, with
-     <code>UifU</code> giving undefined if either arg bit is
-     undefined, and <code>DifD</code> giving defined if either arg bit
-     is defined.  Abstract interpretation junkies, if any make it this
-     far, may like to think of them as meets and joins (or is it joins
-     and meets) in the definedness lattices.  
-<p>
-<li><b>(BINARY; one value, one V bits) Generate argument improvement
-    terms for AND and OR</b>: <code>VgT_ImproveAND4_TQ</code>,
-    <code>VgT_ImproveAND2_TQ</code>, <code>VgT_ImproveAND1_TQ</code>,
-    <code>VgT_ImproveOR4_TQ</code>, <code>VgT_ImproveOR2_TQ</code>,
-    <code>VgT_ImproveOR1_TQ</code>.  These help out with AND and OR
-    operations.  AND and OR have the inconvenient property that the
-    definedness of the result depends on the actual values of the
-    arguments as well as their definedness.  At the bit level:
-    <br><code>1 AND undefined = undefined</code>, but 
-    <br><code>0 AND undefined = 0</code>, and similarly 
-    <br><code>0 OR  undefined = undefined</code>, but 
-    <br><code>1 OR  undefined = 1</code>.
-    <br>
-    <p>
-    It turns out that gcc (quite legitimately) generates code which
-    relies on this fact, so we have to model it properly in order to
-    avoid flooding users with spurious value errors.  The ultimate
-    definedness result of AND and OR is calculated using
-    <code>UifU</code> on the definedness of the arguments, but we
-    also <code>DifD</code> in some "improvement" terms which 
-    take into account the above phenomena.  
-    <p>
-    <code>ImproveAND</code> takes as its first argument the actual
-    value of an argument to AND (the T) and the definedness of that
-    argument (the Q), and returns a V-bit vector which is defined (0)
-    for bits which have value 0 and are defined; this, when
-    <code>DifD</code>ed into the final result, causes those bits to be
-    defined even if the corresponding bit in the other argument is undefined.
-    <p>
-    The <code>ImproveOR</code> ops do the dual thing for OR
-    arguments.  Note that XOR does not have this property that one
-    argument can make the other irrelevant, so there is no need for
-    such complexity for XOR.
-</ul>
-
-<p>
-That's all the tag ops.  If you stare at this long enough, and then
-run Valgrind and stare at the pre- and post-instrumented ucode, it
-should be fairly obvious how the instrumentation machinery hangs
-together.
-
-<p>
-One point, if you do this: in order to make it easy to differentiate
-<code>TempReg</code>s carrying values from <code>TempReg</code>s
-carrying V bit vectors, Valgrind prints the former as (for example)
-<code>t28</code> and the latter as <code>q28</code>; the fact that
-they carry the same number serves to indicate their relationship.
-This is purely for the convenience of the human reader; the register
-allocator and code generator don't regard them as different.
-
-
-<h3>Translation into UCode</h3>
-
-<code>VG_(disBB)</code> allocates a new <code>UCodeBlock</code> and
-then uses <code>disInstr</code> to translate x86 instructions one at a
-time into UCode, dumping the result in the <code>UCodeBlock</code>.
-This goes on until a control-flow transfer instruction is encountered.
-
-<p>
-Despite the large size of <code>vg_to_ucode.c</code>, this translation
-is really very simple.  Each x86 instruction is translated entirely
-independently of its neighbours, merrily allocating new
-<code>TempReg</code>s as it goes.  The idea is to have a simple
-translator -- in reality, no more than a macro-expander -- and the
-resulting bad UCode translation is cleaned up by the UCode
-optimisation phase which follows.  To give you an idea of some x86
-instructions and their translations (this is a complete basic block,
-as Valgrind sees it):
-<pre>
-        0x40435A50:  incl %edx
-
-           0: GETL      %EDX, t0
-           1: INCL      t0  (-wOSZAP)
-           2: PUTL      t0, %EDX
-
-        0x40435A51:  movsbl (%edx),%eax
-
-           3: GETL      %EDX, t2
-           4: LDB       (t2), t2
-           5: WIDENL_Bs t2
-           6: PUTL      t2, %EAX
-
-        0x40435A54:  testb $0x20, 1(%ecx,%eax,2)
-
-           7: GETL      %EAX, t6
-           8: GETL      %ECX, t8
-           9: LEA2L     1(t8,t6,2), t4
-          10: LDB       (t4), t10
-          11: MOVB      $0x20, t12
-          12: ANDB      t12, t10  (-wOSZACP)
-          13: INCEIPo   $9
-
-        0x40435A59:  jnz-8 0x40435A50
-
-          14: Jnzo      $0x40435A50  (-rOSZACP)
-          15: JMPo      $0x40435A5B
-</pre>
-
-<p>
-Notice how the block always ends with an unconditional jump to the
-next block.  This is a bit unnecessary, but makes many things simpler.
-
-<p>
-Most x86 instructions turn into sequences of <code>GET</code>,
-<code>PUT</code>, <code>LEA1</code>, <code>LEA2</code>,
-<code>LOAD</code> and <code>STORE</code>.  Some complicated ones
-however rely on calling helper bits of code in 
-<code>vg_helpers.S</code>.  The ucode instructions <code>PUSH</code>,
-<code>POP</code>, <code>CALLM</code>, <code>CALLM_S</code> and
-<code>CALLM_E</code> support this.  The calling convention is somewhat
-ad-hoc and is not the C calling convention.  The helper routines must 
-save all integer registers, and the flags, that they use.  Args are
-passed on the stack underneath the return address, as usual, and if 
-result(s) are to be returned, it (they) are either placed in dummy arg
-slots created by the ucode <code>PUSH</code> sequence, or just
-overwrite the incoming args.
-
-<p>
-In order that the instrumentation mechanism can handle calls to these
-helpers, <code>VG_(saneUCodeBlock)</code> enforces the following
-restrictions on calls to helpers:
-
-<ul>
-<li>Each <code>CALLM</code> uinstr must be bracketed by a preceding
-    <code>CALLM_S</code> marker (dummy uinstr) and a trailing
-    <code>CALLM_E</code> marker.  These markers are used by the
-    instrumentation mechanism later to establish the boundaries of the
-    <code>PUSH</code>, <code>POP</code> and <code>CLEAR</code>
-    sequences for the call.
-<p>
-<li><code>PUSH</code>, <code>POP</code> and <code>CLEAR</code>
-    may only appear inside sections bracketed by <code>CALLM_S</code>
-    and <code>CALLM_E</code>, and nowhere else.
-<p>
-<li>In any such bracketed section, no two <code>PUSH</code> insns may
-    push the same <code>TempReg</code>.  Dually, no two
-    <code>POP</code>s may pop the same <code>TempReg</code>.
-<p>
-<li>Finally, although this is not checked, args should be removed from
-    the stack with <code>CLEAR</code>, rather than <code>POP</code>s
-    into a <code>TempReg</code> which is not subsequently used.  This
-    is because the instrumentation mechanism assumes that all values
-    <code>POP</code>ped from the stack are actually used.
-</ul>
-
-Some of the translations may appear to have redundant
-<code>TempReg</code>-to-<code>TempReg</code> moves.  This helps the
-next phase, UCode optimisation, to generate better code.
-
-
-
-<h3>UCode optimisation</h3>
-
-UCode is then subjected to an improvement pass
-(<code>vg_improve()</code>), which blurs the boundaries between the
-translations of the original x86 instructions.  It's pretty
-straightforward.  Three transformations are done:
-
-<ul>
-<li>Redundant <code>GET</code> elimination.  Actually, more general
-    than that -- eliminates redundant fetches of ArchRegs.  In our
-    running example, uinstr 3 <code>GET</code>s <code>%EDX</code> into
-    <code>t2</code> despite the fact that, by looking at the previous
-    uinstr, it is already in <code>t0</code>.  The <code>GET</code> is
-    therefore removed, and <code>t2</code> renamed to <code>t0</code>.
-    Assuming <code>t0</code> is allocated to a host register, it means
-    the simulated <code>%EDX</code> will exist in a host CPU register
-    for more than one simulated x86 instruction, which seems to me to
-    be a highly desirable property.
-    <p>
-    There is some mucking around to do with subregisters;
-    <code>%AL</code> vs <code>%AH</code>, <code>%AX</code> vs
-    <code>%EAX</code> etc.  I can't remember how it works, but in
-    general we are very conservative, and these tend to invalidate the
-    caching. 
-<p>
-<li>Redundant <code>PUT</code> elimination.  This annuls
-    <code>PUT</code>s of values back to simulated CPU registers if a
-    later <code>PUT</code> would overwrite the earlier
-    <code>PUT</code> value, and there is no intervening reads of the
-    simulated register (<code>ArchReg</code>).
-    <p>
-    As before, we are paranoid when faced with subregister references.
-    Also, <code>PUT</code>s of <code>%ESP</code> are never annulled,
-    because it is vital the instrumenter always has an up-to-date
-    <code>%ESP</code> value available, since <code>%ESP</code> changes
-    affect addressability of the memory around the simulated stack
-    pointer.
-    <p>
-    The implication of the above paragraph is that the simulated
-    machine's registers are only lazily updated once the above two
-    optimisation phases have run, with the exception of
-    <code>%ESP</code>.  <code>TempReg</code>s go dead at the end of
-    every basic block, from which it is inferable that any
-    <code>TempReg</code> caching a simulated CPU reg is flushed (back
-    into the relevant <code>VG_(baseBlock)</code> slot) at the end of
-    every basic block.  The further implication is that the simulated
-    registers are only up-to-date in between basic blocks, and not
-    at arbitrary points inside basic blocks.  And the consequence of
-    that is that we can only deliver signals to the client in between
-    basic blocks.  None of this seems any problem in practice.
-<p>
-<li>Finally there is a simple def-use thing for condition codes.  If
-    an earlier uinstr writes the condition codes, and the next uinsn
-    along which actually cares about the condition codes writes the
-    same or larger set of them, but does not read any, the earlier
-    uinsn is marked as not writing any condition codes.  This saves 
-    a lot of redundant cond-code saving and restoring.
-</ul>
-
-The effect of these transformations on our short block is rather
-unexciting, and shown below.  On longer basic blocks they can
-dramatically improve code quality.
-
-<pre>
-at 3: delete GET, rename t2 to t0 in (4 .. 6)
-at 7: delete GET, rename t6 to t0 in (8 .. 9)
-at 1: annul flag write OSZAP due to later OSZACP
-
-Improved code:
-           0: GETL      %EDX, t0
-           1: INCL      t0
-           2: PUTL      t0, %EDX
-           4: LDB       (t0), t0
-           5: WIDENL_Bs t0
-           6: PUTL      t0, %EAX
-           8: GETL      %ECX, t8
-           9: LEA2L     1(t8,t0,2), t4
-          10: LDB       (t4), t10
-          11: MOVB      $0x20, t12
-          12: ANDB      t12, t10  (-wOSZACP)
-          13: INCEIPo   $9
-          14: Jnzo      $0x40435A50  (-rOSZACP)
-          15: JMPo      $0x40435A5B
-</pre>
-
-<h3>UCode instrumentation</h3>
-
-Once you understand the meaning of the instrumentation uinstrs,
-discussed in detail above, the instrumentation scheme is fairly
-straightforward.  Each uinstr is instrumented in isolation, and the
-instrumentation uinstrs are placed before the original uinstr.
-Our running example continues below.  I have placed a blank line 
-after every original ucode, to make it easier to see which
-instrumentation uinstrs correspond to which originals.
-
-<p>
-As mentioned somewhere above, <code>TempReg</code>s carrying values 
-have names like <code>t28</code>, and each one has a shadow carrying
-its V bits, with names like <code>q28</code>.  This pairing aids in
-reading instrumented ucode.
-
-<p>
-One decision about all this is where to have "observation points",
-that is, where to check that V bits are valid.  I use a minimalistic
-scheme, only checking where a failure of validity could cause the 
-original program to (seg)fault.  So the use of values as memory
-addresses causes a check, as do conditional jumps (these cause a check
-on the definedness of the condition codes).  And arguments
-<code>PUSH</code>ed for helper calls are checked, hence the weird
-restrictions on helper call preambles described above.
-
-<p>
-Another decision is that once a value is tested, it is thereafter
-regarded as defined, so that we do not emit multiple undefined-value
-errors for the same undefined value.  That means that
-<code>TESTV</code> uinstrs are always followed by <code>SETV</code> 
-on the same (shadow) <code>TempReg</code>s.  Most of these
-<code>SETV</code>s are redundant and are removed by the
-post-instrumentation cleanup phase.
-
-<p>
-The instrumentation for calling helper functions deserves further
-comment.  The definedness of results from a helper is modelled using
-just one V bit.  So, in short, we do pessimising casts of the
-definedness of all the args, down to a single bit, and then
-<code>UifU</code> these bits together.  So this single V bit will say
-"undefined" if any part of any arg is undefined.  This V bit is then
-pessimally cast back up to the result(s) sizes, as needed.  If, by
-seeing that all the args are got rid of with <code>CLEAR</code> and
-none with <code>POP</code>, Valgrind sees that the result of the call
-is not actually used, it immediately examines the result V bit with a
-<code>TESTV</code> -- <code>SETV</code> pair.  If it did not do this,
-there would be no observation point to detect that some of the
-args to the helper were undefined.  Of course, if the helper's results
-are indeed used, we don't do this, since the result usage will
-presumably cause the result definedness to be checked at some suitable
-future point.
-
-<p>
-In general Valgrind tries to track definedness on a bit-for-bit basis,
-but as the above para shows, for calls to helpers we throw in the
-towel and approximate down to a single bit.  This is because it's too
-complex and difficult to track bit-level definedness through complex
-ops such as integer multiply and divide, and in any case there are no
-reasonable code fragments which attempt to (eg) multiply two
-partially-defined values and end up with something meaningful, so
-there seems little point in modelling multiplies, divides, etc, in
-that level of detail.
-
-<p>
-Integer loads and stores are instrumented with firstly a test of the
-definedness of the address, followed by a <code>LOADV</code> or
-<code>STOREV</code> respectively.  These turn into calls to 
-(for example) <code>VG_(helperc_LOADV4)</code>.  These helpers do two
-things: they perform an address-valid check, and they load or store V
-bits from/to the relevant address in the (simulated V-bit) memory.
-
-<p>
-FPU loads and stores are different.  As above the definedness of the
-address is first tested.  However, the helper routine for FPU loads
-(<code>VGM_(fpu_read_check)</code>) emits an error if either the
-address is invalid or the referenced area contains undefined values.
-It has to do this because we do not simulate the FPU at all, and so
-cannot track definedness of values loaded into it from memory, so we
-have to check them as soon as they are loaded into the FPU, ie, at
-this point.  We notionally assume that everything in the FPU is
-defined.
-
-<p>
-It follows therefore that FPU writes first check the definedness of
-the address, then the validity of the address, and finally mark the
-written bytes as well-defined.
-
-<p>
-If anyone is inspired to extend Valgrind to MMX/SSE insns, I suggest
-you use the same trick.  It works provided that the FPU/MMX unit is
-not used merely as a conduit to copy partially undefined data from
-one place in memory to another.  Unfortunately the integer CPU is used
-like that (when copying C structs with holes, for example) and this is
-the cause of much of the elaborateness of the instrumentation here
-described.
-
-<p>
-<code>vg_instrument()</code> in <code>vg_translate.c</code> actually
-does the instrumentation.  There are comments explaining how each
-uinstr is handled, so we do not repeat that here.  As explained
-already, it is bit-accurate, except for calls to helper functions.
-Unfortunately the x86 insns <code>bt/bts/btc/btr</code> are done by
-helper fns, so bit-level accuracy is lost there.  This should be fixed
-by doing them inline; it will probably require adding a couple new
-uinstrs.  Also, left and right rotates through the carry flag (x86
-<code>rcl</code> and <code>rcr</code>) are approximated via a single
-V bit; so far this has not caused anyone to complain.  The
-non-carry rotates, <code>rol</code> and <code>ror</code>, are much
-more common and are done exactly.  Re-visiting the instrumentation for
-AND and OR, they seem rather verbose, and I wonder if it could be done
-more concisely now.
-
-<p>
-The lowercase <code>o</code> on many of the uopcodes in the running
-example indicates that the size field is zero, usually meaning a
-single-bit operation.
-
-<p>
-Anyroads, the post-instrumented version of our running example looks
-like this:
-
-<pre>
-Instrumented code:
-           0: GETVL     %EDX, q0
-           1: GETL      %EDX, t0
-
-           2: TAG1o     q0 = Left4 ( q0 )
-           3: INCL      t0
-
-           4: PUTVL     q0, %EDX
-           5: PUTL      t0, %EDX
-
-           6: TESTVL    q0
-           7: SETVL     q0
-           8: LOADVB    (t0), q0
-           9: LDB       (t0), t0
-
-          10: TAG1o     q0 = SWiden14 ( q0 )
-          11: WIDENL_Bs t0
-
-          12: PUTVL     q0, %EAX
-          13: PUTL      t0, %EAX
-
-          14: GETVL     %ECX, q8
-          15: GETL      %ECX, t8
-
-          16: MOVL      q0, q4
-          17: SHLL      $0x1, q4
-          18: TAG2o     q4 = UifU4 ( q8, q4 )
-          19: TAG1o     q4 = Left4 ( q4 )
-          20: LEA2L     1(t8,t0,2), t4
-
-          21: TESTVL    q4
-          22: SETVL     q4
-          23: LOADVB    (t4), q10
-          24: LDB       (t4), t10
-
-          25: SETVB     q12
-          26: MOVB      $0x20, t12
-
-          27: MOVL      q10, q14
-          28: TAG2o     q14 = ImproveAND1_TQ ( t10, q14 )
-          29: TAG2o     q10 = UifU1 ( q12, q10 )
-          30: TAG2o     q10 = DifD1 ( q14, q10 )
-          31: MOVL      q12, q14
-          32: TAG2o     q14 = ImproveAND1_TQ ( t12, q14 )
-          33: TAG2o     q10 = DifD1 ( q14, q10 )
-          34: MOVL      q10, q16
-          35: TAG1o     q16 = PCast10 ( q16 )
-          36: PUTVFo    q16
-          37: ANDB      t12, t10  (-wOSZACP)
-
-          38: INCEIPo   $9
-
-          39: GETVFo    q18
-          40: TESTVo    q18
-          41: SETVo     q18
-          42: Jnzo      $0x40435A50  (-rOSZACP)
-
-          43: JMPo      $0x40435A5B
-</pre>
-
-
-<h3>UCode post-instrumentation cleanup</h3>
-
-<p>
-This pass, coordinated by <code>vg_cleanup()</code>, removes redundant
-definedness computation created by the simplistic instrumentation
-pass.  It consists of two passes,
-<code>vg_propagate_definedness()</code> followed by
-<code>vg_delete_redundant_SETVs</code>.
-
-<p>
-<code>vg_propagate_definedness()</code> is a simple
-constant-propagation and constant-folding pass.  It tries to determine
-which <code>TempReg</code>s containing V bits will always indicate
-"fully defined", and it propagates this information as far as it can,
-and folds out as many operations as possible.  For example, the
-instrumentation for an ADD of a literal to a variable quantity will be
-reduced down so that the definedness of the result is simply the
-definedness of the variable quantity, since the literal is by
-definition fully defined.
-
-<p>
-<code>vg_delete_redundant_SETVs</code> removes <code>SETV</code>s on
-shadow <code>TempReg</code>s for which the next action is a write.
-I don't think there's anything else worth saying about this; it is
-simple.  Read the sources for details.
-
-<p>
-So the cleaned-up running example looks like this.  As above, I have
-inserted line breaks after every original (non-instrumentation) uinstr
-to aid readability.  As with straightforward ucode optimisation, the
-results in this block are undramatic because it is so short; longer
-blocks benefit more because they have more redundancy which gets
-eliminated.
-
-
-<pre>
-at 29: delete UifU1 due to defd arg1
-at 32: change ImproveAND1_TQ to MOV due to defd arg2
-at 41: delete SETV
-at 31: delete MOV
-at 25: delete SETV
-at 22: delete SETV
-at 7: delete SETV
-
-           0: GETVL     %EDX, q0
-           1: GETL      %EDX, t0
-
-           2: TAG1o     q0 = Left4 ( q0 )
-           3: INCL      t0
-
-           4: PUTVL     q0, %EDX
-           5: PUTL      t0, %EDX
-
-           6: TESTVL    q0
-           8: LOADVB    (t0), q0
-           9: LDB       (t0), t0
-
-          10: TAG1o     q0 = SWiden14 ( q0 )
-          11: WIDENL_Bs t0
-
-          12: PUTVL     q0, %EAX
-          13: PUTL      t0, %EAX
-
-          14: GETVL     %ECX, q8
-          15: GETL      %ECX, t8
-
-          16: MOVL      q0, q4
-          17: SHLL      $0x1, q4
-          18: TAG2o     q4 = UifU4 ( q8, q4 )
-          19: TAG1o     q4 = Left4 ( q4 )
-          20: LEA2L     1(t8,t0,2), t4
-
-          21: TESTVL    q4
-          23: LOADVB    (t4), q10
-          24: LDB       (t4), t10
-
-          26: MOVB      $0x20, t12
-
-          27: MOVL      q10, q14
-          28: TAG2o     q14 = ImproveAND1_TQ ( t10, q14 )
-          30: TAG2o     q10 = DifD1 ( q14, q10 )
-          32: MOVL      t12, q14
-          33: TAG2o     q10 = DifD1 ( q14, q10 )
-          34: MOVL      q10, q16
-          35: TAG1o     q16 = PCast10 ( q16 )
-          36: PUTVFo    q16
-          37: ANDB      t12, t10  (-wOSZACP)
-
-          38: INCEIPo   $9
-          39: GETVFo    q18
-          40: TESTVo    q18
-          42: Jnzo      $0x40435A50  (-rOSZACP)
-
-          43: JMPo      $0x40435A5B
-</pre>
-
-
-<h3>Translation from UCode</h3>
-
-This is all very simple, even though <code>vg_from_ucode.c</code>
-is a big file.  Position-independent x86 code is generated into 
-a dynamically allocated array <code>emitted_code</code>; this is
-doubled in size when it overflows.  Eventually the array is handed
-back to the caller of <code>VG_(translate)</code>, who must copy
-the result into TC and TT, and free the array.
-
-<p>
-This file is structured into four layers of abstraction, which,
-thankfully, are glued back together with extensive
-<code>__inline__</code> directives.  From the bottom upwards:
-
-<ul>
-<li>Address-mode emitters, <code>emit_amode_regmem_reg</code> et al.
-<p>
-<li>Emitters for specific x86 instructions.  There are quite a lot of
-    these, with names such as <code>emit_movv_offregmem_reg</code>.
-    The <code>v</code> suffix is Intel parlance for a 16/32 bit insn;
-    there are also <code>b</code> suffixes for 8 bit insns.
-<p>
-<li>The next level up are the <code>synth_*</code> functions, which
-    synthesise possibly a sequence of raw x86 instructions to do some
-    simple task.  Some of these are quite complex because they have to
-    work around Intel's silly restrictions on subregister naming.  See 
-    <code>synth_nonshiftop_reg_reg</code> for example.
-<p>
-<li>Finally, at the top of the heap, we have
-    <code>emitUInstr()</code>,
-    which emits code for a single uinstr.
-</ul>
-
-<p>
-Some comments:
-<ul>
-<li>The hack for FPU instructions becomes apparent here.  To do a
-    <code>FPU</code> ucode instruction, we load the simulated FPU's
-    state from <code>VG_(baseBlock)</code> into the real FPU
-    using an x86 <code>frstor</code> insn, do the ucode
-    <code>FPU</code> insn on the real CPU, and write the updated FPU
-    state back into <code>VG_(baseBlock)</code> using an
-    <code>fnsave</code> instruction.  This is pretty brutal, but is
-    simple and it works, and even seems tolerably efficient.  There is
-    no attempt to cache the simulated FPU state in the real FPU over
-    multiple back-to-back ucode FPU instructions.
-    <p>
-    <code>FPU_R</code> and <code>FPU_W</code> are also done this way,
-    with the minor complication that we need to patch in some
-    addressing mode bits so the resulting insn knows the effective
-    address to use.  This is easy because of the regularity of the x86
-    FPU instruction encodings.
-<p>
-<li>An analogous trick is done with ucode insns which claim, in their
-    <code>flags_r</code> and <code>flags_w</code> fields, that they
-    read or write the simulated <code>%EFLAGS</code>.  For such cases
-    we first copy the simulated <code>%EFLAGS</code> into the real
-    <code>%eflags</code>, then do the insn, then, if the insn says it
-    writes the flags, copy back to <code>%EFLAGS</code>.  This is a
-    bit expensive, which is why the ucode optimisation pass goes to
-    some effort to remove redundant flag-update annotations.
-</ul>
-
-<p>
-And so ... that's the end of the documentation for the instrumenting
-translator!  It's really not that complex, because it's composed as a
-sequence of simple(ish) self-contained transformations on
-straight-line blocks of code.
-
-
-<h3>Top-level dispatch loop</h3>
-
-Urk.  In <code>VG_(toploop)</code>.  This is basically boring and
-unsurprising, not to mention fiddly and fragile.  It needs to be
-cleaned up.  
-
-<p>
-Perhaps the only surprise is that the whole thing is run
-on top of a <code>setjmp</code>-installed exception handler, because,
-supposing a translation got a segfault, we have to bail out of the
-Valgrind-supplied exception handler <code>VG_(oursignalhandler)</code>
-and immediately start running the client's segfault handler, if it has
-one.  In particular we can't finish the current basic block and then
-deliver the signal at some convenient future point, because signals
-like SIGILL, SIGSEGV and SIGBUS mean that the faulting insn should not
-simply be re-tried.  (I'm sure there is a clearer way to explain this).
-
-
-<h3>Exceptions, creating new translations</h3>
-<h3>Self-modifying code</h3>
-
-<h3>Lazy updates of the simulated program counter</h3>
-
-Simulated <code>%EIP</code> is not updated after every simulated x86
-insn as this was regarded as too expensive.  Instead ucode
-<code>INCEIP</code> insns move it along as and when necessary.
-Currently we don't allow it to fall more than 4 bytes behind reality
-(see <code>VG_(disBB)</code> for the way this works).
-<p>
-Note that <code>%EIP</code> is always brought up to date by the inner
-dispatch loop in <code>VG_(dispatch)</code>, so that if the client
-takes a fault we know at least which basic block this happened in.
-
-
-<h3>The translation cache and translation table</h3>
-
-<h3>Signals</h3>
-
-Horrible, horrible.  <code>vg_signals.c</code>.
-Basically, since we have to intercept all system
-calls anyway, we can see when the client tries to install a signal
-handler.  If it does so, we make a note of what the client asked to
-happen, and ask the kernel to route the signal to our own signal
-handler, <code>VG_(oursignalhandler)</code>.  This simply notes the
-delivery of signals, and returns.  
-
-<p>
-Every 1000 basic blocks, we see if more signals have arrived.  If so,
-<code>VG_(deliver_signals)</code> builds signal delivery frames on the
-client's stack, and allows their handlers to be run.  Valgrind places
-in these signal delivery frames a bogus return address,
-<code>VG_(signalreturn_bogusRA)</code>, and checks all jumps to see
-if any jump to it.  If so, this is a sign that a signal handler is
-returning, and if so Valgrind removes the relevant signal frame from
-the client's stack, restores from the signal frame the simulated
-state before the signal was delivered, and allows the client to run
-onwards.  We have to do it this way because some signal handlers never
-return, they just <code>longjmp()</code>, which nukes the signal
-delivery frame.
-
-<p>
-The Linux kernel has a different but equally horrible hack for
-detecting signal handler returns.  Discovering it is left as an
-exercise for the reader.
-
-
-
-<h3>Errors, error contexts, error reporting, suppressions</h3>
-<h3>Client malloc/free</h3>
-<h3>Low-level memory management</h3>
-<h3>A and V bitmaps</h3>
-<h3>Symbol table management</h3>
-<h3>Dealing with system calls</h3>
-<h3>Namespace management</h3>
-<h3>GDB attaching</h3>
-<h3>Non-dependence on glibc or anything else</h3>
-<h3>The leak detector</h3>
-<h3>Performance problems</h3>
-<h3>Continuous sanity checking</h3>
-<h3>Tracing, or not tracing, child processes</h3>
-<h3>Assembly glue for syscalls</h3>
-
-
-<hr width="100%">
-
-<h2>Extensions</h2>
-
-Some comments about Stuff To Do.
-
-<h3>Bugs</h3>
-
-Stephan Kulow and Marc Mutz report problems with kmail in KDE 3 CVS
-(RC2 ish) when run on Valgrind.  Stephan has it deadlocking; Marc has
-it looping at startup.  I can't repro either behaviour. Needs
-repro-ing and fixing.
-
-
-<h3>Threads</h3>
-
-Doing a good job of thread support strikes me as almost a
-research-level problem.  The central issues are how to do fast cheap
-locking of the <code>VG_(primary_map)</code> structure, whether or not
-accesses to the individual secondary maps need locking, what
-race-condition issues result, and whether the already-nasty mess that
-is the signal simulator needs further hackery.
-
-<p>
-I realise that threads are the most-frequently-requested feature, and
-I am thinking about it all.  If you have guru-level understanding of 
-fast mutual exclusion mechanisms and race conditions, I would be
-interested in hearing from you.
-
-
-<h3>Verification suite</h3>
-
-Directory <code>tests/</code> contains various ad-hoc tests for
-Valgrind.  However, there is no systematic verification or regression
-suite, that, for example, exercises all the stuff in
-<code>vg_memory.c</code>, to ensure that illegal memory accesses and
-undefined value uses are detected as they should be.  It would be good
-to have such a suite.
-
-
-<h3>Porting to other platforms</h3>
-
-It would be great if Valgrind was ported to FreeBSD and x86 NetBSD,
-and to x86 OpenBSD, if it's possible (doesn't OpenBSD use a.out-style
-executables, not ELF ?)
-
-<p>
-The main difficulties, for an x86-ELF platform, seem to be:
-
-<ul>
-<li>You'd need to rewrite the <code>/proc/self/maps</code> parser
-    (<code>vg_procselfmaps.c</code>).
-    Easy.
-<p>
-<li>You'd need to rewrite <code>vg_syscall_mem.c</code>, or, more
-    specifically, provide one for your OS.  This is tedious, but you
-    can implement syscalls on demand, and the Linux kernel interface
-    is, for the most part, going to look very similar to the *BSD
-    interfaces, so it's really a copy-paste-and-modify-on-demand job.
-    As part of this, you'd need to supply a new
-    <code>vg_kerneliface.h</code> file.
-<p>
-<li>You'd also need to change the syscall wrappers for Valgrind's
-    internal use, in <code>vg_mylibc.c</code>.
-</ul>
-
-All in all, I think a port to x86-ELF *BSDs is not really very
-difficult, and in some ways I would like to see it happen, because
-that would force a more clear factoring of Valgrind into platform
-dependent and independent pieces.  Not to mention, *BSD folks also
-deserve to use Valgrind just as much as the Linux crew do.
-
-
-<p>
-<hr width="100%">
-
-<h2>Easy stuff which ought to be done</h2>
-
-<h3>MMX instructions</h3>
-
-MMX insns should be supported, using the same trick as for FPU insns.
-If the MMX registers are not used to copy uninitialised junk from one
-place to another in memory, this means we don't have to actually
-simulate the internal MMX unit state, so the FPU hack applies.  This
-should be fairly easy.
-
-
-
-<h3>Fix stabs-info reader</h3>
-
-The machinery in <code>vg_symtab2.c</code> which reads "stabs" style
-debugging info is pretty weak.  It usually correctly translates 
-simulated program counter values into line numbers and procedure
-names, but the file name is often completely wrong.  I think the
-logic used to parse "stabs" entries is weak.  It should be fixed.
-The simplest solution, IMO, is to copy either the logic or simply the
-code out of GNU binutils which does this; since GDB can clearly get it
-right, binutils (or GDB?) must have code to do this somewhere.
-
-
-
-
-
-<h3>BT/BTC/BTS/BTR</h3>
-
-These are x86 instructions which test, complement, set, or reset, a
-single bit in a word.  At the moment they are both incorrectly
-implemented and incorrectly instrumented.
-
-<p>
-The incorrect instrumentation is due to use of helper functions.  This
-means we lose bit-level definedness tracking, which could wind up
-giving spurious uninitialised-value use errors.  The Right Thing to do
-is to invent a couple of new UOpcodes, I think <code>GET_BIT</code>
-and <code>SET_BIT</code>, which can be used to implement all 4 x86
-insns, get rid of the helpers, and give bit-accurate instrumentation
-rules for the two new UOpcodes.
-
-<p>
-I realised the other day that they are mis-implemented too.  The x86
-insns take a bit-index and a register or memory location to access.
-For registers the bit index clearly can only be in the range zero to
-register-width minus 1, and I assumed the same applied to memory
-locations too.  But evidently not; for memory locations the index can
-be arbitrary, and the processor will index arbitrarily into memory as
-a result.  This too should be fixed.  Sigh.  Presumably indexing
-outside the immediate word is not actually used by any programs yet
-tested on Valgrind, for otherwise they (presumably) would simply not
-work at all.  If you plan to hack on this, first check the Intel docs
-to make sure my understanding is really correct.
-
-
-
-<h3>Using PREFETCH instructions</h3>
-
-Here's a small but potentially interesting project for performance
-junkies.  Experiments with valgrind's code generator and optimiser(s)
-suggest that reducing the number of instructions executed in the
-translations and mem-check helpers gives disappointingly small
-performance improvements.  Perhaps this is because performance of
-Valgrindified code is limited by cache misses.  After all, each read
-in the original program now gives rise to at least three reads, one
-for the <code>VG_(primary_map)</code>, one for the resulting
-secondary, and the original.  Not to mention, the instrumented
-translations are 13 to 14 times larger than the originals.  All in all
-one would expect the memory system to be hammered to hell and then
-some.
-
-<p>
-So here's an idea.  An x86 insn involving a read from memory, after
-instrumentation, will turn into ucode of the following form:
-<pre>
-    ... calculate effective addr, into ta and qa ...
-    TESTVL qa             -- is the addr defined?
-    LOADV (ta), qloaded   -- fetch V bits for the addr
-    LOAD  (ta), tloaded   -- do the original load
-</pre>
-At the point where the <code>LOADV</code> is done, we know the actual
-address (<code>ta</code>) from which the real <code>LOAD</code> will
-be done.  We also know that the <code>LOADV</code> will take around
-20 x86 insns to do.  So it seems plausible that doing a prefetch of
-<code>ta</code> just before the <code>LOADV</code> might just avoid a
-miss at the <code>LOAD</code> point, and that might be a significant
-performance win.
-
-<p>
-Prefetch insns are notoriously temperamental, more often than not
-making things worse rather than better, so this would require
-considerable fiddling around.  It's complicated because Intels and
-AMDs have different prefetch insns with different semantics, so that
-too needs to be taken into account.  As a general rule, even placing
-the prefetches before the <code>LOADV</code> insn is too near the
-<code>LOAD</code>; the ideal distance is apparently circa 200 CPU
-cycles.  So it might be worth having another analysis/transformation
-pass which pushes prefetches as far back as possible, hopefully 
-immediately after the effective address becomes available.
-
-<p>
-Doing too many prefetches is also bad because they soak up bus
-bandwidth / cpu resources, so some cleverness in deciding which loads
-to prefetch and which to not might be helpful.  One can imagine not
-prefetching client-stack-relative (<code>%EBP</code> or
-<code>%ESP</code>) accesses, since the stack in general tends to show
-good locality anyway.
-
-<p>
-There's quite a lot of experimentation to do here, but I think it
-might make an interesting week's work for someone.
-
-<p>
-As of 15-ish March 2002, I've started to experiment with this, using
-the AMD <code>prefetch/prefetchw</code> insns.
-
-
-
-<h3>User-defined permission ranges</h3>
-
-This is quite a large project -- perhaps a month's hacking for a
-capable hacker to do a good job -- but it's potentially very
-interesting.  The outcome would be that Valgrind could detect a 
-whole class of bugs which it currently cannot.
-
-<p>
-The presentation falls into two pieces.
-
-<p>
-<b>Part 1: user-defined address-range permission setting</b>
-<p>
-
-Valgrind intercepts the client's <code>malloc</code>,
-<code>free</code>, etc calls, watches system calls, and watches the
-stack pointer move.  This is currently the only way it knows about
-which addresses are valid and which not.  Sometimes the client program
-knows extra information about its memory areas.  For example, the
-client could at some point know that all elements of an array are
-out-of-date.  We would like to be able to convey to Valgrind this
-information that the array is now addressable-but-uninitialised, so
-that Valgrind can then warn if elements are used before they get new
-values. 
-
-<p>
-What I would like are some macros like this:
-<pre>
-   VALGRIND_MAKE_NOACCESS(addr, len)
-   VALGRIND_MAKE_WRITABLE(addr, len)
-   VALGRIND_MAKE_READABLE(addr, len)
-</pre>
-and also, to check that memory is addressable/initialised,
-<pre>
-   VALGRIND_CHECK_ADDRESSIBLE(addr, len)
-   VALGRIND_CHECK_INITIALISED(addr, len)
-</pre>
-
-<p>
-I then include in my sources a header defining these macros, rebuild
-my app, run under Valgrind, and get user-defined checks.
-
-<p>
-Now here's a neat trick.  It's a nuisance to have to re-link the app
-with some new library which implements the above macros.  So the idea
-is to define the macros so that the resulting executable is still
-completely stand-alone, and can be run without Valgrind, in which case
-the macros do nothing, but when run on Valgrind, the Right Thing
-happens.  How to do this?  The idea is for these macros to turn into a
-piece of inline assembly code, which (1) has no effect when run on the
-real CPU, (2) is easily spotted by Valgrind's JITter, and (3) no sane
-person would ever write, which is important for avoiding false matches
-in (2).  So here's a suggestion:
-<pre>
-   VALGRIND_MAKE_NOACCESS(addr, len)
-</pre>
-becomes (roughly speaking)
-<pre>
-   movl addr, %eax
-   movl len,  %ebx
-   movl $1,   %ecx   -- 1 describes the action; MAKE_WRITABLE might be
-                     -- 2, etc
-   rorl $13, %ecx
-   rorl $19, %ecx
-   rorl $11, %eax
-   rorl $21, %eax
-</pre>
-The rotate sequences have no effect, and it's unlikely they would
-appear for any other reason, but they define a unique byte-sequence
-which the JITter can easily spot.  Using the operand constraints
-section at the end of a gcc inline-assembly statement, we can tell gcc
-that the assembly fragment kills <code>%eax</code>, <code>%ebx</code>,
-<code>%ecx</code> and the condition codes, so this fragment is made
-harmless when not running on Valgrind, runs quickly when not on
-Valgrind, and does not require any other library support.
-
-
-<p>
-<b>Part 2: using it to detect interference between stack variables</b>
-<p>
-
-Currently Valgrind cannot detect errors of the following form:
-<pre>
-void fooble ( void )
-{
-   int a[10];
-   int b[10];
-   a[10] = 99;
-}
-</pre>
-Now imagine rewriting this as
-<pre>
-void fooble ( void )
-{
-   int spacer0;
-   int a[10];
-   int spacer1;
-   int b[10];
-   int spacer2;
-   VALGRIND_MAKE_NOACCESS(&spacer0, sizeof(int));
-   VALGRIND_MAKE_NOACCESS(&spacer1, sizeof(int));
-   VALGRIND_MAKE_NOACCESS(&spacer2, sizeof(int));
-   a[10] = 99;
-}
-</pre>
-Now the invalid write is certain to hit <code>spacer0</code> or
-<code>spacer1</code>, so Valgrind will spot the error.
-
-<p>
-There are two complications.
-
-<p>
-The first is that we don't want to annotate sources by hand, so the
-Right Thing to do is to write a C/C++ parser, annotator, prettyprinter
-which does this automatically, and run it on post-CPP'd C/C++ source.
-See http://www.cacheprof.org for an example of a system which
-transparently inserts another phase into the gcc/g++ compilation
-route.  The parser/prettyprinter is probably not as hard as it sounds;
-I would write it in Haskell, a powerful functional language well
-suited to doing symbolic computation, with which I am intimately
-familiar.  There is already a C parser written in Haskell by someone in
-the Haskell community, and that would probably be a good starting
-point.
-
-<p>
-The second complication is how to get rid of these
-<code>NOACCESS</code> records inside Valgrind when the instrumented
-function exits; after all, these refer to stack addresses and will
-make no sense whatever when some other function happens to re-use the
-same stack address range, probably shortly afterwards.  I think I
-would be inclined to define a special stack-specific macro
-<pre>
-   VALGRIND_MAKE_NOACCESS_STACK(addr, len)
-</pre>
-which causes Valgrind to record the client's <code>%ESP</code> at the
-time it is executed.  Valgrind will then watch for changes in
-<code>%ESP</code> and discard such records as soon as the protected
-area is uncovered by an increase in <code>%ESP</code>.  I hesitate
-with this scheme only because it is potentially expensive, if there
-are hundreds of such records, and considering that changes in
-<code>%ESP</code> already require expensive messing with stack access
-permissions.
-
-<p>
-This is probably easier and more robust than for the instrumenter 
-program to try and spot all exit points for the procedure and place
-suitable deallocation annotations there.  Plus C++ procedures can 
-bomb out at any point if they get an exception, so spotting return
-points at the source level just won't work at all.
-
-<p>
-Although some work, it's all eminently doable, and it would make
-Valgrind into an even-more-useful tool.
-
-
-<p>
-
-
-<hr width="100%">
-
-<h2>Cache profiling</h2>
-Valgrind is a very nice platform for doing cache profiling and other kinds of
-simulation, because it converts horrible x86 instructions into nice clean
-RISC-like UCode.  For example, for cache profiling we are interested in
-instructions that read and write memory;  in UCode there are only four
-instructions that do this:  <code>LOAD</code>, <code>STORE</code>,
-<code>FPU_R</code> and <code>FPU_W</code>.  By contrast, because of the x86
-addressing modes, almost every instruction can read or write memory.<p>
-
-Most of the cache profiling machinery is in the file
-<code>vg_cachesim.c</code>.<p>
-
-These notes are a somewhat haphazard guide to how Valgrind's cache profiling
-works.<p>
-
-<h3>Cost centres</h3>
-Valgrind gathers cache profiling information about every instruction executed,
-individually.  Each instruction has a <b>cost centre</b> associated with it.
-There are two kinds of cost centre: one for instructions that don't reference
-memory (<code>iCC</code>), and one for instructions that do
-(<code>idCC</code>):
-
-<pre>
-typedef struct _CC {
-   ULong a;
-   ULong m1;
-   ULong m2;
-} CC;
-
-typedef struct _iCC {
-   /* word 1 */
-   UChar tag;
-   UChar instr_size;
-
-   /* words 2+ */
-   Addr instr_addr;
-   CC I;
-} iCC;
-   
-typedef struct _idCC {
-   /* word 1 */
-   UChar tag;
-   UChar instr_size;
-   UChar data_size;
-
-   /* words 2+ */
-   Addr instr_addr;
-   CC I; 
-   CC D; 
-} idCC; 
-</pre>
-
-Each <code>CC</code> has three fields <code>a</code>, <code>m1</code>,
-<code>m2</code> for recording references, level 1 misses and level 2 misses.
-Each of these is a 64-bit <code>ULong</code> -- the numbers can get very large,
-ie. greater than 4.2 billion allowed by a 32-bit unsigned int.<p>
-
-A <code>iCC</code> has one <code>CC</code> for instruction cache accesses.  A
-<code>idCC</code> has two, one for instruction cache accesses, and one for data
-cache accesses.<p>
-
-The <code>iCC</code> and <code>idCC</code> structs also store unchanging
-information about the instruction:
-<ul>
-  <li>An instruction-type identification tag (explained below)</li><p>
-  <li>Instruction size</li><p>
-  <li>Data reference size (<code>idCC</code> only)</li><p>
-  <li>Instruction address</li><p>
-</ul>
-
-Note that data address is not one of the fields for <code>idCC</code>.  This is
-because for many memory-referencing instructions the data address can change
-each time it's executed (eg. if it uses register-offset addressing).  We have
-to give this item to the cache simulation in a different way (see
-Instrumentation section below). Some memory-referencing instructions do always
-reference the same address, but we don't try to treat them specially in order to
-keep things simple.<p>
-
-Also note that there is only room for recording info about one data cache
-access in an <code>idCC</code>.  So what about instructions that do a read then
-a write, such as:
-
-<blockquote><code>inc (%esi)</code></blockquote>
-
-In a write-allocate cache, as simulated by Valgrind, the write cannot miss,
-since it immediately follows the read which will drag the block into the cache
-if it's not already there.  So the write access isn't really interesting, and
-Valgrind doesn't record it.  This means that Valgrind doesn't measure
-memory references, but rather memory references that could miss in the cache.
-This behaviour is the same as that used by the AMD Athlon hardware counters.
-It also has the benefit of simplifying the implementation -- instructions that
-read and write memory can be treated like instructions that read memory.<p>
-
-<h3>Storing cost-centres</h3>
-Cost centres are stored in a way that makes them very cheap to lookup, which is
-important since one is looked up for every original x86 instruction
-executed.<p>
-
-Valgrind does JIT translations at the basic block level, and cost centres are
-also setup and stored at the basic block level.  By doing things carefully, we
-store all the cost centres for a basic block in a contiguous array, and lookup
-comes almost for free.<p>
-
-Consider this part of a basic block (for exposition purposes, pretend it's an
-entire basic block):
-
-<pre>
-movl $0x0,%eax
-movl $0x99, -4(%ebp)
-</pre>
-
-The translation to UCode looks like this:
-                
-<pre>
-MOVL      $0x0, t20
-PUTL      t20, %EAX
-INCEIPo   $5
-
-LEA1L     -4(t4), t14
-MOVL      $0x99, t18
-STL       t18, (t14)
-INCEIPo   $7
-</pre>
-
-The first step is to allocate the cost centres.  This requires a preliminary
-pass to count how many x86 instructions were in the basic block, and their
-types (and thus sizes).  UCode translations for single x86 instructions are
-delimited by the <code>INCEIPo</code> instruction, the argument of which gives
-the byte size of the instruction (note that lazy INCEIP updating is turned off
-to allow this).<p>
-
-We can tell if an x86 instruction references memory by looking for
-<code>LDL</code> and <code>STL</code> UCode instructions, and thus what kind of
-cost centre is required.  From this we can determine how many cost centres we
-need for the basic block, and their sizes.  We can then allocate them in a
-single array.<p>
-
-Consider the example code above.  After the preliminary pass, we know we need
-two cost centres, one <code>iCC</code> and one <code>idCC</code>.  So we
-allocate an array to store these which looks like this:
-
-<pre>
-|(uninit)|      tag         (1 byte)
-|(uninit)|      instr_size  (1 byte)
-|(uninit)|      (padding)   (2 bytes)
-|(uninit)|      instr_addr  (4 bytes)
-|(uninit)|      I.a         (8 bytes)
-|(uninit)|      I.m1        (8 bytes)
-|(uninit)|      I.m2        (8 bytes)
-
-|(uninit)|      tag         (1 byte)
-|(uninit)|      instr_size  (1 byte)
-|(uninit)|      data_size   (1 byte)
-|(uninit)|      (padding)   (1 byte)
-|(uninit)|      instr_addr  (4 bytes)
-|(uninit)|      I.a         (8 bytes)
-|(uninit)|      I.m1        (8 bytes)
-|(uninit)|      I.m2        (8 bytes)
-|(uninit)|      D.a         (8 bytes)
-|(uninit)|      D.m1        (8 bytes)
-|(uninit)|      D.m2        (8 bytes)
-</pre>
-
-(We can see now why we need tags to distinguish between the two types of cost
-centres.)<p>
-
-We also record the size of the array.  We look up the debug info of the first
-instruction in the basic block, and then stick the array into a table indexed
-by filename and function name.  This makes it easy to dump the information
-quickly to file at the end.<p>
-
-<h3>Instrumentation</h3>
-The instrumentation pass has two main jobs:
-
-<ol>
-  <li>Fill in the gaps in the allocated cost centres.</li><p>
-  <li>Add UCode to call the cache simulator for each instruction.</li><p>
-</ol>
-
-The instrumentation pass steps through the UCode and the cost centres in
-tandem.  As each original x86 instruction's UCode is processed, the appropriate
-gaps in the instruction's cost centre are filled in, for example:
-
-<pre>
-|INSTR_CC|      tag         (1 byte)
-|5       |      instr_size  (1 byte)
-|(uninit)|      (padding)   (2 bytes)
-|i_addr1 |      instr_addr  (4 bytes)
-|0       |      I.a         (8 bytes)
-|0       |      I.m1        (8 bytes)
-|0       |      I.m2        (8 bytes)
-
-|WRITE_CC|      tag         (1 byte)
-|7       |      instr_size  (1 byte)
-|4       |      data_size   (1 byte)
-|(uninit)|      (padding)   (1 byte)
-|i_addr2 |      instr_addr  (4 bytes)
-|0       |      I.a         (8 bytes)
-|0       |      I.m1        (8 bytes)
-|0       |      I.m2        (8 bytes)
-|0       |      D.a         (8 bytes)
-|0       |      D.m1        (8 bytes)
-|0       |      D.m2        (8 bytes)
-</pre>
-
-(Note that this step is not performed if a basic block is re-translated;  see
-<a href="#retranslations">here</a> for more information.)<p>
-
-GCC inserts padding before the <code>instr_addr</code> field so that it is word
-aligned.<p>
-
-The instrumentation added to call the cache simulation function looks like this
-(instrumentation is indented to distinguish it from the original UCode):
-
-<pre>
-MOVL      $0x0, t20
-PUTL      t20, %EAX
-  PUSHL     %eax
-  PUSHL     %ecx
-  PUSHL     %edx
-  MOVL      $0x4091F8A4, t46  # address of 1st CC
-  PUSHL     t46
-  CALLMo    $0x12             # second cachesim function
-  CLEARo    $0x4
-  POPL      %edx
-  POPL      %ecx
-  POPL      %eax
-INCEIPo   $5
-
-LEA1L     -4(t4), t14
-MOVL      $0x99, t18
-  MOVL      t14, t42
-STL       t18, (t14)
-  PUSHL     %eax
-  PUSHL     %ecx
-  PUSHL     %edx
-  PUSHL     t42
-  MOVL      $0x4091F8C4, t44  # address of 2nd CC
-  PUSHL     t44
-  CALLMo    $0x13             # second cachesim function
-  CLEARo    $0x8
-  POPL      %edx
-  POPL      %ecx
-  POPL      %eax
-INCEIPo   $7
-</pre>
-
-Consider the first instruction's UCode.  Each call is surrounded by three
-<code>PUSHL</code> and <code>POPL</code> instructions to save and restore the
-caller-save registers.  Then the address of the instruction's cost centre is
-pushed onto the stack, to be the first argument to the cache simulation
-function.  The address is known at this point because we are doing a
-simultaneous pass through the cost centre array.  This means the cost centre
-lookup for each instruction is almost free (just the cost of pushing an
-argument for a function call).  Then the call to the cache simulation function
-for non-memory-reference instructions is made (note that the
-<code>CALLMo</code> UInstruction takes an offset into a table of predefined
-functions;  it is not an absolute address), and the single argument is
-<code>CLEAR</code>ed from the stack.<p>
-
-The second instruction's UCode is similar.  The only difference is that, as
-mentioned before, we have to pass the address of the data item referenced to
-the cache simulation function too.  This explains the <code>MOVL t14,
-t42</code> and <code>PUSHL t42</code> UInstructions.  (Note that the seemingly
-redundant <code>MOV</code>ing will probably be optimised away during register
-allocation.)<p>
-
-Note that instead of storing unchanging information about each instruction
-(instruction size, data size, etc) in its cost centre, we could have passed in
-these arguments to the simulation function.  But this would slow the calls down
-(two or three extra arguments pushed onto the stack).  Also it would bloat the
-UCode instrumentation by amounts similar to the space required for them in the
-cost centre;  bloated UCode would also fill the translation cache more quickly,
-requiring more translations for large programs and slowing them down more.<p>
-
-<a name="retranslations"></a>
-<h3>Handling basic block retranslations</h3>
-The above description ignores one complication.  Valgrind has a limited size
-cache for basic block translations;  if it fills up, old translations are
-discarded.  If a discarded basic block is executed again, it must be
-re-translated.<p>
-
-However, we can't use this approach for profiling -- we can't throw away cost
-centres for instructions in the middle of execution!  So when a basic block is
-translated, we first look for its cost centre array in the hash table.  If
-there is no cost centre array, it must be the first translation, so we proceed
-as described above.  But if there is a cost centre array already, it must be a
-retranslation.  In this case, we skip the cost centre allocation and
-initialisation steps, but still do the UCode instrumentation step.<p>
-
-<h3>The cache simulation</h3>
-The cache simulation is fairly straightforward.  It just tracks which memory
-blocks are in the cache at the moment (it doesn't track the contents, since
-that is irrelevant).<p>
-
-The interface to the simulation is quite clean.  The functions called from the
-UCode contain calls to the simulation functions in the files
-<code>vg_cachesim_{I1,D1,L2}.c</code>;  these calls are inlined so that only
-one function call is done per simulated x86 instruction.  The file
-<code>vg_cachesim.c</code> simply <code>#include</code>s the three files
-containing the simulation, which makes plugging in new cache simulations
-very easy -- you just replace the three files and recompile.<p>
-
-<h3>Output</h3>
-Output is fairly straightforward, basically printing the cost centre for every
-instruction, grouped by files and functions.  Total counts (eg. total cache
-accesses, total L1 misses) are calculated when traversing this structure rather
-than during execution, to save time;  the cache simulation functions are called
-so often that even one or two extra adds can make a sizeable difference.<p>
-
-The <code>cachegrind.out</code> file (the input to vg_annotate) has the following format:
-
-<pre>
-file         ::= desc_line* cmd_line events_line data_line+ summary_line
-desc_line    ::= "desc:" ws? non_nl_string
-cmd_line     ::= "cmd:" ws? cmd
-events_line  ::= "events:" ws? (event ws)+
-data_line    ::= file_line | fn_line | count_line
-file_line    ::= ("fl=" | "fi=" | "fe=") filename
-fn_line      ::= "fn=" fn_name
-count_line   ::= line_num ws? (count ws)+
-summary_line ::= "summary:" ws? (count ws)+
-count        ::= num | "."
-</pre>
-
-Where:
-
-<ul>
-  <li><code>non_nl_string</code> is any string not containing a newline.</li><p>
-  <li><code>cmd</code> is a command line invocation.</li><p>
-  <li><code>filename</code> and <code>fn_name</code> can be anything.</li><p>
-  <li><code>num</code> and <code>line_num</code> are decimal numbers.</li><p>
-  <li><code>ws</code> is whitespace.</li><p>
-  <li><code>nl</code> is a newline.</li><p>
-</ul>
-
-The contents of the "desc:" lines are printed out at the top of the summary.
-This is a generic way of providing simulation specific information, eg. for
-giving the cache configuration for cache simulation.<p>
-
-Counts can be "." to represent "N/A", eg. the number of write misses for an
-instruction that doesn't write to memory.<p>
-
-The number of counts in each <code>count_line</code> and the
-<code>summary_line</code> should not exceed the number of events in the
-<code>events_line</code>.  If the number in a <code>count_line</code> is less,
-vg_annotate treats those missing as though they were a "." entry.  <p>
-
-A <code>file_line</code> changes the current file name.  A <code>fn_line</code>
-changes the current function name.  A <code>count_line</code> contains counts
-that pertain to the current filename/fn_name.  A "fl=" <code>file_line</code>
-and a <code>fn_line</code> must appear before any <code>count_line</code>s to
-give the context of the first <code>count_line</code>s.<p>
-
-Each <code>file_line</code> should be immediately followed by a
-<code>fn_line</code>.  "fi=" <code>file_line</code>s are used to switch
-filenames for inlined functions; "fe=" <code>file_line</code>s are similar, but
-are put at the end of a basic block in which the file name hasn't been switched
-back to the original file name.  ("fi=" and "fe=" lines behave the same; they are
-only distinguished to help debugging.)<p>
-
-
-<h3>Summary of performance features</h3>
-Quite a lot of work has gone into making the profiling as fast as possible.
-This is a summary of the important features:
-
-<ul>
-  <li>The basic block-level cost centre storage allows almost free cost centre
-      lookup.</li><p>
-  
-  <li>Only one function call is made per instruction simulated;  even this
-      accounts for a sizeable percentage of execution time, but it seems
-      unavoidable if we want flexibility in the cache simulator.</li><p>
-
-  <li>Unchanging information about an instruction is stored in its cost centre,
-      avoiding unnecessary argument pushing, and minimising UCode
-      instrumentation bloat.</li><p>
-
-  <li>Summary counts are calculated at the end, rather than during
-      execution.</li><p>
-
-  <li>The <code>cachegrind.out</code> output files can contain huge amounts of
-      information; the file format was carefully chosen to minimise file
-      sizes.</li><p>
-</ul>
-
-
-<h3>Annotation</h3>
-Annotation is done by vg_annotate.  It is a fairly straightforward Perl script
-that slurps up all the cost centres, and then runs through all the chosen
-source files, printing out cost centres with them.  It too has been carefully
-optimised.
-
-
-<h3>Similar work, extensions</h3>
-It would be relatively straightforward to do other simulations and obtain
-line-by-line information about interesting events.  A good example would be
-branch prediction -- all branches could be instrumented to interact with a
-branch prediction simulator, using very similar techniques to those described
-above.<p>
-
-In particular, vg_annotate would not need to change -- the file format is such
-that it is not specific to the cache simulation, but could be used for any kind
-of line-by-line information.  The only part of vg_annotate that is specific to
-the cache simulation is the name of the input file
-(<code>cachegrind.out</code>), although it would be very simple to add an
-option to control this.<p>
-
-</body>
-</html>
diff --git a/none/Makefile.am b/none/Makefile.am
deleted file mode 100644
index 60553ddac6..0000000000
--- a/none/Makefile.am
+++ /dev/null
@@ -1,110 +0,0 @@
-SUBDIRS = demangle . docs tests
-
-CFLAGS = $(WERROR) -DVG_LIBDIR="\"$(libdir)"\" \
-		-Winline -Wall -Wshadow -O -fomit-frame-pointer -g
-
-valdir = $(libdir)/valgrind
-
-LDFLAGS = -Wl,-z -Wl,initfirst
-
-INCLUDES = -I$(srcdir)/demangle
-
-bin_SCRIPTS = valgrind cachegrind vg_annotate
-
-SUPP_FILES = glibc-2.1.supp glibc-2.2.supp xfree-3.supp xfree-4.supp
-
-val_DATA = $(SUPP_FILES) default.supp
-
-BUILT_SOURCES = default.supp
-
-default.supp: $(SUPP_FILES)
-
-bzdist: dist
-	gunzip -c $(PACKAGE)-$(VERSION).tar.gz | bzip2 > $(PACKAGE)-$(VERSION).tar.bz2
-
-EXTRA_DIST = $(val_DATA) \
-	PATCHES_APPLIED ACKNOWLEDGEMENTS \
-	README_KDE3_FOLKS README_PACKAGERS \
-	README_MISSING_SYSCALL_OR_IOCTL TODO dosyms vg_libpthread.vs \
-	valgrind.spec valgrind.spec.in
-
-val_PROGRAMS = valgrind.so valgrinq.so libpthread.so
-
-libpthread_so_SOURCES = vg_libpthread.c vg_libpthread_unimp.c
-
-valgrinq_so_SOURCES = vg_valgrinq_dummy.c
-
-valgrind_so_SOURCES = \
-	vg_clientfuncs.c \
-	vg_scheduler.c \
-        vg_cachesim.c \
-	vg_clientmalloc.c \
-	vg_clientperms.c \
-	vg_demangle.c \
-	vg_dispatch.S \
-	vg_errcontext.c \
-	vg_execontext.c \
-	vg_from_ucode.c \
-	vg_helpers.S \
-	vg_main.c \
-	vg_malloc2.c \
-	vg_memory.c \
-	vg_messages.c \
-	vg_mylibc.c \
-	vg_procselfmaps.c \
-	vg_profile.c \
-	vg_signals.c \
-	vg_startup.S \
-	vg_symtab2.c \
-	vg_syscall_mem.c \
-	vg_syscall.S \
-	vg_to_ucode.c \
-	vg_translate.c \
-	vg_transtab.c \
-	vg_vtagops.c
-
-valgrind_so_LDADD = \
-	demangle/cp-demangle.o \
-	demangle/cplus-dem.o \
-	demangle/dyn-string.o \
-	demangle/safe-ctype.o
-
-include_HEADERS = valgrind.h
-
-noinst_HEADERS = \
-        vg_cachesim_gen.c       \
-        vg_cachesim_I1.c        \
-        vg_cachesim_D1.c        \
-        vg_cachesim_L2.c        \
-        vg_kerneliface.h        \
-        vg_include.h            \
-        vg_constants.h          \
-        vg_unsafe.h
-
-MANUAL_DEPS = $(noinst_HEADERS) $(include_HEADERS) 
-
-vg_memory.o: vg_memory.c $(MANUAL_DEPS)
-	$(COMPILE) -O2 @PREFERRED_STACK_BOUNDARY@ -c $<
-
-vg_clientfuncs.o: vg_clientfuncs.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-vg_libpthread.o: vg_libpthread.c $(MANUAL_DEPS)
-	$(COMPILE) -fno-omit-frame-pointer -c $<
-
-valgrind.so$(EXEEXT): $(valgrind_so_OBJECTS)
-	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o valgrind.so \
-		$(valgrind_so_OBJECTS) $(valgrind_so_LDADD)
-
-valgrinq.so$(EXEEXT): $(valgrinq_so_OBJECTS)
-	$(CC) $(CFLAGS) -shared -o valgrinq.so $(valgrinq_so_OBJECTS)
-
-libpthread.so$(EXEEXT): $(libpthread_so_OBJECTS) $(srcdir)/vg_libpthread.vs
-	$(CC) -Wall -Werror -g -O -shared -fpic -o libpthread.so \
-		$(libpthread_so_OBJECTS) \
-		-Wl,-version-script $(srcdir)/vg_libpthread.vs
-
-install-exec-hook:
-	$(mkinstalldirs) $(DESTDIR)$(valdir)
-	rm -f $(DESTDIR)$(valdir)/libpthread.so.0
-	$(LN_S) libpthread.so $(DESTDIR)$(valdir)/libpthread.so.0