From 551874920f7f49fcf18b69e2a3f23d948db2c50b Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Mon, 27 Mar 2023 17:27:56 +1100
Subject: [PATCH] Rewrite `cg_merge` in Python.

It's currently written in C, but `cg_annotate` and `cg_diff` are written in
Python. It's better to have them all in the same language.

The good news is that the Python code is 4.5x shorter than the C code.
The bad news is that the Python code is roughly 3x slower than the C
code. But `cg_merge` isn't used that often, so I think it's a reasonable
trade-off.
---
 cachegrind/Makefile.am                 |   27 +-
 cachegrind/cg_merge.c                  | 1580 ------------------------
 cachegrind/cg_merge.in                 |  339 +++++
 cachegrind/tests/Makefile.am           |    2 +
 cachegrind/tests/ann-merge-x.rs        |    5 +
 cachegrind/tests/ann-merge-y.rs        |    6 +
 cachegrind/tests/ann-merge1.post.exp   |   66 +
 cachegrind/tests/ann-merge1.stderr.exp |   17 +
 cachegrind/tests/ann-merge1.vgtest     |    7 +
 cachegrind/tests/ann-merge1a.cgout     |   19 +
 cachegrind/tests/ann-merge1b.cgout     |   14 +
 configure.ac                           |    1 +
 12 files changed, 482 insertions(+), 1601 deletions(-)
 delete mode 100644 cachegrind/cg_merge.c
 create mode 100755 cachegrind/cg_merge.in
 create mode 100644 cachegrind/tests/ann-merge-x.rs
 create mode 100644 cachegrind/tests/ann-merge-y.rs
 create mode 100644 cachegrind/tests/ann-merge1.post.exp
 create mode 100644 cachegrind/tests/ann-merge1.stderr.exp
 create mode 100644 cachegrind/tests/ann-merge1.vgtest
 create mode 100644 cachegrind/tests/ann-merge1a.cgout
 create mode 100644 cachegrind/tests/ann-merge1b.cgout
diff --git a/cachegrind/Makefile.am b/cachegrind/Makefile.am
index 34717ae541..cbaa90522b 100644
--- a/cachegrind/Makefile.am
+++ b/cachegrind/Makefile.am
@@ -10,32 +10,13 @@ EXTRA_DIST = \
 # Headers, etc
 #----------------------------------------------------------------------------
 
-bin_SCRIPTS = cg_annotate cg_diff
+bin_SCRIPTS = cg_annotate cg_diff cg_merge
 
 noinst_HEADERS = \
 	cg_arch.h \
 	cg_branchpred.c \
 	cg_sim.c
 
-#----------------------------------------------------------------------------
-# cg_merge (built for the primary target only)
-#----------------------------------------------------------------------------
-
-bin_PROGRAMS = cg_merge
-
-cg_merge_SOURCES = cg_merge.c
-cg_merge_CPPFLAGS  = $(AM_CPPFLAGS_PRI)
-cg_merge_CFLAGS    = $(AM_CFLAGS_PRI)
-cg_merge_CCASFLAGS = $(AM_CCASFLAGS_PRI)
-cg_merge_LDFLAGS   = $(AM_CFLAGS_PRI)
-# If there is no secondary platform, and the platforms include x86-darwin,
-# then the primary platform must be x86-darwin.  Hence:
-if ! VGCONF_HAVE_PLATFORM_SEC
-if VGCONF_PLATFORMS_INCLUDE_X86_DARWIN
-cg_merge_LDFLAGS   += -Wl,-read_only_relocs -Wl,suppress
-endif
-endif
-
 #----------------------------------------------------------------------------
 # cachegrind-<platform>
 #----------------------------------------------------------------------------
@@ -101,4 +82,8 @@ pyann:
 pydiff:
 	+../auxprogs/pybuild.sh cg_diff.in cg_diff
 
-.PHONY: pyann pydiff
+# "Build" `cg_merge`. The `+` avoids warnings about the jobserver.
+pymerge:
+	+../auxprogs/pybuild.sh cg_merge.in cg_merge
+
+.PHONY: pyann pydiff pymerge
diff --git a/cachegrind/cg_merge.c b/cachegrind/cg_merge.c
deleted file mode 100644
index 4d13cb5d19..0000000000
--- a/cachegrind/cg_merge.c
+++ /dev/null
@@ -1,1580 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- A program that merges multiple cachegrind output files.      ---*/
-/*---                                                   cg_merge.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-  This file is part of Cachegrind, a Valgrind tool for cache
-  profiling programs.
-
-  Copyright (C) 2002-2017 Nicholas Nethercote
-     njn@valgrind.org
-
-  AVL tree code derived from
-  ANSI C Library for maintenance of AVL Balanced Trees
-  (C) 2000 Daniel Nagy, Budapest University of Technology and Economics
-  Released under GNU General Public License (GPL) version 2
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of the
-  License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The GNU General Public License is contained in the file COPYING.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <ctype.h>
-
-typedef  signed long   Word;
-typedef  unsigned long UWord;
-typedef  unsigned char Bool;
-#define True ((Bool)1)
-#define False ((Bool)0)
-typedef  signed int    Int;
-typedef  unsigned int  UInt;
-typedef  unsigned long long int ULong;
-typedef  signed char   Char;
-typedef  size_t        SizeT;
-
-
-//------------------------------------------------------------------//
-//---                           WordFM                           ---//
-//---                      Public interface                      ---//
-//------------------------------------------------------------------//
-
-typedef  struct _WordFM  WordFM; /* opaque */
-
-/* Initialise a WordFM */
-void initFM ( WordFM* t, 
-              void*   (*alloc_nofail)( SizeT ),
-              void    (*dealloc)(void*),
-              Word    (*kCmp)(Word,Word) );
-
-/* Allocate and initialise a WordFM */
-WordFM* newFM( void* (*alloc_nofail)( SizeT ),
-               void  (*dealloc)(void*),
-               Word  (*kCmp)(Word,Word) );
-
-/* Free up the FM.  If kFin is non-NULL, it is applied to keys
-   before the FM is deleted; ditto with vFin for vals. */
-void deleteFM ( WordFM*, void(*kFin)(Word), void(*vFin)(Word) );
-
-/* Add (k,v) to fm.  If a binding for k already exists, it is updated
-   to map to this new v.  In that case we should really return the
-   previous v so that caller can finalise it.  Oh well. */
-void addToFM ( WordFM* fm, Word k, Word v );
-
-// Delete key from fm, returning associated val if found
-Bool delFromFM ( WordFM* fm, /*OUT*/Word* oldV, Word key );
-
-// Look up in fm, assigning found val at spec'd address
-Bool lookupFM ( WordFM* fm, /*OUT*/Word* valP, Word key );
-
-Word sizeFM ( WordFM* fm );
-
-// set up FM for iteration
-void initIterFM ( WordFM* fm );
-
-// get next key/val pair.  Will assert if fm has been modified
-// or looked up in since initIterFM was called.
-Bool nextIterFM ( WordFM* fm, /*OUT*/Word* pKey, /*OUT*/Word* pVal );
-
-// clear the I'm iterating flag
-void doneIterFM ( WordFM* fm );
-
-// Deep copy a FM.  If dopyK is NULL, keys are copied verbatim.
-// If non-null, dopyK is applied to each key to generate the
-// version in the new copy.  In that case, if the argument to dopyK
-// is non-NULL but the result is NULL, it is assumed that dopyK
-// could not allocate memory, in which case the copy is abandoned
-// and NULL is returned.  Ditto with dopyV for values.
-WordFM* dopyFM ( WordFM* fm, Word(*dopyK)(Word), Word(*dopyV)(Word) );
-
-//------------------------------------------------------------------//
-//---                         end WordFM                         ---//
-//---                      Public interface                      ---//
-//------------------------------------------------------------------//
-
-
-static const char* argv0 = "cg_merge";
-
-/* Keep track of source filename/line no so as to be able to
-   print decent error messages. */
-typedef
-   struct {
-      FILE* fp;
-      UInt  lno;
-      char* filename;
-   }
-   SOURCE;
-
-static void printSrcLoc ( SOURCE* s )
-{
-   fprintf(stderr, "%s: near %s line %u\n", argv0, s->filename, s->lno-1);
-}
-
-__attribute__((noreturn))
-static void mallocFail ( SOURCE* s, const char* who )
-{
-   fprintf(stderr, "%s: out of memory in %s\n", argv0, who );
-   printSrcLoc( s );
-   exit(2);
-}
-
-__attribute__((noreturn))
-static void parseError ( SOURCE* s, const char* msg )
-{
-   fprintf(stderr, "%s: parse error: %s\n", argv0, msg );
-   printSrcLoc( s );
-   exit(1);
-}
-
-__attribute__((noreturn))
-static void barf ( SOURCE* s, const char* msg )
-{
-   fprintf(stderr, "%s: %s\n", argv0, msg );
-   printSrcLoc( s );
-   exit(1);
-}
-
-// Read a line. Return the line read, or NULL if at EOF.
-// The line is allocated dynamically but will be overwritten with
-// every invocation. Caller must not free it.
-static const char *readline ( SOURCE* s )
-{
-   static char  *line = NULL;
-   static size_t linesiz = 0;
-
-   int ch, i = 0;
-
-   while (1) {
-      ch = getc(s->fp);
-      if (ch != EOF) {
-          if (i + 1 >= linesiz) {
-             linesiz += 500;
-             line = realloc(line, linesiz * sizeof *line);
-             if (line == NULL)
-                mallocFail(s, "readline:");
-          }
-          line[i++] = ch;
-          line[i] = 0;
-          if (ch == '\n') {
-             line[i-1] = 0;
-             s->lno++;
-             break;
-          }
-      } else {
-         if (ferror(s->fp)) {
-            perror(argv0);
-            barf(s, "I/O error while reading input file");
-         } else {
-            // hit EOF
-            break;
-         }
-      }
-   }
-   return i == 0 ? NULL : line;
-}
-
-static Bool streqn ( const char* s1, const char* s2, size_t n )
-{
-   return 0 == strncmp(s1, s2, n);
-}
-
-static Bool streq ( const char* s1, const char* s2 )
-{
-   return 0 == strcmp(s1, s2 );
-}
-
-
-////////////////////////////////////////////////////////////////
-
-typedef
-   struct {
-      char* fi_name;
-      char* fn_name;
-   }
-   FileFn;
-
-typedef
-   struct {
-      Int n_counts;
-      ULong* counts;
-   }
-   Counts;
-
-typedef
-   struct {
-      // null-terminated vector of desc_lines
-      char** desc_lines;
-
-      // Cmd line
-      char* cmd_line;
-
-      // Events line
-      char* events_line;
-      Int   n_events;
-
-      // Summary line (copied from input)
-      char* summary_line;
-
-      /* Outermost map is
-            WordFM FileFn* innerMap
-         where innerMap is   WordFM line-number=UWord Counts */
-      WordFM* outerMap;
-
-      // Summary counts (computed whilst parsing)
-      // should match .summary_line
-      Counts* summary;
-   }
-   CacheProfFile;
-
-static FileFn* new_FileFn ( char* file_name, char* fn_name )
-{
-   FileFn* ffn = malloc(sizeof(FileFn));
-   if (ffn == NULL)
-      return NULL;
-   ffn->fi_name = file_name;
-   ffn->fn_name = fn_name;
-   return ffn;
-}
-
-static void ddel_FileFn ( FileFn* ffn )
-{
-   if (ffn->fi_name)
-      free(ffn->fi_name);
-   if (ffn->fn_name)
-      free(ffn->fn_name);
-   memset(ffn, 0, sizeof(FileFn));
-   free(ffn);
-}
-
-static FileFn* dopy_FileFn ( FileFn* ff )
-{
-   char *fi2, *fn2;
-   fi2 = strdup(ff->fi_name);
-   if (fi2 == NULL) return NULL;
-   fn2 = strdup(ff->fn_name);
-   if (fn2 == NULL) {
-      free(fi2);
-      return NULL;
-   }
-   return new_FileFn( fi2, fn2 );
-}
-
-static Counts* new_Counts ( Int n_counts, /*COPIED*/ULong* counts )
-{
-   Int i;
-   Counts* cts = malloc(sizeof(Counts));
-   if (cts == NULL)
-      return NULL;
-
-   assert(n_counts >= 0);
-   cts->counts = malloc(n_counts * sizeof(ULong));
-   if (cts->counts == NULL) {
-      free(cts);
-      return NULL;
-   }
-
-   cts->n_counts = n_counts;
-   for (i = 0; i < n_counts; i++)
-      cts->counts[i] = counts[i];
-
-   return cts;
-}
-
-static Counts* new_Counts_Zeroed ( Int n_counts )
-{
-   Int i;
-   Counts* cts = malloc(sizeof(Counts));
-   if (cts == NULL)
-      return NULL;
-
-   assert(n_counts >= 0);
-   cts->counts = malloc(n_counts * sizeof(ULong));
-   if (cts->counts == NULL) {
-      free(cts);
-      return NULL;
-   }
-
-   cts->n_counts = n_counts;
-   for (i = 0; i < n_counts; i++)
-      cts->counts[i] = 0;
-
-   return cts;
-}
-
-static void sdel_Counts ( Counts* cts )
-{
-   memset(cts, 0, sizeof(Counts));
-   free(cts);
-}
-
-static void ddel_Counts ( Counts* cts )
-{
-   if (cts->counts)
-      free(cts->counts);
-   memset(cts, 0, sizeof(Counts));
-   free(cts);
-}
-
-static Counts* dopy_Counts ( Counts* cts )
-{
-   return new_Counts( cts->n_counts, cts->counts );
-}
-
-static
-CacheProfFile* new_CacheProfFile ( char**  desc_lines,
-                                   char*   cmd_line,
-                                   char*   events_line,
-                                   Int     n_events,
-                                   char*   summary_line,
-                                   WordFM* outerMap,
-                                   Counts* summary )
-{
-   CacheProfFile* cpf = malloc(sizeof(CacheProfFile));
-   if (cpf == NULL)
-      return NULL;
-   cpf->desc_lines   = desc_lines;
-   cpf->cmd_line     = cmd_line;
-   cpf->events_line  = events_line;
-   cpf->n_events     = n_events;
-   cpf->summary_line = summary_line;
-   cpf->outerMap     = outerMap;
-   cpf->summary      = summary;
-   return cpf;
-}
-
-static WordFM* dopy_InnerMap ( WordFM* innerMap )
-{
-   return dopyFM ( innerMap, NULL,
-                             (Word(*)(Word))dopy_Counts );
-}
-
-static void ddel_InnerMap ( WordFM* innerMap )
-{
-   deleteFM( innerMap, NULL, (void(*)(Word))ddel_Counts );
-}
-
-static void ddel_CacheProfFile ( CacheProfFile* cpf )
-{
-   char** p;
-   if (cpf->desc_lines) {
-      for (p = cpf->desc_lines; *p; p++)
-         free(*p);
-      free(cpf->desc_lines);
-   }
-   if (cpf->cmd_line)
-      free(cpf->cmd_line);
-   if (cpf->events_line)
-      free(cpf->events_line);
-   if (cpf->summary_line)
-      free(cpf->summary_line);
-   if (cpf->outerMap)
-      deleteFM( cpf->outerMap, (void(*)(Word))ddel_FileFn, 
-                               (void(*)(Word))ddel_InnerMap );
-   if (cpf->summary)
-      ddel_Counts(cpf->summary);
-
-   memset(cpf, 0, sizeof(CacheProfFile));
-   free(cpf);
-}
-
-static void showCounts ( FILE* f, Counts* c )
-{
-   Int i;
-   for (i = 0; i < c->n_counts; i++) {
-      fprintf(f, "%lld ", c->counts[i]);
-   }
-}
-
-static void show_CacheProfFile ( FILE* f, CacheProfFile* cpf )
-{
-   Int     i;
-   char**  d;
-   FileFn* topKey;
-   WordFM* topVal;
-   UWord   subKey;
-   Counts* subVal;  
-
-   for (d = cpf->desc_lines; *d; d++)
-      fprintf(f, "%s\n", *d);
-   fprintf(f, "%s\n", cpf->cmd_line);
-   fprintf(f, "%s\n", cpf->events_line);
-
-   initIterFM( cpf->outerMap );
-   while (nextIterFM( cpf->outerMap, (Word*)(&topKey), (Word*)(&topVal) )) {
-      fprintf(f, "fl=%s\nfn=%s\n", 
-                 topKey->fi_name, topKey->fn_name );
-      initIterFM( topVal );
-      while (nextIterFM( topVal, (Word*)(&subKey), (Word*)(&subVal) )) {
-         fprintf(f, "%ld   ", subKey );
-         showCounts( f, subVal );
-         fprintf(f, "\n");
-      }
-      doneIterFM( topVal );
-   }
-   doneIterFM( cpf->outerMap );
-
-   //fprintf(f, "%s\n", cpf->summary_line);
-   fprintf(f, "summary:");
-   for (i = 0; i < cpf->summary->n_counts; i++)
-      fprintf(f, " %lld", cpf->summary->counts[i]);
-   fprintf(f, "\n");
-}
-
-////////////////////////////////////////////////////////////////
-
-static Word cmp_FileFn ( Word s1, Word s2 )
-{
-   FileFn* ff1 = (FileFn*)s1;
-   FileFn* ff2 = (FileFn*)s2;
-   Word r = strcmp(ff1->fi_name, ff2->fi_name);
-   if (r == 0)
-      r = strcmp(ff1->fn_name, ff2->fn_name);
-   return r;
-}
-
-static Word cmp_unboxed_UWord ( Word s1, Word s2 )
-{
-   UWord u1 = (UWord)s1;
-   UWord u2 = (UWord)s2;
-   if (u1 < u2) return -1;
-   if (u1 > u2) return 1;
-   return 0;
-}
-
-////////////////////////////////////////////////////////////////
-
-static Bool parse_ULong ( /*OUT*/ULong* res, /*INOUT*/const char** pptr)
-{
-   ULong u64;
-   const char* ptr = *pptr;
-   while (isspace(*ptr)) ptr++;
-   if (!isdigit(*ptr)) {
-      *pptr = ptr;
-      return False; /* end of string, or junk */
-   }
-   u64 = 0;
-   while (isdigit(*ptr)) {
-      u64 = (u64 * 10) + (ULong)(*ptr - '0');
-      ptr++;
-   }
-   *res = u64;
-   *pptr = ptr;
-   return True;
-}
-
-// str is a line of integers, starting with a line number.  Parse it,
-// returning the first number in *lnno and the rest in a newly
-// allocated Counts struct.  If lnno is non-NULL, treat the first
-// number as a line number and assign it to *lnno instead of
-// incorporating it in the counts array.
-static 
-Counts* splitUpCountsLine ( SOURCE* s, /*OUT*/UWord* lnno, const char* str )
-{
-   Bool    ok;
-   Counts* counts;
-   ULong   *tmpC = NULL;
-   UInt     n_tmpC = 0, tmpCsize = 0;
-   while (1) {
-      if (n_tmpC >= tmpCsize) {
-         tmpCsize += 50;
-         tmpC = realloc(tmpC, tmpCsize * sizeof *tmpC);
-         if (tmpC == NULL)
-            mallocFail(s, "splitUpCountsLine:");
-      }
-      ok = parse_ULong( &tmpC[n_tmpC], &str );
-      if (!ok)
-         break;
-      n_tmpC++;
-   }
-   if (*str != 0)
-      parseError(s, "garbage in counts line");
-   if (lnno ? (n_tmpC < 2) : (n_tmpC < 1))
-      parseError(s, "too few counts in count line");
-
-   if (lnno) {
-      *lnno = (UWord)tmpC[0];
-      counts = new_Counts( n_tmpC-1, /*COPIED*/&tmpC[1] );
-   } else {
-      counts = new_Counts( n_tmpC, /*COPIED*/&tmpC[0] );
-   }
-   free(tmpC);
-
-   return counts;
-}
-
-static void addCounts ( SOURCE* s, /*OUT*/Counts* counts1, Counts* counts2 )
-{
-   Int i;
-   if (counts1->n_counts != counts2->n_counts)
-      parseError(s, "addCounts: inconsistent number of counts");
-   for (i = 0; i < counts1->n_counts; i++)
-      counts1->counts[i] += counts2->counts[i];
-}
-
-static Bool addCountsToMap ( SOURCE* s,
-                             WordFM* counts_map, 
-                             UWord lnno, Counts* newCounts )
-{
-   Counts* oldCounts;
-   // look up lnno in the map.  If none present, add a binding
-   // lnno->counts.  If present, add counts to the existing entry.
-   if (lookupFM( counts_map, (Word*)(&oldCounts), (Word)lnno )) {
-      // merge with existing binding
-      addCounts( s, oldCounts, newCounts );
-      return True;
-   } else {
-      // create new binding
-      addToFM( counts_map, (Word)lnno, (Word)newCounts );
-      return False;
-   }
-}
-
-static
-void handle_counts ( SOURCE* s,
-                     CacheProfFile* cpf, 
-                     const char* fi, const char* fn, const char* newCountsStr )
-{
-   WordFM* countsMap;
-   Bool    freeNewCounts;
-   UWord   lnno;
-   Counts* newCounts;
-   FileFn* topKey; 
-
-   if (0)  printf("%s %s %s\n", fi, fn, newCountsStr );
-
-   // parse the numbers
-   newCounts = splitUpCountsLine( s, &lnno, newCountsStr );
-
-   // Did we get the right number?
-   if (newCounts->n_counts != cpf->n_events)
-      goto oom;
-
-   // allocate the key
-   topKey = malloc(sizeof(FileFn));
-   if (topKey) {
-      topKey->fi_name = strdup(fi);
-      topKey->fn_name = strdup(fn);
-   }
-   if (! (topKey && topKey->fi_name && topKey->fn_name))
-      mallocFail(s, "handle_counts:");
-
-   // search for it
-   if (lookupFM( cpf->outerMap, (Word*)(&countsMap), (Word)topKey )) {
-      // found it.  Merge in new counts
-      freeNewCounts = addCountsToMap( s, countsMap, lnno, newCounts );
-      ddel_FileFn(topKey);
-   } else {
-      // not found in the top map.  Create new entry
-      countsMap = newFM( malloc, free, cmp_unboxed_UWord );
-      if (!countsMap)
-         goto oom;
-      addToFM( cpf->outerMap, (Word)topKey, (Word)countsMap );
-      freeNewCounts = addCountsToMap( s, countsMap, lnno, newCounts );
-   }
-
-   // also add to running summary total
-   addCounts( s, cpf->summary, newCounts );
-
-   // if safe to do so, free up the count vector
-   if (freeNewCounts)
-      ddel_Counts(newCounts);
-
-   return;
-
-  oom:
-   parseError(s, "# counts doesn't match # events");
-}
-
-
-/* Parse a complete file from the stream in 's'.  If a parse error
-   happens, do not return; instead exit via parseError().  If an
-   out-of-memory condition happens, do not return; instead exit via
-   mallocError().
-*/
-static CacheProfFile* parse_CacheProfFile ( SOURCE* s )
-{
-   Int            i;
-   char**         tmp_desclines = NULL;
-   unsigned       tmp_desclines_size = 0;
-   char*          p;
-   int            n_tmp_desclines = 0;
-   CacheProfFile* cpf;
-   Counts*        summaryRead; 
-   char*          curr_fn = strdup("???");
-   char*          curr_fl = strdup("???");
-   const char*    line;
-
-   cpf = new_CacheProfFile( NULL, NULL, NULL, 0, NULL, NULL, NULL );
-   if (cpf == NULL)
-      mallocFail(s, "parse_CacheProfFile(1)");
-
-   // Parse "desc:" lines
-   while (1) {
-      line = readline(s);
-      if (!line) 
-         break;
-      if (!streqn(line, "desc: ", 6))
-         break;
-      if (n_tmp_desclines >= tmp_desclines_size) {
-         tmp_desclines_size += 100;
-         tmp_desclines = realloc(tmp_desclines,
-                                 tmp_desclines_size * sizeof *tmp_desclines);
-         if (tmp_desclines == NULL)
-            mallocFail(s, "parse_CacheProfFile(1)");
-      }
-      tmp_desclines[n_tmp_desclines++] = strdup(line);
-   }
-
-   if (n_tmp_desclines == 0)
-      parseError(s, "parse_CacheProfFile: no DESC lines present");
-
-   cpf->desc_lines = malloc( (1+n_tmp_desclines) * sizeof(char*) );
-   if (cpf->desc_lines == NULL)
-      mallocFail(s, "parse_CacheProfFile(2)");
-
-   cpf->desc_lines[n_tmp_desclines] = NULL;
-   for (i = 0; i < n_tmp_desclines; i++)
-      cpf->desc_lines[i] = tmp_desclines[i];
-
-   // Parse "cmd:" line
-   if (!streqn(line, "cmd: ", 5))
-      parseError(s, "parse_CacheProfFile: no CMD line present");
-
-   cpf->cmd_line = strdup(line);
-   if (cpf->cmd_line == NULL)
-      mallocFail(s, "parse_CacheProfFile(3)");
-
-   // Parse "events:" line and figure out how many events there are
-   line = readline(s);
-   if (!line)
-      parseError(s, "parse_CacheProfFile: eof before EVENTS line");
-   if (!streqn(line, "events: ", 8))
-      parseError(s, "parse_CacheProfFile: no EVENTS line present");
-
-   // figure out how many events there are by counting the number
-   // of space-alphanum transitions in the events_line
-   cpf->events_line = strdup(line);
-   if (cpf->events_line == NULL)
-      mallocFail(s, "parse_CacheProfFile(3)");
-
-   cpf->n_events = 0;
-   assert(cpf->events_line[6] == ':');
-   for (p = &cpf->events_line[6]; *p; p++) {
-      if (p[0] == ' ' && isalpha(p[1]))
-         cpf->n_events++;
-   }
-
-   // create the running cross-check summary
-   cpf->summary = new_Counts_Zeroed( cpf->n_events );
-   if (cpf->summary == NULL)
-      mallocFail(s, "parse_CacheProfFile(4)");
-
-   // create the outer map (file+fn name --> inner map)
-   cpf->outerMap = newFM ( malloc, free, cmp_FileFn );
-   if (cpf->outerMap == NULL)
-      mallocFail(s, "parse_CacheProfFile(5)");
-
-   // process count lines
-   while (1) {
-      line = readline(s);
-      if (!line)
-         parseError(s, "parse_CacheProfFile: eof before SUMMARY line");
-
-      if (isdigit(line[0])) {
-         handle_counts(s, cpf, curr_fl, curr_fn, line);
-         continue;
-      }
-      else
-      if (streqn(line, "fn=", 3)) {
-         free(curr_fn);
-         curr_fn = strdup(line+3);
-         continue;
-      }
-      else
-      if (streqn(line, "fl=", 3)) {
-         free(curr_fl);
-         curr_fl = strdup(line+3);
-         continue;
-      }
-      else
-      if (streqn(line, "summary: ", 9)) {
-         break;
-      }
-      else
-         parseError(s, "parse_CacheProfFile: unexpected line in main data");
-   }
-
-   // finally, the "summary:" line
-   if (!streqn(line, "summary: ", 9))
-      parseError(s, "parse_CacheProfFile: missing SUMMARY line");
-
-   cpf->summary_line = strdup(line);
-   if (cpf->summary_line == NULL)
-      mallocFail(s, "parse_CacheProfFile(6)");
-
-   // there should be nothing more
-   line = readline(s);
-   if (line)
-      parseError(s, "parse_CacheProfFile: "
-                    "extraneous content after SUMMARY line");
-
-   // check the summary counts are as expected
-   summaryRead = splitUpCountsLine( s, NULL, &cpf->summary_line[8] );
-   if (summaryRead == NULL)
-      mallocFail(s, "parse_CacheProfFile(7)");
-   if (summaryRead->n_counts != cpf->n_events)
-      parseError(s, "parse_CacheProfFile: wrong # counts in SUMMARY line");
-   for (i = 0; i < summaryRead->n_counts; i++) {
-      if (summaryRead->counts[i] != cpf->summary->counts[i]) {
-         parseError(s, "parse_CacheProfFile: "
-                       "computed vs stated SUMMARY counts mismatch");
-      }
-   }
-   free(summaryRead->counts);
-   sdel_Counts(summaryRead);
-
-   // since the summary counts are OK, free up the summary_line text
-   // which contains the same info.
-   free(cpf->summary_line);
-   cpf->summary_line = NULL;
-
-   free(tmp_desclines);
-   free(curr_fn);
-   free(curr_fl);
-
-   // All looks OK
-   return cpf;
-}
-
-
-static void merge_CacheProfInfo ( SOURCE* s,
-                                  /*MOD*/CacheProfFile* dst,
-                                  CacheProfFile* src )
-{
-   /* For each (filefn, innerMap) in src
-      if filefn not in dst
-         add binding dopy(filefn)->dopy(innerMap) in src
-      else
-         // merge src->innerMap with dst->innerMap
-         for each (lineno, counts) in src->innerMap
-         if lineno not in dst->innerMap
-            add binding lineno->dopy(counts) to dst->innerMap
-         else
-            add counts into dst->innerMap[lineno]
-   */
-   /* Outer iterator:  FileFn* -> WordFM* (inner iterator)
-      Inner iterator:  UWord   -> Counts*
-   */
-   FileFn* soKey;
-   WordFM* soVal;
-   WordFM* doVal;
-   UWord   siKey;
-   Counts* siVal;
-   Counts* diVal;
-
-   /* First check mundane things: that the events: lines are
-      identical. */
-   if (!streq( dst->events_line, src->events_line ))
-     barf(s, "\"events:\" line of most recent file does "
-             "not match those previously processed");
-
-   initIterFM( src->outerMap );
-
-   // for (filefn, innerMap) in src
-   while (nextIterFM( src->outerMap, (Word*)&soKey, (Word*)&soVal )) {
-
-      // is filefn in dst?   
-      if (! lookupFM( dst->outerMap, (Word*)&doVal, (Word)soKey )) {
-
-         // no .. add dopy(filefn) -> dopy(innerMap) to src
-         FileFn* c_soKey = dopy_FileFn(soKey);
-         WordFM* c_soVal = dopy_InnerMap(soVal);
-         if ((!c_soKey) || (!c_soVal)) goto oom;
-         addToFM( dst->outerMap, (Word)c_soKey, (Word)c_soVal );
-
-      } else {
-
-         // yes .. merge the two innermaps
-         initIterFM( soVal );
-
-         // for (lno, counts) in soVal (source inner map)
-         while (nextIterFM( soVal, (Word*)&siKey, (Word*)&siVal )) {
-
-            // is lno in the corresponding dst inner map?
-            if (! lookupFM( doVal, (Word*)&diVal, siKey )) {
-
-               // no .. add lineno->dopy(counts) to dst inner map
-               Counts* c_siVal = dopy_Counts( siVal );
-               if (!c_siVal) goto oom;
-               addToFM( doVal, siKey, (Word)c_siVal );
-
-            } else {
-
-               // yes .. merge counts into dst inner map val
-               addCounts( s, diVal, siVal );
-
-            }
-         }
-
-      }
-
-   }
-
-   // add the summaries too
-   addCounts(s, dst->summary, src->summary );
-
-   return;
-
-  oom:
-   mallocFail(s, "merge_CacheProfInfo");
-}
-
-static void usage ( void )
-{
-   fprintf(stderr, "%s: Merges multiple cachegrind output files into one\n", 
-                   argv0);
-   fprintf(stderr, "%s: usage: %s [-o outfile] [files-to-merge]\n", 
-                   argv0, argv0);
-   exit(1);
-}
-
-int main ( int argc, char** argv )
-{
-   Int            i;
-   SOURCE         src;
-   CacheProfFile  *cpf, *cpfTmp;
-
-   FILE*          outfile = NULL;
-   char*          outfilename = NULL;
-   Int            outfileix = 0;
-
-   if (argv[0])
-      argv0 = argv[0];
-
-   if (argc < 2)
-      usage();
-
-   for (i = 1; i < argc; i++) {
-      if (streq(argv[i], "-h") || streq(argv[i], "--help"))
-         usage();
-   }
-
-   /* Scan args, looking for '-o outfilename'. */
-   for (i = 1; i < argc; i++) {
-      if (streq(argv[i], "-o")) {
-         if (i+1 < argc) {
-            outfilename = argv[i+1];
-            outfileix   = i;
-            break;
-         } else {
-            usage();
-         }
-      }
-   }
-
-   cpf = NULL;
-
-   for (i = 1; i < argc; i++) {
-
-      if (i == outfileix) {
-         /* Skip '-o' and whatever follows it */
-         i += 1;
-         continue;
-      }
-
-      fprintf(stderr, "%s: parsing %s\n", argv0, argv[i]);
-      src.lno      = 1;
-      src.filename = argv[i];
-      src.fp       = fopen(src.filename, "r");
-      if (!src.fp) {
-         perror(argv0);
-         barf(&src, "Cannot open input file");
-      }
-      assert(src.fp);
-      cpfTmp = parse_CacheProfFile( &src );
-      fclose(src.fp);
-
-      /* If this isn't the first file, merge */
-      if (cpf == NULL) {
-         /* this is the first file */
-         cpf = cpfTmp;
-      } else {
-         /* not the first file; merge */
-         fprintf(stderr, "%s: merging %s\n", argv0, argv[i]);
-         merge_CacheProfInfo( &src, cpf, cpfTmp );
-         ddel_CacheProfFile( cpfTmp );
-      }
-
-   }
-
-   /* Now create the output file. */
-
-   if (cpf) {
-
-      fprintf(stderr, "%s: writing %s\n", 
-                       argv0, outfilename ? outfilename : "(stdout)" );
-
-      /* Write the output. */
-      if (outfilename) {
-         outfile = fopen(outfilename, "w");
-         if (!outfile) {
-            fprintf(stderr, "%s: can't create output file %s\n", 
-                            argv0, outfilename);
-            perror(argv0);
-            exit(1);
-         }
-      } else {
-         outfile = stdout;
-      }
-
-      show_CacheProfFile( outfile, cpf );
-      if (ferror(outfile)) {
-         fprintf(stderr, "%s: error writing output file %s\n", 
-                         argv0, outfilename ? outfilename : "(stdout)" );
-         perror(argv0);
-         if (outfile != stdout)
-            fclose(outfile);
-         exit(1);
-      }
-
-      fflush(outfile);
-      if (outfile != stdout)
-         fclose( outfile );
-
-      ddel_CacheProfFile( cpf );
-   }
-
-   return 0;
-}
-
-
-//------------------------------------------------------------------//
-//---                           WordFM                           ---//
-//---                       Implementation                       ---//
-//------------------------------------------------------------------//
-
-/* ------------ Implementation ------------ */
-
-/* One element of the AVL tree */
-typedef
-   struct _AvlNode {
-      Word key;
-      Word val;
-      struct _AvlNode* left;
-      struct _AvlNode* right;
-      Char balance;
-   }
-   AvlNode;
-
-typedef 
-   struct {
-      Word w;
-      Bool b;
-   }
-   MaybeWord;
-
-#define WFM_STKMAX    32    // At most 2**32 entries can be iterated over
-
-struct _WordFM {
-   AvlNode* root;
-   void*    (*alloc_nofail)( SizeT );
-   void     (*dealloc)(void*);
-   Word     (*kCmp)(Word,Word);
-   AvlNode* nodeStack[WFM_STKMAX]; // Iterator node stack
-   Int      numStack[WFM_STKMAX];  // Iterator num stack
-   Int      stackTop;              // Iterator stack pointer, one past end
-}; 
-
-/* forward */
-static Bool avl_removeroot_wrk(AvlNode** t, Word(*kCmp)(Word,Word));
-
-/* Swing to the left.  Warning: no balance maintenance. */
-static void avl_swl ( AvlNode** root )
-{
-   AvlNode* a = *root;
-   AvlNode* b = a->right;
-   *root    = b;
-   a->right = b->left;
-   b->left  = a;
-}
-
-/* Swing to the right.  Warning: no balance maintenance. */
-static void avl_swr ( AvlNode** root )
-{
-   AvlNode* a = *root;
-   AvlNode* b = a->left;
-   *root    = b;
-   a->left  = b->right;
-   b->right = a;
-}
-
-/* Balance maintenance after especially nasty swings. */
-static void avl_nasty ( AvlNode* root )
-{
-   switch (root->balance) {
-      case -1: 
-         root->left->balance  = 0;
-         root->right->balance = 1;
-         break;
-      case 1:
-         root->left->balance  = -1;
-         root->right->balance = 0;
-         break;
-      case 0:
-         root->left->balance  = 0;
-         root->right->balance = 0;
-         break;
-      default:
-         assert(0);
-   }
-   root->balance=0;
-}
-
-/* Find size of a non-NULL tree. */
-static Word size_avl_nonNull ( AvlNode* nd )
-{
-   return 1 + (nd->left  ? size_avl_nonNull(nd->left)  : 0)
-            + (nd->right ? size_avl_nonNull(nd->right) : 0);
-}
-
-/* Insert element a into the AVL tree t.  Returns True if the depth of
-   the tree has grown.  If element with that key is already present,
-   just copy a->val to existing node, first returning old ->val field
-   of existing node in *oldV, so that the caller can finalize it
-   however it wants.
-*/
-static 
-Bool avl_insert_wrk ( AvlNode**         rootp, 
-                      /*OUT*/MaybeWord* oldV,
-                      AvlNode*          a, 
-                      Word              (*kCmp)(Word,Word) )
-{
-   Word cmpres;
-
-   /* initialize */
-   a->left    = 0;
-   a->right   = 0;
-   a->balance = 0;
-   oldV->b    = False;
-
-   /* insert into an empty tree? */
-   if (!(*rootp)) {
-      (*rootp) = a;
-      return True;
-   }
- 
-   cmpres = kCmp( (*rootp)->key, a->key );
-
-   if (cmpres > 0) {
-      /* insert into the left subtree */
-      if ((*rootp)->left) {
-         AvlNode* left_subtree = (*rootp)->left;
-         if (avl_insert_wrk(&left_subtree, oldV, a, kCmp)) {
-            switch ((*rootp)->balance--) {
-               case  1: return False;
-               case  0: return True;
-               case -1: break;
-               default: assert(0);
-            }
-            if ((*rootp)->left->balance < 0) {
-               avl_swr( rootp );
-               (*rootp)->balance = 0;
-               (*rootp)->right->balance = 0;
-            } else {
-               avl_swl( &((*rootp)->left) );
-               avl_swr( rootp );
-               avl_nasty( *rootp );
-            }
-         } else {
-            (*rootp)->left = left_subtree;
-         }
-         return False;
-      } else {
-         (*rootp)->left = a;
-         if ((*rootp)->balance--) 
-            return False;
-         return True;
-      }
-      assert(0);/*NOTREACHED*/
-   }
-   else 
-   if (cmpres < 0) {
-      /* insert into the right subtree */
-      if ((*rootp)->right) {
-         AvlNode* right_subtree = (*rootp)->right;
-         if (avl_insert_wrk(&right_subtree, oldV, a, kCmp)) {
-            switch((*rootp)->balance++){
-               case -1: return False;
-               case  0: return True;
-               case  1: break;
-               default: assert(0);
-            }
-            if ((*rootp)->right->balance > 0) {
-               avl_swl( rootp );
-               (*rootp)->balance = 0;
-               (*rootp)->left->balance = 0;
-            } else {
-               avl_swr( &((*rootp)->right) );
-               avl_swl( rootp );
-               avl_nasty( *rootp );
-            }
-         } else {
-            (*rootp)->right = right_subtree;
-         }
-         return False;
-      } else {
-         (*rootp)->right = a;
-         if ((*rootp)->balance++) 
-            return False;
-         return True;
-      }
-      assert(0);/*NOTREACHED*/
-   }
-   else {
-      /* cmpres == 0, a duplicate - replace the val, but don't
-         incorporate the node in the tree */
-      oldV->b = True;
-      oldV->w = (*rootp)->val;
-      (*rootp)->val = a->val;
-      return False;
-   }
-}
-
-/* Remove an element a from the AVL tree t.  a must be part of
-   the tree.  Returns True if the depth of the tree has shrunk. 
-*/
-static
-Bool avl_remove_wrk ( AvlNode** rootp, 
-                      AvlNode*  a, 
-                      Word(*kCmp)(Word,Word) )
-{
-   Bool ch;
-   Word cmpres = kCmp( (*rootp)->key, a->key );
-
-   if (cmpres > 0){
-      /* remove from the left subtree */
-      AvlNode* left_subtree = (*rootp)->left;
-      assert(left_subtree);
-      ch = avl_remove_wrk(&left_subtree, a, kCmp);
-      (*rootp)->left=left_subtree;
-      if (ch) {
-         switch ((*rootp)->balance++) {
-            case -1: return True;
-            case  0: return False;
-            case  1: break;
-            default: assert(0);
-         }
-         switch ((*rootp)->right->balance) {
-            case 0:
-               avl_swl( rootp );
-               (*rootp)->balance = -1;
-               (*rootp)->left->balance = 1;
-               return False;
-            case 1: 
-               avl_swl( rootp );
-               (*rootp)->balance = 0;
-               (*rootp)->left->balance = 0;
-               return -1;
-            case -1:
-               break;
-            default:
-               assert(0);
-         }
-         avl_swr( &((*rootp)->right) );
-         avl_swl( rootp );
-         avl_nasty( *rootp );
-         return True;
-      }
-   }
-   else
-   if (cmpres < 0) {
-      /* remove from the right subtree */
-      AvlNode* right_subtree = (*rootp)->right;
-      assert(right_subtree);
-      ch = avl_remove_wrk(&right_subtree, a, kCmp);
-      (*rootp)->right = right_subtree;
-      if (ch) {
-         switch ((*rootp)->balance--) {
-            case  1: return True;
-            case  0: return False;
-            case -1: break;
-            default: assert(0);
-         }
-         switch ((*rootp)->left->balance) {
-            case 0:
-               avl_swr( rootp );
-               (*rootp)->balance = 1;
-               (*rootp)->right->balance = -1;
-               return False;
-            case -1:
-               avl_swr( rootp );
-               (*rootp)->balance = 0;
-               (*rootp)->right->balance = 0;
-               return True;
-            case 1:
-               break;
-            default:
-               assert(0);
-         }
-         avl_swl( &((*rootp)->left) );
-         avl_swr( rootp );
-         avl_nasty( *rootp );
-         return True;
-      }
-   }
-   else {
-      assert(cmpres == 0);
-      assert((*rootp)==a);
-      return avl_removeroot_wrk(rootp, kCmp);
-   }
-   return 0;
-}
-
-/* Remove the root of the AVL tree *rootp.
- * Warning: dumps core if *rootp is empty
- */
-static 
-Bool avl_removeroot_wrk ( AvlNode** rootp, 
-                          Word(*kCmp)(Word,Word) )
-{
-   Bool     ch;
-   AvlNode* a;
-   if (!(*rootp)->left) {
-      if (!(*rootp)->right) {
-         (*rootp) = 0;
-         return True;
-      }
-      (*rootp) = (*rootp)->right;
-      return True;
-   }
-   if (!(*rootp)->right) {
-      (*rootp) = (*rootp)->left;
-      return True;
-   }
-   if ((*rootp)->balance < 0) {
-      /* remove from the left subtree */
-      a = (*rootp)->left;
-      while (a->right) a = a->right;
-   } else {
-      /* remove from the right subtree */
-      a = (*rootp)->right;
-      while (a->left) a = a->left;
-   }
-   ch = avl_remove_wrk(rootp, a, kCmp);
-   a->left    = (*rootp)->left;
-   a->right   = (*rootp)->right;
-   a->balance = (*rootp)->balance;
-   (*rootp)   = a;
-   if(a->balance == 0) return ch;
-   return False;
-}
-
-static 
-AvlNode* avl_find_node ( AvlNode* t, Word k, Word(*kCmp)(Word,Word) )
-{
-   Word cmpres;
-   while (True) {
-      if (t == NULL) return NULL;
-      cmpres = kCmp(t->key, k);
-      if (cmpres > 0) t = t->left;  else
-      if (cmpres < 0) t = t->right; else
-      return t;
-   }
-}
-
-// Clear the iterator stack.
-static void stackClear(WordFM* fm)
-{
-   Int i;
-   assert(fm);
-   for (i = 0; i < WFM_STKMAX; i++) {
-      fm->nodeStack[i] = NULL;
-      fm->numStack[i]  = 0;
-   }
-   fm->stackTop = 0;
-}
-
-// Push onto the iterator stack.
-static inline void stackPush(WordFM* fm, AvlNode* n, Int i)
-{
-   assert(fm->stackTop < WFM_STKMAX);
-   assert(1 <= i && i <= 3);
-   fm->nodeStack[fm->stackTop] = n;
-   fm-> numStack[fm->stackTop] = i;
-   fm->stackTop++;
-}
-
-// Pop from the iterator stack.
-static inline Bool stackPop(WordFM* fm, AvlNode** n, Int* i)
-{
-   assert(fm->stackTop <= WFM_STKMAX);
-
-   if (fm->stackTop > 0) {
-      fm->stackTop--;
-      *n = fm->nodeStack[fm->stackTop];
-      *i = fm-> numStack[fm->stackTop];
-      assert(1 <= *i && *i <= 3);
-      fm->nodeStack[fm->stackTop] = NULL;
-      fm-> numStack[fm->stackTop] = 0;
-      return True;
-   } else {
-      return False;
-   }
-}
-
-static 
-AvlNode* avl_dopy ( AvlNode* nd, 
-                    Word(*dopyK)(Word), 
-                    Word(*dopyV)(Word),
-                    void*(alloc_nofail)(SizeT) )
-{
-   AvlNode* nyu;
-   if (! nd)
-      return NULL;
-   nyu = alloc_nofail(sizeof(AvlNode));
-   assert(nyu);
-   
-   nyu->left = nd->left;
-   nyu->right = nd->right;
-   nyu->balance = nd->balance;
-
-   /* Copy key */
-   if (dopyK) {
-      nyu->key = dopyK( nd->key );
-      if (nd->key != 0 && nyu->key == 0)
-         return NULL; /* oom in key dcopy */
-   } else {
-      /* copying assumedly unboxed keys */
-      nyu->key = nd->key;
-   }
-
-   /* Copy val */
-   if (dopyV) {
-      nyu->val = dopyV( nd->val );
-      if (nd->val != 0 && nyu->val == 0)
-         return NULL; /* oom in val dcopy */
-   } else {
-      /* copying assumedly unboxed vals */
-      nyu->val = nd->val;
-   }
-
-   /* Copy subtrees */
-   if (nyu->left) {
-      nyu->left = avl_dopy( nyu->left, dopyK, dopyV, alloc_nofail );
-      if (! nyu->left)
-         return NULL;
-   }
-   if (nyu->right) {
-      nyu->right = avl_dopy( nyu->right, dopyK, dopyV, alloc_nofail );
-      if (! nyu->right)
-         return NULL;
-   }
-
-   return nyu;
-}
-
-/* --- Public interface functions --- */
-
-/* Initialise a WordFM. */
-void initFM ( WordFM* fm,
-              void*   (*alloc_nofail)( SizeT ),
-              void    (*dealloc)(void*),
-              Word    (*kCmp)(Word,Word) )
-{
-   fm->root         = 0;
-   fm->kCmp         = kCmp;
-   fm->alloc_nofail = alloc_nofail;
-   fm->dealloc      = dealloc;
-   fm->stackTop     = 0;
-}
-
-/* Allocate and Initialise a WordFM. */
-WordFM* newFM( void* (*alloc_nofail)( SizeT ),
-               void  (*dealloc)(void*),
-               Word  (*kCmp)(Word,Word) )
-{
-   WordFM* fm = alloc_nofail(sizeof(WordFM));
-   assert(fm);
-   initFM(fm, alloc_nofail, dealloc, kCmp);
-   return fm;
-}
-
-static void avl_free ( AvlNode* nd, 
-                       void(*kFin)(Word),
-                       void(*vFin)(Word),
-                       void(*dealloc)(void*) )
-{
-   if (!nd)
-      return;
-   if (nd->left)
-      avl_free(nd->left, kFin, vFin, dealloc);
-   if (nd->right)
-      avl_free(nd->right, kFin, vFin, dealloc);
-   if (kFin)
-      kFin( nd->key );
-   if (vFin)
-      vFin( nd->val );
-   memset(nd, 0, sizeof(AvlNode));
-   dealloc(nd);
-}
-
-/* Free up the FM.  If kFin is non-NULL, it is applied to keys
-   before the FM is deleted; ditto with vFin for vals. */
-void deleteFM ( WordFM* fm, void(*kFin)(Word), void(*vFin)(Word) )
-{
-   void(*dealloc)(void*) = fm->dealloc;
-   avl_free( fm->root, kFin, vFin, dealloc );
-   memset(fm, 0, sizeof(WordFM) );
-   dealloc(fm);
-}
-
-/* Add (k,v) to fm. */
-void addToFM ( WordFM* fm, Word k, Word v )
-{
-   MaybeWord oldV;
-   AvlNode* node;
-   node = fm->alloc_nofail( sizeof(struct _AvlNode) );
-   node->key = k;
-   node->val = v;
-   oldV.b = False;
-   oldV.w = 0;
-   avl_insert_wrk( &fm->root, &oldV, node, fm->kCmp );
-   //if (oldV.b && fm->vFin)
-   //   fm->vFin( oldV.w );
-   if (oldV.b)
-      free(node);
-}
-
-// Delete key from fm, returning associated val if found
-Bool delFromFM ( WordFM* fm, /*OUT*/Word* oldV, Word key )
-{
-   AvlNode* node = avl_find_node( fm->root, key, fm->kCmp );
-   if (node) {
-      avl_remove_wrk( &fm->root, node, fm->kCmp );
-      if (oldV)
-         *oldV = node->val;
-      fm->dealloc(node);
-      return True;
-   } else {
-      return False;
-   }
-}
-
-// Look up in fm, assigning found val at spec'd address
-Bool lookupFM ( WordFM* fm, /*OUT*/Word* valP, Word key )
-{
-   AvlNode* node = avl_find_node( fm->root, key, fm->kCmp );
-   if (node) {
-      if (valP)
-         *valP = node->val;
-      return True;
-   } else {
-      return False;
-   }
-}
-
-Word sizeFM ( WordFM* fm )
-{
-   // Hmm, this is a bad way to do this
-   return fm->root ? size_avl_nonNull( fm->root ) : 0;
-}
-
-// set up FM for iteration
-void initIterFM ( WordFM* fm )
-{
-   assert(fm);
-   stackClear(fm);
-   if (fm->root)
-      stackPush(fm, fm->root, 1);
-}
-
-// get next key/val pair.  Will assert if fm has been modified
-// or looked up in since initIterFM was called.
-Bool nextIterFM ( WordFM* fm, /*OUT*/Word* pKey, /*OUT*/Word* pVal )
-{
-   Int i = 0;
-   AvlNode* n = NULL;
-   
-   assert(fm);
-
-   // This in-order traversal requires each node to be pushed and popped
-   // three times.  These could be avoided by updating nodes in-situ on the
-   // top of the stack, but the push/pop cost is so small that it's worth
-   // keeping this loop in this simpler form.
-   while (stackPop(fm, &n, &i)) {
-      switch (i) {
-      case 1: 
-         stackPush(fm, n, 2);
-         if (n->left)  stackPush(fm, n->left, 1);
-         break;
-      case 2: 
-         stackPush(fm, n, 3);
-         if (pKey) *pKey = n->key;
-         if (pVal) *pVal = n->val;
-         return True;
-      case 3:
-         if (n->right) stackPush(fm, n->right, 1);
-         break;
-      default:
-         assert(0);
-      }
-   }
-
-   // Stack empty, iterator is exhausted, return NULL
-   return False;
-}
-
-// clear the I'm iterating flag
-void doneIterFM ( WordFM* fm )
-{
-}
-
-WordFM* dopyFM ( WordFM* fm, Word(*dopyK)(Word), Word(*dopyV)(Word) )
-{
-   WordFM* nyu; 
-
-   /* can't clone the fm whilst iterating on it */
-   assert(fm->stackTop == 0);
-
-   nyu = fm->alloc_nofail( sizeof(WordFM) );
-   assert(nyu);
-
-   *nyu = *fm;
-
-   fm->stackTop = 0;
-   memset(fm->nodeStack, 0, sizeof(fm->nodeStack));
-   memset(fm->numStack, 0,  sizeof(fm->numStack));
-
-   if (nyu->root) {
-      nyu->root = avl_dopy( nyu->root, dopyK, dopyV, fm->alloc_nofail );
-      if (! nyu->root)
-         return NULL;
-   }
-
-   return nyu;
-}
-
-//------------------------------------------------------------------//
-//---                         end WordFM                         ---//
-//---                       Implementation                       ---//
-//------------------------------------------------------------------//
-
-/*--------------------------------------------------------------------*/
-/*--- end                                               cg_merge.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/cachegrind/cg_merge.in b/cachegrind/cg_merge.in
new file mode 100755
index 0000000000..fca73439eb
--- /dev/null
+++ b/cachegrind/cg_merge.in
@@ -0,0 +1,339 @@
+#! /usr/bin/env python3
+# pyright: strict
+
+# --------------------------------------------------------------------
+# --- Cachegrind's merger.                             cg_merge.in ---
+# --------------------------------------------------------------------
+
+#  This file is part of Cachegrind, a Valgrind tool for cache
+#  profiling programs.
+#
+#  Copyright (C) 2002-2023 Nicholas Nethercote
+#     njn@valgrind.org
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU General Public License as
+#  published by the Free Software Foundation; either version 2 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+#  The GNU General Public License is contained in the file COPYING.
+
+"""
+This script diffs Cachegrind output files.
+"""
+
+# Use `make pymerge` to "build" this script every time it is changed. This runs
+# the formatters, type-checkers, and linters on `cg_merge.in` and then
+# generates `cg_merge`.
+#
+# This is a cut-down version of `cg_annotate.in`.
+
+from __future__ import annotations
+
+import re
+import sys
+from argparse import ArgumentParser, Namespace
+from collections import defaultdict
+from typing import DefaultDict, NoReturn, TextIO
+
+
+class Args(Namespace):
+    """
+    A typed wrapper for parsed args.
+
+    None of these fields are modified after arg parsing finishes.
+    """
+
+    output: str
+    cgout_filename: list[str]
+
+    @staticmethod
+    def parse() -> Args:
+        p = ArgumentParser(description="Merge multiple Cachegrind output files.")
+
+        p.add_argument("--version", action="version", version="%(prog)s-@VERSION@")
+
+        p.add_argument(
+            "-o",
+            dest="output",
+            type=str,
+            metavar="FILE",
+            help="output file (default: stdout)",
+        )
+
+        p.add_argument(
+            "cgout_filename",
+            nargs="+",
+            metavar="cachegrind-out-file",
+            help="file produced by Cachegrind",
+        )
+
+        return p.parse_args(namespace=Args())
+
+
+# Args are stored in a global for easy access.
+args = Args.parse()
+
+# A single instance of this class is constructed, from `args` and the `events:`
+# line in the cgout file.
+class Events:
+    # The event names.
+    events: list[str]
+
+    def __init__(self, text: str) -> None:
+        self.events = text.split()
+        self.num_events = len(self.events)
+
+    def mk_cc(self, text: str) -> Cc:
+        """Raises a `ValueError` exception on syntax error."""
+        # This is slightly faster than a list comprehension.
+        counts = list(map(int, text.split()))
+
+        if len(counts) == self.num_events:
+            pass
+        elif len(counts) < self.num_events:
+            # Add zeroes at the end for any missing numbers.
+            counts.extend([0] * (self.num_events - len(counts)))
+        else:
+            raise ValueError
+
+        return Cc(counts)
+
+    def mk_empty_cc(self) -> Cc:
+        # This is much faster than a list comprehension.
+        return Cc([0] * self.num_events)
+
+
+class Cc:
+    """
+    This is a dumb container for counts.
+
+    It doesn't know anything about events, i.e. what each count means. It can
+    do basic operations like `__iadd__` and `__eq__`, and anything more must be
+    done elsewhere. `Events.mk_cc` and `Events.mk_empty_cc` are used for
+    construction.
+    """
+
+    # Always the same length as `Events.events`.
+    counts: list[int]
+
+    def __init__(self, counts: list[int]) -> None:
+        self.counts = counts
+
+    def __repr__(self) -> str:
+        return str(self.counts)
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Cc):
+            return NotImplemented
+        return self.counts == other.counts
+
+    def __iadd__(self, other: Cc) -> Cc:
+        for i, other_count in enumerate(other.counts):
+            self.counts[i] += other_count
+        return self
+
+
+# Per-line CCs, organised by filename, function name, and line number.
+DictLineCc = DefaultDict[int, Cc]
+DictFnDictLineCc = DefaultDict[str, DictLineCc]
+DictFlDictFnDictLineCc = DefaultDict[str, DictFnDictLineCc]
+
+
+def die(msg: str) -> NoReturn:
+    print("cg_merge: error:", msg, file=sys.stderr)
+    sys.exit(1)
+
+
+def read_cgout_file(
+    cgout_filename: str,
+    is_first_file: bool,
+    cumul_dict_fl_dict_fn_dict_line_cc: DictFlDictFnDictLineCc,
+    cumul_summary_cc: Cc,
+) -> tuple[list[str], str, Events]:
+    # The file format is described in Cachegrind's manual.
+    try:
+        cgout_file = open(cgout_filename, "r", encoding="utf-8")
+    except OSError as err:
+        die(f"{err}")
+
+    with cgout_file:
+        cgout_line_num = 0
+
+        def parse_die(msg: str) -> NoReturn:
+            die(f"{cgout_file.name}:{cgout_line_num}: {msg}")
+
+        def readline() -> str:
+            nonlocal cgout_line_num
+            cgout_line_num += 1
+            return cgout_file.readline()
+
+        # Read "desc:" lines.
+        desc: list[str] = []
+        while line := readline():
+            if m := re.match(r"desc:\s+(.*)", line):
+                desc.append(m.group(1))
+            else:
+                break
+
+        # Read "cmd:" line. (`line` is already set from the "desc:" loop.)
+        if m := re.match(r"cmd:\s+(.*)", line):
+            cmd = m.group(1)
+        else:
+            parse_die("missing a `command:` line")
+
+        # Read "events:" line.
+        line = readline()
+        if m := re.match(r"events:\s+(.*)", line):
+            events = Events(m.group(1))
+        else:
+            parse_die("missing an `events:` line")
+
+        def mk_empty_dict_line_cc() -> DictLineCc:
+            return defaultdict(events.mk_empty_cc)
+
+        def mk_empty_dict_fn_dict_line_cc() -> DictFnDictLineCc:
+            return defaultdict(mk_empty_dict_line_cc)
+
+        summary_cc_present = False
+
+        curr_fl = ""
+        curr_fn = ""
+
+        # The `cumul_*` values are passed in by reference and are modified by
+        # this function. But they can't be properly initialized until the
+        # `events:` line of the first file is read and the number of events is
+        # known. So we initialize them in an invalid state, and then
+        # reinitialize them properly here, before their first use.
+        if is_first_file:
+            cumul_dict_fl_dict_fn_dict_line_cc.default_factory = (
+                mk_empty_dict_fn_dict_line_cc
+            )
+            cumul_summary_cc.counts = events.mk_empty_cc().counts
+
+        # Compile the one hot regex.
+        count_pat = re.compile(r"(\d+)\s+(.*)")
+
+        # Line matching is done in order of pattern frequency, for speed.
+        while True:
+            line = readline()
+
+            if m := count_pat.match(line):
+                line_num = int(m.group(1))
+                try:
+                    cc = events.mk_cc(m.group(2))
+                except ValueError:
+                    parse_die("malformed or too many event counts")
+
+                # Record this CC at the file/func/line level.
+                line_cc = cumul_dict_fl_dict_fn_dict_line_cc[curr_fl][curr_fn][line_num]
+                line_cc += cc
+
+            elif line.startswith("fn="):
+                curr_fn = line[3:-1]
+
+            elif line.startswith("fl="):
+                curr_fl = line[3:-1]
+                # A `fn=` line should follow, overwriting the "???".
+                curr_fn = "???"
+
+            elif m := re.match(r"summary:\s+(.*)", line):
+                summary_cc_present = True
+                try:
+                    cumul_summary_cc += events.mk_cc(m.group(1))
+                except ValueError:
+                    parse_die("too many event counts")
+
+            elif line == "":
+                break  # EOF
+
+            elif line == "\n" or line.startswith("#"):
+                # Skip empty lines and comment lines.
+                pass
+
+            else:
+                parse_die(f"malformed line: {line[:-1]}")
+
+    # Check if summary line was present.
+    if not summary_cc_present:
+        parse_die("missing `summary:` line, aborting")
+
+    # In `cg_annotate.in` and `cg_diff.in` we check that the file's summary CC
+    # matches the totals of the file's individual CCs, but not here. That's
+    # because in this script we don't collect the file's CCs in isolation,
+    # instead we just add them to the accumulated CCs, for speed. This makes it
+    # difficult to do the per-file checking.
+
+    return (desc, cmd, events)
+
+
+def main() -> None:
+    desc1: list[str] | None = None
+    cmd1 = None
+    events1 = None
+
+    # Different places where we accumulate CC data. Initialized to invalid
+    # states prior to the number of events being known.
+    cumul_dict_fl_dict_fn_dict_line_cc: DictFlDictFnDictLineCc = defaultdict(None)
+    cumul_summary_cc: Cc = Cc([])
+
+    for n, filename in enumerate(args.cgout_filename):
+        is_first_file = n == 0
+        (desc_n, cmd_n, events_n) = read_cgout_file(
+            filename,
+            is_first_file,
+            cumul_dict_fl_dict_fn_dict_line_cc,
+            cumul_summary_cc,
+        )
+        # We reuse the description and command from the first file, like the
+        # the old C version of `cg_merge`.
+        if is_first_file:
+            desc1 = desc_n
+            cmd1 = cmd_n
+            events1 = events_n
+        else:
+            assert events1
+            if events1.num_events != events_n.num_events:
+                die("events don't match")
+
+    def write_output(f: TextIO) -> None:
+        # These assertions hold because the loop above executes at least twice.
+        assert desc1
+        assert events1
+        assert cumul_dict_fl_dict_fn_dict_line_cc is not None
+        assert cumul_summary_cc
+
+        for desc_line in desc1:
+            print("desc:", desc_line, file=f)
+        print("cmd:", cmd1, file=f)
+        print("events:", *events1.events, sep=" ", file=f)
+
+        for fl, dict_fn_dict_line_cc in cumul_dict_fl_dict_fn_dict_line_cc.items():
+            print(f"fl={fl}", file=f)
+            for fn, dict_line_cc in dict_fn_dict_line_cc.items():
+                print(f"fn={fn}", file=f)
+                for line, cc in dict_line_cc.items():
+                    print(line, *cc.counts, file=f)
+
+        print("summary:", *cumul_summary_cc.counts, sep=" ", file=f)
+
+    if args.output:
+        try:
+            with open(args.output, "w", encoding="utf-8") as f:
+                write_output(f)
+        except OSError as err:
+            die(f"{err}")
+    else:
+        write_output(sys.stdout)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cachegrind/tests/Makefile.am b/cachegrind/tests/Makefile.am
index 16ac524b35..3aa85c9990 100644
--- a/cachegrind/tests/Makefile.am
+++ b/cachegrind/tests/Makefile.am
@@ -15,6 +15,8 @@ dist_noinst_SCRIPTS = filter_stderr filter_cachesim_discards
 EXTRA_DIST = \
 	ann-diff1.post.exp ann-diff1.stderr.exp ann-diff1.vgtest \
 		ann-diff2a.cgout ann-diff2b.cgout \
+	ann-merge1.post.exp ann-merge1.stderr.exp ann-merge1.vgtest 
+		ann-merge1a.cgout ann-merge1b.cgout \
 	ann1a.post.exp ann1a.stderr.exp ann1a.vgtest ann1.cgout \
 	ann1b.post.exp ann1b.stderr.exp ann1b.vgtest ann1b.cgout \
 	ann2.post.exp ann2.stderr.exp ann2.vgtest ann2.cgout \
diff --git a/cachegrind/tests/ann-merge-x.rs b/cachegrind/tests/ann-merge-x.rs
new file mode 100644
index 0000000000..b2f931a673
--- /dev/null
+++ b/cachegrind/tests/ann-merge-x.rs
@@ -0,0 +1,5 @@
+one
+two
+three
+four
+five
diff --git a/cachegrind/tests/ann-merge-y.rs b/cachegrind/tests/ann-merge-y.rs
new file mode 100644
index 0000000000..b566061598
--- /dev/null
+++ b/cachegrind/tests/ann-merge-y.rs
@@ -0,0 +1,6 @@
+one
+two
+three
+four
+five
+six
diff --git a/cachegrind/tests/ann-merge1.post.exp b/cachegrind/tests/ann-merge1.post.exp
new file mode 100644
index 0000000000..6e9a5f1a37
--- /dev/null
+++ b/cachegrind/tests/ann-merge1.post.exp
@@ -0,0 +1,66 @@
+--------------------------------------------------------------------------------
+-- Cachegrind profile
+--------------------------------------------------------------------------------
+Description 1a
+Description 1b
+Command:          Command 1
+Data file:        ann-merge1c.cgout
+Events recorded:  A B C
+Events shown:     A B C
+Event sort order: A B C
+Threshold:        0.1
+Include dirs:     
+User annotated:   
+Auto-annotation:  on
+
+--------------------------------------------------------------------------------
+-- Summary
+--------------------------------------------------------------------------------
+A           B            C            
+
+86 (100.0%) 113 (100.0%) 145 (100.0%)  PROGRAM TOTALS
+
+--------------------------------------------------------------------------------
+-- Function summary
+--------------------------------------------------------------------------------
+A          B          C            file:function
+
+40 (46.5%) 80 (70.8%) 120 (82.8%)  ann-merge-x.rs:x1
+20 (23.3%) 10  (8.8%)   5  (3.4%)  ann-merge-x.rs:x3
+16 (18.6%) 18 (15.9%)  20 (13.8%)  ann-merge-y.rs:y1
+10 (11.6%)  5  (4.4%)   0          ann-merge-x.rs:x2
+
+--------------------------------------------------------------------------------
+-- Auto-annotated source file: ann-merge-x.rs
+--------------------------------------------------------------------------------
+A          B          C          
+
+20 (23.3%) 40 (35.4%) 60 (41.4%)  one
+10 (11.6%) 20 (17.7%) 30 (20.7%)  two
+10 (11.6%) 20 (17.7%) 30 (20.7%)  three
+10 (11.6%)  5  (4.4%)  0          four
+20 (23.3%) 10  (8.8%)  5  (3.4%)  five
+
+--------------------------------------------------------------------------------
+-- Auto-annotated source file: ann-merge-y.rs
+--------------------------------------------------------------------------------
+A        B        C         
+
+8 (9.3%) 9 (8.0%) 10 (6.9%)  one
+8 (9.3%) 9 (8.0%) 10 (6.9%)  two
+.        .         .         three
+.        .         .         four
+.        .         .         five
+.        .         .         six
+
+--------------------------------------------------------------------------------
+-- Annotation summary
+--------------------------------------------------------------------------------
+A           B            C            
+
+86 (100.0%) 113 (100.0%) 145 (100.0%)    annotated: files known & above threshold & readable, line numbers known
+ 0            0            0             annotated: files known & above threshold & readable, line numbers unknown
+ 0            0            0           unannotated: files known & above threshold & unreadable 
+ 0            0            0           unannotated: files known & below threshold
+ 0            0            0           unannotated: files unknown
+
diff --git a/cachegrind/tests/ann-merge1.stderr.exp b/cachegrind/tests/ann-merge1.stderr.exp
new file mode 100644
index 0000000000..e8084c12c3
--- /dev/null
+++ b/cachegrind/tests/ann-merge1.stderr.exp
@@ -0,0 +1,17 @@
+
+
+I   refs:
+I1  misses:
+LLi misses:
+I1  miss rate:
+LLi miss rate:
+
+D   refs:
+D1  misses:
+LLd misses:
+D1  miss rate:
+LLd miss rate:
+
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/ann-merge1.vgtest b/cachegrind/tests/ann-merge1.vgtest
new file mode 100644
index 0000000000..5862828a3f
--- /dev/null
+++ b/cachegrind/tests/ann-merge1.vgtest
@@ -0,0 +1,7 @@
+# The `prog` doesn't matter because we don't use its output. Instead we test
+# the post-processing of the `ann{1,1b}.cgout` test files.
+prog: ../../tests/true
+vgopts: --cachegrind-out-file=cachegrind.out
+post: python ../../cachegrind/cg_merge ann-merge1a.cgout ann-merge1b.cgout > ann-merge1c.cgout && python ../../cachegrind/cg_annotate ann-merge1c.cgout
+cleanup: rm ann-merge1c.cgout
+
diff --git a/cachegrind/tests/ann-merge1a.cgout b/cachegrind/tests/ann-merge1a.cgout
new file mode 100644
index 0000000000..d3d1aec29b
--- /dev/null
+++ b/cachegrind/tests/ann-merge1a.cgout
@@ -0,0 +1,19 @@
+desc: Description 1a
+desc: Description 1b
+cmd: Command 1
+events: A B C
+
+fl=ann-merge-x.rs
+fn=x1
+1 10 20 30
+2 10 20 30
+
+fn=x2
+4 10 5 0
+
+fl=ann-merge-y.rs
+fn=y1
+1 8 9 10
+2 8 9 10
+
+summary: 46 63 80
diff --git a/cachegrind/tests/ann-merge1b.cgout b/cachegrind/tests/ann-merge1b.cgout
new file mode 100644
index 0000000000..8552ce275c
--- /dev/null
+++ b/cachegrind/tests/ann-merge1b.cgout
@@ -0,0 +1,14 @@
+desc: Description 2a
+desc: Description 2b
+cmd: Command 2
+events: A B C
+
+fl=ann-merge-x.rs
+fn=x1
+1 10 20 30
+3 10 20 30
+
+fn=x3
+5 20 10 5
+
+summary: 40 50 65
diff --git a/configure.ac b/configure.ac
index 92fb33c4a5..ea6d3991a8 100755
--- a/configure.ac
+++ b/configure.ac
@@ -5406,6 +5406,7 @@ AC_CONFIG_FILES([
    cachegrind/tests/x86/Makefile
    cachegrind/cg_annotate
    cachegrind/cg_diff
+   cachegrind/cg_merge
    callgrind/Makefile
    callgrind/callgrind_annotate
    callgrind/callgrind_control
-- 
2.47.2