From: Florian Krohm <florian@eich-krohm.de>
Date: Sun, 7 Oct 2012 19:47:04 +0000 (+0000)
Subject: This patch is the first installment of the cache info reorganisation.
X-Git-Tag: svn/VALGRIND_3_9_0~640
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a9b2103cf2d5335903ab2db6aeafb56d5cf323f2;p=thirdparty%2Fvalgrind.git

This patch is the first installment of the cache info reorganisation.
It's reorg only. No new cache autodetection stuff has been added.

coregrind
pub_tool_cpuid.h is removed as it is no longer exposed to tools.
Its contents have moved to pub_core_cpuid.h.
New file: coregrind/m_cache.c to contain the autodetect code for
cache configurations and define other cache characteristics that
cannot be autodetected (i.e. icaches_maintain_coherence). Most of
cg-arch/x86-amd64.c was moved here. The cache detection code for
x86-64 needs to be fixed to properly initialise VexCacheInfo. It
currently has cachegrind bias.
m_cache.c exports a single function (to coregrind):
   VG_(machine_get_cache_info)(VexArchInfo *vai)
This function is called from VG_(machine_get_hwcaps) after hwcaps have
been detected.

cachegrind
Remove cachegrind/cg-{ppc32,ppc64,arm,mips32,s390x,x86-amd64}.c
With the exception of x86/amd64 those were only establishing a
default cache configuration and that is so small a code snippet that
a separate file is no longer warranted. So, the code was moved to
cg-arch.c. Code was added to extract the relevant info from
x86-amd64.
New function maybe_tweak_LLc which captures the code to massage the
LLc cache configuration into something the simulator can handle. This
was originally in cg-x86-amd64.c but should be used for all architectures.
Changed warning message about missing cache auto-detect feature
to be more useful. Adapted filter-stderr scripts accordingly.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13028
---

diff --git a/cachegrind/Makefile.am b/cachegrind/Makefile.am
index f22fe17307..43a44af6c8 100644
--- a/cachegrind/Makefile.am
+++ b/cachegrind/Makefile.am
@@ -41,13 +41,7 @@ endif
 
 CACHEGRIND_SOURCES_COMMON = \
 	cg_main.c \
-	cg-arch.c \
-	cg-x86-amd64.c \
-	cg-ppc32.c \
-	cg-ppc64.c \
-	cg-arm.c   \
-	cg-s390x.c \
-	cg-mips32.c
+	cg-arch.c
 
 cachegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES      = \
 	$(CACHEGRIND_SOURCES_COMMON)
@@ -88,5 +82,3 @@ cachegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LINK = \
 	$(cachegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS) \
 	$(cachegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS)
 endif
-
-
diff --git a/cachegrind/cg-arch.c b/cachegrind/cg-arch.c
index 4afaab6997..c6847d99e9 100644
--- a/cachegrind/cg-arch.c
+++ b/cachegrind/cg-arch.c
@@ -1,8 +1,5 @@
 /*--------------------------------------------------------------------*/
-/*--- Cachegrind: cache configuration.                             ---*/
-/*--- The architecture specific void VG_(configure_caches) are     ---*/
-/*--- located in the cg-<architecture>.c files.                    ---*/
-/*---                                                    cg-arch.c ---*/
+/*--- Cachegrind: cache configuration.                   cg-arch.c ---*/
 /*--------------------------------------------------------------------*/
 
 /*
@@ -35,9 +32,13 @@
 #include "pub_tool_libcbase.h"
 #include "pub_tool_libcprint.h"
 #include "pub_tool_options.h"
+#include "pub_tool_machine.h"
 
 #include "cg_arch.h"
 
+static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
+                             Bool all_caches_clo_defined);
+
 // Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
 // string otherwise.
 static Char* check_cache(cache_t* cache)
@@ -157,6 +158,65 @@ static void check_cache_or_override(Char* desc, cache_t* c, Bool clo_redefined)
    }
 }
 
+
+/* If the LL cache config isn't something the simulation functions
+   can handle, try to adjust it so it is.  Caches are characterised
+   by (total size T, line size L, associativity A), and then we
+   have
+
+     number of sets S = T / (L * A)
+
+   The required constraints are:
+
+   * L must be a power of 2, but it always is in practice, so
+     no problem there
+
+   * A can be any value >= 1
+
+   * T can be any value, but ..
+
+   * S must be a power of 2.
+
+   That sometimes gives a problem.  For example, some Core iX based
+   Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
+   sets.  The "fix" in this case is to increase the associativity
+   by 50% to 24, which reduces the number of sets to 8192, making
+   it a power of 2.  That's what the following code does (handling
+   the "3/2 rescaling case".)  We might need to deal with other
+   ratios later (5/4 ?).
+
+   The "fix" is "justified" (cough, cough) by alleging that
+   increases of associativity above about 4 have very little effect
+   on the actual miss rate.  It would be far more inaccurate to
+   fudge this by changing the size of the simulated cache --
+   changing the associativity is a much better option.
+*/
+
+static void
+maybe_tweak_LLc(cache_t *LLc)
+{
+  if (LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0) {
+      Long nSets = (Long)LLc->size / (Long)(LLc->line_size * LLc->assoc);
+      if (/* stay sane: leave very small set counts alone */
+          nSets >= 4
+          /* nSets is not a power of 2 */
+          && VG_(log2_64)( (ULong)nSets ) == -1
+          /* nSets is 50% above a power of 2 */
+          && VG_(log2_64)( (ULong)((2 * nSets) / (Long)3) ) != -1
+          /* associativity can be increased by exactly 50% */
+          && (LLc->assoc % 2) == 0
+         ) {
+         /* # sets is 1.5 * a power of two, but the associativity is
+            even, so we can increase that by 50% and implicitly
+            scale the # sets down accordingly. */
+         Int new_assoc = LLc->assoc + (LLc->assoc / 2);
+         VG_(dmsg)("warning: pretending that LL cache has associativity"
+                   " %d instead of actual %d\n", new_assoc, LLc->assoc);
+         LLc->assoc = new_assoc;
+      }
+   }
+}
+
 void VG_(post_clo_init_configure_caches)(cache_t* I1c,
                                          cache_t* D1c,
                                          cache_t* LLc,
@@ -174,7 +234,9 @@ void VG_(post_clo_init_configure_caches)(cache_t* I1c,
 
    // Set the cache config (using auto-detection, if supported by the
    // architecture).
-   VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
+   configure_caches( I1c, D1c, LLc, all_caches_clo_defined );
+
+   maybe_tweak_LLc( LLc );
 
    // Check the default/auto-detected values.
    // Allow the user to override invalid auto-detected caches
@@ -206,3 +268,172 @@ void VG_(print_cache_clo_opts)()
 "    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
                );
 }
+
+
+// Scan the ci->num_caches entries of ci->caches and return the first cache
+// whose kind and level both match.  Return NULL if no such cache exists.
+static const VexCache *
+locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level)
+{
+   const VexCache *c;
+
+   for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) {
+      if (c->level == level && c->kind == kind) {
+         return c;
+      }
+   }
+   return NULL;  // not found
+}
+
+
+// Gives the auto-detected configuration of I1, D1 and LL caches, or built-in
+// per-architecture defaults if none.  Command line settings override these.
+static void
+configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc,
+                 Bool all_caches_clo_defined)
+{
+   VexArchInfo vai;
+   const VexCacheInfo *ci;
+   const VexCache *i1, *d1, *ll;
+
+   VG_(machine_get_VexArchInfo)(NULL, &vai);
+   ci = &vai.hwcache_info;
+
+   // Extract what we need
+   i1 = locate_cache(ci, INSN_CACHE, 1);
+   d1 = locate_cache(ci, DATA_CACHE, 1);
+   // FIXME: needs clarification for num_levels > 3 see also warning below
+   // FIXME: whether it needs adjustment
+   ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels);
+
+   if (ll && ci->num_levels > 2) {
+      VG_(dmsg)("warning: L%u cache found, using its data for the "
+                "LL simulation.\n", ci->num_levels);
+   }
+
+   if (i1 && d1 && ll) {
+      *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB };
+      *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB };
+      *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB };
+
+      return;
+   }
+
+   // Cache information could not be queried; fall back to an
+   // architecture-specific default configuration.
+
+#if defined(VGA_ppc32)
+
+   // Default cache configuration
+   *I1c = (cache_t) {  65536, 2, 64 };
+   *D1c = (cache_t) {  65536, 2, 64 };
+   *LLc = (cache_t) { 262144, 8, 64 };
+
+#elif defined(VGA_ppc64)
+
+   // Default cache configuration
+   *I1c = (cache_t) {  65536, 2, 64 };
+   *D1c = (cache_t) {  65536, 2, 64 };
+   *LLc = (cache_t) { 262144, 8, 64 };
+
+#elif defined(VGA_arm)
+
+   // Set caches to default (for Cortex-A8 ?)
+   *I1c = (cache_t) {  16384, 4, 64 };
+   *D1c = (cache_t) {  16384, 4, 64 };
+   *LLc = (cache_t) { 262144, 8, 64 };
+
+#elif defined(VGA_s390x)
+   // z900
+   //
+   // Source:
+   // The microarchitecture of the IBM eServer z900 processor
+   // IBM Journal of Research and Development
+   // Volume 46, Number 4/5, pp 381-395, July/September 2002
+   //
+   // Split L1 I/D cache
+   // Size: 256 kB each
+   // Line size: 256 bytes
+   // 4-way set associative
+   // L2 cache: 16 MB x 2 (16 MB per 10 CPs)  (Charles Webb)
+
+   // z800
+   //
+   // Source:  Charles Webb from IBM
+   //
+   // Split L1 I/D cache
+   // Size: 256 kB each
+   // Line size: 256 bytes
+   // 4-way set associative
+   // L2 cache: 16 MB  (or half that size)
+
+   // z990
+   //
+   // The IBM eServer z990 microprocessor
+   // IBM Journal of Research and Development
+   // Volume 48, Number 3/4, pp 295-309, May/July 2004
+   //
+   // Split L1 I/D cache
+   // Size: 256 kB each
+   // Line size: 256 bytes
+   // 4-way set associative
+   // L2 cache: 32 MB x 4 (32 MB per book/node)  (Charles Webb)
+
+   // z890
+   //
+   // Source:  Charles Webb from IBM
+   //
+   // Split L1 I/D cache
+   // Size: 256 kB each
+   // Line size: 256 bytes
+   // 4-way set associative
+   // L2 cache: 32 MB  (or half that size)
+
+   // z9
+   //
+   // Source:  Charles Webb from IBM
+   //
+   // Split L1 I/D cache
+   // Size: 256 kB each
+   // Line size: 256 bytes
+   // 4-way set associative
+   // L2 cache: 40 MB x 4 (40 MB per book/node)
+
+   // fixs390: have a table for all models we support and check
+   // fixs390: VEX_S390X_MODEL(hwcaps)
+
+   // Default cache configuration is z10-EC  (Source: ECAG insn)
+   *I1c = (cache_t) {    65536,  4, 256 };
+   *D1c = (cache_t) {   131072,  8, 256 };
+   *LLc = (cache_t) { 50331648, 24, 256 };
+
+#elif defined(VGA_mips32)
+
+   // Set caches to default (for MIPS32-r2(mips 74kc))
+   *I1c = (cache_t) {  32768, 4, 32 };
+   *D1c = (cache_t) {  32768, 4, 32 };
+   *LLc = (cache_t) { 524288, 8, 32 };
+
+#elif defined(VGA_x86) || defined(VGA_amd64)
+
+   *I1c = (cache_t) {  65536, 2, 64 };
+   *D1c = (cache_t) {  65536, 2, 64 };
+   *LLc = (cache_t) { 262144, 8, 64 };
+
+#else
+
+#error "Unknown arch"
+
+#endif
+
+   if (!all_caches_clo_defined) {
+      const char warning[] =
+        "Warning: Cannot auto-detect cache config, using defaults.\n"
+        "         Run with -v to see.\n";
+      VG_(dmsg)("%s", warning);
+   }
+}
+
+/*--------------------------------------------------------------------*/
+/*--- end                                                          ---*/
+/*--------------------------------------------------------------------*/
diff --git a/cachegrind/cg-arm.c b/cachegrind/cg-arm.c
deleted file mode 100644
index 00badcd95f..0000000000
--- a/cachegrind/cg-arm.c
+++ /dev/null
@@ -1,59 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- ARM-specific definitions.                           cg-arm.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Cachegrind, a Valgrind tool for cache
-   profiling programs.
-
-   Copyright (C) 2005-2012 Johan Bjork
-      jbjoerk@gmail.com
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file COPYING.
-*/
-
-#if defined(VGA_arm)
-
-#include "pub_tool_basics.h"
-#include "pub_tool_libcbase.h"
-#include "pub_tool_libcassert.h"
-#include "pub_tool_libcprint.h"
-
-#include "cg_arch.h"
-
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
-                           Bool all_caches_clo_defined)
-{
-   // Set caches to default (for Cortex-A8 ?)
-   *I1c = (cache_t) {  16384, 4, 64 };
-   *D1c = (cache_t) {  16384, 4, 64 };
-   *LLc = (cache_t) { 262144, 8, 64 };
-
-   if (!all_caches_clo_defined) {
-      VG_(message)(Vg_DebugMsg, 
-                   "Warning: Cannot auto-detect cache config on ARM, using one "
-                   "or more defaults\n");
-   }
-}
-
-#endif // #if defined(VGA_arm)
-
-/*--------------------------------------------------------------------*/
-/*--- end                                                 cg-arm.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/cachegrind/cg-mips32.c b/cachegrind/cg-mips32.c
deleted file mode 100644
index 5ad69c2e51..0000000000
--- a/cachegrind/cg-mips32.c
+++ /dev/null
@@ -1,59 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- MIPS-specific definitions.                       cg-mips32.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Cachegrind, a Valgrind tool for cache
-   profiling programs.
-
-   Copyright (C) 2010-2012 RT-RK
-      mips-valgrind@rt-rk.com
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file COPYING.
-*/
-
-#if defined(VGA_mips32)
-
-#include "pub_tool_basics.h"
-#include "pub_tool_libcbase.h"
-#include "pub_tool_libcassert.h"
-#include "pub_tool_libcprint.h"
-
-#include "cg_arch.h"
-
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
-                           Bool all_caches_clo_defined)
-{
-   // Set caches to default (for MIPS32-r2(mips 74kc))
-   *I1c = (cache_t) {  32768, 4, 32 };
-   *D1c = (cache_t) {  32768, 4, 32 };
-   *L2c = (cache_t) { 524288, 8, 32 };
-
-   if (!all_caches_clo_defined) {
-      VG_(message)(Vg_DebugMsg, 
-                   "Warning: Cannot auto-detect cache config on MIPS32, using one "
-                   "or more defaults\n");
-   }
-}
-
-#endif // #if defined(VGA_mips32)
-
-/*--------------------------------------------------------------------*/
-/*--- end                                              cg-mips32.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/cachegrind/cg-ppc32.c b/cachegrind/cg-ppc32.c
deleted file mode 100644
index d0386d690f..0000000000
--- a/cachegrind/cg-ppc32.c
+++ /dev/null
@@ -1,68 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- PPC32-specific definitions.                       cg-ppc32.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Cachegrind, a Valgrind tool for cache
-   profiling programs.
-
-   Copyright (C) 2005-2012 Nicholas Nethercote
-      njn@valgrind.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file COPYING.
-*/
-
-#if defined(VGA_ppc32)
-
-#include "pub_tool_basics.h"
-#include "pub_tool_libcbase.h"
-#include "pub_tool_libcassert.h"
-#include "pub_tool_libcprint.h"
-
-#include "cg_arch.h"
-
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
-                           Bool all_caches_clo_defined)
-{
-   // Set caches to default.
-   *I1c = (cache_t) {  65536, 2, 64 };
-   *D1c = (cache_t) {  65536, 2, 64 };
-   *LLc = (cache_t) { 262144, 8, 64 };
-
-   // Warn if config not completely specified from cmd line.  Note that
-   // this message is slightly different from the one we give on x86/AMD64
-   // when auto-detection fails;  this lets us filter out this one (which is
-   // not important) in the regression test suite without filtering the
-   // x86/AMD64 one (which we want to see if it ever occurs in the
-   // regression test suite).
-   //
-   // If you change this message, please update
-   // cachegrind/tests/filter_stderr!
-   //
-   if (!all_caches_clo_defined) {
-      VG_(dmsg)("Warning: Cannot auto-detect cache config on PPC32, using one "
-                "or more defaults\n");
-   }
-}
-
-#endif // defined(VGA_ppc32)
-
-/*--------------------------------------------------------------------*/
-/*--- end                                                          ---*/
-/*--------------------------------------------------------------------*/
diff --git a/cachegrind/cg-ppc64.c b/cachegrind/cg-ppc64.c
deleted file mode 100644
index e594b997b1..0000000000
--- a/cachegrind/cg-ppc64.c
+++ /dev/null
@@ -1,68 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- PPC64-specific definitions.                       cg-ppc64.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Cachegrind, a Valgrind tool for cache
-   profiling programs.
-
-   Copyright (C) 2005-2012 Nicholas Nethercote
-      njn@valgrind.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file COPYING.
-*/
-
-#if defined(VGA_ppc64)
-
-#include "pub_tool_basics.h"
-#include "pub_tool_libcbase.h"
-#include "pub_tool_libcassert.h"
-#include "pub_tool_libcprint.h"
-
-#include "cg_arch.h"
-
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
-                           Bool all_caches_clo_defined)
-{
-   // Set caches to default.
-   *I1c = (cache_t) {  65536, 2, 64 };
-   *D1c = (cache_t) {  65536, 2, 64 };
-   *LLc = (cache_t) { 262144, 8, 64 };
-
-   // Warn if config not completely specified from cmd line.  Note that
-   // this message is slightly different from the one we give on x86/AMD64
-   // when auto-detection fails;  this lets us filter out this one (which is
-   // not important) in the regression test suite without filtering the
-   // x86/AMD64 one (which we want to see if it ever occurs in the
-   // regression test suite).
-   //
-   // If you change this message, please update
-   // cachegrind/tests/filter_stderr!
-   //
-   if (!all_caches_clo_defined) {
-      VG_(dmsg)("Warning: Cannot auto-detect cache config on PPC64, using one "
-                "or more defaults\n");
-   }
-}
-
-#endif
-
-/*--------------------------------------------------------------------*/
-/*--- end                                                          ---*/
-/*--------------------------------------------------------------------*/
diff --git a/cachegrind/cg-s390x.c b/cachegrind/cg-s390x.c
deleted file mode 100644
index 3165efd005..0000000000
--- a/cachegrind/cg-s390x.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/* -*- mode: C; c-basic-offset: 3; -*- */
-
-/*--------------------------------------------------------------------*/
-/*--- s390x-specific definitions.                       cg-s390x.c ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Cachegrind, a Valgrind tool for cache
-   profiling programs.
-
-   Copyright IBM Corp. 2010-2012
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file COPYING.
-*/
-
-/* Contributed by Christian Borntraeger */
-
-#if defined(VGA_s390x)
-
-#include "pub_tool_basics.h"
-#include "pub_tool_libcbase.h"
-#include "pub_tool_libcassert.h"
-#include "pub_tool_libcprint.h"
-
-#include "cg_arch.h"
-
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
-                           Bool all_caches_clo_defined)
-{
-   // z900
-   //
-   // Source:
-   // The microarchitecture of the IBM eServer z900 processor
-   // IBM Journal of Research and Development
-   // Volume 46, Number 4/5, pp 381-395, July/September 2002
-   //
-   // Split L1 I/D cache
-   // Size: 256 kB each
-   // Line size: 256 bytes
-   // 4-way set associative
-   // L2 cache: 16 MB x 2 (16 MB per 10 CPs)  (Charles Webb)
-
-   // z800
-   //
-   // Source:  Charles Webb from IBM
-   //
-   // Split L1 I/D cache
-   // Size: 256 kB each
-   // Line size: 256 bytes
-   // 4-way set associative
-   // L2 cache: 16 MB  (or half that size)
-
-   // z990
-   //
-   // The IBM eServer z990 microprocessor
-   // IBM Journal of Research and Development
-   // Volume 48, Number 3/4, pp 295-309, May/July 2004 
-   //
-   // Split L1 I/D cache
-   // Size: 256 kB each
-   // Line size: 256 bytes
-   // 4-way set associative
-   // L2 cache: 32 MB x 4 (32 MB per book/node)  (Charles Webb)
-
-   // z890
-   //
-   // Source:  Charles Webb from IBM
-   //
-   // Split L1 I/D cache
-   // Size: 256 kB each
-   // Line size: 256 bytes
-   // 4-way set associative
-   // L2 cache: 32 MB  (or half that size)
-
-   // z9
-   //
-   // Source:  Charles Webb from IBM
-   //
-   // Split L1 I/D cache
-   // Size: 256 kB each
-   // Line size: 256 bytes
-   // 4-way set associative
-   // L2 cache: 40 MB x 4 (40 MB per book/node)
-
-
-   // Set caches to z10 default.
-   // See IBM Journal of Research and Development
-   // Issue Date: Jan. 2009
-   // Volume: 53 Issue:1
-   // fixs390: have a table for all available models and check /proc/cpuinfo
-   *I1c = (cache_t) {   65536,  4, 256 };
-   *D1c = (cache_t) {  131072,  8, 256 };
-   *LLc = (cache_t) {50331648, 24, 256 };
-
-   // Warn if config not completely specified from cmd line.  Note that
-   // this message is slightly different from the one we give on x86/AMD64
-   // when auto-detection fails;  this lets us filter out this one (which is
-   // not important) in the regression test suite without filtering the
-   // x86/AMD64 one (which we want to see if it ever occurs in the
-   // regression test suite).
-   //
-   // If you change this message, please update
-   // cachegrind/tests/filter_stderr!
-   //
-   if (!all_caches_clo_defined) {
-      VG_(dmsg)("Warning: Cannot auto-detect cache config, "
-                "assuming z10-EC cache configuration\n");
-   }
-}
-
-#endif
-
-/*--------------------------------------------------------------------*/
-/*--- end                                               cg-s390x.c ---*/
-/*--------------------------------------------------------------------*/
diff --git a/cachegrind/cg_arch.h b/cachegrind/cg_arch.h
index 99d0cb1d33..d35ec73bd9 100644
--- a/cachegrind/cg_arch.h
+++ b/cachegrind/cg_arch.h
@@ -44,11 +44,6 @@ typedef struct {
 // initialized to UNDEFINED_CACHE.
 #define UNDEFINED_CACHE     { -1, -1, -1 }
 
-// Gives the auto-detected configuration of I1, D1 and LL caches.  They get
-// overridden by any cache configurations specified on the command line.
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
-                           Bool all_caches_clo_defined);
-
 // If arg is a command line option configuring I1 or D1 or LL cache,
 // then parses arg to set the relevant cache_t elements.
 // Returns True if arg is a cache command line option, False otherwise.
diff --git a/cachegrind/tests/filter_stderr b/cachegrind/tests/filter_stderr
index 0eeb91ea30..8b9dd78eaf 100755
--- a/cachegrind/tests/filter_stderr
+++ b/cachegrind/tests/filter_stderr
@@ -17,8 +17,6 @@ perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
 sed "/Simulating a 16 KB I-cache with 32 B lines/d"   |
 sed "/warning: L3 cache found, using its data for the LL simulation./d" |
-sed "/Warning: Cannot auto-detect cache config on PPC.., using one or more defaults/d" |
-sed "/Warning: Cannot auto-detect cache config on ARM, using one or more defaults/d" |
-sed "/Warning: Cannot auto-detect cache config, assuming z10-EC cache configuration/d" |
-sed "/Warning: Cannot auto-detect cache config on MIPS.., using one or more defaults/d" |
+sed "/Warning: Cannot auto-detect cache config, using defaults./d" |
+sed "/Run with -v to see./d" |
 sed "/warning: pretending that LL cache has associativity .*$/d"
diff --git a/callgrind/Makefile.am b/callgrind/Makefile.am
index ae4ff4fc69..343f03470e 100644
--- a/callgrind/Makefile.am
+++ b/callgrind/Makefile.am
@@ -45,13 +45,7 @@ CALLGRIND_SOURCES_COMMON = \
 	main.c \
 	sim.c \
 	threads.c \
-	../cachegrind/cg-arch.c \
-	../cachegrind/cg-x86-amd64.c \
-	../cachegrind/cg-ppc32.c \
-	../cachegrind/cg-ppc64.c \
-	../cachegrind/cg-arm.c   \
-	../cachegrind/cg-s390x.c \
-	../cachegrind/cg-mips32.c
+	../cachegrind/cg-arch.c
 
 CALLGRIND_CFLAGS_COMMON = -I$(top_srcdir)/cachegrind
 
@@ -94,4 +88,3 @@ callgrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LINK = \
 	$(callgrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS) \
 	$(callgrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS)
 endif
-
diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr
index 0842a93661..3f6f448627 100755
--- a/callgrind/tests/filter_stderr
+++ b/callgrind/tests/filter_stderr
@@ -26,8 +26,6 @@ perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' |
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
 sed "/Simulating a 16 KB I-cache with 32 B lines/d"   |
 sed "/warning: L3 cache found, using its data for the LL simulation./d" |
-sed "/Warning: Cannot auto-detect cache config on PPC.., using one or more defaults/d" |
-sed "/Warning: Cannot auto-detect cache config on ARM, using one or more defaults/d" |
-sed "/Warning: Cannot auto-detect cache config, assuming z10-EC cache configuration/d" |
-sed "/Warning: Cannot auto-detect cache config on MIPS.., using one or more defaults/d" |
+sed "/Warning: Cannot auto-detect cache config, using defaults./d" |
+sed "/Run with -v to see./d" |
 sed "/warning: pretending that LL cache has associativity .*$/d"
diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am
index 19fecf8b98..d4eb8167f8 100644
--- a/coregrind/Makefile.am
+++ b/coregrind/Makefile.am
@@ -240,6 +240,7 @@ pkglib_LIBRARIES += libcoregrind-@VGCONF_ARCH_SEC@-@VGCONF_OS@.a
 endif
 
 COREGRIND_SOURCES_COMMON = \
+	m_cache.c \
 	m_commandline.c \
 	m_clientstate.c \
 	m_cpuid.S \
diff --git a/cachegrind/cg-x86-amd64.c b/coregrind/m_cache.c
similarity index 71%
rename from cachegrind/cg-x86-amd64.c
rename to coregrind/m_cache.c
index 1eb6c99634..3d3860e4f4 100644
--- a/cachegrind/cg-x86-amd64.c
+++ b/coregrind/m_cache.c
@@ -1,11 +1,12 @@
+/* -*- mode: C; c-basic-offset: 3; -*- */
 
 /*--------------------------------------------------------------------*/
-/*--- x86- and AMD64-specific definitions.          cg-x86-amd64.c ---*/
+/*--- Cache-related stuff.                               m_cache.c ---*/
 /*--------------------------------------------------------------------*/
 
 /*
-   This file is part of Cachegrind, a Valgrind tool for cache
-   profiling programs.
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
 
    Copyright (C) 2002-2012 Nicholas Nethercote
       njn@valgrind.org
@@ -28,38 +29,51 @@
    The GNU General Public License is contained in the file COPYING.
 */
 
-#if defined(VGA_x86) || defined(VGA_amd64)
+#include "pub_core_basics.h"
+#include "pub_core_libcbase.h"
+#include "pub_core_libcassert.h"
+#include "pub_core_libcprint.h"
+#include "pub_core_mallocfree.h"
+#include "pub_core_machine.h"
+#include "libvex.h"
 
-#include "pub_tool_basics.h"
-#include "pub_tool_cpuid.h"
-#include "pub_tool_libcbase.h"
-#include "pub_tool_libcassert.h"
-#include "pub_tool_libcprint.h"
+#if defined(VGA_x86) || defined(VGA_amd64)
 
-#include "cg_arch.h"
+#include "pub_core_cpuid.h"
 
 // All CPUID info taken from sandpile.org/ia32/cpuid.htm */
 // Probably only works for Intel and AMD chips, and probably only for some of
-// them. 
+// them.
 
-static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
+static void 
+micro_ops_warn(Int actual_size, Int used_size, Int line_size)
 {
-   VG_(dmsg)("warning: Pentium 4 with %d KB micro-op instruction trace cache\n", 
+   VG_(dmsg)("warning: Pentium 4 with %d KB micro-op instruction trace cache\n",
              actual_size);
-   VG_(dmsg)("         Simulating a %d KB I-cache with %d B lines\n", 
+   VG_(dmsg)("         Simulating a %d KB I-cache with %d B lines\n",
              used_size, line_size);
 }
 
+/* FIXME: Temporarily introduce cachegrind's cache_t structure here to
+   get Intel_cache_info to work. This function needs to be rewritten to
+   properly fill in VexCacheInfo. Absolutely no warnings about ignored
+   caches and such are appropriate here! */
+typedef struct {
+   Int size;       // bytes
+   Int assoc;
+   Int line_size;  // bytes
+} cache_t;
+
 /* Intel method is truly wretched.  We have to do an insane indexing into an
  * array of pre-defined configurations for various parts of the memory
  * hierarchy.
  * According to Intel Processor Identification, App Note 485.
- * 
+ *
  * If a L3 cache is found, then data for it rather than the L2
  * is returned via *LLc.
  */
-static
-Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
+static Int
+Intel_cache_info_aux(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
 {
    Int cpuid1_eax;
    Int cpuid1_ignore;
@@ -88,7 +102,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
    family = (((cpuid1_eax >> 20) & 0xff) << 4) + ((cpuid1_eax >> 8) & 0xf);
    model =  (((cpuid1_eax >> 16) & 0xf) << 4) + ((cpuid1_eax >> 4) & 0xf);
 
-   VG_(cpuid)(2, 0, (Int*)&info[0], (Int*)&info[4], 
+   VG_(cpuid)(2, 0, (Int*)&info[0], (Int*)&info[4],
                     (Int*)&info[8], (Int*)&info[12]);
    trials  = info[0] - 1;   /* AL register - bits 0..7 of %eax */
    info[0] = 0x0;           /* reset AL */
@@ -105,7 +119,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
 
       case 0x0:       /* ignore zeros */
           break;
-          
+
       /* TLB info, ignore */
       case 0x01: case 0x02: case 0x03: case 0x04: case 0x05:
       case 0x0b:
@@ -116,7 +130,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
       case 0xb0: case 0xb1: case 0xb2:
       case 0xb3: case 0xb4: case 0xba: case 0xc0:
       case 0xca:
-          break;      
+          break;
 
       case 0x06: *I1c = (cache_t) {  8, 4, 32 }; break;
       case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break;
@@ -130,10 +144,10 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
       case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break;
 
       /* IA-64 info -- panic! */
-      case 0x10: case 0x15: case 0x1a: 
+      case 0x10: case 0x15: case 0x1a:
       case 0x88: case 0x89: case 0x8a: case 0x8d:
       case 0x90: case 0x96: case 0x9b:
-         VG_(tool_panic)("IA-64 cache detected?!");
+         VG_(core_panic)("IA-64 cache detected?!");
 
       /* L3 cache info. */
       case 0x22: L3c = (cache_t) { 512,    4, 64 }; L3_found = True; break;
@@ -169,7 +183,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
       case 0x39: *LLc = (cache_t) {  128, 4, 64 }; L2_found = True; break;
       case 0x3c: *LLc = (cache_t) {  256, 4, 64 }; L2_found = True; break;
 
-      /* If a P6 core, this means "no L2 cache".  
+      /* If a P6 core, this means "no L2 cache".
          If a P4 core, this means "no L3 cache".
          We don't know what core it is, so don't issue a warning.  To detect
          a missing L2 cache, we use 'L2_found'. */
@@ -201,20 +215,20 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
       /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
        * conversion to byte size is a total guess;  treat the 12K and 16K
        * cases the same since the cache byte size must be a power of two for
-       * everything to work!.  Also guessing 32 bytes for the line size... 
+       * everything to work!.  Also guessing 32 bytes for the line size...
        */
       case 0x70:    /* 12K micro-ops, 8-way */
-         *I1c = (cache_t) { 16, 8, 32 };  
+         *I1c = (cache_t) { 16, 8, 32 };
          micro_ops_warn(12, 16, 32);
-         break;  
+         break;
       case 0x71:    /* 16K micro-ops, 8-way */
-         *I1c = (cache_t) { 16, 8, 32 };  
-         micro_ops_warn(16, 16, 32); 
-         break;  
+         *I1c = (cache_t) { 16, 8, 32 };
+         micro_ops_warn(16, 16, 32);
+         break;
       case 0x72:    /* 32K micro-ops, 8-way */
-         *I1c = (cache_t) { 32, 8, 32 };  
-         micro_ops_warn(32, 32, 32); 
-         break;  
+         *I1c = (cache_t) { 32, 8, 32 };
+         micro_ops_warn(32, 32, 32);
+         break;
 
       /* not sectored, whatever that might mean */
       case 0x78: *LLc = (cache_t) { 1024, 4,  64 }; L2_found = True;  break;
@@ -242,7 +256,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
 
       case 0xff:
          j = 0;
-         VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4], 
+         VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4],
                             (Int*)&info[8], (Int*)&info[12]);
 
          while ((info[0] & 0x1f) != 0) {
@@ -264,25 +278,33 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
                case 1: *D1c = c; break;
                case 2: *I1c = c; break;
                case 3: VG_(dmsg)("warning: L1 unified cache ignored\n"); break;
-               default: VG_(dmsg)("warning: L1 cache of unknown type ignored\n"); break;
+               default:
+                  VG_(dmsg)("warning: L1 cache of unknown type ignored\n");
+                  break;
                }
                break;
             case 2:
                switch (info[0] & 0x1f)
                {
                case 1: VG_(dmsg)("warning: L2 data cache ignored\n"); break;
-               case 2: VG_(dmsg)("warning: L2 instruction cache ignored\n"); break;
+               case 2: VG_(dmsg)("warning: L2 instruction cache ignored\n");
+                  break;
                case 3: *LLc = c; L2_found = True; break;
-               default: VG_(dmsg)("warning: L2 cache of unknown type ignored\n"); break;
+               default:
+                  VG_(dmsg)("warning: L2 cache of unknown type ignored\n");
+                  break;
                }
                break;
             case 3:
                switch (info[0] & 0x1f)
                {
                case 1: VG_(dmsg)("warning: L3 data cache ignored\n"); break;
-               case 2: VG_(dmsg)("warning: L3 instruction cache ignored\n"); break;
+               case 2: VG_(dmsg)("warning: L3 instruction cache ignored\n");
+                  break;
                case 3: L3c = c; L3_found = True; break;
-               default: VG_(dmsg)("warning: L3 cache of unknown type ignored\n"); break;
+               default:
+                  VG_(dmsg)("warning: L3 cache of unknown type ignored\n");
+                  break;
                }
                break;
             default:
@@ -290,21 +312,26 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
                break;
             }
 
-            VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4], 
+            VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4],
                                (Int*)&info[8], (Int*)&info[12]);
          }
          break;
 
       default:
-         VG_(dmsg)("warning: Unknown Intel cache config value (0x%x), ignoring\n",
-                   info[i]);
+         VG_(dmsg)("warning: Unknown Intel cache config value (0x%x), "
+                   "ignoring\n", info[i]);
          break;
       }
    }
 
-   /* If we found a L3 cache, throw away the L2 data and use the L3's instead. */
+   /* If we found a L3 cache, throw away the L2 data and use the L3's
+      instead. */
    if (L3_found) {
-      VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n");
+      /* Can't warn here: as we're not necessarily in cachegrind */
+#if 0
+      VG_(dmsg)("warning: L3 cache found, using its data for the "
+                "LL simulation.\n");
+#endif
       *LLc = L3c;
       L2_found = True;
    }
@@ -315,35 +342,63 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
    return 0;
 }
 
+/* Run the legacy detector and repackage its result as a VexCacheInfo.
+   Returns the detector's status; *ci is written only when that is 0. */
+static Int
+Intel_cache_info(Int level, VexCacheInfo *ci)
+{
+   /* Zeroed so that a cache the detector does not fill in reads as
+      size 0 rather than as indeterminate stack contents. */
+   cache_t I1c = { 0, 0, 0 }, D1c = { 0, 0, 0 }, LLc = { 0, 0, 0 };
+   Int ret = Intel_cache_info_aux(level, &I1c, &D1c, &LLc);
+   if (ret != 0) return ret;
+
+   /* Map results to VexCacheInfo. This is lossy as we simply assume
+      there is an L2 here (where in fact it could have been an L3). It
+      is irrelevant for current usages but needs to be fixed! */
+   ci->num_levels = 2;
+   ci->num_caches = 3;
+   ci->icaches_maintain_coherence = True;
+   ci->caches = VG_(malloc)("m_cache", ci->num_caches * sizeof *ci->caches);
+   ci->caches[0] = VEX_CACHE_INIT(DATA_CACHE, 1, D1c.size, D1c.line_size,
+                                  D1c.assoc);
+   ci->caches[1] = VEX_CACHE_INIT(INSN_CACHE, 1, I1c.size, I1c.line_size,
+                                  I1c.assoc);
+   ci->caches[2] = VEX_CACHE_INIT(UNIFIED_CACHE, 2, LLc.size, LLc.line_size,
+                                  LLc.assoc);
+   return ret;
+}
+
 /* AMD method is straightforward, just extract appropriate bits from the
  * result registers.
  *
  * Bits, for D1 and I1:
- *  31..24  data L1 cache size in KBs    
- *  23..16  data L1 cache associativity (FFh=full)    
- *  15.. 8  data L1 cache lines per tag    
+ *  31..24  data L1 cache size in KBs
+ *  23..16  data L1 cache associativity (FFh=full)
+ *  15.. 8  data L1 cache lines per tag
  *   7.. 0  data L1 cache line size in bytes
  *
  * Bits, for L2:
  *  31..16  unified L2 cache size in KBs
  *  15..12  unified L2 cache associativity (0=off, FFh=full)
- *  11.. 8  unified L2 cache lines per tag    
+ *  11.. 8  unified L2 cache lines per tag
  *   7.. 0  unified L2 cache line size in bytes
  *
- * #3  The AMD K7 processor's L2 cache must be configured prior to relying 
+ * #3  The AMD K7 processor's L2 cache must be configured prior to relying
  *     upon this information. (Whatever that means -- njn)
  *
  * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model
  * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
  * so we detect that.
- * 
+ *
  * Returns 0 on success, non-zero on failure.  As with the Intel code
  * above, if a L3 cache is found, then data for it rather than the L2
  * is returned via *LLc.
  */
 
 /* A small helper */
-static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
+static Int
+decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
 {
    /* Decode a L2/L3 associativity indication.  It is encoded
       differently from the I1/D1 associativity.  Returns 1
@@ -362,17 +417,18 @@ static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
    }
 }
 
-static
-Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
+static Int
+AMD_cache_info(VexCacheInfo *ci)
 {
    UInt ext_level;
    UInt dummy, model;
    UInt I1i, D1i, L2i, L3i;
-   
+   UInt size, line_size, assoc;
+
    VG_(cpuid)(0x80000000, 0, &ext_level, &dummy, &dummy, &dummy);
 
    if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) {
-      VG_(dmsg)("warning: ext_level < 0x80000006 for AMD processor (0x%x)\n", 
+      VG_(dmsg)("warning: ext_level < 0x80000006 for AMD processor (0x%x)\n",
                 ext_level);
       return -1;
    }
@@ -384,40 +440,60 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
 
    /* Check for Duron bug */
    if (model == 0x630) {
-      VG_(dmsg)("warning: Buggy Duron stepping A0. Assuming L2 size=65536 bytes\n");
+      VG_(dmsg)("warning: Buggy Duron stepping A0. "
+                "Assuming L2 size=65536 bytes\n");
       L2i = (64 << 16) | (L2i & 0xffff);
    }
 
-   D1c->size      = (D1i >> 24) & 0xff;
-   D1c->assoc     = (D1i >> 16) & 0xff;
-   D1c->line_size = (D1i >>  0) & 0xff;
+   ci->num_levels = 2;
+   ci->num_caches = 3;
+   ci->icaches_maintain_coherence = True;
+
+   /* Check for L3 cache */
+   if (((L3i >> 18) & 0x3fff) > 0) {
+      ci->num_levels = 3;
+      ci->num_caches = 4;
+   }
+
+   ci->caches = VG_(malloc)("m_cache", ci->num_caches * sizeof *ci->caches);
+
+   // D1
+   size      = (D1i >> 24) & 0xff;
+   assoc     = (D1i >> 16) & 0xff;
+   line_size = (D1i >>  0) & 0xff;
+   ci->caches[0] = VEX_CACHE_INIT(DATA_CACHE, 1, size, line_size, assoc);
 
-   I1c->size      = (I1i >> 24) & 0xff;
-   I1c->assoc     = (I1i >> 16) & 0xff;
-   I1c->line_size = (I1i >>  0) & 0xff;
+   // I1
+   size      = (I1i >> 24) & 0xff;
+   assoc     = (I1i >> 16) & 0xff;
+   line_size = (I1i >>  0) & 0xff;
+   ci->caches[1] = VEX_CACHE_INIT(INSN_CACHE, 1, size, line_size, assoc);
 
-   LLc->size      = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
-   LLc->assoc     = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
-   LLc->line_size = (L2i >>  0) & 0xff;
+   // L2 (unified)    Nb: different bits used for L2
+   size      = (L2i >> 16) & 0xffff;
+   assoc     = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
+   line_size = (L2i >>  0) & 0xff;
+   ci->caches[2] = VEX_CACHE_INIT(UNIFIED_CACHE, 2, size, line_size, assoc);
 
+   // L3 (unified), if any
    if (((L3i >> 18) & 0x3fff) > 0) {
-      /* There's an L3 cache.  Replace *LLc contents with this info. */
+      /* There's an L3 cache. */
       /* NB: the test in the if is "if L3 size > 0 ".  I don't know if
          this is the right way to test presence-vs-absence of L3.  I
          can't see any guidance on this in the AMD documentation. */
-      LLc->size      = ((L3i >> 18) & 0x3fff) * 512;
-      LLc->assoc     = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
-      LLc->line_size = (L3i >>  0) & 0xff;
-      VG_(dmsg)("warning: L3 cache found, using its data for the L2 simulation.\n");
+      size      = ((L3i >> 18) & 0x3fff) * 512;
+      assoc     = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
+      line_size = (L3i >>  0) & 0xff;
+      ci->caches[3] = VEX_CACHE_INIT(UNIFIED_CACHE, 3, size, line_size, assoc);
    }
 
    return 0;
 }
 
-static 
-Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
+static Int
+get_caches_from_CPUID(VexCacheInfo *ci)
 {
-   Int  level, ret;
+   Int  level, ret, i;
    Char vendor_id[13];
 
    if (!VG_(has_cpuid)()) {
@@ -425,8 +501,8 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
       return -1;
    }
 
-   VG_(cpuid)(0, 0, &level, (int*)&vendor_id[0], 
-	      (int*)&vendor_id[8], (int*)&vendor_id[4]);    
+   VG_(cpuid)(0, 0, &level, (int*)&vendor_id[0],
+	      (int*)&vendor_id[8], (int*)&vendor_id[4]);
    vendor_id[12] = '\0';
 
    if (0 == level) {
@@ -436,22 +512,21 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
 
    /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
    if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
-      ret = Intel_cache_info(level, I1c, D1c, LLc);
+      ret = Intel_cache_info(level, ci);
 
    } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
-      ret = AMD_cache_info(I1c, D1c, LLc);
+      ret = AMD_cache_info(ci);
 
    } else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
       /* Total kludge.  Pretend to be a VIA Nehemiah. */
-      D1c->size      = 64;
-      D1c->assoc     = 16;
-      D1c->line_size = 16;
-      I1c->size      = 64;
-      I1c->assoc     = 4;
-      I1c->line_size = 16;
-      LLc->size      = 64;
-      LLc->assoc     = 16;
-      LLc->line_size = 16;
+      ci->num_levels = 2;
+      ci->num_caches = 3;
+      ci->icaches_maintain_coherence = True;
+      ci->caches = VG_(malloc)("m_cache", ci->num_caches * sizeof *ci->caches);
+      ci->caches[0] = VEX_CACHE_INIT(DATA_CACHE,    1, 64, 16, 16);
+      ci->caches[1] = VEX_CACHE_INIT(INSN_CACHE,    1, 64, 16,  4);
+      ci->caches[2] = VEX_CACHE_INIT(UNIFIED_CACHE, 2, 64, 16, 16);
+
       ret = 0;
 
    } else {
@@ -460,88 +535,47 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
    }
 
    /* Successful!  Convert sizes from KB to bytes */
-   I1c->size *= 1024;
-   D1c->size *= 1024;
-   LLc->size *= 1024;
-
-   /* If the LL cache config isn't something the simulation functions
-      can handle, try to adjust it so it is.  Caches are characterised
-      by (total size T, line size L, associativity A), and then we
-      have
-
-        number of sets S = T / (L * A)
-
-      The required constraints are:
-
-      * L must be a power of 2, but it always is in practice, so
-        no problem there
-
-      * A can be any value >= 1
-
-      * T can be any value, but ..
-
-      * S must be a power of 2.
-
-      That sometimes gives a problem.  For example, some Core iX based
-      Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
-      sets.  The "fix" in this case is to increase the associativity
-      by 50% to 24, which reduces the number of sets to 8192, making
-      it a power of 2.  That's what the following code does (handing
-      the "3/2 rescaling case".)  We might need to deal with other
-      ratios later (5/4 ?).
-
-      The "fix" is "justified" (cough, cough) by alleging that
-      increases of associativity above about 4 have very little effect
-      on the actual miss rate.  It would be far more inaccurate to
-      fudge this by changing the size of the simulated cache --
-      changing the associativity is a much better option.
-   */
-   if (LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0) {
-      Long nSets = (Long)LLc->size / (Long)(LLc->line_size * LLc->assoc);
-      if (/* stay sane */
-          nSets >= 4
-          /* nSets is not a power of 2 */
-          && VG_(log2_64)( (ULong)nSets ) == -1
-          /* nSets is 50% above a power of 2 */
-          && VG_(log2_64)( (ULong)((2 * nSets) / (Long)3) ) != -1
-          /* associativity can be increased by exactly 50% */
-          && (LLc->assoc % 2) == 0
-         ) {
-         /* # sets is 1.5 * a power of two, but the associativity is
-            even, so we can increase that up by 50% and implicitly
-            scale the # sets down accordingly. */
-         Int new_assoc = LLc->assoc + (LLc->assoc / 2);
-         VG_(dmsg)("warning: pretending that LL cache has associativity"
-                   " %d instead of actual %d\n", new_assoc, LLc->assoc);
-         LLc->assoc = new_assoc;
-      }
+   for (i = 0; i < ci->num_caches; ++i) {
+      ci->caches[i].sizeB *= 1024;
    }
 
    return ret;
 }
 
+/* Returns True iff get_caches_from_CPUID reports success (0). */
+Bool
+VG_(machine_get_cache_info)(VexArchInfo *vai)
+{
+   if (get_caches_from_CPUID(&vai->hwcache_info) != 0) return False;
+   return True;
+}
+
+#elif defined(VGA_arm) || defined(VGA_ppc32) || defined(VGA_ppc64) || \
+      defined(VGA_mips32)
 
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
-                           Bool all_caches_clo_defined)
+Bool
+VG_(machine_get_cache_info)(VexArchInfo *vai)
 {
-   Int res;
-   
-   // Set caches to default.
-   *I1c = (cache_t) {  65536, 2, 64 };
-   *D1c = (cache_t) {  65536, 2, 64 };
-   *LLc = (cache_t) { 262144, 8, 64 };
-
-   // Then replace with any info we can get from CPUID.
-   res = get_caches_from_CPUID(I1c, D1c, LLc);
-
-   // Warn if CPUID failed and config not completely specified from cmd line.
-   if (res != 0 && !all_caches_clo_defined) {
-      VG_(dmsg)("Warning: Couldn't auto-detect cache config, using one "
-                "or more defaults \n");
-   }
+   vai->hwcache_info.icaches_maintain_coherence = False;
+
+   return False;   // not yet
 }
 
-#endif // defined(VGA_x86) || defined(VGA_amd64)
+#elif defined(VGA_s390x)
+
+/* s390x: no cache auto-detection yet; only record I-cache coherence. */
+Bool
+VG_(machine_get_cache_info)(VexArchInfo *vai)
+{
+   vai->hwcache_info.icaches_maintain_coherence = True;
+   return False;   // not yet
+}
+
+#else
+
+#error "Unknown arch"
+
+#endif
 
 /*--------------------------------------------------------------------*/
 /*--- end                                                          ---*/
diff --git a/coregrind/m_libcproc.c b/coregrind/m_libcproc.c
index 136426b4d2..f0a7d7f3e2 100644
--- a/coregrind/m_libcproc.c
+++ b/coregrind/m_libcproc.c
@@ -724,6 +724,15 @@ void VG_(do_atfork_child)(ThreadId tid)
 
 void VG_(invalidate_icache) ( void *ptr, SizeT nbytes )
 {
+   if (nbytes == 0) return;    // nothing to do
+
+   // Get cache info
+   VexArchInfo vai;
+   VG_(machine_get_VexArchInfo)(NULL, &vai);
+
+   // If I-caches are coherent, nothing needs to be done here
+   if (vai.hwcache_info.icaches_maintain_coherence) return;
+
 #  if defined(VGA_ppc32) || defined(VGA_ppc64)
    Addr startaddr = (Addr) ptr;
    Addr endaddr   = startaddr + nbytes;
@@ -731,9 +740,4 @@ void VG_(invalidate_icache) ( void *ptr, SizeT nbytes )
    Addr addr;
-   VexArchInfo vai;
 
-   if (nbytes == 0) return;
-   vg_assert(nbytes > 0);
-
-   VG_(machine_get_VexArchInfo)( NULL, &vai );
    cls = vai.ppc_cache_line_szB;
 
@@ -750,15 +754,6 @@ void VG_(invalidate_icache) ( void *ptr, SizeT nbytes )
    }
    __asm__ __volatile__("sync; isync");
 
-#  elif defined(VGA_x86)
-   /* no need to do anything, hardware provides coherence */
-
-#  elif defined(VGA_amd64)
-   /* no need to do anything, hardware provides coherence */
-
-#  elif defined(VGA_s390x)
-   /* no need to do anything, hardware provides coherence */
-
 #  elif defined(VGP_arm_linux)
    /* ARM cache flushes are privileged, so we must defer to the kernel. */
    Addr startaddr = (Addr) ptr;
@@ -770,8 +765,6 @@ void VG_(invalidate_icache) ( void *ptr, SizeT nbytes )
                                  (UWord) nbytes, (UWord) 3);
    vg_assert( sres._isError == 0 );
 
-#  else
-#    error "Unknown ARCH"
 #  endif
 }
 
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
index f2d90a211e..9b4dabc1ae 100644
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -674,7 +674,8 @@ static UInt VG_(get_machine_model)(void)
 
 /* Determine what insn set and insn set variant the host has, and
    record it.  To be called once at system startup.  Returns False if
-   this a CPU incapable of running Valgrind. */
+   this is a CPU incapable of running Valgrind.
+   Also determine information about the caches on this host. */
 
 Bool VG_(machine_get_hwcaps)( void )
 {
@@ -731,26 +732,23 @@ Bool VG_(machine_get_hwcaps)( void )
         have_lzcnt = (ecx & (1<<5)) != 0; /* True => have LZCNT */
      }
 
+     va = VexArchX86;
      if (have_sse2 && have_sse1) {
-        va          = VexArchX86;
         vai.hwcaps  = VEX_HWCAPS_X86_SSE1;
         vai.hwcaps |= VEX_HWCAPS_X86_SSE2;
         if (have_lzcnt)
            vai.hwcaps |= VEX_HWCAPS_X86_LZCNT;
         VG_(machine_x86_have_mxcsr) = 1;
-        return True;
-     }
-
-     if (have_sse1) {
-        va          = VexArchX86;
+     } else if (have_sse1) {
         vai.hwcaps  = VEX_HWCAPS_X86_SSE1;
         VG_(machine_x86_have_mxcsr) = 1;
-        return True;
+     } else {
+       vai.hwcaps = 0; /*baseline - no sse at all*/
+       VG_(machine_x86_have_mxcsr) = 0;
      }
 
-     va         = VexArchX86;
-     vai.hwcaps = 0; /*baseline - no sse at all*/
-     VG_(machine_x86_have_mxcsr) = 0;
+     VG_(machine_get_cache_info)(&vai);
+
      return True;
    }
 
@@ -836,6 +834,9 @@ Bool VG_(machine_get_hwcaps)( void )
                 | (have_cx16  ? VEX_HWCAPS_AMD64_CX16  : 0)
                 | (have_lzcnt ? VEX_HWCAPS_AMD64_LZCNT : 0)
                 | (have_avx   ? VEX_HWCAPS_AMD64_AVX   : 0);
+
+     VG_(machine_get_cache_info)(&vai);
+
      return True;
    }
 
@@ -980,6 +981,7 @@ Bool VG_(machine_get_hwcaps)( void )
      if (have_VX) vai.hwcaps |= VEX_HWCAPS_PPC32_VX;
      if (have_DFP) vai.hwcaps |= VEX_HWCAPS_PPC32_DFP;
 
+     VG_(machine_get_cache_info)(&vai);
 
      /* But we're not done yet: VG_(machine_ppc32_set_clszB) must be
         called before we're ready to go. */
@@ -1105,6 +1107,8 @@ Bool VG_(machine_get_hwcaps)( void )
      if (have_VX) vai.hwcaps |= VEX_HWCAPS_PPC64_VX;
      if (have_DFP) vai.hwcaps |= VEX_HWCAPS_PPC64_DFP;
 
+     VG_(machine_get_cache_info)(&vai);
+
      /* But we're not done yet: VG_(machine_ppc64_set_clszB) must be
         called before we're ready to go. */
      return True;
@@ -1251,6 +1255,8 @@ Bool VG_(machine_get_hwcaps)( void )
 
      VG_(debugLog)(1, "machine", "hwcaps = 0x%x\n", vai.hwcaps);
 
+     VG_(machine_get_cache_info)(&vai);
+
      return True;
    }
 
@@ -1360,6 +1366,8 @@ Bool VG_(machine_get_hwcaps)( void )
      if (have_VFP)  vai.hwcaps |= VEX_HWCAPS_ARM_VFP;
      if (have_NEON) vai.hwcaps |= VEX_HWCAPS_ARM_NEON;
 
+     VG_(machine_get_cache_info)(&vai);
+
      return True;
    }
 
@@ -1371,6 +1379,9 @@ Bool VG_(machine_get_hwcaps)( void )
          return False;
 
      vai.hwcaps = model;
+
+     VG_(machine_get_cache_info)(&vai);
+
      return True;
    }
 
diff --git a/coregrind/pub_core_cpuid.h b/coregrind/pub_core_cpuid.h
index 7f1d75035f..269ae57df8 100644
--- a/coregrind/pub_core_cpuid.h
+++ b/coregrind/pub_core_cpuid.h
@@ -36,7 +36,13 @@
 // CPUID instruction.
 //--------------------------------------------------------------------
 
-#include "pub_tool_cpuid.h"
+#if defined(VGA_x86) || defined(VGA_amd64)
+extern Bool VG_(has_cpuid) ( void );
+
+extern void VG_(cpuid) ( UInt eax, UInt ecx,
+                         UInt* eax_ret, UInt* ebx_ret,
+                         UInt* ecx_ret, UInt* edx_ret );
+#endif
 
 #endif   // __PUB_CORE_CPUID_H
 
diff --git a/coregrind/pub_core_machine.h b/coregrind/pub_core_machine.h
index ded9b9af48..11e590a8cf 100644
--- a/coregrind/pub_core_machine.h
+++ b/coregrind/pub_core_machine.h
@@ -199,9 +199,9 @@ void VG_(get_UnwindStartRegs) ( /*OUT*/UnwindStartRegs* regs,
    this a CPU incapable of running Valgrind. */
 extern Bool VG_(machine_get_hwcaps)( void );
 
-/* Fetch host cpu info, as per above comment. */
-extern void VG_(machine_get_VexArchInfo)( /*OUT*/VexArch*,
-                                          /*OUT*/VexArchInfo* );
+/* Determine information about the cache system this host has and
+   record it. Returns False if cache information cannot be auto-detected. */
+extern Bool VG_(machine_get_cache_info)( VexArchInfo * );
 
 /* Notify host cpu cache line size, as per above comment. */
 #if defined(VGA_ppc32)
diff --git a/include/Makefile.am b/include/Makefile.am
index a115754a65..41defb8ceb 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -8,7 +8,6 @@ nobase_pkginclude_HEADERS = \
 	pub_tool_aspacemgr.h 		\
 	pub_tool_clientstate.h		\
 	pub_tool_clreq.h		\
-	pub_tool_cpuid.h 		\
 	pub_tool_debuginfo.h 		\
 	pub_tool_errormgr.h 		\
 	pub_tool_execontext.h 		\
diff --git a/include/pub_tool_cpuid.h b/include/pub_tool_cpuid.h
deleted file mode 100644
index 149131a024..0000000000
--- a/include/pub_tool_cpuid.h
+++ /dev/null
@@ -1,46 +0,0 @@
-
-/*--------------------------------------------------------------------*/
-/*--- Interface to CPUID.                         pub_tool_cpuid.h ---*/
-/*--------------------------------------------------------------------*/
-
-/*
-   This file is part of Valgrind, a dynamic binary instrumentation
-   framework.
-
-   Copyright (C) 2000-2012 Julian Seward
-      jseward@acm.org
-
-   This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307, USA.
-
-   The GNU General Public License is contained in the file COPYING.
-*/
-
-#ifndef __PUB_TOOL_CPUID_H
-#define __PUB_TOOL_CPUID_H
-
-#if defined(VGA_x86) || defined(VGA_amd64)
-extern Bool VG_(has_cpuid) ( void );
-
-extern void VG_(cpuid) ( UInt eax, UInt ecx,
-                         UInt* eax_ret, UInt* ebx_ret,
-                         UInt* ecx_ret, UInt* edx_ret );
-#endif
-
-#endif   // __PUB_TOOL_CPUID_H
-
-/*--------------------------------------------------------------------*/
-/*--- end                                                          ---*/
-/*--------------------------------------------------------------------*/
diff --git a/include/pub_tool_machine.h b/include/pub_tool_machine.h
index 026db6b574..08ab203a20 100644
--- a/include/pub_tool_machine.h
+++ b/include/pub_tool_machine.h
@@ -31,6 +31,8 @@
 #ifndef __PUB_TOOL_MACHINE_H
 #define __PUB_TOOL_MACHINE_H
 
+#include "libvex.h"                    // VexArchInfo
+
 #if defined(VGP_x86_linux)
 #  define VG_MIN_INSTR_SZB          1  // min length of native instruction
 #  define VG_MAX_INSTR_SZB         16  // max length of native instruction
@@ -164,6 +166,10 @@ extern void* VG_(fnptr_to_fnentry)( void* );
    (eg, AVX or non-AVX ?, for amd64). */
 extern Int VG_(machine_get_size_of_largest_guest_register) ( void );
 
+/* Return host cpu info. */
+extern void VG_(machine_get_VexArchInfo)( /*OUT*/VexArch*,
+                                          /*OUT*/VexArchInfo* );
+
 #endif   // __PUB_TOOL_MACHINE_H
 
 /*--------------------------------------------------------------------*/