From: Julian Seward Date: Thu, 29 Jan 2009 10:14:53 +0000 (+0000) Subject: Merge in non-power-of-2-sized cache simulation fixes for Callgrind X-Git-Tag: svn/VALGRIND_3_4_1~37 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=15402401c3efd8c00ea31ef2855dfcf891789d41;p=thirdparty%2Fvalgrind.git Merge in non-power-of-2-sized cache simulation fixes for Callgrind and Cachegrind: 8912 callgrind/tests/filter_stderr cachegrind/tests/filter_stderr Filter out an additional warning, so the tests pass on machines with a 6MB L2 cache. cachegrind/cg-x86.c cachegrind/cg-amd64.c These two files were almost identical. cg-amd64.c now just #includes cg-x86.c. 9080 Cachegrind/Callgrind: allow for cache sizes other than only powers of two 9081 Callgrind: Remove ifdef'ed-out, non-working code. git-svn-id: svn://svn.valgrind.org/valgrind/branches/VALGRIND_3_4_BRANCH@9088 --- diff --git a/cachegrind/cg-amd64.c b/cachegrind/cg-amd64.c index 8ea6ec1375..90bee9857a 100644 --- a/cachegrind/cg-amd64.c +++ b/cachegrind/cg-amd64.c @@ -28,339 +28,7 @@ The GNU General Public License is contained in the file COPYING. */ -#include "pub_tool_basics.h" -#include "pub_tool_cpuid.h" -#include "pub_tool_libcbase.h" -#include "pub_tool_libcassert.h" -#include "pub_tool_libcprint.h" - -#include "cg_arch.h" - -// All CPUID info taken from sandpile.org/a32/cpuid.htm */ -// Probably only works for Intel and AMD chips, and probably only for some of -// them. - -static void micro_ops_warn(Int actual_size, Int used_size, Int line_size) -{ - VG_(message)(Vg_DebugMsg, - "warning: Pentium 4 with %d KB micro-op instruction trace cache", - actual_size); - VG_(message)(Vg_DebugMsg, - " Simulating a %d KB I-cache with %d B lines", - used_size, line_size); -} - -/* Intel method is truly wretched. We have to do an insane indexing into an - * array of pre-defined configurations for various parts of the memory - * hierarchy. - * According to Intel Processor Identification, App Note 485. 
- */ -static -Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c) -{ - Int cpuid1_eax; - Int cpuid1_ignore; - Int family; - Int model; - UChar info[16]; - Int i, trials; - Bool L2_found = False; - - if (level < 2) { - VG_(message)(Vg_DebugMsg, - "warning: CPUID level < 2 for Intel processor (%d)", - level); - return -1; - } - - /* family/model needed to distinguish code reuse (currently 0x49) */ - VG_(cpuid)(1, &cpuid1_eax, &cpuid1_ignore, - &cpuid1_ignore, &cpuid1_ignore); - family = (((cpuid1_eax >> 20) & 0xff) << 4) + ((cpuid1_eax >> 8) & 0xf); - model = (((cpuid1_eax >> 16) & 0xf) << 4) + ((cpuid1_eax >> 4) & 0xf); - - VG_(cpuid)(2, (Int*)&info[0], (Int*)&info[4], - (Int*)&info[8], (Int*)&info[12]); - trials = info[0] - 1; /* AL register - bits 0..7 of %eax */ - info[0] = 0x0; /* reset AL */ - - if (0 != trials) { - VG_(message)(Vg_DebugMsg, - "warning: non-zero CPUID trials for Intel processor (%d)", - trials); - return -1; - } - - for (i = 0; i < 16; i++) { - - switch (info[i]) { - - case 0x0: /* ignore zeros */ - break; - - /* TLB info, ignore */ - case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: - case 0x4f: case 0x50: case 0x51: case 0x52: - case 0x56: case 0x57: case 0x59: - case 0x5b: case 0x5c: case 0x5d: - case 0xb0: case 0xb1: - case 0xb3: case 0xb4: case 0xba: case 0xc0: - break; - - case 0x06: *I1c = (cache_t) { 8, 4, 32 }; break; - case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break; - case 0x30: *I1c = (cache_t) { 32, 8, 64 }; break; - - case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break; - case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break; - case 0x0e: - /* Real D1 cache configuration is: - D1c = (cache_t) { 24, 6, 64 }; */ - VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb"); - *D1c = (cache_t) { 16, 4, 64 }; - break; - case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break; - - /* IA-64 info -- panic! */ - case 0x10: case 0x15: case 0x1a: - case 0x88: case 0x89: case 0x8a: case 0x8d: - case 0x90: case 0x96: case 0x9b: - VG_(tool_panic)("IA-64 cache detected?!"); - - case 0x22: case 0x23: case 0x25: case 0x29: - case 0x46: case 0x47: case 0x4a: case 0x4b: case 0x4c: case 0x4d: - VG_(message)(Vg_DebugMsg, - "warning: L3 cache detected but ignored"); - break; - - /* These are sectored, whatever that means */ - case 0x39: *L2c = (cache_t) { 128, 4, 64 }; L2_found = True; break; - case 0x3c: *L2c = (cache_t) { 256, 4, 64 }; L2_found = True; break; - - /* If a P6 core, this means "no L2 cache". - If a P4 core, this means "no L3 cache". - We don't know what core it is, so don't issue a warning. To detect - a missing L2 cache, we use 'L2_found'. 
*/ - case 0x40: - break; - - case 0x41: *L2c = (cache_t) { 128, 4, 32 }; L2_found = True; break; - case 0x42: *L2c = (cache_t) { 256, 4, 32 }; L2_found = True; break; - case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break; - case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break; - case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break; - case 0x48: - /* Real L2 cache configuration is: - *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */ - VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb"); - *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; - break; - case 0x49: - if ((family == 15) && (model == 6)) - /* On Xeon MP (family F, model 6), this is for L3 */ - VG_(message)(Vg_DebugMsg, - "warning: L3 cache detected but ignored"); - else - *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True; - break; - case 0x4e: - /* Real L2 cache configuration is: - *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */ - VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb"); - *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True; - break; - - /* These are sectored, whatever that means */ - case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */ - case 0x66: *D1c = (cache_t) { 8, 4, 64 }; break; /* sectored */ - case 0x67: *D1c = (cache_t) { 16, 4, 64 }; break; /* sectored */ - case 0x68: *D1c = (cache_t) { 32, 4, 64 }; break; /* sectored */ - - /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based. - * conversion to byte size is a total guess; treat the 12K and 16K - * cases the same since the cache byte size must be a power of two for - * everything to work!. Also guessing 32 bytes for the line size... - */ - case 0x70: /* 12K micro-ops, 8-way */ - *I1c = (cache_t) { 16, 8, 32 }; - micro_ops_warn(12, 16, 32); - break; - case 0x71: /* 16K micro-ops, 8-way */ - *I1c = (cache_t) { 16, 8, 32 }; - micro_ops_warn(16, 16, 32); - break; - case 0x72: /* 32K micro-ops, 8-way */ - *I1c = (cache_t) { 32, 8, 32 }; - micro_ops_warn(32, 32, 32); - break; - - /* These are sectored, whatever that means */ - case 0x79: *L2c = (cache_t) { 128, 8, 64 }; L2_found = True; break; - case 0x7a: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break; - case 0x7b: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break; - case 0x7c: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break; - case 0x7d: *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; break; - case 0x7e: *L2c = (cache_t) { 256, 8, 128 }; L2_found = True; break; - - case 0x7f: *L2c = (cache_t) { 512, 2, 64 }; L2_found = True; break; - case 0x80: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break; - - case 0x81: *L2c = (cache_t) { 128, 8, 32 }; L2_found = True; break; - case 0x82: *L2c = (cache_t) { 256, 8, 32 }; L2_found = True; break; - case 0x83: *L2c = (cache_t) { 512, 8, 32 }; L2_found = True; break; - case 0x84: *L2c = (cache_t) { 1024, 8, 32 }; L2_found = True; break; - case 0x85: *L2c = (cache_t) { 2048, 8, 32 }; L2_found = True; break; - case 0x86: *L2c = (cache_t) { 512, 4, 64 }; L2_found = True; break; - case 0x87: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break; - - /* Ignore prefetch information */ - case 0xf0: case 0xf1: - break; - - default: - VG_(message)(Vg_DebugMsg, - "warning: Unknown Intel cache config value " - "(0x%x), ignoring", info[i]); - break; - } - } - - if (!L2_found) - VG_(message)(Vg_DebugMsg, - "warning: L2 cache not installed, ignore L2 results."); - - return 0; -} - -/* AMD method is straightforward, just 
extract appropriate bits from the - * result registers. - * - * Bits, for D1 and I1: - * 31..24 data L1 cache size in KBs - * 23..16 data L1 cache associativity (FFh=full) - * 15.. 8 data L1 cache lines per tag - * 7.. 0 data L1 cache line size in bytes - * - * Bits, for L2: - * 31..16 unified L2 cache size in KBs - * 15..12 unified L2 cache associativity (0=off, FFh=full) - * 11.. 8 unified L2 cache lines per tag - * 7.. 0 unified L2 cache line size in bytes - * - * #3 The AMD K7 processor's L2 cache must be configured prior to relying - * upon this information. (Whatever that means -- njn) - * - * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model - * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB), - * so we detect that. - * - * Returns 0 on success, non-zero on failure. - */ -static -Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c) -{ - UInt ext_level; - UInt dummy, model; - UInt I1i, D1i, L2i; - - VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy); - - if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) { - VG_(message)(Vg_UserMsg, - "warning: ext_level < 0x80000006 for AMD processor (0x%x)", - ext_level); - return -1; - } - - VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i); - VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &dummy); - - VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy); - - /* Check for Duron bug */ - if (model == 0x630) { - VG_(message)(Vg_UserMsg, - "Buggy Duron stepping A0. Assuming L2 size=65536 bytes"); - L2i = (64 << 16) | (L2i & 0xffff); - } - - D1c->size = (D1i >> 24) & 0xff; - D1c->assoc = (D1i >> 16) & 0xff; - D1c->line_size = (D1i >> 0) & 0xff; - - I1c->size = (I1i >> 24) & 0xff; - I1c->assoc = (I1i >> 16) & 0xff; - I1c->line_size = (I1i >> 0) & 0xff; - - L2c->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */ - L2c->assoc = (L2i >> 12) & 0xf; - L2c->line_size = (L2i >> 0) & 0xff; - - return 0; -} - -static -Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c) -{ - Int level, ret; - Char vendor_id[13]; - - if (!VG_(has_cpuid)()) { - VG_(message)(Vg_DebugMsg, "CPUID instruction not supported"); - return -1; - } - - VG_(cpuid)(0, &level, (int*)&vendor_id[0], - (int*)&vendor_id[8], (int*)&vendor_id[4]); - vendor_id[12] = '\0'; - - if (0 == level) { - VG_(message)(Vg_DebugMsg, "CPUID level is 0, early Pentium?"); - return -1; - } - - /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */ - if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) { - ret = Intel_cache_info(level, I1c, D1c, L2c); - - } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) { - ret = AMD_cache_info(I1c, D1c, L2c); - - } else { - VG_(message)(Vg_DebugMsg, "CPU vendor ID not recognised (%s)", - vendor_id); - return -1; - } - - /* Successful! Convert sizes from KB to bytes */ - I1c->size *= 1024; - D1c->size *= 1024; - L2c->size *= 1024; - - return ret; -} - - -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c, - Bool all_caches_clo_defined) -{ - Int res; - - // Set caches to default. - *I1c = (cache_t) { 65536, 2, 64 }; - *D1c = (cache_t) { 65536, 2, 64 }; - *L2c = (cache_t) { 262144, 8, 64 }; - - // Then replace with any info we can get from CPUID. - res = get_caches_from_CPUID(I1c, D1c, L2c); - - // Warn if CPUID failed and config not completely specified from cmd line. 
- if (res != 0 && !all_caches_clo_defined) { - VG_(message)(Vg_DebugMsg, - "Warning: Couldn't auto-detect cache config, using one " - "or more defaults "); - } -} +#include "cg-x86.c" /*--------------------------------------------------------------------*/ /*--- end ---*/ diff --git a/cachegrind/cg-x86.c b/cachegrind/cg-x86.c index 164f944e61..7c6b8f362b 100644 --- a/cachegrind/cg-x86.c +++ b/cachegrind/cg-x86.c @@ -1,6 +1,6 @@ /*--------------------------------------------------------------------*/ -/*--- x86-specific definitions. cg-x86.c ---*/ +/*--- x86-specific (and AMD64-specific) definitions. cg-x86.c ---*/ /*--------------------------------------------------------------------*/ /* @@ -113,12 +113,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c) case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break; case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break; - case 0x0e: - /* Real D1 cache configuration is: - D1c = (cache_t) { 24, 6, 64 }; */ - VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb"); - *D1c = (cache_t) { 16, 4, 64 }; - break; + case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break; case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break; /* IA-64 info -- panic! */ @@ -149,12 +144,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c) case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break; case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break; case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break; - case 0x48: - /* Real L2 cache configuration is: - *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */ - VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb"); - *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; - break; + case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break; case 0x49: if ((family == 15) && (model == 6)) /* On Xeon MP (family F, model 6), this is for L3 */ @@ -163,12 +153,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c) else *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True; break; - case 0x4e: - /* Real L2 cache configuration is: - *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */ - VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb"); - *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True; - break; + case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break; /* These are sectored, whatever that means */ case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */ diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c index a30ba758d3..36ddbab430 100644 --- a/cachegrind/cg_main.c +++ b/cachegrind/cg_main.c @@ -1158,18 +1158,12 @@ static cache_t clo_L2_cache = UNDEFINED_CACHE; static void check_cache(cache_t* cache, Char *name) { - /* First check they're all powers of two */ - if (-1 == VG_(log2)(cache->size)) { + /* Simulator requires line size and set count to be powers of two */ + if (( cache->size % (cache->line_size * cache->assoc) != 0) || + (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) { VG_(message)(Vg_UserMsg, - "error: %s size of %dB not a power of two; aborting.", - name, cache->size); - VG_(exit)(1); - } - - if (-1 == VG_(log2)(cache->assoc)) { - VG_(message)(Vg_UserMsg, - "error: %s associativity of %d not a power of two; aborting.", - name, cache->assoc); + "error: %s set count not a power of two; aborting.", + name); VG_(exit)(1); } diff --git a/cachegrind/cg_sim.c b/cachegrind/cg_sim.c index 6edf12628f..25bef497ad 100644 
--- a/cachegrind/cg_sim.c +++ b/cachegrind/cg_sim.c @@ -44,7 +44,6 @@ typedef struct { Int line_size; /* bytes */ Int sets; Int sets_min_1; - Int assoc_bits; Int line_size_bits; Int tag_shift; Char desc_line[128]; @@ -62,7 +61,6 @@ static void cachesim_initcache(cache_t config, cache_t2* c) c->sets = (c->size / c->line_size) / c->assoc; c->sets_min_1 = c->sets - 1; - c->assoc_bits = VG_(log2)(c->assoc); c->line_size_bits = VG_(log2)(c->line_size); c->tag_shift = c->line_size_bits + VG_(log2)(c->sets); @@ -111,8 +109,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \ /* First case: word entirely within line. */ \ if (set1 == set2) { \ \ - /* Shifting is a bit faster than multiplying */ \ - set = &(L.tags[set1 << L.assoc_bits]); \ + set = &(L.tags[set1 * L.assoc]); \ \ /* This loop is unrolled for just the first case, which is the most */\ /* common. We can't unroll any further because it would screw up */\ @@ -143,7 +140,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \ /* Second case: word straddles two lines. */ \ /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ } else if (((set1 + 1) & (L.sets-1)) == set2) { \ - set = &(L.tags[set1 << L.assoc_bits]); \ + set = &(L.tags[set1 * L.assoc]); \ if (tag == set[0]) { \ goto block2; \ } \ @@ -162,7 +159,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \ set[0] = tag; \ is_miss = True; \ block2: \ - set = &(L.tags[set2 << L.assoc_bits]); \ + set = &(L.tags[set2 * L.assoc]); \ tag2 = (a+size-1) >> L.tag_shift; \ if (tag2 == set[0]) { \ goto miss_treatment; \ diff --git a/cachegrind/docs/cg-manual.xml b/cachegrind/docs/cg-manual.xml index f65272bbc2..512eeb409b 100644 --- a/cachegrind/docs/cg-manual.xml +++ b/cachegrind/docs/cg-manual.xml @@ -142,7 +142,7 @@ follows: - Bit-selection hash function: the line(s) in the cache + Bit-selection hash function: the set of line(s) in the cache to which a memory block maps is chosen by the middle bits M--(M+N-1) of the byte address, where: @@ -150,15 +150,17 @@ follows: line size = 2^M bytes - (cache size / line size) = 2^N bytes + (cache size / line size / associativity) = 2^N bytes - Inclusive L2 cache: the L2 cache replicates all the - entries of the L1 cache. This is standard on Pentium chips, - but AMD Opterons, Athlons and Durons + Inclusive L2 cache: the L2 cache typically replicates all + the entries of the L1 caches, because fetching into L1 involves + fetching into L2 first (this does not guarantee strict inclusiveness, + as lines evicted from L2 still could reside in L1). This is + standard on Pentium chips, but AMD Opterons, Athlons and Durons use an exclusive L2 cache that only holds blocks evicted from L1. Ditto most modern VIA CPUs. @@ -176,7 +178,10 @@ happens. You can manually specify one, two or all three levels (I1/D1/L2) of the cache from the command line using the --I1, --D1 and ---L2 options. +--L2 options. +For cache parameters to be valid for simulation, the number +of sets (with associativity being the number of cache lines in +each set) has to be a power of two. On PowerPC platforms Cachegrind cannot automatically @@ -227,10 +232,7 @@ need to specify it with the If you are interested in simulating a cache with different properties, it is not particularly hard to write your own cache simulator, or to modify the existing ones in -vg_cachesim_I1.c, -vg_cachesim_D1.c, -vg_cachesim_L2.c and -vg_cachesim_gen.c. We'd be +cg_sim.c. We'd be interested to hear from anyone who does. 
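The set-count rule documented above is easy to check by hand. The following minimal C sketch (illustrative helper names, not Valgrind code) mirrors it, using the 24 KB, 6-way, 64 B-line D1 configuration exercised by the new notpower2 tests:

#include <stdio.h>

/* Minimal sketch of the rule above: the cache size must divide evenly into
   assoc-sized sets, and the resulting set count must be a power of two.
   Helper names are illustrative; this is not the Valgrind implementation. */
static int is_pow2(long x) { return x > 0 && (x & (x - 1)) == 0; }

static int valid_cache(long size, long assoc, long line_size)
{
    if (size % (line_size * assoc) != 0)
        return 0;
    return is_pow2(size / line_size / assoc);
}

int main(void)
{
    /* D1 from the new notpower2 tests: 24576 B, 6-way, 64 B lines
       -> 24576 / 64 / 6 = 64 sets, a power of two, so accepted. */
    printf("24576,6,64 -> %s\n", valid_cache(24576, 6, 64) ? "ok" : "rejected");
    /* A 24 KB, 4-way cache would give 96 sets and be rejected. */
    printf("24576,4,64 -> %s\n", valid_cache(24576, 4, 64) ? "ok" : "rejected");
    return 0;
}

A 24 KB, 4-way configuration yields 96 sets and is refused, which matches the "set count not a power of two" error now produced by check_cache().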
diff --git a/cachegrind/tests/Makefile.am b/cachegrind/tests/Makefile.am index 4a40e770d6..de669623e7 100644 --- a/cachegrind/tests/Makefile.am +++ b/cachegrind/tests/Makefile.am @@ -24,6 +24,7 @@ EXTRA_DIST = $(noinst_SCRIPTS) \ chdir.vgtest chdir.stderr.exp \ clreq.vgtest clreq.stderr.exp \ dlclose.vgtest dlclose.stderr.exp dlclose.stdout.exp \ + notpower2.vgtest notpower2.stderr.exp \ wrap5.vgtest wrap5.stderr.exp wrap5.stdout.exp check_PROGRAMS = \ diff --git a/cachegrind/tests/notpower2.stderr.exp b/cachegrind/tests/notpower2.stderr.exp new file mode 100644 index 0000000000..8eaf65446c --- /dev/null +++ b/cachegrind/tests/notpower2.stderr.exp @@ -0,0 +1,17 @@ + + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git a/cachegrind/tests/notpower2.vgtest b/cachegrind/tests/notpower2.vgtest new file mode 100644 index 0000000000..132cfe5923 --- /dev/null +++ b/cachegrind/tests/notpower2.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 +cleanup: rm cachegrind.out.* diff --git a/callgrind/sim.c b/callgrind/sim.c index 506ed9e400..6e9de48966 100644 --- a/callgrind/sim.c +++ b/callgrind/sim.c @@ -74,7 +74,6 @@ typedef struct { Bool sectored; /* prefetch nearside cacheline on read */ int sets; int sets_min_1; - int assoc_bits; int line_size_bits; int tag_shift; UWord tag_mask; @@ -195,7 +194,6 @@ static void cachesim_initcache(cache_t config, cache_t2* c) c->sets = (c->size / c->line_size) / c->assoc; c->sets_min_1 = c->sets - 1; - c->assoc_bits = VG_(log2)(c->assoc); c->line_size_bits = VG_(log2)(c->line_size); c->tag_shift = c->line_size_bits + VG_(log2)(c->sets); c->tag_mask = ~((1<tag_shift)-1); @@ -259,8 +257,7 @@ CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag) int i, j; UWord *set; - /* Shifting is a bit faster than multiplying */ - set = &(c->tags[set_no << c->assoc_bits]); + set = &(c->tags[set_no * c->assoc]); /* This loop is unrolled for just the first case, which is the most */ /* common. We can't unroll any further because it would screw up */ @@ -359,8 +356,7 @@ CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag) int i, j; UWord *set, tmp_tag; - /* Shifting is a bit faster than multiplying */ - set = &(c->tags[set_no << c->assoc_bits]); + set = &(c->tags[set_no * c->assoc]); /* This loop is unrolled for just the first case, which is the most */ /* common. We can't unroll any further because it would screw up */ @@ -407,7 +403,7 @@ CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size) /* Access straddles two lines. */ /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ else if (((set1 + 1) & (c->sets-1)) == set2) { - UWord tag2 = (a+size-1) >> c->tag_shift; + UWord tag2 = (a+size-1) & c->tag_mask; /* the call updates cache structures as side effect */ CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag); @@ -676,7 +672,7 @@ void cacheuse_initcache(cache_t2* c) /* We use lower tag bits as offset pointers to cache use info. * I.e. some cache parameters don't work. 
*/ - if (c->tag_shift < c->assoc_bits) { + if ( (1<tag_shift) < c->assoc) { VG_(message)(Vg_DebugMsg, "error: Use associativity < %d for cache use statistics!", (1<tag_shift) ); @@ -684,102 +680,6 @@ void cacheuse_initcache(cache_t2* c) } } -/* FIXME: A little tricky */ -#if 0 - -static __inline__ -void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask) -{ - int idx = (high_idx << c->assoc_bits) | low_idx; - - c->use[idx].count ++; - c->use[idx].mask |= use_mask; - - CLG_DEBUG(6," Hit [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", - idx, c->loaded[idx].memline, c->loaded[idx].iaddr, - use_mask, c->use[idx].mask, c->use[idx].count); -} - -/* only used for I1, D1 */ - -static __inline__ -CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag) -{ - int i, j, idx; - UWord *set, tmp_tag; - UInt use_mask; - - /* Shifting is a bit faster than multiplying */ - set = &(c->tags[set_no << c->assoc_bits]); - use_mask = - c->line_start_mask[a & c->line_size_mask] & - c->line_end_mask[(a+size-1) & c->line_size_mask]; - - /* This loop is unrolled for just the first case, which is the most */ - /* common. We can't unroll any further because it would screw up */ - /* if we have a direct-mapped (1-way) cache. */ - if (tag == (set[0] & c->tag_mask)) { - cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask); - return L1_Hit; - } - - /* If the tag is one other than the MRU, move it into the MRU spot */ - /* and shuffle the rest down. */ - for (i = 1; i < c->assoc; i++) { - if (tag == (set[i] & c->tag_mask)) { - tmp_tag = set[i]; - for (j = i; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tmp_tag; - - cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask); - return L1_Hit; - } - } - - /* A miss; install this tag as MRU, shuffle rest down. */ - tmp_tag = set[L.assoc - 1] & ~c->tag_mask; - for (j = c->assoc - 1; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tag | tmp_tag; - - cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag, - use_mask, a & ~c->line_size_mask); - - return Miss; -} - - -static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size) -{ - UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1); - UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1); - UWord tag = a >> c->tag_shift; - - /* Access entirely within line. */ - if (set1 == set2) - return cacheuse_setref(c, set1, tag); - - /* Access straddles two lines. */ - /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ - else if (((set1 + 1) & (c->sets-1)) == set2) { - UWord tag2 = a >> c->tag_shift; - - /* the call updates cache structures as side effect */ - CacheResult res1 = cacheuse_isMiss(c, set1, tag); - CacheResult res2 = cacheuse_isMiss(c, set2, tag2); - return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; - - } else { - VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2); - VG_(tool_panic)("item straddles more than two cache sets"); - } - return Hit; -} -#endif - /* for I1/D1 caches */ #define CACHEUSE(L) \ @@ -800,8 +700,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ /* First case: word entirely within line. */ \ if (set1 == set2) { \ \ - /* Shifting is a bit faster than multiplying */ \ - set = &(L.tags[set1 << L.assoc_bits]); \ + set = &(L.tags[set1 * L.assoc]); \ use_mask = L.line_start_mask[a & L.line_size_mask] & \ L.line_end_mask[(a+size-1) & L.line_size_mask]; \ \ @@ -809,7 +708,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ /* common. 
We can't unroll any further because it would screw up */\ /* if we have a direct-mapped (1-way) cache. */\ if (tag == (set[0] & L.tag_mask)) { \ - idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -826,7 +725,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ set[j] = set[j - 1]; \ } \ set[0] = tmp_tag; \ - idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -842,7 +741,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ set[j] = set[j - 1]; \ } \ set[0] = tag | tmp_tag; \ - idx = (set1 << L.assoc_bits) | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ return update_##L##_use(&L, idx, \ use_mask, a &~ L.line_size_mask); \ \ @@ -850,10 +749,10 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ } else if (((set1 + 1) & (L.sets-1)) == set2) { \ Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \ - set = &(L.tags[set1 << L.assoc_bits]); \ + set = &(L.tags[set1 * L.assoc]); \ use_mask = L.line_start_mask[a & L.line_size_mask]; \ if (tag == (set[0] & L.tag_mask)) { \ - idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -868,7 +767,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ set[j] = set[j - 1]; \ } \ set[0] = tmp_tag; \ - idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -882,15 +781,15 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ set[j] = set[j - 1]; \ } \ set[0] = tag | tmp_tag; \ - idx = (set1 << L.assoc_bits) | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ miss1 = update_##L##_use(&L, idx, \ use_mask, a &~ L.line_size_mask); \ block2: \ - set = &(L.tags[set2 << L.assoc_bits]); \ + set = &(L.tags[set2 * L.assoc]); \ use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \ tag2 = (a+size-1) & L.tag_mask; \ if (tag2 == (set[0] & L.tag_mask)) { \ - idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -905,7 +804,7 @@ block2: \ set[j] = set[j - 1]; \ } \ set[0] = tmp_tag; \ - idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -919,7 +818,7 @@ block2: \ set[j] = set[j - 1]; \ } \ set[0] = tag2 | tmp_tag; \ - idx = (set2 << L.assoc_bits) | tmp_tag; \ + idx = (set2 * L.assoc) + tmp_tag; \ miss2 = update_##L##_use(&L, idx, \ use_mask, (a+size-1) &~ L.line_size_mask); \ return (miss1==MemAccess || miss2==MemAccess) ? 
MemAccess:L2_Hit; \ @@ -984,7 +883,7 @@ static CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) { UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1); - UWord* set = &(L2.tags[setNo << L2.assoc_bits]); + UWord* set = &(L2.tags[setNo * L2.assoc]); UWord tag = memline & L2.tag_mask; int i, j, idx; @@ -993,7 +892,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo); if (tag == (set[0] & L2.tag_mask)) { - idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask); + idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask); l1_loaded->dep_use = &(L2.use[idx]); CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n", @@ -1008,7 +907,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) set[j] = set[j - 1]; } set[0] = tmp_tag; - idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask); + idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask); l1_loaded->dep_use = &(L2.use[idx]); CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n", @@ -1024,7 +923,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) set[j] = set[j - 1]; } set[0] = tag | tmp_tag; - idx = (setNo << L2.assoc_bits) | tmp_tag; + idx = (setNo * L2.assoc) + tmp_tag; l1_loaded->dep_use = &(L2.use[idx]); update_L2_use(idx, memline); @@ -1380,22 +1279,15 @@ static cache_t clo_L2_cache = UNDEFINED_CACHE; static void check_cache(cache_t* cache, Char *name) { - /* First check they're all powers of two */ - if (-1 == VG_(log2)(cache->size)) { - VG_(message)(Vg_UserMsg, - "error: %s size of %dB not a power of two; aborting.", - name, cache->size); - VG_(exit)(1); - } - - if (-1 == VG_(log2)(cache->assoc)) { + /* Simulator requires line size and set count to be powers of two */ + if (( cache->size % (cache->line_size * cache->assoc) != 0) || + (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) { VG_(message)(Vg_UserMsg, - "error: %s associativity of %d not a power of two; aborting.", - name, cache->assoc); - VG_(exit)(1); + "error: %s set count not a power of two; aborting.", + name); } - if (-1 == VG_(log2)(cache->line_size)) { + if (-1 == VG_(log2)(cache->line_size)) { VG_(message)(Vg_UserMsg, "error: %s line size of %dB not a power of two; aborting.", name, cache->line_size); diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am index b95cfc3cf9..1c3f42c070 100644 --- a/callgrind/tests/Makefile.am +++ b/callgrind/tests/Makefile.am @@ -11,6 +11,10 @@ EXTRA_DIST = $(noinst_SCRIPTS) \ simwork1.vgtest simwork1.stdout.exp simwork1.stderr.exp \ simwork2.vgtest simwork2.stdout.exp simwork2.stderr.exp \ simwork3.vgtest simwork3.stdout.exp simwork3.stderr.exp \ + notpower2.vgtest notpower2.stderr.exp \ + notpower2-wb.vgtest notpower2-wb.stderr.exp \ + notpower2-hwpref.vgtest notpower2-hwpref.stderr.exp \ + notpower2-use.vgtest notpower2-use.stderr.exp \ threads.vgtest threads.stderr.exp check_PROGRAMS = clreq simwork threads diff --git a/callgrind/tests/notpower2-hwpref.stderr.exp b/callgrind/tests/notpower2-hwpref.stderr.exp new file mode 100644 index 0000000000..0705c1c849 --- /dev/null +++ b/callgrind/tests/notpower2-hwpref.stderr.exp @@ -0,0 +1,20 @@ + + +Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw +Collected : + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git 
a/callgrind/tests/notpower2-hwpref.vgtest b/callgrind/tests/notpower2-hwpref.vgtest new file mode 100644 index 0000000000..9da7dced2a --- /dev/null +++ b/callgrind/tests/notpower2-hwpref.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes +cleanup: rm callgrind.out.* diff --git a/callgrind/tests/notpower2-use.stderr.exp b/callgrind/tests/notpower2-use.stderr.exp new file mode 100644 index 0000000000..ea9acc89b5 --- /dev/null +++ b/callgrind/tests/notpower2-use.stderr.exp @@ -0,0 +1,20 @@ + + +Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2 +Collected : + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git a/callgrind/tests/notpower2-use.vgtest b/callgrind/tests/notpower2-use.vgtest new file mode 100644 index 0000000000..b8312a76be --- /dev/null +++ b/callgrind/tests/notpower2-use.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes +cleanup: rm callgrind.out.* diff --git a/callgrind/tests/notpower2-wb.stderr.exp b/callgrind/tests/notpower2-wb.stderr.exp new file mode 100644 index 0000000000..90da3e4cec --- /dev/null +++ b/callgrind/tests/notpower2-wb.stderr.exp @@ -0,0 +1,20 @@ + + +Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw +Collected : + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git a/callgrind/tests/notpower2-wb.vgtest b/callgrind/tests/notpower2-wb.vgtest new file mode 100644 index 0000000000..34a1f6b335 --- /dev/null +++ b/callgrind/tests/notpower2-wb.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes +cleanup: rm callgrind.out.* diff --git a/callgrind/tests/notpower2.stderr.exp b/callgrind/tests/notpower2.stderr.exp new file mode 100644 index 0000000000..0705c1c849 --- /dev/null +++ b/callgrind/tests/notpower2.stderr.exp @@ -0,0 +1,20 @@ + + +Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw +Collected : + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git a/callgrind/tests/notpower2.vgtest b/callgrind/tests/notpower2.vgtest new file mode 100644 index 0000000000..73823d7493 --- /dev/null +++ b/callgrind/tests/notpower2.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 +cleanup: rm callgrind.out.*
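A change that recurs throughout cg_sim.c and sim.c above is replacing 'set_no << assoc_bits' with 'set_no * assoc', since assoc_bits only exists when the associativity is a power of two. The standalone sketch below (simplified types, no MRU shuffling; names are illustrative, not Valgrind's) shows the flat tag-array layout that this indexing assumes:

#include <stdlib.h>

/* Simplified model of the tag-array layout the patched code relies on:
   sets * assoc tags stored contiguously, one run of 'assoc' tags per set.
   Because assoc may not be a power of two, the base of a set is found by
   multiplication (set_no * assoc) rather than a shift by assoc_bits. */
typedef struct {
    int assoc;             /* lines per set; any value >= 1       */
    int sets;              /* still required to be a power of two */
    unsigned long *tags;
} toy_cache;

static int toy_is_hit(const toy_cache *c, int set_no, unsigned long tag)
{
    const unsigned long *set = &c->tags[(size_t)set_no * c->assoc];
    for (int i = 0; i < c->assoc; i++)
        if (set[i] == tag)
            return 1;      /* hit */
    return 0;              /* miss; the real simulator would install
                              the tag as MRU and shuffle the rest down */
}

int main(void)
{
    toy_cache c = { .assoc = 6, .sets = 64, .tags = NULL };
    c.tags = calloc((size_t)c.sets * (size_t)c.assoc, sizeof *c.tags);
    if (!c.tags)
        return 1;
    (void)toy_is_hit(&c, 3, 0x1234);   /* cold cache: a miss */
    free(c.tags);
    return 0;
}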