From: Josef Weidendorfer
Date: Mon, 26 Jan 2009 22:56:14 +0000 (+0000)
Subject: Cachegrind/Callgrind: allow for cache sizes other than only powers of two
X-Git-Tag: svn/VALGRIND_3_5_0~1033
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=3a5719678f985fc6967aee2232882d645a0714fb;p=thirdparty%2Fvalgrind.git

Cachegrind/Callgrind: allow for cache sizes other than only powers of two

The number of sets, i.e. the number of cache lines divided by the
associativity, and the cache line size still have to be powers of two.
This change is needed for the default cache parameters used on some
Intel Core 2 and Atom processors.

Includes a cachegrind manual update and explicit tests with 24KB D1 / 3MB L2.
Reverts the addition of the 6MB warning to {cachegrind,callgrind}/tests/filter_stderr.

Backporting to VALGRIND_3_4_BRANCH needs r8912.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@9080
---
diff --git a/cachegrind/cg-x86.c b/cachegrind/cg-x86.c
index 873c351537..eceec6d3bd 100644
--- a/cachegrind/cg-x86.c
+++ b/cachegrind/cg-x86.c
@@ -113,12 +113,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
       case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break;
       case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
-      case 0x0e:
-         /* Real D1 cache configuration is:
-            D1c = (cache_t) { 24, 6, 64 }; */
-         VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb");
-         *D1c = (cache_t) { 16, 4, 64 };
-         break;
+      case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break;
       case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break;
 
       /* IA-64 info -- panic! */
@@ -149,12 +144,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
       case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break;
       case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
       case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
-      case 0x48:
-         /* Real L2 cache configuration is:
-            *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */
-         VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb");
-         *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True;
-         break;
+      case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break;
       case 0x49:
          if ((family == 15) && (model == 6))
            /* On Xeon MP (family F, model 6), this is for L3 */
@@ -163,12 +153,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
         else
           *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
         break;
-      case 0x4e:
-         /* Real L2 cache configuration is:
-            *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */
-         VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb");
-         *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
-         break;
+      case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
 
       /* These are sectored, whatever that means */
       case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index a30ba758d3..36ddbab430 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -1158,18 +1158,12 @@ static cache_t clo_L2_cache = UNDEFINED_CACHE;
 
 static void check_cache(cache_t* cache, Char *name)
 {
-   /* First check they're all powers of two */
-   if (-1 == VG_(log2)(cache->size)) {
+   /* Simulator requires line size and set count to be powers of two */
+   if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
+       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
       VG_(message)(Vg_UserMsg,
-         "error: %s size of %dB not a power of two; aborting.",
-         name, cache->size);
-      VG_(exit)(1);
-   }
-
-   if (-1 == VG_(log2)(cache->assoc)) {
-      VG_(message)(Vg_UserMsg,
-         "error: %s associativity of %d not a power of two; aborting.",
-         name, cache->assoc);
+         "error: %s set count not a power of two; aborting.",
+         name);
       VG_(exit)(1);
    }
diff --git a/cachegrind/cg_sim.c b/cachegrind/cg_sim.c
index 6edf12628f..25bef497ad 100644
--- a/cachegrind/cg_sim.c
+++ b/cachegrind/cg_sim.c
@@ -44,7 +44,6 @@ typedef struct {
    Int          line_size;       /* bytes */
    Int          sets;
    Int          sets_min_1;
-   Int          assoc_bits;
    Int          line_size_bits;
    Int          tag_shift;
    Char         desc_line[128];
@@ -62,7 +61,6 @@ static void cachesim_initcache(cache_t config, cache_t2* c)
 
    c->sets           = (c->size / c->line_size) / c->assoc;
    c->sets_min_1     = c->sets - 1;
-   c->assoc_bits     = VG_(log2)(c->assoc);
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
 
@@ -111,8 +109,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \
    /* First case: word entirely within line. */ \
    if (set1 == set2) { \
 \
-      /* Shifting is a bit faster than multiplying */ \
-      set = &(L.tags[set1 << L.assoc_bits]); \
+      set = &(L.tags[set1 * L.assoc]); \
 \
       /* This loop is unrolled for just the first case, which is the most */\
       /* common. We can't unroll any further because it would screw up */\
@@ -143,7 +140,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \
    /* Second case: word straddles two lines. */ \
    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
    } else if (((set1 + 1) & (L.sets-1)) == set2) { \
-      set = &(L.tags[set1 << L.assoc_bits]); \
+      set = &(L.tags[set1 * L.assoc]); \
       if (tag == set[0]) { \
          goto block2; \
       } \
@@ -162,7 +159,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \
       set[0] = tag; \
       is_miss = True; \
 block2: \
-      set = &(L.tags[set2 << L.assoc_bits]); \
+      set = &(L.tags[set2 * L.assoc]); \
       tag2 = (a+size-1) >> L.tag_shift; \
       if (tag2 == set[0]) { \
          goto miss_treatment; \
       } \
diff --git a/cachegrind/docs/cg-manual.xml b/cachegrind/docs/cg-manual.xml
index f65272bbc2..512eeb409b 100644
--- a/cachegrind/docs/cg-manual.xml
+++ b/cachegrind/docs/cg-manual.xml
@@ -142,7 +142,7 @@ follows:
-  Bit-selection hash function: the line(s) in the cache
+  Bit-selection hash function: the set of line(s) in the cache
   to which a memory block maps is chosen by the middle bits
   M--(M+N-1) of the byte address, where:
@@ -150,15 +150,17 @@ follows:
       line size = 2^M bytes
-      (cache size / line size) = 2^N bytes
+      (cache size / line size / associativity) = 2^N bytes
 
-  Inclusive L2 cache: the L2 cache replicates all the
-  entries of the L1 cache. This is standard on Pentium chips,
-  but AMD Opterons, Athlons and Durons
+  Inclusive L2 cache: the L2 cache typically replicates all
+  the entries of the L1 caches, because fetching into L1 involves
+  fetching into L2 first (this does not guarantee strict inclusiveness,
+  as lines evicted from L2 still could reside in L1). This is
+  standard on Pentium chips, but AMD Opterons, Athlons and Durons
   use an exclusive L2 cache that only holds blocks evicted from L1.
   Ditto most modern VIA CPUs.
@@ -176,7 +178,10 @@ happens.
 You can manually specify one, two or all three levels (I1/D1/L2) of
 the cache from the command line using the --I1,
 --D1 and
---L2 options.
+--L2 options.
+For cache parameters to be valid for simulation, the number
+of sets (with associativity being the number of cache lines in
+each set) has to be a power of two.
 
 On PowerPC platforms Cachegrind cannot automatically
@@ -227,10 +232,7 @@ need to specify it with the
 If you are interested in simulating a cache with different
 properties, it is not particularly hard to write your own cache
 simulator, or to modify the existing ones in
-vg_cachesim_I1.c,
-vg_cachesim_D1.c,
-vg_cachesim_L2.c and
-vg_cachesim_gen.c. We'd be
+cg_sim.c. We'd be
 interested to hear from anyone who does.
diff --git a/cachegrind/tests/Makefile.am b/cachegrind/tests/Makefile.am
index 0ffc4da6dc..8e568cc379 100644
--- a/cachegrind/tests/Makefile.am
+++ b/cachegrind/tests/Makefile.am
@@ -15,6 +15,7 @@ EXTRA_DIST = $(noinst_SCRIPTS) \
 	chdir.vgtest chdir.stderr.exp \
 	clreq.vgtest clreq.stderr.exp \
 	dlclose.vgtest dlclose.stderr.exp dlclose.stdout.exp \
+	notpower2.vgtest notpower2.stderr.exp \
 	wrap5.vgtest wrap5.stderr.exp wrap5.stdout.exp
 
 check_PROGRAMS = \
diff --git a/cachegrind/tests/filter_stderr b/cachegrind/tests/filter_stderr
index 43efa2d6dc..a5bc1f4411 100755
--- a/cachegrind/tests/filter_stderr
+++ b/cachegrind/tests/filter_stderr
@@ -17,5 +17,4 @@ sed "s/\(\(I1\|D1\|L2\|L2i\|L2d\) *\(misses\|miss rate\):\)[ 0-9,()+rdw%\.]*$/\1
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
 sed "/Simulating a 16 KB I-cache with 32 B lines/d" |
 sed "/warning: L3 cache detected but ignored/d" |
-sed "/warning: 6Mb L2 cache detected, treating as 4Mb/d" |
 sed "/Warning: Cannot auto-detect cache config on PPC.., using one or more defaults/d"
diff --git a/cachegrind/tests/notpower2.stderr.exp b/cachegrind/tests/notpower2.stderr.exp
new file mode 100644
index 0000000000..8eaf65446c
--- /dev/null
+++ b/cachegrind/tests/notpower2.stderr.exp
@@ -0,0 +1,17 @@
+
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/cachegrind/tests/notpower2.vgtest b/cachegrind/tests/notpower2.vgtest
new file mode 100644
index 0000000000..132cfe5923
--- /dev/null
+++ b/cachegrind/tests/notpower2.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+cleanup: rm cachegrind.out.*
diff --git a/callgrind/sim.c b/callgrind/sim.c
index 506ed9e400..29f72bd7a6 100644
--- a/callgrind/sim.c
+++ b/callgrind/sim.c
@@ -74,7 +74,6 @@ typedef struct {
   Bool sectored;  /* prefetch nearside cacheline on read */
   int sets;
   int sets_min_1;
-  int assoc_bits;
   int line_size_bits;
   int tag_shift;
   UWord tag_mask;
@@ -195,7 +194,6 @@ static void cachesim_initcache(cache_t config, cache_t2* c)
 
   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
-  c->assoc_bits     = VG_(log2)(c->assoc);
  c->line_size_bits = VG_(log2)(c->line_size);
  c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
  c->tag_mask       = ~((1<<c->tag_shift)-1);
@@ -259,8 +257,7 @@ CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
   int i, j;
   UWord *set;
 
-  /* Shifting is a bit faster than multiplying */
-  set = &(c->tags[set_no << c->assoc_bits]);
+  set = &(c->tags[set_no * c->assoc]);
 
   /* This loop is unrolled for just the first case, which is the most */
   /* common. We can't unroll any further because it would screw up */
@@ -359,8 +356,7 @@ CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
   int i, j;
   UWord *set, tmp_tag;
 
-  /* Shifting is a bit faster than multiplying */
-  set = &(c->tags[set_no << c->assoc_bits]);
+  set = &(c->tags[set_no * c->assoc]);
 
   /* This loop is unrolled for just the first case, which is the most */
   /* common. We can't unroll any further because it would screw up */
@@ -407,7 +403,7 @@ CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
-    UWord tag2 = (a+size-1) >> c->tag_shift;
+    UWord tag2 = (a+size-1) & c->tag_mask;
 
     /* the call updates cache structures as side effect */
     CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
@@ -676,7 +672,7 @@ void cacheuse_initcache(cache_t2* c)
   /* We use lower tag bits as offset pointers to cache use info.
    * I.e. some cache parameters don't work.
    */
-  if (c->tag_shift < c->assoc_bits) {
+  if ( (1<<c->tag_shift) < c->assoc) {
    VG_(message)(Vg_DebugMsg,
                 "error: Use associativity < %d for cache use statistics!",
                 (1<<c->tag_shift) );
@@ -690,7 +686,7 @@ static __inline__
 void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
 {
-    int idx = (high_idx << c->assoc_bits) | low_idx;
+    int idx = (high_idx * c->assoc) + low_idx;
 
    c->use[idx].count ++;
    c->use[idx].mask |= use_mask;
@@ -709,8 +705,7 @@ CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag)
   UWord *set, tmp_tag;
   UInt use_mask;
 
-  /* Shifting is a bit faster than multiplying */
-  set = &(c->tags[set_no << c->assoc_bits]);
+  set = &(c->tags[set_no * c->assoc]);
 
   use_mask = c->line_start_mask[a & c->line_size_mask] &
              c->line_end_mask[(a+size-1) & c->line_size_mask];
@@ -745,7 +740,7 @@ CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag)
   }
   set[0] = tag | tmp_tag;
 
-  cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
+  cacheuse_L2_miss(c, (set_no * c->assoc) | tmp_tag,
                    use_mask, a & ~c->line_size_mask);
 
   return Miss;
@@ -756,7 +751,7 @@ static
 CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
 {
     UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
-    UWord tag = a >> c->tag_shift;
+    UWord tag = a & c->tag_mask;
 
    /* Access entirely within line. */
    if (set1 == set2)
@@ -765,7 +760,7 @@ static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
-      UWord tag2 = a >> c->tag_shift;
+      UWord tag2 = a & c->tag_mask;
 
      /* the call updates cache structures as side effect */
      CacheResult res1 = cacheuse_isMiss(c, set1, tag);
@@ -800,8 +795,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
    /* First case: word entirely within line. */ \
    if (set1 == set2) { \
 \
-      /* Shifting is a bit faster than multiplying */ \
-      set = &(L.tags[set1 << L.assoc_bits]); \
+      set = &(L.tags[set1 * L.assoc]); \
       use_mask = L.line_start_mask[a & L.line_size_mask] & \
                  L.line_end_mask[(a+size-1) & L.line_size_mask]; \
 \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common. We can't unroll any further because it would screw up */\
      /* if we have a direct-mapped (1-way) cache. */\
      if (tag == (set[0] & L.tag_mask)) { \
-        idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+        idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -826,7 +820,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
            set[j] = set[j - 1]; \
         } \
         set[0] = tmp_tag; \
-        idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+        idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -842,7 +836,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
-     idx = (set1 << L.assoc_bits) | tmp_tag; \
+     idx = (set1 * L.assoc) + tmp_tag; \
      return update_##L##_use(&L, idx, \
                              use_mask, a &~ L.line_size_mask); \
 \
@@ -850,10 +844,10 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
    } else if (((set1 + 1) & (L.sets-1)) == set2) { \
       Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
-      set = &(L.tags[set1 << L.assoc_bits]); \
+      set = &(L.tags[set1 * L.assoc]); \
       use_mask = L.line_start_mask[a & L.line_size_mask]; \
       if (tag == (set[0] & L.tag_mask)) { \
-        idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+        idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -868,7 +862,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
            set[j] = set[j - 1]; \
         } \
         set[0] = tmp_tag; \
-        idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+        idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -882,15 +876,15 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
-     idx = (set1 << L.assoc_bits) | tmp_tag; \
+     idx = (set1 * L.assoc) + tmp_tag; \
      miss1 = update_##L##_use(&L, idx, \
                               use_mask, a &~ L.line_size_mask); \
 block2: \
-     set = &(L.tags[set2 << L.assoc_bits]); \
+     set = &(L.tags[set2 * L.assoc]); \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
      tag2 = (a+size-1) & L.tag_mask; \
      if (tag2 == (set[0] & L.tag_mask)) { \
-        idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+        idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -905,7 +899,7 @@ block2: \
            set[j] = set[j - 1]; \
         } \
         set[0] = tmp_tag; \
-        idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+        idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
@@ -919,7 +913,7 @@ block2: \
            set[j] = set[j - 1]; \
         } \
         set[0] = tag2 | tmp_tag; \
-        idx = (set2 << L.assoc_bits) | tmp_tag; \
+        idx = (set2 * L.assoc) + tmp_tag; \
         miss2 = update_##L##_use(&L, idx, \
                                  use_mask, (a+size-1) &~ L.line_size_mask); \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
@@ -984,7 +978,7 @@ static
 CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
 {
    UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
-   UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
+   UWord* set = &(L2.tags[setNo * L2.assoc]);
    UWord tag = memline & L2.tag_mask;
 
    int i, j, idx;
@@ -993,7 +987,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
    CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
 
    if (tag == (set[0] & L2.tag_mask)) {
-      idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
+      idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
       l1_loaded->dep_use = &(L2.use[idx]);
 
       CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
@@ -1008,7 +1002,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
          set[j] = set[j - 1];
       }
       set[0] = tmp_tag;
-      idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
+      idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
       l1_loaded->dep_use = &(L2.use[idx]);
 
       CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
@@ -1024,7 +1018,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
       set[j] = set[j - 1];
    }
    set[0] = tag | tmp_tag;
-   idx = (setNo << L2.assoc_bits) | tmp_tag;
+   idx = (setNo * L2.assoc) + tmp_tag;
    l1_loaded->dep_use = &(L2.use[idx]);
 
    update_L2_use(idx, memline);
@@ -1380,22 +1374,15 @@ static cache_t clo_L2_cache = UNDEFINED_CACHE;
 
 static void check_cache(cache_t* cache, Char *name)
 {
-   /* First check they're all powers of two */
-   if (-1 == VG_(log2)(cache->size)) {
+   /* Simulator requires line size and set count to be powers of two */
+   if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
+       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
       VG_(message)(Vg_UserMsg,
-         "error: %s size of %dB not a power of two; aborting.",
-         name, cache->size);
-      VG_(exit)(1);
-   }
-
-   if (-1 == VG_(log2)(cache->assoc)) {
-      VG_(message)(Vg_UserMsg,
-         "error: %s associativity of %d not a power of two; aborting.",
-         name, cache->assoc);
-      VG_(exit)(1);
+         "error: %s set count not a power of two; aborting.",
+         name);
    }
-   if (-1 == VG_(log2)(cache->line_size)) {
+   if (-1 == VG_(log2)(cache->line_size)) {
      VG_(message)(Vg_UserMsg,
        "error: %s line size of %dB not a power of two; aborting.",
        name, cache->line_size);
diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am
index b95cfc3cf9..1c3f42c070 100644
--- a/callgrind/tests/Makefile.am
+++ b/callgrind/tests/Makefile.am
@@ -11,6 +11,10 @@ EXTRA_DIST = $(noinst_SCRIPTS) \
 	simwork1.vgtest simwork1.stdout.exp simwork1.stderr.exp \
 	simwork2.vgtest simwork2.stdout.exp simwork2.stderr.exp \
 	simwork3.vgtest simwork3.stdout.exp simwork3.stderr.exp \
+	notpower2.vgtest notpower2.stderr.exp \
+	notpower2-wb.vgtest notpower2-wb.stderr.exp \
+	notpower2-hwpref.vgtest notpower2-hwpref.stderr.exp \
+	notpower2-use.vgtest notpower2-use.stderr.exp \
 	threads.vgtest threads.stderr.exp
 
 check_PROGRAMS = clreq simwork threads
diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr
index 1023bc6fe0..7b69674f50 100755
--- a/callgrind/tests/filter_stderr
+++ b/callgrind/tests/filter_stderr
@@ -23,5 +23,4 @@ sed "s/\(\(I1\|D1\|L2\|L2i\|L2d\) *\(misses\|miss rate\):\)[ 0-9,()+rdw%\.]*$/\1
 sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
 sed "/Simulating a 16 KB I-cache with 32 B lines/d" |
 sed "/warning: L3 cache detected but ignored/d" |
-sed "/warning: 6Mb L2 cache detected, treating as 4Mb/d" |
 sed "/Warning: Cannot auto-detect cache config on PPC.., using one or more defaults/d"
diff --git a/callgrind/tests/notpower2-hwpref.stderr.exp b/callgrind/tests/notpower2-hwpref.stderr.exp
new file mode 100644
index 0000000000..0705c1c849
--- /dev/null
+++ b/callgrind/tests/notpower2-hwpref.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/notpower2-hwpref.vgtest b/callgrind/tests/notpower2-hwpref.vgtest
new file mode 100644
index 0000000000..9da7dced2a
--- /dev/null
+++ b/callgrind/tests/notpower2-hwpref.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2-use.stderr.exp b/callgrind/tests/notpower2-use.stderr.exp
new file mode 100644
index 0000000000..ea9acc89b5
--- /dev/null
+++ b/callgrind/tests/notpower2-use.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/notpower2-use.vgtest b/callgrind/tests/notpower2-use.vgtest
new file mode 100644
index 0000000000..b8312a76be
--- /dev/null
+++ b/callgrind/tests/notpower2-use.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2-wb.stderr.exp b/callgrind/tests/notpower2-wb.stderr.exp
new file mode 100644
index 0000000000..90da3e4cec
--- /dev/null
+++ b/callgrind/tests/notpower2-wb.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/notpower2-wb.vgtest b/callgrind/tests/notpower2-wb.vgtest
new file mode 100644
index 0000000000..34a1f6b335
--- /dev/null
+++ b/callgrind/tests/notpower2-wb.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes
+cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2.stderr.exp b/callgrind/tests/notpower2.stderr.exp
new file mode 100644
index 0000000000..0705c1c849
--- /dev/null
+++ b/callgrind/tests/notpower2.stderr.exp
@@ -0,0 +1,20 @@
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
diff --git a/callgrind/tests/notpower2.vgtest b/callgrind/tests/notpower2.vgtest
new file mode 100644
index 0000000000..73823d7493
--- /dev/null
+++ b/callgrind/tests/notpower2.vgtest
@@ -0,0 +1,3 @@
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+cleanup: rm callgrind.out.*
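
Note on the relaxed rule described in the commit message and manual hunk above: only the cache line size and the derived set count (size / line size / associativity) still have to be powers of two. The following standalone C sketch is not part of the patch (is_pow2 and params_ok are made-up helper names); it merely mirrors the condition introduced in check_cache() and works through the Core 2/Atom-style defaults exercised by the notpower2 tests.

#include <stdio.h>

/* Sketch of the relaxed validity rule: line size and set count must be
 * powers of two; the total size and associativity no longer have to be. */
static int is_pow2(long x)
{
   return x > 0 && (x & (x - 1)) == 0;
}

static int params_ok(long size, long assoc, long line_size)
{
   if (!is_pow2(line_size))             return 0;  /* line size must be 2^M    */
   if (size % (line_size * assoc) != 0) return 0;  /* set count must be whole  */
   return is_pow2(size / line_size / assoc);       /* set count must be 2^N    */
}

int main(void)
{
   /* D1: 24KB, 6-way, 64B lines -> 24576/64/6 = 64 sets = 2^6  */
   printf("D1 24KB/6-way/64B : %s\n", params_ok(24576, 6, 64) ? "accepted" : "rejected");
   /* L2: 3MB, 12-way, 64B lines -> 3145728/64/12 = 4096 sets = 2^12 */
   printf("L2 3MB/12-way/64B : %s\n", params_ok(3145728, 12, 64) ? "accepted" : "rejected");
   /* A hypothetical 24KB, 8-way, 64B cache gives 48 sets and is still rejected */
   printf("24KB/8-way/64B    : %s\n", params_ok(24576, 8, 64) ? "accepted" : "rejected");
   return 0;
}

With the new defaults both configurations pass the check, which is exactly why the old "treating as 16Kb/2Mb/4Mb" fallbacks in cg-x86.c could be removed.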
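A second note, on why assoc_bits disappears throughout cg_sim.c and callgrind/sim.c: the bit-selection hash from the manual still works unchanged, because the set count remains a power of two, but the base index of a set's ways in the flat tags[] array has to switch from a shift to a multiplication once the associativity may be 6, 12 or 24. This is a toy illustration under those assumptions (toy_cache, set_of and first_way are invented names, not Valgrind code).

#include <stdio.h>

typedef struct {
   int assoc;           /* ways per set; need not be a power of two */
   int line_size_bits;  /* log2(line size)                          */
   int sets_min_1;      /* sets - 1; valid because sets is 2^N      */
} toy_cache;

/* Set selection: still a pure bit-selection hash on the address. */
static unsigned set_of(const toy_cache *c, unsigned long addr)
{
   return (addr >> c->line_size_bits) & c->sets_min_1;
}

/* Index of the first way of a set in a flat tag array:
 * multiplication instead of the old "set_no << assoc_bits". */
static unsigned first_way(const toy_cache *c, unsigned set_no)
{
   return set_no * c->assoc;
}

int main(void)
{
   toy_cache d1 = { 6, 6, 63 };  /* 24KB, 6-way, 64B lines -> 64 sets */
   unsigned long addr = 0x804842aUL;
   unsigned set = set_of(&d1, addr);

   printf("addr %#lx -> set %u, ways at tags[%u..%u]\n",
          addr, set, first_way(&d1, set), first_way(&d1, set) + d1.assoc - 1);
   return 0;
}

The multiplication is marginally slower than the shift it replaces, which is the trade-off the deleted "Shifting is a bit faster than multiplying" comments were pointing at.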