case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break; /* fields: size in KB, assoc, line size in bytes */
case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
- case 0x0e:
- /* Real D1 cache configuration is:
- D1c = (cache_t) { 24, 6, 64 }; */
- VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb");
- *D1c = (cache_t) { 16, 4, 64 };
- break;
+ case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break;
case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break;
/* IA-64 info -- panic! */
case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break;
case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
- case 0x48:
- /* Real L2 cache configuration is:
- *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */
- VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb");
- *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True;
- break;
+ case 0x48: *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; break;
case 0x49:
    if ((family == 15) && (model == 6))
        /* On Xeon MP (family F, model 6), this is for L3 */
        VG_(message)(Vg_DebugMsg, "warning: L3 cache detected but ignored");
    else {
        *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
    }
    break;
- case 0x4e:
- /* Real L2 cache configuration is:
- *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */
- VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb");
- *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
- break;
+ case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
/* These are sectored, whatever that means */
case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */
</listitem>
<listitem>
- <para>Bit-selection hash function: the line(s) in the cache
+ <para>Bit-selection hash function: the set of line(s) in the cache
to which a memory block maps is chosen by the middle bits
M--(M+N-1) of the byte address, where:</para>
<itemizedlist>
<listitem>
<para>line size = 2^M bytes</para>
</listitem>
<listitem>
- <para>(cache size / line size) = 2^N bytes</para>
+ <para>(cache size / line size / associativity) = 2^N sets</para>
</listitem>
</itemizedlist>
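+
+<para>For example, here is a small sketch (not Cachegrind code) of how
+an address decomposes under this scheme, assuming 64-byte lines
+(M = 6) and 64 sets (N = 6):</para>
+<programlisting>
+#include &lt;stdio.h&gt;
+
+int main(void)
+{
+   unsigned long a = 0x12345678;        /* example byte address */
+   int M = 6;                           /* line size = 2^6 = 64 bytes */
+   int sets = 64;                       /* 2^N sets, N = 6 */
+   unsigned long set = (a >> M) % sets; /* middle bits M..(M+N-1) */
+   unsigned long tag = (a >> M) / sets; /* remaining high bits */
+   printf("set %lu, tag %#lx\n", set, tag);  /* set 25, tag 0x12345 */
+   return 0;
+}
+</programlisting>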
</listitem>
<listitem>
- <para>Inclusive L2 cache: the L2 cache replicates all the
- entries of the L1 cache. This is standard on Pentium chips,
- but AMD Opterons, Athlons and Durons
+ <para>Inclusive L2 cache: the L2 cache typically replicates all
+ the entries of the L1 caches, because fetching into L1 involves
+ fetching into L2 first (though this does not guarantee strict
+ inclusiveness, as lines evicted from L2 may still reside in L1). This is
+ standard on Pentium chips, but AMD Opterons, Athlons and Durons
use an exclusive L2 cache that only holds
blocks evicted from L1. Ditto most modern VIA CPUs.</para>
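+<para>A toy, single-set sketch (not Cachegrind code; one line tag per
+level, 0 meaning empty) makes the difference concrete:</para>
+<programlisting>
+#include &lt;stdio.h&gt;
+
+static unsigned long l1 = 0, l2 = 0;  /* one line tag per level; 0 = empty */
+
+static void access_inclusive(unsigned long tag)
+{
+   if (tag == l1) return;   /* L1 hit */
+   l2 = tag;                /* fetch into L2 first ... */
+   l1 = tag;                /* ... then into L1 */
+}
+
+static void access_exclusive(unsigned long tag)
+{
+   if (tag == l1) return;   /* L1 hit */
+   if (tag == l2) l2 = 0;   /* an L2 hit moves the line out of L2 */
+   if (l1 != 0) l2 = l1;    /* L1's victim drops into L2 ... */
+   l1 = tag;                /* ... and the new line enters L1 only */
+}
+
+int main(void)
+{
+   access_inclusive(0x100);
+   printf("inclusive: l1=%#lx l2=%#lx\n", l1, l2);  /* both hold 0x100 */
+   l1 = l2 = 0;
+   access_exclusive(0x100);
+   access_exclusive(0x200);
+   printf("exclusive: l1=%#lx l2=%#lx\n", l1, l2);  /* 0x200, 0x100 */
+   return 0;
+}
+</programlisting>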
</listitem>
(I1/D1/L2) of the cache from the command line using the
<computeroutput>--I1</computeroutput>,
<computeroutput>--D1</computeroutput> and
-<computeroutput>--L2</computeroutput> options.</para>
+<computeroutput>--L2</computeroutput> options.
+For the cache parameters to be valid for simulation, the number of
+sets (where associativity is the number of cache lines in each set)
+must be a power of two.</para>
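+
+<para>For example, to simulate a 32KB, 8-way D1 cache with 64-byte
+lines (sizes are given in bytes, so 32768/64/8 = 64 sets, a power of
+two; <computeroutput>myprog</computeroutput> stands in for the program
+under test):</para>
+<screen>
+valgrind --tool=cachegrind --D1=32768,8,64 ./myprog
+</screen>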
<para>On PowerPC platforms
Cachegrind cannot automatically
<para>If you are interested in simulating a cache with different
properties, it is not particularly hard to write your own cache
simulator, or to modify the existing ones in
-<computeroutput>vg_cachesim_I1.c</computeroutput>,
-<computeroutput>vg_cachesim_D1.c</computeroutput>,
-<computeroutput>vg_cachesim_L2.c</computeroutput> and
-<computeroutput>vg_cachesim_gen.c</computeroutput>. We'd be
+<computeroutput>cg_sim.c</computeroutput>. We'd be
interested to hear from anyone who does.</para>
</sect2>
Bool sectored; /* prefetch nearside cacheline on read */
int sets;
int sets_min_1;
- int assoc_bits;
int line_size_bits;
int tag_shift;
UWord tag_mask;
c->sets = (c->size / c->line_size) / c->assoc;
c->sets_min_1 = c->sets - 1;
- c->assoc_bits = VG_(log2)(c->assoc);
c->line_size_bits = VG_(log2)(c->line_size);
c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
c->tag_mask = ~((1<<c->tag_shift)-1);
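+ /* Example: size = 32768, assoc = 8, line_size = 64 gives
+    sets = 64, line_size_bits = 6, tag_shift = 12, tag_mask = ~0xfff. */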
int i, j;
UWord *set;
- /* Shifting is a bit faster than multiplying */
- set = &(c->tags[set_no << c->assoc_bits]);
+ set = &(c->tags[set_no * c->assoc]);
/* This loop is unrolled for just the first case, which is the most */
/* common. We can't unroll any further because it would screw up */
int i, j;
UWord *set, tmp_tag;
- /* Shifting is a bit faster than multiplying */
- set = &(c->tags[set_no << c->assoc_bits]);
+ set = &(c->tags[set_no * c->assoc]);
/* This loop is unrolled for just the first case, which is the most */
/* common. We can't unroll any further because it would screw up */
/* Access straddles two lines. */
/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
else if (((set1 + 1) & (c->sets-1)) == set2) {
- UWord tag2 = (a+size-1) >> c->tag_shift;
+ UWord tag2 = (a+size-1) & c->tag_mask;
/* the call updates cache structures as side effect */
CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
/* We use the lower tag bits of each tag word as offset pointers into
 * the cache use info, so the associativity must not exceed
 * 2^tag_shift; i.e. some cache parameters don't work.
 */
- if (c->tag_shift < c->assoc_bits) {
+ if ((1<<c->tag_shift) < c->assoc) {
VG_(message)(Vg_DebugMsg,
"error: Use associativity < %d for cache use statistics!",
(1<<c->tag_shift) );
static __inline__
void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
{
- int idx = (high_idx << c->assoc_bits) | low_idx;
+ int idx = (high_idx * c->assoc) + low_idx;
c->use[idx].count ++;
c->use[idx].mask |= use_mask;
UWord *set, tmp_tag;
UInt use_mask;
- /* Shifting is a bit faster than multiplying */
- set = &(c->tags[set_no << c->assoc_bits]);
+ set = &(c->tags[set_no * c->assoc]);
use_mask =
c->line_start_mask[a & c->line_size_mask] &
c->line_end_mask[(a+size-1) & c->line_size_mask];
}
set[0] = tag | tmp_tag;
- cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
+ cacheuse_L2_miss(c, (set_no * c->assoc) + tmp_tag,
use_mask, a & ~c->line_size_mask);
return Miss;
{
UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
- UWord tag = a >> c->tag_shift;
+ UWord tag = a & c->tag_mask;
/* Access entirely within line. */
if (set1 == set2)
/* Access straddles two lines. */
/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
else if (((set1 + 1) & (c->sets-1)) == set2) {
- UWord tag2 = a >> c->tag_shift;
+ UWord tag2 = a & c->tag_mask;
/* the call updates cache structures as side effect */
CacheResult res1 = cacheuse_isMiss(c, set1, tag);
/* First case: word entirely within line. */ \
if (set1 == set2) { \
\
- /* Shifting is a bit faster than multiplying */ \
- set = &(L.tags[set1 << L.assoc_bits]); \
+ set = &(L.tags[set1 * L.assoc]); \
use_mask = L.line_start_mask[a & L.line_size_mask] & \
L.line_end_mask[(a+size-1) & L.line_size_mask]; \
\
/* common. We can't unroll any further because it would screw up */\
/* if we have a direct-mapped (1-way) cache. */\
if (tag == (set[0] & L.tag_mask)) { \
- idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+ idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tmp_tag; \
- idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+ idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tag | tmp_tag; \
- idx = (set1 << L.assoc_bits) | tmp_tag; \
+ idx = (set1 * L.assoc) + tmp_tag; \
return update_##L##_use(&L, idx, \
use_mask, a &~ L.line_size_mask); \
\
/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
} else if (((set1 + 1) & (L.sets-1)) == set2) { \
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
- set = &(L.tags[set1 << L.assoc_bits]); \
+ set = &(L.tags[set1 * L.assoc]); \
use_mask = L.line_start_mask[a & L.line_size_mask]; \
if (tag == (set[0] & L.tag_mask)) { \
- idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+ idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tmp_tag; \
- idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+ idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tag | tmp_tag; \
- idx = (set1 << L.assoc_bits) | tmp_tag; \
+ idx = (set1 * L.assoc) + tmp_tag; \
miss1 = update_##L##_use(&L, idx, \
use_mask, a &~ L.line_size_mask); \
block2: \
- set = &(L.tags[set2 << L.assoc_bits]); \
+ set = &(L.tags[set2 * L.assoc]); \
use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
tag2 = (a+size-1) & L.tag_mask; \
if (tag2 == (set[0] & L.tag_mask)) { \
- idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+ idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tmp_tag; \
- idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+ idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tag2 | tmp_tag; \
- idx = (set2 << L.assoc_bits) | tmp_tag; \
+ idx = (set2 * L.assoc) + tmp_tag; \
miss2 = update_##L##_use(&L, idx, \
use_mask, (a+size-1) &~ L.line_size_mask); \
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
- UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
+ UWord* set = &(L2.tags[setNo * L2.assoc]);
UWord tag = memline & L2.tag_mask;
int i, j, idx;
CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
if (tag == (set[0] & L2.tag_mask)) {
- idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
+ idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
l1_loaded->dep_use = &(L2.use[idx]);
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
set[j] = set[j - 1];
}
set[0] = tmp_tag;
- idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
+ idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
l1_loaded->dep_use = &(L2.use[idx]);
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
set[j] = set[j - 1];
}
set[0] = tag | tmp_tag;
- idx = (setNo << L2.assoc_bits) | tmp_tag;
+ idx = (setNo * L2.assoc) + tmp_tag;
l1_loaded->dep_use = &(L2.use[idx]);
update_L2_use(idx, memline);
static
void check_cache(cache_t* cache, Char *name)
{
- /* First check they're all powers of two */
- if (-1 == VG_(log2)(cache->size)) {
+ /* Simulator requires line size and set count to be powers of two */
+ if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
+ (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
VG_(message)(Vg_UserMsg,
- "error: %s size of %dB not a power of two; aborting.",
- name, cache->size);
- VG_(exit)(1);
- }
-
- if (-1 == VG_(log2)(cache->assoc)) {
- VG_(message)(Vg_UserMsg,
- "error: %s associativity of %d not a power of two; aborting.",
- name, cache->assoc);
- VG_(exit)(1);
+ "error: %s set count not a power of two; aborting.",
+ name);
}
- if (-1 == VG_(log2)(cache->line_size)) {
+ if (-1 == VG_(log2)(cache->line_size)) {
VG_(message)(Vg_UserMsg,
"error: %s line size of %dB not a power of two; aborting.",
name, cache->line_size);