The GNU General Public License is contained in the file COPYING.
*/
-#include "pub_tool_basics.h"
-#include "pub_tool_cpuid.h"
-#include "pub_tool_libcbase.h"
-#include "pub_tool_libcassert.h"
-#include "pub_tool_libcprint.h"
-
-#include "cg_arch.h"
-
-// All CPUID info taken from sandpile.org/a32/cpuid.htm */
-// Probably only works for Intel and AMD chips, and probably only for some of
-// them.
-
-static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
-{
- VG_(message)(Vg_DebugMsg,
- "warning: Pentium 4 with %d KB micro-op instruction trace cache",
- actual_size);
- VG_(message)(Vg_DebugMsg,
- " Simulating a %d KB I-cache with %d B lines",
- used_size, line_size);
-}
-
-/* Intel method is truly wretched. We have to do an insane indexing into an
- * array of pre-defined configurations for various parts of the memory
- * hierarchy.
- * According to Intel Processor Identification, App Note 485.
- */
-static
-Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
-{
- Int cpuid1_eax;
- Int cpuid1_ignore;
- Int family;
- Int model;
- UChar info[16];
- Int i, trials;
- Bool L2_found = False;
-
- if (level < 2) {
- VG_(message)(Vg_DebugMsg,
- "warning: CPUID level < 2 for Intel processor (%d)",
- level);
- return -1;
- }
-
- /* family/model needed to distinguish code reuse (currently 0x49) */
- VG_(cpuid)(1, &cpuid1_eax, &cpuid1_ignore,
- &cpuid1_ignore, &cpuid1_ignore);
- family = (((cpuid1_eax >> 20) & 0xff) << 4) + ((cpuid1_eax >> 8) & 0xf);
- model = (((cpuid1_eax >> 16) & 0xf) << 4) + ((cpuid1_eax >> 4) & 0xf);
-
- VG_(cpuid)(2, (Int*)&info[0], (Int*)&info[4],
- (Int*)&info[8], (Int*)&info[12]);
- trials = info[0] - 1; /* AL register - bits 0..7 of %eax */
- info[0] = 0x0; /* reset AL */
-
- if (0 != trials) {
- VG_(message)(Vg_DebugMsg,
- "warning: non-zero CPUID trials for Intel processor (%d)",
- trials);
- return -1;
- }
-
- for (i = 0; i < 16; i++) {
-
- switch (info[i]) {
-
- case 0x0: /* ignore zeros */
- break;
-
- /* TLB info, ignore */
- case 0x01: case 0x02: case 0x03: case 0x04: case 0x05:
- case 0x4f: case 0x50: case 0x51: case 0x52:
- case 0x56: case 0x57: case 0x59:
- case 0x5b: case 0x5c: case 0x5d:
- case 0xb0: case 0xb1:
- case 0xb3: case 0xb4: case 0xba: case 0xc0:
- break;
-
- case 0x06: *I1c = (cache_t) { 8, 4, 32 }; break;
- case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break;
- case 0x30: *I1c = (cache_t) { 32, 8, 64 }; break;
-
- case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break;
- case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
- case 0x0e:
- /* Real D1 cache configuration is:
- D1c = (cache_t) { 24, 6, 64 }; */
- VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb");
- *D1c = (cache_t) { 16, 4, 64 };
- break;
- case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break;
-
- /* IA-64 info -- panic! */
- case 0x10: case 0x15: case 0x1a:
- case 0x88: case 0x89: case 0x8a: case 0x8d:
- case 0x90: case 0x96: case 0x9b:
- VG_(tool_panic)("IA-64 cache detected?!");
-
- case 0x22: case 0x23: case 0x25: case 0x29:
- case 0x46: case 0x47: case 0x4a: case 0x4b: case 0x4c: case 0x4d:
- VG_(message)(Vg_DebugMsg,
- "warning: L3 cache detected but ignored");
- break;
-
- /* These are sectored, whatever that means */
- case 0x39: *L2c = (cache_t) { 128, 4, 64 }; L2_found = True; break;
- case 0x3c: *L2c = (cache_t) { 256, 4, 64 }; L2_found = True; break;
-
- /* If a P6 core, this means "no L2 cache".
- If a P4 core, this means "no L3 cache".
- We don't know what core it is, so don't issue a warning. To detect
- a missing L2 cache, we use 'L2_found'. */
- case 0x40:
- break;
-
- case 0x41: *L2c = (cache_t) { 128, 4, 32 }; L2_found = True; break;
- case 0x42: *L2c = (cache_t) { 256, 4, 32 }; L2_found = True; break;
- case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break;
- case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
- case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
- case 0x48:
- /* Real L2 cache configuration is:
- *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */
- VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb");
- *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True;
- break;
- case 0x49:
- if ((family == 15) && (model == 6))
- /* On Xeon MP (family F, model 6), this is for L3 */
- VG_(message)(Vg_DebugMsg,
- "warning: L3 cache detected but ignored");
- else
- *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
- break;
- case 0x4e:
- /* Real L2 cache configuration is:
- *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */
- VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb");
- *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
- break;
-
- /* These are sectored, whatever that means */
- case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */
- case 0x66: *D1c = (cache_t) { 8, 4, 64 }; break; /* sectored */
- case 0x67: *D1c = (cache_t) { 16, 4, 64 }; break; /* sectored */
- case 0x68: *D1c = (cache_t) { 32, 4, 64 }; break; /* sectored */
-
- /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
- * conversion to byte size is a total guess; treat the 12K and 16K
- * cases the same since the cache byte size must be a power of two for
- * everything to work!. Also guessing 32 bytes for the line size...
- */
- case 0x70: /* 12K micro-ops, 8-way */
- *I1c = (cache_t) { 16, 8, 32 };
- micro_ops_warn(12, 16, 32);
- break;
- case 0x71: /* 16K micro-ops, 8-way */
- *I1c = (cache_t) { 16, 8, 32 };
- micro_ops_warn(16, 16, 32);
- break;
- case 0x72: /* 32K micro-ops, 8-way */
- *I1c = (cache_t) { 32, 8, 32 };
- micro_ops_warn(32, 32, 32);
- break;
-
- /* These are sectored, whatever that means */
- case 0x79: *L2c = (cache_t) { 128, 8, 64 }; L2_found = True; break;
- case 0x7a: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
- case 0x7b: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
- case 0x7c: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
- case 0x7d: *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
- case 0x7e: *L2c = (cache_t) { 256, 8, 128 }; L2_found = True; break;
-
- case 0x7f: *L2c = (cache_t) { 512, 2, 64 }; L2_found = True; break;
- case 0x80: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
-
- case 0x81: *L2c = (cache_t) { 128, 8, 32 }; L2_found = True; break;
- case 0x82: *L2c = (cache_t) { 256, 8, 32 }; L2_found = True; break;
- case 0x83: *L2c = (cache_t) { 512, 8, 32 }; L2_found = True; break;
- case 0x84: *L2c = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
- case 0x85: *L2c = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
- case 0x86: *L2c = (cache_t) { 512, 4, 64 }; L2_found = True; break;
- case 0x87: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
-
- /* Ignore prefetch information */
- case 0xf0: case 0xf1:
- break;
-
- default:
- VG_(message)(Vg_DebugMsg,
- "warning: Unknown Intel cache config value "
- "(0x%x), ignoring", info[i]);
- break;
- }
- }
-
- if (!L2_found)
- VG_(message)(Vg_DebugMsg,
- "warning: L2 cache not installed, ignore L2 results.");
-
- return 0;
-}
-
-/* AMD method is straightforward, just extract appropriate bits from the
- * result registers.
- *
- * Bits, for D1 and I1:
- * 31..24 data L1 cache size in KBs
- * 23..16 data L1 cache associativity (FFh=full)
- * 15.. 8 data L1 cache lines per tag
- * 7.. 0 data L1 cache line size in bytes
- *
- * Bits, for L2:
- * 31..16 unified L2 cache size in KBs
- * 15..12 unified L2 cache associativity (0=off, FFh=full)
- * 11.. 8 unified L2 cache lines per tag
- * 7.. 0 unified L2 cache line size in bytes
- *
- * #3 The AMD K7 processor's L2 cache must be configured prior to relying
- * upon this information. (Whatever that means -- njn)
- *
- * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model
- * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
- * so we detect that.
- *
- * Returns 0 on success, non-zero on failure.
- */
-static
-Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
-{
- UInt ext_level;
- UInt dummy, model;
- UInt I1i, D1i, L2i;
-
- VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy);
-
- if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) {
- VG_(message)(Vg_UserMsg,
- "warning: ext_level < 0x80000006 for AMD processor (0x%x)",
- ext_level);
- return -1;
- }
-
- VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i);
- VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &dummy);
-
- VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy);
-
- /* Check for Duron bug */
- if (model == 0x630) {
- VG_(message)(Vg_UserMsg,
- "Buggy Duron stepping A0. Assuming L2 size=65536 bytes");
- L2i = (64 << 16) | (L2i & 0xffff);
- }
-
- D1c->size = (D1i >> 24) & 0xff;
- D1c->assoc = (D1i >> 16) & 0xff;
- D1c->line_size = (D1i >> 0) & 0xff;
-
- I1c->size = (I1i >> 24) & 0xff;
- I1c->assoc = (I1i >> 16) & 0xff;
- I1c->line_size = (I1i >> 0) & 0xff;
-
- L2c->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
- L2c->assoc = (L2i >> 12) & 0xf;
- L2c->line_size = (L2i >> 0) & 0xff;
-
- return 0;
-}
-
-static
-Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
-{
- Int level, ret;
- Char vendor_id[13];
-
- if (!VG_(has_cpuid)()) {
- VG_(message)(Vg_DebugMsg, "CPUID instruction not supported");
- return -1;
- }
-
- VG_(cpuid)(0, &level, (int*)&vendor_id[0],
- (int*)&vendor_id[8], (int*)&vendor_id[4]);
- vendor_id[12] = '\0';
-
- if (0 == level) {
- VG_(message)(Vg_DebugMsg, "CPUID level is 0, early Pentium?");
- return -1;
- }
-
- /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
- if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
- ret = Intel_cache_info(level, I1c, D1c, L2c);
-
- } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
- ret = AMD_cache_info(I1c, D1c, L2c);
-
- } else {
- VG_(message)(Vg_DebugMsg, "CPU vendor ID not recognised (%s)",
- vendor_id);
- return -1;
- }
-
- /* Successful! Convert sizes from KB to bytes */
- I1c->size *= 1024;
- D1c->size *= 1024;
- L2c->size *= 1024;
-
- return ret;
-}
-
-
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
- Bool all_caches_clo_defined)
-{
- Int res;
-
- // Set caches to default.
- *I1c = (cache_t) { 65536, 2, 64 };
- *D1c = (cache_t) { 65536, 2, 64 };
- *L2c = (cache_t) { 262144, 8, 64 };
-
- // Then replace with any info we can get from CPUID.
- res = get_caches_from_CPUID(I1c, D1c, L2c);
-
- // Warn if CPUID failed and config not completely specified from cmd line.
- if (res != 0 && !all_caches_clo_defined) {
- VG_(message)(Vg_DebugMsg,
- "Warning: Couldn't auto-detect cache config, using one "
- "or more defaults ");
- }
-}
+#include "cg-x86.c"
/*--------------------------------------------------------------------*/
/*--- end ---*/
/*--------------------------------------------------------------------*/
-/*--- x86-specific definitions. cg-x86.c ---*/
+/*--- x86-specific (and AMD64-specific) definitions. cg-x86.c ---*/
/*--------------------------------------------------------------------*/
/*
case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break;
case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
- case 0x0e:
- /* Real D1 cache configuration is:
- D1c = (cache_t) { 24, 6, 64 }; */
- VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb");
- *D1c = (cache_t) { 16, 4, 64 };
- break;
+ case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break;
case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break;
/* IA-64 info -- panic! */
case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break;
case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
- case 0x48:
- /* Real L2 cache configuration is:
- *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */
- VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb");
- *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True;
- break;
+ case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break;
case 0x49:
if ((family == 15) && (model == 6))
/* On Xeon MP (family F, model 6), this is for L3 */
else
*L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
break;
- case 0x4e:
- /* Real L2 cache configuration is:
- *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */
- VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb");
- *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
- break;
+ case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
/* These are sectored, whatever that means */
case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */
static
void check_cache(cache_t* cache, Char *name)
{
- /* First check they're all powers of two */
- if (-1 == VG_(log2)(cache->size)) {
+ /* Simulator requires line size and set count to be powers of two */
+ if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
+ (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
VG_(message)(Vg_UserMsg,
- "error: %s size of %dB not a power of two; aborting.",
- name, cache->size);
- VG_(exit)(1);
- }
-
- if (-1 == VG_(log2)(cache->assoc)) {
- VG_(message)(Vg_UserMsg,
- "error: %s associativity of %d not a power of two; aborting.",
- name, cache->assoc);
+ "error: %s set count not a power of two; aborting.",
+ name);
VG_(exit)(1);
}
Int line_size; /* bytes */
Int sets;
Int sets_min_1;
- Int assoc_bits;
Int line_size_bits;
Int tag_shift;
Char desc_line[128];
c->sets = (c->size / c->line_size) / c->assoc;
c->sets_min_1 = c->sets - 1;
- c->assoc_bits = VG_(log2)(c->assoc);
c->line_size_bits = VG_(log2)(c->line_size);
c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
/* First case: word entirely within line. */ \
if (set1 == set2) { \
\
- /* Shifting is a bit faster than multiplying */ \
- set = &(L.tags[set1 << L.assoc_bits]); \
+ set = &(L.tags[set1 * L.assoc]); \
\
/* This loop is unrolled for just the first case, which is the most */\
/* common. We can't unroll any further because it would screw up */\
/* Second case: word straddles two lines. */ \
/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
} else if (((set1 + 1) & (L.sets-1)) == set2) { \
- set = &(L.tags[set1 << L.assoc_bits]); \
+ set = &(L.tags[set1 * L.assoc]); \
if (tag == set[0]) { \
goto block2; \
} \
set[0] = tag; \
is_miss = True; \
block2: \
- set = &(L.tags[set2 << L.assoc_bits]); \
+ set = &(L.tags[set2 * L.assoc]); \
tag2 = (a+size-1) >> L.tag_shift; \
if (tag2 == set[0]) { \
goto miss_treatment; \
</listitem>
<listitem>
- <para>Bit-selection hash function: the line(s) in the cache
+ <para>Bit-selection hash function: the set of line(s) in the cache
to which a memory block maps is chosen by the middle bits
M--(M+N-1) of the byte address, where:</para>
<itemizedlist>
<para>line size = 2^M bytes</para>
</listitem>
<listitem>
- <para>(cache size / line size) = 2^N bytes</para>
+ <para>(cache size / line size / associativity) = 2^N sets</para>
</listitem>
</itemizedlist>
</listitem>
<listitem>
- <para>Inclusive L2 cache: the L2 cache replicates all the
- entries of the L1 cache. This is standard on Pentium chips,
- but AMD Opterons, Athlons and Durons
+ <para>Inclusive L2 cache: the L2 cache typically replicates all
+ the entries of the L1 caches, because fetching into L1 involves
+ fetching into L2 first (this does not guarantee strict inclusiveness,
+ as lines evicted from L2 could still reside in L1). This is
+ standard on Pentium chips, but AMD Opterons, Athlons and Durons
use an exclusive L2 cache that only holds
blocks evicted from L1. Ditto most modern VIA CPUs.</para>
</listitem>
(I1/D1/L2) of the cache from the command line using the
<computeroutput>--I1</computeroutput>,
<computeroutput>--D1</computeroutput> and
-<computeroutput>--L2</computeroutput> options.</para>
+<computeroutput>--L2</computeroutput> options.
+For cache parameters to be valid for simulation, the line size and
+the number of sets (with associativity being the number of cache
+lines in each set) both have to be powers of two.</para>
<para>On PowerPC platforms
Cachegrind cannot automatically
<para>If you are interested in simulating a cache with different
properties, it is not particularly hard to write your own cache
simulator, or to modify the existing ones in
-<computeroutput>vg_cachesim_I1.c</computeroutput>,
-<computeroutput>vg_cachesim_D1.c</computeroutput>,
-<computeroutput>vg_cachesim_L2.c</computeroutput> and
-<computeroutput>vg_cachesim_gen.c</computeroutput>. We'd be
+<computeroutput>cg_sim.c</computeroutput>. We'd be
interested to hear from anyone who does.</para>
</sect2>
chdir.vgtest chdir.stderr.exp \
clreq.vgtest clreq.stderr.exp \
dlclose.vgtest dlclose.stderr.exp dlclose.stdout.exp \
+ notpower2.vgtest notpower2.stderr.exp \
wrap5.vgtest wrap5.stderr.exp wrap5.stdout.exp
check_PROGRAMS = \
--- /dev/null
+
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
--- /dev/null
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+cleanup: rm cachegrind.out.*
Bool sectored; /* prefetch nearside cacheline on read */
int sets;
int sets_min_1;
- int assoc_bits;
int line_size_bits;
int tag_shift;
UWord tag_mask;
c->sets = (c->size / c->line_size) / c->assoc;
c->sets_min_1 = c->sets - 1;
- c->assoc_bits = VG_(log2)(c->assoc);
c->line_size_bits = VG_(log2)(c->line_size);
c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
c->tag_mask = ~((1<<c->tag_shift)-1);
int i, j;
UWord *set;
- /* Shifting is a bit faster than multiplying */
- set = &(c->tags[set_no << c->assoc_bits]);
+ set = &(c->tags[set_no * c->assoc]);
/* This loop is unrolled for just the first case, which is the most */
/* common. We can't unroll any further because it would screw up */
int i, j;
UWord *set, tmp_tag;
- /* Shifting is a bit faster than multiplying */
- set = &(c->tags[set_no << c->assoc_bits]);
+ set = &(c->tags[set_no * c->assoc]);
/* This loop is unrolled for just the first case, which is the most */
/* common. We can't unroll any further because it would screw up */
/* Access straddles two lines. */
/* Nb: this is a fast way of doing ((set1+1) % c->sets) */
else if (((set1 + 1) & (c->sets-1)) == set2) {
- UWord tag2 = (a+size-1) >> c->tag_shift;
+ UWord tag2 = (a+size-1) & c->tag_mask;
/* the call updates cache structures as side effect */
CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
/* We use lower tag bits as offset pointers to cache use info.
* I.e. some cache parameters don't work.
*/
- if (c->tag_shift < c->assoc_bits) {
+ if ( (1<<c->tag_shift) < c->assoc) {
VG_(message)(Vg_DebugMsg,
"error: Use associativity < %d for cache use statistics!",
(1<<c->tag_shift) );
}
}
-/* FIXME: A little tricky */
-#if 0
-
-static __inline__
-void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask)
-{
- int idx = (high_idx << c->assoc_bits) | low_idx;
-
- c->use[idx].count ++;
- c->use[idx].mask |= use_mask;
-
- CLG_DEBUG(6," Hit [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",
- idx, c->loaded[idx].memline, c->loaded[idx].iaddr,
- use_mask, c->use[idx].mask, c->use[idx].count);
-}
-
-/* only used for I1, D1 */
-
-static __inline__
-CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag)
-{
- int i, j, idx;
- UWord *set, tmp_tag;
- UInt use_mask;
-
- /* Shifting is a bit faster than multiplying */
- set = &(c->tags[set_no << c->assoc_bits]);
- use_mask =
- c->line_start_mask[a & c->line_size_mask] &
- c->line_end_mask[(a+size-1) & c->line_size_mask];
-
- /* This loop is unrolled for just the first case, which is the most */
- /* common. We can't unroll any further because it would screw up */
- /* if we have a direct-mapped (1-way) cache. */
- if (tag == (set[0] & c->tag_mask)) {
- cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask);
- return L1_Hit;
- }
-
- /* If the tag is one other than the MRU, move it into the MRU spot */
- /* and shuffle the rest down. */
- for (i = 1; i < c->assoc; i++) {
- if (tag == (set[i] & c->tag_mask)) {
- tmp_tag = set[i];
- for (j = i; j > 0; j--) {
- set[j] = set[j - 1];
- }
- set[0] = tmp_tag;
-
- cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask);
- return L1_Hit;
- }
- }
-
- /* A miss; install this tag as MRU, shuffle rest down. */
- tmp_tag = set[L.assoc - 1] & ~c->tag_mask;
- for (j = c->assoc - 1; j > 0; j--) {
- set[j] = set[j - 1];
- }
- set[0] = tag | tmp_tag;
-
- cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag,
- use_mask, a & ~c->line_size_mask);
-
- return Miss;
-}
-
-
-static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size)
-{
- UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
- UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
- UWord tag = a >> c->tag_shift;
-
- /* Access entirely within line. */
- if (set1 == set2)
- return cacheuse_setref(c, set1, tag);
-
- /* Access straddles two lines. */
- /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
- else if (((set1 + 1) & (c->sets-1)) == set2) {
- UWord tag2 = a >> c->tag_shift;
-
- /* the call updates cache structures as side effect */
- CacheResult res1 = cacheuse_isMiss(c, set1, tag);
- CacheResult res2 = cacheuse_isMiss(c, set2, tag2);
- return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
-
- } else {
- VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2);
- VG_(tool_panic)("item straddles more than two cache sets");
- }
- return Hit;
-}
-#endif
-
/* for I1/D1 caches */
#define CACHEUSE(L) \
/* First case: word entirely within line. */ \
if (set1 == set2) { \
\
- /* Shifting is a bit faster than multiplying */ \
- set = &(L.tags[set1 << L.assoc_bits]); \
+ set = &(L.tags[set1 * L.assoc]); \
use_mask = L.line_start_mask[a & L.line_size_mask] & \
L.line_end_mask[(a+size-1) & L.line_size_mask]; \
\
/* common. We can't unroll any further because it would screw up */\
/* if we have a direct-mapped (1-way) cache. */\
if (tag == (set[0] & L.tag_mask)) { \
- idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+ idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tmp_tag; \
- idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+ idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tag | tmp_tag; \
- idx = (set1 << L.assoc_bits) | tmp_tag; \
+ idx = (set1 * L.assoc) + tmp_tag; \
return update_##L##_use(&L, idx, \
use_mask, a &~ L.line_size_mask); \
\
/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
} else if (((set1 + 1) & (L.sets-1)) == set2) { \
Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
- set = &(L.tags[set1 << L.assoc_bits]); \
+ set = &(L.tags[set1 * L.assoc]); \
use_mask = L.line_start_mask[a & L.line_size_mask]; \
if (tag == (set[0] & L.tag_mask)) { \
- idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+ idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tmp_tag; \
- idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+ idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tag | tmp_tag; \
- idx = (set1 << L.assoc_bits) | tmp_tag; \
+ idx = (set1 * L.assoc) + tmp_tag; \
miss1 = update_##L##_use(&L, idx, \
use_mask, a &~ L.line_size_mask); \
block2: \
- set = &(L.tags[set2 << L.assoc_bits]); \
+ set = &(L.tags[set2 * L.assoc]); \
use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
tag2 = (a+size-1) & L.tag_mask; \
if (tag2 == (set[0] & L.tag_mask)) { \
- idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \
+ idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tmp_tag; \
- idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \
+ idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
L.use[idx].count ++; \
L.use[idx].mask |= use_mask; \
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
set[j] = set[j - 1]; \
} \
set[0] = tag2 | tmp_tag; \
- idx = (set2 << L.assoc_bits) | tmp_tag; \
+ idx = (set2 * L.assoc) + tmp_tag; \
miss2 = update_##L##_use(&L, idx, \
use_mask, (a+size-1) &~ L.line_size_mask); \
return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
- UWord* set = &(L2.tags[setNo << L2.assoc_bits]);
+ UWord* set = &(L2.tags[setNo * L2.assoc]);
UWord tag = memline & L2.tag_mask;
int i, j, idx;
CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
if (tag == (set[0] & L2.tag_mask)) {
- idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask);
+ idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
l1_loaded->dep_use = &(L2.use[idx]);
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
set[j] = set[j - 1];
}
set[0] = tmp_tag;
- idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask);
+ idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
l1_loaded->dep_use = &(L2.use[idx]);
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
set[j] = set[j - 1];
}
set[0] = tag | tmp_tag;
- idx = (setNo << L2.assoc_bits) | tmp_tag;
+ idx = (setNo * L2.assoc) + tmp_tag;
l1_loaded->dep_use = &(L2.use[idx]);
update_L2_use(idx, memline);
static
void check_cache(cache_t* cache, Char *name)
{
- /* First check they're all powers of two */
- if (-1 == VG_(log2)(cache->size)) {
- VG_(message)(Vg_UserMsg,
- "error: %s size of %dB not a power of two; aborting.",
- name, cache->size);
- VG_(exit)(1);
- }
-
- if (-1 == VG_(log2)(cache->assoc)) {
+ /* Simulator requires line size and set count to be powers of two */
+ if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
+ (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
VG_(message)(Vg_UserMsg,
- "error: %s associativity of %d not a power of two; aborting.",
- name, cache->assoc);
- VG_(exit)(1);
+ "error: %s set count not a power of two; aborting.",
+ name);
}
- if (-1 == VG_(log2)(cache->line_size)) {
+ if (-1 == VG_(log2)(cache->line_size)) {
VG_(message)(Vg_UserMsg,
"error: %s line size of %dB not a power of two; aborting.",
name, cache->line_size);
simwork1.vgtest simwork1.stdout.exp simwork1.stderr.exp \
simwork2.vgtest simwork2.stdout.exp simwork2.stderr.exp \
simwork3.vgtest simwork3.stdout.exp simwork3.stderr.exp \
+ notpower2.vgtest notpower2.stderr.exp \
+ notpower2-wb.vgtest notpower2-wb.stderr.exp \
+ notpower2-hwpref.vgtest notpower2-hwpref.stderr.exp \
+ notpower2-use.vgtest notpower2-use.stderr.exp \
threads.vgtest threads.stderr.exp
check_PROGRAMS = clreq simwork threads
--- /dev/null
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
--- /dev/null
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes
+cleanup: rm callgrind.out.*
--- /dev/null
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
--- /dev/null
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes
+cleanup: rm callgrind.out.*
--- /dev/null
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
--- /dev/null
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes
+cleanup: rm callgrind.out.*
--- /dev/null
+
+
+Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Collected :
+
+I refs:
+I1 misses:
+L2i misses:
+I1 miss rate:
+L2i miss rate:
+
+D refs:
+D1 misses:
+L2d misses:
+D1 miss rate:
+L2d miss rate:
+
+L2 refs:
+L2 misses:
+L2 miss rate:
--- /dev/null
+prog: ../../tests/true
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+cleanup: rm callgrind.out.*