From: Julian Seward Date: Thu, 29 Jan 2009 10:14:53 +0000 (+0000) Subject: Merge in non-power-of-2-sized cache simulation fixes for Callgrind X-Git-Tag: svn/VALGRIND_3_4_1~37 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=15402401c3efd8c00ea31ef2855dfcf891789d41;p=thirdparty%2Fvalgrind.git Merge in non-power-of-2-sized cache simulation fixes for Callgrind and Cachegrind: 8912 callgrind/tests/filter_stderr cachegrind/tests/filter_stderr Filter out an additional warning, so the tests pass on machines with a 6MB L2 cache. cachegrind/cg-x86.c cachegrind/cg-amd64.c These two files were almost identical. cg-amd64.c now just #includes cg-x86.c. 9080 Cachegrind/Callgrind: allow for cache sizes other than only powers of two 9081 Callgrind: Remove ifdef'ed-out, non-working code. git-svn-id: svn://svn.valgrind.org/valgrind/branches/VALGRIND_3_4_BRANCH@9088 --- diff --git a/cachegrind/cg-amd64.c b/cachegrind/cg-amd64.c index 8ea6ec1375..90bee9857a 100644 --- a/cachegrind/cg-amd64.c +++ b/cachegrind/cg-amd64.c @@ -28,339 +28,7 @@ The GNU General Public License is contained in the file COPYING. */ -#include "pub_tool_basics.h" -#include "pub_tool_cpuid.h" -#include "pub_tool_libcbase.h" -#include "pub_tool_libcassert.h" -#include "pub_tool_libcprint.h" - -#include "cg_arch.h" - -// All CPUID info taken from sandpile.org/a32/cpuid.htm */ -// Probably only works for Intel and AMD chips, and probably only for some of -// them. - -static void micro_ops_warn(Int actual_size, Int used_size, Int line_size) -{ - VG_(message)(Vg_DebugMsg, - "warning: Pentium 4 with %d KB micro-op instruction trace cache", - actual_size); - VG_(message)(Vg_DebugMsg, - " Simulating a %d KB I-cache with %d B lines", - used_size, line_size); -} - -/* Intel method is truly wretched. We have to do an insane indexing into an - * array of pre-defined configurations for various parts of the memory - * hierarchy. - * According to Intel Processor Identification, App Note 485. 
- */ -static -Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c) -{ - Int cpuid1_eax; - Int cpuid1_ignore; - Int family; - Int model; - UChar info[16]; - Int i, trials; - Bool L2_found = False; - - if (level < 2) { - VG_(message)(Vg_DebugMsg, - "warning: CPUID level < 2 for Intel processor (%d)", - level); - return -1; - } - - /* family/model needed to distinguish code reuse (currently 0x49) */ - VG_(cpuid)(1, &cpuid1_eax, &cpuid1_ignore, - &cpuid1_ignore, &cpuid1_ignore); - family = (((cpuid1_eax >> 20) & 0xff) << 4) + ((cpuid1_eax >> 8) & 0xf); - model = (((cpuid1_eax >> 16) & 0xf) << 4) + ((cpuid1_eax >> 4) & 0xf); - - VG_(cpuid)(2, (Int*)&info[0], (Int*)&info[4], - (Int*)&info[8], (Int*)&info[12]); - trials = info[0] - 1; /* AL register - bits 0..7 of %eax */ - info[0] = 0x0; /* reset AL */ - - if (0 != trials) { - VG_(message)(Vg_DebugMsg, - "warning: non-zero CPUID trials for Intel processor (%d)", - trials); - return -1; - } - - for (i = 0; i < 16; i++) { - - switch (info[i]) { - - case 0x0: /* ignore zeros */ - break; - - /* TLB info, ignore */ - case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: - case 0x4f: case 0x50: case 0x51: case 0x52: - case 0x56: case 0x57: case 0x59: - case 0x5b: case 0x5c: case 0x5d: - case 0xb0: case 0xb1: - case 0xb3: case 0xb4: case 0xba: case 0xc0: - break; - - case 0x06: *I1c = (cache_t) { 8, 4, 32 }; break; - case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break; - case 0x30: *I1c = (cache_t) { 32, 8, 64 }; break; - - case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break; - case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break; - case 0x0e: - /* Real D1 cache configuration is: - D1c = (cache_t) { 24, 6, 64 }; */ - VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb"); - *D1c = (cache_t) { 16, 4, 64 }; - break; - case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break; - - /* IA-64 info -- panic! */ - case 0x10: case 0x15: case 0x1a: - case 0x88: case 0x89: case 0x8a: case 0x8d: - case 0x90: case 0x96: case 0x9b: - VG_(tool_panic)("IA-64 cache detected?!"); - - case 0x22: case 0x23: case 0x25: case 0x29: - case 0x46: case 0x47: case 0x4a: case 0x4b: case 0x4c: case 0x4d: - VG_(message)(Vg_DebugMsg, - "warning: L3 cache detected but ignored"); - break; - - /* These are sectored, whatever that means */ - case 0x39: *L2c = (cache_t) { 128, 4, 64 }; L2_found = True; break; - case 0x3c: *L2c = (cache_t) { 256, 4, 64 }; L2_found = True; break; - - /* If a P6 core, this means "no L2 cache". - If a P4 core, this means "no L3 cache". - We don't know what core it is, so don't issue a warning. To detect - a missing L2 cache, we use 'L2_found'. 
*/ - case 0x40: - break; - - case 0x41: *L2c = (cache_t) { 128, 4, 32 }; L2_found = True; break; - case 0x42: *L2c = (cache_t) { 256, 4, 32 }; L2_found = True; break; - case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break; - case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break; - case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break; - case 0x48: - /* Real L2 cache configuration is: - *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */ - VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb"); - *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; - break; - case 0x49: - if ((family == 15) && (model == 6)) - /* On Xeon MP (family F, model 6), this is for L3 */ - VG_(message)(Vg_DebugMsg, - "warning: L3 cache detected but ignored"); - else - *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True; - break; - case 0x4e: - /* Real L2 cache configuration is: - *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */ - VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb"); - *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True; - break; - - /* These are sectored, whatever that means */ - case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */ - case 0x66: *D1c = (cache_t) { 8, 4, 64 }; break; /* sectored */ - case 0x67: *D1c = (cache_t) { 16, 4, 64 }; break; /* sectored */ - case 0x68: *D1c = (cache_t) { 32, 4, 64 }; break; /* sectored */ - - /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based. - * conversion to byte size is a total guess; treat the 12K and 16K - * cases the same since the cache byte size must be a power of two for - * everything to work!. Also guessing 32 bytes for the line size... - */ - case 0x70: /* 12K micro-ops, 8-way */ - *I1c = (cache_t) { 16, 8, 32 }; - micro_ops_warn(12, 16, 32); - break; - case 0x71: /* 16K micro-ops, 8-way */ - *I1c = (cache_t) { 16, 8, 32 }; - micro_ops_warn(16, 16, 32); - break; - case 0x72: /* 32K micro-ops, 8-way */ - *I1c = (cache_t) { 32, 8, 32 }; - micro_ops_warn(32, 32, 32); - break; - - /* These are sectored, whatever that means */ - case 0x79: *L2c = (cache_t) { 128, 8, 64 }; L2_found = True; break; - case 0x7a: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break; - case 0x7b: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break; - case 0x7c: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break; - case 0x7d: *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; break; - case 0x7e: *L2c = (cache_t) { 256, 8, 128 }; L2_found = True; break; - - case 0x7f: *L2c = (cache_t) { 512, 2, 64 }; L2_found = True; break; - case 0x80: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break; - - case 0x81: *L2c = (cache_t) { 128, 8, 32 }; L2_found = True; break; - case 0x82: *L2c = (cache_t) { 256, 8, 32 }; L2_found = True; break; - case 0x83: *L2c = (cache_t) { 512, 8, 32 }; L2_found = True; break; - case 0x84: *L2c = (cache_t) { 1024, 8, 32 }; L2_found = True; break; - case 0x85: *L2c = (cache_t) { 2048, 8, 32 }; L2_found = True; break; - case 0x86: *L2c = (cache_t) { 512, 4, 64 }; L2_found = True; break; - case 0x87: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break; - - /* Ignore prefetch information */ - case 0xf0: case 0xf1: - break; - - default: - VG_(message)(Vg_DebugMsg, - "warning: Unknown Intel cache config value " - "(0x%x), ignoring", info[i]); - break; - } - } - - if (!L2_found) - VG_(message)(Vg_DebugMsg, - "warning: L2 cache not installed, ignore L2 results."); - - return 0; -} - -/* AMD method is straightforward, just 
extract appropriate bits from the - * result registers. - * - * Bits, for D1 and I1: - * 31..24 data L1 cache size in KBs - * 23..16 data L1 cache associativity (FFh=full) - * 15.. 8 data L1 cache lines per tag - * 7.. 0 data L1 cache line size in bytes - * - * Bits, for L2: - * 31..16 unified L2 cache size in KBs - * 15..12 unified L2 cache associativity (0=off, FFh=full) - * 11.. 8 unified L2 cache lines per tag - * 7.. 0 unified L2 cache line size in bytes - * - * #3 The AMD K7 processor's L2 cache must be configured prior to relying - * upon this information. (Whatever that means -- njn) - * - * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model - * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB), - * so we detect that. - * - * Returns 0 on success, non-zero on failure. - */ -static -Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c) -{ - UInt ext_level; - UInt dummy, model; - UInt I1i, D1i, L2i; - - VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy); - - if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) { - VG_(message)(Vg_UserMsg, - "warning: ext_level < 0x80000006 for AMD processor (0x%x)", - ext_level); - return -1; - } - - VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i); - VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &dummy); - - VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy); - - /* Check for Duron bug */ - if (model == 0x630) { - VG_(message)(Vg_UserMsg, - "Buggy Duron stepping A0. Assuming L2 size=65536 bytes"); - L2i = (64 << 16) | (L2i & 0xffff); - } - - D1c->size = (D1i >> 24) & 0xff; - D1c->assoc = (D1i >> 16) & 0xff; - D1c->line_size = (D1i >> 0) & 0xff; - - I1c->size = (I1i >> 24) & 0xff; - I1c->assoc = (I1i >> 16) & 0xff; - I1c->line_size = (I1i >> 0) & 0xff; - - L2c->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */ - L2c->assoc = (L2i >> 12) & 0xf; - L2c->line_size = (L2i >> 0) & 0xff; - - return 0; -} - -static -Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c) -{ - Int level, ret; - Char vendor_id[13]; - - if (!VG_(has_cpuid)()) { - VG_(message)(Vg_DebugMsg, "CPUID instruction not supported"); - return -1; - } - - VG_(cpuid)(0, &level, (int*)&vendor_id[0], - (int*)&vendor_id[8], (int*)&vendor_id[4]); - vendor_id[12] = '\0'; - - if (0 == level) { - VG_(message)(Vg_DebugMsg, "CPUID level is 0, early Pentium?"); - return -1; - } - - /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */ - if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) { - ret = Intel_cache_info(level, I1c, D1c, L2c); - - } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) { - ret = AMD_cache_info(I1c, D1c, L2c); - - } else { - VG_(message)(Vg_DebugMsg, "CPU vendor ID not recognised (%s)", - vendor_id); - return -1; - } - - /* Successful! Convert sizes from KB to bytes */ - I1c->size *= 1024; - D1c->size *= 1024; - L2c->size *= 1024; - - return ret; -} - - -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c, - Bool all_caches_clo_defined) -{ - Int res; - - // Set caches to default. - *I1c = (cache_t) { 65536, 2, 64 }; - *D1c = (cache_t) { 65536, 2, 64 }; - *L2c = (cache_t) { 262144, 8, 64 }; - - // Then replace with any info we can get from CPUID. - res = get_caches_from_CPUID(I1c, D1c, L2c); - - // Warn if CPUID failed and config not completely specified from cmd line. 
- if (res != 0 && !all_caches_clo_defined) { - VG_(message)(Vg_DebugMsg, - "Warning: Couldn't auto-detect cache config, using one " - "or more defaults "); - } -} +#include "cg-x86.c" /*--------------------------------------------------------------------*/ /*--- end ---*/ diff --git a/cachegrind/cg-x86.c b/cachegrind/cg-x86.c index 164f944e61..7c6b8f362b 100644 --- a/cachegrind/cg-x86.c +++ b/cachegrind/cg-x86.c @@ -1,6 +1,6 @@ /*--------------------------------------------------------------------*/ -/*--- x86-specific definitions. cg-x86.c ---*/ +/*--- x86-specific (and AMD64-specific) definitions. cg-x86.c ---*/ /*--------------------------------------------------------------------*/ /* @@ -113,12 +113,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c) case 0x0a: *D1c = (cache_t) { 8, 2, 32 }; break; case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break; - case 0x0e: - /* Real D1 cache configuration is: - D1c = (cache_t) { 24, 6, 64 }; */ - VG_(message)(Vg_DebugMsg, "warning: 24Kb D1 cache detected, treating as 16Kb"); - *D1c = (cache_t) { 16, 4, 64 }; - break; + case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break; case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break; /* IA-64 info -- panic! */ @@ -149,12 +144,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c) case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break; case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break; case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break; - case 0x48: - /* Real L2 cache configuration is: - *L2c = (cache_t) { 3072, 12, 64 }; L2_found = True; */ - VG_(message)(Vg_DebugMsg, "warning: 3Mb L2 cache detected, treating as 2Mb"); - *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; - break; + case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break; case 0x49: if ((family == 15) && (model == 6)) /* On Xeon MP (family F, model 6), this is for L3 */ @@ -163,12 +153,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c) else *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True; break; - case 0x4e: - /* Real L2 cache configuration is: - *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; */ - VG_(message)(Vg_DebugMsg, "warning: 6Mb L2 cache detected, treating as 4Mb"); - *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True; - break; + case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break; /* These are sectored, whatever that means */ case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */ diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c index a30ba758d3..36ddbab430 100644 --- a/cachegrind/cg_main.c +++ b/cachegrind/cg_main.c @@ -1158,18 +1158,12 @@ static cache_t clo_L2_cache = UNDEFINED_CACHE; static void check_cache(cache_t* cache, Char *name) { - /* First check they're all powers of two */ - if (-1 == VG_(log2)(cache->size)) { + /* Simulator requires line size and set count to be powers of two */ + if (( cache->size % (cache->line_size * cache->assoc) != 0) || + (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) { VG_(message)(Vg_UserMsg, - "error: %s size of %dB not a power of two; aborting.", - name, cache->size); - VG_(exit)(1); - } - - if (-1 == VG_(log2)(cache->assoc)) { - VG_(message)(Vg_UserMsg, - "error: %s associativity of %d not a power of two; aborting.", - name, cache->assoc); + "error: %s set count not a power of two; aborting.", + name); VG_(exit)(1); } diff --git a/cachegrind/cg_sim.c b/cachegrind/cg_sim.c index 6edf12628f..25bef497ad 100644 
--- a/cachegrind/cg_sim.c +++ b/cachegrind/cg_sim.c @@ -44,7 +44,6 @@ typedef struct { Int line_size; /* bytes */ Int sets; Int sets_min_1; - Int assoc_bits; Int line_size_bits; Int tag_shift; Char desc_line[128]; @@ -62,7 +61,6 @@ static void cachesim_initcache(cache_t config, cache_t2* c) c->sets = (c->size / c->line_size) / c->assoc; c->sets_min_1 = c->sets - 1; - c->assoc_bits = VG_(log2)(c->assoc); c->line_size_bits = VG_(log2)(c->line_size); c->tag_shift = c->line_size_bits + VG_(log2)(c->sets); @@ -111,8 +109,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \ /* First case: word entirely within line. */ \ if (set1 == set2) { \ \ - /* Shifting is a bit faster than multiplying */ \ - set = &(L.tags[set1 << L.assoc_bits]); \ + set = &(L.tags[set1 * L.assoc]); \ \ /* This loop is unrolled for just the first case, which is the most */\ /* common. We can't unroll any further because it would screw up */\ @@ -143,7 +140,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \ /* Second case: word straddles two lines. */ \ /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ } else if (((set1 + 1) & (L.sets-1)) == set2) { \ - set = &(L.tags[set1 << L.assoc_bits]); \ + set = &(L.tags[set1 * L.assoc]); \ if (tag == set[0]) { \ goto block2; \ } \ @@ -162,7 +159,7 @@ void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \ set[0] = tag; \ is_miss = True; \ block2: \ - set = &(L.tags[set2 << L.assoc_bits]); \ + set = &(L.tags[set2 * L.assoc]); \ tag2 = (a+size-1) >> L.tag_shift; \ if (tag2 == set[0]) { \ goto miss_treatment; \ diff --git a/cachegrind/docs/cg-manual.xml b/cachegrind/docs/cg-manual.xml index f65272bbc2..512eeb409b 100644 --- a/cachegrind/docs/cg-manual.xml +++ b/cachegrind/docs/cg-manual.xml @@ -142,7 +142,7 @@ follows: - Bit-selection hash function: the line(s) in the cache + Bit-selection hash function: the set of line(s) in the cache to which a memory block maps is chosen by the middle bits M--(M+N-1) of the byte address, where: @@ -150,15 +150,17 @@ follows: line size = 2^M bytes - (cache size / line size) = 2^N bytes + (cache size / line size / associativity) = 2^N bytes - Inclusive L2 cache: the L2 cache replicates all the - entries of the L1 cache. This is standard on Pentium chips, - but AMD Opterons, Athlons and Durons + Inclusive L2 cache: the L2 cache typically replicates all + the entries of the L1 caches, because fetching into L1 involves + fetching into L2 first (this does not guarantee strict inclusiveness, + as lines evicted from L2 still could reside in L1). This is + standard on Pentium chips, but AMD Opterons, Athlons and Durons use an exclusive L2 cache that only holds blocks evicted from L1. Ditto most modern VIA CPUs. @@ -176,7 +178,10 @@ happens. You can manually specify one, two or all three levels (I1/D1/L2) of the cache from the command line using the --I1, --D1 and ---L2 options. +--L2 options. +For cache parameters to be valid for simulation, the number +of sets (with associativity being the number of cache lines in +each set) has to be a power of two. On PowerPC platforms Cachegrind cannot automatically @@ -227,10 +232,7 @@ need to specify it with the If you are interested in simulating a cache with different properties, it is not particularly hard to write your own cache simulator, or to modify the existing ones in -vg_cachesim_I1.c, -vg_cachesim_D1.c, -vg_cachesim_L2.c and -vg_cachesim_gen.c. We'd be +cg_sim.c. We'd be interested to hear from anyone who does. 
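The set-count rule documented above is easy to check by hand. The following minimal C sketch (illustrative helper names, not Valgrind code) mirrors it, using the 24 KB, 6-way, 64 B-line D1 configuration exercised by the new notpower2 tests:

#include <stdio.h>

/* Minimal sketch of the rule above: the cache size must divide evenly into
   assoc-sized sets, and the resulting set count must be a power of two.
   Helper names are illustrative; this is not the Valgrind implementation. */
static int is_pow2(long x) { return x > 0 && (x & (x - 1)) == 0; }

static int valid_cache(long size, long assoc, long line_size)
{
    if (size % (line_size * assoc) != 0)
        return 0;
    return is_pow2(size / line_size / assoc);
}

int main(void)
{
    /* D1 from the new notpower2 tests: 24576 B, 6-way, 64 B lines
       -> 24576 / 64 / 6 = 64 sets, a power of two, so accepted. */
    printf("24576,6,64 -> %s\n", valid_cache(24576, 6, 64) ? "ok" : "rejected");
    /* A 24 KB, 4-way cache would give 96 sets and be rejected. */
    printf("24576,4,64 -> %s\n", valid_cache(24576, 4, 64) ? "ok" : "rejected");
    return 0;
}

A 24 KB, 4-way configuration yields 96 sets and is refused, which matches the "set count not a power of two" error now produced by check_cache().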
diff --git a/cachegrind/tests/Makefile.am b/cachegrind/tests/Makefile.am index 4a40e770d6..de669623e7 100644 --- a/cachegrind/tests/Makefile.am +++ b/cachegrind/tests/Makefile.am @@ -24,6 +24,7 @@ EXTRA_DIST = $(noinst_SCRIPTS) \ chdir.vgtest chdir.stderr.exp \ clreq.vgtest clreq.stderr.exp \ dlclose.vgtest dlclose.stderr.exp dlclose.stdout.exp \ + notpower2.vgtest notpower2.stderr.exp \ wrap5.vgtest wrap5.stderr.exp wrap5.stdout.exp check_PROGRAMS = \ diff --git a/cachegrind/tests/notpower2.stderr.exp b/cachegrind/tests/notpower2.stderr.exp new file mode 100644 index 0000000000..8eaf65446c --- /dev/null +++ b/cachegrind/tests/notpower2.stderr.exp @@ -0,0 +1,17 @@ + + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git a/cachegrind/tests/notpower2.vgtest b/cachegrind/tests/notpower2.vgtest new file mode 100644 index 0000000000..132cfe5923 --- /dev/null +++ b/cachegrind/tests/notpower2.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 +cleanup: rm cachegrind.out.* diff --git a/callgrind/sim.c b/callgrind/sim.c index 506ed9e400..6e9de48966 100644 --- a/callgrind/sim.c +++ b/callgrind/sim.c @@ -74,7 +74,6 @@ typedef struct { Bool sectored; /* prefetch nearside cacheline on read */ int sets; int sets_min_1; - int assoc_bits; int line_size_bits; int tag_shift; UWord tag_mask; @@ -195,7 +194,6 @@ static void cachesim_initcache(cache_t config, cache_t2* c) c->sets = (c->size / c->line_size) / c->assoc; c->sets_min_1 = c->sets - 1; - c->assoc_bits = VG_(log2)(c->assoc); c->line_size_bits = VG_(log2)(c->line_size); c->tag_shift = c->line_size_bits + VG_(log2)(c->sets); c->tag_mask = ~((1<tag_shift)-1); @@ -259,8 +257,7 @@ CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag) int i, j; UWord *set; - /* Shifting is a bit faster than multiplying */ - set = &(c->tags[set_no << c->assoc_bits]); + set = &(c->tags[set_no * c->assoc]); /* This loop is unrolled for just the first case, which is the most */ /* common. We can't unroll any further because it would screw up */ @@ -359,8 +356,7 @@ CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag) int i, j; UWord *set, tmp_tag; - /* Shifting is a bit faster than multiplying */ - set = &(c->tags[set_no << c->assoc_bits]); + set = &(c->tags[set_no * c->assoc]); /* This loop is unrolled for just the first case, which is the most */ /* common. We can't unroll any further because it would screw up */ @@ -407,7 +403,7 @@ CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size) /* Access straddles two lines. */ /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ else if (((set1 + 1) & (c->sets-1)) == set2) { - UWord tag2 = (a+size-1) >> c->tag_shift; + UWord tag2 = (a+size-1) & c->tag_mask; /* the call updates cache structures as side effect */ CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag); @@ -676,7 +672,7 @@ void cacheuse_initcache(cache_t2* c) /* We use lower tag bits as offset pointers to cache use info. * I.e. some cache parameters don't work. 
*/ - if (c->tag_shift < c->assoc_bits) { + if ( (1<tag_shift) < c->assoc) { VG_(message)(Vg_DebugMsg, "error: Use associativity < %d for cache use statistics!", (1<tag_shift) ); @@ -684,102 +680,6 @@ void cacheuse_initcache(cache_t2* c) } } -/* FIXME: A little tricky */ -#if 0 - -static __inline__ -void cacheuse_update_hit(cache_t2* c, UInt high_idx, UInt low_idx, UInt use_mask) -{ - int idx = (high_idx << c->assoc_bits) | low_idx; - - c->use[idx].count ++; - c->use[idx].mask |= use_mask; - - CLG_DEBUG(6," Hit [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", - idx, c->loaded[idx].memline, c->loaded[idx].iaddr, - use_mask, c->use[idx].mask, c->use[idx].count); -} - -/* only used for I1, D1 */ - -static __inline__ -CacheResult cacheuse_setref(cache_t2* c, UInt set_no, UWord tag) -{ - int i, j, idx; - UWord *set, tmp_tag; - UInt use_mask; - - /* Shifting is a bit faster than multiplying */ - set = &(c->tags[set_no << c->assoc_bits]); - use_mask = - c->line_start_mask[a & c->line_size_mask] & - c->line_end_mask[(a+size-1) & c->line_size_mask]; - - /* This loop is unrolled for just the first case, which is the most */ - /* common. We can't unroll any further because it would screw up */ - /* if we have a direct-mapped (1-way) cache. */ - if (tag == (set[0] & c->tag_mask)) { - cacheuse_update(c, set_no, set[0] & ~c->tag_mask, use_mask); - return L1_Hit; - } - - /* If the tag is one other than the MRU, move it into the MRU spot */ - /* and shuffle the rest down. */ - for (i = 1; i < c->assoc; i++) { - if (tag == (set[i] & c->tag_mask)) { - tmp_tag = set[i]; - for (j = i; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tmp_tag; - - cacheuse_update(c, set_no, tmp_tag & ~c->tag_mask, use_mask); - return L1_Hit; - } - } - - /* A miss; install this tag as MRU, shuffle rest down. */ - tmp_tag = set[L.assoc - 1] & ~c->tag_mask; - for (j = c->assoc - 1; j > 0; j--) { - set[j] = set[j - 1]; - } - set[0] = tag | tmp_tag; - - cacheuse_L2_miss(c, (set_no << c->assoc_bits) | tmp_tag, - use_mask, a & ~c->line_size_mask); - - return Miss; -} - - -static CacheResult cacheuse_ref(cache_t2* c, Addr a, UChar size) -{ - UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1); - UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1); - UWord tag = a >> c->tag_shift; - - /* Access entirely within line. */ - if (set1 == set2) - return cacheuse_setref(c, set1, tag); - - /* Access straddles two lines. */ - /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ - else if (((set1 + 1) & (c->sets-1)) == set2) { - UWord tag2 = a >> c->tag_shift; - - /* the call updates cache structures as side effect */ - CacheResult res1 = cacheuse_isMiss(c, set1, tag); - CacheResult res2 = cacheuse_isMiss(c, set2, tag2); - return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; - - } else { - VG_(printf)("addr: %x size: %u sets: %d %d", a, size, set1, set2); - VG_(tool_panic)("item straddles more than two cache sets"); - } - return Hit; -} -#endif - /* for I1/D1 caches */ #define CACHEUSE(L) \ @@ -800,8 +700,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ /* First case: word entirely within line. */ \ if (set1 == set2) { \ \ - /* Shifting is a bit faster than multiplying */ \ - set = &(L.tags[set1 << L.assoc_bits]); \ + set = &(L.tags[set1 * L.assoc]); \ use_mask = L.line_start_mask[a & L.line_size_mask] & \ L.line_end_mask[(a+size-1) & L.line_size_mask]; \ \ @@ -809,7 +708,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ /* common. 
We can't unroll any further because it would screw up */\ /* if we have a direct-mapped (1-way) cache. */\ if (tag == (set[0] & L.tag_mask)) { \ - idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -826,7 +725,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ set[j] = set[j - 1]; \ } \ set[0] = tmp_tag; \ - idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -842,7 +741,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ set[j] = set[j - 1]; \ } \ set[0] = tag | tmp_tag; \ - idx = (set1 << L.assoc_bits) | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ return update_##L##_use(&L, idx, \ use_mask, a &~ L.line_size_mask); \ \ @@ -850,10 +749,10 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ } else if (((set1 + 1) & (L.sets-1)) == set2) { \ Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \ - set = &(L.tags[set1 << L.assoc_bits]); \ + set = &(L.tags[set1 * L.assoc]); \ use_mask = L.line_start_mask[a & L.line_size_mask]; \ if (tag == (set[0] & L.tag_mask)) { \ - idx = (set1 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -868,7 +767,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ set[j] = set[j - 1]; \ } \ set[0] = tmp_tag; \ - idx = (set1 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -882,15 +781,15 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ set[j] = set[j - 1]; \ } \ set[0] = tag | tmp_tag; \ - idx = (set1 << L.assoc_bits) | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ miss1 = update_##L##_use(&L, idx, \ use_mask, a &~ L.line_size_mask); \ block2: \ - set = &(L.tags[set2 << L.assoc_bits]); \ + set = &(L.tags[set2 * L.assoc]); \ use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \ tag2 = (a+size-1) & L.tag_mask; \ if (tag2 == (set[0] & L.tag_mask)) { \ - idx = (set2 << L.assoc_bits) | (set[0] & ~L.tag_mask); \ + idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -905,7 +804,7 @@ block2: \ set[j] = set[j - 1]; \ } \ set[0] = tmp_tag; \ - idx = (set2 << L.assoc_bits) | (tmp_tag & ~L.tag_mask); \ + idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \ L.use[idx].count ++; \ L.use[idx].mask |= use_mask; \ CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\ @@ -919,7 +818,7 @@ block2: \ set[j] = set[j - 1]; \ } \ set[0] = tag2 | tmp_tag; \ - idx = (set2 << L.assoc_bits) | tmp_tag; \ + idx = (set2 * L.assoc) + tmp_tag; \ miss2 = update_##L##_use(&L, idx, \ use_mask, (a+size-1) &~ L.line_size_mask); \ return (miss1==MemAccess || miss2==MemAccess) ? 
MemAccess:L2_Hit; \ @@ -984,7 +883,7 @@ static CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) { UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1); - UWord* set = &(L2.tags[setNo << L2.assoc_bits]); + UWord* set = &(L2.tags[setNo * L2.assoc]); UWord tag = memline & L2.tag_mask; int i, j, idx; @@ -993,7 +892,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo); if (tag == (set[0] & L2.tag_mask)) { - idx = (setNo << L2.assoc_bits) | (set[0] & ~L2.tag_mask); + idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask); l1_loaded->dep_use = &(L2.use[idx]); CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n", @@ -1008,7 +907,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) set[j] = set[j - 1]; } set[0] = tmp_tag; - idx = (setNo << L2.assoc_bits) | (tmp_tag & ~L2.tag_mask); + idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask); l1_loaded->dep_use = &(L2.use[idx]); CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n", @@ -1024,7 +923,7 @@ CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded) set[j] = set[j - 1]; } set[0] = tag | tmp_tag; - idx = (setNo << L2.assoc_bits) | tmp_tag; + idx = (setNo * L2.assoc) + tmp_tag; l1_loaded->dep_use = &(L2.use[idx]); update_L2_use(idx, memline); @@ -1380,22 +1279,15 @@ static cache_t clo_L2_cache = UNDEFINED_CACHE; static void check_cache(cache_t* cache, Char *name) { - /* First check they're all powers of two */ - if (-1 == VG_(log2)(cache->size)) { - VG_(message)(Vg_UserMsg, - "error: %s size of %dB not a power of two; aborting.", - name, cache->size); - VG_(exit)(1); - } - - if (-1 == VG_(log2)(cache->assoc)) { + /* Simulator requires line size and set count to be powers of two */ + if (( cache->size % (cache->line_size * cache->assoc) != 0) || + (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) { VG_(message)(Vg_UserMsg, - "error: %s associativity of %d not a power of two; aborting.", - name, cache->assoc); - VG_(exit)(1); + "error: %s set count not a power of two; aborting.", + name); } - if (-1 == VG_(log2)(cache->line_size)) { + if (-1 == VG_(log2)(cache->line_size)) { VG_(message)(Vg_UserMsg, "error: %s line size of %dB not a power of two; aborting.", name, cache->line_size); diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am index b95cfc3cf9..1c3f42c070 100644 --- a/callgrind/tests/Makefile.am +++ b/callgrind/tests/Makefile.am @@ -11,6 +11,10 @@ EXTRA_DIST = $(noinst_SCRIPTS) \ simwork1.vgtest simwork1.stdout.exp simwork1.stderr.exp \ simwork2.vgtest simwork2.stdout.exp simwork2.stderr.exp \ simwork3.vgtest simwork3.stdout.exp simwork3.stderr.exp \ + notpower2.vgtest notpower2.stderr.exp \ + notpower2-wb.vgtest notpower2-wb.stderr.exp \ + notpower2-hwpref.vgtest notpower2-hwpref.stderr.exp \ + notpower2-use.vgtest notpower2-use.stderr.exp \ threads.vgtest threads.stderr.exp check_PROGRAMS = clreq simwork threads diff --git a/callgrind/tests/notpower2-hwpref.stderr.exp b/callgrind/tests/notpower2-hwpref.stderr.exp new file mode 100644 index 0000000000..0705c1c849 --- /dev/null +++ b/callgrind/tests/notpower2-hwpref.stderr.exp @@ -0,0 +1,20 @@ + + +Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw +Collected : + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git 
a/callgrind/tests/notpower2-hwpref.vgtest b/callgrind/tests/notpower2-hwpref.vgtest new file mode 100644 index 0000000000..9da7dced2a --- /dev/null +++ b/callgrind/tests/notpower2-hwpref.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes +cleanup: rm callgrind.out.* diff --git a/callgrind/tests/notpower2-use.stderr.exp b/callgrind/tests/notpower2-use.stderr.exp new file mode 100644 index 0000000000..ea9acc89b5 --- /dev/null +++ b/callgrind/tests/notpower2-use.stderr.exp @@ -0,0 +1,20 @@ + + +Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2 +Collected : + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git a/callgrind/tests/notpower2-use.vgtest b/callgrind/tests/notpower2-use.vgtest new file mode 100644 index 0000000000..b8312a76be --- /dev/null +++ b/callgrind/tests/notpower2-use.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes +cleanup: rm callgrind.out.* diff --git a/callgrind/tests/notpower2-wb.stderr.exp b/callgrind/tests/notpower2-wb.stderr.exp new file mode 100644 index 0000000000..90da3e4cec --- /dev/null +++ b/callgrind/tests/notpower2-wb.stderr.exp @@ -0,0 +1,20 @@ + + +Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw +Collected : + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git a/callgrind/tests/notpower2-wb.vgtest b/callgrind/tests/notpower2-wb.vgtest new file mode 100644 index 0000000000..34a1f6b335 --- /dev/null +++ b/callgrind/tests/notpower2-wb.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes +cleanup: rm callgrind.out.* diff --git a/callgrind/tests/notpower2.stderr.exp b/callgrind/tests/notpower2.stderr.exp new file mode 100644 index 0000000000..0705c1c849 --- /dev/null +++ b/callgrind/tests/notpower2.stderr.exp @@ -0,0 +1,20 @@ + + +Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw +Collected : + +I refs: +I1 misses: +L2i misses: +I1 miss rate: +L2i miss rate: + +D refs: +D1 misses: +L2d misses: +D1 miss rate: +L2d miss rate: + +L2 refs: +L2 misses: +L2 miss rate: diff --git a/callgrind/tests/notpower2.vgtest b/callgrind/tests/notpower2.vgtest new file mode 100644 index 0000000000..73823d7493 --- /dev/null +++ b/callgrind/tests/notpower2.vgtest @@ -0,0 +1,3 @@ +prog: ../../tests/true +vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 +cleanup: rm callgrind.out.*
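A change that recurs throughout cg_sim.c and sim.c above is replacing 'set_no << assoc_bits' with 'set_no * assoc', since assoc_bits only exists when the associativity is a power of two. The standalone sketch below (simplified types, no MRU shuffling; names are illustrative, not Valgrind's) shows the flat tag-array layout that this indexing assumes:

#include <stdlib.h>

/* Simplified model of the tag-array layout the patched code relies on:
   sets * assoc tags stored contiguously, one run of 'assoc' tags per set.
   Because assoc may not be a power of two, the base of a set is found by
   multiplication (set_no * assoc) rather than a shift by assoc_bits. */
typedef struct {
    int assoc;             /* lines per set; any value >= 1       */
    int sets;              /* still required to be a power of two */
    unsigned long *tags;
} toy_cache;

static int toy_is_hit(const toy_cache *c, int set_no, unsigned long tag)
{
    const unsigned long *set = &c->tags[(size_t)set_no * c->assoc];
    for (int i = 0; i < c->assoc; i++)
        if (set[i] == tag)
            return 1;      /* hit */
    return 0;              /* miss; the real simulator would install
                              the tag as MRU and shuffle the rest down */
}

int main(void)
{
    toy_cache c = { .assoc = 6, .sets = 64, .tags = NULL };
    c.tags = calloc((size_t)c.sets * (size_t)c.assoc, sizeof *c.tags);
    if (!c.tags)
        return 1;
    (void)toy_is_hit(&c, 3, 0x1234);   /* cold cache: a miss */
    free(c.tags);
    return 0;
}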