From: Florian Krohm Date: Sun, 7 Oct 2012 19:47:04 +0000 (+0000) Subject: This patch is the first installment of the cache info reorganisation. X-Git-Tag: svn/VALGRIND_3_9_0~640 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a9b2103cf2d5335903ab2db6aeafb56d5cf323f2;p=thirdparty%2Fvalgrind.git This patch is the first installment of the cache info reorganisation. It's reorg only. No new cache autodetection stuff has been added. coregrind: pub_tool_cpuid.h is removed as it is no longer exposed to tools. Its contents have moved to pub_core_cpuid.h. New file: coregrind/m_cache.c to contain the autodetect code for cache configurations and define other cache characteristics that cannot be autodetected (i.e. icaches_maintain_coherence). Most of cachegrind/cg-x86-amd64.c was moved here. The cache detection code for x86-64 needs to be fixed to properly initialise VexCacheInfo. It currently has cachegrind bias. m_cache.c exports a single function (to coregrind): VG_(machine_get_cache_info)(VexArchInfo *vai) This function is called from VG_(machine_get_hwcaps) after hwcaps have been detected. cachegrind: Remove cachegrind/cg-{ppc32,ppc64,arm,mips32,s390x,x86-amd64}.c With the exception of x86/amd64 those were only establishing a default cache configuration and that is so small a code snippet that a separate file is no longer warranted. So, the code was moved to cg-arch.c. Code was added to extract the relevant info from x86-amd64. New function maybe_tweak_LLc which captures the code to massage the LLc cache configuration into something the simulator can handle. This was originally in cg-x86-amd64.c but should be used for all architectures. Changed warning message about missing cache auto-detect feature to be more useful. Adapted filter-stderr scripts accordingly. 
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13028 --- diff --git a/cachegrind/Makefile.am b/cachegrind/Makefile.am index f22fe17307..43a44af6c8 100644 --- a/cachegrind/Makefile.am +++ b/cachegrind/Makefile.am @@ -41,13 +41,7 @@ endif CACHEGRIND_SOURCES_COMMON = \ cg_main.c \ - cg-arch.c \ - cg-x86-amd64.c \ - cg-ppc32.c \ - cg-ppc64.c \ - cg-arm.c \ - cg-s390x.c \ - cg-mips32.c + cg-arch.c cachegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES = \ $(CACHEGRIND_SOURCES_COMMON) @@ -88,5 +82,3 @@ cachegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LINK = \ $(cachegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS) \ $(cachegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS) endif - - diff --git a/cachegrind/cg-arch.c b/cachegrind/cg-arch.c index 4afaab6997..c6847d99e9 100644 --- a/cachegrind/cg-arch.c +++ b/cachegrind/cg-arch.c @@ -1,8 +1,5 @@ /*--------------------------------------------------------------------*/ -/*--- Cachegrind: cache configuration. ---*/ -/*--- The architecture specific void VG_(configure_caches) are ---*/ -/*--- located in the cg-.c files. ---*/ -/*--- cg-arch.c ---*/ +/*--- Cachegrind: cache configuration. cg-arch.c ---*/ /*--------------------------------------------------------------------*/ /* @@ -35,9 +32,13 @@ #include "pub_tool_libcbase.h" #include "pub_tool_libcprint.h" #include "pub_tool_options.h" +#include "pub_tool_machine.h" #include "cg_arch.h" +static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc, + Bool all_caches_clo_defined); + // Checks cache config is ok. Returns NULL if ok, or a pointer to an error // string otherwise. static Char* check_cache(cache_t* cache) @@ -157,6 +158,65 @@ static void check_cache_or_override(Char* desc, cache_t* c, Bool clo_redefined) } } + +/* If the LL cache config isn't something the simulation functions + can handle, try to adjust it so it is. 
Caches are characterised + by (total size T, line size L, associativity A), and then we + have + + number of sets S = T / (L * A) + + The required constraints are: + + * L must be a power of 2, but it always is in practice, so + no problem there + + * A can be any value >= 1 + + * T can be any value, but .. + + * S must be a power of 2. + + That sometimes gives a problem. For example, some Core iX based + Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288 + sets. The "fix" in this case is to increase the associativity + by 50% to 24, which reduces the number of sets to 8192, making + it a power of 2. That's what the following code does (handling + the "3/2 rescaling case".) We might need to deal with other + ratios later (5/4 ?). + + The "fix" is "justified" (cough, cough) by alleging that + increases of associativity above about 4 have very little effect + on the actual miss rate. It would be far more inaccurate to + fudge this by changing the size of the simulated cache -- + changing the associativity is a much better option. +*/ + +static void +maybe_tweak_LLc(cache_t *LLc) +{ + if (LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0) { + Long nSets = (Long)LLc->size / (Long)(LLc->line_size * LLc->assoc); + if (/* stay sane */ + nSets >= 4 + /* nSets is not a power of 2 */ + && VG_(log2_64)( (ULong)nSets ) == -1 + /* nSets is exactly 1.5 times a power of 2; the modulo test rejects values such as 7 or 13, for which the truncating division below would still land on a power of 2 */ + && (nSets % 3) == 0 && VG_(log2_64)( (ULong)((2 * nSets) / (Long)3) ) != -1 + /* associativity can be increased by exactly 50% */ + && (LLc->assoc % 2) == 0 + ) { + /* # sets is 1.5 * a power of two, but the associativity is + even, so we can increase that by 50% and implicitly + scale the # sets down accordingly. 
*/ + Int new_assoc = LLc->assoc + (LLc->assoc / 2); + VG_(dmsg)("warning: pretending that LL cache has associativity" + " %d instead of actual %d\n", new_assoc, LLc->assoc); + LLc->assoc = new_assoc; + } + } +} + void VG_(post_clo_init_configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc, @@ -174,7 +234,9 @@ void VG_(post_clo_init_configure_caches)(cache_t* I1c, // Set the cache config (using auto-detection, if supported by the // architecture). - VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined ); + configure_caches( I1c, D1c, LLc, all_caches_clo_defined ); + + maybe_tweak_LLc( LLc ); // Check the default/auto-detected values. // Allow the user to override invalid auto-detected caches @@ -206,3 +268,172 @@ void VG_(print_cache_clo_opts)() " --LL=,, set LL cache manually\n" ); } + + +// Traverse the cache info and return a cache of the given kind and level. +// Return NULL if no such cache exists. +static const VexCache * +locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level) +{ + const VexCache *c; + + for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) { + if (c->level == level && c->kind == kind) { + return c; + } + } + return NULL; // not found +} + + +// Gives the auto-detected configuration of I1, D1 and LL caches. They get +// overridden by any cache configurations specified on the command line. 
+static void +configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc, + Bool all_caches_clo_defined) +{ + VexArchInfo vai; + const VexCacheInfo *ci; + const VexCache *i1, *d1, *ll; + + VG_(machine_get_VexArchInfo)(NULL, &vai); + ci = &vai.hwcache_info; + + // Extract what we need + i1 = locate_cache(ci, INSN_CACHE, 1); + d1 = locate_cache(ci, DATA_CACHE, 1); + // FIXME: needs clarification for num_levels > 3 see also warning below + // FIXME: whether it needs adjustment + ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels); + + if (ll && ci->num_levels > 2) { + VG_(dmsg)("warning: L%u cache found, using its data for the " + "LL simulation.\n", ci->num_levels); + } + + if (i1 && d1 && ll) { + *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB }; + *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB }; + *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB }; + + return; + } + + // Cache information could not be queried; fall back to an + // architecture-specific default configuration. + +#if defined(VGA_ppc32) + + // Default cache configuration + *I1c = (cache_t) { 65536, 2, 64 }; + *D1c = (cache_t) { 65536, 2, 64 }; + *LLc = (cache_t) { 262144, 8, 64 }; + +#elif defined(VGA_ppc64) + + // Default cache configuration + *I1c = (cache_t) { 65536, 2, 64 }; + *D1c = (cache_t) { 65536, 2, 64 }; + *LLc = (cache_t) { 262144, 8, 64 }; + +#elif defined(VGA_arm) + + // Set caches to default (for Cortex-A8 ?) 
+ *I1c = (cache_t) { 16384, 4, 64 }; + *D1c = (cache_t) { 16384, 4, 64 }; + *LLc = (cache_t) { 262144, 8, 64 }; + +#elif defined(VGA_s390x) + // z900 + // + // Source: + // The microarchitecture of the IBM eServer z900 processor + // IBM Journal of Research and Development + // Volume 46, Number 4/5, pp 381-395, July/September 2002 + // + // Split L1 I/D cache + // Size: 256 kB each + // Line size: 256 bytes + // 4-way set associative + // L2 cache: 16 MB x 2 (16 MB per 10 CPs) (Charles Webb) + + // z800 + // + // Source: Charles Webb from IBM + // + // Split L1 I/D cache + // Size: 256 kB each + // Line size: 256 bytes + // 4-way set associative + // L2 cache: 16 MB (or half that size) + + // z990 + // + // The IBM eServer z990 microprocessor + // IBM Journal of Research and Development + // Volume 48, Number 3/4, pp 295-309, May/July 2004 + // + // Split L1 I/D cache + // Size: 256 kB each + // Line size: 256 bytes + // 4-way set associative + // L2 cache: 32 MB x 4 (32 MB per book/node) (Charles Webb) + + // z890 + // + // Source: Charles Webb from IBM + // + // Split L1 I/D cache + // Size: 256 kB each + // Line size: 256 bytes + // 4-way set associative + // L2 cache: 32 MB (or half that size) + + // z9 + // + // Source: Charles Webb from IBM + // + // Split L1 I/D cache + // Size: 256 kB each + // Line size: 256 bytes + // 4-way set associative + // L2 cache: 40 MB x 4 (40 MB per book/node) + + // fixs390: have a table for all models we support and check + // fixs390: VEX_S390X_MODEL(hwcaps) + + // Default cache configuration is z10-EC (Source: ECAG insn) + *I1c = (cache_t) { 65536, 4, 256 }; + *D1c = (cache_t) { 131072, 8, 256 }; + *LLc = (cache_t) { 50331648, 24, 256 }; + +#elif defined(VGA_mips32) + + // Set caches to default (for MIPS32-r2(mips 74kc)) + *I1c = (cache_t) { 32768, 4, 32 }; + *D1c = (cache_t) { 32768, 4, 32 }; + *LLc = (cache_t) { 524288, 8, 32 }; + +#elif defined(VGA_x86) || defined(VGA_amd64) + + *I1c = (cache_t) { 65536, 2, 64 }; + *D1c = 
(cache_t) { 65536, 2, 64 }; + *LLc = (cache_t) { 262144, 8, 64 }; + +#else + +#error "Unknown arch" + +#endif + + if (!all_caches_clo_defined) { + const char warning[] = + "Warning: Cannot auto-detect cache config, using defaults.\n" + " Run with -v to see.\n"; + VG_(dmsg)("%s", warning); + } +} + +/*--------------------------------------------------------------------*/ +/*--- end ---*/ +/*--------------------------------------------------------------------*/ diff --git a/cachegrind/cg-arm.c b/cachegrind/cg-arm.c deleted file mode 100644 index 00badcd95f..0000000000 --- a/cachegrind/cg-arm.c +++ /dev/null @@ -1,59 +0,0 @@ - -/*--------------------------------------------------------------------*/ -/*--- ARM-specific definitions. cg-arm.c ---*/ -/*--------------------------------------------------------------------*/ - -/* - This file is part of Cachegrind, a Valgrind tool for cache - profiling programs. - - Copyright (C) 2005-2012 Johan Bjork - jbjoerk@gmail.com - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307, USA. - - The GNU General Public License is contained in the file COPYING. 
-*/ - -#if defined(VGA_arm) - -#include "pub_tool_basics.h" -#include "pub_tool_libcbase.h" -#include "pub_tool_libcassert.h" -#include "pub_tool_libcprint.h" - -#include "cg_arch.h" - -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc, - Bool all_caches_clo_defined) -{ - // Set caches to default (for Cortex-A8 ?) - *I1c = (cache_t) { 16384, 4, 64 }; - *D1c = (cache_t) { 16384, 4, 64 }; - *LLc = (cache_t) { 262144, 8, 64 }; - - if (!all_caches_clo_defined) { - VG_(message)(Vg_DebugMsg, - "Warning: Cannot auto-detect cache config on ARM, using one " - "or more defaults\n"); - } -} - -#endif // #if defined(VGA_arm) - -/*--------------------------------------------------------------------*/ -/*--- end cg-arm.c ---*/ -/*--------------------------------------------------------------------*/ diff --git a/cachegrind/cg-mips32.c b/cachegrind/cg-mips32.c deleted file mode 100644 index 5ad69c2e51..0000000000 --- a/cachegrind/cg-mips32.c +++ /dev/null @@ -1,59 +0,0 @@ - -/*--------------------------------------------------------------------*/ -/*--- MIPS-specific definitions. cg-mips32.c ---*/ -/*--------------------------------------------------------------------*/ - -/* - This file is part of Cachegrind, a Valgrind tool for cache - profiling programs. - - Copyright (C) 2010-2012 RT-RK - mips-valgrind@rt-rk.com - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307, USA. - - The GNU General Public License is contained in the file COPYING. -*/ - -#if defined(VGA_mips32) - -#include "pub_tool_basics.h" -#include "pub_tool_libcbase.h" -#include "pub_tool_libcassert.h" -#include "pub_tool_libcprint.h" - -#include "cg_arch.h" - -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c, - Bool all_caches_clo_defined) -{ - // Set caches to default (for MIPS32-r2(mips 74kc)) - *I1c = (cache_t) { 32768, 4, 32 }; - *D1c = (cache_t) { 32768, 4, 32 }; - *L2c = (cache_t) { 524288, 8, 32 }; - - if (!all_caches_clo_defined) { - VG_(message)(Vg_DebugMsg, - "Warning: Cannot auto-detect cache config on MIPS32, using one " - "or more defaults\n"); - } -} - -#endif // #if defined(VGA_mips32) - -/*--------------------------------------------------------------------*/ -/*--- end cg-mips32.c ---*/ -/*--------------------------------------------------------------------*/ diff --git a/cachegrind/cg-ppc32.c b/cachegrind/cg-ppc32.c deleted file mode 100644 index d0386d690f..0000000000 --- a/cachegrind/cg-ppc32.c +++ /dev/null @@ -1,68 +0,0 @@ - -/*--------------------------------------------------------------------*/ -/*--- PPC32-specific definitions. cg-ppc32.c ---*/ -/*--------------------------------------------------------------------*/ - -/* - This file is part of Cachegrind, a Valgrind tool for cache - profiling programs. - - Copyright (C) 2005-2012 Nicholas Nethercote - njn@valgrind.org - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307, USA. - - The GNU General Public License is contained in the file COPYING. -*/ - -#if defined(VGA_ppc32) - -#include "pub_tool_basics.h" -#include "pub_tool_libcbase.h" -#include "pub_tool_libcassert.h" -#include "pub_tool_libcprint.h" - -#include "cg_arch.h" - -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc, - Bool all_caches_clo_defined) -{ - // Set caches to default. - *I1c = (cache_t) { 65536, 2, 64 }; - *D1c = (cache_t) { 65536, 2, 64 }; - *LLc = (cache_t) { 262144, 8, 64 }; - - // Warn if config not completely specified from cmd line. Note that - // this message is slightly different from the one we give on x86/AMD64 - // when auto-detection fails; this lets us filter out this one (which is - // not important) in the regression test suite without filtering the - // x86/AMD64 one (which we want to see if it ever occurs in the - // regression test suite). - // - // If you change this message, please update - // cachegrind/tests/filter_stderr! 
- // - if (!all_caches_clo_defined) { - VG_(dmsg)("Warning: Cannot auto-detect cache config on PPC32, using one " - "or more defaults\n"); - } -} - -#endif // defined(VGA_ppc32) - -/*--------------------------------------------------------------------*/ -/*--- end ---*/ -/*--------------------------------------------------------------------*/ diff --git a/cachegrind/cg-ppc64.c b/cachegrind/cg-ppc64.c deleted file mode 100644 index e594b997b1..0000000000 --- a/cachegrind/cg-ppc64.c +++ /dev/null @@ -1,68 +0,0 @@ - -/*--------------------------------------------------------------------*/ -/*--- PPC64-specific definitions. cg-ppc64.c ---*/ -/*--------------------------------------------------------------------*/ - -/* - This file is part of Cachegrind, a Valgrind tool for cache - profiling programs. - - Copyright (C) 2005-2012 Nicholas Nethercote - njn@valgrind.org - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307, USA. - - The GNU General Public License is contained in the file COPYING. -*/ - -#if defined(VGA_ppc64) - -#include "pub_tool_basics.h" -#include "pub_tool_libcbase.h" -#include "pub_tool_libcassert.h" -#include "pub_tool_libcprint.h" - -#include "cg_arch.h" - -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc, - Bool all_caches_clo_defined) -{ - // Set caches to default. 
- *I1c = (cache_t) { 65536, 2, 64 }; - *D1c = (cache_t) { 65536, 2, 64 }; - *LLc = (cache_t) { 262144, 8, 64 }; - - // Warn if config not completely specified from cmd line. Note that - // this message is slightly different from the one we give on x86/AMD64 - // when auto-detection fails; this lets us filter out this one (which is - // not important) in the regression test suite without filtering the - // x86/AMD64 one (which we want to see if it ever occurs in the - // regression test suite). - // - // If you change this message, please update - // cachegrind/tests/filter_stderr! - // - if (!all_caches_clo_defined) { - VG_(dmsg)("Warning: Cannot auto-detect cache config on PPC64, using one " - "or more defaults\n"); - } -} - -#endif - -/*--------------------------------------------------------------------*/ -/*--- end ---*/ -/*--------------------------------------------------------------------*/ diff --git a/cachegrind/cg-s390x.c b/cachegrind/cg-s390x.c deleted file mode 100644 index 3165efd005..0000000000 --- a/cachegrind/cg-s390x.c +++ /dev/null @@ -1,130 +0,0 @@ -/* -*- mode: C; c-basic-offset: 3; -*- */ - -/*--------------------------------------------------------------------*/ -/*--- s390x-specific definitions. cg-s390x.c ---*/ -/*--------------------------------------------------------------------*/ - -/* - This file is part of Cachegrind, a Valgrind tool for cache - profiling programs. - - Copyright IBM Corp. 2010-2012 - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307, USA. - - The GNU General Public License is contained in the file COPYING. -*/ - -/* Contributed by Christian Borntraeger */ - -#if defined(VGA_s390x) - -#include "pub_tool_basics.h" -#include "pub_tool_libcbase.h" -#include "pub_tool_libcassert.h" -#include "pub_tool_libcprint.h" - -#include "cg_arch.h" - -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc, - Bool all_caches_clo_defined) -{ - // z900 - // - // Source: - // The microarchitecture of the IBM eServer z900 processor - // IBM Journal of Research and Development - // Volume 46, Number 4/5, pp 381-395, July/September 2002 - // - // Split L1 I/D cache - // Size: 256 kB each - // Line size: 256 bytes - // 4-way set associative - // L2 cache: 16 MB x 2 (16 MB per 10 CPs) (Charles Webb) - - // z800 - // - // Source: Charles Webb from IBM - // - // Split L1 I/D cache - // Size: 256 kB each - // Line size: 256 bytes - // 4-way set associative - // L2 cache: 16 MB (or half that size) - - // z990 - // - // The IBM eServer z990 microprocessor - // IBM Journal of Research and Development - // Volume 48, Number 3/4, pp 295-309, May/July 2004 - // - // Split L1 I/D cache - // Size: 256 kB each - // Line size: 256 bytes - // 4-way set associative - // L2 cache: 32 MB x 4 (32 MB per book/node) (Charles Webb) - - // z890 - // - // Source: Charles Webb from IBM - // - // Split L1 I/D cache - // Size: 256 kB each - // Line size: 256 bytes - // 4-way set associative - // L2 cache: 32 MB (or half that size) - - // z9 - // - // Source: Charles Webb from IBM - // - // Split L1 I/D cache - // Size: 256 kB each - // Line size: 256 bytes - // 4-way set associative - // L2 cache: 40 MB x 4 (40 MB per book/node) - - - // Set caches to z10 default. 
- // See IBM Journal of Research and Development - // Issue Date: Jan. 2009 - // Volume: 53 Issue:1 - // fixs390: have a table for all available models and check /proc/cpuinfo - *I1c = (cache_t) { 65536, 4, 256 }; - *D1c = (cache_t) { 131072, 8, 256 }; - *LLc = (cache_t) {50331648, 24, 256 }; - - // Warn if config not completely specified from cmd line. Note that - // this message is slightly different from the one we give on x86/AMD64 - // when auto-detection fails; this lets us filter out this one (which is - // not important) in the regression test suite without filtering the - // x86/AMD64 one (which we want to see if it ever occurs in the - // regression test suite). - // - // If you change this message, please update - // cachegrind/tests/filter_stderr! - // - if (!all_caches_clo_defined) { - VG_(dmsg)("Warning: Cannot auto-detect cache config, " - "assuming z10-EC cache configuration\n"); - } -} - -#endif - -/*--------------------------------------------------------------------*/ -/*--- end cg-s390x.c ---*/ -/*--------------------------------------------------------------------*/ diff --git a/cachegrind/cg_arch.h b/cachegrind/cg_arch.h index 99d0cb1d33..d35ec73bd9 100644 --- a/cachegrind/cg_arch.h +++ b/cachegrind/cg_arch.h @@ -44,11 +44,6 @@ typedef struct { // initialized to UNDEFINED_CACHE. #define UNDEFINED_CACHE { -1, -1, -1 } -// Gives the auto-detected configuration of I1, D1 and LL caches. They get -// overridden by any cache configurations specified on the command line. -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc, - Bool all_caches_clo_defined); - // If arg is a command line option configuring I1 or D1 or LL cache, // then parses arg to set the relevant cache_t elements. // Returns True if arg is a cache command line option, False otherwise. 
diff --git a/cachegrind/tests/filter_stderr b/cachegrind/tests/filter_stderr index 0eeb91ea30..8b9dd78eaf 100755 --- a/cachegrind/tests/filter_stderr +++ b/cachegrind/tests/filter_stderr @@ -17,8 +17,6 @@ perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' | sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" | sed "/Simulating a 16 KB I-cache with 32 B lines/d" | sed "/warning: L3 cache found, using its data for the LL simulation./d" | -sed "/Warning: Cannot auto-detect cache config on PPC.., using one or more defaults/d" | -sed "/Warning: Cannot auto-detect cache config on ARM, using one or more defaults/d" | -sed "/Warning: Cannot auto-detect cache config, assuming z10-EC cache configuration/d" | -sed "/Warning: Cannot auto-detect cache config on MIPS.., using one or more defaults/d" | +sed "/Warning: Cannot auto-detect cache config, using defaults./d" | +sed "/Run with -v to see./d" | sed "/warning: pretending that LL cache has associativity .*$/d" diff --git a/callgrind/Makefile.am b/callgrind/Makefile.am index ae4ff4fc69..343f03470e 100644 --- a/callgrind/Makefile.am +++ b/callgrind/Makefile.am @@ -45,13 +45,7 @@ CALLGRIND_SOURCES_COMMON = \ main.c \ sim.c \ threads.c \ - ../cachegrind/cg-arch.c \ - ../cachegrind/cg-x86-amd64.c \ - ../cachegrind/cg-ppc32.c \ - ../cachegrind/cg-ppc64.c \ - ../cachegrind/cg-arm.c \ - ../cachegrind/cg-s390x.c \ - ../cachegrind/cg-mips32.c + ../cachegrind/cg-arch.c CALLGRIND_CFLAGS_COMMON = -I$(top_srcdir)/cachegrind @@ -94,4 +88,3 @@ callgrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LINK = \ $(callgrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS) \ $(callgrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS) endif - diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr index 0842a93661..3f6f448627 100755 --- a/callgrind/tests/filter_stderr +++ b/callgrind/tests/filter_stderr @@ -26,8 +26,6 @@ perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' | sed "/warning: 
Pentium 4 with 12 KB micro-op instruction trace cache/d" | sed "/Simulating a 16 KB I-cache with 32 B lines/d" | sed "/warning: L3 cache found, using its data for the LL simulation./d" | -sed "/Warning: Cannot auto-detect cache config on PPC.., using one or more defaults/d" | -sed "/Warning: Cannot auto-detect cache config on ARM, using one or more defaults/d" | -sed "/Warning: Cannot auto-detect cache config, assuming z10-EC cache configuration/d" | -sed "/Warning: Cannot auto-detect cache config on MIPS.., using one or more defaults/d" | +sed "/Warning: Cannot auto-detect cache config, using defaults./d" | +sed "/Run with -v to see./d" | sed "/warning: pretending that LL cache has associativity .*$/d" diff --git a/coregrind/Makefile.am b/coregrind/Makefile.am index 19fecf8b98..d4eb8167f8 100644 --- a/coregrind/Makefile.am +++ b/coregrind/Makefile.am @@ -240,6 +240,7 @@ pkglib_LIBRARIES += libcoregrind-@VGCONF_ARCH_SEC@-@VGCONF_OS@.a endif COREGRIND_SOURCES_COMMON = \ + m_cache.c \ m_commandline.c \ m_clientstate.c \ m_cpuid.S \ diff --git a/cachegrind/cg-x86-amd64.c b/coregrind/m_cache.c similarity index 71% rename from cachegrind/cg-x86-amd64.c rename to coregrind/m_cache.c index 1eb6c99634..3d3860e4f4 100644 --- a/cachegrind/cg-x86-amd64.c +++ b/coregrind/m_cache.c @@ -1,11 +1,12 @@ +/* -*- mode: C; c-basic-offset: 3; -*- */ /*--------------------------------------------------------------------*/ -/*--- x86- and AMD64-specific definitions. cg-x86-amd64.c ---*/ +/*--- Cache-related stuff. m_cache.c ---*/ /*--------------------------------------------------------------------*/ /* - This file is part of Cachegrind, a Valgrind tool for cache - profiling programs. + This file is part of Valgrind, a dynamic binary instrumentation + framework. Copyright (C) 2002-2012 Nicholas Nethercote njn@valgrind.org @@ -28,38 +29,51 @@ The GNU General Public License is contained in the file COPYING. 
*/ -#if defined(VGA_x86) || defined(VGA_amd64) +#include "pub_core_basics.h" +#include "pub_core_libcbase.h" +#include "pub_core_libcassert.h" +#include "pub_core_libcprint.h" +#include "pub_core_mallocfree.h" +#include "pub_core_machine.h" +#include "libvex.h" -#include "pub_tool_basics.h" -#include "pub_tool_cpuid.h" -#include "pub_tool_libcbase.h" -#include "pub_tool_libcassert.h" -#include "pub_tool_libcprint.h" +#if defined(VGA_x86) || defined(VGA_amd64) -#include "cg_arch.h" +#include "pub_core_cpuid.h" // All CPUID info taken from sandpile.org/ia32/cpuid.htm */ // Probably only works for Intel and AMD chips, and probably only for some of -// them. +// them. -static void micro_ops_warn(Int actual_size, Int used_size, Int line_size) +static void +micro_ops_warn(Int actual_size, Int used_size, Int line_size) { - VG_(dmsg)("warning: Pentium 4 with %d KB micro-op instruction trace cache\n", + VG_(dmsg)("warning: Pentium 4 with %d KB micro-op instruction trace cache\n", actual_size); - VG_(dmsg)(" Simulating a %d KB I-cache with %d B lines\n", + VG_(dmsg)(" Simulating a %d KB I-cache with %d B lines\n", used_size, line_size); } +/* FIXME: Temporarily introduce cachegrind's cache_t structure here to + get Intel_cache_info to work. This function needs to be rewritten to + properly fill in VexCacheInfo. Absolutely no warnings about ignored + caches and such are appropriate here! */ +typedef struct { + Int size; // bytes + Int assoc; + Int line_size; // bytes +} cache_t; + /* Intel method is truly wretched. We have to do an insane indexing into an * array of pre-defined configurations for various parts of the memory * hierarchy. * According to Intel Processor Identification, App Note 485. - * + * * If a L3 cache is found, then data for it rather than the L2 * is returned via *LLc. 
*/ -static -Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) +static Int +Intel_cache_info_aux(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) { Int cpuid1_eax; Int cpuid1_ignore; @@ -88,7 +102,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) family = (((cpuid1_eax >> 20) & 0xff) << 4) + ((cpuid1_eax >> 8) & 0xf); model = (((cpuid1_eax >> 16) & 0xf) << 4) + ((cpuid1_eax >> 4) & 0xf); - VG_(cpuid)(2, 0, (Int*)&info[0], (Int*)&info[4], + VG_(cpuid)(2, 0, (Int*)&info[0], (Int*)&info[4], (Int*)&info[8], (Int*)&info[12]); trials = info[0] - 1; /* AL register - bits 0..7 of %eax */ info[0] = 0x0; /* reset AL */ @@ -105,7 +119,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) case 0x0: /* ignore zeros */ break; - + /* TLB info, ignore */ case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x0b: @@ -116,7 +130,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) case 0xb0: case 0xb1: case 0xb2: case 0xb3: case 0xb4: case 0xba: case 0xc0: case 0xca: - break; + break; case 0x06: *I1c = (cache_t) { 8, 4, 32 }; break; case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break; @@ -130,10 +144,10 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break; /* IA-64 info -- panic! */ - case 0x10: case 0x15: case 0x1a: + case 0x10: case 0x15: case 0x1a: case 0x88: case 0x89: case 0x8a: case 0x8d: case 0x90: case 0x96: case 0x9b: - VG_(tool_panic)("IA-64 cache detected?!"); + VG_(core_panic)("IA-64 cache detected?!"); /* L3 cache info. */ case 0x22: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break; @@ -169,7 +183,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) case 0x39: *LLc = (cache_t) { 128, 4, 64 }; L2_found = True; break; case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; L2_found = True; break; - /* If a P6 core, this means "no L2 cache". 
+ /* If a P6 core, this means "no L2 cache". If a P4 core, this means "no L3 cache". We don't know what core it is, so don't issue a warning. To detect a missing L2 cache, we use 'L2_found'. */ @@ -201,20 +215,20 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based. * conversion to byte size is a total guess; treat the 12K and 16K * cases the same since the cache byte size must be a power of two for - * everything to work!. Also guessing 32 bytes for the line size... + * everything to work!. Also guessing 32 bytes for the line size... */ case 0x70: /* 12K micro-ops, 8-way */ - *I1c = (cache_t) { 16, 8, 32 }; + *I1c = (cache_t) { 16, 8, 32 }; micro_ops_warn(12, 16, 32); - break; + break; case 0x71: /* 16K micro-ops, 8-way */ - *I1c = (cache_t) { 16, 8, 32 }; - micro_ops_warn(16, 16, 32); - break; + *I1c = (cache_t) { 16, 8, 32 }; + micro_ops_warn(16, 16, 32); + break; case 0x72: /* 32K micro-ops, 8-way */ - *I1c = (cache_t) { 32, 8, 32 }; - micro_ops_warn(32, 32, 32); - break; + *I1c = (cache_t) { 32, 8, 32 }; + micro_ops_warn(32, 32, 32); + break; /* not sectored, whatever that might mean */ case 0x78: *LLc = (cache_t) { 1024, 4, 64 }; L2_found = True; break; @@ -242,7 +256,7 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) case 0xff: j = 0; - VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4], + VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4], (Int*)&info[8], (Int*)&info[12]); while ((info[0] & 0x1f) != 0) { @@ -264,25 +278,33 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) case 1: *D1c = c; break; case 2: *I1c = c; break; case 3: VG_(dmsg)("warning: L1 unified cache ignored\n"); break; - default: VG_(dmsg)("warning: L1 cache of unknown type ignored\n"); break; + default: + VG_(dmsg)("warning: L1 cache of unknown type ignored\n"); + break; } break; case 2: switch (info[0] & 0x1f) { case 1: VG_(dmsg)("warning: L2 
data cache ignored\n"); break; - case 2: VG_(dmsg)("warning: L2 instruction cache ignored\n"); break; + case 2: VG_(dmsg)("warning: L2 instruction cache ignored\n"); + break; case 3: *LLc = c; L2_found = True; break; - default: VG_(dmsg)("warning: L2 cache of unknown type ignored\n"); break; + default: + VG_(dmsg)("warning: L2 cache of unknown type ignored\n"); + break; } break; case 3: switch (info[0] & 0x1f) { case 1: VG_(dmsg)("warning: L3 data cache ignored\n"); break; - case 2: VG_(dmsg)("warning: L3 instruction cache ignored\n"); break; + case 2: VG_(dmsg)("warning: L3 instruction cache ignored\n"); + break; case 3: L3c = c; L3_found = True; break; - default: VG_(dmsg)("warning: L3 cache of unknown type ignored\n"); break; + default: + VG_(dmsg)("warning: L3 cache of unknown type ignored\n"); + break; } break; default: @@ -290,21 +312,26 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) break; } - VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4], + VG_(cpuid)(4, j++, (Int*)&info[0], (Int*)&info[4], (Int*)&info[8], (Int*)&info[12]); } break; default: - VG_(dmsg)("warning: Unknown Intel cache config value (0x%x), ignoring\n", - info[i]); + VG_(dmsg)("warning: Unknown Intel cache config value (0x%x), " + "ignoring\n", info[i]); break; } } - /* If we found a L3 cache, throw away the L2 data and use the L3's instead. */ + /* If we found a L3 cache, throw away the L2 data and use the L3's + instead. 
*/ if (L3_found) { - VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n"); + /* Can't warn here: as we're not necessarily in cachegrind */ +#if 0 + VG_(dmsg)("warning: L3 cache found, using its data for the " + "LL simulation.\n"); +#endif *LLc = L3c; L2_found = True; } @@ -315,35 +342,63 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc) return 0; } +static Int +Intel_cache_info(Int level, VexCacheInfo *ci) +{ + cache_t I1c, D1c, LLc; + Int ret; + + ret = Intel_cache_info_aux(level, &I1c, &D1c, &LLc); + + /* Map results to VexCacheInfo. This is lossy as we simply assume + there is an L2 here (where in fact it could have been an L3). It + is irrelevant for current usages but needs to be fixed! */ + if (ret == 0) { + ci->num_levels = 2; + ci->num_caches = 3; + ci->icaches_maintain_coherence = True; + ci->caches = VG_(malloc)("m_cache", ci->num_caches * sizeof *ci->caches); + + ci->caches[0] = VEX_CACHE_INIT(DATA_CACHE, 1, D1c.size, D1c.line_size, + D1c.assoc); + ci->caches[1] = VEX_CACHE_INIT(INSN_CACHE, 1, I1c.size, I1c.line_size, + I1c.assoc); + ci->caches[2] = VEX_CACHE_INIT(UNIFIED_CACHE, 2, LLc.size, LLc.line_size, + LLc.assoc); + } + return ret; +} + /* AMD method is straightforward, just extract appropriate bits from the * result registers. * * Bits, for D1 and I1: - * 31..24 data L1 cache size in KBs - * 23..16 data L1 cache associativity (FFh=full) - * 15.. 8 data L1 cache lines per tag + * 31..24 data L1 cache size in KBs + * 23..16 data L1 cache associativity (FFh=full) + * 15.. 8 data L1 cache lines per tag * 7.. 0 data L1 cache line size in bytes * * Bits, for L2: * 31..16 unified L2 cache size in KBs * 15..12 unified L2 cache associativity (0=off, FFh=full) - * 11.. 8 unified L2 cache lines per tag + * 11.. 8 unified L2 cache lines per tag * 7.. 
0 unified L2 cache line size in bytes * - * #3 The AMD K7 processor's L2 cache must be configured prior to relying + * #3 The AMD K7 processor's L2 cache must be configured prior to relying * upon this information. (Whatever that means -- njn) * * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB), * so we detect that. - * + * * Returns 0 on success, non-zero on failure. As with the Intel code * above, if a L3 cache is found, then data for it rather than the L2 * is returned via *LLc. */ /* A small helper */ -static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 ) +static Int +decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 ) { /* Decode a L2/L3 associativity indication. It is encoded differently from the I1/D1 associativity. Returns 1 @@ -362,17 +417,18 @@ static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 ) } } -static -Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc) +static Int +AMD_cache_info(VexCacheInfo *ci) { UInt ext_level; UInt dummy, model; UInt I1i, D1i, L2i, L3i; - + UInt size, line_size, assoc; + VG_(cpuid)(0x80000000, 0, &ext_level, &dummy, &dummy, &dummy); if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) { - VG_(dmsg)("warning: ext_level < 0x80000006 for AMD processor (0x%x)\n", + VG_(dmsg)("warning: ext_level < 0x80000006 for AMD processor (0x%x)\n", ext_level); return -1; } @@ -384,40 +440,60 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc) /* Check for Duron bug */ if (model == 0x630) { - VG_(dmsg)("warning: Buggy Duron stepping A0. Assuming L2 size=65536 bytes\n"); + VG_(dmsg)("warning: Buggy Duron stepping A0. 
" + "Assuming L2 size=65536 bytes\n"); L2i = (64 << 16) | (L2i & 0xffff); } - D1c->size = (D1i >> 24) & 0xff; - D1c->assoc = (D1i >> 16) & 0xff; - D1c->line_size = (D1i >> 0) & 0xff; + ci->num_levels = 2; + ci->num_caches = 3; + ci->icaches_maintain_coherence = True; + + /* Check for L3 cache */ + if (((L3i >> 18) & 0x3fff) > 0) { + ci->num_levels = 3; + ci->num_caches = 4; + } + + ci->caches = VG_(malloc)("m_cache", ci->num_caches * sizeof *ci->caches); + + // D1 + size = (D1i >> 24) & 0xff; + assoc = (D1i >> 16) & 0xff; + line_size = (D1i >> 0) & 0xff; + ci->caches[0] = VEX_CACHE_INIT(DATA_CACHE, 1, size, line_size, assoc); - I1c->size = (I1i >> 24) & 0xff; - I1c->assoc = (I1i >> 16) & 0xff; - I1c->line_size = (I1i >> 0) & 0xff; + // I1 + size = (I1i >> 24) & 0xff; + assoc = (I1i >> 16) & 0xff; + line_size = (I1i >> 0) & 0xff; + ci->caches[1] = VEX_CACHE_INIT(INSN_CACHE, 1, size, line_size, assoc); - LLc->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */ - LLc->assoc = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf); - LLc->line_size = (L2i >> 0) & 0xff; + // L2 Nb: different bits used for L2 + size = (L2i >> 16) & 0xffff; + assoc = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf); + line_size = (L2i >> 0) & 0xff; + ci->caches[2] = VEX_CACHE_INIT(INSN_CACHE, 2, size, line_size, assoc); + // L3, if any if (((L3i >> 18) & 0x3fff) > 0) { - /* There's an L3 cache. Replace *LLc contents with this info. */ + /* There's an L3 cache. */ /* NB: the test in the if is "if L3 size > 0 ". I don't know if this is the right way to test presence-vs-absence of L3. I can't see any guidance on this in the AMD documentation. 
*/ - LLc->size = ((L3i >> 18) & 0x3fff) * 512; - LLc->assoc = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf); - LLc->line_size = (L3i >> 0) & 0xff; - VG_(dmsg)("warning: L3 cache found, using its data for the L2 simulation.\n"); + size = ((L3i >> 18) & 0x3fff) * 512; + assoc = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf); + line_size = (L3i >> 0) & 0xff; + ci->caches[3] = VEX_CACHE_INIT(INSN_CACHE, 3, size, line_size, assoc); } return 0; } -static -Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc) +static Int +get_caches_from_CPUID(VexCacheInfo *ci) { - Int level, ret; + Int level, ret, i; Char vendor_id[13]; if (!VG_(has_cpuid)()) { @@ -425,8 +501,8 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc) return -1; } - VG_(cpuid)(0, 0, &level, (int*)&vendor_id[0], - (int*)&vendor_id[8], (int*)&vendor_id[4]); + VG_(cpuid)(0, 0, &level, (int*)&vendor_id[0], + (int*)&vendor_id[8], (int*)&vendor_id[4]); vendor_id[12] = '\0'; if (0 == level) { @@ -436,22 +512,21 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc) /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */ if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) { - ret = Intel_cache_info(level, I1c, D1c, LLc); + ret = Intel_cache_info(level, ci); } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) { - ret = AMD_cache_info(I1c, D1c, LLc); + ret = AMD_cache_info(ci); } else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) { /* Total kludge. Pretend to be a VIA Nehemiah. 
*/ - D1c->size = 64; - D1c->assoc = 16; - D1c->line_size = 16; - I1c->size = 64; - I1c->assoc = 4; - I1c->line_size = 16; - LLc->size = 64; - LLc->assoc = 16; - LLc->line_size = 16; + ci->num_levels = 2; + ci->num_caches = 3; + ci->icaches_maintain_coherence = True; + ci->caches = VG_(malloc)("m_cache", ci->num_caches * sizeof *ci->caches); + ci->caches[0] = VEX_CACHE_INIT(DATA_CACHE, 1, 64, 16, 16); + ci->caches[1] = VEX_CACHE_INIT(INSN_CACHE, 1, 64, 16, 4); + ci->caches[2] = VEX_CACHE_INIT(UNIFIED_CACHE, 2, 64, 16, 16); + ret = 0; } else { @@ -460,88 +535,47 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc) } /* Successful! Convert sizes from KB to bytes */ - I1c->size *= 1024; - D1c->size *= 1024; - LLc->size *= 1024; - - /* If the LL cache config isn't something the simulation functions - can handle, try to adjust it so it is. Caches are characterised - by (total size T, line size L, associativity A), and then we - have - - number of sets S = T / (L * A) - - The required constraints are: - - * L must be a power of 2, but it always is in practice, so - no problem there - - * A can be any value >= 1 - - * T can be any value, but .. - - * S must be a power of 2. - - That sometimes gives a problem. For example, some Core iX based - Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288 - sets. The "fix" in this case is to increase the associativity - by 50% to 24, which reduces the number of sets to 8192, making - it a power of 2. That's what the following code does (handing - the "3/2 rescaling case".) We might need to deal with other - ratios later (5/4 ?). - - The "fix" is "justified" (cough, cough) by alleging that - increases of associativity above about 4 have very little effect - on the actual miss rate. It would be far more inaccurate to - fudge this by changing the size of the simulated cache -- - changing the associativity is a much better option. 
- */ - if (LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0) { - Long nSets = (Long)LLc->size / (Long)(LLc->line_size * LLc->assoc); - if (/* stay sane */ - nSets >= 4 - /* nSets is not a power of 2 */ - && VG_(log2_64)( (ULong)nSets ) == -1 - /* nSets is 50% above a power of 2 */ - && VG_(log2_64)( (ULong)((2 * nSets) / (Long)3) ) != -1 - /* associativity can be increased by exactly 50% */ - && (LLc->assoc % 2) == 0 - ) { - /* # sets is 1.5 * a power of two, but the associativity is - even, so we can increase that up by 50% and implicitly - scale the # sets down accordingly. */ - Int new_assoc = LLc->assoc + (LLc->assoc / 2); - VG_(dmsg)("warning: pretending that LL cache has associativity" - " %d instead of actual %d\n", new_assoc, LLc->assoc); - LLc->assoc = new_assoc; - } + for (i = 0; i < ci->num_caches; ++i) { + ci->caches[i].sizeB *= 1024; } return ret; } +Bool +VG_(machine_get_cache_info)(VexArchInfo *vai) +{ + Int ret = get_caches_from_CPUID(&vai->hwcache_info); + + return ret == 0 ? True : False; +} + +#elif defined(VGA_arm) || defined(VGA_ppc32) || defined(VGA_ppc64) || \ + defined(VGA_mips32) -void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc, - Bool all_caches_clo_defined) +Bool +VG_(machine_get_cache_info)(VexArchInfo *vai) { - Int res; - - // Set caches to default. - *I1c = (cache_t) { 65536, 2, 64 }; - *D1c = (cache_t) { 65536, 2, 64 }; - *LLc = (cache_t) { 262144, 8, 64 }; - - // Then replace with any info we can get from CPUID. - res = get_caches_from_CPUID(I1c, D1c, LLc); - - // Warn if CPUID failed and config not completely specified from cmd line. 
- if (res != 0 && !all_caches_clo_defined) { - VG_(dmsg)("Warning: Couldn't auto-detect cache config, using one " - "or more defaults \n"); - } + vai->hwcache_info.icaches_maintain_coherence = False; + + return False; // not yet } -#endif // defined(VGA_x86) || defined(VGA_amd64) +#elif defined(VGA_s390x) + +Bool +VG_(machine_get_cache_info)(VexArchInfo *vai) +{ + vai->hwcache_info.icaches_maintain_coherence = True; + + return False; // not yet +} + +#else + +#error "Unknown arch" + +#endif /*--------------------------------------------------------------------*/ /*--- end ---*/ diff --git a/coregrind/m_libcproc.c b/coregrind/m_libcproc.c index 136426b4d2..f0a7d7f3e2 100644 --- a/coregrind/m_libcproc.c +++ b/coregrind/m_libcproc.c @@ -724,6 +724,15 @@ void VG_(do_atfork_child)(ThreadId tid) void VG_(invalidate_icache) ( void *ptr, SizeT nbytes ) { + if (nbytes == 0) return; // nothing to do + + // Get cache info + VexArchInfo vai; + VG_(machine_get_VexArchInfo)(NULL, &vai); + + // If I-caches are coherent, nothing needs to be done here + if (vai.hwcache_info.icaches_maintain_coherence) return; + # if defined(VGA_ppc32) || defined(VGA_ppc64) Addr startaddr = (Addr) ptr; Addr endaddr = startaddr + nbytes; @@ -731,9 +740,6 @@ void VG_(invalidate_icache) ( void *ptr, SizeT nbytes ) Addr addr; VexArchInfo vai; - if (nbytes == 0) return; - vg_assert(nbytes > 0); - VG_(machine_get_VexArchInfo)( NULL, &vai ); cls = vai.ppc_cache_line_szB; @@ -750,15 +756,6 @@ void VG_(invalidate_icache) ( void *ptr, SizeT nbytes ) } __asm__ __volatile__("sync; isync"); -# elif defined(VGA_x86) - /* no need to do anything, hardware provides coherence */ - -# elif defined(VGA_amd64) - /* no need to do anything, hardware provides coherence */ - -# elif defined(VGA_s390x) - /* no need to do anything, hardware provides coherence */ - # elif defined(VGP_arm_linux) /* ARM cache flushes are privileged, so we must defer to the kernel. 
*/ Addr startaddr = (Addr) ptr; @@ -770,8 +767,6 @@ void VG_(invalidate_icache) ( void *ptr, SizeT nbytes ) (UWord) nbytes, (UWord) 3); vg_assert( sres._isError == 0 ); -# else -# error "Unknown ARCH" # endif } diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c index f2d90a211e..9b4dabc1ae 100644 --- a/coregrind/m_machine.c +++ b/coregrind/m_machine.c @@ -674,7 +674,8 @@ static UInt VG_(get_machine_model)(void) /* Determine what insn set and insn set variant the host has, and record it. To be called once at system startup. Returns False if - this a CPU incapable of running Valgrind. */ + this a CPU incapable of running Valgrind. + Also determine information about the caches on this host. */ Bool VG_(machine_get_hwcaps)( void ) { @@ -731,26 +732,23 @@ Bool VG_(machine_get_hwcaps)( void ) have_lzcnt = (ecx & (1<<5)) != 0; /* True => have LZCNT */ } + va = VexArchX86; if (have_sse2 && have_sse1) { - va = VexArchX86; vai.hwcaps = VEX_HWCAPS_X86_SSE1; vai.hwcaps |= VEX_HWCAPS_X86_SSE2; if (have_lzcnt) vai.hwcaps |= VEX_HWCAPS_X86_LZCNT; VG_(machine_x86_have_mxcsr) = 1; - return True; - } - - if (have_sse1) { - va = VexArchX86; + } else if (have_sse1) { vai.hwcaps = VEX_HWCAPS_X86_SSE1; VG_(machine_x86_have_mxcsr) = 1; - return True; + } else { + vai.hwcaps = 0; /*baseline - no sse at all*/ + VG_(machine_x86_have_mxcsr) = 0; } - va = VexArchX86; - vai.hwcaps = 0; /*baseline - no sse at all*/ - VG_(machine_x86_have_mxcsr) = 0; + VG_(machine_get_cache_info)(&vai); + return True; } @@ -836,6 +834,9 @@ Bool VG_(machine_get_hwcaps)( void ) | (have_cx16 ? VEX_HWCAPS_AMD64_CX16 : 0) | (have_lzcnt ? VEX_HWCAPS_AMD64_LZCNT : 0) | (have_avx ? 
VEX_HWCAPS_AMD64_AVX : 0); + + VG_(machine_get_cache_info)(&vai); + return True; } @@ -980,6 +981,7 @@ Bool VG_(machine_get_hwcaps)( void ) if (have_VX) vai.hwcaps |= VEX_HWCAPS_PPC32_VX; if (have_DFP) vai.hwcaps |= VEX_HWCAPS_PPC32_DFP; + VG_(machine_get_cache_info)(&vai); /* But we're not done yet: VG_(machine_ppc32_set_clszB) must be called before we're ready to go. */ @@ -1105,6 +1107,8 @@ Bool VG_(machine_get_hwcaps)( void ) if (have_VX) vai.hwcaps |= VEX_HWCAPS_PPC64_VX; if (have_DFP) vai.hwcaps |= VEX_HWCAPS_PPC64_DFP; + VG_(machine_get_cache_info)(&vai); + /* But we're not done yet: VG_(machine_ppc64_set_clszB) must be called before we're ready to go. */ return True; @@ -1251,6 +1255,8 @@ Bool VG_(machine_get_hwcaps)( void ) VG_(debugLog)(1, "machine", "hwcaps = 0x%x\n", vai.hwcaps); + VG_(machine_get_cache_info)(&vai); + return True; } @@ -1360,6 +1366,8 @@ Bool VG_(machine_get_hwcaps)( void ) if (have_VFP) vai.hwcaps |= VEX_HWCAPS_ARM_VFP; if (have_NEON) vai.hwcaps |= VEX_HWCAPS_ARM_NEON; + VG_(machine_get_cache_info)(&vai); + return True; } @@ -1371,6 +1379,9 @@ Bool VG_(machine_get_hwcaps)( void ) return False; vai.hwcaps = model; + + VG_(machine_get_cache_info)(&vai); + return True; } diff --git a/coregrind/pub_core_cpuid.h b/coregrind/pub_core_cpuid.h index 7f1d75035f..269ae57df8 100644 --- a/coregrind/pub_core_cpuid.h +++ b/coregrind/pub_core_cpuid.h @@ -36,7 +36,13 @@ // CPUID instruction. 
//-------------------------------------------------------------------- -#include "pub_tool_cpuid.h" +#if defined(VGA_x86) || defined(VGA_amd64) +extern Bool VG_(has_cpuid) ( void ); + +extern void VG_(cpuid) ( UInt eax, UInt ecx, + UInt* eax_ret, UInt* ebx_ret, + UInt* ecx_ret, UInt* edx_ret ); +#endif #endif // __PUB_CORE_CPUID_H diff --git a/coregrind/pub_core_machine.h b/coregrind/pub_core_machine.h index ded9b9af48..11e590a8cf 100644 --- a/coregrind/pub_core_machine.h +++ b/coregrind/pub_core_machine.h @@ -199,9 +199,9 @@ void VG_(get_UnwindStartRegs) ( /*OUT*/UnwindStartRegs* regs, this a CPU incapable of running Valgrind. */ extern Bool VG_(machine_get_hwcaps)( void ); -/* Fetch host cpu info, as per above comment. */ -extern void VG_(machine_get_VexArchInfo)( /*OUT*/VexArch*, - /*OUT*/VexArchInfo* ); +/* Determine information about the cache system this host has and + record it. Returns False, if cache information cannot be auto-detected. */ +extern Bool VG_(machine_get_cache_info)( VexArchInfo * ); /* Notify host cpu cache line size, as per above comment. */ #if defined(VGA_ppc32) diff --git a/include/Makefile.am b/include/Makefile.am index a115754a65..41defb8ceb 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -8,7 +8,6 @@ nobase_pkginclude_HEADERS = \ pub_tool_aspacemgr.h \ pub_tool_clientstate.h \ pub_tool_clreq.h \ - pub_tool_cpuid.h \ pub_tool_debuginfo.h \ pub_tool_errormgr.h \ pub_tool_execontext.h \ diff --git a/include/pub_tool_cpuid.h b/include/pub_tool_cpuid.h deleted file mode 100644 index 149131a024..0000000000 --- a/include/pub_tool_cpuid.h +++ /dev/null @@ -1,46 +0,0 @@ - -/*--------------------------------------------------------------------*/ -/*--- Interface to CPUID. pub_tool_cpuid.h ---*/ -/*--------------------------------------------------------------------*/ - -/* - This file is part of Valgrind, a dynamic binary instrumentation - framework. 
- - Copyright (C) 2000-2012 Julian Seward - jseward@acm.org - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307, USA. - - The GNU General Public License is contained in the file COPYING. -*/ - -#ifndef __PUB_TOOL_CPUID_H -#define __PUB_TOOL_CPUID_H - -#if defined(VGA_x86) || defined(VGA_amd64) -extern Bool VG_(has_cpuid) ( void ); - -extern void VG_(cpuid) ( UInt eax, UInt ecx, - UInt* eax_ret, UInt* ebx_ret, - UInt* ecx_ret, UInt* edx_ret ); -#endif - -#endif // __PUB_TOOL_CPUID_H - -/*--------------------------------------------------------------------*/ -/*--- end ---*/ -/*--------------------------------------------------------------------*/ diff --git a/include/pub_tool_machine.h b/include/pub_tool_machine.h index 026db6b574..08ab203a20 100644 --- a/include/pub_tool_machine.h +++ b/include/pub_tool_machine.h @@ -31,6 +31,8 @@ #ifndef __PUB_TOOL_MACHINE_H #define __PUB_TOOL_MACHINE_H +#include "libvex.h" // VexArchInfo + #if defined(VGP_x86_linux) # define VG_MIN_INSTR_SZB 1 // min length of native instruction # define VG_MAX_INSTR_SZB 16 // max length of native instruction @@ -164,6 +166,10 @@ extern void* VG_(fnptr_to_fnentry)( void* ); (eg, AVX or non-AVX ?, for amd64). */ extern Int VG_(machine_get_size_of_largest_guest_register) ( void ); +/* Return host cpu info. 
*/ +extern void VG_(machine_get_VexArchInfo)( /*OUT*/VexArch*, + /*OUT*/VexArchInfo* ); + #endif // __PUB_TOOL_MACHINE_H /*--------------------------------------------------------------------*/