From: Philippe Waroquiers Date: Fri, 18 Oct 2013 00:08:20 +0000 (+0000) Subject: Allow the user to dimension the translation cache X-Git-Tag: svn/VALGRIND_3_9_0~39 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=61b8b9617b0c3646325085739479349aec262ac4;p=thirdparty%2Fvalgrind.git Allow the user to dimension the translation cache A previous commit had decreased to 6 (on android) and increased to 16 (other platforms) the nr of sectors in the translation cache. This patch adds a command line option to let the user specify the nr of sectors as e.g. 16 sectors might be a lot and cause an out-of-memory condition for some workloads, or might be too small for huge executables or executables using a lot of shared libs. git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13652 --- diff --git a/NEWS b/NEWS index dccbefe32d..a31481ebb2 100644 --- a/NEWS +++ b/NEWS @@ -40,6 +40,13 @@ Release 3.9.0 (?? ?????? 201?) * ==================== OTHER CHANGES ==================== + - The default nr of sectors in the translation cache has been + decreased to 6 on android platforms, and increased to 16 + on all other platforms. A sector (lazily allocated) uses several + MB depending on the tool (about 40MB for memcheck). + The option --num-transtab-sectors allows the user to specify how + many sectors Valgrind can allocate. + - Option --merge-recursive-frames= tells Valgrind to detect and merge (collapse) recursive calls when recording stack traces.
When your program has recursive algorithms, this limits diff --git a/coregrind/m_main.c b/coregrind/m_main.c index eeaa6f8d8f..8a079891d3 100644 --- a/coregrind/m_main.c +++ b/coregrind/m_main.c @@ -200,6 +200,8 @@ static void usage_NORETURN ( Bool debug_help ) " handle non-standard kernel variants\n" " --merge-recursive-frames= merge frames between identical\n" " program counters in max frames) [0]\n" +" --num-transtab-sectors= size of translated code cache [%d]\n" +" more sectors may increase the performance, but use more memory.\n" " --show-emwarns=no|yes show warnings about emulation limits? [no]\n" " --require-text-symbol=:sonamepattern:symbolpattern abort run if the\n" " stated shared object doesn't have the stated\n" @@ -306,7 +308,8 @@ static void usage_NORETURN ( Bool debug_help ) default_alignment /* char* */, default_redzone_size /* char* */, VG_(clo_vgdb_poll) /* int */, - VG_(vgdb_prefix_default)() /* char* */ + VG_(vgdb_prefix_default)() /* char* */, + N_SECTORS_DEFAULT /* int */ ); if (VG_(details).name) { VG_(printf)(" user options for %s:\n", VG_(details).name); @@ -606,6 +609,9 @@ void main_process_cmd_line_options ( /*OUT*/Bool* logging_to_fd, else if VG_INT_CLO (arg, "--sanity-level", VG_(clo_sanity_level)) {} else if VG_BINT_CLO(arg, "--num-callers", VG_(clo_backtrace_size), 1, VG_DEEPEST_BACKTRACE) {} + else if VG_BINT_CLO(arg, "--num-transtab-sectors", + VG_(clo_num_transtab_sectors), + MIN_N_SECTORS, MAX_N_SECTORS) {} else if VG_BINT_CLO(arg, "--merge-recursive-frames", VG_(clo_merge_recursive_frames), 0, VG_DEEPEST_BACKTRACE) {} diff --git a/coregrind/m_transtab.c b/coregrind/m_transtab.c index c27f7b7a47..456819b5d3 100644 --- a/coregrind/m_transtab.c +++ b/coregrind/m_transtab.c @@ -53,20 +53,13 @@ /*--- Management of the FIFO-based translation table+cache. ---*/ /*-------------------------------------------------------------*/ -/*------------------ CONSTANTS ------------------*/ - -/* Number of sectors the TC is divided into. 
If you need a larger - overall translation cache, increase this value. On Android, space - is limited, so try to get by with fewer sectors. On other - platforms we can go to town. 16 sectors gives theoretical capacity - of about 440MB of JITted code in 1.05 million translations - (realistically, about 2/3 of that) for Memcheck. */ -#if defined(VGPV_arm_linux_android) || defined(VGPV_x86_linux_android) -# define N_SECTORS 6 -#else -# define N_SECTORS 16 -#endif +/* Nr of sectors provided via command line parameter. */ +UInt VG_(clo_num_transtab_sectors) = N_SECTORS_DEFAULT; +/* Nr of sectors. + Will be set by VG_(init_tt_tc) to VG_(clo_num_transtab_sectors). */ +static int n_sectors; +/*------------------ CONSTANTS ------------------*/ /* Number of TC entries in each sector. This needs to be a prime number to work properly, it must be <= 65535 (so that a TT index fits in a UShort, leaving room for 0xFFFF(EC2TTE_DELETED) to denote @@ -356,7 +349,7 @@ typedef N_TC_SECTORS. The initial -1 value indicates the TT/TC system is not yet initialised. */ -static Sector sectors[N_SECTORS]; +static Sector sectors[MAX_N_SECTORS]; static Int youngest_sector = -1; /* The number of ULongs in each TCEntry area. This is computed once @@ -368,7 +361,7 @@ static Int tc_sector_szQ; searched to find translations. This is an optimisation to be used when searching for translations and should not affect correctness. -1 denotes "no entry". */ -static Int sector_search_order[N_SECTORS]; +static Int sector_search_order[MAX_N_SECTORS]; /* Fast helper for the TC. A direct-mapped cache which holds a set of @@ -447,7 +440,7 @@ static void ttaux_free ( void* p ) static inline TTEntry* index_tte ( UInt sNo, UInt tteNo ) { - vg_assert(sNo < N_SECTORS); + vg_assert(sNo < n_sectors); vg_assert(tteNo < N_TTES_PER_SECTOR); Sector* s = &sectors[sNo]; vg_assert(s->tt); @@ -682,7 +675,7 @@ Bool find_TTEntry_from_hcode( /*OUT*/UInt* from_sNo, Int i; /* Search order logic copied from VG_(search_transtab).
*/ - for (i = 0; i < N_SECTORS; i++) { + for (i = 0; i < n_sectors; i++) { Int sno = sector_search_order[i]; if (UNLIKELY(sno == -1)) return False; /* run out of sectors to search */ @@ -732,7 +725,7 @@ Bool find_TTEntry_from_hcode( /*OUT*/UInt* from_sNo, static Bool is_in_the_main_TC ( void* hcode ) { Int i, sno; - for (i = 0; i < N_SECTORS; i++) { + for (i = 0; i < n_sectors; i++) { sno = sector_search_order[i]; if (sno == -1) break; /* run out of sectors to search */ @@ -1222,32 +1215,32 @@ static Bool sanity_check_sector_search_order ( void ) { Int i, j, nListed; /* assert the array is the right size */ - vg_assert(N_SECTORS == (sizeof(sector_search_order) - / sizeof(sector_search_order[0]))); + vg_assert(MAX_N_SECTORS == (sizeof(sector_search_order) + / sizeof(sector_search_order[0]))); /* Check it's of the form valid_sector_numbers ++ [-1, -1, ..] */ - for (i = 0; i < N_SECTORS; i++) { - if (sector_search_order[i] < 0 || sector_search_order[i] >= N_SECTORS) + for (i = 0; i < n_sectors; i++) { + if (sector_search_order[i] < 0 || sector_search_order[i] >= n_sectors) break; } nListed = i; - for (/* */; i < N_SECTORS; i++) { + for (/* */; i < n_sectors; i++) { if (sector_search_order[i] != -1) break; } - if (i != N_SECTORS) + if (i != n_sectors) return False; /* Check each sector number only appears once */ - for (i = 0; i < N_SECTORS; i++) { + for (i = 0; i < n_sectors; i++) { if (sector_search_order[i] == -1) continue; - for (j = i+1; j < N_SECTORS; j++) { + for (j = i+1; j < n_sectors; j++) { if (sector_search_order[j] == sector_search_order[i]) return False; } } /* Check that the number of listed sectors equals the number in use, by counting nListed back down. 
*/ - for (i = 0; i < N_SECTORS; i++) { + for (i = 0; i < n_sectors; i++) { if (sectors[i].tc != NULL) nListed--; } @@ -1261,7 +1254,7 @@ static Bool sanity_check_all_sectors ( void ) Int sno; Bool sane; Sector* sec; - for (sno = 0; sno < N_SECTORS; sno++) { + for (sno = 0; sno < n_sectors; sno++) { Int i; Int nr_not_dead_hx = 0; Int szhxa; @@ -1308,7 +1301,7 @@ static UInt vge_osize ( VexGuestExtents* vge ) static Bool isValidSector ( Int sector ) { - if (sector < 0 || sector >= N_SECTORS) + if (sector < 0 || sector >= n_sectors) return False; return True; } @@ -1413,11 +1406,11 @@ static void initialiseSector ( Int sno ) sizeof(HostExtent)); /* Add an entry in the sector_search_order */ - for (i = 0; i < N_SECTORS; i++) { + for (i = 0; i < n_sectors; i++) { if (sector_search_order[i] == -1) break; } - vg_assert(i >= 0 && i < N_SECTORS); + vg_assert(i >= 0 && i < n_sectors); sector_search_order[i] = sno; if (VG_(clo_verbosity) > 2) @@ -1482,11 +1475,11 @@ static void initialiseSector ( Int sno ) /* Sanity check: ensure it is already in sector_search_order[]. */ - for (i = 0; i < N_SECTORS; i++) { + for (i = 0; i < n_sectors; i++) { if (sector_search_order[i] == sno) break; } - vg_assert(i >= 0 && i < N_SECTORS); + vg_assert(i >= 0 && i < n_sectors); if (VG_(clo_verbosity) > 2) VG_(message)(Vg_DebugMsg, "TT/TC: recycle sector %d\n", sno); @@ -1579,7 +1572,7 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, y, tt_loading_pct, tc_loading_pct); } youngest_sector++; - if (youngest_sector >= N_SECTORS) + if (youngest_sector >= n_sectors) youngest_sector = 0; y = youngest_sector; initialiseSector(y); @@ -1693,7 +1686,7 @@ Bool VG_(search_transtab) ( /*OUT*/AddrH* res_hcode, /* Search in all the sectors,using sector_search_order[] as a heuristic guide as to what order to visit the sectors. 
*/ - for (i = 0; i < N_SECTORS; i++) { + for (i = 0; i < n_sectors; i++) { sno = sector_search_order[i]; if (UNLIKELY(sno == -1)) @@ -1951,7 +1944,7 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, /* Fast scheme */ vg_assert(ec >= 0 && ec < ECLASS_MISC); - for (sno = 0; sno < N_SECTORS; sno++) { + for (sno = 0; sno < n_sectors; sno++) { sec = &sectors[sno]; if (sec->tc == NULL) continue; @@ -1972,7 +1965,7 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, VG_(debugLog)(2, "transtab", " SLOW, ec = %d\n", ec); - for (sno = 0; sno < N_SECTORS; sno++) { + for (sno = 0; sno < n_sectors; sno++) { sec = &sectors[sno]; if (sec->tc == NULL) continue; @@ -1996,7 +1989,7 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, vg_assert(sane); /* But now, also check the requested address range isn't present anywhere. */ - for (sno = 0; sno < N_SECTORS; sno++) { + for (sno = 0; sno < n_sectors; sno++) { sec = &sectors[sno]; if (sec->tc == NULL) continue; @@ -2211,9 +2204,13 @@ void VG_(init_tt_tc) ( void ) vg_assert(tc_sector_szQ >= 2 * N_TTES_PER_SECTOR_USABLE); vg_assert(tc_sector_szQ <= 100 * N_TTES_PER_SECTOR_USABLE); + n_sectors = VG_(clo_num_transtab_sectors); + vg_assert(n_sectors >= MIN_N_SECTORS); + vg_assert(n_sectors <= MAX_N_SECTORS); + /* Initialise the sectors */ youngest_sector = 0; - for (i = 0; i < N_SECTORS; i++) { + for (i = 0; i < n_sectors; i++) { sectors[i].tc = NULL; sectors[i].tt = NULL; sectors[i].tc_next = NULL; @@ -2227,7 +2224,7 @@ void VG_(init_tt_tc) ( void ) } /* Initialise the sector_search_order hint table. */ - for (i = 0; i < N_SECTORS; i++) + for (i = 0; i < n_sectors; i++) sector_search_order[i] = -1; /* Initialise the fast cache.
*/ @@ -2236,27 +2233,24 @@ void VG_(init_tt_tc) ( void ) /* and the unredir tt/tc */ init_unredir_tt_tc(); - if (VG_(clo_verbosity) > 2 || VG_(clo_stats)) { + if (VG_(clo_verbosity) > 2 || VG_(clo_stats) + || VG_(debugLog_getLevel) () >= 2) { VG_(message)(Vg_DebugMsg, "TT/TC: cache: %d sectors of %d bytes each = %d total\n", - N_SECTORS, 8 * tc_sector_szQ, - N_SECTORS * 8 * tc_sector_szQ ); + n_sectors, 8 * tc_sector_szQ, + n_sectors * 8 * tc_sector_szQ ); VG_(message)(Vg_DebugMsg, - "TT/TC: table: %d total entries, max occupancy %d (%d%%)\n", - N_SECTORS * N_TTES_PER_SECTOR, - N_SECTORS * N_TTES_PER_SECTOR_USABLE, + "TT/TC: table: %d tables of %d bytes each = %d total\n", + n_sectors, (int)(N_TTES_PER_SECTOR * sizeof(TTEntry)), + (int)(n_sectors * N_TTES_PER_SECTOR * sizeof(TTEntry))); + VG_(message)(Vg_DebugMsg, + "TT/TC: table: %d entries each = %d total entries" + " max occupancy %d (%d%%)\n", + N_TTES_PER_SECTOR, + n_sectors * N_TTES_PER_SECTOR, + n_sectors * N_TTES_PER_SECTOR_USABLE, SECTOR_TT_LIMIT_PERCENT ); } - - VG_(debugLog)(2, "transtab", - "cache: %d sectors of %d bytes each = %d total\n", - N_SECTORS, 8 * tc_sector_szQ, - N_SECTORS * 8 * tc_sector_szQ ); - VG_(debugLog)(2, "transtab", - "table: %d total entries, max occupancy %d (%d%%)\n", - N_SECTORS * N_TTES_PER_SECTOR, - N_SECTORS * N_TTES_PER_SECTOR_USABLE, - SECTOR_TT_LIMIT_PERCENT ); } @@ -2332,7 +2326,7 @@ ULong VG_(get_SB_profile) ( SBProfEntry tops[], UInt n_tops ) score_total = 0; - for (sno = 0; sno < N_SECTORS; sno++) { + for (sno = 0; sno < n_sectors; sno++) { if (sectors[sno].tc == NULL) continue; for (i = 0; i < N_TTES_PER_SECTOR; i++) { @@ -2370,7 +2364,7 @@ ULong VG_(get_SB_profile) ( SBProfEntry tops[], UInt n_tops ) /* Now zero out all the counter fields, so that we can make multiple calls here and just get the values since the last call, each time, rather than values accumulated for the whole run. 
*/ - for (sno = 0; sno < N_SECTORS; sno++) { + for (sno = 0; sno < n_sectors; sno++) { if (sectors[sno].tc == NULL) continue; for (i = 0; i < N_TTES_PER_SECTOR; i++) { diff --git a/coregrind/pub_core_options.h b/coregrind/pub_core_options.h index 58e19dd109..0202ab9e27 100644 --- a/coregrind/pub_core_options.h +++ b/coregrind/pub_core_options.h @@ -272,6 +272,9 @@ extern Word VG_(clo_main_stacksize); Note that the value is changeable by a gdbsrv command. */ extern Int VG_(clo_merge_recursive_frames); +/* Max number of sectors that will be used by the translation code cache. */ +extern UInt VG_(clo_num_transtab_sectors); + /* Delay startup to allow GDB to be attached? Default: NO */ extern Bool VG_(clo_wait_for_gdb); diff --git a/coregrind/pub_core_transtab.h b/coregrind/pub_core_transtab.h index 6ed1cb4390..b02296636e 100644 --- a/coregrind/pub_core_transtab.h +++ b/coregrind/pub_core_transtab.h @@ -53,8 +53,29 @@ extern __attribute__((aligned(16))) #define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1) + +/* Initialises the TC, using VG_(clo_num_transtab_sectors). + VG_(clo_num_transtab_sectors) must be >= MIN_N_SECTORS + and <= MAX_N_SECTORS. */ extern void VG_(init_tt_tc) ( void ); + +/* Limits for number of sectors the TC is divided into. If you need a larger + overall translation cache, increase MAX_N_SECTORS. */ +#define MIN_N_SECTORS 2 +#define MAX_N_SECTORS 32 + +/* Default for the nr of sectors, if not overridden by command line. + On Android, space is limited, so try to get by with fewer sectors. + On other platforms we can go to town. 16 sectors gives theoretical + capacity of about 440MB of JITted code in 1.05 million translations + (realistically, about 2/3 of that) for Memcheck.
*/ +#if defined(VGPV_arm_linux_android) || defined(VGPV_x86_linux_android) +# define N_SECTORS_DEFAULT 6 +#else +# define N_SECTORS_DEFAULT 16 +#endif + extern void VG_(add_to_transtab)( VexGuestExtents* vge, Addr64 entry, diff --git a/docs/xml/manual-core.xml b/docs/xml/manual-core.xml index 4de9a04fa0..acbb165df3 100644 --- a/docs/xml/manual-core.xml +++ b/docs/xml/manual-core.xml @@ -1852,6 +1852,28 @@ need to use them. + + + + + + Valgrind translates and instruments your program code. The + translations are stored in a translation cache organized in + sectors. If the cache is full, the sector containing the oldest + translations is emptied and recycled. If these old translations + are needed again, Valgrind must re-translate and re-instrument + the corresponding program code. If the "executed instructions" + working set of a program is big, increasing the number of + sectors may improve the performance by reducing the number of + re-translations needed. A sector is lazily allocated but once + allocated, it permanently uses several MB depending + on the tool (about 40 MB per sector for memcheck). + Use the option <option>--stats=yes</option> to obtain precise + information about the memory used by a sector and the allocation + and recycling of sectors. + + + diff --git a/none/tests/cmdline1.stdout.exp b/none/tests/cmdline1.stdout.exp index c39373feb1..abd100eea8 100644 --- a/none/tests/cmdline1.stdout.exp +++ b/none/tests/cmdline1.stdout.exp @@ -88,6 +88,8 @@ usage: valgrind [options] prog-and-args handle non-standard kernel variants --merge-recursive-frames= merge frames between identical program counters in max frames) [0] + --num-transtab-sectors= size of translated code cache [16] + more sectors may increase the performance, but use more memory. --show-emwarns=no|yes show warnings about emulation limits?
[no] --require-text-symbol=:sonamepattern:symbolpattern abort run if the stated shared object doesn't have the stated diff --git a/none/tests/cmdline2.stdout.exp b/none/tests/cmdline2.stdout.exp index 7579c30733..0ca629e600 100644 --- a/none/tests/cmdline2.stdout.exp +++ b/none/tests/cmdline2.stdout.exp @@ -88,6 +88,8 @@ usage: valgrind [options] prog-and-args handle non-standard kernel variants --merge-recursive-frames= merge frames between identical program counters in max frames) [0] + --num-transtab-sectors= size of translated code cache [16] + more sectors may increase the performance, but use more memory. --show-emwarns=no|yes show warnings about emulation limits? [no] --require-text-symbol=:sonamepattern:symbolpattern abort run if the stated shared object doesn't have the stated diff --git a/perf/bigcode.c b/perf/bigcode.c index 628cd01dd4..6e8940140b 100644 --- a/perf/bigcode.c +++ b/perf/bigcode.c @@ -1,6 +1,7 @@ // This artificial program runs a lot of code. The exact amount depends on -// the command line -- if any command line args are given, it does exactly +// the command line -- if an arg "0" is given, it does exactly // the same amount of work, but using four times as much code. +// If an arg >= 1 is given, the amount of code is multiplied by this arg. 
// // It's a stress test for Valgrind's translation speed; natively the two // modes run in about the same time (the I-cache effects aren't big enough @@ -9,6 +10,7 @@ #include #include +#include #include #if defined(__mips__) #include @@ -39,11 +41,6 @@ int main(int argc, char* argv[]) int h, i, sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0; int n_fns, n_reps; - char* a = mmap(0, FN_SIZE * N_LOOPS, - PROT_EXEC|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1,0); - assert(a != (char*)MAP_FAILED); - if (argc <= 1) { // Mode 1: not so much code n_fns = N_LOOPS / RATIO; @@ -51,12 +48,21 @@ int main(int argc, char* argv[]) printf("mode 1: "); } else { // Mode 2: lots of code - n_fns = N_LOOPS; + const int mul = atoi(argv[1]); + if (mul == 0) + n_fns = N_LOOPS; + else + n_fns = N_LOOPS * mul; n_reps = 1; printf("mode 1: "); } printf("%d copies of f(), %d reps\n", n_fns, n_reps); + char* a = mmap(0, FN_SIZE * n_fns, + PROT_EXEC|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1,0); + assert(a != (char*)MAP_FAILED); + // Make a whole lot of copies of f(). FN_SIZE is much bigger than f() // will ever be (we hope). for (i = 0; i < n_fns; i++) {