From 6d0f2f35c099eeeb590a05605643f10d2dbea50a Mon Sep 17 00:00:00 2001
From: Philippe Waroquiers
Date: Fri, 15 May 2015 11:41:54 +0000
Subject: [PATCH] This patch (re-)gains performance in helgrind, following
 revision 15207, which reduced memory use by doing SecMap GC but slowed down
 some workloads (typically, workloads doing a lot of malloc/free).

A significant part of the slowdown came from clearing the filter, which
was not optimised for big ranges: it worked byte by byte until reaching
8-byte alignment, then 8 bytes at a time.

With the patch, the filter clear is done as follows (see the standalone
sketch after this message):
* all the bytes up to 8-byte alignment are cleared together;
* then 8 bytes at a time, up to filter-line alignment (32 bytes);
* then 32 bytes (one filter line) at a time.

Moreover, as the filter cache is small (1024 lines of 32 bytes), clearing
the filter for ranges bigger than 32KB was uselessly checking the same
entries several times. This is now avoided by using a range check rather
than a tag equality check.

As the new filter clear is significantly more complex than the previous
simple algorithm, the old algorithm is kept and used to cross-check the
new one when CHECK_ZSM is defined as 1.

The patch also contains a few micro-optimisations and disables
VG_(track_die_mem_stack) ( evh__die_mem ); as this had no effect and was
somewhat costly.

With this patch, almost all perf tests are back to the performance we had
before revision 15207. Some tests are still slightly slower than before
the SecMap GC (max 2% difference). Some tests are now significantly
faster (e.g. sarp). For almost all tests, we are now faster than
valgrind 3.10.1. Details below.

Regtested on x86/amd64/ppc64 (also with all compile-time checks set).
I have also regtested with libreoffice and firefox (with firefox, also
with CHECK_ZSM set to 1).
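[Editor's note: for illustration only, here is a minimal standalone sketch
of the phased clear described above. It is not code from the patch: it
models a single 32-byte filter line as four 16-bit words holding 2 bits
per byte (byte b of an 8-byte group in bits 2b..2b+1, matching the
patch's mask comments), and the names Line and clear_in_line are invented
for the sketch.

   /* Sketch, not part of the patch: clear the 2-bit entries for bytes
      [a, a+len) within one modelled 32-byte filter line (len <= 32). */
   #include <stddef.h>
   #include <stdint.h>
   #include <stdio.h>

   typedef struct { uint16_t u16s[4]; } Line;   /* one 32-byte line */

   static void clear_in_line ( Line* line, uintptr_t a, size_t len )
   {
      size_t off = (a & 31) / 8;                /* current 8-byte group */

      /* Phase 1: head bytes up to 8-alignment, one masked store. */
      if (a & 7) {
         size_t   hiB  = 8 - (a & 7);           /* bytes to 8-alignment */
         uint16_t mask = 0xFFFFu << (16 - 2*hiB);
         if (len < hiB)                         /* range ends in the group */
            mask &= ~(0xFFFFu << (16 - 2*(hiB - len)));
         line->u16s[off] &= ~mask;
         if (len <= hiB) return;
         a += hiB; len -= hiB; off++;
      }
      /* Phase 2: whole 8-byte groups; the patch stops this at 32-byte
         alignment and then clears whole lines, four words at a time. */
      while (len >= 8) {
         line->u16s[off++] = 0;
         a += 8; len -= 8;
      }
      /* Tail: the remaining 1..7 bytes, again one masked store. */
      if (len > 0)
         line->u16s[off] &= ~(uint16_t)(0xFFFFu >> (16 - 2*len));
   }

   int main(void)
   {
      Line l = { { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF } };
      clear_in_line(&l, 3, 10);                 /* clear bytes 3..12 */
      printf("%04x %04x %04x %04x\n",
             l.u16s[0], l.u16s[1], l.u16s[2], l.u16s[3]);
      /* expected: 003f fc00 ffff ffff (bytes 0..2 and 13..31 stay set) */
      return 0;
   }
]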
Details about performance:
  hgtrace         = this patch
  trunk_untouched = trunk
  base_secmap     = trunk before SecMap GC
  valgrind 3.10.1 included for comparison
Measured on core i5 2.53GHz

-- Running tests in perf ----------------------------------------------
-- bigcode1 --
bigcode1  hgtrace        :0.14s  he: 2.6s (18.4x, -----)
bigcode1  trunk_untouched:0.14s  he: 2.6s (18.4x, -0.4%)
bigcode1  base_secmap    :0.14s  he: 2.6s (18.6x, -1.2%)
bigcode1  valgrind-3.10.1:0.14s  he: 2.8s (19.8x, -7.8%)
-- bigcode2 --
bigcode2  hgtrace        :0.14s  he: 6.3s (44.7x, -----)
bigcode2  trunk_untouched:0.14s  he: 6.2s (44.6x,  0.2%)
bigcode2  base_secmap    :0.14s  he: 6.3s (45.0x, -0.6%)
bigcode2  valgrind-3.10.1:0.14s  he: 6.6s (47.1x, -5.4%)
-- bz2 --
bz2       hgtrace        :0.64s  he:11.3s (17.7x, -----)
bz2       trunk_untouched:0.64s  he:11.7s (18.2x, -3.2%)
bz2       base_secmap    :0.64s  he:11.1s (17.3x,  1.9%)
bz2       valgrind-3.10.1:0.64s  he:12.6s (19.7x,-11.3%)
-- fbench --
fbench    hgtrace        :0.29s  he: 3.4s (11.8x, -----)
fbench    trunk_untouched:0.29s  he: 3.4s (11.7x,  0.6%)
fbench    base_secmap    :0.29s  he: 3.6s (12.4x, -5.0%)
fbench    valgrind-3.10.1:0.29s  he: 3.5s (12.2x, -3.5%)
-- ffbench --
ffbench   hgtrace        :0.26s  he: 9.8s (37.7x, -----)
ffbench   trunk_untouched:0.26s  he:10.0s (38.4x, -1.9%)
ffbench   base_secmap    :0.26s  he: 9.8s (37.8x, -0.2%)
ffbench   valgrind-3.10.1:0.26s  he:10.0s (38.4x, -1.9%)
-- heap --
heap      hgtrace        :0.11s  he: 9.2s (84.0x, -----)
heap      trunk_untouched:0.11s  he: 9.6s (87.1x, -3.7%)
heap      base_secmap    :0.11s  he: 9.0s (81.9x,  2.5%)
heap      valgrind-3.10.1:0.11s  he: 9.1s (82.9x,  1.3%)
-- heap_pdb4 --
heap_pdb4 hgtrace        :0.13s  he:10.7s (82.3x, -----)
heap_pdb4 trunk_untouched:0.13s  he:11.0s (84.8x, -3.0%)
heap_pdb4 base_secmap    :0.13s  he:10.5s (80.8x,  1.8%)
heap_pdb4 valgrind-3.10.1:0.13s  he:10.6s (81.8x,  0.7%)
-- many-loss-records --
many-loss-records hgtrace        :0.01s  he: 1.5s (152.0x, -----)
many-loss-records trunk_untouched:0.01s  he: 1.6s (157.0x, -3.3%)
many-loss-records base_secmap    :0.01s  he: 1.6s (158.0x, -3.9%)
many-loss-records valgrind-3.10.1:0.01s  he: 1.7s (167.0x, -9.9%)
-- many-xpts --
many-xpts hgtrace        :0.03s  he: 2.8s (91.7x, -----)
many-xpts trunk_untouched:0.03s  he: 2.8s (94.7x, -3.3%)
many-xpts base_secmap    :0.03s  he: 2.8s (94.0x, -2.5%)
many-xpts valgrind-3.10.1:0.03s  he: 2.9s (97.7x, -6.5%)
-- memrw --
memrw     hgtrace        :0.06s  he: 7.3s (121.2x, -----)
memrw     trunk_untouched:0.06s  he: 7.2s (120.3x,  0.7%)
memrw     base_secmap    :0.06s  he: 7.1s (117.7x,  2.9%)
memrw     valgrind-3.10.1:0.06s  he: 8.1s (135.2x,-11.6%)
-- sarp --
sarp      hgtrace        :0.02s  he: 7.6s (378.5x, -----)
sarp      trunk_untouched:0.02s  he: 8.4s (422.0x,-11.5%)
sarp      base_secmap    :0.02s  he: 8.6s (431.0x,-13.9%)
sarp      valgrind-3.10.1:0.02s  he: 8.8s (442.0x,-16.8%)
-- tinycc --
tinycc    hgtrace        :0.20s  he:12.4s (62.0x, -----)
tinycc    trunk_untouched:0.20s  he:12.6s (63.2x, -1.9%)
tinycc    base_secmap    :0.20s  he:12.6s (63.0x, -1.6%)
tinycc    valgrind-3.10.1:0.20s  he:12.7s (63.5x, -2.3%)
-- Finished tests in perf ----------------------------------------------
== 12 programs, 48 timings =================

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@15236
---
 helgrind/hg_main.c    |   6 +-
 helgrind/libhb_core.c | 154 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 156 insertions(+), 4 deletions(-)

diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c
index a8dbdc813a..ed0efbe844 100644
--- a/helgrind/hg_main.c
+++ b/helgrind/hg_main.c
@@ -5568,7 +5568,11 @@ static void hg_pre_clo_init ( void )
    VG_(track_die_mem_stack_signal)( evh__die_mem );
    VG_(track_die_mem_brk)         ( evh__die_mem_munmap );
    VG_(track_die_mem_munmap)      ( evh__die_mem_munmap );
-   VG_(track_die_mem_stack)      ( evh__die_mem );
+
+   /* evh__die_mem calls at the end libhb_srange_noaccess_NoFX,
+      which has no effect. We do not use VG_(track_die_mem_stack),
+      as this would be an expensive way to do nothing. */
+   // VG_(track_die_mem_stack)   ( evh__die_mem );
 
    // FIXME: what is this for?
    VG_(track_ban_mem_stack)       (NULL);

diff --git a/helgrind/libhb_core.c b/helgrind/libhb_core.c
index 791e95ecf7..d7710c23f6 100644
--- a/helgrind/libhb_core.c
+++ b/helgrind/libhb_core.c
@@ -3568,9 +3568,12 @@ static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
    }
 }
 
-static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
+/* Only used to verify the fast Filter__clear_range */
+__attribute__((unused))
+static void Filter__clear_range_SLOW ( Filter* fi, Addr a, UWord len )
 {
-   //VG_(printf)("%lu ", len);
+   tl_assert (CHECK_ZSM);
+
    /* slowly do part preceding 8-alignment */
    while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
       Filter__clear_1byte( fi, a );
@@ -3591,6 +3594,151 @@ static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
    }
 }
 
+static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
+{
+# if CHECK_ZSM > 0
+   /* We check the more complex algorithm below against the simple one.
+      This check is very expensive: we first do it the slow way on a
+      copy of the data, then do it the fast way. On RETURN, we check
+      that the two results are equal. */
+   Filter fi_check = *fi;
+   Filter__clear_range_SLOW(&fi_check, a, len);
+# define RETURN goto check_and_return
+# else
+# define RETURN return
+# endif
+
+   Addr    begtag = FI_GET_TAG(a);       /* tag of range begin */
+
+   Addr    end    = a + len - 1;
+   Addr    endtag = FI_GET_TAG(end);     /* tag of range end. */
+
+   UWord   rlen = len;                   /* remaining length to clear */
+
+   Addr    c = a;                        /* Current position we are clearing. */
+   UWord   clineno = FI_GET_LINENO(c);   /* Current lineno we are clearing */
+   FiLine* cline;                        /* Current line we are clearing */
+   UWord   cloff;                        /* Current offset in line we are
+                                            clearing, when clearing partial
+                                            lines. */
+
+   UShort  u16;
+
+   STATIC_ASSERT (FI_LINE_SZB == 32);
+   // Below assumes filter lines are 32 bytes
+
+   if (LIKELY(fi->tags[clineno] == begtag)) {
+      /* LIKELY for the heavy caller VG_(unknown_SP_update). */
+      /* First filter line matches begtag.
+         If c is not at the filter line begin, the below will clear
+         the filter line bytes starting from c. */
+      cline = &fi->lines[clineno];
+      cloff = (c - begtag) / 8;
+
+      /* First the byte(s) needed to reach 8-alignment */
+      if (UNLIKELY(!VG_IS_8_ALIGNED(c))) {
+         /* hiB is the nr of bytes (higher addresses) needed to get
+            from c to 8-alignment. */
+         UWord hiB = 8 - (c & 7);
+         /* Compute the 2-bit/byte mask representing the hiB bytes
+            [c..c+hiB[ :
+            mask is    C000, F000, FC00, FF00, FFC0, FFF0 or FFFC for the
+            byte range 7..7  6..7  5..7  4..7  3..7  2..7    1..7 */
+         UShort mask = 0xFFFF << (16 - 2*hiB);
+
+         u16  = cline->u16s[cloff];
+         if (LIKELY(rlen >= hiB)) {
+            cline->u16s[cloff] = u16 & ~mask; /* clear all hiB bytes from c */
+            rlen  -= hiB;
+            c     += hiB;
+            cloff += 1;
+         } else {
+            /* Only have the bits for rlen bytes. */
+            mask = mask & ~(0xFFFF << (16 - 2*(hiB-rlen)));
+            cline->u16s[cloff] = u16 & ~mask; /* clear rlen bytes from c. */
+            RETURN;  // We have cleared all that we can.
+         }
+      }
+      /* c is now 8-aligned. Clear 8 aligned bytes at a time,
+         till c is filter-line aligned. */
+      while (!VG_IS_32_ALIGNED(c) && rlen >= 8) {
+         cline->u16s[cloff] = 0;
+         c     += 8;
+         rlen  -= 8;
+         cloff += 1;
+      }
+   } else {
+      c = begtag + FI_LINE_SZB;
+      if (c > end)
+         RETURN;  // We have cleared all that we can.
+      rlen -= c - a;
+   }
+   // We have changed c, so re-establish clineno.
+   clineno = FI_GET_LINENO(c);
+
+   if (rlen >= FI_LINE_SZB) {
+      /* Here, c is filter line-aligned. Clear all the full lines that
+         overlap with the range starting at c. */
+      UWord nfull    = rlen / FI_LINE_SZB;
+      UWord full_len = nfull * FI_LINE_SZB;
+      rlen -= full_len;
+      if (nfull > FI_NUM_LINES)
+         nfull = FI_NUM_LINES; // no need to check the same entry several times.
+
+      for (UWord n = 0; n < nfull; n++) {
+         if (UNLIKELY(address_in_range(fi->tags[clineno], c, full_len))) {
+            cline = &fi->lines[clineno];
+            cline->u16s[0] = 0;
+            cline->u16s[1] = 0;
+            cline->u16s[2] = 0;
+            cline->u16s[3] = 0;
+            STATIC_ASSERT (4 == sizeof(cline->u16s)/sizeof(cline->u16s[0]));
+         }
+         clineno++;
+         if (UNLIKELY(clineno == FI_NUM_LINES))
+            clineno = 0;
+      }
+
+      c += full_len;
+      clineno = FI_GET_LINENO(c);
+   }
+
+   if (CHECK_ZSM) {
+      tl_assert(VG_IS_8_ALIGNED(c));
+      tl_assert(clineno == FI_GET_LINENO(c));
+   }
+
+   /* Do the last filter line, if it was not cleared as a full filter line */
+   if (UNLIKELY(rlen > 0) && fi->tags[clineno] == endtag) {
+      cline = &fi->lines[clineno];
+      cloff = (c - endtag) / 8;
+      if (CHECK_ZSM) tl_assert(FI_GET_TAG(c) == endtag);
+
+      /* c is 8-aligned. Clear 8 bytes at a time, till we have less
+         than 8 bytes left. */
+      while (rlen >= 8) {
+         cline->u16s[cloff] = 0;
+         c     += 8;
+         rlen  -= 8;
+         cloff += 1;
+      }
+      /* Then the remaining byte(s) */
+      if (rlen > 0) {
+         /* nr of bytes from c to reach end. */
+         UWord loB = rlen;
+         /* Compute the mask representing the loB bytes [c..c+loB[ :
+            mask is 0003, 000F, 003F, 00FF, 03FF, 0FFF or 3FFF */
+         UShort mask = 0xFFFF >> (16 - 2*loB);
+
+         u16  = cline->u16s[cloff];
+         cline->u16s[cloff] = u16 & ~mask; /* clear all loB bytes from c */
+      }
+   }
+
+# if CHECK_ZSM > 0
+  check_and_return:
+   tl_assert (VG_(memcmp)(&fi_check, fi, sizeof(fi_check)) == 0);
+# endif
+# undef RETURN
+}
 
 /* ------ Read handlers for the filter. ------ */
 
@@ -6786,7 +6934,7 @@ static void zsm_sset_range_noaccess (Addr addr, SizeT len)
    if (CHECK_ZSM)
       tl_assert(is_sane_SecMap(sm));
    for (UInt lz = 0; lz < N_SECMAP_ZLINES; lz++) {
-      if (sm->linesZ[lz].dict[0] != SVal_INVALID)
+      if (LIKELY(sm->linesZ[lz].dict[0] != SVal_INVALID))
         rcdec_LineZ(&sm->linesZ[lz]);
    }
    for (UInt lf = 0; lf < sm->linesF_size; lf++) {
-- 
2.47.3
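
[Editor's note: as a quick sanity check of the mask arithmetic used by the
new Filter__clear_range, the following standalone harness — a sketch, not
part of the patch — verifies that the shift formulas reproduce the head
and tail mask tables given in the patch comments. It assumes byte b of an
8-byte group occupies bits 2b..2b+1 of its 16-bit word, which is what the
patch's mask comments imply.

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      /* Head masks: bytes [8-hiB .. 7] of a group, hiB = 1..7; the patch
         comment lists C000 F000 FC00 FF00 FFC0 FFF0 FFFC. */
      static const uint16_t head[8] =
         { 0, 0xC000, 0xF000, 0xFC00, 0xFF00, 0xFFC0, 0xFFF0, 0xFFFC };
      /* Tail masks: bytes [0 .. loB-1], loB = 1..7; the patch comment
         lists 0003 000F 003F 00FF 03FF 0FFF 3FFF. */
      static const uint16_t tail[8] =
         { 0, 0x0003, 0x000F, 0x003F, 0x00FF, 0x03FF, 0x0FFF, 0x3FFF };

      for (unsigned hiB = 1; hiB <= 7; hiB++)
         assert((uint16_t)(0xFFFFu << (16 - 2*hiB)) == head[hiB]);
      for (unsigned loB = 1; loB <= 7; loB++)
         assert((uint16_t)(0xFFFFu >> (16 - 2*loB)) == tail[loB]);

      /* Partial head (rlen < hiB): the adjusted mask must cover exactly
         bytes (8-hiB) .. (8-hiB+rlen-1), i.e. rlen bytes from c. */
      for (unsigned hiB = 2; hiB <= 7; hiB++)
         for (unsigned rlen = 1; rlen < hiB; rlen++) {
            uint16_t m = (uint16_t)(0xFFFFu << (16 - 2*hiB));
            m &= (uint16_t)~(0xFFFFu << (16 - 2*(hiB - rlen)));
            uint16_t expect = 0;
            for (unsigned b = 8 - hiB; b < 8 - hiB + rlen; b++)
               expect |= (uint16_t)(3u << (2*b));
            assert(m == expect);
         }

      printf("mask tables OK\n");
      return 0;
   }
]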