From: Julian Seward Date: Wed, 8 Dec 2021 06:52:09 +0000 (+0100) Subject: Bug 446103 - Memcheck: `--track-origins=yes` causes extreme slowdowns for large mmap... X-Git-Tag: VALGRIND_3_19_0~56 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8ee1656165902125c414d598cff788c7bb0b1556;p=thirdparty%2Fvalgrind.git Bug 446103 - Memcheck: `--track-origins=yes` causes extreme slowdowns for large mmap/munmap. This patch rewrites the Level 2 origin-tracking cache (ocacheL2) so that set-address-range-permissions (SARP) operations on it, for large ranges, are at least a factor of 2.5 x faster. This is primarily targeted at SARPs in the range of hundreds to thousands of megabytes. The Level 1 origin-tracking cache covers 64MB address space, so SARPs that fit within it are mostly unaffected. There are extensive comments in-line. Changes are: * Change the Level 2 cache from a single AVL tree (OSet) into 4096 such trees, selected by middle bits of the tag, hence "taking out" 12 significant bits of search in any given tree. * For the OCacheLine type, use a union so as to overlay the w32 and descr arrays with an array of 64-bit values. This is used to speed up cases where those fields are to be set to zero, or checked against zero. * Due to the various fast-paths added by this patch, OC_BITS_PER_LINE has pretty much been frozen at the current value, 5. * ocache_sarp_Set_Origins, ocache_sarp_Clear_Origins: deal with large ranges in 32-byte steps instead of 4-byte steps. * MC_(helperc_b_store32), MC_(helperc_b_store16): rewrite these to be (much) more efficient. * fast-return cases for VG_(OSetGen_Lookup) and VG_(OSetGen_Remove) when the tree is empty * a few extra inline hints --- diff --git a/NEWS b/NEWS index e7b53a515c..a0844428ef 100644 --- a/NEWS +++ b/NEWS @@ -57,10 +57,11 @@ are not entered into bugzilla tend to get forgotten about or ignored. 446139 DRD/Helgrind with std::shared_timed_mutex::try_lock_until and try_lock_shared_until false positives 446138 DRD/Helgrind with std::timed_mutex::try_lock_until false positives 446281 Add a DRD suppression for fwrite +446103 Memcheck: `--track-origins=yes` causes extreme slowdowns for large mmap/munmap To see details of a given bug, visit https://bugs.kde.org/show_bug.cgi?id=XXXXXX -where XXXXXX is the bug number as listed below. +where XXXXXX is the bug number as listed above. diff --git a/coregrind/m_oset.c b/coregrind/m_oset.c index cf28faec95..0521904d96 100644 --- a/coregrind/m_oset.c +++ b/coregrind/m_oset.c @@ -570,7 +570,7 @@ void VG_(OSetWord_Insert)(AvlTree* t, UWord val) /*--------------------------------------------------------------------*/ // Find the *node* in t matching k, or NULL if not found. -static AvlNode* avl_lookup(const AvlTree* t, const void* k) +static inline AvlNode* avl_lookup(const AvlTree* t, const void* k) { Word cmpres; AvlNode* curr = t->root; @@ -604,9 +604,10 @@ static AvlNode* avl_lookup(const AvlTree* t, const void* k) // Find the *element* in t matching k, or NULL if not found. void* VG_(OSetGen_Lookup)(const AvlTree* t, const void* k) { - AvlNode* n; vg_assert(t); - n = avl_lookup(t, k); + if (LIKELY(t->root == NULL)) + return NULL; + AvlNode* n = avl_lookup(t, k); return ( n ? elem_of_node(n) : NULL ); } @@ -767,6 +768,8 @@ static Bool avl_removeroot(AvlTree* t) // if not present. void* VG_(OSetGen_Remove)(AvlTree* t, const void* k) { + if (LIKELY(t->root == NULL)) + return NULL; // Have to find the node first, then remove it. AvlNode* n = avl_lookup(t, k); if (n) { diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c index d268d7e52b..47b370bf51 100644 --- a/memcheck/mc_main.c +++ b/memcheck/mc_main.c @@ -2402,7 +2402,9 @@ static UWord stats_ocacheL1_misses = 0; static UWord stats_ocacheL1_lossage = 0; static UWord stats_ocacheL1_movefwds = 0; -static UWord stats__ocacheL2_refs = 0; +static UWord stats__ocacheL2_finds = 0; +static UWord stats__ocacheL2_adds = 0; +static UWord stats__ocacheL2_dels = 0; static UWord stats__ocacheL2_misses = 0; static UWord stats__ocacheL2_n_nodes_max = 0; @@ -2431,26 +2433,86 @@ static INLINE Bool is_valid_oc_tag ( Addr tag ) { #define OC_MOVE_FORWARDS_EVERY_BITS 7 +/* Originally (pre Dec 2021) it was the case that this code had a + parameterizable cache line size, set by changing OC_BITS_PER_LINE. + However, as a result of the speedup fixes necessitated by bug 446103, that + is no longer really the case, and much of the L1 and L2 cache code has been + tuned specifically for the case OC_BITS_PER_LINE == 5 (that is, the line + size is 32 bytes). Changing that would require a bunch of re-tuning + effort. So let's set it in stone for now. */ +STATIC_ASSERT(OC_BITS_PER_LINE == 5); +STATIC_ASSERT(OC_LINES_PER_SET == 2); + +/* Fundamentally we want an OCacheLine structure (see below) as follows: + struct { + Addr tag; + UInt w32 [OC_W32S_PER_LINE]; + UChar descr[OC_W32S_PER_LINE]; + } + However, in various places, we want to set the w32[] and descr[] arrays to + zero, or check if they are zero. This can be a very hot path (per bug + 446103). So, instead, we have a union which is either those two arrays + (OCacheLine_Main) or simply an array of ULongs (OCacheLine_W64s). For the + set-zero/test-zero operations, the OCacheLine_W64s are used. +*/ + +// To ensure that OCacheLine.descr[] will fit in an integral number of ULongs. +STATIC_ASSERT(0 == (OC_W32S_PER_LINE % 8)); + +#define OC_W64S_PER_MAIN /* "MAIN" meaning "struct OCacheLine_Main" */ \ + (OC_W32S_PER_LINE / 2 /* covers OCacheLine_Main.w32[] */ \ + + OC_W32S_PER_LINE / 8) /* covers OCacheLine_Main.descr[] */ +STATIC_ASSERT(OC_W64S_PER_MAIN == 5); + +typedef + ULong OCacheLine_W64s[OC_W64S_PER_MAIN]; + typedef struct { - Addr tag; - UInt w32[OC_W32S_PER_LINE]; + UInt w32 [OC_W32S_PER_LINE]; UChar descr[OC_W32S_PER_LINE]; } + OCacheLine_Main; + +STATIC_ASSERT(sizeof(OCacheLine_W64s) == sizeof(OCacheLine_Main)); + +typedef + struct { + Addr tag; + union { + OCacheLine_W64s w64s; + OCacheLine_Main main; + } u; + } OCacheLine; /* Classify and also sanity-check 'line'. Return 'e' (empty) if not in use, 'n' (nonzero) if it contains at least one valid origin tag, and 'z' if all the represented tags are zero. */ -static UChar classify_OCacheLine ( OCacheLine* line ) +static inline UChar classify_OCacheLine ( OCacheLine* line ) { UWord i; if (line->tag == 1/*invalid*/) return 'e'; /* EMPTY */ tl_assert(is_valid_oc_tag(line->tag)); + + // BEGIN fast special-case of the test loop below. This will detect + // zero-ness (case 'z') for a subset of cases that the loop below will, + // hence is safe. + if (OC_W64S_PER_MAIN == 5) { + if (line->u.w64s[0] == 0 + && line->u.w64s[1] == 0 && line->u.w64s[2] == 0 + && line->u.w64s[3] == 0 && line->u.w64s[4] == 0) { + return 'z'; + } + } else { + tl_assert2(0, "unsupported line size (classify_OCacheLine)"); + } + // END fast special-case of the test loop below. + for (i = 0; i < OC_W32S_PER_LINE; i++) { - tl_assert(0 == ((~0xF) & line->descr[i])); - if (line->w32[i] > 0 && line->descr[i] > 0) + tl_assert(0 == ((~0xF) & line->u.main.descr[i])); + if (line->u.main.w32[i] > 0 && line->u.main.descr[i] > 0) return 'n'; /* NONZERO - contains useful info */ } return 'z'; /* ZERO - no useful info */ @@ -2491,7 +2553,7 @@ static void init_OCache ( void ) init_ocacheL2(); } -static void moveLineForwards ( OCacheSet* set, UWord lineno ) +static inline void moveLineForwards ( OCacheSet* set, UWord lineno ) { OCacheLine tmp; stats_ocacheL1_movefwds++; @@ -2501,11 +2563,23 @@ static void moveLineForwards ( OCacheSet* set, UWord lineno ) set->line[lineno] = tmp; } -static void zeroise_OCacheLine ( OCacheLine* line, Addr tag ) { +static inline void zeroise_OCacheLine ( OCacheLine* line, Addr tag ) { UWord i; - for (i = 0; i < OC_W32S_PER_LINE; i++) { - line->w32[i] = 0; /* NO ORIGIN */ - line->descr[i] = 0; /* REALLY REALLY NO ORIGIN! */ + if (OC_W32S_PER_LINE == 8) { + // BEGIN fast special-case of the loop below + tl_assert(OC_W64S_PER_MAIN == 5); + line->u.w64s[0] = 0; + line->u.w64s[1] = 0; + line->u.w64s[2] = 0; + line->u.w64s[3] = 0; + line->u.w64s[4] = 0; + // END fast special-case of the loop below + } else { + tl_assert2(0, "unsupported line size (zeroise_OCacheLine)"); + for (i = 0; i < OC_W32S_PER_LINE; i++) { + line->u.main.w32[i] = 0; /* NO ORIGIN */ + line->u.main.descr[i] = 0; /* REALLY REALLY NO ORIGIN! */ + } } line->tag = tag; } @@ -2513,7 +2587,37 @@ static void zeroise_OCacheLine ( OCacheLine* line, Addr tag ) { ////////////////////////////////////////////////////////////// //// OCache backing store -static OSet* ocacheL2 = NULL; +// The backing store for ocacheL1 is, conceptually, an AVL tree of lines that +// got ejected from the L1 (a "victim cache"), and which actually contain +// useful info -- that is, for which classify_OCacheLine would return 'n' and +// no other value. However, the tree can grow large, and searching/updating +// it can be hot paths. Hence we "take out" 12 significant bits of the key by +// having 4096 trees, and select one using HASH_OCACHE_TAG. +// +// What that hash function returns isn't important so long as it is a pure +// function of the tag values, and is < 4096. However, it is critical for +// performance of long SARPs. Hence the extra shift of 11 bits. This means +// each tree conceptually is assigned to contiguous sequences of 2048 lines in +// the "line address space", giving some locality of reference when scanning +// linearly through address space, as is done by a SARP. Changing that 11 to +// 0 gives terrible performance on long SARPs, presumably because each new +// line is in a different tree, hence we wind up thrashing the (CPU's) caches. +// +// On 32-bit targets, we have to be a bit careful not to shift out so many +// bits that not all 2^12 trees get used. That leads to the constraint +// (OC_BITS_PER_LINE + 11 + 12) < 32. Note that the 11 is the only thing we +// can change here. In this case we have OC_BITS_PER_LINE == 5, hence the +// inequality is (28 < 32) and so we're good. +// +// The value 11 was determined empirically from various Firefox runs. 10 or +// 12 also work pretty well. + +static OSet* ocachesL2[4096]; + +STATIC_ASSERT((OC_BITS_PER_LINE + 11 + 12) < 32); +static inline UInt HASH_OCACHE_TAG ( Addr tag ) { + return (UInt)((tag >> (OC_BITS_PER_LINE + 11)) & 0xFFF); +} static void* ocacheL2_malloc ( const HChar* cc, SizeT szB ) { return VG_(malloc)(cc, szB); @@ -2527,23 +2631,26 @@ static UWord stats__ocacheL2_n_nodes = 0; static void init_ocacheL2 ( void ) { - tl_assert(!ocacheL2); tl_assert(sizeof(Word) == sizeof(Addr)); /* since OCacheLine.tag :: Addr */ tl_assert(0 == offsetof(OCacheLine,tag)); - ocacheL2 - = VG_(OSetGen_Create)( offsetof(OCacheLine,tag), - NULL, /* fast cmp */ - ocacheL2_malloc, "mc.ioL2", ocacheL2_free); + for (UInt i = 0; i < 4096; i++) { + tl_assert(!ocachesL2[i]); + ocachesL2[i] + = VG_(OSetGen_Create)( offsetof(OCacheLine,tag), + NULL, /* fast cmp */ + ocacheL2_malloc, "mc.ioL2", ocacheL2_free); + } stats__ocacheL2_n_nodes = 0; } /* Find line with the given tag in the tree, or NULL if not found. */ -static OCacheLine* ocacheL2_find_tag ( Addr tag ) +static inline OCacheLine* ocacheL2_find_tag ( Addr tag ) { OCacheLine* line; tl_assert(is_valid_oc_tag(tag)); - stats__ocacheL2_refs++; - line = VG_(OSetGen_Lookup)( ocacheL2, &tag ); + stats__ocacheL2_finds++; + OSet* oset = ocachesL2[HASH_OCACHE_TAG(tag)]; + line = VG_(OSetGen_Lookup)( oset, &tag ); return line; } @@ -2553,10 +2660,11 @@ static void ocacheL2_del_tag ( Addr tag ) { OCacheLine* line; tl_assert(is_valid_oc_tag(tag)); - stats__ocacheL2_refs++; - line = VG_(OSetGen_Remove)( ocacheL2, &tag ); + stats__ocacheL2_dels++; + OSet* oset = ocachesL2[HASH_OCACHE_TAG(tag)]; + line = VG_(OSetGen_Remove)( oset, &tag ); if (line) { - VG_(OSetGen_FreeNode)(ocacheL2, line); + VG_(OSetGen_FreeNode)(oset, line); tl_assert(stats__ocacheL2_n_nodes > 0); stats__ocacheL2_n_nodes--; } @@ -2568,10 +2676,11 @@ static void ocacheL2_add_line ( OCacheLine* line ) { OCacheLine* copy; tl_assert(is_valid_oc_tag(line->tag)); - copy = VG_(OSetGen_AllocNode)( ocacheL2, sizeof(OCacheLine) ); + OSet* oset = ocachesL2[HASH_OCACHE_TAG(line->tag)]; + copy = VG_(OSetGen_AllocNode)( oset, sizeof(OCacheLine) ); *copy = *line; - stats__ocacheL2_refs++; - VG_(OSetGen_Insert)( ocacheL2, copy ); + stats__ocacheL2_adds++; + VG_(OSetGen_Insert)( oset, copy ); stats__ocacheL2_n_nodes++; if (stats__ocacheL2_n_nodes > stats__ocacheL2_n_nodes_max) stats__ocacheL2_n_nodes_max = stats__ocacheL2_n_nodes; @@ -2626,7 +2735,7 @@ static OCacheLine* find_OCacheLine_SLOW ( Addr a ) /* line contains zeroes. We must ensure the backing store is updated accordingly, either by copying the line there verbatim, or by ensuring it isn't present there. We - chosse the latter on the basis that it reduces the size of + choose the latter on the basis that it reduces the size of the backing store. */ ocacheL2_del_tag( victim->tag ); break; @@ -2697,10 +2806,10 @@ static INLINE void set_aligned_word64_Origin_to_undef ( Addr a, UInt otag ) && lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/); } line = find_OCacheLine( a ); - line->descr[lineoff+0] = 0xF; - line->descr[lineoff+1] = 0xF; - line->w32[lineoff+0] = otag; - line->w32[lineoff+1] = otag; + line->u.main.descr[lineoff+0] = 0xF; + line->u.main.descr[lineoff+1] = 0xF; + line->u.main.w32[lineoff+0] = otag; + line->u.main.w32[lineoff+1] = otag; } //// END inlined, specialised version of MC_(helperc_b_store8) } @@ -2751,8 +2860,8 @@ void make_aligned_word32_undefined_w_otag ( Addr a, UInt otag ) tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE); } line = find_OCacheLine( a ); - line->descr[lineoff] = 0xF; - line->w32[lineoff] = otag; + line->u.main.descr[lineoff] = 0xF; + line->u.main.w32[lineoff] = otag; } //// END inlined, specialised version of MC_(helperc_b_store4) } @@ -2788,7 +2897,7 @@ void make_aligned_word32_noaccess ( Addr a ) tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE); } line = find_OCacheLine( a ); - line->descr[lineoff] = 0; + line->u.main.descr[lineoff] = 0; } //// END inlined, specialised version of MC_(helperc_b_store4) } @@ -2834,10 +2943,10 @@ void make_aligned_word64_undefined_w_otag ( Addr a, UInt otag ) tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/); line = find_OCacheLine( a ); - line->descr[lineoff+0] = 0xF; - line->descr[lineoff+1] = 0xF; - line->w32[lineoff+0] = otag; - line->w32[lineoff+1] = otag; + line->u.main.descr[lineoff+0] = 0xF; + line->u.main.descr[lineoff+1] = 0xF; + line->u.main.w32[lineoff+0] = otag; + line->u.main.w32[lineoff+1] = otag; } //// END inlined, specialised version of MC_(helperc_b_store8) } @@ -2872,8 +2981,8 @@ void make_aligned_word64_noaccess ( Addr a ) tl_assert(lineoff >= 0 && lineoff < OC_W32S_PER_LINE -1/*'cos 8-aligned*/); line = find_OCacheLine( a ); - line->descr[lineoff+0] = 0; - line->descr[lineoff+1] = 0; + line->u.main.descr[lineoff+0] = 0; + line->u.main.descr[lineoff+1] = 0; } //// END inlined, specialised version of MC_(helperc_b_store8) } @@ -7399,7 +7508,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load1)( Addr a ) { line = find_OCacheLine( a ); - descr = line->descr[lineoff]; + descr = line->u.main.descr[lineoff]; if (OC_ENABLE_ASSERTIONS) { tl_assert(descr < 0x10); } @@ -7407,7 +7516,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load1)( Addr a ) { if (LIKELY(0 == (descr & (1 << byteoff)))) { return 0; } else { - return line->w32[lineoff]; + return line->u.main.w32[lineoff]; } } @@ -7431,7 +7540,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load2)( Addr a ) { } line = find_OCacheLine( a ); - descr = line->descr[lineoff]; + descr = line->u.main.descr[lineoff]; if (OC_ENABLE_ASSERTIONS) { tl_assert(descr < 0x10); } @@ -7439,7 +7548,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load2)( Addr a ) { if (LIKELY(0 == (descr & (3 << byteoff)))) { return 0; } else { - return line->w32[lineoff]; + return line->u.main.w32[lineoff]; } } @@ -7462,7 +7571,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load4)( Addr a ) { line = find_OCacheLine( a ); - descr = line->descr[lineoff]; + descr = line->u.main.descr[lineoff]; if (OC_ENABLE_ASSERTIONS) { tl_assert(descr < 0x10); } @@ -7470,7 +7579,7 @@ UWord VG_REGPARM(1) MC_(helperc_b_load4)( Addr a ) { if (LIKELY(0 == descr)) { return 0; } else { - return line->w32[lineoff]; + return line->u.main.w32[lineoff]; } } @@ -7493,8 +7602,8 @@ UWord VG_REGPARM(1) MC_(helperc_b_load8)( Addr a ) { line = find_OCacheLine( a ); - descrLo = line->descr[lineoff + 0]; - descrHi = line->descr[lineoff + 1]; + descrLo = line->u.main.descr[lineoff + 0]; + descrHi = line->u.main.descr[lineoff + 1]; descr = descrLo | descrHi; if (OC_ENABLE_ASSERTIONS) { tl_assert(descr < 0x10); @@ -7503,8 +7612,8 @@ UWord VG_REGPARM(1) MC_(helperc_b_load8)( Addr a ) { if (LIKELY(0 == descr)) { return 0; /* both 32-bit chunks are defined */ } else { - UInt oLo = descrLo == 0 ? 0 : line->w32[lineoff + 0]; - UInt oHi = descrHi == 0 ? 0 : line->w32[lineoff + 1]; + UInt oLo = descrLo == 0 ? 0 : line->u.main.w32[lineoff + 0]; + UInt oHi = descrHi == 0 ? 0 : line->u.main.w32[lineoff + 1]; return merge_origins(oLo, oHi); } } @@ -7543,10 +7652,10 @@ void VG_REGPARM(2) MC_(helperc_b_store1)( Addr a, UWord d32 ) { line = find_OCacheLine( a ); if (d32 == 0) { - line->descr[lineoff] &= ~(1 << byteoff); + line->u.main.descr[lineoff] &= ~(1 << byteoff); } else { - line->descr[lineoff] |= (1 << byteoff); - line->w32[lineoff] = d32; + line->u.main.descr[lineoff] |= (1 << byteoff); + line->u.main.w32[lineoff] = d32; } } @@ -7571,10 +7680,10 @@ void VG_REGPARM(2) MC_(helperc_b_store2)( Addr a, UWord d32 ) { line = find_OCacheLine( a ); if (d32 == 0) { - line->descr[lineoff] &= ~(3 << byteoff); + line->u.main.descr[lineoff] &= ~(3 << byteoff); } else { - line->descr[lineoff] |= (3 << byteoff); - line->w32[lineoff] = d32; + line->u.main.descr[lineoff] |= (3 << byteoff); + line->u.main.w32[lineoff] = d32; } } @@ -7597,14 +7706,15 @@ void VG_REGPARM(2) MC_(helperc_b_store4)( Addr a, UWord d32 ) { line = find_OCacheLine( a ); if (d32 == 0) { - line->descr[lineoff] = 0; + line->u.main.descr[lineoff] = 0; } else { - line->descr[lineoff] = 0xF; - line->w32[lineoff] = d32; + line->u.main.descr[lineoff] = 0xF; + line->u.main.w32[lineoff] = d32; } } void VG_REGPARM(2) MC_(helperc_b_store8)( Addr a, UWord d32 ) { + STATIC_ASSERT(OC_W32S_PER_LINE == 8); OCacheLine* line; UWord lineoff; @@ -7623,26 +7733,98 @@ void VG_REGPARM(2) MC_(helperc_b_store8)( Addr a, UWord d32 ) { line = find_OCacheLine( a ); if (d32 == 0) { - line->descr[lineoff + 0] = 0; - line->descr[lineoff + 1] = 0; + line->u.main.descr[lineoff + 0] = 0; + line->u.main.descr[lineoff + 1] = 0; } else { - line->descr[lineoff + 0] = 0xF; - line->descr[lineoff + 1] = 0xF; - line->w32[lineoff + 0] = d32; - line->w32[lineoff + 1] = d32; + line->u.main.descr[lineoff + 0] = 0xF; + line->u.main.descr[lineoff + 1] = 0xF; + line->u.main.w32[lineoff + 0] = d32; + line->u.main.w32[lineoff + 1] = d32; } } void VG_REGPARM(2) MC_(helperc_b_store16)( Addr a, UWord d32 ) { - MC_(helperc_b_store8)( a + 0, d32 ); - MC_(helperc_b_store8)( a + 8, d32 ); + STATIC_ASSERT(OC_W32S_PER_LINE == 8); + OCacheLine* line; + UWord lineoff; + + if (UNLIKELY(a & 15)) { + /* Handle misaligned case, slowly. */ + MC_(helperc_b_store8)( a + 0, d32 ); + MC_(helperc_b_store8)( a + 8, d32 ); + return; + } + + lineoff = oc_line_offset(a); + if (OC_ENABLE_ASSERTIONS) { + tl_assert(lineoff == (lineoff & 4)); /*0,4*//*since 16-aligned*/ + } + + line = find_OCacheLine( a ); + + if (d32 == 0) { + line->u.main.descr[lineoff + 0] = 0; + line->u.main.descr[lineoff + 1] = 0; + line->u.main.descr[lineoff + 2] = 0; + line->u.main.descr[lineoff + 3] = 0; + } else { + line->u.main.descr[lineoff + 0] = 0xF; + line->u.main.descr[lineoff + 1] = 0xF; + line->u.main.descr[lineoff + 2] = 0xF; + line->u.main.descr[lineoff + 3] = 0xF; + line->u.main.w32[lineoff + 0] = d32; + line->u.main.w32[lineoff + 1] = d32; + line->u.main.w32[lineoff + 2] = d32; + line->u.main.w32[lineoff + 3] = d32; + } } void VG_REGPARM(2) MC_(helperc_b_store32)( Addr a, UWord d32 ) { - MC_(helperc_b_store8)( a + 0, d32 ); - MC_(helperc_b_store8)( a + 8, d32 ); - MC_(helperc_b_store8)( a + 16, d32 ); - MC_(helperc_b_store8)( a + 24, d32 ); + STATIC_ASSERT(OC_W32S_PER_LINE == 8); + OCacheLine* line; + UWord lineoff; + + if (UNLIKELY(a & 31)) { + /* Handle misaligned case, slowly. */ + MC_(helperc_b_store16)( a + 0, d32 ); + MC_(helperc_b_store16)( a + 16, d32 ); + return; + } + + lineoff = oc_line_offset(a); + if (OC_ENABLE_ASSERTIONS) { + tl_assert(lineoff == 0); + } + + line = find_OCacheLine( a ); + + if (d32 == 0) { + line->u.main.descr[0] = 0; + line->u.main.descr[1] = 0; + line->u.main.descr[2] = 0; + line->u.main.descr[3] = 0; + line->u.main.descr[4] = 0; + line->u.main.descr[5] = 0; + line->u.main.descr[6] = 0; + line->u.main.descr[7] = 0; + } else { + line->u.main.descr[0] = 0xF; + line->u.main.descr[1] = 0xF; + line->u.main.descr[2] = 0xF; + line->u.main.descr[3] = 0xF; + line->u.main.descr[4] = 0xF; + line->u.main.descr[5] = 0xF; + line->u.main.descr[6] = 0xF; + line->u.main.descr[7] = 0xF; + line->u.main.w32[0] = d32; + line->u.main.w32[1] = d32; + line->u.main.w32[2] = d32; + line->u.main.w32[3] = d32; + line->u.main.w32[4] = d32; + line->u.main.w32[5] = d32; + line->u.main.w32[6] = d32; + line->u.main.w32[7] = d32; + } } @@ -7650,6 +7832,9 @@ void VG_REGPARM(2) MC_(helperc_b_store32)( Addr a, UWord d32 ) { /*--- Origin tracking: sarp handlers ---*/ /*--------------------------------------------*/ +// We may get asked to do very large SARPs (bug 446103), hence it is important +// to process 32-byte chunks at a time when possible. + __attribute__((noinline)) static void ocache_sarp_Set_Origins ( Addr a, UWord len, UInt otag ) { if ((a & 1) && len >= 1) { @@ -7662,9 +7847,40 @@ static void ocache_sarp_Set_Origins ( Addr a, UWord len, UInt otag ) { a += 2; len -= 2; } - if (len >= 4) - tl_assert(0 == (a & 3)); - while (len >= 4) { + if ((a & 4) && len >= 4) { + MC_(helperc_b_store4)( a, otag ); + a += 4; + len -= 4; + } + if ((a & 8) && len >= 8) { + MC_(helperc_b_store8)( a, otag ); + a += 8; + len -= 8; + } + if ((a & 16) && len >= 16) { + MC_(helperc_b_store16)( a, otag ); + a += 16; + len -= 16; + } + if (len >= 32) { + tl_assert(0 == (a & 31)); + while (len >= 32) { + MC_(helperc_b_store32)( a, otag ); + a += 32; + len -= 32; + } + } + if (len >= 16) { + MC_(helperc_b_store16)( a, otag ); + a += 16; + len -= 16; + } + if (len >= 8) { + MC_(helperc_b_store8)( a, otag ); + a += 8; + len -= 8; + } + if (len >= 4) { MC_(helperc_b_store4)( a, otag ); a += 4; len -= 4; @@ -7694,9 +7910,40 @@ static void ocache_sarp_Clear_Origins ( Addr a, UWord len ) { a += 2; len -= 2; } - if (len >= 4) - tl_assert(0 == (a & 3)); - while (len >= 4) { + if ((a & 4) && len >= 4) { + MC_(helperc_b_store4)( a, 0 ); + a += 4; + len -= 4; + } + if ((a & 8) && len >= 8) { + MC_(helperc_b_store8)( a, 0 ); + a += 8; + len -= 8; + } + if ((a & 16) && len >= 16) { + MC_(helperc_b_store16)( a, 0 ); + a += 16; + len -= 16; + } + if (len >= 32) { + tl_assert(0 == (a & 31)); + while (len >= 32) { + MC_(helperc_b_store32)( a, 0 ); + a += 32; + len -= 32; + } + } + if (len >= 16) { + MC_(helperc_b_store16)( a, 0 ); + a += 16; + len -= 16; + } + if (len >= 8) { + MC_(helperc_b_store8)( a, 0 ); + a += 8; + len -= 8; + } + if (len >= 4) { MC_(helperc_b_store4)( a, 0 ); a += 4; len -= 4; @@ -7846,10 +8093,14 @@ static void mc_post_clo_init ( void ) if (MC_(clo_mc_level) >= 3) { init_OCache(); tl_assert(ocacheL1 != NULL); - tl_assert(ocacheL2 != NULL); + for (UInt i = 0; i < 4096; i++ ) { + tl_assert(ocachesL2[i] != NULL); + } } else { tl_assert(ocacheL1 == NULL); - tl_assert(ocacheL2 == NULL); + for (UInt i = 0; i < 4096; i++ ) { + tl_assert(ocachesL2[i] == NULL); + } } MC_(chunk_poolalloc) = VG_(newPA) @@ -7943,28 +8194,32 @@ static void mc_print_stats (void) if (MC_(clo_mc_level) >= 3) { VG_(message)(Vg_DebugMsg, - " ocacheL1: %'12lu refs %'12lu misses (%'lu lossage)\n", + " ocacheL1: %'14lu refs %'14lu misses (%'lu lossage)\n", stats_ocacheL1_find, stats_ocacheL1_misses, stats_ocacheL1_lossage ); VG_(message)(Vg_DebugMsg, - " ocacheL1: %'12lu at 0 %'12lu at 1\n", + " ocacheL1: %'14lu at 0 %'14lu at 1\n", stats_ocacheL1_find - stats_ocacheL1_misses - stats_ocacheL1_found_at_1 - stats_ocacheL1_found_at_N, stats_ocacheL1_found_at_1 ); VG_(message)(Vg_DebugMsg, - " ocacheL1: %'12lu at 2+ %'12lu move-fwds\n", + " ocacheL1: %'14lu at 2+ %'14lu move-fwds\n", stats_ocacheL1_found_at_N, stats_ocacheL1_movefwds ); VG_(message)(Vg_DebugMsg, - " ocacheL1: %'12lu sizeB %'12d useful\n", + " ocacheL1: %'14lu sizeB %'14d useful\n", (SizeT)sizeof(OCache), 4 * OC_W32S_PER_LINE * OC_LINES_PER_SET * OC_N_SETS ); VG_(message)(Vg_DebugMsg, - " ocacheL2: %'12lu refs %'12lu misses\n", - stats__ocacheL2_refs, + " ocacheL2: %'14lu finds %'14lu misses\n", + stats__ocacheL2_finds, stats__ocacheL2_misses ); + VG_(message)(Vg_DebugMsg, + " ocacheL2: %'14lu adds %'14lu dels\n", + stats__ocacheL2_adds, + stats__ocacheL2_dels ); VG_(message)(Vg_DebugMsg, " ocacheL2: %'9lu max nodes %'9lu curr nodes\n", stats__ocacheL2_n_nodes_max, @@ -7974,7 +8229,9 @@ static void mc_print_stats (void) stats__nia_cache_queries, stats__nia_cache_misses); } else { tl_assert(ocacheL1 == NULL); - tl_assert(ocacheL2 == NULL); + for (UInt i = 0; i < 4096; i++ ) { + tl_assert(ocachesL2[1] == NULL); + } } } @@ -8216,7 +8473,9 @@ static void mc_pre_clo_init(void) if we need to, since the command line args haven't been processed yet. Hence defer it to mc_post_clo_init. */ tl_assert(ocacheL1 == NULL); - tl_assert(ocacheL2 == NULL); + for (UInt i = 0; i < 4096; i++ ) { + tl_assert(ocachesL2[i] == NULL); + } /* Check some important stuff. See extensive comments above re UNALIGNED_OR_HIGH for background. */