From 23ad3fce2ff83ea2d43895e4023bf5ca64316323 Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Wed, 3 May 2006 22:13:57 +0000
Subject: [PATCH] Vectorise copy_address_range_perms for common cases.

This gives about 40% speedup on artificial programs which just do
realloc() and nothing else, and about a 3-4% speedup on starting
kpresenter-1.5.0 and loading a 16-slide presentation.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5880
---
 docs/internals/performance.txt |  4 ++
 memcheck/mc_main.c             | 99 ++++++++++++++++++++++++++++------
 2 files changed, 86 insertions(+), 17 deletions(-)

diff --git a/docs/internals/performance.txt b/docs/internals/performance.txt
index 8a12bfcd3e..2dbcfc6ae2 100644
--- a/docs/internals/performance.txt
+++ b/docs/internals/performance.txt
@@ -29,6 +29,10 @@ Post 3.1.0:
 - Nick changed ExeContext gathering to not record/save extra zeroes
   at the end.  Saved 7% on perf/heap with --num-callers=50, and about
   1% on perf/tinycc.
+- Julian vectorised copy_address_range_perms for common cases, which
+  gives about 40% speedup on artificial programs which just do
+  realloc() and nothing else, and about a 3-4% speedup on starting
+  kpresenter-1.5.0 and loading a 16-slide presentation.
 
 COMPVBITS branch:
 - Nick converted to compress V bits, initial version saved 0--5% on most
diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c
index 0ee073b25a..b28e80eb27 100644
--- a/memcheck/mc_main.c
+++ b/memcheck/mc_main.c
@@ -589,6 +589,28 @@ UChar get_vabits2 ( Addr a )
    return extract_vabits2_from_vabits8(a, vabits8);
 }
 
+// *** WARNING! ***
+// Any time this function is called, if it is possible that any of the
+// 4 2-bit fields in vabits8 are equal to VA_BITS2_PARTDEFINED, then the
+// corresponding entry(s) in the sec-V-bits table must also be set!
+static INLINE
+UChar get_vabits8_for_aligned_word32 ( Addr a )
+{
+   SecMap* sm      = get_secmap_for_reading(a);
+   UWord   sm_off  = SM_OFF(a);
+   UChar   vabits8 = sm->vabits8[sm_off];
+   return vabits8;
+}
+
+static INLINE
+void set_vabits8_for_aligned_word32 ( Addr a, UChar vabits8 )
+{
+   SecMap* sm     = get_secmap_for_writing(a);
+   UWord   sm_off = SM_OFF(a);
+   sm->vabits8[sm_off] = vabits8;
+}
+
+
 // Forward declarations
 static UWord get_sec_vbits8(Addr a);
 static void  set_sec_vbits8(Addr a, UWord vbits8);
@@ -1227,35 +1249,81 @@ static void make_mem_defined_if_addressable ( Addr a, SizeT len )
 void MC_(copy_address_range_state) ( Addr src, Addr dst, SizeT len )
 {
    SizeT i, j;
-   UChar vabits2;
+   UChar vabits2, vabits8;
+   Bool  aligned, nooverlap;
 
    DEBUG("MC_(copy_address_range_state)\n");
    PROF_EVENT(50, "MC_(copy_address_range_state)");
 
-   if (len == 0)
+   if (len == 0 || src == dst)
       return;
 
-   if (src < dst) {
-      for (i = 0, j = len-1; i < len; i++, j--) {
-         PROF_EVENT(51, "MC_(copy_address_range_state)(loop)");
-         vabits2 = get_vabits2( src+j );
-         set_vabits2( dst+j, vabits2 );
-         if (VA_BITS2_PARTDEFINED == vabits2) {
-            set_sec_vbits8( dst+j, get_sec_vbits8( src+j ) );
+   aligned   = VG_IS_4_ALIGNED(src) && VG_IS_4_ALIGNED(dst);
+   nooverlap = src+len <= dst || dst+len <= src;
+
+   if (nooverlap && aligned) {
+
+      /* Vectorised fast case, when no overlap and suitably aligned */
+      /* vector loop */
+      i = 0;
+      while (len >= 4) {
+         vabits8 = get_vabits8_for_aligned_word32( src+i );
+         set_vabits8_for_aligned_word32( dst+i, vabits8 );
+         if (EXPECTED_TAKEN(VA_BITS8_DEFINED == vabits8
+                            || VA_BITS8_UNDEFINED == vabits8
+                            || VA_BITS8_NOACCESS == vabits8)) {
+            /* do nothing */
+         } else {
+            /* have to copy secondary map info */
+            if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+0 ))
+               set_sec_vbits8( dst+i+0, get_sec_vbits8( src+i+0 ) );
+            if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+1 ))
+               set_sec_vbits8( dst+i+1, get_sec_vbits8( src+i+1 ) );
+            if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+2 ))
+               set_sec_vbits8( dst+i+2, get_sec_vbits8( src+i+2 ) );
+            if (VA_BITS2_PARTDEFINED == get_vabits2( src+i+3 ))
+               set_sec_vbits8( dst+i+3, get_sec_vbits8( src+i+3 ) );
          }
+         i += 4;
+         len -= 4;
       }
-   }
-
-   if (src > dst) {
-      for (i = 0; i < len; i++) {
-         PROF_EVENT(52, "MC_(copy_address_range_state)(loop)");
+      /* fixup loop */
+      while (len >= 1) {
          vabits2 = get_vabits2( src+i );
          set_vabits2( dst+i, vabits2 );
          if (VA_BITS2_PARTDEFINED == vabits2) {
            set_sec_vbits8( dst+i, get_sec_vbits8( src+i ) );
         }
+         i++;
+         len--;
+      }
+
+   } else {
+
+      /* We have to do things the slow way */
+      if (src < dst) {
+         for (i = 0, j = len-1; i < len; i++, j--) {
+            PROF_EVENT(51, "MC_(copy_address_range_state)(loop)");
+            vabits2 = get_vabits2( src+j );
+            set_vabits2( dst+j, vabits2 );
+            if (VA_BITS2_PARTDEFINED == vabits2) {
+               set_sec_vbits8( dst+j, get_sec_vbits8( src+j ) );
+            }
+         }
+      }
+
+      if (src > dst) {
+         for (i = 0; i < len; i++) {
+            PROF_EVENT(52, "MC_(copy_address_range_state)(loop)");
+            vabits2 = get_vabits2( src+i );
+            set_vabits2( dst+i, vabits2 );
+            if (VA_BITS2_PARTDEFINED == vabits2) {
+               set_sec_vbits8( dst+i, get_sec_vbits8( src+i ) );
+            }
+         }
       }
    }
+
 }
 
 
@@ -4422,6 +4490,3 @@ VG_DETERMINE_INTERFACE_VERSION(mc_pre_clo_init)
 /*--------------------------------------------------------------------*/
 /*--- end                                                           ---*/
 /*--------------------------------------------------------------------*/
-
-
-
-- 
2.47.2
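
Note on the technique (an illustrative sketch, not part of the patch):
memcheck packs the 2-bit V/A state of four consecutive addresses into one
"vabits8" byte, so when source and destination are both 4-aligned and the
ranges do not overlap, four addresses' worth of state moves as a single
byte load/store instead of four extract/insert operations.  The standalone
sketch below mimics that shape under invented names (the flat shadow[]
array, copy_shadow_range, VABITS2_*); the real code walks SecMap
structures, and its fast path must additionally copy sec-V-bits table
entries for VA_BITS2_PARTDEFINED fields, as the WARNING comment in the
patch notes.  The sketch omits that case.

   #include <stdio.h>

   /* 2-bit V/A states, four per shadow byte (names invented here). */
   #define VABITS2_NOACCESS   0x0u
   #define VABITS2_UNDEFINED  0x1u
   #define VABITS2_DEFINED    0x2u

   static unsigned char shadow[256];   /* covers addresses 0..1023 */

   static unsigned get_vabits2 ( size_t a )
   {
      return (shadow[a >> 2] >> ((a & 3) * 2)) & 3u;
   }

   static void set_vabits2 ( size_t a, unsigned v )
   {
      unsigned shift = (unsigned)(a & 3) * 2;
      shadow[a >> 2] = (unsigned char)
         ((shadow[a >> 2] & ~(3u << shift)) | ((v & 3u) << shift));
   }

   /* Copy shadow state for [src,src+len) to [dst,dst+len), assuming
      the ranges do not overlap.  The vector loop moves the state of
      four addresses at once as one whole shadow byte; the fixup loop
      handles unaligned ranges and any trailing addresses. */
   static void copy_shadow_range ( size_t src, size_t dst, size_t len )
   {
      size_t i = 0;
      if ((src & 3) == 0 && (dst & 3) == 0) {
         while (len >= 4) {                       /* vector loop */
            shadow[(dst+i) >> 2] = shadow[(src+i) >> 2];
            i += 4;
            len -= 4;
         }
      }
      while (len >= 1) {                          /* fixup loop */
         set_vabits2( dst+i, get_vabits2( src+i ) );
         i++;
         len--;
      }
   }

   int main ( void )
   {
      set_vabits2( 5, VABITS2_DEFINED );
      set_vabits2( 6, VABITS2_UNDEFINED );
      copy_shadow_range( 4, 100, 8 );   /* both 4-aligned: fast path */
      printf("vabits2(101) = %u, expected %u\n",
             get_vabits2(101), VABITS2_DEFINED);
      return 0;
   }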