/* AT ENTRY: %rax is next guest addr, %rbp is possibly
modified guest state ptr */
- /* Has the guest state pointer been messed with? If yes, exit. */
+ /* Has the guest state pointer been messed with? If yes, exit.
+ Also, set %rcx to be &VG_(tt_fast), some insns before it is
+ used, in the hope of getting it off the critical path. This
+ location seems to be optimal on 2.2GHz Athlon64. */
cmpq 8(%rsp), %rbp
+ movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
jnz gsp_changed
/* save the jump address in the guest state */
jz counter_is_zero
/* try a fast lookup in the translation cache */
- movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
- movq %rax, %rbx
- andq $VG_TT_FAST_MASK, %rbx
- movq (%rcx,%rbx,8), %rcx
- cmpq %rax, (%rcx)
+ movq %rax, %rbx /* next guest addr */
+ andq $VG_TT_FAST_MASK, %rbx /* entry# */
+ shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
+ movq 0(%rcx,%rbx,1), %r10 /* .guest */
+ movq 8(%rcx,%rbx,1), %r11 /* .host */
+ cmpq %rax, %r10
jnz fast_lookup_failed
- /* Found a match. Call tce[1], which is 8 bytes along, since
- each tce element is a 64-bit int. */
- addq $8, %rcx
- jmp *%rcx
+ /* Found a match. Jump to .host. */
+ jmp *%r11
ud2 /* persuade insn decoders not to speculate past here */
/* generated code should run, then jump back to
VG_(run_innerloop__dispatch_unprofiled). */
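For reference, the fast lookup the unprofiled dispatcher performs above is, in rough C terms, the following (a sketch only; the dispatcher does all of this in assembly and never calls C here. VG_(tt_fast), VG_TT_FAST_MASK and the .guest/.host fields are the ones introduced by this patch, while next_guest_addr and goto_host are purely illustrative names):

   Addr  ga    = next_guest_addr;                /* what %rax holds */
   UWord entry = (UWord)ga & VG_TT_FAST_MASK;    /* entry#, as in %rbx */
   if (VG_(tt_fast)[entry].guest == ga) {
      goto_host( VG_(tt_fast)[entry].host );     /* hit: run host code */
   } else {
      /* miss: fall through to fast_lookup_failed and do the slow,
         C-level search of the translation table instead */
   }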
/* AT ENTRY: %rax is next guest addr, %rbp is possibly
modified guest state ptr */
- /* Has the guest state pointer been messed with? If yes, exit. */
+ /* Has the guest state pointer been messed with? If yes, exit.
+ Also, set %rcx to be &VG_(tt_fast), some insns before it is
+ used, in the hope of getting it off the critical path. This
+ location seems to be optimal on 2.2GHz Athlon64. */
cmpq 8(%rsp), %rbp
+ movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
jnz gsp_changed
/* save the jump address in the guest state */
jz counter_is_zero
/* try a fast lookup in the translation cache */
- movq VG_(tt_fast)@GOTPCREL(%rip), %rcx
movq %rax, %rbx
- andq $VG_TT_FAST_MASK, %rbx
- movq (%rcx,%rbx,8), %rcx
- cmpq %rax, (%rcx)
+ andq $VG_TT_FAST_MASK, %rbx /* entry# */
+ shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
+ movq 0(%rcx,%rbx,1), %r10 /* .guest */
+ movq 8(%rcx,%rbx,1), %r11 /* .host */
+ cmpq %rax, %r10
jnz fast_lookup_failed
/* increment bb profile counter */
movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx
- movq (%rdx,%rbx,8), %rdx
+ shrq $1, %rbx /* entry# * sizeof(UInt*) */
+ movq (%rdx,%rbx,1), %rdx
addl $1, (%rdx)
- /* Found a match. Call tce[1], which is 8 bytes along, since
- each tce element is a 64-bit int. */
- addq $8, %rcx
- jmp *%rcx
+ /* Found a match. Jump to .host. */
+ jmp *%r11
ud2 /* persuade insn decoders not to speculate past here */
/* generated code should run, then jump back to
VG_(run_innerloop__dispatch_profiled). */
This file is part of Valgrind, a dynamic binary instrumentation
framework.
- Copyright (C) 2005 Cerion Armour-Brown <cerion@open-works.co.uk>
+ Copyright (C) 2005-2007 Cerion Armour-Brown <cerion@open-works.co.uk>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
Stack state:
44(r1) (=orig guest_state)
*/
-
- /* Has the guest state pointer been messed with? If yes, exit. */
- lwz 5,44(1) /* original guest_state ptr */
- cmpw 5,31
+ /* Has the guest state pointer been messed with? If yes, exit.
+ Also set up & VG_(tt_fast) to give the load time to come
+ through. */
+ lwz 9,44(1) /* original guest_state ptr */
+ lis 5,VG_(tt_fast)@ha
+ addi 5,5,VG_(tt_fast)@l /* & VG_(tt_fast) */
+ cmpw 9,31
bne gsp_changed
/* save the jump address in the guest state */
/* try a fast lookup in the translation cache */
- /* r4 = VG_TT_FAST_HASH(addr) * sizeof(ULong*)
- = ((r3 >>u 2) & VG_TT_FAST_MASK) << 2 */
- rlwinm 4,3, 0, 32-2-VG_TT_FAST_BITS, 31-2
- addis 5,4,VG_(tt_fast)@ha
- lwz 5,VG_(tt_fast)@l(5)
- lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
+ /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry)
+ = ((r3 >>u 2) & VG_TT_FAST_MASK) << 3 */
+ rlwinm 4,3,1, 29-VG_TT_FAST_BITS, 28 /* entry# * 8 */
+ add 5,5,4 /* & VG_(tt_fast)[entry#] */
+ lwz 6,0(5) /* .guest */
+ lwz 7,4(5) /* .host */
cmpw 3,6
bne fast_lookup_failed
- /* Found a match. Call tce[1], which is 8 bytes along, since
- each tce element is a 64-bit int. */
- addi 8,5,8
- mtctr 8
-
- /* run the translation */
+ /* Found a match. Call .host. */
+ mtctr 7
bctrl
/* On return from guest code:
r31 may be unchanged (guest_state), or may indicate further
details of the control transfer requested to *r3.
*/
-
/* start over */
b VG_(run_innerloop__dispatch_unprofiled)
/*NOTREACHED*/
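The rlwinm used above to form the table offset is compact but not self-evident. As a plain-C illustration (not part of the patch; VG_TT_FAST_BITS is assumed to be the same constant the assembly uses), rotating the address left by one bit and masking the bit range [29-VG_TT_FAST_BITS, 28] produces exactly ((addr >> 2) & VG_TT_FAST_MASK) << 3, i.e. the entry number times sizeof(FastCacheEntry):

   /* what "rlwinm 4,3,1, 29-VG_TT_FAST_BITS, 28" computes (sketch) */
   unsigned int rlwinm_hash ( unsigned int addr /* r3 */ )
   {
      unsigned int mask = (1u << VG_TT_FAST_BITS) - 1;  /* VG_TT_FAST_MASK */
      unsigned int rot1 = (addr << 1) | (addr >> 31);   /* rotate left 1 */
      return rot1 & (mask << 3);   /* == ((addr >> 2) & mask) << 3 */
   }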
Stack state:
44(r1) (=orig guest_state)
*/
-
- /* Has the guest state pointer been messed with? If yes, exit. */
- lwz 5,44(1) /* original guest_state ptr */
- cmpw 5,31
+ /* Has the guest state pointer been messed with? If yes, exit.
+ Also set up & VG_(tt_fast) to give the load time to come
+ through. */
+ lwz 9,44(1) /* original guest_state ptr */
+ lis 5,VG_(tt_fast)@ha
+ addi 5,5,VG_(tt_fast)@l /* & VG_(tt_fast) */
+ cmpw 9,31
bne gsp_changed
/* save the jump address in the guest state */
/* try a fast lookup in the translation cache */
- /* r4 = VG_TT_FAST_HASH(addr) * sizeof(ULong*)
- = ((r3 >>u 2) & VG_TT_FAST_MASK) << 2 */
- rlwinm 4,3, 0, 32-2-VG_TT_FAST_BITS, 31-2
- addis 5,4,VG_(tt_fast)@ha
- lwz 5,VG_(tt_fast)@l(5)
- lwz 6,4(5) /* big-endian, so comparing 2nd 32bit word */
+ /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry)
+ = ((r3 >>u 2) & VG_TT_FAST_MASK) << 3 */
+ rlwinm 4,3,1, 29-VG_TT_FAST_BITS, 28 /* entry# * 8 */
+ add 5,5,4 /* & VG_(tt_fast)[entry#] */
+ lwz 6,0(5) /* .guest */
+ lwz 7,4(5) /* .host */
cmpw 3,6
bne fast_lookup_failed
/* increment bb profile counter */
+ srwi 4,4,1 /* entry# * 4 */
addis 6,4,VG_(tt_fastN)@ha
- lwz 7,VG_(tt_fastN)@l(6)
- lwz 8,0(7)
+ lwz 9,VG_(tt_fastN)@l(6)
+ lwz 8,0(9)
addi 8,8,1
- stw 8,0(7)
+ stw 8,0(9)
- /* Found a match. Call tce[1], which is 8 bytes along, since
- each tce element is a 64-bit int. */
- addi 8,5,8
- mtctr 8
-
- /* run the translation */
+ /* Found a match. Call .host. */
+ mtctr 7
bctrl
/* On return from guest code:
r31 may be unchanged (guest_state), or may indicate further
details of the control transfer requested to *r3.
*/
-
/* start over */
b VG_(run_innerloop__dispatch_profiled)
/*NOTREACHED*/
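In C terms, the extra work the profiled variant does on a fast hit is roughly this (a sketch, assuming VG_(clo_profile_flags) > 0 so that the tt_fastN slot of any live entry is non-NULL, as the invariant in m_transtab.c requires; guest_addr is an illustrative name):

   UWord entry = VG_TT_FAST_HASH(guest_addr);
   if (VG_(tt_fast)[entry].guest == guest_addr) {
      (*VG_(tt_fastN)[entry])++;   /* bump the owning TT entry's .count */
      /* then branch to VG_(tt_fast)[entry].host, as in the
         unprofiled case */
   }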
jz counter_is_zero
/* try a fast lookup in the translation cache */
- movl %eax, %ebx
- andl $VG_TT_FAST_MASK, %ebx
- movl VG_(tt_fast)(,%ebx,4), %ecx
- cmpl %eax, (%ecx)
+ movl %eax, %ebx /* next guest addr */
+ andl $VG_TT_FAST_MASK, %ebx /* entry# */
+ movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */
+ movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */
+ cmpl %eax, %esi
jnz fast_lookup_failed
- /* Found a match. Jump to tce[1], which is 8 bytes along,
- since each tce element is a 64-bit int. */
- addl $8, %ecx
- jmp *%ecx
+ /* Found a match. Jump to .host. */
+ jmp *%edi
ud2 /* persuade insn decoders not to speculate past here */
/* generated code should run, then jump back to
VG_(run_innerloop__dispatch_unprofiled). */
jz counter_is_zero
/* try a fast lookup in the translation cache */
- movl %eax, %ebx
- andl $VG_TT_FAST_MASK, %ebx
- movl VG_(tt_fast)(,%ebx,4), %ecx
- cmpl %eax, (%ecx)
+ movl %eax, %ebx /* next guest addr */
+ andl $VG_TT_FAST_MASK, %ebx /* entry# */
+ movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */
+ movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */
+ cmpl %eax, %esi
jnz fast_lookup_failed
+
/* increment bb profile counter */
/* note: innocuous as this sounds, it causes a huge amount more
stress on D1 and significantly slows everything down. */
/* Use "addl $1", not "incl", to avoid partial-flags stall on P4 */
addl $1, (%edx)
- /* Found a match. Jump to tce[1], which is 8 bytes along,
- since each tce element is a 64-bit int. */
- addl $8, %ecx
- jmp *%ecx
+ /* Found a match. Jump to .host. */
+ jmp *%edi
ud2 /* persuade insn decoders not to speculate past here */
/* generated code should run, then jump back to
VG_(run_innerloop__dispatch_profiled). */
goto dontchase;
# endif
+ /* overly conservative, but .. don't chase into the distinguished
+ address that m_transtab uses as an empty-slot marker for
+ VG_(tt_fast). */
+ if (addr == TRANSTAB_BOGUS_GUEST_ADDR)
+ goto dontchase;
+
/* well, ok then. go on and chase. */
return True;
{ /* BEGIN new scope specially for 'seg' */
NSegment const* seg = VG_(am_find_nsegment)(addr);
- if (!translations_allowable_from_seg(seg)) {
+ if ( (!translations_allowable_from_seg(seg))
+ || addr == TRANSTAB_BOGUS_GUEST_ADDR ) {
if (VG_(clo_trace_signals))
VG_(message)(Vg_DebugMsg, "translations not allowed here "
"- throwing SEGV");
/*------------------ TYPES ------------------*/
-/* A translation-cache entry is two parts:
- - The guest address of the first (entry) bb in the translation,
- as a 64-bit word.
- - One or more 64-bit words containing the code.
- It is supposed to be 64-bit aligned.
-*/
-/*
-typedef
- struct {
- Addr64 orig_addr;
- ULong code[0];
- }
- TCEntry;
-*/
-
/* A translation-table entry. This indicates precisely which areas of
guest code are included in the translation, and contains all other
auxiliary info too. */
deletion, hence the Deleted state. */
enum { InUse, Deleted, Empty } status;
- /* Pointer to the corresponding TCEntry (must be in the same
- sector!) */
- ULong* tce;
+ /* 64-bit aligned pointer to one or more 64-bit words containing
+ the corresponding host code (must be in the same sector!)
+ This is a pointer into the sector's tc (code) area. */
+ ULong* tcptr;
/* This is the original guest address that purportedly is the
entry point of the translation. You might think that .entry
static Int tc_sector_szQ;
-/* Fast helper for the TC. A direct-mapped cache which holds a
- pointer to a TC entry which may or may not be the correct one, but
- which we hope usually is. This array is referred to directly from
- <arch>/dispatch.S.
+/* Fast helper for the TC. A direct-mapped cache which holds a set of
+ recently used (guest address, host address) pairs. This array is
+ referred to directly from m_dispatch/dispatch-<platform>.S.
- Entries in tt_fast may point to any valid TC entry, regardless of
+ Entries in tt_fast may refer to any valid TC entry, regardless of
which sector it's in. Consequently we must be very careful to
invalidate this cache when TC entries are changed or disappear.
- A special TCEntry -- bogus_tc_entry -- must be pointed at to cause
- that cache entry to miss. This relies on the assumption that no
- guest code actually has an address of 0x1.
+ A special .guest address -- TRANSTAB_BOGUS_GUEST_ADDR -- must be
+ pointed at to cause that cache entry to miss. This relies on the
+ assumption that no guest code actually has that address, hence a
+ value 0x1 seems good. m_translate gives the client a synthetic
+ segfault if it tries to execute at this address.
+*/
+/*
+typedef
+ struct {
+ Addr guest;
+ Addr host;
+ }
+ FastCacheEntry;
+*/
+/*global*/ __attribute__((aligned(16)))
+ FastCacheEntry VG_(tt_fast)[VG_TT_FAST_SIZE];
+/*
+#define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1)
*/
-/*global*/ ULong* VG_(tt_fast)[VG_TT_FAST_SIZE];
-
-static ULong bogus_tc_entry = (Addr64)1;
-
/* For profiling, we have a parallel array of pointers to .count
fields in TT entries. Again, these pointers must be invalidated
when translations disappear. A NULL pointer suffices to indicate
an unused slot.
- tt_fast and tt_fastN change together: if tt_fast[i] points to
- bogus_tc_entry then the corresponding tt_fastN[i] must be null. If
- tt_fast[i] points to some TC entry somewhere, then tt_fastN[i]
- *must* point to the .count field of the corresponding TT entry.
+ When not profiling (the normal case, VG_(clo_profile_flags) == 0),
+ all tt_fastN entries are set to NULL at startup and never read nor
+ written after that.
+
+ When profiling (VG_(clo_profile_flags) > 0), tt_fast and tt_fastN
+ change together: if tt_fast[i].guest is TRANSTAB_BOGUS_GUEST_ADDR
+ then the corresponding tt_fastN[i] must be null. If
+ tt_fast[i].guest is any other value, then tt_fastN[i] *must* point
+ to the .count field of the corresponding TT entry.
tt_fast and tt_fastN are referred to from assembly code
(dispatch.S).
/* Sanity check absolutely everything. True == check passed. */
-/* forward */
+/* forwards */
static Bool sanity_check_redir_tt_tc ( void );
+static Bool sanity_check_fastcache ( void );
static Bool sanity_check_all_sectors ( void )
{
if (!sane)
return False;
}
- if (!sanity_check_redir_tt_tc() )
+ if ( !sanity_check_redir_tt_tc() )
+ return False;
+ if ( !sanity_check_fastcache() )
return False;
return True;
}
return k32 % N_TTES_PER_SECTOR;
}
-static void setFastCacheEntry ( Addr64 key, ULong* tce, UInt* count )
+static void setFastCacheEntry ( Addr64 key, ULong* tcptr, UInt* count )
{
UInt cno = (UInt)VG_TT_FAST_HASH(key);
- VG_(tt_fast)[cno] = tce;
- VG_(tt_fastN)[cno] = VG_(clo_profile_flags) > 0 ? count : NULL;
+ VG_(tt_fast)[cno].guest = (Addr)key;
+ VG_(tt_fast)[cno].host = (Addr)tcptr;
+ if (VG_(clo_profile_flags) > 0)
+ VG_(tt_fastN)[cno] = count;
n_fast_updates++;
+ /* This shouldn't fail. It should be assured by m_translate,
+ which should reject any attempt to translate code starting
+ at TRANSTAB_BOGUS_GUEST_ADDR. */
+ vg_assert(VG_(tt_fast)[cno].guest != TRANSTAB_BOGUS_GUEST_ADDR);
}
-static void invalidateFastCache ( void )
+/* Invalidate the fast cache's counter array, VG_(tt_fastN). */
+static void invalidateFastNCache ( void )
{
UInt j;
- /* This loop is popular enough to make it worth unrolling a
- bit, at least on ppc32. */
vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0);
for (j = 0; j < VG_TT_FAST_SIZE; j += 4) {
- VG_(tt_fast)[j+0] = &bogus_tc_entry;
- VG_(tt_fast)[j+1] = &bogus_tc_entry;
- VG_(tt_fast)[j+2] = &bogus_tc_entry;
- VG_(tt_fast)[j+3] = &bogus_tc_entry;
VG_(tt_fastN)[j+0] = NULL;
VG_(tt_fastN)[j+1] = NULL;
VG_(tt_fastN)[j+2] = NULL;
VG_(tt_fastN)[j+3] = NULL;
}
vg_assert(j == VG_TT_FAST_SIZE);
+}
+
+/* Invalidate the fast cache VG_(tt_fast). If profiling, also
+ invalidate the fast cache's counter array VG_(tt_fastN), otherwise
+ don't touch it. */
+static void invalidateFastCache ( void )
+{
+ UInt j;
+ /* This loop is popular enough to make it worth unrolling a
+ bit, at least on ppc32. */
+ vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0);
+ for (j = 0; j < VG_TT_FAST_SIZE; j += 4) {
+ VG_(tt_fast)[j+0].guest = TRANSTAB_BOGUS_GUEST_ADDR;
+ VG_(tt_fast)[j+1].guest = TRANSTAB_BOGUS_GUEST_ADDR;
+ VG_(tt_fast)[j+2].guest = TRANSTAB_BOGUS_GUEST_ADDR;
+ VG_(tt_fast)[j+3].guest = TRANSTAB_BOGUS_GUEST_ADDR;
+ }
+
+ if (VG_(clo_profile_flags) > 0)
+ invalidateFastNCache();
+
+ vg_assert(j == VG_TT_FAST_SIZE);
n_fast_flushes++;
}
+static Bool sanity_check_fastcache ( void )
+{
+ UInt j;
+ if (0) VG_(printf)("sanity check fastcache\n");
+ if (VG_(clo_profile_flags) > 0) {
+ /* profiling */
+ for (j = 0; j < VG_TT_FAST_SIZE; j++) {
+ if (VG_(tt_fastN)[j] == NULL
+ && VG_(tt_fast)[j].guest != TRANSTAB_BOGUS_GUEST_ADDR)
+ return False;
+ if (VG_(tt_fastN)[j] != NULL
+ && VG_(tt_fast)[j].guest == TRANSTAB_BOGUS_GUEST_ADDR)
+ return False;
+ }
+ } else {
+ /* not profiling */
+ for (j = 0; j < VG_TT_FAST_SIZE; j++) {
+ if (VG_(tt_fastN)[j] != NULL)
+ return False;
+ }
+ }
+ return True;
+}
+
static void initialiseSector ( Int sno )
{
Int i;
Addr addr;
VexArchInfo vai;
+ if (nbytes == 0) return;
+ vg_assert(nbytes > 0);
+
VG_(machine_get_VexArchInfo)( NULL, &vai );
cls = vai.ppc_cache_line_szB;
Bool is_self_checking )
{
Int tcAvailQ, reqdQ, y, i;
- ULong *tce, *tce2;
+ ULong *tcptr, *tcptr2;
UChar* srcP;
UChar* dstP;
initialiseSector(y);
/* Try putting the translation in this sector. */
- reqdQ = 1 + ((code_len + 7) >> 3);
+ reqdQ = (code_len + 7) >> 3;
/* Will it fit in tc? */
 tcAvailQ = ((ULong*)(&sectors[y].tc[tc_sector_szQ]))
vg_assert(sectors[y].tt_n_inuse >= 0);
/* Copy into tc. */
- tce = sectors[y].tc_next;
- vg_assert(tce >= &sectors[y].tc[0]);
- vg_assert(tce <= &sectors[y].tc[tc_sector_szQ]);
+ tcptr = sectors[y].tc_next;
+ vg_assert(tcptr >= &sectors[y].tc[0]);
+ vg_assert(tcptr <= &sectors[y].tc[tc_sector_szQ]);
- tce[0] = entry;
- dstP = (UChar*)(&tce[1]);
+ dstP = (UChar*)tcptr;
srcP = (UChar*)code;
for (i = 0; i < code_len; i++)
dstP[i] = srcP[i];
invalidate_icache( dstP, code_len );
/* more paranoia */
- tce2 = sectors[y].tc_next;
- vg_assert(tce2 >= &sectors[y].tc[0]);
- vg_assert(tce2 <= &sectors[y].tc[tc_sector_szQ]);
+ tcptr2 = sectors[y].tc_next;
+ vg_assert(tcptr2 >= &sectors[y].tc[0]);
+ vg_assert(tcptr2 <= &sectors[y].tc[tc_sector_szQ]);
/* Find an empty tt slot, and use it. There must be such a slot
since tt is never allowed to get completely full. */
}
sectors[y].tt[i].status = InUse;
- sectors[y].tt[i].tce = tce;
+ sectors[y].tt[i].tcptr = tcptr;
sectors[y].tt[i].count = 0;
sectors[y].tt[i].weight = 1;
sectors[y].tt[i].vge = *vge;
sectors[y].tt[i].entry = entry;
/* Update the fast-cache. */
- setFastCacheEntry( entry, tce, &sectors[y].tt[i].count );
+ setFastCacheEntry( entry, tcptr, &sectors[y].tt[i].count );
/* Note the eclass numbers for this translation. */
 upd_eclasses_after_add( &sectors[y], i );
/* found it */
if (upd_cache)
setFastCacheEntry(
- guest_addr, sectors[sno].tt[k].tce,
+ guest_addr, sectors[sno].tt[k].tcptr,
 &sectors[sno].tt[k].count );
if (result)
- *result = sizeof(Addr64) + (AddrH)sectors[sno].tt[k].tce;
+ *result = (AddrH)sectors[sno].tt[k].tcptr;
return True;
}
if (sectors[sno].tt[k].status == Empty)
/* Otherwise lots of things go wrong... */
vg_assert(sizeof(ULong) == 8);
vg_assert(sizeof(Addr64) == 8);
+ /* check fast cache entries really are 2 words long */
+ vg_assert(sizeof(Addr) == sizeof(void*));
+ vg_assert(sizeof(FastCacheEntry) == 2 * sizeof(Addr));
+ /* check fast cache entries are packed back-to-back with no spaces */
+ vg_assert(sizeof( VG_(tt_fast) ) == VG_TT_FAST_SIZE * sizeof(FastCacheEntry));
+ /* check fast cache is aligned as we requested. Not fatal if it
+ isn't, but we might as well make sure. */
+ vg_assert(VG_IS_16_ALIGNED( ((Addr) & VG_(tt_fast)[0]) ));
if (VG_(clo_verbosity) > 2)
VG_(message)(Vg_DebugMsg,
}
}
- /* and the fast caches. */
+ /* Initialise the fast caches. If not profiling (the usual case),
+ we have to explicitly invalidate the fastN cache as
+ invalidateFastCache() won't do that for us. */
invalidateFastCache();
+ if (VG_(clo_profile_flags) == 0)
+ invalidateFastNCache();
/* and the unredir tt/tc */
init_unredir_tt_tc();
#include "pub_core_transtab_asm.h"
-/* The fast-cache for tt-lookup, and for finding counters. */
-extern ULong* VG_(tt_fast) [VG_TT_FAST_SIZE];
-extern UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE];
+/* The fast-cache for tt-lookup, and for finding counters. Unused
+ entries are denoted by .guest == 1, which is assumed to be a bogus
+ address for all guest code. */
+typedef
+ struct {
+ Addr guest;
+ Addr host;
+ }
+ FastCacheEntry;
+
+extern __attribute__((aligned(16)))
+ FastCacheEntry VG_(tt_fast) [VG_TT_FAST_SIZE];
+
+#define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1)
+
+extern UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE];
extern void VG_(init_tt_tc) ( void );