(Valgrind side). See #296422.
git-svn-id: svn://svn.valgrind.org/valgrind/branches/TCHAIN@12484
-Wmissing-declarations \
@FLAG_W_NO_FORMAT_ZERO_LENGTH@ \
-fno-strict-aliasing \
- -fno-builtin
+ -fno-builtin \
+ \
+ -O
# These flags are used for building the preload shared objects.
# The aim is to give reasonable performance but also to have good
/*------------------------------------------------------------*/
/*--- ---*/
-/*--- The dispatch loop. VG_(run_innerloop) is used to ---*/
-/*--- run all translations except no-redir ones. ---*/
+/*--- The dispatch loop. VG_(disp_run_translations) is ---*/
+/*--- used to run all translations, ---*/
+/*--- including no-redir ones. ---*/
/*--- ---*/
/*------------------------------------------------------------*/
/*----------------------------------------------------*/
-/*--- Preamble (set everything up) ---*/
+/*--- Entry and preamble (set everything up) ---*/
/*----------------------------------------------------*/
/* signature:
-UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+UWord VG_(disp_run_translations)( UWord* two_words,
+ void* guest_state,
+ Addr host_addr );
*/
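+
+/* A rough caller-side sketch of this contract (C-like pseudocode; in
+   this patch the real caller is run_thread_for_a_while() in the
+   scheduler).  Nothing here is new API -- it only restates the
+   two_words convention used throughout this file:
+
+      HWord two_words[2] = {0, 0};
+      VG_(disp_run_translations)( two_words,
+                                  &tst->arch.vex,    // guest state
+                                  host_code_addr );  // host code to enter
+      // on return:
+      //   two_words[0] = a VG_TRC_* / VEX_TRC_JMP_* value
+      //   two_words[1] = for VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP only,
+      //                  the address of the call site to patch; else 0
+*/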
-
.text
-.globl VG_(run_innerloop)
-.type VG_(run_innerloop), @function
-VG_(run_innerloop):
- /* %rdi holds guest_state */
- /* %rsi holds do_profiling */
-
- /* ----- entry point to VG_(run_innerloop) ----- */
+.globl VG_(disp_run_translations)
+.type VG_(disp_run_translations), @function
+VG_(disp_run_translations):
+ /* %rdi holds two_words */
+ /* %rsi holds guest_state */
+ /* %rdx holds host_addr */
+
+ /* The preamble */
+
+ /* Save integer registers, since this is a pseudo-function. */
+ pushq %rax
pushq %rbx
pushq %rcx
- pushq %rdx
+ pushq %rdx
pushq %rsi
pushq %rbp
pushq %r8
pushq %r13
pushq %r14
pushq %r15
- pushq %rdi /* guest_state */
-
- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
- movl (%r15), %r15d
- pushq %r15
+ /* %rdi must be saved last */
+ pushq %rdi
- /* 8(%rsp) holds cached copy of guest_state ptr */
- /* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */
-
- /* Set up the guest state pointer */
- movq %rdi, %rbp
-
- /* fetch %RIP into %rax */
- movq OFFSET_amd64_RIP(%rbp), %rax
+ /* Get the host CPU in the state expected by generated code. */
/* set host FPU control word to the default mode expected
by VEX-generated code. See comments in libvex.h for
/* set dir flag to known value */
cld
- /* fall into main loop (the right one) */
- cmpq $0, %rsi
- je VG_(run_innerloop__dispatch_unassisted_unprofiled)
- jmp VG_(run_innerloop__dispatch_unassisted_profiled)
- /*NOTREACHED*/
-
-/*----------------------------------------------------*/
-/*--- NO-PROFILING (standard) dispatcher ---*/
-/*----------------------------------------------------*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_unassisted_unprofiled)
-VG_(run_innerloop__dispatch_unassisted_unprofiled):
- /* AT ENTRY: %rax is next guest addr, %rbp is the
- unmodified guest state ptr */
-
- /* save the jump address in the guest state */
- movq %rax, OFFSET_amd64_RIP(%rbp)
-
- /* Are we out of timeslice? If yes, defer to scheduler. */
- subl $1, 0(%rsp)
- jz counter_is_zero
-
- /* try a fast lookup in the translation cache */
- movabsq $VG_(tt_fast), %rcx
- movq %rax, %rbx /* next guest addr */
- andq $VG_TT_FAST_MASK, %rbx /* entry# */
- shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
- movq 0(%rcx,%rbx,1), %r10 /* .guest */
- movq 8(%rcx,%rbx,1), %r11 /* .host */
- cmpq %rax, %r10
- jnz fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- jmp *%r11
- ud2 /* persuade insn decoders not to speculate past here */
- /* generated code should run, then jump back to either
- VG_(run_innerloop__dispatch_unassisted_unprofiled)
- VG_(run_innerloop__dispatch_assisted_unprofiled). */
- /*NOTREACHED*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_assisted_unprofiled)
-VG_(run_innerloop__dispatch_assisted_unprofiled):
- /* AT ENTRY: %rax is next guest addr, %rbp is the
- modified guest state ptr */
- /* We know the guest state pointer has been modified.
- So jump directly to gsp_changed. */
- jmp gsp_changed
- ud2
- /*NOTREACHED*/
-
-/*----------------------------------------------------*/
-/*--- PROFILING dispatcher (can be much slower) ---*/
-/*----------------------------------------------------*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_unassisted_profiled)
-VG_(run_innerloop__dispatch_unassisted_profiled):
- /* AT ENTRY: %rax is next guest addr, %rbp is the
- unmodified guest state ptr */
-
- /* save the jump address in the guest state */
- movq %rax, OFFSET_amd64_RIP(%rbp)
-
- /* Are we out of timeslice? If yes, defer to scheduler. */
- subl $1, 0(%rsp)
- jz counter_is_zero
-
- /* try a fast lookup in the translation cache */
- movabsq $VG_(tt_fast), %rcx
- movq %rax, %rbx
- andq $VG_TT_FAST_MASK, %rbx /* entry# */
- shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
- movq 0(%rcx,%rbx,1), %r10 /* .guest */
- movq 8(%rcx,%rbx,1), %r11 /* .host */
- cmpq %rax, %r10
- jnz fast_lookup_failed
-
- /* increment bb profile counter */
- movabsq $VG_(tt_fastN), %rdx
- shrq $1, %rbx /* entry# * sizeof(UInt*) */
- movq (%rdx,%rbx,1), %rdx
- addl $1, (%rdx)
+ /* Set up the guest state pointer */
+ movq %rsi, %rbp
- /* Found a match. Jump to .host. */
- jmp *%r11
- ud2 /* persuade insn decoders not to speculate past here */
- /* generated code should run, then jump back to either
- VG_(run_innerloop__dispatch_unassisted_profiled)
- VG_(run_innerloop__dispatch_assisted_profiled). */
- /*NOTREACHED*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_assisted_profiled)
-VG_(run_innerloop__dispatch_assisted_profiled):
- /* AT ENTRY: %rax is next guest addr, %rbp is the
- modified guest state ptr */
-
- /* Well, we know the guest state pointer has been modified.
- So jump directly to gsp_changed. */
- jmp gsp_changed
- ud2
- /*NOTREACHED*/
+	/* and jump into the code cache. Chained translations in
+	   the code cache run until, for whatever reason, they can't
+	   continue. When that happens, the translation in question
+	   will jump (or call) to one of the continuation points
+	   VG_(disp_cp_...) below. */
+ jmpq *%rdx
+ /*NOTREACHED*/
/*----------------------------------------------------*/
-/*--- exit points ---*/
+/*--- Postamble and exit. ---*/
/*----------------------------------------------------*/
-gsp_changed:
- /* Someone messed with the gsp. Have to
- defer to scheduler to resolve this. dispatch ctr
- is not yet decremented, so no need to increment. */
- /* %RIP is NOT up to date here. First, need to write
- %rax back to %RIP, but without trashing %rbp since
- that holds the value we want to return to the scheduler.
- Hence use %r15 transiently for the guest state pointer. */
- movq 8(%rsp), %r15
- movq %rax, OFFSET_amd64_RIP(%r15)
- movq %rbp, %rax
- jmp run_innerloop_exit
- /*NOTREACHED*/
-
-counter_is_zero:
- /* %RIP is up to date here */
- /* back out decrement of the dispatch counter */
- addl $1, 0(%rsp)
- movq $VG_TRC_INNER_COUNTERZERO, %rax
- jmp run_innerloop_exit
-
-fast_lookup_failed:
- /* %RIP is up to date here */
- /* back out decrement of the dispatch counter */
- addl $1, 0(%rsp)
- movq $VG_TRC_INNER_FASTMISS, %rax
- jmp run_innerloop_exit
-
-
-
-/* All exits from the dispatcher go through here. %rax holds
- the return value.
-*/
-run_innerloop_exit:
- /* We're leaving. Check that nobody messed with
- %mxcsr or %fpucw. We can't mess with %rax here as it
- holds the tentative return value, but any other is OK. */
+postamble:
+ /* At this point, %rax and %rdx contain two
+ words to be returned to the caller. %rax
+ holds a TRC value, and %rdx optionally may
+ hold another word (for CHAIN_ME exits, the
+ address of the place to patch.) */
+
+ /* We're leaving. Check that nobody messed with %mxcsr
+ or %fpucw. We can't mess with %rax or %rdx here as they
+ hold the tentative return values, but any others are OK. */
#if !defined(ENABLE_INNER)
/* This check fails for self-hosting, so skip in that case */
pushq $0
fstcw (%rsp)
cmpl $0x027F, (%rsp)
- popq %r15 /* get rid of the word without trashing %eflags */
+ popq %r15 /* get rid of the word without trashing %rflags */
jnz invariant_violation
#endif
pushq $0
popq %r15
jnz invariant_violation
/* otherwise we're OK */
- jmp run_innerloop_exit_REALLY
-
+ jmp remove_frame
invariant_violation:
movq $VG_TRC_INVARIANT_FAILED, %rax
- jmp run_innerloop_exit_REALLY
-
-run_innerloop_exit_REALLY:
-
- /* restore VG_(dispatch_ctr) */
- popq %r14
- movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15
- movl %r14d, (%r15)
+ movq $0, %rdx
+remove_frame:
+ /* Pop %rdi, stash return values */
popq %rdi
+ movq %rax, 0(%rdi)
+ movq %rdx, 8(%rdi)
+ /* Now pop everything else */
popq %r15
popq %r14
popq %r13
popq %rdx
popq %rcx
popq %rbx
+ popq %rax
ret
-.size VG_(run_innerloop), .-VG_(run_innerloop)
+
+/*----------------------------------------------------*/
+/*--- Continuation points ---*/
+/*----------------------------------------------------*/
-
-/*------------------------------------------------------------*/
-/*--- ---*/
-/*--- A special dispatcher, for running no-redir ---*/
-/*--- translations. Just runs the given translation once. ---*/
-/*--- ---*/
-/*------------------------------------------------------------*/
+/* ------ Chain me to slow entry point ------ */
+.global VG_(disp_cp_chain_me_to_slowEP)
+VG_(disp_cp_chain_me_to_slowEP):
+ /* We got called. The return address indicates
+ where the patching needs to happen. Collect
+	   the return address and exit back to C land,
+ handing the caller the pair (Chain_me_S, RA) */
+ movq $VG_TRC_CHAIN_ME_TO_SLOW_EP, %rax
+ popq %rdx
+ /* 10 = movabsq $VG_(disp_chain_me_to_slowEP), %r11;
+ 3 = call *%r11 */
+ subq $10+3, %rdx
+ jmp postamble
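+
+/* Sketch of the call-site layout this arithmetic assumes (it simply
+   restates the byte counts above).  The emitted chain-me sequence is
+
+      patch_point:  movabsq $<chain-me continuation>, %r11   // 10 bytes
+                    call    *%r11                            //  3 bytes
+      return_addr:  <rest of the translation>
+
+   so the popped return address minus 13 is the start of the sequence,
+   i.e. the place_to_chain that VG_(tt_tc_do_chaining) later patches
+   (presumably with a direct jump to the target translation; that is
+   an assumption here -- the details live in m_transtab). */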
+
+/* ------ Chain me to fast entry point ------ */
+.global VG_(disp_cp_chain_me_to_fastEP)
+VG_(disp_cp_chain_me_to_fastEP):
+ /* We got called. The return address indicates
+ where the patching needs to happen. Collect
+	   the return address and exit back to C land,
+ handing the caller the pair (Chain_me_F, RA) */
+ movq $VG_TRC_CHAIN_ME_TO_FAST_EP, %rax
+ popq %rdx
+ /* 10 = movabsq $VG_(disp_chain_me_to_fastEP), %r11;
+ 3 = call *%r11 */
+ subq $10+3, %rdx
+ jmp postamble
+
+/* ------ Indirect but boring jump ------ */
+.global VG_(disp_cp_xindir)
+VG_(disp_cp_xindir):
+ /* Where are we going? */
+ movq OFFSET_amd64_RIP(%rbp), %rax
-/* signature:
-void VG_(run_a_noredir_translation) ( UWord* argblock );
-*/
+ /* RM ME -- stats only */
+ addq $1, vgPlain_stats__n_xindirs
+
+ /* try a fast lookup in the translation cache */
+ movabsq $VG_(tt_fast), %rcx
+ movq %rax, %rbx /* next guest addr */
+ andq $VG_TT_FAST_MASK, %rbx /* entry# */
+ shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
+ movq 0(%rcx,%rbx,1), %r10 /* .guest */
+ movq 8(%rcx,%rbx,1), %r11 /* .host */
+ cmpq %rax, %r10
+ jnz fast_lookup_failed
-/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args
- and 2 to carry results:
- 0: input: ptr to translation
- 1: input: ptr to guest state
- 2: output: next guest PC
- 3: output: guest state pointer afterwards (== thread return code)
-*/
-.align 16
-.global VG_(run_a_noredir_translation)
-.type VG_(run_a_noredir_translation), @function
-VG_(run_a_noredir_translation):
- /* Save callee-saves regs */
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- pushq %rdi /* we will need it after running the translation */
- movq 8(%rdi), %rbp
- jmp *0(%rdi)
- /*NOTREACHED*/
- ud2
- /* If the translation has been correctly constructed, we
- should resume at the the following label. */
-.global VG_(run_a_noredir_translation__return_point)
-VG_(run_a_noredir_translation__return_point):
- popq %rdi
- movq %rax, 16(%rdi)
- movq %rbp, 24(%rdi)
-
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
- ret
-.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation)
+ /* Found a match. Jump to .host. */
+ jmp *%r11
+ ud2 /* persuade insn decoders not to speculate past here */
+
+fast_lookup_failed:
+ /* RM ME -- stats only */
+ addq $1, vgPlain_stats__n_xindir_misses
+
+ movq $VG_TRC_INNER_FASTMISS, %rax
+ movq $0, %rdx
+ jmp postamble
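+
+/* In C terms, the fast-lookup sequence above is roughly the following
+   (a sketch only; the field names follow the ".guest"/".host"
+   annotations and the 16-bytes-per-entry layout implied by the
+   shlq $4):
+
+      UWord entry = guest_RIP & VG_TT_FAST_MASK;
+      if (VG_(tt_fast)[entry].guest == guest_RIP)
+         goto *VG_(tt_fast)[entry].host;     // run the host translation
+      else
+         return (VG_TRC_INNER_FASTMISS, 0);  // via postamble
+*/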
+
+/* ------ Assisted jump ------ */
+.global VG_(disp_cp_xassisted)
+VG_(disp_cp_xassisted):
+ /* %rbp contains the TRC */
+ movq %rbp, %rax
+ movq $0, %rdx
+ jmp postamble
+
+/* ------ Event check failed ------ */
+.global VG_(disp_cp_evcheck_fail)
+VG_(disp_cp_evcheck_fail):
+ movq $VG_TRC_INNER_COUNTERZERO, %rax
+ movq $0, %rdx
+ jmp postamble
+
+
+.size VG_(disp_run_translations), .-VG_(disp_run_translations)
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits
+
/*--------------------------------------------------------------------*/
/*--- The core dispatch loop, for jumping to a code address. ---*/
/*--- dispatch-arm-linux.S ---*/
/*------------------------------------------------------------*/
/*----------------------------------------------------*/
-/*--- Preamble (set everything up) ---*/
+/*--- Entry and preamble (set everything up) ---*/
/*----------------------------------------------------*/
/* signature:
-UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+UWord VG_(disp_run_translations)( UWord* two_words,
+ void* guest_state,
+ Addr host_addr );
*/
.text
-.globl VG_(run_innerloop)
-VG_(run_innerloop):
- push {r0, r1, r4, r5, r6, r7, r8, r9, fp, lr}
+.global VG_(disp_run_translations)
+VG_(disp_run_translations):
+ /* r0 holds two_words
+ r1 holds guest_state
+ r2 holds host_addr
+ */
+ /* The number of regs in this list needs to be even, in
+ order to keep the stack 8-aligned. */
+ push {r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
/* set FPSCR to vex-required default value */
mov r4, #0
fmxr fpscr, r4
- /* r0 (hence also [sp,#0]) holds guest_state */
- /* r1 holds do_profiling */
- mov r8, r0
- ldr r0, [r8, #OFFSET_arm_R15T]
-
- /* fall into main loop (the right one) */
- cmp r1, #0 /* do_profiling */
- beq VG_(run_innerloop__dispatch_unprofiled)
- b VG_(run_innerloop__dispatch_profiled)
-
+ /* Set up the guest state pointer */
+ mov r8, r1
+        /* and jump into the code cache. Chained translations in
+           the code cache run until, for whatever reason, they can't
+           continue. When that happens, the translation in question
+           will jump (or call) to one of the continuation points
+           VG_(disp_cp_...) below. */
+ bx r2
+ /* NOTREACHED */
+
/*----------------------------------------------------*/
-/*--- NO-PROFILING (standard) dispatcher ---*/
+/*--- Postamble and exit. ---*/
/*----------------------------------------------------*/
-/* Pairing of insns below is my guesstimate of how dual dispatch would
- work on an A8. JRS, 2011-May-28 */
-
-.global VG_(run_innerloop__dispatch_unprofiled)
-VG_(run_innerloop__dispatch_unprofiled):
-
- /* AT ENTRY: r0 is next guest addr, r8 is possibly
- modified guest state ptr */
-
- /* Has the guest state pointer been messed with? If yes, exit. */
- movw r3, #:lower16:VG_(dispatch_ctr)
- tst r8, #1
-
- movt r3, #:upper16:VG_(dispatch_ctr)
-
- bne gsp_changed
-
- /* save the jump address in the guest state */
- str r0, [r8, #OFFSET_arm_R15T]
-
- /* Are we out of timeslice? If yes, defer to scheduler. */
- ldr r2, [r3]
-
- subs r2, r2, #1
-
- str r2, [r3]
-
- beq counter_is_zero
-
- /* try a fast lookup in the translation cache */
- // r0 = next guest, r1,r2,r3,r4 scratch
- movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK
- movw r4, #:lower16:VG_(tt_fast)
-
- and r2, r1, r0, LSR #1 // r2 = entry #
- movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast)
-
- add r1, r4, r2, LSL #3 // r1 = &tt_fast[entry#]
-
- ldrd r4, r5, [r1, #0] // r4 = .guest, r5 = .host
-
- cmp r4, r0
+postamble:
+ /* At this point, r1 and r2 contain two
+ words to be returned to the caller. r1
+ holds a TRC value, and r2 optionally may
+ hold another word (for CHAIN_ME exits, the
+ address of the place to patch.) */
- bne fast_lookup_failed
- // r5: next-host r8: live, gsp
- // r4: next-guest
- // r2: entry #
- // LIVE: r5, r8; all others dead
-
- /* Found a match. Jump to .host. */
- blx r5
- b VG_(run_innerloop__dispatch_unprofiled)
-.ltorg
- /*NOTREACHED*/
+ /* We're leaving. Check that nobody messed with
+ FPSCR in ways we don't expect. */
+ fmrx r4, fpscr
+ bic r4, #0xF8000000 /* mask out NZCV and QC */
+ bic r4, #0x0000009F /* mask out IDC,IXC,UFC,OFC,DZC,IOC */
+ cmp r4, #0
+ beq remove_frame /* we're OK */
+ /* otherwise we have an invariant violation */
+ movw r1, #VG_TRC_INVARIANT_FAILED
+ movw r2, #0
+ /* fall through */
+
+remove_frame:
+ /* Restore int regs, including importantly r0 (two_words) */
+ pop {r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+ /* Stash return values */
+ str r1, [r0, #0]
+ str r2, [r0, #4]
+ bx lr
/*----------------------------------------------------*/
-/*--- PROFILING dispatcher (can be much slower) ---*/
+/*--- Continuation points ---*/
/*----------------------------------------------------*/
-.global VG_(run_innerloop__dispatch_profiled)
-VG_(run_innerloop__dispatch_profiled):
-
- /* AT ENTRY: r0 is next guest addr, r8 is possibly
- modified guest state ptr */
-
- /* Has the guest state pointer been messed with? If yes, exit. */
- movw r3, #:lower16:VG_(dispatch_ctr)
- tst r8, #1
-
- movt r3, #:upper16:VG_(dispatch_ctr)
-
- bne gsp_changed
-
- /* save the jump address in the guest state */
- str r0, [r8, #OFFSET_arm_R15T]
-
- /* Are we out of timeslice? If yes, defer to scheduler. */
- ldr r2, [r3]
-
- subs r2, r2, #1
-
- str r2, [r3]
-
- beq counter_is_zero
-
+/* ------ Chain me to slow entry point ------ */
+.global VG_(disp_cp_chain_me_to_slowEP)
+VG_(disp_cp_chain_me_to_slowEP):
+ /* We got called. The return address indicates
+ where the patching needs to happen. Collect
+        the return address and exit back to C land,
+ handing the caller the pair (Chain_me_S, RA) */
+ mov r1, #VG_TRC_CHAIN_ME_TO_SLOW_EP
+ mov r2, lr
+ /* 4 = movw r12, lo16(disp_cp_chain_me_to_slowEP)
+ 4 = movt r12, hi16(disp_cp_chain_me_to_slowEP)
+ 4 = blx r12 */
+ sub r2, r2, #4+4+4
+ b postamble
+
+/* ------ Chain me to fast entry point ------ */
+.global VG_(disp_cp_chain_me_to_fastEP)
+VG_(disp_cp_chain_me_to_fastEP):
+ /* We got called. The return address indicates
+ where the patching needs to happen. Collect
+        the return address and exit back to C land,
+ handing the caller the pair (Chain_me_F, RA) */
+ mov r1, #VG_TRC_CHAIN_ME_TO_FAST_EP
+ mov r2, lr
+ /* 4 = movw r12, lo16(disp_cp_chain_me_to_fastEP)
+ 4 = movt r12, hi16(disp_cp_chain_me_to_fastEP)
+ 4 = blx r12 */
+ sub r2, r2, #4+4+4
+ b postamble
+
+/* ------ Indirect but boring jump ------ */
+.global VG_(disp_cp_xindir)
+VG_(disp_cp_xindir):
+ /* Where are we going? */
+ ldr r0, [r8, #OFFSET_arm_R15T]
+
+ /* RM ME -- stats only */
+ movw r1, #:lower16:vgPlain_stats__n_xindirs
+ movt r1, #:upper16:vgPlain_stats__n_xindirs
+ ldr r2, [r1, #0]
+ adds r2, r2, #1
+ str r2, [r1, #0]
+ ldr r2, [r1, #4]
+ adc r2, r2, #0
+ str r2, [r1, #4]
+
/* try a fast lookup in the translation cache */
// r0 = next guest, r1,r2,r3,r4 scratch
movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK
cmp r4, r0
- bne fast_lookup_failed
- // r5: next-host r8: live, gsp
- // r4: next-guest
- // r2: entry #
- // LIVE: r5, r8; all others dead
-
- /* increment bb profile counter */
- movw r0, #:lower16:VG_(tt_fastN)
- movt r0, #:upper16:VG_(tt_fastN) // r0 = &tt_fastN[0]
- ldr r0, [r0, r2, LSL #2] // r0 = tt_fast[entry #]
- ldr r3, [r0] // *r0 ++
- add r3, r3, #1
- str r3, [r0]
-
- /* Found a match. Jump to .host. */
- blx r5
- b VG_(run_innerloop__dispatch_profiled)
- /*NOTREACHED*/
-
-/*----------------------------------------------------*/
-/*--- exit points ---*/
-/*----------------------------------------------------*/
-
-gsp_changed:
- // r0 = next guest addr (R15T), r8 = modified gsp
- /* Someone messed with the gsp. Have to
- defer to scheduler to resolve this. dispatch ctr
- is not yet decremented, so no need to increment. */
- /* R15T is NOT up to date here. First, need to write
- r0 back to R15T, but without trashing r8 since
- that holds the value we want to return to the scheduler.
- Hence use r1 transiently for the guest state pointer. */
- ldr r1, [sp, #0]
- str r0, [r1, #OFFSET_arm_R15T]
- mov r0, r8 // "return modified gsp"
- b run_innerloop_exit
- /*NOTREACHED*/
-
-counter_is_zero:
- /* R15T is up to date here */
- /* Back out increment of the dispatch ctr */
- ldr r1, =VG_(dispatch_ctr)
- ldr r2, [r1]
- add r2, r2, #1
- str r2, [r1]
- mov r0, #VG_TRC_INNER_COUNTERZERO
- b run_innerloop_exit
- /*NOTREACHED*/
-
-fast_lookup_failed:
- /* R15T is up to date here */
- /* Back out increment of the dispatch ctr */
- ldr r1, =VG_(dispatch_ctr)
- ldr r2, [r1]
- add r2, r2, #1
- str r2, [r1]
- mov r0, #VG_TRC_INNER_FASTMISS
- b run_innerloop_exit
- /*NOTREACHED*/
-
-/* All exits from the dispatcher go through here. %r0 holds
- the return value.
-*/
-run_innerloop_exit:
- /* We're leaving. Check that nobody messed with
- FPSCR in ways we don't expect. */
- fmrx r4, fpscr
- bic r4, #0xF8000000 /* mask out NZCV and QC */
- bic r4, #0x0000009F /* mask out IDC,IXC,UFC,OFC,DZC,IOC */
- cmp r4, #0
- bne invariant_violation
- b run_innerloop_exit_REALLY
-
-invariant_violation:
- mov r0, #VG_TRC_INVARIANT_FAILED
- b run_innerloop_exit_REALLY
-
-run_innerloop_exit_REALLY:
- add sp, sp, #8
- pop {r4, r5, r6, r7, r8, r9, fp, pc}
-
-.size VG_(run_innerloop), .-VG_(run_innerloop)
-
-
-/*------------------------------------------------------------*/
-/*--- ---*/
-/*--- A special dispatcher, for running no-redir ---*/
-/*--- translations. Just runs the given translation once. ---*/
-/*--- ---*/
-/*------------------------------------------------------------*/
-
-/* signature:
-void VG_(run_a_noredir_translation) ( UWord* argblock );
-*/
-
-/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args
- and 2 to carry results:
- 0: input: ptr to translation
- 1: input: ptr to guest state
- 2: output: next guest PC
- 3: output: guest state pointer afterwards (== thread return code)
-*/
-.global VG_(run_a_noredir_translation)
-VG_(run_a_noredir_translation):
- push {r0,r1 /* EABI compliance */, r4-r12, lr}
- ldr r8, [r0, #4]
- mov lr, pc
- ldr pc, [r0, #0]
-
- pop {r1}
- str r0, [r1, #8]
- str r8, [r1, #12]
- pop {r1/*EABI compliance*/,r4-r12, pc}
-
-.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation)
+ // jump to host if lookup succeeded
+ bxeq r5
+
+ /* otherwise the fast lookup failed */
+ /* RM ME -- stats only */
+ movw r1, #:lower16:vgPlain_stats__n_xindir_misses
+ movt r1, #:upper16:vgPlain_stats__n_xindir_misses
+ ldr r2, [r1, #0]
+ adds r2, r2, #1
+ str r2, [r1, #0]
+ ldr r2, [r1, #4]
+ adc r2, r2, #0
+ str r2, [r1, #4]
+
+ mov r1, #VG_TRC_INNER_FASTMISS
+ mov r2, #0
+ b postamble
+
+/* ------ Assisted jump ------ */
+.global VG_(disp_cp_xassisted)
+VG_(disp_cp_xassisted):
+ /* r8 contains the TRC */
+ mov r1, r8
+ mov r2, #0
+ b postamble
+
+/* ------ Event check failed ------ */
+.global VG_(disp_cp_evcheck_fail)
+VG_(disp_cp_evcheck_fail):
+ mov r1, #VG_TRC_INNER_COUNTERZERO
+ mov r2, #0
+ b postamble
+
+
+.size VG_(disp_run_translations), .-VG_(disp_run_translations)
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",%progbits
/*------------------------------------------------------------*/
/*----------------------------------------------------*/
-/*--- Preamble (set everything up) ---*/
+/*--- Entry and preamble (set everything up) ---*/
/*----------------------------------------------------*/
/* signature:
-UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
+UWord VG_(disp_run_translations)( UWord* two_words,
+ void* guest_state,
+ Addr host_addr );
*/
.text
-.globl VG_(run_innerloop)
-.type VG_(run_innerloop), @function
-VG_(run_innerloop):
- /* 4(%esp) holds guest_state */
- /* 8(%esp) holds do_profiling */
-
- /* ----- entry point to VG_(run_innerloop) ----- */
+.globl VG_(disp_run_translations)
+.type VG_(disp_run_translations), @function
+VG_(disp_run_translations):
+ /* 0(%esp) holds our return address. */
+ /* 4(%esp) holds two_words */
+ /* 8(%esp) holds guest_state */
+ /* 12(%esp) holds host_addr */
+
+ /* The preamble */
+
+ /* Save integer registers, since this is a pseudo-function. */
+ pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
pushl %edi
pushl %ebp
- /* 28(%esp) holds guest_state */
- /* 32(%esp) holds do_profiling */
+ /* 28+4(%esp) holds two_words */
+ /* 28+8(%esp) holds guest_state */
+ /* 28+12(%esp) holds host_addr */
- /* Set up the guest state pointer */
- movl 28(%esp), %ebp
-
- /* fetch %EIP into %eax */
- movl OFFSET_x86_EIP(%ebp), %eax
+ /* Get the host CPU in the state expected by generated code. */
/* set host FPU control word to the default mode expected
by VEX-generated code. See comments in libvex.h for
L1:
/* set dir flag to known value */
cld
-
- /* fall into main loop (the right one) */
- cmpl $0, 32(%esp) /* do_profiling */
- je VG_(run_innerloop__dispatch_unassisted_unprofiled)
- jmp VG_(run_innerloop__dispatch_unassisted_profiled)
- /*NOTREACHED*/
-/*----------------------------------------------------*/
-/*--- NO-PROFILING (standard) dispatcher ---*/
-/*----------------------------------------------------*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_unassisted_unprofiled)
-VG_(run_innerloop__dispatch_unassisted_unprofiled):
- /* AT ENTRY: %eax is next guest addr, %ebp is the
- unmodified guest state ptr */
-
- /* save the jump address in the guest state */
- movl %eax, OFFSET_x86_EIP(%ebp)
-
- /* Are we out of timeslice? If yes, defer to scheduler. */
- subl $1, VG_(dispatch_ctr)
- jz counter_is_zero
-
- /* try a fast lookup in the translation cache */
- movl %eax, %ebx /* next guest addr */
- andl $ VG_TT_FAST_MASK, %ebx /* entry# */
- movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */
- movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */
- cmpl %eax, %esi
- jnz fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- jmp *%edi
- ud2 /* persuade insn decoders not to speculate past here */
- /* generated code should run, then jump back to either
- VG_(run_innerloop__dispatch_unassisted_unprofiled) or
- VG_(run_innerloop__dispatch_assisted_unprofiled). */
- /*NOTREACHED*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_assisted_unprofiled)
-VG_(run_innerloop__dispatch_assisted_unprofiled):
- /* AT ENTRY: %eax is next guest addr, %ebp is the
- modified guest state ptr */
- /* We know the guest state pointer has been modified.
- So jump directly to gsp_changed. */
- jmp gsp_changed
- ud2
- /*NOTREACHED*/
-
-/*----------------------------------------------------*/
-/*--- PROFILING dispatcher (can be much slower) ---*/
-/*----------------------------------------------------*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_unassisted_profiled)
-VG_(run_innerloop__dispatch_unassisted_profiled):
- /* AT ENTRY: %eax is next guest addr, %ebp is the
- unmodified guest state ptr */
-
- /* save the jump address in the guest state */
- movl %eax, OFFSET_x86_EIP(%ebp)
-
- /* Are we out of timeslice? If yes, defer to scheduler. */
- subl $1, VG_(dispatch_ctr)
- jz counter_is_zero
-
- /* try a fast lookup in the translation cache */
- movl %eax, %ebx /* next guest addr */
- andl $ VG_TT_FAST_MASK, %ebx /* entry# */
- movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */
- movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */
- cmpl %eax, %esi
- jnz fast_lookup_failed
-
- /* increment bb profile counter */
- /* note: innocuous as this sounds, it causes a huge amount more
- stress on D1 and significantly slows everything down. */
- movl VG_(tt_fastN)(,%ebx,4), %edx
- /* Use "addl $1", not "incl", to avoid partial-flags stall on P4 */
- addl $1, (%edx)
-
- /* Found a match. Jump to .host. */
- jmp *%edi
- ud2 /* persuade insn decoders not to speculate past here */
- /* generated code should run, then jump back to either
- VG_(run_innerloop__dispatch_unassisted_profiled) or
- VG_(run_innerloop__dispatch_assisted_profiled). */
- /*NOTREACHED*/
-
-.align 16
-.global VG_(run_innerloop__dispatch_assisted_profiled)
-VG_(run_innerloop__dispatch_assisted_profiled):
- /* AT ENTRY: %eax is next guest addr, %ebp is the
- modified guest state ptr */
- /* We know the guest state pointer has been modified.
- So jump directly to gsp_changed. */
- jmp gsp_changed
- ud2
+ /* Set up the guest state pointer */
+ movl 28+8(%esp), %ebp
+
+	/* and jump into the code cache. Chained translations in
+	   the code cache run until, for whatever reason, they can't
+	   continue. When that happens, the translation in question
+	   will jump (or call) to one of the continuation points
+	   VG_(disp_cp_...) below. */
+ jmpl *28+12(%esp)
/*NOTREACHED*/
/*----------------------------------------------------*/
-/*--- exit points ---*/
+/*--- Postamble and exit. ---*/
/*----------------------------------------------------*/
-gsp_changed:
- /* Someone messed with the gsp. Have to
- defer to scheduler to resolve this. dispatch ctr
- is not yet decremented, so no need to increment. */
- /* %EIP is NOT up to date here. First, need to write
- %eax back to %EIP, but without trashing %ebp since
- that holds the value we want to return to the scheduler.
- Hence use %esi transiently for the guest state pointer. */
- movl 28(%esp), %esi
- movl %eax, OFFSET_x86_EIP(%esi)
- movl %ebp, %eax
- jmp run_innerloop_exit
- /*NOTREACHED*/
-
-counter_is_zero:
- /* %EIP is up to date here */
- /* back out decrement of the dispatch counter */
- addl $1, VG_(dispatch_ctr)
- movl $ VG_TRC_INNER_COUNTERZERO, %eax
- jmp run_innerloop_exit
- /*NOTREACHED*/
-
-fast_lookup_failed:
- /* %EIP is up to date here */
- /* back out decrement of the dispatch counter */
- addl $1, VG_(dispatch_ctr)
- movl $ VG_TRC_INNER_FASTMISS, %eax
- jmp run_innerloop_exit
- /*NOTREACHED*/
+postamble:
+ /* At this point, %eax and %edx contain two
+ words to be returned to the caller. %eax
+ holds a TRC value, and %edx optionally may
+ hold another word (for CHAIN_ME exits, the
+ address of the place to patch.) */
-
-
-/* All exits from the dispatcher go through here. %eax holds
- the return value.
-*/
-run_innerloop_exit:
- /* We're leaving. Check that nobody messed with
- %mxcsr or %fpucw. We can't mess with %eax here as it
- holds the tentative return value, but any other is OK. */
+ /* We're leaving. Check that nobody messed with %mxcsr
+ or %fpucw. We can't mess with %eax or %edx here as they
+	   hold the tentative return values, but any others are OK. */
#if !defined(ENABLE_INNER)
/* This check fails for self-hosting, so skip in that case */
pushl $0
popl %esi /* get rid of the word without trashing %eflags */
jnz invariant_violation
#endif
- cmpl $0, VG_(machine_x86_have_mxcsr)
+# cmpl $0, VG_(machine_x86_have_mxcsr)
jz L2
pushl $0
stmxcsr (%esp)
popl %esi
jnz invariant_violation
L2: /* otherwise we're OK */
- jmp run_innerloop_exit_REALLY
-
+ jmp remove_frame
invariant_violation:
- movl $ VG_TRC_INVARIANT_FAILED, %eax
- jmp run_innerloop_exit_REALLY
-
-run_innerloop_exit_REALLY:
+ movl $VG_TRC_INVARIANT_FAILED, %eax
+ movl $0, %edx
+
+remove_frame:
+ /* Stash return values */
+ movl 28+4(%esp), %edi /* two_words */
+ movl %eax, 0(%edi)
+ movl %edx, 4(%edi)
+ /* Restore int regs and return. */
popl %ebp
popl %edi
popl %esi
popl %edx
popl %ecx
popl %ebx
+ popl %eax
ret
-.size VG_(run_innerloop), .-VG_(run_innerloop)
+
+/*----------------------------------------------------*/
+/*--- Continuation points ---*/
+/*----------------------------------------------------*/
+/* ------ Chain me to slow entry point ------ */
+.global VG_(disp_cp_chain_me_to_slowEP)
+VG_(disp_cp_chain_me_to_slowEP):
+ /* We got called. The return address indicates
+ where the patching needs to happen. Collect
+	   the return address and exit back to C land,
+ handing the caller the pair (Chain_me_S, RA) */
+ movl $VG_TRC_CHAIN_ME_TO_SLOW_EP, %eax
+ popl %edx
+ /* 5 = movl $VG_(disp_chain_me_to_slowEP), %edx;
+ 2 = call *%edx */
+ subl $5+2, %edx
+ jmp postamble
+
+/* ------ Chain me to fast entry point ------ */
+.global VG_(disp_cp_chain_me_to_fastEP)
+VG_(disp_cp_chain_me_to_fastEP):
+ /* We got called. The return address indicates
+ where the patching needs to happen. Collect
+	   the return address and exit back to C land,
+ handing the caller the pair (Chain_me_F, RA) */
+ movl $VG_TRC_CHAIN_ME_TO_FAST_EP, %eax
+ popl %edx
+ /* 5 = movl $VG_(disp_chain_me_to_fastEP), %edx;
+ 2 = call *%edx */
+ subl $5+2, %edx
+ jmp postamble
+
+/* ------ Indirect but boring jump ------ */
+.global VG_(disp_cp_xindir)
+VG_(disp_cp_xindir):
+ /* Where are we going? */
+ movl OFFSET_x86_EIP(%ebp), %eax
-/*------------------------------------------------------------*/
-/*--- ---*/
-/*--- A special dispatcher, for running no-redir ---*/
-/*--- translations. Just runs the given translation once. ---*/
-/*--- ---*/
-/*------------------------------------------------------------*/
+ /* RM ME -- stats only */
+ addl $1, vgPlain_stats__n_xindirs
+ adcl $0, vgPlain_stats__n_xindirs+4
+
+ /* try a fast lookup in the translation cache */
+ movl %eax, %ebx /* next guest addr */
+ andl $VG_TT_FAST_MASK, %ebx /* entry# */
+ movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */
+ movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */
+ cmpl %eax, %esi
+ jnz fast_lookup_failed
+
+ /* Found a match. Jump to .host. */
+ jmp *%edi
+ ud2 /* persuade insn decoders not to speculate past here */
-/* signature:
-void VG_(run_a_noredir_translation) ( UWord* argblock );
-*/
+fast_lookup_failed:
+ /* RM ME -- stats only */
+ addl $1, vgPlain_stats__n_xindir_misses
+ adcl $0, vgPlain_stats__n_xindir_misses+4
+
+ movl $VG_TRC_INNER_FASTMISS, %eax
+ movl $0, %edx
+ jmp postamble
+
+/* ------ Assisted jump ------ */
+.global VG_(disp_cp_xassisted)
+VG_(disp_cp_xassisted):
+ /* %ebp contains the TRC */
+ movl %ebp, %eax
+ movl $0, %edx
+ jmp postamble
+
+/* ------ Event check failed ------ */
+.global VG_(disp_cp_evcheck_fail)
+VG_(disp_cp_evcheck_fail):
+ movl $VG_TRC_INNER_COUNTERZERO, %eax
+ movl $0, %edx
+ jmp postamble
+
+
+.size VG_(disp_run_translations), .-VG_(disp_run_translations)
-/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args
- and 2 to carry results:
- 0: input: ptr to translation
- 1: input: ptr to guest state
- 2: output: next guest PC
- 3: output: guest state pointer afterwards (== thread return code)
-*/
-.align 16
-.global VG_(run_a_noredir_translation)
-.type VG_(run_a_noredir_translation), @function
-VG_(run_a_noredir_translation):
- /* Save callee-saves regs */
- pushl %esi
- pushl %edi
- pushl %ebp
- pushl %ebx
-
- movl 20(%esp), %edi /* %edi = argblock */
- movl 4(%edi), %ebp /* argblock[1] */
- jmp *0(%edi) /* argblock[0] */
- /*NOTREACHED*/
- ud2
- /* If the translation has been correctly constructed, we
- should resume at the the following label. */
-.global VG_(run_a_noredir_translation__return_point)
-VG_(run_a_noredir_translation__return_point):
- movl 20(%esp), %edi
- movl %eax, 8(%edi) /* argblock[2] */
- movl %ebp, 12(%edi) /* argblock[3] */
-
- popl %ebx
- popl %ebp
- popl %edi
- popl %esi
- ret
-.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation)
-
-
/* Let the linker know we don't need an executable stack */
.section .note.GNU-stack,"",@progbits
if ((i+1 == VG_(clo_dump_error))) {
StackTrace ips = VG_(get_ExeContext_StackTrace)(p_min->where);
- VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/,
+ VG_(translate) ( NULL/*caused_discardP*/,
+ 0 /* dummy ThreadId; irrelevant due to debugging*/,
ips[0], /*debugging*/True, 0xFE/*verbosity*/,
/*bbs_done*/0,
/*allow redir?*/True);
address = thumb_pc (address);
# endif
- VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/,
+ VG_(translate) ( NULL/*caused_discardP*/,
+ 0 /* dummy ThreadId; irrelevant due to debugging*/,
address,
/*debugging*/True,
(Int) vex_verbosity,
}
+/* ---------------------------------------------------------------------
+ icache invalidation
+ ------------------------------------------------------------------ */
+
+void VG_(invalidate_icache) ( void *ptr, SizeT nbytes )
+{
+# if defined(VGA_ppc32) || defined(VGA_ppc64)
+ Addr startaddr = (Addr) ptr;
+ Addr endaddr = startaddr + nbytes;
+ Addr cls;
+ Addr addr;
+ VexArchInfo vai;
+
+ if (nbytes == 0) return;
+ vg_assert(nbytes > 0);
+
+ VG_(machine_get_VexArchInfo)( NULL, &vai );
+ cls = vai.ppc_cache_line_szB;
+
+ /* Stay sane .. */
+ vg_assert(cls == 32 || cls == 64 || cls == 128);
+
+ startaddr &= ~(cls - 1);
+ for (addr = startaddr; addr < endaddr; addr += cls) {
+ __asm__ __volatile__("dcbst 0,%0" : : "r" (addr));
+ }
+ __asm__ __volatile__("sync");
+ for (addr = startaddr; addr < endaddr; addr += cls) {
+ __asm__ __volatile__("icbi 0,%0" : : "r" (addr));
+ }
+ __asm__ __volatile__("sync; isync");
+
+# elif defined(VGA_x86)
+ /* no need to do anything, hardware provides coherence */
+
+# elif defined(VGA_amd64)
+ /* no need to do anything, hardware provides coherence */
+
+# elif defined(VGA_s390x)
+ /* no need to do anything, hardware provides coherence */
+
+# elif defined(VGP_arm_linux)
+ /* ARM cache flushes are privileged, so we must defer to the kernel. */
+ Addr startaddr = (Addr) ptr;
+ Addr endaddr = startaddr + nbytes;
+ VG_(do_syscall2)(__NR_ARM_cacheflush, startaddr, endaddr);
+
+# else
+# error "Unknown ARCH"
+# endif
+}
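+
+/* Hypothetical usage sketch (not a call site added in this patch; the
+   helper name and patch_len are illustrative only): after writing a
+   patched jump into the code cache, for example when chaining one
+   translation to another, the modified bytes must be made visible to
+   the instruction-fetch path on targets without coherent I/D caches:
+
+      write_chained_jump( place_to_chain, target_hcode );  // illustrative
+      VG_(invalidate_icache)( place_to_chain, patch_len );
+*/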
+
+
/*--------------------------------------------------------------------*/
/*--- end ---*/
/*--------------------------------------------------------------------*/
score_here, buf_here, tops[r].addr, name );
VG_(printf)("\n");
VG_(discard_translations)(tops[r].addr, 1, "bb profile");
- VG_(translate)(0, tops[r].addr, True, VG_(clo_profile_flags), 0, True);
+ VG_(translate)(NULL/*caused_discardP*/,
+ 0, tops[r].addr, True, VG_(clo_profile_flags), 0, True);
VG_(printf)("=-=-=-=-=-=-=-=-=-=-=-=-=-= end BB rank %d "
"=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n", r);
}
VG_(printf)("pid=%d, entering delay loop\n", VG_(getpid)());
# if defined(VGP_x86_linux)
- iters = 5;
+ iters = 10;
# elif defined(VGP_amd64_linux) || defined(VGP_ppc64_linux)
iters = 10;
# elif defined(VGP_ppc32_linux)
iters = 5;
# elif defined(VGP_arm_linux)
- iters = 1;
+ iters = 5;
# elif defined(VGP_s390x_linux)
iters = 10;
# elif defined(VGO_darwin)
the OS handles threading and signalling are abstracted away and
implemented elsewhere. [Some of the functions have worked their
way back for the moment, until we do an OS port in earnest...]
- */
+*/
+
+/* FIXME tchaining tests:
+ - extensive spinrounds
+ - with sched quantum = 1 -- check that handle_noredir_jump
+ doesn't return with INNER_COUNTERZERO
+ other:
+ - out of date comment w.r.t. bit 0 set in libvex_trc_values.h
+ - can VG_TRC_BORING still happen? if not, rm
+ - memory leaks in m_transtab (InEdgeArr/OutEdgeArr leaking?)
+ - move do_cacheflush out of m_transtab
+ - more economical unchaining when nuking an entire sector
+ - ditto w.r.t. cache flushes
+ - add comments about caused_discard to handle_chain_me()
+ - verify case of 2 paths from A to B
+ - check -- is IP_AT_SYSCALL still right?
+*/
#include "pub_core_basics.h"
#include "pub_core_debuglog.h"
/* If False, a fault is Valgrind-internal (ie, a bug) */
Bool VG_(in_generated_code) = False;
-/* Counts downwards in VG_(run_innerloop). */
-UInt VG_(dispatch_ctr);
-
/* 64-bit counter for the number of basic blocks done. */
static ULong bbs_done = 0;
static ULong n_scheduling_events_MINOR = 0;
static ULong n_scheduling_events_MAJOR = 0;
+ULong VG_(stats__n_xindirs) = 0;
+ULong VG_(stats__n_xindir_misses) = 0;
+
/* Sanity checking counts. */
static UInt sanity_fast_count = 0;
static UInt sanity_slow_count = 0;
void VG_(print_scheduler_stats)(void)
{
VG_(message)(Vg_DebugMsg,
- "scheduler: %'llu jumps (bb entries).\n", bbs_done );
+ "scheduler: %'llu event checks.\n", bbs_done );
+ VG_(message)(Vg_DebugMsg,
+ "scheduler: %'llu indir transfers, %'llu misses (1 in %llu)\n",
+ VG_(stats__n_xindirs), VG_(stats__n_xindir_misses),
+ VG_(stats__n_xindirs) / (VG_(stats__n_xindir_misses)
+ ? VG_(stats__n_xindir_misses) : 1));
VG_(message)(Vg_DebugMsg,
"scheduler: %'llu/%'llu major/minor sched events.\n",
n_scheduling_events_MAJOR, n_scheduling_events_MINOR);
vg_assert(sz_spill == LibVEX_N_SPILL_BYTES);
vg_assert(a_vex + 3 * sz_vex == a_spill);
+# if defined(VGA_x86)
+ /* x86 XMM regs must form an array, ie, have no holes in
+ between. */
+ vg_assert(
+ (offsetof(VexGuestX86State,guest_XMM7)
+ - offsetof(VexGuestX86State,guest_XMM0))
+ == (8/*#regs*/-1) * 16/*bytes per reg*/
+ );
+ vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestX86State,guest_XMM0)));
+ vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestX86State,guest_FPREG)));
+ vg_assert(8 == offsetof(VexGuestX86State,guest_EAX));
+ vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EAX)));
+ vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EIP)));
+# endif
+
# if defined(VGA_amd64)
- /* x86/amd64 XMM regs must form an array, ie, have no
- holes in between. */
+ /* amd64 XMM regs must form an array, ie, have no holes in
+ between. */
vg_assert(
(offsetof(VexGuestAMD64State,guest_XMM16)
- offsetof(VexGuestAMD64State,guest_XMM0))
== (17/*#regs*/-1) * 16/*bytes per reg*/
);
+ vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestAMD64State,guest_XMM0)));
+ vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_FPREG)));
+ vg_assert(16 == offsetof(VexGuestAMD64State,guest_RAX));
+ vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RAX)));
+ vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RIP)));
# endif
# if defined(VGA_ppc32) || defined(VGA_ppc64)
# if defined(VGA_arm)
/* arm guest_state VFP regs must be 8 byte aligned for
- loads/stores. */
- vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D0));
- vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D0));
- vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow2.guest_D0));
+ loads/stores. Let's use 16 just to be on the safe side. */
+ vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_D0));
+ vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_D0));
+ vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_D0));
/* be extra paranoid .. */
vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D1));
vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D1));
}
/* Run the thread tid for a while, and return a VG_TRC_* value
- indicating why VG_(run_innerloop) stopped. */
-static UInt run_thread_for_a_while ( ThreadId tid )
+ indicating why VG_(disp_run_translations) stopped, and possibly an
+ auxiliary word. Also, only allow the thread to run for at most
+ *dispatchCtrP events. If (as is the normal case) use_alt_host_addr
+ is False, we are running ordinary redir'd translations, and we
+ should therefore start by looking up the guest next IP in TT. If
+ it is True then we ignore the guest next IP and just run from
+ alt_host_addr, which presumably points at host code for a no-redir
+ translation.
+
+ Return results are placed in two_words. two_words[0] is set to the
+ TRC. In the case where that is VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP,
+ the address to patch is placed in two_words[1].
+*/
+static
+void run_thread_for_a_while ( /*OUT*/HWord* two_words,
+ /*MOD*/Int* dispatchCtrP,
+ ThreadId tid,
+ HWord alt_host_addr,
+ Bool use_alt_host_addr )
{
- volatile UWord jumped;
- volatile ThreadState* tst = NULL; /* stop gcc complaining */
- volatile UInt trc;
- volatile Int dispatch_ctr_SAVED;
- volatile Int done_this_time;
+ volatile HWord jumped = 0;
+ volatile ThreadState* tst = NULL; /* stop gcc complaining */
+ volatile UInt trc = 0;
+ volatile Int done_this_time = 0;
+ volatile HWord host_code_addr = 0;
/* Paranoia */
vg_assert(VG_(is_valid_tid)(tid));
vg_assert(VG_(is_running_thread)(tid));
vg_assert(!VG_(is_exiting)(tid));
+ vg_assert(*dispatchCtrP > 0);
tst = VG_(get_ThreadState)(tid);
do_pre_run_checks( (ThreadState*)tst );
/* end Paranoia */
- trc = 0;
- dispatch_ctr_SAVED = VG_(dispatch_ctr);
+ /* Clear return area. */
+ two_words[0] = two_words[1] = 0;
+
+ /* Figure out where we're starting from. */
+ if (use_alt_host_addr) {
+ /* unusual case -- no-redir translation */
+ host_code_addr = alt_host_addr;
+ vg_assert(host_code_addr != 0); /* implausible */
+ } else {
+ /* normal case -- redir translation */
+ AddrH res = 0;
+ Bool found = VG_(search_transtab)(
+ &res, NULL, NULL,
+ (Addr64)tst->arch.vex.VG_INSTR_PTR,
+ True/*upd cache -- necessary?*/
+ );
+ if (found) {
+ host_code_addr = res;
+ vg_assert(host_code_addr != 0); /* implausible */
+ } else {
+ host_code_addr = 0;
+ }
+ }
+
+ /* At this point, either host_code_addr is nonzero, in which case
+ we're OK, or it's zero, in which case we know that we intended
+ to start at a normal redir translation, but it was not found.
+      In that case we can return now, claiming it's not findable. */
+ if (host_code_addr == 0) {
+ two_words[0] = VG_TRC_INNER_FASTMISS; /* hmm, is that right? */
+ return;
+ }
/* there should be no undealt-with signals */
//vg_assert(VG_(threads)[tid].siginfo.si_signo == 0);
+ /* Set up event counter stuff for the run. */
+ tst->arch.vex.host_EvC_COUNTER = *dispatchCtrP;
+ tst->arch.vex.host_EvC_FAILADDR = (HWord)&VG_(disp_cp_evcheck_fail);
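+   /* A sketch of what an event check in generated code is assumed to
+      do with these two fields (the exact instruction sequence is
+      chosen by VEX and is not part of this patch):
+
+         host_EvC_COUNTER--;
+         if (host_EvC_COUNTER < 0)
+            goto *host_EvC_FAILADDR;   // i.e. VG_(disp_cp_evcheck_fail)
+
+      which is consistent with the ">= -1" assertion and the
+      done_this_time arithmetic after the run, below. */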
+
if (0) {
vki_sigset_t m;
Int i, err = VG_(sigprocmask)(VKI_SIG_SETMASK, NULL, &m);
VG_(printf)("\n");
}
+ /* Set up return-value area. */
+
// Tell the tool this thread is about to run client code
VG_TRACK( start_client_code, tid, bbs_done );
SCHEDSETJMP(
tid,
jumped,
- trc = (UInt)VG_(run_innerloop)( (void*)&tst->arch.vex,
- VG_(clo_profile_flags) > 0 ? 1 : 0 )
+ trc = (UInt)VG_(disp_run_translations)(
+ two_words,
+ (void*)&tst->arch.vex,
+ host_code_addr
+ )
);
vg_assert(VG_(in_generated_code) == True);
VG_(in_generated_code) = False;
- if (jumped != (UWord)0) {
+ if (jumped != (HWord)0) {
/* We get here if the client took a fault that caused our signal
handler to longjmp. */
vg_assert(trc == 0);
- trc = VG_TRC_FAULT_SIGNAL;
+ two_words[0] = VG_TRC_FAULT_SIGNAL;
+ two_words[1] = 0;
block_signals();
}
- done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 0;
+ vg_assert((Int)tst->arch.vex.host_EvC_COUNTER >= -1);
+ vg_assert(tst->arch.vex.host_EvC_FAILADDR
+ == (HWord)&VG_(disp_cp_evcheck_fail));
+
+ done_this_time = *dispatchCtrP - ((Int)tst->arch.vex.host_EvC_COUNTER + 1);
vg_assert(done_this_time >= 0);
bbs_done += (ULong)done_this_time;
+ *dispatchCtrP -= done_this_time;
+ vg_assert(*dispatchCtrP >= 0);
+
// Tell the tool this thread has stopped running client code
VG_TRACK( stop_client_code, tid, bbs_done );
VG_(gdbserver) (tid);
}
- return trc;
-}
-
-
-/* Run a no-redir translation just once, and return the resulting
- VG_TRC_* value. */
-static UInt run_noredir_translation ( Addr hcode, ThreadId tid )
-{
- volatile UWord jumped;
- volatile ThreadState* tst;
- volatile UWord argblock[4];
- volatile UInt retval;
-
- /* Paranoia */
- vg_assert(VG_(is_valid_tid)(tid));
- vg_assert(VG_(is_running_thread)(tid));
- vg_assert(!VG_(is_exiting)(tid));
-
- tst = VG_(get_ThreadState)(tid);
- do_pre_run_checks( (ThreadState*)tst );
- /* end Paranoia */
-
-# if defined(VGA_ppc32) || defined(VGA_ppc64)
- /* I don't think we need to clear this thread's guest_RESVN here,
- because we can only get here if run_thread_for_a_while() has
- been used immediately before, on this same thread. */
-# endif
-
- /* There can be 3 outcomes from VG_(run_a_noredir_translation):
-
- - a signal occurred and the sighandler longjmp'd. Then both [2]
- and [3] are unchanged - hence zero.
-
- - translation ran normally, set [2] (next guest IP) and set [3]
- to whatever [1] was beforehand, indicating a normal (boring)
- jump to the next block.
-
- - translation ran normally, set [2] (next guest IP) and set [3]
- to something different from [1] beforehand, which indicates a
- TRC_ value.
- */
- argblock[0] = (UWord)hcode;
- argblock[1] = (UWord)&VG_(threads)[tid].arch.vex;
- argblock[2] = 0; /* next guest IP is written here */
- argblock[3] = 0; /* guest state ptr afterwards is written here */
-
- // Tell the tool this thread is about to run client code
- VG_TRACK( start_client_code, tid, bbs_done );
-
- vg_assert(VG_(in_generated_code) == False);
- VG_(in_generated_code) = True;
-
- SCHEDSETJMP(
- tid,
- jumped,
- VG_(run_a_noredir_translation)( &argblock[0] )
- );
-
- VG_(in_generated_code) = False;
-
- if (jumped != (UWord)0) {
- /* We get here if the client took a fault that caused our signal
- handler to longjmp. */
- vg_assert(argblock[2] == 0); /* next guest IP was not written */
- vg_assert(argblock[3] == 0); /* trc was not written */
- block_signals();
- retval = VG_TRC_FAULT_SIGNAL;
+ /* TRC value and possible auxiliary patch-address word are already
+ in two_words[0] and [1] respectively, as a result of the call to
+      VG_(disp_run_translations). */
+ /* Stay sane .. */
+ if (two_words[0] == VG_TRC_CHAIN_ME_TO_SLOW_EP
+ || two_words[0] == VG_TRC_CHAIN_ME_TO_FAST_EP) {
+ vg_assert(two_words[1] != 0); /* we have a legit patch addr */
} else {
- /* store away the guest program counter */
- VG_(set_IP)( tid, argblock[2] );
- if (argblock[3] == argblock[1])
- /* the guest state pointer afterwards was unchanged */
- retval = VG_TRC_BORING;
- else
- retval = (UInt)argblock[3];
+ vg_assert(two_words[1] == 0); /* nobody messed with it */
}
-
- bbs_done++;
-
- // Tell the tool this thread has stopped running client code
- VG_TRACK( stop_client_code, tid, bbs_done );
-
- return retval;
}
/* Trivial event. Miss in the fast-cache. Do a full
lookup for it. */
- found = VG_(search_transtab)( NULL, ip, True/*upd_fast_cache*/ );
+ found = VG_(search_transtab)( NULL, NULL, NULL,
+ ip, True/*upd_fast_cache*/ );
if (UNLIKELY(!found)) {
/* Not found; we need to request a translation. */
- if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/,
+ if (VG_(translate)( NULL/*caused_discardP*/,
+ tid, ip, /*debug*/False, 0/*not verbose*/,
bbs_done, True/*allow redirection*/ )) {
- found = VG_(search_transtab)( NULL, ip, True );
- vg_assert2(found, "VG_TRC_INNER_FASTMISS: missing tt_fast entry");
+ found = VG_(search_transtab)( NULL, NULL, NULL,
+ ip, True );
+ vg_assert2(found, "handle_tt_miss: missing tt_fast entry");
} else {
// If VG_(translate)() fails, it's because it had to throw a
}
}
+static
+void handle_chain_me ( ThreadId tid, void* place_to_chain, Bool toFastEP )
+{
+ Bool found = False;
+ Addr ip = VG_(get_IP)(tid);
+ UInt to_sNo = (UInt)-1;
+ UInt to_tteNo = (UInt)-1;
+ Bool caused_discard = False;
+
+ found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo,
+ ip, False/*dont_upd_fast_cache*/ );
+ if (!found) {
+ /* Not found; we need to request a translation. */
+ if (VG_(translate)( &caused_discard,
+ tid, ip, /*debug*/False, 0/*not verbose*/,
+ bbs_done, True/*allow redirection*/ )) {
+ found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo,
+ ip, False );
+ vg_assert2(found, "handle_chain_me: missing tt_fast entry");
+ } else {
+ // If VG_(translate)() fails, it's because it had to throw a
+ // signal because the client jumped to a bad address. That
+ // means that either a signal has been set up for delivery,
+ // or the thread has been marked for termination. Either
+ // way, we just need to go back into the scheduler loop.
+ return;
+ }
+ }
+ vg_assert(found);
+ vg_assert(to_sNo != -1);
+ vg_assert(to_tteNo != -1);
+
+ /* So, finally we know where to patch through to. Do the patching
+ and update the various admin tables that allow it to be undone
+ in the case that the destination block gets deleted. */
+ if (!caused_discard)
+ VG_(tt_tc_do_chaining)( place_to_chain,
+ to_sNo, to_tteNo, toFastEP );
+}
+
static void handle_syscall(ThreadId tid, UInt trc)
{
ThreadState * volatile tst = VG_(get_ThreadState)(tid);
/* tid just requested a jump to the noredir version of its current
program counter. So make up that translation if needed, run it,
- and return the resulting thread return code. */
-static UInt/*trc*/ handle_noredir_jump ( ThreadId tid )
+ and return the resulting thread return code in two_words[]. */
+static
+void handle_noredir_jump ( /*OUT*/HWord* two_words,
+ /*MOD*/Int* dispatchCtrP,
+ ThreadId tid )
{
+ /* Clear return area. */
+ two_words[0] = two_words[1] = 0;
+
AddrH hcode = 0;
Addr ip = VG_(get_IP)(tid);
Bool found = VG_(search_unredir_transtab)( &hcode, ip );
if (!found) {
/* Not found; we need to request a translation. */
- if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done,
+ if (VG_(translate)( NULL/*caused_discardP*/,
+ tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done,
False/*NO REDIRECTION*/ )) {
found = VG_(search_unredir_transtab)( &hcode, ip );
vg_assert2(found, "unredir translation missing after creation?!");
-
} else {
// If VG_(translate)() fails, it's because it had to throw a
// signal because the client jumped to a bad address. That
// means that either a signal has been set up for delivery,
// or the thread has been marked for termination. Either
// way, we just need to go back into the scheduler loop.
- return VG_TRC_BORING;
+ two_words[0] = VG_TRC_BORING;
+ return;
}
}
vg_assert(found);
vg_assert(hcode != 0);
- /* Otherwise run it and return the resulting VG_TRC_* value. */
- return run_noredir_translation( hcode, tid );
+ /* Otherwise run it and return the resulting VG_TRC_* value. */
+ vg_assert(*dispatchCtrP > 0); /* so as to guarantee progress */
+ run_thread_for_a_while( two_words, dispatchCtrP, tid,
+ hcode, True/*use hcode*/ );
}
*/
VgSchedReturnCode VG_(scheduler) ( ThreadId tid )
{
- UInt trc;
+ /* Holds the remaining size of this thread's "timeslice". */
+ Int dispatch_ctr = 0;
+
ThreadState *tst = VG_(get_ThreadState)(tid);
static Bool vgdb_startup_action_done = False;
vg_assert(VG_(is_running_thread)(tid));
- VG_(dispatch_ctr) = SCHEDULING_QUANTUM + 1;
+ dispatch_ctr = SCHEDULING_QUANTUM;
while (!VG_(is_exiting)(tid)) {
- if (VG_(dispatch_ctr) == 1) {
+ vg_assert(dispatch_ctr >= 0);
+ if (dispatch_ctr == 0) {
/* Our slice is done, so yield the CPU to another thread. On
Linux, this doesn't sleep between sleeping and running,
exceed zero before entering the innerloop. Also also, the
decrement is done before the bb is actually run, so you
always get at least one decrement even if nothing happens. */
- VG_(dispatch_ctr) = SCHEDULING_QUANTUM + 1;
+ // FIXME is this right?
+ dispatch_ctr = SCHEDULING_QUANTUM;
/* paranoia ... */
vg_assert(tst->tid == tid);
if (0)
VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs\n",
- tid, VG_(dispatch_ctr) - 1 );
+ tid, dispatch_ctr - 1 );
- trc = run_thread_for_a_while ( tid );
+ HWord trc[2]; /* "two_words" */
+ run_thread_for_a_while( &trc[0],
+ &dispatch_ctr,
+ tid, 0/*ignored*/, False );
if (VG_(clo_trace_sched) && VG_(clo_verbosity) > 2) {
- Char buf[50];
- VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc));
+ HChar buf[50];
+ VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc[0]));
print_sched_event(tid, buf);
}
- if (trc == VEX_TRC_JMP_NOREDIR) {
+ if (trc[0] == VEX_TRC_JMP_NOREDIR) {
/* If we got a request to run a no-redir version of
something, do so now -- handle_noredir_jump just (creates
and) runs that one translation. The flip side is that the
request -- that would be nonsensical. It can, however,
return VG_TRC_BORING, which just means keep going as
normal. */
- trc = handle_noredir_jump(tid);
- vg_assert(trc != VEX_TRC_JMP_NOREDIR);
+ /* Note that the fact that we need to continue with a
+ no-redir jump is not recorded anywhere else in this
+ thread's state. So we *must* execute the block right now
+ -- we can't fail to execute it and later resume with it,
+ because by then we'll have forgotten the fact that it
+ should be run as no-redir, but will get run as a normal
+            potentially-redir'd one, hence screwing up.  This really ought
+ to be cleaned up, by noting in the guest state that the
+ next block to be executed should be no-redir. Then we can
+ suspend and resume at any point, which isn't the case at
+ the moment. */
+ handle_noredir_jump( &trc[0],
+ &dispatch_ctr,
+ tid );
+ vg_assert(trc[0] != VEX_TRC_JMP_NOREDIR);
+
+ /* This can't be allowed to happen, since it means the block
+ didn't execute, and we have no way to resume-as-noredir
+ after we get more timeslice. But I don't think it ever
+ can, since handle_noredir_jump will assert if the counter
+ is zero on entry. */
+ vg_assert(trc[0] != VG_TRC_INNER_COUNTERZERO);
+
+ /* A no-redir translation can't return with a chain-me
+ request, since chaining in the no-redir cache is too
+ complex. */
+ vg_assert(trc[0] != VG_TRC_CHAIN_ME_TO_SLOW_EP
+ && trc[0] != VG_TRC_CHAIN_ME_TO_FAST_EP);
}
- switch (trc) {
+ switch (trc[0]) {
+ case VEX_TRC_JMP_BORING:
+ /* assisted dispatch, no event. Used by no-redir
+ translations to force return to the scheduler. */
case VG_TRC_BORING:
/* no special event, just keep going. */
break;
case VG_TRC_INNER_FASTMISS:
- vg_assert(VG_(dispatch_ctr) > 1);
+ vg_assert(dispatch_ctr > 0);
handle_tt_miss(tid);
break;
-
+
+ case VG_TRC_CHAIN_ME_TO_SLOW_EP: {
+ if (0) VG_(printf)("sched: CHAIN_TO_SLOW_EP: %p\n", (void*)trc[1] );
+ handle_chain_me(tid, (void*)trc[1], False);
+ break;
+ }
+
+ case VG_TRC_CHAIN_ME_TO_FAST_EP: {
+ if (0) VG_(printf)("sched: CHAIN_TO_FAST_EP: %p\n", (void*)trc[1] );
+ handle_chain_me(tid, (void*)trc[1], True);
+ break;
+ }
+
case VEX_TRC_JMP_CLIENTREQ:
do_client_request(tid);
break;
case VEX_TRC_JMP_SYS_INT129: /* x86-darwin */
case VEX_TRC_JMP_SYS_INT130: /* x86-darwin */
case VEX_TRC_JMP_SYS_SYSCALL: /* amd64-linux, ppc32-linux, amd64-darwin */
- handle_syscall(tid, trc);
+ handle_syscall(tid, trc[0]);
if (VG_(clo_sanity_level) > 2)
VG_(sanity_check_general)(True); /* sanity-check every syscall */
break;
before swapping to another. That means that short term
spins waiting for hardware to poke memory won't cause a
thread swap. */
- if (VG_(dispatch_ctr) > 2000)
- VG_(dispatch_ctr) = 2000;
+ if (dispatch_ctr > 2000)
+ dispatch_ctr = 2000;
break;
case VG_TRC_INNER_COUNTERZERO:
/* Timeslice is out. Let a new thread be scheduled. */
- vg_assert(VG_(dispatch_ctr) == 1);
+ vg_assert(dispatch_ctr == 0);
break;
case VG_TRC_FAULT_SIGNAL:
default:
vg_assert2(0, "VG_(scheduler), phase 3: "
- "unexpected thread return code (%u)", trc);
+ "unexpected thread return code (%u)", trc[0]);
/* NOTREACHED */
break;
bb->tyenv = deepCopyIRTypeEnv(sb_in->tyenv);
bb->next = deepCopyIRExpr(sb_in->next);
bb->jumpkind = sb_in->jumpkind;
+ bb->offsIP = sb_in->offsIP;
delta = 0;
instead of the normal one.
TID is the identity of the thread requesting this translation.
-*/
-Bool VG_(translate) ( ThreadId tid,
- Addr64 nraddr,
- Bool debugging_translation,
- Int debugging_verbosity,
- ULong bbs_done,
- Bool allow_redirection )
+   *caused_discardP returns whether or not this translation resulted
+ in code being dumped from the main translation cache in order to
+ make space for the new translation.
+*/
+Bool VG_(translate) ( /*OUT*/Bool* caused_discardP,
+ ThreadId tid,
+ Addr64 nraddr,
+ Bool debugging_translation,
+ Int debugging_verbosity,
+ ULong bbs_done,
+ Bool allow_redirection )
{
Addr64 addr;
T_Kind kind;
VexTranslateResult tres;
VgCallbackClosure closure;
- /* Make sure Vex is initialised right. */
+ if (caused_discardP) *caused_discardP = False;
+ /* Make sure Vex is initialised right. */
static Bool vex_init_done = False;
if (!vex_init_done) {
}
vg_assert(objname);
VG_(printf)(
- "==== SB %d (exec'd %lld) [tid %d] 0x%llx %s %s+0x%llx\n",
+ "==== SB %d (evchecks %lld) [tid %d] 0x%llx %s %s+0x%llx\n",
VG_(get_bbs_translated)(), bbs_done, (Int)tid, addr,
fnname, objname, (ULong)objoff
);
vta.arch_host = vex_arch;
vta.archinfo_host = vex_archinfo;
vta.abiinfo_both = vex_abiinfo;
+ vta.callback_opaque = (void*)&closure;
vta.guest_bytes = (UChar*)ULong_to_Ptr(addr);
vta.guest_bytes_addr = (Addr64)addr;
- vta.callback_opaque = (void*)&closure;
vta.chase_into_ok = chase_into_ok;
- vta.preamble_function = preamble_fn;
vta.guest_extents = &vge;
vta.host_bytes = tmpbuf;
vta.host_bytes_size = N_TMPBUF;
IRSB*,VexGuestLayout*,VexGuestExtents*,
IRType,IRType)
= (IRSB*(*)(void*,IRSB*,VexGuestLayout*,VexGuestExtents*,IRType,IRType))f;
- vta.instrument1 = g;
+ vta.instrument1 = g;
}
/* No need for type kludgery here. */
- vta.instrument2 = need_to_handle_SP_assignment()
- ? vg_SP_update_pass
- : NULL;
- vta.finaltidy = VG_(needs).final_IR_tidy_pass
- ? VG_(tdict).tool_final_IR_tidy_pass
- : NULL;
- vta.needs_self_check = needs_self_check;
- vta.traceflags = verbosity;
-
- /* Set up the dispatch-return info. For archs without a link
- register, vex generates a jump back to the specified dispatch
- address. Else, it just generates a branch-to-LR. */
+ vta.instrument2 = need_to_handle_SP_assignment()
+ ? vg_SP_update_pass
+ : NULL;
+ vta.finaltidy = VG_(needs).final_IR_tidy_pass
+ ? VG_(tdict).tool_final_IR_tidy_pass
+ : NULL;
+ vta.needs_self_check = needs_self_check;
+ vta.preamble_function = preamble_fn;
+ vta.traceflags = verbosity;
+ vta.addProfInc = VG_(clo_profile_flags) > 0
+ && kind != T_NoRedir;
+
+ /* Set up the dispatch continuation-point info. If this is a
+ no-redir translation then it cannot be chained, and the chain-me
+ points are set to NULL to indicate that. The indir point must
+ also be NULL, since we can't allow this translation to do an
+ indir transfer -- that would take it back into the main
+ translation cache too.
+
+ All this is because no-redir translations live outside the main
+ translation cache (in a secondary one) and chaining them would
+   involve more administrative complexity than is worth the
+ hassle, because we don't expect them to get used often. So
+ don't bother. */
+ if (allow_redirection) {
+ vta.disp_cp_chain_me_to_slowEP = (void*) &VG_(disp_cp_chain_me_to_slowEP);
+ vta.disp_cp_chain_me_to_fastEP = (void*) &VG_(disp_cp_chain_me_to_fastEP);
+ vta.disp_cp_xindir = (void*) &VG_(disp_cp_xindir);
+ } else {
+ vta.disp_cp_chain_me_to_slowEP = NULL;
+ vta.disp_cp_chain_me_to_fastEP = NULL;
+ vta.disp_cp_xindir = NULL;
+ }
+   /* This doesn't involve chaining and so is always allowable. */
+ vta.disp_cp_xassisted = (void*) &VG_(disp_cp_xassisted);
+#if 0
+ // FIXME tidy this up and make profiling work again
# if defined(VGA_x86) || defined(VGA_amd64)
if (!allow_redirection) {
/* It's a no-redir translation. Will be run with the
# else
# error "Unknown arch"
# endif
+#endif /* 0 */
/* Sheesh. Finally, actually _do_ the translation! */
tres = LibVEX_Translate ( &vta );
// Note that we use nraddr (the non-redirected address), not
// addr, which might have been changed by the redirection
- VG_(add_to_transtab)( &vge,
- nraddr,
- (Addr)(&tmpbuf[0]),
- tmpbuf_used,
- tres.n_sc_extents > 0 );
+ Bool caused_discard
+ = VG_(add_to_transtab)( &vge,
+ nraddr,
+ (Addr)(&tmpbuf[0]),
+ tmpbuf_used,
+ tres.n_sc_extents > 0,
+ tres.offs_profInc,
+ vex_arch );
+ if (caused_discardP)
+ *caused_discardP = caused_discard;
} else {
+ vg_assert(tres.offs_profInc == -1); /* -1 == unset */
VG_(add_to_unredir_transtab)( &vge,
nraddr,
(Addr)(&tmpbuf[0]),
#include "pub_core_basics.h"
#include "pub_core_debuglog.h"
-#include "pub_core_machine.h" // For VG(machine_get_VexArchInfo)
+#include "pub_core_machine.h" // For VG_(machine_get_VexArchInfo)
#include "pub_core_libcbase.h"
+#include "pub_core_vki.h"       // to keep pub_core_libcproc.h happy, sigh
+#include "pub_core_libcproc.h" // VG_(invalidate_icache)
#include "pub_core_libcassert.h"
#include "pub_core_libcprint.h"
#include "pub_core_options.h"
#include "pub_core_transtab.h"
#include "pub_core_aspacemgr.h"
#include "pub_core_mallocfree.h" // VG_(out_of_memory_NORETURN)
-
-// JRS FIXME get rid of this somehow
-#if defined(VGP_arm_linux)
-# include "pub_core_vkiscnums.h" // __ARM_NR_cacheflush
-# include "pub_core_syscall.h" // VG_(do_syscallN)
-#endif
+#include "pub_core_xarray.h"
+#include "pub_core_dispatch.h" // For VG_(disp_cp*) addresses
/* #define DEBUG_TRANSTAB */
'deleted') and it is strongly recommended not to change this.
65521 is the largest prime <= 65535. */
#define N_TTES_PER_SECTOR /*30011*/ /*40009*/ 65521
+//DEBUG-ONLY: #define N_TTES_PER_SECTOR 10007
/* Because each sector contains a hash table of TTEntries, we need to
specify the maximum allowable loading, after which the sector is
/*------------------ TYPES ------------------*/
+/* In edges ("to-me") in the graph created by chaining. */
+typedef
+ struct {
+ UInt from_sNo; /* sector number */
+ UInt from_tteNo; /* TTE number in given sector */
+ UInt from_offs; /* code offset from TCEntry::tcptr where the patch is */
+ Bool to_fastEP; /* Is the patch to a fast or slow entry point? */
+ }
+ InEdge;
+
+
+/* Out edges ("from-me") in the graph created by chaining. */
+typedef
+ struct {
+ UInt to_sNo; /* sector number */
+ UInt to_tteNo; /* TTE number in given sector */
+ UInt from_offs; /* code offset in owning translation where patch is */
+ }
+ OutEdge;
+
+
+#define N_FIXED_IN_EDGE_ARR 3
+typedef
+ struct {
+ UInt n_fixed; /* 0 .. N_FIXED_IN_EDGE_ARR */
+ InEdge fixed[N_FIXED_IN_EDGE_ARR];
+      XArray* var; /* XArray* of InEdge */
+ }
+ InEdgeArr;
+
+#define N_FIXED_OUT_EDGE_ARR 2
+typedef
+ struct {
+ UInt n_fixed; /* 0 .. N_FIXED_OUT_EDGE_ARR */
+ OutEdge fixed[N_FIXED_OUT_EDGE_ARR];
+      XArray* var; /* XArray* of OutEdge */
+ }
+ OutEdgeArr;
+
+
/* A translation-table entry. This indicates precisely which areas of
guest code are included in the translation, and contains all other
auxiliary info too. */
Count is an entry count for the translation and is
incremented by 1 every time the translation is used, if we
are profiling. */
- UInt count;
+ ULong count;
UShort weight;
/* Status of the slot. Note, we need to be able to do lazy
// sec->ec2tte[ tte2ec_ec[i] ][ tte2ec_ix[i] ]
// should be the index
// of this TTEntry in the containing Sector's tt array.
+
+ /* Admin information for chaining. 'in_edges' is a set of the
+ patch points which jump to this translation -- hence are
+ predecessors in the control flow graph. 'out_edges' points
+ to successors in the control flow graph -- translations to
+ which this one has a patched jump. In short these are just
+ backwards and forwards edges in the graph of patched-together
+ blocks. The 'in_edges' contain slightly more info, enough
+ that we can undo the chaining of each mentioned patch point.
+ The 'out_edges' list exists only so that we can visit the
+ 'in_edges' entries of all blocks we're patched through to, in
+         order to remove ourselves from them when we're deleted. */
+
+ /* It is possible, although very unlikely, that a block A has
+ more than one patched jump to block B. This could happen if
+ (eg) A finishes "jcond B; jmp B".
+
+ This means in turn that B's in_edges set can list A more than
+ once (twice in this example). However, each such entry must
+ have a different from_offs, since a patched jump can only
+ jump to one place at once (it's meaningless for it to have
+ multiple destinations.) IOW, the successor and predecessor
+ edges in the graph are not uniquely determined by a
+ TTEntry --> TTEntry pair, but rather by a
+ (TTEntry,offset) --> TTEntry triple.
+
+ If A has multiple edges to B then B will mention A multiple
+ times in its in_edges. To make things simpler, we then
+ require that A mentions B exactly the same number of times in
+ its out_edges. Furthermore, a matching out-in pair must have
+ the same offset (from_offs). This facilitates sanity
+ checking, and it facilitates establishing the invariant that
+         an out_edges set may not have duplicates when using the
+         equality defined by (TTEntry,offset).  Hence the out_edges
+         and in_edges sets really do both have set semantics.
+
+ eg if A has been patched to B at offsets 42 and 87 (in A)
+ then A.out_edges = { (B,42), (B,87) } (in any order)
+ and B.in_edges = { (A,42), (A,87) } (in any order)
+
+ Hence for each node pair P->Q in the graph, there's a 1:1
+ mapping between P.out_edges and Q.in_edges.
+ */
+ InEdgeArr in_edges;
+ OutEdgeArr out_edges;
}
TTEntry;
+/* A structure used for mapping host code addresses back to the
+ relevant TTEntry. Used when doing chaining, for finding the
+ TTEntry to which some arbitrary patch address belongs. */
+typedef
+ struct {
+ UChar* start;
+ UInt len;
+ UInt tteNo;
+ }
+ HostExtent;
+
/* Finally, a sector itself. Each sector contains an array of
TCEntries, which hold code, and an array of TTEntries, containing
all required administrative info. Profiling is supported using the
- TTEntry .count and .weight fields, if required. Each sector is
- independent in that no cross-sector references are allowed.
+ TTEntry .count and .weight fields, if required.
If the sector is not in use, all three pointers are NULL and
tt_n_inuse is zero.
Int ec2tte_size[ECLASS_N];
Int ec2tte_used[ECLASS_N];
UShort* ec2tte[ECLASS_N];
+
+ /* The host extents. The [start, +len) ranges are constructed
+ in strictly non-overlapping order, so we can binary search
+ them at any time. */
+ XArray* host_extents; /* XArray* of HostExtent */
}
Sector;
*/
/*global*/ __attribute__((aligned(16)))
FastCacheEntry VG_(tt_fast)[VG_TT_FAST_SIZE];
-/*
-#define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1)
-*/
-
-/* For profiling, we have a parallel array of pointers to .count
- fields in TT entries. Again, these pointers must be invalidated
- when translations disappear. A NULL pointer suffices to indicate
- an unused slot.
-
- When not profiling (the normal case, VG_(clo_profile_flags) == 0),
- all tt_fastN entries are set to NULL at startup and never read nor
- written after that.
-
- When profiling (VG_(clo_profile_flags) > 0), tt_fast and tt_fastN
- change together: if tt_fast[i].guest is TRANSTAB_BOGUS_GUEST_ADDR
- then the corresponding tt_fastN[i] must be null. If
- tt_fast[i].guest is any other value, then tt_fastN[i] *must* point
- to the .count field of the corresponding TT entry.
-
- tt_fast and tt_fastN are referred to from assembly code
- (dispatch.S).
-*/
-/*global*/ UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE];
-
/* Make sure we're not used before initialisation. */
static Bool init_done = False;
/*------------------ STATS DECLS ------------------*/
/* Number of fast-cache updates and flushes done. */
-ULong n_fast_flushes = 0;
-ULong n_fast_updates = 0;
+static ULong n_fast_flushes = 0;
+static ULong n_fast_updates = 0;
/* Number of full lookups done. */
-ULong n_full_lookups = 0;
-ULong n_lookup_probes = 0;
+static ULong n_full_lookups = 0;
+static ULong n_lookup_probes = 0;
/* Number/osize/tsize of translations entered; also the number of
those for which self-checking was requested. */
-ULong n_in_count = 0;
-ULong n_in_osize = 0;
-ULong n_in_tsize = 0;
-ULong n_in_sc_count = 0;
+static ULong n_in_count = 0;
+static ULong n_in_osize = 0;
+static ULong n_in_tsize = 0;
+static ULong n_in_sc_count = 0;
/* Number/osize of translations discarded due to lack of space. */
-ULong n_dump_count = 0;
-ULong n_dump_osize = 0;
+static ULong n_dump_count = 0;
+static ULong n_dump_osize = 0;
/* Number/osize of translations discarded due to requests to do so. */
-ULong n_disc_count = 0;
-ULong n_disc_osize = 0;
+static ULong n_disc_count = 0;
+static ULong n_disc_osize = 0;
+
+
+/*-------------------------------------------------------------*/
+/*--- Misc ---*/
+/*-------------------------------------------------------------*/
+
+static void* ttaux_malloc ( HChar* tag, SizeT n )
+{
+ return VG_(arena_malloc)(VG_AR_TTAUX, tag, n);
+}
+
+static void ttaux_free ( void* p )
+{
+ VG_(arena_free)(VG_AR_TTAUX, p);
+}
+
+
+/*-------------------------------------------------------------*/
+/*--- Chaining support ---*/
+/*-------------------------------------------------------------*/
+
+static inline TTEntry* index_tte ( UInt sNo, UInt tteNo )
+{
+ vg_assert(sNo < N_SECTORS);
+ vg_assert(tteNo < N_TTES_PER_SECTOR);
+   Sector* s = &sectors[sNo];
+ vg_assert(s->tt);
+ TTEntry* tte = &s->tt[tteNo];
+ vg_assert(tte->status == InUse);
+ return tte;
+}
+
+static void InEdge__init ( InEdge* ie )
+{
+ ie->from_sNo = -1; /* invalid */
+ ie->from_tteNo = 0;
+ ie->from_offs = 0;
+ ie->to_fastEP = False;
+}
+
+static void OutEdge__init ( OutEdge* oe )
+{
+ oe->to_sNo = -1; /* invalid */
+ oe->to_tteNo = 0;
+ oe->from_offs = 0;
+}
+
+static void TTEntry__init ( TTEntry* tte )
+{
+ VG_(memset)(tte, 0, sizeof(*tte));
+}
+
+static UWord InEdgeArr__size ( InEdgeArr* iea )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ return VG_(sizeXA)(iea->var);
+ } else {
+ vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR);
+ return iea->n_fixed;
+ }
+}
+
+static void InEdgeArr__makeEmpty ( InEdgeArr* iea )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ VG_(deleteXA)(iea->var);
+ iea->var = NULL;
+ } else {
+ vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR);
+ iea->n_fixed = 0;
+ }
+}
+
+static
+InEdge* InEdgeArr__index ( InEdgeArr* iea, UWord i )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ return (InEdge*)VG_(indexXA)(iea->var, i);
+ } else {
+ vg_assert(i < iea->n_fixed);
+ return &iea->fixed[i];
+ }
+}
+
+static
+void InEdgeArr__deleteIndex ( InEdgeArr* iea, UWord i )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ VG_(removeIndexXA)(iea->var, i);
+ } else {
+ vg_assert(i < iea->n_fixed);
+ for (; i+1 < iea->n_fixed; i++) {
+ iea->fixed[i] = iea->fixed[i+1];
+ }
+ iea->n_fixed--;
+ }
+}
+
+static
+void InEdgeArr__add ( InEdgeArr* iea, InEdge* ie )
+{
+ if (iea->var) {
+ vg_assert(iea->n_fixed == 0);
+ VG_(addToXA)(iea->var, ie);
+ } else {
+ vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR);
+ if (iea->n_fixed == N_FIXED_IN_EDGE_ARR) {
+ /* The fixed array is full, so we have to initialise an
+ XArray and copy the fixed array into it. */
+ iea->var = VG_(newXA)(ttaux_malloc, "transtab.IEA__add",
+ ttaux_free,
+ sizeof(InEdge));
+ UWord i;
+ for (i = 0; i < iea->n_fixed; i++) {
+ VG_(addToXA)(iea->var, &iea->fixed[i]);
+ }
+ VG_(addToXA)(iea->var, ie);
+ iea->n_fixed = 0;
+ } else {
+ /* Just add to the fixed array. */
+ iea->fixed[iea->n_fixed++] = *ie;
+ }
+ }
+}
+
+static UWord OutEdgeArr__size ( OutEdgeArr* oea )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ return VG_(sizeXA)(oea->var);
+ } else {
+ vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR);
+ return oea->n_fixed;
+ }
+}
+
+static void OutEdgeArr__makeEmpty ( OutEdgeArr* oea )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ VG_(deleteXA)(oea->var);
+ oea->var = NULL;
+ } else {
+ vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR);
+ oea->n_fixed = 0;
+ }
+}
+
+static
+OutEdge* OutEdgeArr__index ( OutEdgeArr* oea, UWord i )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ return (OutEdge*)VG_(indexXA)(oea->var, i);
+ } else {
+ vg_assert(i < oea->n_fixed);
+ return &oea->fixed[i];
+ }
+}
+
+static
+void OutEdgeArr__deleteIndex ( OutEdgeArr* oea, UWord i )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ VG_(removeIndexXA)(oea->var, i);
+ } else {
+ vg_assert(i < oea->n_fixed);
+ for (; i+1 < oea->n_fixed; i++) {
+ oea->fixed[i] = oea->fixed[i+1];
+ }
+ oea->n_fixed--;
+ }
+}
+
+static
+void OutEdgeArr__add ( OutEdgeArr* oea, OutEdge* oe )
+{
+ if (oea->var) {
+ vg_assert(oea->n_fixed == 0);
+ VG_(addToXA)(oea->var, oe);
+ } else {
+ vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR);
+ if (oea->n_fixed == N_FIXED_OUT_EDGE_ARR) {
+ /* The fixed array is full, so we have to initialise an
+ XArray and copy the fixed array into it. */
+ oea->var = VG_(newXA)(ttaux_malloc, "transtab.OEA__add",
+ ttaux_free,
+ sizeof(OutEdge));
+ UWord i;
+ for (i = 0; i < oea->n_fixed; i++) {
+ VG_(addToXA)(oea->var, &oea->fixed[i]);
+ }
+ VG_(addToXA)(oea->var, oe);
+ oea->n_fixed = 0;
+ } else {
+ /* Just add to the fixed array. */
+ oea->fixed[oea->n_fixed++] = *oe;
+ }
+ }
+}
+
+static
+Int HostExtent__cmpOrd ( void* v1, void* v2 )
+{
+ HostExtent* hx1 = (HostExtent*)v1;
+ HostExtent* hx2 = (HostExtent*)v2;
+ if (hx1->start + hx1->len <= hx2->start) return -1;
+ if (hx2->start + hx2->len <= hx1->start) return 1;
+ return 0; /* partial overlap */
+}
+
+static __attribute__((noinline))
+Bool find_TTEntry_from_hcode( /*OUT*/UInt* from_sNo,
+ /*OUT*/UInt* from_tteNo,
+ void* hcode )
+{
+ Int i;
+
+ /* Search order logic copied from VG_(search_transtab). */
+ for (i = 0; i < N_SECTORS; i++) {
+ Int sno = sector_search_order[i];
+ if (UNLIKELY(sno == -1))
+ return False; /* run out of sectors to search */
+
+      Sector* sec = &sectors[sno];
+ XArray* /* of HostExtent */ host_extents = sec->host_extents;
+ vg_assert(host_extents);
+
+ HostExtent key;
+ VG_(memset)(&key, 0, sizeof(key));
+ key.start = hcode;
+ key.len = 1;
+ Word firstW = -1, lastW = -1;
+ Bool found = VG_(lookupXA_UNSAFE)(
+ host_extents, &key, &firstW, &lastW,
+ (Int(*)(void*,void*))HostExtent__cmpOrd
+ );
+ vg_assert(firstW == lastW); // always true, even if not found
+ if (found) {
+ HostExtent* hx = VG_(indexXA)(host_extents, firstW);
+ UInt tteNo = hx->tteNo;
+ /* Do some additional sanity checks. */
+         vg_assert(tteNo < N_TTES_PER_SECTOR);
+ vg_assert(sec->tt[tteNo].status == InUse);
+ /* Can only half check that the found TTEntry contains hcode,
+ due to not having a length value for the hcode in the
+ TTEntry. */
+ vg_assert((UChar*)sec->tt[tteNo].tcptr <= (UChar*)hcode);
+ /* Looks plausible */
+ *from_sNo = sno;
+ *from_tteNo = (UInt)tteNo;
+ return True;
+ }
+ }
+ return False;
+}
+
+
+/* Figure out whether or not hcode is jitted code present in the main
+ code cache (but not in the no-redir cache). Used for sanity
+ checking. */
+static Bool is_in_the_main_TC ( void* hcode )
+{
+ Int i, sno;
+ for (i = 0; i < N_SECTORS; i++) {
+ sno = sector_search_order[i];
+ if (sno == -1)
+ break; /* run out of sectors to search */
+ if ((UChar*)hcode >= (UChar*)sectors[sno].tc
+ && (UChar*)hcode <= (UChar*)sectors[sno].tc_next
+ + sizeof(ULong) - 1)
+ return True;
+ }
+ return False;
+}
+
+
+/* Fulfill a chaining request, and record admin info so we
+ can undo it later, if required.
+*/
+void VG_(tt_tc_do_chaining) ( void* from__patch_addr,
+ UInt to_sNo,
+ UInt to_tteNo,
+ Bool to_fastEP )
+{
+ /* Get the CPU info established at startup. */
+ VexArch vex_arch = VexArch_INVALID;
+ VG_(machine_get_VexArchInfo)( &vex_arch, NULL );
+
+ // host_code is where we're patching to. So it needs to
+ // take into account, whether we're jumping to the slow
+ // or fast entry point. By definition, the fast entry point
+ // is exactly one event check's worth of code along from
+ // the slow (tcptr) entry point.
+ TTEntry* to_tte = index_tte(to_sNo, to_tteNo);
+ void* host_code = ((UChar*)to_tte->tcptr)
+ + (to_fastEP ? LibVEX_evCheckSzB(vex_arch) : 0);
+
+ // stay sane -- the patch point (dst) is in this sector's code cache
+ vg_assert( (UChar*)host_code >= (UChar*)sectors[to_sNo].tc );
+ vg_assert( (UChar*)host_code <= (UChar*)sectors[to_sNo].tc_next
+ + sizeof(ULong) - 1 );
+ // stay sane -- the patch src is in some sector's code cache
+ vg_assert( is_in_the_main_TC(from__patch_addr) );
+
+ /* Get VEX to do the patching itself. We have to hand it off
+ since it is host-dependent. */
+ VexInvalRange vir
+ = LibVEX_Chain( vex_arch,
+ from__patch_addr,
+ to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP)
+ : &VG_(disp_cp_chain_me_to_slowEP),
+ (void*)host_code );
+ VG_(invalidate_icache)( (void*)vir.start, vir.len );
+
+   /* Now do the tricky bit -- update the in_edges and out_edges info
+ for the two translations involved, so we can undo the chaining
+ later, which we will have to do if the to_ block gets removed
+ for whatever reason. */
+ /* Find the TTEntry for the from__ code. This isn't simple since
+ we only know the patch address, which is going to be somewhere
+ inside the from_ block. */
+ UInt from_sNo = (UInt)-1;
+ UInt from_tteNo = (UInt)-1;
+ Bool from_found
+ = find_TTEntry_from_hcode( &from_sNo, &from_tteNo,
+ from__patch_addr );
+ vg_assert(from_found);
+ TTEntry* from_tte = index_tte(from_sNo, from_tteNo);
+
+ /* This is the new from_ -> to_ link to add. */
+ InEdge ie;
+ InEdge__init(&ie);
+ ie.from_sNo = from_sNo;
+ ie.from_tteNo = from_tteNo;
+ ie.to_fastEP = to_fastEP;
+ HWord from_offs = (HWord)( (UChar*)from__patch_addr
+ - (UChar*)from_tte->tcptr );
+ vg_assert(from_offs < 100000/* let's say */);
+ ie.from_offs = (UInt)from_offs;
+
+ /* This is the new to_ -> from_ backlink to add. */
+ OutEdge oe;
+ OutEdge__init(&oe);
+ oe.to_sNo = to_sNo;
+ oe.to_tteNo = to_tteNo;
+ oe.from_offs = (UInt)from_offs;
+
+ /* Add .. */
+ InEdgeArr__add(&to_tte->in_edges, &ie);
+ OutEdgeArr__add(&from_tte->out_edges, &oe);
+}
+
+
+/* Unchain one patch, as described by the specified InEdge. For
+ sanity check purposes only (to check that the patched location is
+ as expected) it also requires the fast and slow entry point
+ addresses of the destination block (that is, the block that owns
+ this InEdge). */
+__attribute__((noinline))
+static void unchain_one ( VexArch vex_arch,
+ InEdge* ie,
+ void* to_fastEPaddr, void* to_slowEPaddr )
+{
+ vg_assert(ie);
+ TTEntry* tte
+ = index_tte(ie->from_sNo, ie->from_tteNo);
+ UChar* place_to_patch
+      = ((UChar*)tte->tcptr) + ie->from_offs;
+ UChar* disp_cp_chain_me
+ = ie->to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP)
+ : &VG_(disp_cp_chain_me_to_slowEP);
+ UChar* place_to_jump_to_EXPECTED
+ = ie->to_fastEP ? to_fastEPaddr : to_slowEPaddr;
+
+ // stay sane: both src and dst for this unchaining are
+ // in the main code cache
+ vg_assert( is_in_the_main_TC(place_to_patch) ); // src
+ vg_assert( is_in_the_main_TC(place_to_jump_to_EXPECTED) ); // dst
+ // dst check is ok because LibVEX_UnChain checks that
+ // place_to_jump_to_EXPECTED really is the current dst, and
+ // asserts if it isn't.
+ VexInvalRange vir
+ = LibVEX_UnChain( vex_arch, place_to_patch,
+ place_to_jump_to_EXPECTED, disp_cp_chain_me );
+ VG_(invalidate_icache)( (void*)vir.start, vir.len );
+}
+
+
+/* The specified block is about to be deleted. Update the preds and
+ succs of its associated blocks accordingly. This includes undoing
+ any chained jumps to this block. */
+static
+void unchain_in_preparation_for_deletion ( VexArch vex_arch,
+ UInt here_sNo, UInt here_tteNo )
+{
+ if (0)
+ VG_(printf)("QQQ unchain_in_prep %u.%u\n", here_sNo, here_tteNo);
+ UWord i, j, n, m;
+ Int evCheckSzB = LibVEX_evCheckSzB(vex_arch);
+ TTEntry* here_tte = index_tte(here_sNo, here_tteNo);
+ vg_assert(here_tte->status == InUse);
+
+ /* Visit all InEdges owned by here_tte. */
+ n = InEdgeArr__size(&here_tte->in_edges);
+ for (i = 0; i < n; i++) {
+ InEdge* ie = InEdgeArr__index(&here_tte->in_edges, i);
+ // Undo the chaining.
+ UChar* here_slow_EP = (UChar*)here_tte->tcptr;
+ UChar* here_fast_EP = here_slow_EP + evCheckSzB;
+ unchain_one(vex_arch, ie, here_fast_EP, here_slow_EP);
+ // Find the corresponding entry in the "from" node's out_edges,
+ // and remove it.
+ TTEntry* from_tte = index_tte(ie->from_sNo, ie->from_tteNo);
+ m = OutEdgeArr__size(&from_tte->out_edges);
+ vg_assert(m > 0); // it must have at least one entry
+ for (j = 0; j < m; j++) {
+ OutEdge* oe = OutEdgeArr__index(&from_tte->out_edges, j);
+ if (oe->to_sNo == here_sNo && oe->to_tteNo == here_tteNo
+ && oe->from_offs == ie->from_offs)
+ break;
+ }
+ vg_assert(j < m); // "oe must be findable"
+ OutEdgeArr__deleteIndex(&from_tte->out_edges, j);
+ }
+
+ /* Visit all OutEdges owned by here_tte. */
+ n = OutEdgeArr__size(&here_tte->out_edges);
+ for (i = 0; i < n; i++) {
+ OutEdge* oe = OutEdgeArr__index(&here_tte->out_edges, i);
+ // Find the corresponding entry in the "to" node's in_edges,
+ // and remove it.
+ TTEntry* to_tte = index_tte(oe->to_sNo, oe->to_tteNo);
+ m = InEdgeArr__size(&to_tte->in_edges);
+ vg_assert(m > 0); // it must have at least one entry
+ for (j = 0; j < m; j++) {
+ InEdge* ie = InEdgeArr__index(&to_tte->in_edges, j);
+ if (ie->from_sNo == here_sNo && ie->from_tteNo == here_tteNo
+ && ie->from_offs == oe->from_offs)
+ break;
+ }
+ vg_assert(j < m); // "ie must be findable"
+ InEdgeArr__deleteIndex(&to_tte->in_edges, j);
+ }
+
+ InEdgeArr__makeEmpty(&here_tte->in_edges);
+ OutEdgeArr__makeEmpty(&here_tte->out_edges);
+}
/*-------------------------------------------------------------*/
old_sz = sec->ec2tte_size[ec];
old_ar = sec->ec2tte[ec];
new_sz = old_sz==0 ? 8 : old_sz<64 ? 2*old_sz : (3*old_sz)/2;
- new_ar = VG_(arena_malloc)(VG_AR_TTAUX, "transtab.aECN.1",
- new_sz * sizeof(UShort));
+ new_ar = ttaux_malloc("transtab.aECN.1",
+ new_sz * sizeof(UShort));
for (i = 0; i < old_sz; i++)
new_ar[i] = old_ar[i];
if (old_ar)
- VG_(arena_free)(VG_AR_TTAUX, old_ar);
+ ttaux_free(old_ar);
sec->ec2tte_size[ec] = new_sz;
sec->ec2tte[ec] = new_ar;
/* forwards */
static Bool sanity_check_redir_tt_tc ( void );
-static Bool sanity_check_fastcache ( void );
static Bool sanity_check_sector_search_order ( void )
{
}
if ( !sanity_check_redir_tt_tc() )
return False;
- if ( !sanity_check_fastcache() )
- return False;
if ( !sanity_check_sector_search_order() )
return False;
return True;
return k32 % N_TTES_PER_SECTOR;
}
-static void setFastCacheEntry ( Addr64 key, ULong* tcptr, UInt* count )
+static void setFastCacheEntry ( Addr64 key, ULong* tcptr )
{
UInt cno = (UInt)VG_TT_FAST_HASH(key);
VG_(tt_fast)[cno].guest = (Addr)key;
VG_(tt_fast)[cno].host = (Addr)tcptr;
- if (VG_(clo_profile_flags) > 0)
- VG_(tt_fastN)[cno] = count;
n_fast_updates++;
/* This shouldn't fail. It should be assured by m_translate
which should reject any attempt to make translation of code
vg_assert(VG_(tt_fast)[cno].guest != TRANSTAB_BOGUS_GUEST_ADDR);
}
-/* Invalidate the fast cache's counter array, VG_(tt_fastN). */
-static void invalidateFastNCache ( void )
-{
- UInt j;
- vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0);
- for (j = 0; j < VG_TT_FAST_SIZE; j += 4) {
- VG_(tt_fastN)[j+0] = NULL;
- VG_(tt_fastN)[j+1] = NULL;
- VG_(tt_fastN)[j+2] = NULL;
- VG_(tt_fastN)[j+3] = NULL;
- }
- vg_assert(j == VG_TT_FAST_SIZE);
-}
-
-/* Invalidate the fast cache VG_(tt_fast). If profiling, also
- invalidate the fast cache's counter array VG_(tt_fastN), otherwise
- don't touch it. */
+/* Invalidate the fast cache VG_(tt_fast). */
static void invalidateFastCache ( void )
{
UInt j;
VG_(tt_fast)[j+3].guest = TRANSTAB_BOGUS_GUEST_ADDR;
}
- if (VG_(clo_profile_flags) > 0)
- invalidateFastNCache();
-
vg_assert(j == VG_TT_FAST_SIZE);
n_fast_flushes++;
}
-static Bool sanity_check_fastcache ( void )
+/* Returns True if the sector has been used before (hence, if we have
+ to eject existing code in it), False if it's never been used
+ before. */
+static Bool initialiseSector ( Int sno )
{
- UInt j;
- if (0) VG_(printf)("sanity check fastcache\n");
- if (VG_(clo_profile_flags) > 0) {
- /* profiling */
- for (j = 0; j < VG_TT_FAST_SIZE; j++) {
- if (VG_(tt_fastN)[j] == NULL
- && VG_(tt_fast)[j].guest != TRANSTAB_BOGUS_GUEST_ADDR)
- return False;
- if (VG_(tt_fastN)[j] != NULL
- && VG_(tt_fast)[j].guest == TRANSTAB_BOGUS_GUEST_ADDR)
- return False;
- }
- } else {
- /* not profiling */
- for (j = 0; j < VG_TT_FAST_SIZE; j++) {
- if (VG_(tt_fastN)[j] != NULL)
- return False;
- }
- }
- return True;
-}
-
-static void initialiseSector ( Int sno )
-{
- Int i;
- SysRes sres;
+ Int i;
+ SysRes sres;
Sector* sec;
+ Bool has_been_used_before = False;
vg_assert(isValidSector(sno));
{ Bool sane = sanity_check_sector_search_order();
vg_assert(sec->ec2tte_used[i] == 0);
vg_assert(sec->ec2tte[i] == NULL);
}
+ vg_assert(sec->host_extents == NULL);
VG_(debugLog)(1,"transtab", "allocate sector %d\n", sno);
sec->tt[i].n_tte2ec = 0;
}
+ /* Set up the host_extents array. */
+ sec->host_extents
+ = VG_(newXA)(ttaux_malloc, "transtab.initialiseSector(host_extents)",
+ ttaux_free,
+ sizeof(HostExtent));
+
/* Add an entry in the sector_search_order */
for (i = 0; i < N_SECTORS; i++) {
if (sector_search_order[i] == -1)
/* Sector has been used before. Dump the old contents. */
VG_(debugLog)(1,"transtab", "recycle sector %d\n", sno);
+ has_been_used_before = True;
vg_assert(sec->tt != NULL);
vg_assert(sec->tc_next != NULL);
n_dump_count += sec->tt_n_inuse;
+ VexArch vex_arch = VexArch_INVALID;
+ VG_(machine_get_VexArchInfo)( &vex_arch, NULL );
+
/* Visit each just-about-to-be-abandoned translation. */
+VG_(printf)("QQQ unlink-entire-sector: %d START\n", sno);
for (i = 0; i < N_TTES_PER_SECTOR; i++) {
if (sec->tt[i].status == InUse) {
vg_assert(sec->tt[i].n_tte2ec >= 1);
sec->tt[i].entry,
sec->tt[i].vge );
}
+ unchain_in_preparation_for_deletion(vex_arch, sno, i);
} else {
vg_assert(sec->tt[i].n_tte2ec == 0);
}
sec->tt[i].status = Empty;
sec->tt[i].n_tte2ec = 0;
}
+VG_(printf)("QQQ unlink-entire-sector: %d END\n", sno);
/* Free up the eclass structures. */
for (i = 0; i < ECLASS_N; i++) {
vg_assert(sec->ec2tte[i] == NULL);
} else {
vg_assert(sec->ec2tte[i] != NULL);
- VG_(arena_free)(VG_AR_TTAUX, sec->ec2tte[i]);
+ ttaux_free(sec->ec2tte[i]);
sec->ec2tte[i] = NULL;
sec->ec2tte_size[i] = 0;
sec->ec2tte_used[i] = 0;
}
}
+ /* Empty out the host extents array. */
+ vg_assert(sec->host_extents != NULL);
+ VG_(dropTailXA)(sec->host_extents, VG_(sizeXA)(sec->host_extents));
+ vg_assert(VG_(sizeXA)(sec->host_extents) == 0);
+
/* Sanity check: ensure it is already in
sector_search_order[]. */
for (i = 0; i < N_SECTORS; i++) {
{ Bool sane = sanity_check_sector_search_order();
vg_assert(sane);
}
-}
-
-static void invalidate_icache ( void *ptr, Int nbytes )
-{
-# if defined(VGA_ppc32) || defined(VGA_ppc64)
- Addr startaddr = (Addr) ptr;
- Addr endaddr = startaddr + nbytes;
- Addr cls;
- Addr addr;
- VexArchInfo vai;
-
- if (nbytes == 0) return;
- vg_assert(nbytes > 0);
-
- VG_(machine_get_VexArchInfo)( NULL, &vai );
- cls = vai.ppc_cache_line_szB;
-
- /* Stay sane .. */
- vg_assert(cls == 32 || cls == 64 || cls == 128);
-
- startaddr &= ~(cls - 1);
- for (addr = startaddr; addr < endaddr; addr += cls) {
- __asm__ __volatile__("dcbst 0,%0" : : "r" (addr));
- }
- __asm__ __volatile__("sync");
- for (addr = startaddr; addr < endaddr; addr += cls) {
- __asm__ __volatile__("icbi 0,%0" : : "r" (addr));
- }
- __asm__ __volatile__("sync; isync");
-
-# elif defined(VGA_x86)
- /* no need to do anything, hardware provides coherence */
-
-# elif defined(VGA_amd64)
- /* no need to do anything, hardware provides coherence */
-
-# elif defined(VGA_s390x)
- /* no need to do anything, hardware provides coherence */
-
-# elif defined(VGP_arm_linux)
- /* ARM cache flushes are privileged, so we must defer to the kernel. */
- Addr startaddr = (Addr) ptr;
- Addr endaddr = startaddr + nbytes;
- VG_(do_syscall2)(__NR_ARM_cacheflush, startaddr, endaddr);
-# else
-# error "Unknown ARCH"
-# endif
+ return has_been_used_before;
}
pre: youngest_sector points to a valid (although possibly full)
sector.
+
+ Returns True if the call caused any existing translation(s) to get
+ thrown away in order to make space for this one.
*/
-void VG_(add_to_transtab)( VexGuestExtents* vge,
+Bool VG_(add_to_transtab)( VexGuestExtents* vge,
Addr64 entry,
AddrH code,
UInt code_len,
- Bool is_self_checking )
+ Bool is_self_checking,
+ Int offs_profInc,
+ VexArch arch_host )
{
Int tcAvailQ, reqdQ, y, i;
ULong *tcptr, *tcptr2;
UChar* srcP;
UChar* dstP;
+ /* We need to tell the caller whether this call caused any code to
+ be thrown away due to the TC becoming full, and hence the oldest
+ Sector to be emptied out and recycled. */
+ Bool caused_code_discarding = False;
+
vg_assert(init_done);
vg_assert(vge->n_used >= 1 && vge->n_used <= 3);
y = youngest_sector;
vg_assert(isValidSector(y));
- if (sectors[y].tc == NULL)
- initialiseSector(y);
+ if (sectors[y].tc == NULL) {
+ Bool used_before = initialiseSector(y);
+ vg_assert(!used_before);
+ }
/* Try putting the translation in this sector. */
reqdQ = (code_len + 7) >> 3;
if (youngest_sector >= N_SECTORS)
youngest_sector = 0;
y = youngest_sector;
- initialiseSector(y);
+ caused_code_discarding = initialiseSector(y);
+
}
/* Be sure ... */
dstP = (UChar*)tcptr;
srcP = (UChar*)code;
- for (i = 0; i < code_len; i++)
- dstP[i] = srcP[i];
+ VG_(memcpy)(dstP, srcP, code_len);
sectors[y].tc_next += reqdQ;
sectors[y].tt_n_inuse++;
- invalidate_icache( dstP, code_len );
-
/* more paranoia */
tcptr2 = sectors[y].tc_next;
   vg_assert(tcptr2 >= &sectors[y].tc[0]);
i = 0;
}
+ TTEntry__init(§ors[y].tt[i]);
sectors[y].tt[i].status = InUse;
sectors[y].tt[i].tcptr = tcptr;
sectors[y].tt[i].count = 0;
sectors[y].tt[i].vge = *vge;
sectors[y].tt[i].entry = entry;
+ /* Patch in the profile counter location, if necessary. */
+ if (offs_profInc != -1) {
+ vg_assert(offs_profInc >= 0 && offs_profInc < code_len);
+ VexInvalRange vir
+ = LibVEX_PatchProfInc( arch_host,
+ dstP + offs_profInc,
+                              &sectors[y].tt[i].count );
+ VG_(invalidate_icache)( (void*)vir.start, vir.len );
+ }
+
+ VG_(invalidate_icache)( dstP, code_len );
+
+ /* Add this entry to the host_extents map, checking that we're
+ adding in order. */
+ { HostExtent hx;
+ hx.start = (UChar*)tcptr;
+ hx.len = code_len;
+ hx.tteNo = i;
+ vg_assert(hx.len > 0); /* bsearch fails w/ zero length entries */
+ XArray* hx_array = sectors[y].host_extents;
+ vg_assert(hx_array);
+ Word n = VG_(sizeXA)(hx_array);
+ if (n > 0) {
+ HostExtent* hx_prev = (HostExtent*)VG_(indexXA)(hx_array, n-1);
+ vg_assert(hx_prev->start + hx_prev->len <= hx.start);
+ }
+ VG_(addToXA)(hx_array, &hx);
+ }
+
/* Update the fast-cache. */
-   setFastCacheEntry( entry, tcptr, &sectors[y].tt[i].count );
+ setFastCacheEntry( entry, tcptr );
/* Note the eclass numbers for this translation. */
   upd_eclasses_after_add( &sectors[y], i );
+
+ return caused_code_discarding;
}
requested, a successful search can also cause the fast-caches to be
updated.
*/
-Bool VG_(search_transtab) ( /*OUT*/AddrH* result,
+Bool VG_(search_transtab) ( /*OUT*/AddrH* res_hcode,
+ /*OUT*/UInt* res_sNo,
+ /*OUT*/UInt* res_tteNo,
Addr64 guest_addr,
Bool upd_cache )
{
/* found it */
if (upd_cache)
setFastCacheEntry(
- guest_addr, sectors[sno].tt[k].tcptr,
-            &sectors[sno].tt[k].count );
- if (result)
- *result = (AddrH)sectors[sno].tt[k].tcptr;
+ guest_addr, sectors[sno].tt[k].tcptr );
+ if (res_hcode)
+ *res_hcode = (AddrH)sectors[sno].tt[k].tcptr;
+ if (res_sNo)
+ *res_sNo = sno;
+ if (res_tteNo)
+ *res_tteNo = k;
/* pull this one one step closer to the front. For large
apps this more or less halves the number of required
probes. */
/* Delete a tt entry, and update all the eclass data accordingly. */
-static void delete_tte ( /*MOD*/Sector* sec, Int tteno )
+static void delete_tte ( /*MOD*/Sector* sec, UInt secNo, Int tteno,
+ VexArch vex_arch )
{
Int i, ec_num, ec_idx;
TTEntry* tte;
+ /* sec and secNo are mutually redundant; cross-check. */
+   vg_assert(sec == &sectors[secNo]);
+
vg_assert(tteno >= 0 && tteno < N_TTES_PER_SECTOR);
tte = &sec->tt[tteno];
vg_assert(tte->status == InUse);
vg_assert(tte->n_tte2ec >= 1 && tte->n_tte2ec <= 3);
+ /* Unchain .. */
+ unchain_in_preparation_for_deletion(vex_arch, secNo, tteno);
+
/* Deal with the ec-to-tte links first. */
for (i = 0; i < tte->n_tte2ec; i++) {
ec_num = (Int)tte->tte2ec_ec[i];
only consider translations in the specified eclass. */
static
-Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec,
+Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, UInt secNo,
Addr64 guest_start, ULong range,
- Int ec )
+ Int ec,
+ VexArch vex_arch )
{
Int i;
UShort tteno;
if (overlaps( guest_start, range, &tte->vge )) {
anyDeld = True;
- delete_tte( sec, (Int)tteno );
+ delete_tte( sec, secNo, (Int)tteno, vex_arch );
}
}
slow way, by inspecting all translations in sec. */
static
-Bool delete_translations_in_sector ( /*MOD*/Sector* sec,
- Addr64 guest_start, ULong range )
+Bool delete_translations_in_sector ( /*MOD*/Sector* sec, UInt secNo,
+ Addr64 guest_start, ULong range,
+ VexArch vex_arch )
{
Int i;
Bool anyDeld = False;
if (sec->tt[i].status == InUse
&& overlaps( guest_start, range, &sec->tt[i].vge )) {
anyDeld = True;
- delete_tte( sec, i );
+ delete_tte( sec, secNo, i, vex_arch );
}
}
if (range == 0)
return;
+ VexArch vex_arch = VexArch_INVALID;
+ VG_(machine_get_VexArchInfo)( &vex_arch, NULL );
+
/* There are two different ways to do this.
If the range fits within a single address-range equivalence
if (sec->tc == NULL)
continue;
anyDeleted |= delete_translations_in_sector_eclass(
- sec, guest_start, range, ec );
+ sec, sno, guest_start, range, ec,
+ vex_arch
+ );
anyDeleted |= delete_translations_in_sector_eclass(
- sec, guest_start, range, ECLASS_MISC );
+ sec, sno, guest_start, range, ECLASS_MISC,
+ vex_arch
+ );
}
} else {
if (sec->tc == NULL)
continue;
anyDeleted |= delete_translations_in_sector(
- sec, guest_start, range );
+ sec, sno, guest_start, range, vex_arch );
}
}
for (j = 0; j < code_len; j++)
dstP[j] = srcP[j];
- invalidate_icache( dstP, code_len );
+ VG_(invalidate_icache)( dstP, code_len );
unredir_tt[i].inUse = True;
unredir_tt[i].vge = *vge;
sectors[i].ec2tte_used[j] = 0;
sectors[i].ec2tte[j] = NULL;
}
+ sectors[i].host_extents = NULL;
}
/* Initialise the sector_search_order hint table. */
for (i = 0; i < N_SECTORS; i++)
sector_search_order[i] = -1;
- /* Initialise the fast caches. If not profiling (the usual case),
- we have to explicitly invalidate the fastN cache as
- invalidateFastCache() won't do that for us. */
+ /* Initialise the fast cache. */
invalidateFastCache();
- if (VG_(clo_profile_flags) == 0)
- invalidateFastNCache();
/* and the unredir tt/tc */
init_unredir_tt_tc();
xa->usedsizeE -= n;
}
+void VG_(removeIndexXA)( XArray* xao, Word n )
+{
+ struct _XArray* xa = (struct _XArray*)xao;
+ vg_assert(xa);
+ vg_assert(n >= 0);
+ vg_assert(n < xa->usedsizeE);
+ if (n+1 < xa->usedsizeE) {
+ VG_(memmove)( ((char*)xa->arr) + (n+0) * xa->elemSzB,
+ ((char*)xa->arr) + (n+1) * xa->elemSzB,
+ (xa->usedsizeE - n - 1) * xa->elemSzB );
+ }
+ xa->usedsizeE--;
+}
+
void VG_(getContentsXA_UNSAFE)( XArray* xao,
/*OUT*/void** ctsP,
/*OUT*/Word* usedP )
#include "pub_core_dispatch_asm.h"
-/* This subroutine is called from the C world. It is passed
- a pointer to the VEX guest state (arch.vex). It must run code
- from the instruction pointer in the guest state, and exit when
- VG_(dispatch_ctr) reaches zero, or we need to defer to the scheduler.
+/* Run translations, with the given guest state, and starting by
+ running the host code at 'host_addr'. It is almost always the case
+ that host_addr is the translation for guest_state.guest_IP, that
+ is, host_addr is what it would be if we looked up the address of
+ the translation corresponding to guest_state.guest_IP.
+
+ The only case where this isn't true is where we're running a
+ no-redir translation. In this case host_addr is the address of the
+ alternative (non-redirected) translation for guest_state.guest_IP.
+
The return value must indicate why it returned back to the scheduler.
It can also be exited if the executing code throws a non-resumable
signal, for example SIGSEGV, in which case control longjmp()s back past
here.
- If do_profiling is nonzero, the profile counters arrays should be
- updated for each translation run.
-
- This code simply handles the common case fast -- when the translation
- address is found in the translation cache. For anything else, the
- scheduler does the work.
-
- NOTE, VG_(run_innerloop) MUST NOT BE USED for noredir translations.
- Instead use VG_(run_a_noredir_translation).
-*/
-extern
-UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling );
-#if defined(VGA_x86) || defined(VGA_amd64)
-/* We need to locate a couple of labels inside VG_(run_innerloop), so
- that Vex can add branches to them from generated code. Hence the
- following somewhat bogus decls. At least on x86 and amd64. ppc32
- and ppc64 use straightforward bl-blr to get from dispatcher to
- translation and back and so do not need these labels. */
-extern Addr VG_(run_innerloop__dispatch_unassisted_unprofiled);
-extern Addr VG_(run_innerloop__dispatch_assisted_unprofiled);
-extern Addr VG_(run_innerloop__dispatch_unassisted_profiled);
-extern Addr VG_(run_innerloop__dispatch_assisted_profiled);
-#endif
-
-
-/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args
- and 2 to carry results:
- 0: input: ptr to translation
- 1: input: ptr to guest state
- 2: output: next guest PC
- 3: output: guest state pointer afterwards (== thread return code)
- MUST NOT BE USED for non-noredir (normal) translations.
+ two_words holds the return values (two words). First is
+ a TRC value. Second is generally unused, except in the case
+ where we have to return a chain-me request.
*/
-extern void VG_(run_a_noredir_translation) ( volatile UWord* argblock );
-#if defined(VGA_x86) || defined(VGA_amd64)
-/* We need to a label inside VG_(run_a_noredir_translation), so that
- Vex can add branches to them from generated code. Hence the
- following somewhat bogus decl. */
-extern Addr VG_(run_a_noredir_translation__return_point);
-#endif
-
+HWord VG_(disp_run_translations)( HWord* two_words,
+ void* guest_state,
+ Addr host_addr );
+
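+/* Illustrative call sequence (a sketch only -- see run_thread_for_a_while
+   in the scheduler for the real call site; 'guest_state' here stands for
+   the thread's saved VEX guest state):
+
+      HWord two_words[2];
+      VG_(disp_run_translations)( &two_words[0], guest_state, host_addr );
+      // two_words[0] is a TRC value; two_words[1] is only meaningful
+      // for VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP, where it carries the
+      // address of the call site that needs to be patched.
+*/
+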
+/* We need to know addresses of the continuation-point (cp_) labels so
+ we can tell VEX what they are. They will get baked into the code
+ VEX generates. The UChar is entirely mythical, but we need to
+ state _some_ type, so as to keep gcc happy. */
+extern UChar VG_(disp_cp_chain_me_to_slowEP);
+extern UChar VG_(disp_cp_chain_me_to_fastEP);
+extern UChar VG_(disp_cp_xindir);
+extern UChar VG_(disp_cp_xassisted);
+extern UChar VG_(disp_cp_evcheck_fail);
#endif // __PUB_CORE_DISPATCH_H
/* And some more of our own. These must not have the same values as
those from libvex_trc_values.h. (viz, 60 or below is safe).
+ (The following comment is no longer relevant, but is retained
+ for historical purposes.)
These values *must* be odd (have bit 0 set) because the dispatchers
(coregrind/m_dispatch/dispatch-*-*.S) use this fact to distinguish
a TRC value from the unchanged baseblock pointer -- which has 0 as
its lowest bit.
*/
-#define VG_TRC_BORING 29 /* no event; just keep going */
-#define VG_TRC_INNER_FASTMISS 37 /* TRC only; means fast-cache miss. */
-#define VG_TRC_INNER_COUNTERZERO 41 /* TRC only; means bb ctr == 0 */
-#define VG_TRC_FAULT_SIGNAL 43 /* TRC only; got sigsegv/sigbus */
-#define VG_TRC_INVARIANT_FAILED 47 /* TRC only; invariant violation */
+#define VG_TRC_BORING 29 /* no event; just keep going */
+#define VG_TRC_INNER_FASTMISS 37 /* TRC only; means fast-cache miss. */
+#define VG_TRC_INNER_COUNTERZERO 41 /* TRC only; means bb ctr == 0 */
+#define VG_TRC_FAULT_SIGNAL 43 /* TRC only; got sigsegv/sigbus */
+#define VG_TRC_INVARIANT_FAILED 47 /* TRC only; invariant violation */
+#define VG_TRC_CHAIN_ME_TO_SLOW_EP 49 /* TRC only; chain to slow EP */
+#define VG_TRC_CHAIN_ME_TO_FAST_EP 51 /* TRC only; chain to fast EP */
#endif // __PUB_CORE_DISPATCH_ASM_H
extern void VG_(do_atfork_parent) ( ThreadId tid );
extern void VG_(do_atfork_child) ( ThreadId tid );
+// icache invalidation
+extern void VG_(invalidate_icache) ( void *ptr, SizeT nbytes );
+
+
#endif // __PUB_CORE_LIBCPROC_H
/*--------------------------------------------------------------------*/
//--------------------------------------------------------------------
extern
-Bool VG_(translate) ( ThreadId tid,
- Addr64 orig_addr,
- Bool debugging_translation,
- Int debugging_verbosity,
- ULong bbs_done,
- Bool allow_redirection );
+Bool VG_(translate) ( /*OUT*/Bool* caused_discardP,
+ ThreadId tid,
+ Addr64 orig_addr,
+ Bool debugging_translation,
+ Int debugging_verbosity,
+ ULong bbs_done,
+ Bool allow_redirection );
extern void VG_(print_translation_stats) ( void );
#include "pub_core_transtab_asm.h"
-/* The fast-cache for tt-lookup, and for finding counters. Unused
- entries are denoted by .guest == 1, which is assumed to be a bogus
- address for all guest code. */
+/* The fast-cache for tt-lookup. Unused entries are denoted by .guest
+ == 1, which is assumed to be a bogus address for all guest code. */
typedef
struct {
Addr guest;
#define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1)
-extern UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE];
-
extern void VG_(init_tt_tc) ( void );
extern
-void VG_(add_to_transtab)( VexGuestExtents* vge,
+Bool VG_(add_to_transtab)( VexGuestExtents* vge,
Addr64 entry,
AddrH code,
UInt code_len,
- Bool is_self_checking );
+ Bool is_self_checking,
+ Int offs_profInc,
+ VexArch arch_host );
-extern Bool VG_(search_transtab) ( /*OUT*/AddrH* result,
+extern
+void VG_(tt_tc_do_chaining) ( void* from__patch_addr,
+ UInt to_sNo,
+ UInt to_tteNo,
+ Bool to_fastEP );
+
+extern Bool VG_(search_transtab) ( /*OUT*/AddrH* res_hcode,
+ /*OUT*/UInt* res_sNo,
+ /*OUT*/UInt* res_tteNo,
Addr64 guest_addr,
Bool upd_cache );
ever be used. So instead the function is '(address >>u
2)[VG_TT_FAST_BITS-1 : 0]' on those targets.
- On ARM we do like ppc32/ppc64, although that will have to be
- revisited when we come to implement Thumb.
+ On ARM we shift by 1, since Thumb insns can be of size 2, hence to
+ minimise collisions and maximise cache utilisation we need to take
+ into account all but the least significant bit.
On s390x the rightmost bit of an instruction address is zero.
For best table utilization shift the address to the right by 1 bit. */
internals/register-uses.txt \
internals/release-HOWTO.txt \
internals/segments-seginfos.txt \
+ internals/t-chaining-notes.txt \
internals/threads-syscalls-signals.txt \
internals/tm-mutexstates.dot \
internals/tm-threadstates.dot \
--- /dev/null
+
+DO NOT MERGE
+~~~~~~~~~~~~
+
+Changes memcheck/tests/Makefile.am w.r.t. -mfloat-abi=softfp
+Ditto none/tests/arm/Makefile.am
+
+
+Verification todo
+~~~~~~~~~~~~~~~~~
+check that illegal insns on all targets don't cause the _toIR.c's to
+assert.
+
+check also with --vex-guest-chase-cond=yes
+
+check that all targets can run their insn set tests with
+--vex-guest-max-insns=1.
+
+
+Cleanups
+~~~~~~~~
+host_arm_isel.c and host_arm_defs.c: get rid of global var arm_hwcaps.
+
+host_x86_defs.c, host_amd64_defs.c: return proper VexInvalRange
+records from the patchers, instead of {0,0}, so that transparent
+self hosting works properly.
+
+
+Optimisations
+~~~~~~~~~~~~~
+all targets: change VG_(stats__n_xindirs) to a 32 bit counter, and
+empty out every now and again.
+
+amd64: XDirect: write const value to guest_RIP using single
+insn when the value is < 0x8000'0000
+
+arm: chain_XDirect: generate short form jumps when possible
+
+arm codegen: Generate ORRS for CmpwNEZ32(Or32(x,y))
+
+all targets: when nuking an entire sector, don't bother to undo the
+patching for any translations within the sector (nor with their
+invalidations).
+
+(somewhat implausible) for jumps to disp_cp_indir, have multiple
+copies of disp_cp_indir, one for each of the possible registers that
+could have held the target guest address before jumping to the stub.
+Then disp_cp_indir wouldn't have to reload it from memory each time.
+Might also have the effect of spreading out the indirect mispredict
+burden somewhat (across the multiple copies.)
+
+
+Implementation notes
+~~~~~~~~~~~~~~~~~~~~
+T-chaining changes -- summary
+
+* The code generators (host_blah_isel.c, host_blah_defs.[ch]) interact
+ more closely with Valgrind than before. In particular the
+ instruction selectors must use one of 3 different kinds of
+ control-transfer instructions: XDirect, XIndir and XAssisted.
+  All archs must use them in the same way; no more ad-hoc control transfer
+ instructions.
+ (more detail below)
+
+
+* With T-chaining, translations can jump between each other without
+ going through the dispatcher loop every time. This means that the
+ event check (counter dec, and exit if negative) the dispatcher loop
+ previously did now needs to be compiled into each translation.
+
+
+* The assembly dispatcher code (dispatch-arch-os.S) is still
+ present. It still provides table lookup services for
+ indirect branches, but it also provides a new feature:
+ dispatch points, to which the generated code jumps. There
+ are 5:
+
+ VG_(disp_cp_chain_me_to_slowEP):
+ VG_(disp_cp_chain_me_to_fastEP):
+ These are chain-me requests, used for Boring conditional and
+ unconditional jumps to destinations known at JIT time. The
+ generated code calls these (doesn't jump to them) and the
+ stub recovers the return address. These calls never return;
+ instead the call is done so that the stub knows where the
+ calling point is. It needs to know this so it can patch
+ the calling point to the requested destination.
+ VG_(disp_cp_xindir):
+ Old-style table lookup and go; used for indirect jumps
+ VG_(disp_cp_xassisted):
+ Most general and slowest kind. Can transfer to anywhere, but
+ first returns to scheduler to do some other event (eg a syscall)
+ before continuing.
+ VG_(disp_cp_evcheck_fail):
+ Code jumps here when the event check fails.
+
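+  On the scheduler side, a chain-me request comes back as two words: a
+  TRC value and the address of the call site to patch.  A sketch of how
+  they are consumed (the real logic is in VG_(scheduler) and
+  handle_chain_me):
+
+     HWord trc[2];
+     run_thread_for_a_while( &trc[0], &dispatch_ctr, tid, 0, False );
+     switch (trc[0]) {
+        case VG_TRC_CHAIN_ME_TO_SLOW_EP:
+           handle_chain_me( tid, (void*)trc[1], False ); break;
+        case VG_TRC_CHAIN_ME_TO_FAST_EP:
+           handle_chain_me( tid, (void*)trc[1], True );  break;
+        /* ... other TRC values handled as before ... */
+     }
+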
+
+* new instructions in backends: XDirect, XIndir and XAssisted.
+ XDirect is used for chainable jumps. It is compiled into a
+ call to VG_(disp_cp_chain_me_to_slowEP) or
+ VG_(disp_cp_chain_me_to_fastEP).
+
+ XIndir is used for indirect jumps. It is compiled into a jump
+ to VG_(disp_cp_xindir)
+
+ XAssisted is used for "assisted" (do something first, then jump)
+ transfers. It is compiled into a jump to VG_(disp_cp_xassisted)
+
+ All 3 of these may be conditional.
+
+ More complexity: in some circumstances (no-redir translations)
+ all transfers must be done with XAssisted. In such cases the
+ instruction selector will be told this.
+
+
+* Patching: XDirect is compiled basically into
+ %r11 = &VG_(disp_cp_chain_me_to_{slow,fast}EP)
+ call *%r11
+ Backends must provide a function (eg) chainXDirect_AMD64
+ which converts it into a jump to a specified destination
+ jmp $delta-of-PCs
+ or
+ %r11 = 64-bit immediate
+ jmpq *%r11
+ depending on branch distance.
+
+ Backends must provide a function (eg) unchainXDirect_AMD64
+ which restores the original call-to-the-stub version.
+
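+  On the core side the patching itself is done by VEX, since it is
+  host-specific.  A sketch of the flow (see VG_(tt_tc_do_chaining) and
+  unchain_one in m_transtab.c for the real code):
+
+     VexInvalRange vir
+        = LibVEX_Chain( vex_arch,
+                        place_to_patch,      /* the chain-me call site */
+                        to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP)
+                                  : &VG_(disp_cp_chain_me_to_slowEP),
+                        place_to_jump_to );
+     VG_(invalidate_icache)( (void*)vir.start, vir.len );
+
+  LibVEX_UnChain reverses this, restoring the original call to the
+  chain-me stub.
+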
+
+* Event checks. Each translation now has two entry points,
+ the slow one (slowEP) and fast one (fastEP). Like this:
+
+ slowEP:
+ counter--
+ if (counter < 0) goto VG_(disp_cp_evcheck_fail)
+ fastEP:
+ (rest of the translation)
+
+ slowEP is used for control flow transfers that are or might be
+ a back edge in the control flow graph. Insn selectors are
+ given the address of the highest guest byte in the block so
+ they can determine which edges are definitely not back edges.
+
+ The counter is placed in the first 8 bytes of the guest state,
+ and the address of VG_(disp_cp_evcheck_fail) is placed in
+ the next 8 bytes. This allows very compact checks on all
+ targets, since no immediates need to be synthesised, eg:
+
+ decq 0(%baseblock-pointer)
+ jns fastEP
+ jmpq *8(baseblock-pointer)
+ fastEP:
+
+ On amd64 a non-failing check is therefore 2 insns; all 3 occupy
+ just 8 bytes.
+
+ On amd64 the event check is created by a special single
+ pseudo-instruction AMD64_EvCheck.
+
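+  Because the event check sits at the very start of each translation,
+  the two entry points are a fixed distance apart: slowEP is the
+  translation's tcptr, and fastEP is tcptr plus the size of the event
+  check.  This is how m_transtab.c computes chain-to addresses (sketch):
+
+     UChar* slowEP = (UChar*)tte->tcptr;
+     UChar* fastEP = slowEP + LibVEX_evCheckSzB(vex_arch);
+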
+
+* BB profiling (for --profile-flags=). The dispatch assembly
+ dispatch-arch-os.S no longer deals with this and so is much
+ simplified. Instead the profile inc is compiled into each
+ translation, as the insn immediately following the event
+ check. Again, on amd64 a pseudo-insn AMD64_ProfInc is used.
+ Counters are now 64 bit even on 32 bit hosts, to avoid overflow.
+
+ One complexity is that at JIT time it is not known where the
+ address of the counter is. To solve this, VexTranslateResult
+ now returns the offset of the profile inc in the generated
+ code. When the counter address is known, VEX can be called
+ again to patch it in. Backends must supply eg
+ patchProfInc_AMD64 to make this happen.
+
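+  A sketch of the core-side patch-up, done when the translation is
+  added to the translation table (see VG_(add_to_transtab)):
+
+     if (offs_profInc != -1) {
+        VexInvalRange vir
+           = LibVEX_PatchProfInc( arch_host,
+                                  dstP + offs_profInc,
+                                  &sectors[y].tt[i].count );
+        VG_(invalidate_icache)( (void*)vir.start, vir.len );
+     }
+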
+
+* Front end changes (guest_blah_toIR.c)
+
+ The way the guest program counter is handled has changed
+ significantly. Previously, the guest PC was updated (in IR)
+ at the start of each instruction, except for the first insn
+ in an IRSB. This is inconsistent and doesn't work with the
+ new framework.
+
+ Now, each instruction must update the guest PC as its last
+ IR statement -- not its first. And no special exemption for
+ the first insn in the block. As before most of these are
+ optimised out by ir_opt, so no concerns about efficiency.
+
+ As a logical side effect of this, exits (IRStmt_Exit) and the
+ block-end transfer are both considered to write to the guest state
+ (the guest PC) and so need to be told the offset of it.
+
+ IR generators (eg disInstr_AMD64) are no longer allowed to set the
+ IRSB::next, to specify the block-end transfer address. Instead they
+ now indicate, to the generic steering logic that drives them (iow,
+ guest_generic_bb_to_IR.c), that the block has ended. This then
+ generates effectively "goto GET(PC)" (which, again, is optimised
+ away). What this does mean is that if the IR generator function
+ ends the IR of the last instruction in the block with an incorrect
+ assignment to the guest PC, execution will transfer to an incorrect
+ destination -- making the error obvious quickly.
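+
+  Illustratively (a sketch; exact offsets and types are arch-specific),
+  the IR for each guest insn now ends with a write to the guest PC:
+
+     (IMark for the 3-byte insn at 0x4000)
+     ... IR statements for that insn ...
+     PUT(offsIP) = 0x4003:I64
+
+  and the steering logic then supplies the block-end transfer as, in
+  effect, "goto GET(PC)", which ir_opt normally optimises away.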
bb->tyenv = deepCopyIRTypeEnv(bb_in->tyenv);
bb->next = deepCopyIRExpr(bb_in->next);
bb->jumpkind = bb_in->jumpkind;
+ bb->offsIP = bb_in->offsIP;
for (i = 0; i < bb_in->stmts_used; i++)
{
{ return memset(s, c, sz); }
void* VG_(memcpy)(void *d, const void *s, SizeT sz)
{ return memcpy(d, s, sz); }
+void* VG_(memmove)(void *d, const void *s, SizeT sz)
+{ return memmove(d, s, sz); }
Int VG_(memcmp)(const void* s1, const void* s2, SizeT n)
{ return memcmp(s1, s2, n); }
UInt VG_(printf)(const HChar *format, ...)
bbOut->tyenv = deepCopyIRTypeEnv(bbIn->tyenv);
bbOut->next = deepCopyIRExpr(bbIn->next);
bbOut->jumpkind = bbIn->jumpkind;
+ bbOut->offsIP = bbIn->offsIP;
// Copy verbatim any IR preamble preceding the first IMark
i = 0;
is the number of elements remaining in the XArray. */
extern void VG_(dropHeadXA) ( XArray*, Word );
+/* Remove the specified element of an XArray, and slide all elements
+ beyond it back one place. This is an O(N) operation, where N is
+ the number of elements after the specified element, in the
+ array. */
+extern void VG_(removeIndexXA)( XArray*, Word );
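+/* For example (sketch): if the array holds { 7, 8, 9 }, then
+   VG_(removeIndexXA)(xa, 1) leaves { 7, 9 }, with the old element at
+   index 2 slid down to index 1. */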
+
/* Make a new, completely independent copy of the given XArray, using
the existing allocation function to allocate the new space.
Returns NULL if the allocation function didn't manage to allocate
AM_CFLAGS += $(AM_FLAG_M3264_PRI)
AM_CXXFLAGS += $(AM_FLAG_M3264_PRI)
-if VGCONF_PLATFORMS_INCLUDE_ARM_LINUX
-AM_CFLAGS += -mfloat-abi=softfp
-AM_CXXFLAGS += -mfloat-abi=softfp
-endif
+#if VGCONF_PLATFORMS_INCLUDE_ARM_LINUX
+#AM_CFLAGS += -mfloat-abi=softfp
+#AM_CXXFLAGS += -mfloat-abi=softfp
+#endif
if VGCONF_OS_IS_DARWIN
atomic_incs_CFLAGS = $(AM_CFLAGS) -mdynamic-no-pic
#define vgPlain_printf printf
#define vgPlain_memset memset
#define vgPlain_memcpy memcpy
+#define vgPlain_memmove memmove
// Crudely replace some functions (in m_xarray.c, but not needed for
// this unit test) by (hopefully) failing asserts.
v6media_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 -mthumb
vfp_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \
- -mfpu=neon -mfloat-abi=softfp \
+ -mfpu=neon \
-mthumb
neon128_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \
- -mfpu=neon -mfloat-abi=softfp \
+ -mfpu=neon \
-mthumb
neon64_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \
- -mfpu=neon -mfloat-abi=softfp \
+ -mfpu=neon \
-mthumb