From: Julian Seward Date: Mon, 2 Apr 2012 21:56:03 +0000 (+0000) Subject: Add translation chaining support for amd64, x86 and ARM X-Git-Tag: svn/VALGRIND_3_8_0~350^2~11 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8b6f93641ca2f6f8caccdf40c29c4271f6799489;p=thirdparty%2Fvalgrind.git Add translation chaining support for amd64, x86 and ARM (Valgrind side). See #296422. git-svn-id: svn://svn.valgrind.org/valgrind/branches/TCHAIN@12484 --- diff --git a/Makefile.all.am b/Makefile.all.am index fe6fccd201..fdf4ab99aa 100644 --- a/Makefile.all.am +++ b/Makefile.all.am @@ -92,7 +92,9 @@ AM_CFLAGS_BASE = \ -Wmissing-declarations \ @FLAG_W_NO_FORMAT_ZERO_LENGTH@ \ -fno-strict-aliasing \ - -fno-builtin + -fno-builtin \ + \ + -O # These flags are used for building the preload shared objects. # The aim is to give reasonable performance but also to have good diff --git a/coregrind/m_dispatch/dispatch-amd64-linux.S b/coregrind/m_dispatch/dispatch-amd64-linux.S index a3e22d5a83..459c44708c 100644 --- a/coregrind/m_dispatch/dispatch-amd64-linux.S +++ b/coregrind/m_dispatch/dispatch-amd64-linux.S @@ -39,30 +39,36 @@ /*------------------------------------------------------------*/ /*--- ---*/ -/*--- The dispatch loop. VG_(run_innerloop) is used to ---*/ -/*--- run all translations except no-redir ones. ---*/ +/*--- The dispatch loop. VG_(disp_run_translations) is ---*/ +/*--- used to run all translations, ---*/ +/*--- including no-redir ones. ---*/ /*--- ---*/ /*------------------------------------------------------------*/ /*----------------------------------------------------*/ -/*--- Preamble (set everything up) ---*/ +/*--- Entry and preamble (set everything up) ---*/ /*----------------------------------------------------*/ /* signature: -UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); +UWord VG_(disp_run_translations)( UWord* two_words, + void* guest_state, + Addr host_addr ); */ - .text -.globl VG_(run_innerloop) -.type VG_(run_innerloop), @function -VG_(run_innerloop): - /* %rdi holds guest_state */ - /* %rsi holds do_profiling */ - - /* ----- entry point to VG_(run_innerloop) ----- */ +.globl VG_(disp_run_translations) +.type VG_(disp_run_translations), @function +VG_(disp_run_translations): + /* %rdi holds two_words */ + /* %rsi holds guest_state */ + /* %rdx holds host_addr */ + + /* The preamble */ + + /* Save integer registers, since this is a pseudo-function. */ + pushq %rax pushq %rbx pushq %rcx - pushq %rdx + pushq %rdx pushq %rsi pushq %rbp pushq %r8 @@ -73,20 +79,10 @@ VG_(run_innerloop): pushq %r13 pushq %r14 pushq %r15 - pushq %rdi /* guest_state */ - - movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15 - movl (%r15), %r15d - pushq %r15 + /* %rdi must be saved last */ + pushq %rdi - /* 8(%rsp) holds cached copy of guest_state ptr */ - /* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */ - - /* Set up the guest state pointer */ - movq %rdi, %rbp - - /* fetch %RIP into %rax */ - movq OFFSET_amd64_RIP(%rbp), %rax + /* Get the host CPU in the state expected by generated code. */ /* set host FPU control word to the default mode expected by VEX-generated code. 
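For orientation, this is how the new entry point is driven from the C side; the fragment is condensed from the m_scheduler/scheduler.c hunk later in this patch (run_thread_for_a_while), so the names come from there rather than being invented here.

    HWord two_words[2];
    two_words[0] = two_words[1] = 0;
    VG_(disp_run_translations)( two_words,
                                (void*)&tst->arch.vex,  /* guest state          */
                                host_code_addr );       /* host code to run     */
    /* two_words[0] holds a VG_TRC_* value.  two_words[1] is only
       meaningful for VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP, where it is the
       address of the call site that needs patching. */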
See comments in libvex.h for @@ -105,158 +101,37 @@ VG_(run_innerloop): /* set dir flag to known value */ cld - /* fall into main loop (the right one) */ - cmpq $0, %rsi - je VG_(run_innerloop__dispatch_unassisted_unprofiled) - jmp VG_(run_innerloop__dispatch_unassisted_profiled) - /*NOTREACHED*/ - -/*----------------------------------------------------*/ -/*--- NO-PROFILING (standard) dispatcher ---*/ -/*----------------------------------------------------*/ - -.align 16 -.global VG_(run_innerloop__dispatch_unassisted_unprofiled) -VG_(run_innerloop__dispatch_unassisted_unprofiled): - /* AT ENTRY: %rax is next guest addr, %rbp is the - unmodified guest state ptr */ - - /* save the jump address in the guest state */ - movq %rax, OFFSET_amd64_RIP(%rbp) - - /* Are we out of timeslice? If yes, defer to scheduler. */ - subl $1, 0(%rsp) - jz counter_is_zero - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - /* generated code should run, then jump back to either - VG_(run_innerloop__dispatch_unassisted_unprofiled) - VG_(run_innerloop__dispatch_assisted_unprofiled). */ - /*NOTREACHED*/ - -.align 16 -.global VG_(run_innerloop__dispatch_assisted_unprofiled) -VG_(run_innerloop__dispatch_assisted_unprofiled): - /* AT ENTRY: %rax is next guest addr, %rbp is the - modified guest state ptr */ - /* We know the guest state pointer has been modified. - So jump directly to gsp_changed. */ - jmp gsp_changed - ud2 - /*NOTREACHED*/ - -/*----------------------------------------------------*/ -/*--- PROFILING dispatcher (can be much slower) ---*/ -/*----------------------------------------------------*/ - -.align 16 -.global VG_(run_innerloop__dispatch_unassisted_profiled) -VG_(run_innerloop__dispatch_unassisted_profiled): - /* AT ENTRY: %rax is next guest addr, %rbp is the - unmodified guest state ptr */ - - /* save the jump address in the guest state */ - movq %rax, OFFSET_amd64_RIP(%rbp) - - /* Are we out of timeslice? If yes, defer to scheduler. */ - subl $1, 0(%rsp) - jz counter_is_zero - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* increment bb profile counter */ - movabsq $VG_(tt_fastN), %rdx - shrq $1, %rbx /* entry# * sizeof(UInt*) */ - movq (%rdx,%rbx,1), %rdx - addl $1, (%rdx) + /* Set up the guest state pointer */ + movq %rsi, %rbp - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - /* generated code should run, then jump back to either - VG_(run_innerloop__dispatch_unassisted_profiled) - VG_(run_innerloop__dispatch_assisted_profiled). */ - /*NOTREACHED*/ - -.align 16 -.global VG_(run_innerloop__dispatch_assisted_profiled) -VG_(run_innerloop__dispatch_assisted_profiled): - /* AT ENTRY: %rax is next guest addr, %rbp is the - modified guest state ptr */ - - /* Well, we know the guest state pointer has been modified. - So jump directly to gsp_changed. 
*/ - jmp gsp_changed - ud2 - /*NOTREACHED*/ + /* and jump into the code cache. Chained translations in + the code cache run, until for whatever reason, they can't + continue. When that happens, the translation in question + will jump (or call) to one of the continuation points + VG_(cp_...) below. */ + jmpq *%rdx + /*NOTREACHED*/ /*----------------------------------------------------*/ -/*--- exit points ---*/ +/*--- Postamble and exit. ---*/ /*----------------------------------------------------*/ -gsp_changed: - /* Someone messed with the gsp. Have to - defer to scheduler to resolve this. dispatch ctr - is not yet decremented, so no need to increment. */ - /* %RIP is NOT up to date here. First, need to write - %rax back to %RIP, but without trashing %rbp since - that holds the value we want to return to the scheduler. - Hence use %r15 transiently for the guest state pointer. */ - movq 8(%rsp), %r15 - movq %rax, OFFSET_amd64_RIP(%r15) - movq %rbp, %rax - jmp run_innerloop_exit - /*NOTREACHED*/ - -counter_is_zero: - /* %RIP is up to date here */ - /* back out decrement of the dispatch counter */ - addl $1, 0(%rsp) - movq $VG_TRC_INNER_COUNTERZERO, %rax - jmp run_innerloop_exit - -fast_lookup_failed: - /* %RIP is up to date here */ - /* back out decrement of the dispatch counter */ - addl $1, 0(%rsp) - movq $VG_TRC_INNER_FASTMISS, %rax - jmp run_innerloop_exit - - - -/* All exits from the dispatcher go through here. %rax holds - the return value. -*/ -run_innerloop_exit: - /* We're leaving. Check that nobody messed with - %mxcsr or %fpucw. We can't mess with %rax here as it - holds the tentative return value, but any other is OK. */ +postamble: + /* At this point, %rax and %rdx contain two + words to be returned to the caller. %rax + holds a TRC value, and %rdx optionally may + hold another word (for CHAIN_ME exits, the + address of the place to patch.) */ + + /* We're leaving. Check that nobody messed with %mxcsr + or %fpucw. We can't mess with %rax or %rdx here as they + hold the tentative return values, but any others are OK. */ #if !defined(ENABLE_INNER) /* This check fails for self-hosting, so skip in that case */ pushq $0 fstcw (%rsp) cmpl $0x027F, (%rsp) - popq %r15 /* get rid of the word without trashing %eflags */ + popq %r15 /* get rid of the word without trashing %rflags */ jnz invariant_violation #endif pushq $0 @@ -266,20 +141,17 @@ run_innerloop_exit: popq %r15 jnz invariant_violation /* otherwise we're OK */ - jmp run_innerloop_exit_REALLY - + jmp remove_frame invariant_violation: movq $VG_TRC_INVARIANT_FAILED, %rax - jmp run_innerloop_exit_REALLY - -run_innerloop_exit_REALLY: - - /* restore VG_(dispatch_ctr) */ - popq %r14 - movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15 - movl %r14d, (%r15) + movq $0, %rdx +remove_frame: + /* Pop %rdi, stash return values */ popq %rdi + movq %rax, 0(%rdi) + movq %rdx, 8(%rdi) + /* Now pop everything else */ popq %r15 popq %r14 popq %r13 @@ -293,61 +165,89 @@ run_innerloop_exit_REALLY: popq %rdx popq %rcx popq %rbx + popq %rax ret -.size VG_(run_innerloop), .-VG_(run_innerloop) + +/*----------------------------------------------------*/ +/*--- Continuation points ---*/ +/*----------------------------------------------------*/ - -/*------------------------------------------------------------*/ -/*--- ---*/ -/*--- A special dispatcher, for running no-redir ---*/ -/*--- translations. Just runs the given translation once. 
---*/ -/*--- ---*/ -/*------------------------------------------------------------*/ +/* ------ Chain me to slow entry point ------ */ +.global VG_(disp_cp_chain_me_to_slowEP) +VG_(disp_cp_chain_me_to_slowEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_S, RA) */ + movq $VG_TRC_CHAIN_ME_TO_SLOW_EP, %rax + popq %rdx + /* 10 = movabsq $VG_(disp_chain_me_to_slowEP), %r11; + 3 = call *%r11 */ + subq $10+3, %rdx + jmp postamble + +/* ------ Chain me to fast entry point ------ */ +.global VG_(disp_cp_chain_me_to_fastEP) +VG_(disp_cp_chain_me_to_fastEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_F, RA) */ + movq $VG_TRC_CHAIN_ME_TO_FAST_EP, %rax + popq %rdx + /* 10 = movabsq $VG_(disp_chain_me_to_fastEP), %r11; + 3 = call *%r11 */ + subq $10+3, %rdx + jmp postamble + +/* ------ Indirect but boring jump ------ */ +.global VG_(disp_cp_xindir) +VG_(disp_cp_xindir): + /* Where are we going? */ + movq OFFSET_amd64_RIP(%rbp), %rax -/* signature: -void VG_(run_a_noredir_translation) ( UWord* argblock ); -*/ + /* RM ME -- stats only */ + addq $1, vgPlain_stats__n_xindirs + + /* try a fast lookup in the translation cache */ + movabsq $VG_(tt_fast), %rcx + movq %rax, %rbx /* next guest addr */ + andq $VG_TT_FAST_MASK, %rbx /* entry# */ + shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ + movq 0(%rcx,%rbx,1), %r10 /* .guest */ + movq 8(%rcx,%rbx,1), %r11 /* .host */ + cmpq %rax, %r10 + jnz fast_lookup_failed -/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args - and 2 to carry results: - 0: input: ptr to translation - 1: input: ptr to guest state - 2: output: next guest PC - 3: output: guest state pointer afterwards (== thread return code) -*/ -.align 16 -.global VG_(run_a_noredir_translation) -.type VG_(run_a_noredir_translation), @function -VG_(run_a_noredir_translation): - /* Save callee-saves regs */ - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - pushq %rdi /* we will need it after running the translation */ - movq 8(%rdi), %rbp - jmp *0(%rdi) - /*NOTREACHED*/ - ud2 - /* If the translation has been correctly constructed, we - should resume at the the following label. */ -.global VG_(run_a_noredir_translation__return_point) -VG_(run_a_noredir_translation__return_point): - popq %rdi - movq %rax, 16(%rdi) - movq %rbp, 24(%rdi) - - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret -.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation) + /* Found a match. Jump to .host. 
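The two CHAIN_ME continuation points above recover the patch site from their own return address. Expressed in C (a sketch; return_addr stands for the word popped off the stack above), the arithmetic and the resulting two-word return are:

    /* 10 bytes: movabsq of the chain-me helper's address into %r11
        3 bytes: call *%r11                                          */
    Addr patch_site = return_addr - (10 + 3);      /* start of that sequence    */
    two_words[0]    = VG_TRC_CHAIN_ME_TO_SLOW_EP;  /* or .._FAST_EP             */
    two_words[1]    = patch_site;                  /* handed to handle_chain_me */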
*/ + jmp *%r11 + ud2 /* persuade insn decoders not to speculate past here */ + +fast_lookup_failed: + /* RM ME -- stats only */ + addq $1, vgPlain_stats__n_xindir_misses + + movq $VG_TRC_INNER_FASTMISS, %rax + movq $0, %rdx + jmp postamble + +/* ------ Assisted jump ------ */ +.global VG_(disp_cp_xassisted) +VG_(disp_cp_xassisted): + /* %rbp contains the TRC */ + movq %rbp, %rax + movq $0, %rdx + jmp postamble + +/* ------ Event check failed ------ */ +.global VG_(disp_cp_evcheck_fail) +VG_(disp_cp_evcheck_fail): + movq $VG_TRC_INNER_COUNTERZERO, %rax + movq $0, %rdx + jmp postamble + + +.size VG_(disp_run_translations), .-VG_(disp_run_translations) /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",@progbits diff --git a/coregrind/m_dispatch/dispatch-arm-linux.S b/coregrind/m_dispatch/dispatch-arm-linux.S index 9e2334957e..4833a75bcc 100644 --- a/coregrind/m_dispatch/dispatch-arm-linux.S +++ b/coregrind/m_dispatch/dispatch-arm-linux.S @@ -1,3 +1,4 @@ + /*--------------------------------------------------------------------*/ /*--- The core dispatch loop, for jumping to a code address. ---*/ /*--- dispatch-arm-linux.S ---*/ @@ -45,121 +46,121 @@ /*------------------------------------------------------------*/ /*----------------------------------------------------*/ -/*--- Preamble (set everything up) ---*/ +/*--- Entry and preamble (set everything up) ---*/ /*----------------------------------------------------*/ /* signature: -UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); +UWord VG_(disp_run_translations)( UWord* two_words, + void* guest_state, + Addr host_addr ); */ .text -.globl VG_(run_innerloop) -VG_(run_innerloop): - push {r0, r1, r4, r5, r6, r7, r8, r9, fp, lr} +.global VG_(disp_run_translations) +VG_(disp_run_translations): + /* r0 holds two_words + r1 holds guest_state + r2 holds host_addr + */ + /* The number of regs in this list needs to be even, in + order to keep the stack 8-aligned. */ + push {r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} /* set FPSCR to vex-required default value */ mov r4, #0 fmxr fpscr, r4 - /* r0 (hence also [sp,#0]) holds guest_state */ - /* r1 holds do_profiling */ - mov r8, r0 - ldr r0, [r8, #OFFSET_arm_R15T] - - /* fall into main loop (the right one) */ - cmp r1, #0 /* do_profiling */ - beq VG_(run_innerloop__dispatch_unprofiled) - b VG_(run_innerloop__dispatch_profiled) - + /* Set up the guest state pointer */ + mov r8, r1 + /* and jump into the code cache. Chained translations in + the code cache run, until for whatever reason, they can't + continue. When that happens, the translation in question + will jump (or call) to one of the continuation points + VG_(cp_...) below. */ + bx r2 + /* NOTREACHED */ + /*----------------------------------------------------*/ -/*--- NO-PROFILING (standard) dispatcher ---*/ +/*--- Postamble and exit. ---*/ /*----------------------------------------------------*/ -/* Pairing of insns below is my guesstimate of how dual dispatch would - work on an A8. JRS, 2011-May-28 */ - -.global VG_(run_innerloop__dispatch_unprofiled) -VG_(run_innerloop__dispatch_unprofiled): - - /* AT ENTRY: r0 is next guest addr, r8 is possibly - modified guest state ptr */ - - /* Has the guest state pointer been messed with? If yes, exit. */ - movw r3, #:lower16:VG_(dispatch_ctr) - tst r8, #1 - - movt r3, #:upper16:VG_(dispatch_ctr) - - bne gsp_changed - - /* save the jump address in the guest state */ - str r0, [r8, #OFFSET_arm_R15T] - - /* Are we out of timeslice? 
If yes, defer to scheduler. */ - ldr r2, [r3] - - subs r2, r2, #1 - - str r2, [r3] - - beq counter_is_zero - - /* try a fast lookup in the translation cache */ - // r0 = next guest, r1,r2,r3,r4 scratch - movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK - movw r4, #:lower16:VG_(tt_fast) - - and r2, r1, r0, LSR #1 // r2 = entry # - movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast) - - add r1, r4, r2, LSL #3 // r1 = &tt_fast[entry#] - - ldrd r4, r5, [r1, #0] // r4 = .guest, r5 = .host - - cmp r4, r0 +postamble: + /* At this point, r1 and r2 contain two + words to be returned to the caller. r1 + holds a TRC value, and r2 optionally may + hold another word (for CHAIN_ME exits, the + address of the place to patch.) */ - bne fast_lookup_failed - // r5: next-host r8: live, gsp - // r4: next-guest - // r2: entry # - // LIVE: r5, r8; all others dead - - /* Found a match. Jump to .host. */ - blx r5 - b VG_(run_innerloop__dispatch_unprofiled) -.ltorg - /*NOTREACHED*/ + /* We're leaving. Check that nobody messed with + FPSCR in ways we don't expect. */ + fmrx r4, fpscr + bic r4, #0xF8000000 /* mask out NZCV and QC */ + bic r4, #0x0000009F /* mask out IDC,IXC,UFC,OFC,DZC,IOC */ + cmp r4, #0 + beq remove_frame /* we're OK */ + /* otherwise we have an invariant violation */ + movw r1, #VG_TRC_INVARIANT_FAILED + movw r2, #0 + /* fall through */ + +remove_frame: + /* Restore int regs, including importantly r0 (two_words) */ + pop {r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + /* Stash return values */ + str r1, [r0, #0] + str r2, [r0, #4] + bx lr /*----------------------------------------------------*/ -/*--- PROFILING dispatcher (can be much slower) ---*/ +/*--- Continuation points ---*/ /*----------------------------------------------------*/ -.global VG_(run_innerloop__dispatch_profiled) -VG_(run_innerloop__dispatch_profiled): - - /* AT ENTRY: r0 is next guest addr, r8 is possibly - modified guest state ptr */ - - /* Has the guest state pointer been messed with? If yes, exit. */ - movw r3, #:lower16:VG_(dispatch_ctr) - tst r8, #1 - - movt r3, #:upper16:VG_(dispatch_ctr) - - bne gsp_changed - - /* save the jump address in the guest state */ - str r0, [r8, #OFFSET_arm_R15T] - - /* Are we out of timeslice? If yes, defer to scheduler. */ - ldr r2, [r3] - - subs r2, r2, #1 - - str r2, [r3] - - beq counter_is_zero - +/* ------ Chain me to slow entry point ------ */ +.global VG_(disp_cp_chain_me_to_slowEP) +VG_(disp_cp_chain_me_to_slowEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_S, RA) */ + mov r1, #VG_TRC_CHAIN_ME_TO_SLOW_EP + mov r2, lr + /* 4 = movw r12, lo16(disp_cp_chain_me_to_slowEP) + 4 = movt r12, hi16(disp_cp_chain_me_to_slowEP) + 4 = blx r12 */ + sub r2, r2, #4+4+4 + b postamble + +/* ------ Chain me to fast entry point ------ */ +.global VG_(disp_cp_chain_me_to_fastEP) +VG_(disp_cp_chain_me_to_fastEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_F, RA) */ + mov r1, #VG_TRC_CHAIN_ME_TO_FAST_EP + mov r2, lr + /* 4 = movw r12, lo16(disp_cp_chain_me_to_fastEP) + 4 = movt r12, hi16(disp_cp_chain_me_to_fastEP) + 4 = blx r12 */ + sub r2, r2, #4+4+4 + b postamble + +/* ------ Indirect but boring jump ------ */ +.global VG_(disp_cp_xindir) +VG_(disp_cp_xindir): + /* Where are we going? 
*/ + ldr r0, [r8, #OFFSET_arm_R15T] + + /* RM ME -- stats only */ + movw r1, #:lower16:vgPlain_stats__n_xindirs + movt r1, #:upper16:vgPlain_stats__n_xindirs + ldr r2, [r1, #0] + adds r2, r2, #1 + str r2, [r1, #0] + ldr r2, [r1, #4] + adc r2, r2, #0 + str r2, [r1, #4] + /* try a fast lookup in the translation cache */ // r0 = next guest, r1,r2,r3,r4 scratch movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK @@ -174,121 +175,41 @@ VG_(run_innerloop__dispatch_profiled): cmp r4, r0 - bne fast_lookup_failed - // r5: next-host r8: live, gsp - // r4: next-guest - // r2: entry # - // LIVE: r5, r8; all others dead - - /* increment bb profile counter */ - movw r0, #:lower16:VG_(tt_fastN) - movt r0, #:upper16:VG_(tt_fastN) // r0 = &tt_fastN[0] - ldr r0, [r0, r2, LSL #2] // r0 = tt_fast[entry #] - ldr r3, [r0] // *r0 ++ - add r3, r3, #1 - str r3, [r0] - - /* Found a match. Jump to .host. */ - blx r5 - b VG_(run_innerloop__dispatch_profiled) - /*NOTREACHED*/ - -/*----------------------------------------------------*/ -/*--- exit points ---*/ -/*----------------------------------------------------*/ - -gsp_changed: - // r0 = next guest addr (R15T), r8 = modified gsp - /* Someone messed with the gsp. Have to - defer to scheduler to resolve this. dispatch ctr - is not yet decremented, so no need to increment. */ - /* R15T is NOT up to date here. First, need to write - r0 back to R15T, but without trashing r8 since - that holds the value we want to return to the scheduler. - Hence use r1 transiently for the guest state pointer. */ - ldr r1, [sp, #0] - str r0, [r1, #OFFSET_arm_R15T] - mov r0, r8 // "return modified gsp" - b run_innerloop_exit - /*NOTREACHED*/ - -counter_is_zero: - /* R15T is up to date here */ - /* Back out increment of the dispatch ctr */ - ldr r1, =VG_(dispatch_ctr) - ldr r2, [r1] - add r2, r2, #1 - str r2, [r1] - mov r0, #VG_TRC_INNER_COUNTERZERO - b run_innerloop_exit - /*NOTREACHED*/ - -fast_lookup_failed: - /* R15T is up to date here */ - /* Back out increment of the dispatch ctr */ - ldr r1, =VG_(dispatch_ctr) - ldr r2, [r1] - add r2, r2, #1 - str r2, [r1] - mov r0, #VG_TRC_INNER_FASTMISS - b run_innerloop_exit - /*NOTREACHED*/ - -/* All exits from the dispatcher go through here. %r0 holds - the return value. -*/ -run_innerloop_exit: - /* We're leaving. Check that nobody messed with - FPSCR in ways we don't expect. */ - fmrx r4, fpscr - bic r4, #0xF8000000 /* mask out NZCV and QC */ - bic r4, #0x0000009F /* mask out IDC,IXC,UFC,OFC,DZC,IOC */ - cmp r4, #0 - bne invariant_violation - b run_innerloop_exit_REALLY - -invariant_violation: - mov r0, #VG_TRC_INVARIANT_FAILED - b run_innerloop_exit_REALLY - -run_innerloop_exit_REALLY: - add sp, sp, #8 - pop {r4, r5, r6, r7, r8, r9, fp, pc} - -.size VG_(run_innerloop), .-VG_(run_innerloop) - - -/*------------------------------------------------------------*/ -/*--- ---*/ -/*--- A special dispatcher, for running no-redir ---*/ -/*--- translations. Just runs the given translation once. ---*/ -/*--- ---*/ -/*------------------------------------------------------------*/ - -/* signature: -void VG_(run_a_noredir_translation) ( UWord* argblock ); -*/ - -/* Run a no-redir translation. 
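The amd64 and ARM VG_(disp_cp_xindir) above (and the x86 one later in this patch) perform the same fast translation-cache probe. As a C sketch: the two-word entry layout is inferred from the .guest/.host comments and the shl $4 / LSL #3 / scale-8 addressing, and the helper names are illustrative only.

    typedef struct { Addr guest; Addr host; } FastCacheEntry;  /* assumed layout */
    extern FastCacheEntry VG_(tt_fast)[];

    static void xindir_sketch ( Addr next_guest )
    {
       /* amd64/x86 index with the address itself; ARM first shifts out
          the Thumb bit (LSR #1), as in the code above. */
       UWord e = (UWord)next_guest & VG_TT_FAST_MASK;
       if (VG_(tt_fast)[e].guest == next_guest)
          jump_to_host( VG_(tt_fast)[e].host );          /* hit: stay in the cache */
       else
          return_two_words( VG_TRC_INNER_FASTMISS, 0 );  /* miss: back to C        */
    }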
argblock points to 4 UWords, 2 to carry args - and 2 to carry results: - 0: input: ptr to translation - 1: input: ptr to guest state - 2: output: next guest PC - 3: output: guest state pointer afterwards (== thread return code) -*/ -.global VG_(run_a_noredir_translation) -VG_(run_a_noredir_translation): - push {r0,r1 /* EABI compliance */, r4-r12, lr} - ldr r8, [r0, #4] - mov lr, pc - ldr pc, [r0, #0] - - pop {r1} - str r0, [r1, #8] - str r8, [r1, #12] - pop {r1/*EABI compliance*/,r4-r12, pc} - -.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation) + // jump to host if lookup succeeded + bxeq r5 + + /* otherwise the fast lookup failed */ + /* RM ME -- stats only */ + movw r1, #:lower16:vgPlain_stats__n_xindir_misses + movt r1, #:upper16:vgPlain_stats__n_xindir_misses + ldr r2, [r1, #0] + adds r2, r2, #1 + str r2, [r1, #0] + ldr r2, [r1, #4] + adc r2, r2, #0 + str r2, [r1, #4] + + mov r1, #VG_TRC_INNER_FASTMISS + mov r2, #0 + b postamble + +/* ------ Assisted jump ------ */ +.global VG_(disp_cp_xassisted) +VG_(disp_cp_xassisted): + /* r8 contains the TRC */ + mov r1, r8 + mov r2, #0 + b postamble + +/* ------ Event check failed ------ */ +.global VG_(disp_cp_evcheck_fail) +VG_(disp_cp_evcheck_fail): + mov r1, #VG_TRC_INNER_COUNTERZERO + mov r2, #0 + b postamble + + +.size VG_(disp_run_translations), .-VG_(disp_run_translations) /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",%progbits diff --git a/coregrind/m_dispatch/dispatch-x86-linux.S b/coregrind/m_dispatch/dispatch-x86-linux.S index 3e13ba65e9..7db1de1e2a 100644 --- a/coregrind/m_dispatch/dispatch-x86-linux.S +++ b/coregrind/m_dispatch/dispatch-x86-linux.S @@ -45,20 +45,27 @@ /*------------------------------------------------------------*/ /*----------------------------------------------------*/ -/*--- Preamble (set everything up) ---*/ +/*--- Entry and preamble (set everything up) ---*/ /*----------------------------------------------------*/ /* signature: -UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); +UWord VG_(disp_run_translations)( UWord* two_words, + void* guest_state, + Addr host_addr ); */ .text -.globl VG_(run_innerloop) -.type VG_(run_innerloop), @function -VG_(run_innerloop): - /* 4(%esp) holds guest_state */ - /* 8(%esp) holds do_profiling */ - - /* ----- entry point to VG_(run_innerloop) ----- */ +.globl VG_(disp_run_translations) +.type VG_(disp_run_translations), @function +VG_(disp_run_translations): + /* 0(%esp) holds our return address. */ + /* 4(%esp) holds two_words */ + /* 8(%esp) holds guest_state */ + /* 12(%esp) holds host_addr */ + + /* The preamble */ + + /* Save integer registers, since this is a pseudo-function. */ + pushl %eax pushl %ebx pushl %ecx pushl %edx @@ -66,14 +73,11 @@ VG_(run_innerloop): pushl %edi pushl %ebp - /* 28(%esp) holds guest_state */ - /* 32(%esp) holds do_profiling */ + /* 28+4(%esp) holds two_words */ + /* 28+8(%esp) holds guest_state */ + /* 28+12(%esp) holds host_addr */ - /* Set up the guest state pointer */ - movl 28(%esp), %ebp - - /* fetch %EIP into %eax */ - movl OFFSET_x86_EIP(%ebp), %eax + /* Get the host CPU in the state expected by generated code. */ /* set host FPU control word to the default mode expected by VEX-generated code. 
See comments in libvex.h for @@ -93,151 +97,32 @@ VG_(run_innerloop): L1: /* set dir flag to known value */ cld - - /* fall into main loop (the right one) */ - cmpl $0, 32(%esp) /* do_profiling */ - je VG_(run_innerloop__dispatch_unassisted_unprofiled) - jmp VG_(run_innerloop__dispatch_unassisted_profiled) - /*NOTREACHED*/ -/*----------------------------------------------------*/ -/*--- NO-PROFILING (standard) dispatcher ---*/ -/*----------------------------------------------------*/ - -.align 16 -.global VG_(run_innerloop__dispatch_unassisted_unprofiled) -VG_(run_innerloop__dispatch_unassisted_unprofiled): - /* AT ENTRY: %eax is next guest addr, %ebp is the - unmodified guest state ptr */ - - /* save the jump address in the guest state */ - movl %eax, OFFSET_x86_EIP(%ebp) - - /* Are we out of timeslice? If yes, defer to scheduler. */ - subl $1, VG_(dispatch_ctr) - jz counter_is_zero - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $ VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - /* generated code should run, then jump back to either - VG_(run_innerloop__dispatch_unassisted_unprofiled) or - VG_(run_innerloop__dispatch_assisted_unprofiled). */ - /*NOTREACHED*/ - -.align 16 -.global VG_(run_innerloop__dispatch_assisted_unprofiled) -VG_(run_innerloop__dispatch_assisted_unprofiled): - /* AT ENTRY: %eax is next guest addr, %ebp is the - modified guest state ptr */ - /* We know the guest state pointer has been modified. - So jump directly to gsp_changed. */ - jmp gsp_changed - ud2 - /*NOTREACHED*/ - -/*----------------------------------------------------*/ -/*--- PROFILING dispatcher (can be much slower) ---*/ -/*----------------------------------------------------*/ - -.align 16 -.global VG_(run_innerloop__dispatch_unassisted_profiled) -VG_(run_innerloop__dispatch_unassisted_profiled): - /* AT ENTRY: %eax is next guest addr, %ebp is the - unmodified guest state ptr */ - - /* save the jump address in the guest state */ - movl %eax, OFFSET_x86_EIP(%ebp) - - /* Are we out of timeslice? If yes, defer to scheduler. */ - subl $1, VG_(dispatch_ctr) - jz counter_is_zero - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $ VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* increment bb profile counter */ - /* note: innocuous as this sounds, it causes a huge amount more - stress on D1 and significantly slows everything down. */ - movl VG_(tt_fastN)(,%ebx,4), %edx - /* Use "addl $1", not "incl", to avoid partial-flags stall on P4 */ - addl $1, (%edx) - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - /* generated code should run, then jump back to either - VG_(run_innerloop__dispatch_unassisted_profiled) or - VG_(run_innerloop__dispatch_assisted_profiled). */ - /*NOTREACHED*/ - -.align 16 -.global VG_(run_innerloop__dispatch_assisted_profiled) -VG_(run_innerloop__dispatch_assisted_profiled): - /* AT ENTRY: %eax is next guest addr, %ebp is the - modified guest state ptr */ - /* We know the guest state pointer has been modified. - So jump directly to gsp_changed. 
*/ - jmp gsp_changed - ud2 + /* Set up the guest state pointer */ + movl 28+8(%esp), %ebp + + /* and jump into the code cache. Chained translations in + the code cache run, until for whatever reason, they can't + continue. When that happens, the translation in question + will jump (or call) to one of the continuation points + VG_(cp_...) below. */ + jmpl *28+12(%esp) /*NOTREACHED*/ /*----------------------------------------------------*/ -/*--- exit points ---*/ +/*--- Postamble and exit. ---*/ /*----------------------------------------------------*/ -gsp_changed: - /* Someone messed with the gsp. Have to - defer to scheduler to resolve this. dispatch ctr - is not yet decremented, so no need to increment. */ - /* %EIP is NOT up to date here. First, need to write - %eax back to %EIP, but without trashing %ebp since - that holds the value we want to return to the scheduler. - Hence use %esi transiently for the guest state pointer. */ - movl 28(%esp), %esi - movl %eax, OFFSET_x86_EIP(%esi) - movl %ebp, %eax - jmp run_innerloop_exit - /*NOTREACHED*/ - -counter_is_zero: - /* %EIP is up to date here */ - /* back out decrement of the dispatch counter */ - addl $1, VG_(dispatch_ctr) - movl $ VG_TRC_INNER_COUNTERZERO, %eax - jmp run_innerloop_exit - /*NOTREACHED*/ - -fast_lookup_failed: - /* %EIP is up to date here */ - /* back out decrement of the dispatch counter */ - addl $1, VG_(dispatch_ctr) - movl $ VG_TRC_INNER_FASTMISS, %eax - jmp run_innerloop_exit - /*NOTREACHED*/ +postamble: + /* At this point, %eax and %edx contain two + words to be returned to the caller. %eax + holds a TRC value, and %edx optionally may + hold another word (for CHAIN_ME exits, the + address of the place to patch.) */ - - -/* All exits from the dispatcher go through here. %eax holds - the return value. -*/ -run_innerloop_exit: - /* We're leaving. Check that nobody messed with - %mxcsr or %fpucw. We can't mess with %eax here as it - holds the tentative return value, but any other is OK. */ + /* We're leaving. Check that nobody messed with %mxcsr + or %fpucw. We can't mess with %eax or %edx here as they + holds the tentative return value, but any others are OK. */ #if !defined(ENABLE_INNER) /* This check fails for self-hosting, so skip in that case */ pushl $0 @@ -246,7 +131,7 @@ run_innerloop_exit: popl %esi /* get rid of the word without trashing %eflags */ jnz invariant_violation #endif - cmpl $0, VG_(machine_x86_have_mxcsr) +# cmpl $0, VG_(machine_x86_have_mxcsr) jz L2 pushl $0 stmxcsr (%esp) @@ -255,72 +140,107 @@ run_innerloop_exit: popl %esi jnz invariant_violation L2: /* otherwise we're OK */ - jmp run_innerloop_exit_REALLY - + jmp remove_frame invariant_violation: - movl $ VG_TRC_INVARIANT_FAILED, %eax - jmp run_innerloop_exit_REALLY - -run_innerloop_exit_REALLY: + movl $VG_TRC_INVARIANT_FAILED, %eax + movl $0, %edx + +remove_frame: + /* Stash return values */ + movl 28+4(%esp), %edi /* two_words */ + movl %eax, 0(%edi) + movl %edx, 4(%edi) + /* Restore int regs and return. */ popl %ebp popl %edi popl %esi popl %edx popl %ecx popl %ebx + popl %eax ret -.size VG_(run_innerloop), .-VG_(run_innerloop) + +/*----------------------------------------------------*/ +/*--- Continuation points ---*/ +/*----------------------------------------------------*/ +/* ------ Chain me to slow entry point ------ */ +.global VG_(disp_cp_chain_me_to_slowEP) +VG_(disp_cp_chain_me_to_slowEP): + /* We got called. The return address indicates + where the patching needs to happen. 
Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_S, RA) */ + movl $VG_TRC_CHAIN_ME_TO_SLOW_EP, %eax + popl %edx + /* 5 = movl $VG_(disp_chain_me_to_slowEP), %edx; + 2 = call *%edx */ + subl $5+2, %edx + jmp postamble + +/* ------ Chain me to fast entry point ------ */ +.global VG_(disp_cp_chain_me_to_fastEP) +VG_(disp_cp_chain_me_to_fastEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_F, RA) */ + movl $VG_TRC_CHAIN_ME_TO_FAST_EP, %eax + popl %edx + /* 5 = movl $VG_(disp_chain_me_to_fastEP), %edx; + 2 = call *%edx */ + subl $5+2, %edx + jmp postamble + +/* ------ Indirect but boring jump ------ */ +.global VG_(disp_cp_xindir) +VG_(disp_cp_xindir): + /* Where are we going? */ + movl OFFSET_x86_EIP(%ebp), %eax -/*------------------------------------------------------------*/ -/*--- ---*/ -/*--- A special dispatcher, for running no-redir ---*/ -/*--- translations. Just runs the given translation once. ---*/ -/*--- ---*/ -/*------------------------------------------------------------*/ + /* RM ME -- stats only */ + addl $1, vgPlain_stats__n_xindirs + adcl $0, vgPlain_stats__n_xindirs+4 + + /* try a fast lookup in the translation cache */ + movl %eax, %ebx /* next guest addr */ + andl $VG_TT_FAST_MASK, %ebx /* entry# */ + movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ + movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ + cmpl %eax, %esi + jnz fast_lookup_failed + + /* Found a match. Jump to .host. */ + jmp *%edi + ud2 /* persuade insn decoders not to speculate past here */ -/* signature: -void VG_(run_a_noredir_translation) ( UWord* argblock ); -*/ +fast_lookup_failed: + /* RM ME -- stats only */ + addl $1, vgPlain_stats__n_xindir_misses + adcl $0, vgPlain_stats__n_xindir_misses+4 + + movl $VG_TRC_INNER_FASTMISS, %eax + movl $0, %edx + jmp postamble + +/* ------ Assisted jump ------ */ +.global VG_(disp_cp_xassisted) +VG_(disp_cp_xassisted): + /* %ebp contains the TRC */ + movl %ebp, %eax + movl $0, %edx + jmp postamble + +/* ------ Event check failed ------ */ +.global VG_(disp_cp_evcheck_fail) +VG_(disp_cp_evcheck_fail): + movl $VG_TRC_INNER_COUNTERZERO, %eax + movl $0, %edx + jmp postamble + + +.size VG_(disp_run_translations), .-VG_(disp_run_translations) -/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args - and 2 to carry results: - 0: input: ptr to translation - 1: input: ptr to guest state - 2: output: next guest PC - 3: output: guest state pointer afterwards (== thread return code) -*/ -.align 16 -.global VG_(run_a_noredir_translation) -.type VG_(run_a_noredir_translation), @function -VG_(run_a_noredir_translation): - /* Save callee-saves regs */ - pushl %esi - pushl %edi - pushl %ebp - pushl %ebx - - movl 20(%esp), %edi /* %edi = argblock */ - movl 4(%edi), %ebp /* argblock[1] */ - jmp *0(%edi) /* argblock[0] */ - /*NOTREACHED*/ - ud2 - /* If the translation has been correctly constructed, we - should resume at the the following label. 
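Each port steps its return address back over a fixed-size "load helper address, then call it" sequence. Collecting the three constants used in the continuation points above (the macro names here are illustrative, not part of the patch):

    #define CHAIN_ME_SZB_AMD64  (10 + 3)     /* movabsq $helper, %r11 ; call *%r11 */
    #define CHAIN_ME_SZB_X86    ( 5 + 2)     /* movl    $helper, %edx ; call *%edx */
    #define CHAIN_ME_SZB_ARM    (4 + 4 + 4)  /* movw/movt r12, helper ; blx  r12   */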
*/ -.global VG_(run_a_noredir_translation__return_point) -VG_(run_a_noredir_translation__return_point): - movl 20(%esp), %edi - movl %eax, 8(%edi) /* argblock[2] */ - movl %ebp, 12(%edi) /* argblock[3] */ - - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation) - - /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",@progbits diff --git a/coregrind/m_errormgr.c b/coregrind/m_errormgr.c index 44976a804b..788041b443 100644 --- a/coregrind/m_errormgr.c +++ b/coregrind/m_errormgr.c @@ -966,7 +966,8 @@ void VG_(show_all_errors) ( Int verbosity, Bool xml ) if ((i+1 == VG_(clo_dump_error))) { StackTrace ips = VG_(get_ExeContext_StackTrace)(p_min->where); - VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/, + VG_(translate) ( NULL/*caused_discardP*/, + 0 /* dummy ThreadId; irrelevant due to debugging*/, ips[0], /*debugging*/True, 0xFE/*verbosity*/, /*bbs_done*/0, /*allow redir?*/True); diff --git a/coregrind/m_gdbserver/server.c b/coregrind/m_gdbserver/server.c index 2736419c8c..8e58589919 100644 --- a/coregrind/m_gdbserver/server.c +++ b/coregrind/m_gdbserver/server.c @@ -310,7 +310,8 @@ int handle_gdb_valgrind_command (char* mon, OutputSink* sink_wanted_at_return) address = thumb_pc (address); # endif - VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/, + VG_(translate) ( NULL/*caused_discardP*/, + 0 /* dummy ThreadId; irrelevant due to debugging*/, address, /*debugging*/True, (Int) vex_verbosity, diff --git a/coregrind/m_libcproc.c b/coregrind/m_libcproc.c index 4462b6a2fc..de16c53bde 100644 --- a/coregrind/m_libcproc.c +++ b/coregrind/m_libcproc.c @@ -716,6 +716,59 @@ void VG_(do_atfork_child)(ThreadId tid) } +/* --------------------------------------------------------------------- + icache invalidation + ------------------------------------------------------------------ */ + +void VG_(invalidate_icache) ( void *ptr, SizeT nbytes ) +{ +# if defined(VGA_ppc32) || defined(VGA_ppc64) + Addr startaddr = (Addr) ptr; + Addr endaddr = startaddr + nbytes; + Addr cls; + Addr addr; + VexArchInfo vai; + + if (nbytes == 0) return; + vg_assert(nbytes > 0); + + VG_(machine_get_VexArchInfo)( NULL, &vai ); + cls = vai.ppc_cache_line_szB; + + /* Stay sane .. */ + vg_assert(cls == 32 || cls == 64 || cls == 128); + + startaddr &= ~(cls - 1); + for (addr = startaddr; addr < endaddr; addr += cls) { + __asm__ __volatile__("dcbst 0,%0" : : "r" (addr)); + } + __asm__ __volatile__("sync"); + for (addr = startaddr; addr < endaddr; addr += cls) { + __asm__ __volatile__("icbi 0,%0" : : "r" (addr)); + } + __asm__ __volatile__("sync; isync"); + +# elif defined(VGA_x86) + /* no need to do anything, hardware provides coherence */ + +# elif defined(VGA_amd64) + /* no need to do anything, hardware provides coherence */ + +# elif defined(VGA_s390x) + /* no need to do anything, hardware provides coherence */ + +# elif defined(VGP_arm_linux) + /* ARM cache flushes are privileged, so we must defer to the kernel. 
*/ + Addr startaddr = (Addr) ptr; + Addr endaddr = startaddr + nbytes; + VG_(do_syscall2)(__NR_ARM_cacheflush, startaddr, endaddr); + +# else +# error "Unknown ARCH" +# endif +} + + /*--------------------------------------------------------------------*/ /*--- end ---*/ /*--------------------------------------------------------------------*/ diff --git a/coregrind/m_main.c b/coregrind/m_main.c index 094e884dfb..3bddb47926 100644 --- a/coregrind/m_main.c +++ b/coregrind/m_main.c @@ -1373,7 +1373,8 @@ void show_BB_profile ( BBProfEntry tops[], UInt n_tops, ULong score_total ) score_here, buf_here, tops[r].addr, name ); VG_(printf)("\n"); VG_(discard_translations)(tops[r].addr, 1, "bb profile"); - VG_(translate)(0, tops[r].addr, True, VG_(clo_profile_flags), 0, True); + VG_(translate)(NULL/*caused_discardP*/, + 0, tops[r].addr, True, VG_(clo_profile_flags), 0, True); VG_(printf)("=-=-=-=-=-=-=-=-=-=-=-=-=-= end BB rank %d " "=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n", r); } @@ -1881,13 +1882,13 @@ Int valgrind_main ( Int argc, HChar **argv, HChar **envp ) VG_(printf)("pid=%d, entering delay loop\n", VG_(getpid)()); # if defined(VGP_x86_linux) - iters = 5; + iters = 10; # elif defined(VGP_amd64_linux) || defined(VGP_ppc64_linux) iters = 10; # elif defined(VGP_ppc32_linux) iters = 5; # elif defined(VGP_arm_linux) - iters = 1; + iters = 5; # elif defined(VGP_s390x_linux) iters = 10; # elif defined(VGO_darwin) diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c index be2ae59066..349b56b355 100644 --- a/coregrind/m_scheduler/scheduler.c +++ b/coregrind/m_scheduler/scheduler.c @@ -55,7 +55,23 @@ the OS handles threading and signalling are abstracted away and implemented elsewhere. [Some of the functions have worked their way back for the moment, until we do an OS port in earnest...] - */ +*/ + +/* FIXME tchaining tests: + - extensive spinrounds + - with sched quantum = 1 -- check that handle_noredir_jump + doesn't return with INNER_COUNTERZERO + other: + - out of date comment w.r.t. bit 0 set in libvex_trc_values.h + - can VG_TRC_BORING still happen? if not, rm + - memory leaks in m_transtab (InEdgeArr/OutEdgeArr leaking?) + - move do_cacheflush out of m_transtab + - more economical unchaining when nuking an entire sector + - ditto w.r.t. cache flushes + - add comments about caused_discard to handle_chain_me() + - verify case of 2 paths from A to B + - check -- is IP_AT_SYSCALL still right? +*/ #include "pub_core_basics.h" #include "pub_core_debuglog.h" @@ -108,9 +124,6 @@ /* If False, a fault is Valgrind-internal (ie, a bug) */ Bool VG_(in_generated_code) = False; -/* Counts downwards in VG_(run_innerloop). */ -UInt VG_(dispatch_ctr); - /* 64-bit counter for the number of basic blocks done. */ static ULong bbs_done = 0; @@ -130,6 +143,9 @@ static void mostly_clear_thread_record ( ThreadId tid ); static ULong n_scheduling_events_MINOR = 0; static ULong n_scheduling_events_MAJOR = 0; +ULong VG_(stats__n_xindirs) = 0; +ULong VG_(stats__n_xindir_misses) = 0; + /* Sanity checking counts. 
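VG_(invalidate_icache) above gives the core a single place to make freshly written host code visible to instruction fetch. The call sites are not part of this hunk (see the FIXME item about moving do_cacheflush out of m_transtab), so the following usage is only a sketch with illustrative names:

    /* After rewriting host code in place, e.g. when (un)chaining a
       patch site, flush the modified range.  On x86/amd64/s390x this
       is a no-op because the hardware keeps I- and D-sides coherent. */
    patch_in_place( place_to_patch, new_bytes, len );   /* hypothetical patcher */
    VG_(invalidate_icache)( place_to_patch, len );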
*/ static UInt sanity_fast_count = 0; static UInt sanity_slow_count = 0; @@ -137,7 +153,12 @@ static UInt sanity_slow_count = 0; void VG_(print_scheduler_stats)(void) { VG_(message)(Vg_DebugMsg, - "scheduler: %'llu jumps (bb entries).\n", bbs_done ); + "scheduler: %'llu event checks.\n", bbs_done ); + VG_(message)(Vg_DebugMsg, + "scheduler: %'llu indir transfers, %'llu misses (1 in %llu)\n", + VG_(stats__n_xindirs), VG_(stats__n_xindir_misses), + VG_(stats__n_xindirs) / (VG_(stats__n_xindir_misses) + ? VG_(stats__n_xindir_misses) : 1)); VG_(message)(Vg_DebugMsg, "scheduler: %'llu/%'llu major/minor sched events.\n", n_scheduling_events_MAJOR, n_scheduling_events_MINOR); @@ -700,14 +721,34 @@ static void do_pre_run_checks ( ThreadState* tst ) vg_assert(sz_spill == LibVEX_N_SPILL_BYTES); vg_assert(a_vex + 3 * sz_vex == a_spill); +# if defined(VGA_x86) + /* x86 XMM regs must form an array, ie, have no holes in + between. */ + vg_assert( + (offsetof(VexGuestX86State,guest_XMM7) + - offsetof(VexGuestX86State,guest_XMM0)) + == (8/*#regs*/-1) * 16/*bytes per reg*/ + ); + vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestX86State,guest_XMM0))); + vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestX86State,guest_FPREG))); + vg_assert(8 == offsetof(VexGuestX86State,guest_EAX)); + vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EAX))); + vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EIP))); +# endif + # if defined(VGA_amd64) - /* x86/amd64 XMM regs must form an array, ie, have no - holes in between. */ + /* amd64 XMM regs must form an array, ie, have no holes in + between. */ vg_assert( (offsetof(VexGuestAMD64State,guest_XMM16) - offsetof(VexGuestAMD64State,guest_XMM0)) == (17/*#regs*/-1) * 16/*bytes per reg*/ ); + vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestAMD64State,guest_XMM0))); + vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_FPREG))); + vg_assert(16 == offsetof(VexGuestAMD64State,guest_RAX)); + vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RAX))); + vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RIP))); # endif # if defined(VGA_ppc32) || defined(VGA_ppc64) @@ -724,10 +765,10 @@ static void do_pre_run_checks ( ThreadState* tst ) # if defined(VGA_arm) /* arm guest_state VFP regs must be 8 byte aligned for - loads/stores. */ - vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D0)); - vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D0)); - vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow2.guest_D0)); + loads/stores. Let's use 16 just to be on the safe side. */ + vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_D0)); + vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_D0)); + vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_D0)); /* be extra paranoid .. */ vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D1)); vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D1)); @@ -755,30 +796,82 @@ void VG_(force_vgdb_poll) ( void ) } /* Run the thread tid for a while, and return a VG_TRC_* value - indicating why VG_(run_innerloop) stopped. */ -static UInt run_thread_for_a_while ( ThreadId tid ) + indicating why VG_(disp_run_translations) stopped, and possibly an + auxiliary word. Also, only allow the thread to run for at most + *dispatchCtrP events. If (as is the normal case) use_alt_host_addr + is False, we are running ordinary redir'd translations, and we + should therefore start by looking up the guest next IP in TT. 
If + it is True then we ignore the guest next IP and just run from + alt_host_addr, which presumably points at host code for a no-redir + translation. + + Return results are placed in two_words. two_words[0] is set to the + TRC. In the case where that is VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP, + the address to patch is placed in two_words[1]. +*/ +static +void run_thread_for_a_while ( /*OUT*/HWord* two_words, + /*MOD*/Int* dispatchCtrP, + ThreadId tid, + HWord alt_host_addr, + Bool use_alt_host_addr ) { - volatile UWord jumped; - volatile ThreadState* tst = NULL; /* stop gcc complaining */ - volatile UInt trc; - volatile Int dispatch_ctr_SAVED; - volatile Int done_this_time; + volatile HWord jumped = 0; + volatile ThreadState* tst = NULL; /* stop gcc complaining */ + volatile UInt trc = 0; + volatile Int done_this_time = 0; + volatile HWord host_code_addr = 0; /* Paranoia */ vg_assert(VG_(is_valid_tid)(tid)); vg_assert(VG_(is_running_thread)(tid)); vg_assert(!VG_(is_exiting)(tid)); + vg_assert(*dispatchCtrP > 0); tst = VG_(get_ThreadState)(tid); do_pre_run_checks( (ThreadState*)tst ); /* end Paranoia */ - trc = 0; - dispatch_ctr_SAVED = VG_(dispatch_ctr); + /* Clear return area. */ + two_words[0] = two_words[1] = 0; + + /* Figure out where we're starting from. */ + if (use_alt_host_addr) { + /* unusual case -- no-redir translation */ + host_code_addr = alt_host_addr; + vg_assert(host_code_addr != 0); /* implausible */ + } else { + /* normal case -- redir translation */ + AddrH res = 0; + Bool found = VG_(search_transtab)( + &res, NULL, NULL, + (Addr64)tst->arch.vex.VG_INSTR_PTR, + True/*upd cache -- necessary?*/ + ); + if (found) { + host_code_addr = res; + vg_assert(host_code_addr != 0); /* implausible */ + } else { + host_code_addr = 0; + } + } + + /* At this point, either host_code_addr is nonzero, in which case + we're OK, or it's zero, in which case we know that we intended + to start at a normal redir translation, but it was not found. + In which case we can return now claiming it's not findable. */ + if (host_code_addr == 0) { + two_words[0] = VG_TRC_INNER_FASTMISS; /* hmm, is that right? */ + return; + } /* there should be no undealt-with signals */ //vg_assert(VG_(threads)[tid].siginfo.si_signo == 0); + /* Set up event counter stuff for the run. */ + tst->arch.vex.host_EvC_COUNTER = *dispatchCtrP; + tst->arch.vex.host_EvC_FAILADDR = (HWord)&VG_(disp_cp_evcheck_fail); + if (0) { vki_sigset_t m; Int i, err = VG_(sigprocmask)(VKI_SIG_SETMASK, NULL, &m); @@ -790,6 +883,8 @@ static UInt run_thread_for_a_while ( ThreadId tid ) VG_(printf)("\n"); } + /* Set up return-value area. */ + // Tell the tool this thread is about to run client code VG_TRACK( start_client_code, tid, bbs_done ); @@ -799,26 +894,37 @@ static UInt run_thread_for_a_while ( ThreadId tid ) SCHEDSETJMP( tid, jumped, - trc = (UInt)VG_(run_innerloop)( (void*)&tst->arch.vex, - VG_(clo_profile_flags) > 0 ? 1 : 0 ) + trc = (UInt)VG_(disp_run_translations)( + two_words, + (void*)&tst->arch.vex, + host_code_addr + ) ); vg_assert(VG_(in_generated_code) == True); VG_(in_generated_code) = False; - if (jumped != (UWord)0) { + if (jumped != (HWord)0) { /* We get here if the client took a fault that caused our signal handler to longjmp. 
*/ vg_assert(trc == 0); - trc = VG_TRC_FAULT_SIGNAL; + two_words[0] = VG_TRC_FAULT_SIGNAL; + two_words[1] = 0; block_signals(); } - done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 0; + vg_assert((Int)tst->arch.vex.host_EvC_COUNTER >= -1); + vg_assert(tst->arch.vex.host_EvC_FAILADDR + == (HWord)&VG_(disp_cp_evcheck_fail)); + + done_this_time = *dispatchCtrP - ((Int)tst->arch.vex.host_EvC_COUNTER + 1); vg_assert(done_this_time >= 0); bbs_done += (ULong)done_this_time; + *dispatchCtrP -= done_this_time; + vg_assert(*dispatchCtrP >= 0); + // Tell the tool this thread has stopped running client code VG_TRACK( stop_client_code, tid, bbs_done ); @@ -832,89 +938,16 @@ static UInt run_thread_for_a_while ( ThreadId tid ) VG_(gdbserver) (tid); } - return trc; -} - - -/* Run a no-redir translation just once, and return the resulting - VG_TRC_* value. */ -static UInt run_noredir_translation ( Addr hcode, ThreadId tid ) -{ - volatile UWord jumped; - volatile ThreadState* tst; - volatile UWord argblock[4]; - volatile UInt retval; - - /* Paranoia */ - vg_assert(VG_(is_valid_tid)(tid)); - vg_assert(VG_(is_running_thread)(tid)); - vg_assert(!VG_(is_exiting)(tid)); - - tst = VG_(get_ThreadState)(tid); - do_pre_run_checks( (ThreadState*)tst ); - /* end Paranoia */ - -# if defined(VGA_ppc32) || defined(VGA_ppc64) - /* I don't think we need to clear this thread's guest_RESVN here, - because we can only get here if run_thread_for_a_while() has - been used immediately before, on this same thread. */ -# endif - - /* There can be 3 outcomes from VG_(run_a_noredir_translation): - - - a signal occurred and the sighandler longjmp'd. Then both [2] - and [3] are unchanged - hence zero. - - - translation ran normally, set [2] (next guest IP) and set [3] - to whatever [1] was beforehand, indicating a normal (boring) - jump to the next block. - - - translation ran normally, set [2] (next guest IP) and set [3] - to something different from [1] beforehand, which indicates a - TRC_ value. - */ - argblock[0] = (UWord)hcode; - argblock[1] = (UWord)&VG_(threads)[tid].arch.vex; - argblock[2] = 0; /* next guest IP is written here */ - argblock[3] = 0; /* guest state ptr afterwards is written here */ - - // Tell the tool this thread is about to run client code - VG_TRACK( start_client_code, tid, bbs_done ); - - vg_assert(VG_(in_generated_code) == False); - VG_(in_generated_code) = True; - - SCHEDSETJMP( - tid, - jumped, - VG_(run_a_noredir_translation)( &argblock[0] ) - ); - - VG_(in_generated_code) = False; - - if (jumped != (UWord)0) { - /* We get here if the client took a fault that caused our signal - handler to longjmp. */ - vg_assert(argblock[2] == 0); /* next guest IP was not written */ - vg_assert(argblock[3] == 0); /* trc was not written */ - block_signals(); - retval = VG_TRC_FAULT_SIGNAL; + /* TRC value and possible auxiliary patch-address word are already + in two_words[0] and [1] respectively, as a result of the call to + VG_(run_innerloop). */ + /* Stay sane .. 
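The generated-code half of the event-check protocol is emitted by VEX and does not appear in this diff; from the two fields set up before the run, it presumably reduces to something like the following at the head of each translation:

    if (--guest_state->host_EvC_COUNTER < 0)       /* one event check per block */
       jump_to( guest_state->host_EvC_FAILADDR );  /* == VG_(disp_cp_evcheck_fail),
                                                      which returns
                                                      VG_TRC_INNER_COUNTERZERO */

That would explain why the counter is allowed to come back as -1 and why done_this_time above is computed with a +1 correction.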
*/ + if (two_words[0] == VG_TRC_CHAIN_ME_TO_SLOW_EP + || two_words[0] == VG_TRC_CHAIN_ME_TO_FAST_EP) { + vg_assert(two_words[1] != 0); /* we have a legit patch addr */ } else { - /* store away the guest program counter */ - VG_(set_IP)( tid, argblock[2] ); - if (argblock[3] == argblock[1]) - /* the guest state pointer afterwards was unchanged */ - retval = VG_TRC_BORING; - else - retval = (UInt)argblock[3]; + vg_assert(two_words[1] == 0); /* nobody messed with it */ } - - bbs_done++; - - // Tell the tool this thread has stopped running client code - VG_TRACK( stop_client_code, tid, bbs_done ); - - return retval; } @@ -929,13 +962,16 @@ static void handle_tt_miss ( ThreadId tid ) /* Trivial event. Miss in the fast-cache. Do a full lookup for it. */ - found = VG_(search_transtab)( NULL, ip, True/*upd_fast_cache*/ ); + found = VG_(search_transtab)( NULL, NULL, NULL, + ip, True/*upd_fast_cache*/ ); if (UNLIKELY(!found)) { /* Not found; we need to request a translation. */ - if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/, + if (VG_(translate)( NULL/*caused_discardP*/, + tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done, True/*allow redirection*/ )) { - found = VG_(search_transtab)( NULL, ip, True ); - vg_assert2(found, "VG_TRC_INNER_FASTMISS: missing tt_fast entry"); + found = VG_(search_transtab)( NULL, NULL, NULL, + ip, True ); + vg_assert2(found, "handle_tt_miss: missing tt_fast entry"); } else { // If VG_(translate)() fails, it's because it had to throw a @@ -947,6 +983,46 @@ static void handle_tt_miss ( ThreadId tid ) } } +static +void handle_chain_me ( ThreadId tid, void* place_to_chain, Bool toFastEP ) +{ + Bool found = False; + Addr ip = VG_(get_IP)(tid); + UInt to_sNo = (UInt)-1; + UInt to_tteNo = (UInt)-1; + Bool caused_discard = False; + + found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo, + ip, False/*dont_upd_fast_cache*/ ); + if (!found) { + /* Not found; we need to request a translation. */ + if (VG_(translate)( &caused_discard, + tid, ip, /*debug*/False, 0/*not verbose*/, + bbs_done, True/*allow redirection*/ )) { + found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo, + ip, False ); + vg_assert2(found, "handle_chain_me: missing tt_fast entry"); + } else { + // If VG_(translate)() fails, it's because it had to throw a + // signal because the client jumped to a bad address. That + // means that either a signal has been set up for delivery, + // or the thread has been marked for termination. Either + // way, we just need to go back into the scheduler loop. + return; + } + } + vg_assert(found); + vg_assert(to_sNo != -1); + vg_assert(to_tteNo != -1); + + /* So, finally we know where to patch through to. Do the patching + and update the various admin tables that allow it to be undone + in the case that the destination block gets deleted. */ + if (!caused_discard) + VG_(tt_tc_do_chaining)( place_to_chain, + to_sNo, to_tteNo, toFastEP ); +} + static void handle_syscall(ThreadId tid, UInt trc) { ThreadState * volatile tst = VG_(get_ThreadState)(tid); @@ -978,28 +1054,35 @@ static void handle_syscall(ThreadId tid, UInt trc) /* tid just requested a jump to the noredir version of its current program counter. So make up that translation if needed, run it, - and return the resulting thread return code. */ -static UInt/*trc*/ handle_noredir_jump ( ThreadId tid ) + and return the resulting thread return code in two_words[]. */ +static +void handle_noredir_jump ( /*OUT*/HWord* two_words, + /*MOD*/Int* dispatchCtrP, + ThreadId tid ) { + /* Clear return area. 
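The FIXME list at the top of this file asks for a comment on the caused_discard guard in handle_chain_me() above; the point, presumably, is that the guarded call is only safe while the patch site still exists:

    /* If creating the destination's translation forced code to be
       dumped from the translation cache, the block containing
       place_to_chain may itself be gone, so patching that address
       now could write into memory that no longer holds it.  Hence: */
    if (!caused_discard)
       VG_(tt_tc_do_chaining)( place_to_chain, to_sNo, to_tteNo, toFastEP );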
*/ + two_words[0] = two_words[1] = 0; + AddrH hcode = 0; Addr ip = VG_(get_IP)(tid); Bool found = VG_(search_unredir_transtab)( &hcode, ip ); if (!found) { /* Not found; we need to request a translation. */ - if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done, + if (VG_(translate)( NULL/*caused_discardP*/, + tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done, False/*NO REDIRECTION*/ )) { found = VG_(search_unredir_transtab)( &hcode, ip ); vg_assert2(found, "unredir translation missing after creation?!"); - } else { // If VG_(translate)() fails, it's because it had to throw a // signal because the client jumped to a bad address. That // means that either a signal has been set up for delivery, // or the thread has been marked for termination. Either // way, we just need to go back into the scheduler loop. - return VG_TRC_BORING; + two_words[0] = VG_TRC_BORING; + return; } } @@ -1007,8 +1090,10 @@ static UInt/*trc*/ handle_noredir_jump ( ThreadId tid ) vg_assert(found); vg_assert(hcode != 0); - /* Otherwise run it and return the resulting VG_TRC_* value. */ - return run_noredir_translation( hcode, tid ); + /* Otherwise run it and return the resulting VG_TRC_* value. */ + vg_assert(*dispatchCtrP > 0); /* so as to guarantee progress */ + run_thread_for_a_while( two_words, dispatchCtrP, tid, + hcode, True/*use hcode*/ ); } @@ -1020,7 +1105,9 @@ static UInt/*trc*/ handle_noredir_jump ( ThreadId tid ) */ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) { - UInt trc; + /* Holds the remaining size of this thread's "timeslice". */ + Int dispatch_ctr = 0; + ThreadState *tst = VG_(get_ThreadState)(tid); static Bool vgdb_startup_action_done = False; @@ -1079,11 +1166,12 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) vg_assert(VG_(is_running_thread)(tid)); - VG_(dispatch_ctr) = SCHEDULING_QUANTUM + 1; + dispatch_ctr = SCHEDULING_QUANTUM; while (!VG_(is_exiting)(tid)) { - if (VG_(dispatch_ctr) == 1) { + vg_assert(dispatch_ctr >= 0); + if (dispatch_ctr == 0) { /* Our slice is done, so yield the CPU to another thread. On Linux, this doesn't sleep between sleeping and running, @@ -1130,7 +1218,8 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) exceed zero before entering the innerloop. Also also, the decrement is done before the bb is actually run, so you always get at least one decrement even if nothing happens. */ - VG_(dispatch_ctr) = SCHEDULING_QUANTUM + 1; + // FIXME is this right? + dispatch_ctr = SCHEDULING_QUANTUM; /* paranoia ... */ vg_assert(tst->tid == tid); @@ -1142,17 +1231,20 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) if (0) VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs\n", - tid, VG_(dispatch_ctr) - 1 ); + tid, dispatch_ctr - 1 ); - trc = run_thread_for_a_while ( tid ); + HWord trc[2]; /* "two_words" */ + run_thread_for_a_while( &trc[0], + &dispatch_ctr, + tid, 0/*ignored*/, False ); if (VG_(clo_trace_sched) && VG_(clo_verbosity) > 2) { - Char buf[50]; - VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc)); + HChar buf[50]; + VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc[0])); print_sched_event(tid, buf); } - if (trc == VEX_TRC_JMP_NOREDIR) { + if (trc[0] == VEX_TRC_JMP_NOREDIR) { /* If we got a request to run a no-redir version of something, do so now -- handle_noredir_jump just (creates and) runs that one translation. The flip side is that the @@ -1160,20 +1252,61 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) request -- that would be nonsensical. It can, however, return VG_TRC_BORING, which just means keep going as normal. 
*/ - trc = handle_noredir_jump(tid); - vg_assert(trc != VEX_TRC_JMP_NOREDIR); + /* Note that the fact that we need to continue with a + no-redir jump is not recorded anywhere else in this + thread's state. So we *must* execute the block right now + -- we can't fail to execute it and later resume with it, + because by then we'll have forgotten the fact that it + should be run as no-redir, but will get run as a normal + potentially-redir'd, hence screwing up. This really ought + to be cleaned up, by noting in the guest state that the + next block to be executed should be no-redir. Then we can + suspend and resume at any point, which isn't the case at + the moment. */ + handle_noredir_jump( &trc[0], + &dispatch_ctr, + tid ); + vg_assert(trc[0] != VEX_TRC_JMP_NOREDIR); + + /* This can't be allowed to happen, since it means the block + didn't execute, and we have no way to resume-as-noredir + after we get more timeslice. But I don't think it ever + can, since handle_noredir_jump will assert if the counter + is zero on entry. */ + vg_assert(trc[0] != VG_TRC_INNER_COUNTERZERO); + + /* A no-redir translation can't return with a chain-me + request, since chaining in the no-redir cache is too + complex. */ + vg_assert(trc[0] != VG_TRC_CHAIN_ME_TO_SLOW_EP + && trc[0] != VG_TRC_CHAIN_ME_TO_FAST_EP); } - switch (trc) { + switch (trc[0]) { + case VEX_TRC_JMP_BORING: + /* assisted dispatch, no event. Used by no-redir + translations to force return to the scheduler. */ case VG_TRC_BORING: /* no special event, just keep going. */ break; case VG_TRC_INNER_FASTMISS: - vg_assert(VG_(dispatch_ctr) > 1); + vg_assert(dispatch_ctr > 0); handle_tt_miss(tid); break; - + + case VG_TRC_CHAIN_ME_TO_SLOW_EP: { + if (0) VG_(printf)("sched: CHAIN_TO_SLOW_EP: %p\n", (void*)trc[1] ); + handle_chain_me(tid, (void*)trc[1], False); + break; + } + + case VG_TRC_CHAIN_ME_TO_FAST_EP: { + if (0) VG_(printf)("sched: CHAIN_TO_FAST_EP: %p\n", (void*)trc[1] ); + handle_chain_me(tid, (void*)trc[1], True); + break; + } + case VEX_TRC_JMP_CLIENTREQ: do_client_request(tid); break; @@ -1182,7 +1315,7 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) case VEX_TRC_JMP_SYS_INT129: /* x86-darwin */ case VEX_TRC_JMP_SYS_INT130: /* x86-darwin */ case VEX_TRC_JMP_SYS_SYSCALL: /* amd64-linux, ppc32-linux, amd64-darwin */ - handle_syscall(tid, trc); + handle_syscall(tid, trc[0]); if (VG_(clo_sanity_level) > 2) VG_(sanity_check_general)(True); /* sanity-check every syscall */ break; @@ -1195,13 +1328,13 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) before swapping to another. That means that short term spins waiting for hardware to poke memory won't cause a thread swap. */ - if (VG_(dispatch_ctr) > 2000) - VG_(dispatch_ctr) = 2000; + if (dispatch_ctr > 2000) + dispatch_ctr = 2000; break; case VG_TRC_INNER_COUNTERZERO: /* Timeslice is out. Let a new thread be scheduled. 
*/ - vg_assert(VG_(dispatch_ctr) == 1); + vg_assert(dispatch_ctr == 0); break; case VG_TRC_FAULT_SIGNAL: @@ -1346,7 +1479,7 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) default: vg_assert2(0, "VG_(scheduler), phase 3: " - "unexpected thread return code (%u)", trc); + "unexpected thread return code (%u)", trc[0]); /* NOTREACHED */ break; diff --git a/coregrind/m_translate.c b/coregrind/m_translate.c index cd0d269713..795c000e84 100644 --- a/coregrind/m_translate.c +++ b/coregrind/m_translate.c @@ -280,6 +280,7 @@ IRSB* vg_SP_update_pass ( void* closureV, bb->tyenv = deepCopyIRTypeEnv(sb_in->tyenv); bb->next = deepCopyIRExpr(sb_in->next); bb->jumpkind = sb_in->jumpkind; + bb->offsIP = sb_in->offsIP; delta = 0; @@ -1259,14 +1260,18 @@ typedef instead of the normal one. TID is the identity of the thread requesting this translation. -*/ -Bool VG_(translate) ( ThreadId tid, - Addr64 nraddr, - Bool debugging_translation, - Int debugging_verbosity, - ULong bbs_done, - Bool allow_redirection ) + *caused_discardP returns whether or not this translation resulting + in code being dumped from the main translation cache in order to + make space for the new translation. +*/ +Bool VG_(translate) ( /*OUT*/Bool* caused_discardP, + ThreadId tid, + Addr64 nraddr, + Bool debugging_translation, + Int debugging_verbosity, + ULong bbs_done, + Bool allow_redirection ) { Addr64 addr; T_Kind kind; @@ -1280,8 +1285,9 @@ Bool VG_(translate) ( ThreadId tid, VexTranslateResult tres; VgCallbackClosure closure; - /* Make sure Vex is initialised right. */ + if (caused_discardP) *caused_discardP = False; + /* Make sure Vex is initialised right. */ static Bool vex_init_done = False; if (!vex_init_done) { @@ -1348,7 +1354,7 @@ Bool VG_(translate) ( ThreadId tid, } vg_assert(objname); VG_(printf)( - "==== SB %d (exec'd %lld) [tid %d] 0x%llx %s %s+0x%llx\n", + "==== SB %d (evchecks %lld) [tid %d] 0x%llx %s %s+0x%llx\n", VG_(get_bbs_translated)(), bbs_done, (Int)tid, addr, fnname, objname, (ULong)objoff ); @@ -1461,11 +1467,10 @@ Bool VG_(translate) ( ThreadId tid, vta.arch_host = vex_arch; vta.archinfo_host = vex_archinfo; vta.abiinfo_both = vex_abiinfo; + vta.callback_opaque = (void*)&closure; vta.guest_bytes = (UChar*)ULong_to_Ptr(addr); vta.guest_bytes_addr = (Addr64)addr; - vta.callback_opaque = (void*)&closure; vta.chase_into_ok = chase_into_ok; - vta.preamble_function = preamble_fn; vta.guest_extents = &vge; vta.host_bytes = tmpbuf; vta.host_bytes_size = N_TMPBUF; @@ -1486,22 +1491,47 @@ Bool VG_(translate) ( ThreadId tid, IRSB*,VexGuestLayout*,VexGuestExtents*, IRType,IRType) = (IRSB*(*)(void*,IRSB*,VexGuestLayout*,VexGuestExtents*,IRType,IRType))f; - vta.instrument1 = g; + vta.instrument1 = g; } /* No need for type kludgery here. */ - vta.instrument2 = need_to_handle_SP_assignment() - ? vg_SP_update_pass - : NULL; - vta.finaltidy = VG_(needs).final_IR_tidy_pass - ? VG_(tdict).tool_final_IR_tidy_pass - : NULL; - vta.needs_self_check = needs_self_check; - vta.traceflags = verbosity; - - /* Set up the dispatch-return info. For archs without a link - register, vex generates a jump back to the specified dispatch - address. Else, it just generates a branch-to-LR. */ + vta.instrument2 = need_to_handle_SP_assignment() + ? vg_SP_update_pass + : NULL; + vta.finaltidy = VG_(needs).final_IR_tidy_pass + ? 
VG_(tdict).tool_final_IR_tidy_pass + : NULL; + vta.needs_self_check = needs_self_check; + vta.preamble_function = preamble_fn; + vta.traceflags = verbosity; + vta.addProfInc = VG_(clo_profile_flags) > 0 + && kind != T_NoRedir; + + /* Set up the dispatch continuation-point info. If this is a + no-redir translation then it cannot be chained, and the chain-me + points are set to NULL to indicate that. The indir point must + also be NULL, since we can't allow this translation to do an + indir transfer -- that would take it back into the main + translation cache too. + + All this is because no-redir translations live outside the main + translation cache (in a secondary one) and chaining them would + involve more adminstrative complexity that isn't worth the + hassle, because we don't expect them to get used often. So + don't bother. */ + if (allow_redirection) { + vta.disp_cp_chain_me_to_slowEP = (void*) &VG_(disp_cp_chain_me_to_slowEP); + vta.disp_cp_chain_me_to_fastEP = (void*) &VG_(disp_cp_chain_me_to_fastEP); + vta.disp_cp_xindir = (void*) &VG_(disp_cp_xindir); + } else { + vta.disp_cp_chain_me_to_slowEP = NULL; + vta.disp_cp_chain_me_to_fastEP = NULL; + vta.disp_cp_xindir = NULL; + } + /* Thins doesn't involve chaining and so is always allowable. */ + vta.disp_cp_xassisted = (void*) &VG_(disp_cp_xassisted); +#if 0 + // FIXME tidy this up and make profiling work again # if defined(VGA_x86) || defined(VGA_amd64) if (!allow_redirection) { /* It's a no-redir translation. Will be run with the @@ -1539,6 +1569,7 @@ Bool VG_(translate) ( ThreadId tid, # else # error "Unknown arch" # endif +#endif /* 0 */ /* Sheesh. Finally, actually _do_ the translation! */ tres = LibVEX_Translate ( &vta ); @@ -1577,12 +1608,18 @@ Bool VG_(translate) ( ThreadId tid, // Note that we use nraddr (the non-redirected address), not // addr, which might have been changed by the redirection - VG_(add_to_transtab)( &vge, - nraddr, - (Addr)(&tmpbuf[0]), - tmpbuf_used, - tres.n_sc_extents > 0 ); + Bool caused_discard + = VG_(add_to_transtab)( &vge, + nraddr, + (Addr)(&tmpbuf[0]), + tmpbuf_used, + tres.n_sc_extents > 0, + tres.offs_profInc, + vex_arch ); + if (caused_discardP) + *caused_discardP = caused_discard; } else { + vg_assert(tres.offs_profInc == -1); /* -1 == unset */ VG_(add_to_unredir_transtab)( &vge, nraddr, (Addr)(&tmpbuf[0]), diff --git a/coregrind/m_transtab.c b/coregrind/m_transtab.c index fac0b1f994..3c2439ddc1 100644 --- a/coregrind/m_transtab.c +++ b/coregrind/m_transtab.c @@ -31,8 +31,10 @@ #include "pub_core_basics.h" #include "pub_core_debuglog.h" -#include "pub_core_machine.h" // For VG(machine_get_VexArchInfo) +#include "pub_core_machine.h" // For VG_(machine_get_VexArchInfo) #include "pub_core_libcbase.h" +#include "pub_core_vki.h" // to keep pub_core_libproc.h happy, sigh +#include "pub_core_libcproc.h" // VG_(invalidate_icache) #include "pub_core_libcassert.h" #include "pub_core_libcprint.h" #include "pub_core_options.h" @@ -40,12 +42,8 @@ #include "pub_core_transtab.h" #include "pub_core_aspacemgr.h" #include "pub_core_mallocfree.h" // VG_(out_of_memory_NORETURN) - -// JRS FIXME get rid of this somehow -#if defined(VGP_arm_linux) -# include "pub_core_vkiscnums.h" // __ARM_NR_cacheflush -# include "pub_core_syscall.h" // VG_(do_syscallN) -#endif +#include "pub_core_xarray.h" +#include "pub_core_dispatch.h" // For VG_(disp_cp*) addresses /* #define DEBUG_TRANSTAB */ @@ -67,6 +65,7 @@ 'deleted') and it is strongly recommended not to change this. 65521 is the largest prime <= 65535. 
*/ #define N_TTES_PER_SECTOR /*30011*/ /*40009*/ 65521 +//DEBUG-ONLY: #define N_TTES_PER_SECTOR 10007 /* Because each sector contains a hash table of TTEntries, we need to specify the maximum allowable loading, after which the sector is @@ -91,6 +90,46 @@ /*------------------ TYPES ------------------*/ +/* In edges ("to-me") in the graph created by chaining. */ +typedef + struct { + UInt from_sNo; /* sector number */ + UInt from_tteNo; /* TTE number in given sector */ + UInt from_offs; /* code offset from TCEntry::tcptr where the patch is */ + Bool to_fastEP; /* Is the patch to a fast or slow entry point? */ + } + InEdge; + + +/* Out edges ("from-me") in the graph created by chaining. */ +typedef + struct { + UInt to_sNo; /* sector number */ + UInt to_tteNo; /* TTE number in given sector */ + UInt from_offs; /* code offset in owning translation where patch is */ + } + OutEdge; + + +#define N_FIXED_IN_EDGE_ARR 3 +typedef + struct { + UInt n_fixed; /* 0 .. N_FIXED_IN_EDGE_ARR */ + InEdge fixed[N_FIXED_IN_EDGE_ARR]; + XArray* var; /* XArray* of InEdgeArr */ + } + InEdgeArr; + +#define N_FIXED_OUT_EDGE_ARR 2 +typedef + struct { + UInt n_fixed; /* 0 .. N_FIXED_OUT_EDGE_ARR */ + OutEdge fixed[N_FIXED_OUT_EDGE_ARR]; + XArray* var; /* XArray* of OutEdgeArr */ + } + OutEdgeArr; + + /* A translation-table entry. This indicates precisely which areas of guest code are included in the translation, and contains all other auxiliary info too. */ @@ -102,7 +141,7 @@ typedef Count is an entry count for the translation and is incremented by 1 every time the translation is used, if we are profiling. */ - UInt count; + ULong count; UShort weight; /* Status of the slot. Note, we need to be able to do lazy @@ -143,15 +182,70 @@ typedef // sec->ec2tte[ tte2ec_ec[i] ][ tte2ec_ix[i] ] // should be the index // of this TTEntry in the containing Sector's tt array. + + /* Admin information for chaining. 'in_edges' is a set of the + patch points which jump to this translation -- hence are + predecessors in the control flow graph. 'out_edges' points + to successors in the control flow graph -- translations to + which this one has a patched jump. In short these are just + backwards and forwards edges in the graph of patched-together + blocks. The 'in_edges' contain slightly more info, enough + that we can undo the chaining of each mentioned patch point. + The 'out_edges' list exists only so that we can visit the + 'in_edges' entries of all blocks we're patched through to, in + order to remove ourselves from then when we're deleted. */ + + /* It is possible, although very unlikely, that a block A has + more than one patched jump to block B. This could happen if + (eg) A finishes "jcond B; jmp B". + + This means in turn that B's in_edges set can list A more than + once (twice in this example). However, each such entry must + have a different from_offs, since a patched jump can only + jump to one place at once (it's meaningless for it to have + multiple destinations.) IOW, the successor and predecessor + edges in the graph are not uniquely determined by a + TTEntry --> TTEntry pair, but rather by a + (TTEntry,offset) --> TTEntry triple. + + If A has multiple edges to B then B will mention A multiple + times in its in_edges. To make things simpler, we then + require that A mentions B exactly the same number of times in + its out_edges. Furthermore, a matching out-in pair must have + the same offset (from_offs). 
This facilitates sanity + checking, and it facilitates establishing the invariant that + a out_edges set may not have duplicates when using the + equality defined by (TTEntry,offset). Hence the out_edges + and in_edges sets really do have both have set semantics. + + eg if A has been patched to B at offsets 42 and 87 (in A) + then A.out_edges = { (B,42), (B,87) } (in any order) + and B.in_edges = { (A,42), (A,87) } (in any order) + + Hence for each node pair P->Q in the graph, there's a 1:1 + mapping between P.out_edges and Q.in_edges. + */ + InEdgeArr in_edges; + OutEdgeArr out_edges; } TTEntry; +/* A structure used for mapping host code addresses back to the + relevant TTEntry. Used when doing chaining, for finding the + TTEntry to which some arbitrary patch address belongs. */ +typedef + struct { + UChar* start; + UInt len; + UInt tteNo; + } + HostExtent; + /* Finally, a sector itself. Each sector contains an array of TCEntries, which hold code, and an array of TTEntries, containing all required administrative info. Profiling is supported using the - TTEntry .count and .weight fields, if required. Each sector is - independent in that no cross-sector references are allowed. + TTEntry .count and .weight fields, if required. If the sector is not in use, all three pointers are NULL and tt_n_inuse is zero. @@ -181,6 +275,11 @@ typedef Int ec2tte_size[ECLASS_N]; Int ec2tte_used[ECLASS_N]; UShort* ec2tte[ECLASS_N]; + + /* The host extents. The [start, +len) ranges are constructed + in strictly non-overlapping order, so we can binary search + them at any time. */ + XArray* host_extents; /* XArray* of HostExtent */ } Sector; @@ -238,30 +337,6 @@ typedef */ /*global*/ __attribute__((aligned(16))) FastCacheEntry VG_(tt_fast)[VG_TT_FAST_SIZE]; -/* -#define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1) -*/ - -/* For profiling, we have a parallel array of pointers to .count - fields in TT entries. Again, these pointers must be invalidated - when translations disappear. A NULL pointer suffices to indicate - an unused slot. - - When not profiling (the normal case, VG_(clo_profile_flags) == 0), - all tt_fastN entries are set to NULL at startup and never read nor - written after that. - - When profiling (VG_(clo_profile_flags) > 0), tt_fast and tt_fastN - change together: if tt_fast[i].guest is TRANSTAB_BOGUS_GUEST_ADDR - then the corresponding tt_fastN[i] must be null. If - tt_fast[i].guest is any other value, then tt_fastN[i] *must* point - to the .count field of the corresponding TT entry. - - tt_fast and tt_fastN are referred to from assembly code - (dispatch.S). -*/ -/*global*/ UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE]; - /* Make sure we're not used before initialisation. */ static Bool init_done = False; @@ -270,27 +345,480 @@ static Bool init_done = False; /*------------------ STATS DECLS ------------------*/ /* Number of fast-cache updates and flushes done. */ -ULong n_fast_flushes = 0; -ULong n_fast_updates = 0; +static ULong n_fast_flushes = 0; +static ULong n_fast_updates = 0; /* Number of full lookups done. */ -ULong n_full_lookups = 0; -ULong n_lookup_probes = 0; +static ULong n_full_lookups = 0; +static ULong n_lookup_probes = 0; /* Number/osize/tsize of translations entered; also the number of those for which self-checking was requested. 
*/ -ULong n_in_count = 0; -ULong n_in_osize = 0; -ULong n_in_tsize = 0; -ULong n_in_sc_count = 0; +static ULong n_in_count = 0; +static ULong n_in_osize = 0; +static ULong n_in_tsize = 0; +static ULong n_in_sc_count = 0; /* Number/osize of translations discarded due to lack of space. */ -ULong n_dump_count = 0; -ULong n_dump_osize = 0; +static ULong n_dump_count = 0; +static ULong n_dump_osize = 0; /* Number/osize of translations discarded due to requests to do so. */ -ULong n_disc_count = 0; -ULong n_disc_osize = 0; +static ULong n_disc_count = 0; +static ULong n_disc_osize = 0; + + +/*-------------------------------------------------------------*/ +/*--- Misc ---*/ +/*-------------------------------------------------------------*/ + +static void* ttaux_malloc ( HChar* tag, SizeT n ) +{ + return VG_(arena_malloc)(VG_AR_TTAUX, tag, n); +} + +static void ttaux_free ( void* p ) +{ + VG_(arena_free)(VG_AR_TTAUX, p); +} + + +/*-------------------------------------------------------------*/ +/*--- Chaining support ---*/ +/*-------------------------------------------------------------*/ + +static inline TTEntry* index_tte ( UInt sNo, UInt tteNo ) +{ + vg_assert(sNo < N_SECTORS); + vg_assert(tteNo < N_TTES_PER_SECTOR); + Sector* s = §ors[sNo]; + vg_assert(s->tt); + TTEntry* tte = &s->tt[tteNo]; + vg_assert(tte->status == InUse); + return tte; +} + +static void InEdge__init ( InEdge* ie ) +{ + ie->from_sNo = -1; /* invalid */ + ie->from_tteNo = 0; + ie->from_offs = 0; + ie->to_fastEP = False; +} + +static void OutEdge__init ( OutEdge* oe ) +{ + oe->to_sNo = -1; /* invalid */ + oe->to_tteNo = 0; + oe->from_offs = 0; +} + +static void TTEntry__init ( TTEntry* tte ) +{ + VG_(memset)(tte, 0, sizeof(*tte)); +} + +static UWord InEdgeArr__size ( InEdgeArr* iea ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + return VG_(sizeXA)(iea->var); + } else { + vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR); + return iea->n_fixed; + } +} + +static void InEdgeArr__makeEmpty ( InEdgeArr* iea ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + VG_(deleteXA)(iea->var); + iea->var = NULL; + } else { + vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR); + iea->n_fixed = 0; + } +} + +static +InEdge* InEdgeArr__index ( InEdgeArr* iea, UWord i ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + return (InEdge*)VG_(indexXA)(iea->var, i); + } else { + vg_assert(i < iea->n_fixed); + return &iea->fixed[i]; + } +} + +static +void InEdgeArr__deleteIndex ( InEdgeArr* iea, UWord i ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + VG_(removeIndexXA)(iea->var, i); + } else { + vg_assert(i < iea->n_fixed); + for (; i+1 < iea->n_fixed; i++) { + iea->fixed[i] = iea->fixed[i+1]; + } + iea->n_fixed--; + } +} + +static +void InEdgeArr__add ( InEdgeArr* iea, InEdge* ie ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + VG_(addToXA)(iea->var, ie); + } else { + vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR); + if (iea->n_fixed == N_FIXED_IN_EDGE_ARR) { + /* The fixed array is full, so we have to initialise an + XArray and copy the fixed array into it. */ + iea->var = VG_(newXA)(ttaux_malloc, "transtab.IEA__add", + ttaux_free, + sizeof(InEdge)); + UWord i; + for (i = 0; i < iea->n_fixed; i++) { + VG_(addToXA)(iea->var, &iea->fixed[i]); + } + VG_(addToXA)(iea->var, ie); + iea->n_fixed = 0; + } else { + /* Just add to the fixed array. 
*/ + iea->fixed[iea->n_fixed++] = *ie; + } + } +} + +static UWord OutEdgeArr__size ( OutEdgeArr* oea ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + return VG_(sizeXA)(oea->var); + } else { + vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR); + return oea->n_fixed; + } +} + +static void OutEdgeArr__makeEmpty ( OutEdgeArr* oea ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + VG_(deleteXA)(oea->var); + oea->var = NULL; + } else { + vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR); + oea->n_fixed = 0; + } +} + +static +OutEdge* OutEdgeArr__index ( OutEdgeArr* oea, UWord i ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + return (OutEdge*)VG_(indexXA)(oea->var, i); + } else { + vg_assert(i < oea->n_fixed); + return &oea->fixed[i]; + } +} + +static +void OutEdgeArr__deleteIndex ( OutEdgeArr* oea, UWord i ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + VG_(removeIndexXA)(oea->var, i); + } else { + vg_assert(i < oea->n_fixed); + for (; i+1 < oea->n_fixed; i++) { + oea->fixed[i] = oea->fixed[i+1]; + } + oea->n_fixed--; + } +} + +static +void OutEdgeArr__add ( OutEdgeArr* oea, OutEdge* oe ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + VG_(addToXA)(oea->var, oe); + } else { + vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR); + if (oea->n_fixed == N_FIXED_OUT_EDGE_ARR) { + /* The fixed array is full, so we have to initialise an + XArray and copy the fixed array into it. */ + oea->var = VG_(newXA)(ttaux_malloc, "transtab.OEA__add", + ttaux_free, + sizeof(OutEdge)); + UWord i; + for (i = 0; i < oea->n_fixed; i++) { + VG_(addToXA)(oea->var, &oea->fixed[i]); + } + VG_(addToXA)(oea->var, oe); + oea->n_fixed = 0; + } else { + /* Just add to the fixed array. */ + oea->fixed[oea->n_fixed++] = *oe; + } + } +} + +static +Int HostExtent__cmpOrd ( void* v1, void* v2 ) +{ + HostExtent* hx1 = (HostExtent*)v1; + HostExtent* hx2 = (HostExtent*)v2; + if (hx1->start + hx1->len <= hx2->start) return -1; + if (hx2->start + hx2->len <= hx1->start) return 1; + return 0; /* partial overlap */ +} + +static __attribute__((noinline)) +Bool find_TTEntry_from_hcode( /*OUT*/UInt* from_sNo, + /*OUT*/UInt* from_tteNo, + void* hcode ) +{ + Int i; + + /* Search order logic copied from VG_(search_transtab). */ + for (i = 0; i < N_SECTORS; i++) { + Int sno = sector_search_order[i]; + if (UNLIKELY(sno == -1)) + return False; /* run out of sectors to search */ + + Sector* sec = §ors[sno]; + XArray* /* of HostExtent */ host_extents = sec->host_extents; + vg_assert(host_extents); + + HostExtent key; + VG_(memset)(&key, 0, sizeof(key)); + key.start = hcode; + key.len = 1; + Word firstW = -1, lastW = -1; + Bool found = VG_(lookupXA_UNSAFE)( + host_extents, &key, &firstW, &lastW, + (Int(*)(void*,void*))HostExtent__cmpOrd + ); + vg_assert(firstW == lastW); // always true, even if not found + if (found) { + HostExtent* hx = VG_(indexXA)(host_extents, firstW); + UInt tteNo = hx->tteNo; + /* Do some additional sanity checks. */ + vg_assert(tteNo <= N_TTES_PER_SECTOR); + vg_assert(sec->tt[tteNo].status == InUse); + /* Can only half check that the found TTEntry contains hcode, + due to not having a length value for the hcode in the + TTEntry. */ + vg_assert((UChar*)sec->tt[tteNo].tcptr <= (UChar*)hcode); + /* Looks plausible */ + *from_sNo = sno; + *from_tteNo = (UInt)tteNo; + return True; + } + } + return False; +} + + +/* Figure out whether or not hcode is jitted code present in the main + code cache (but not in the no-redir cache). Used for sanity + checking. 
*/ +static Bool is_in_the_main_TC ( void* hcode ) +{ + Int i, sno; + for (i = 0; i < N_SECTORS; i++) { + sno = sector_search_order[i]; + if (sno == -1) + break; /* run out of sectors to search */ + if ((UChar*)hcode >= (UChar*)sectors[sno].tc + && (UChar*)hcode <= (UChar*)sectors[sno].tc_next + + sizeof(ULong) - 1) + return True; + } + return False; +} + + +/* Fulfill a chaining request, and record admin info so we + can undo it later, if required. +*/ +void VG_(tt_tc_do_chaining) ( void* from__patch_addr, + UInt to_sNo, + UInt to_tteNo, + Bool to_fastEP ) +{ + /* Get the CPU info established at startup. */ + VexArch vex_arch = VexArch_INVALID; + VG_(machine_get_VexArchInfo)( &vex_arch, NULL ); + + // host_code is where we're patching to. So it needs to + // take into account, whether we're jumping to the slow + // or fast entry point. By definition, the fast entry point + // is exactly one event check's worth of code along from + // the slow (tcptr) entry point. + TTEntry* to_tte = index_tte(to_sNo, to_tteNo); + void* host_code = ((UChar*)to_tte->tcptr) + + (to_fastEP ? LibVEX_evCheckSzB(vex_arch) : 0); + + // stay sane -- the patch point (dst) is in this sector's code cache + vg_assert( (UChar*)host_code >= (UChar*)sectors[to_sNo].tc ); + vg_assert( (UChar*)host_code <= (UChar*)sectors[to_sNo].tc_next + + sizeof(ULong) - 1 ); + // stay sane -- the patch src is in some sector's code cache + vg_assert( is_in_the_main_TC(from__patch_addr) ); + + /* Get VEX to do the patching itself. We have to hand it off + since it is host-dependent. */ + VexInvalRange vir + = LibVEX_Chain( vex_arch, + from__patch_addr, + to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP) + : &VG_(disp_cp_chain_me_to_slowEP), + (void*)host_code ); + VG_(invalidate_icache)( (void*)vir.start, vir.len ); + + /* Now do the tricky bit -- update the ch_succs and ch_preds info + for the two translations involved, so we can undo the chaining + later, which we will have to do if the to_ block gets removed + for whatever reason. */ + /* Find the TTEntry for the from__ code. This isn't simple since + we only know the patch address, which is going to be somewhere + inside the from_ block. */ + UInt from_sNo = (UInt)-1; + UInt from_tteNo = (UInt)-1; + Bool from_found + = find_TTEntry_from_hcode( &from_sNo, &from_tteNo, + from__patch_addr ); + vg_assert(from_found); + TTEntry* from_tte = index_tte(from_sNo, from_tteNo); + + /* This is the new from_ -> to_ link to add. */ + InEdge ie; + InEdge__init(&ie); + ie.from_sNo = from_sNo; + ie.from_tteNo = from_tteNo; + ie.to_fastEP = to_fastEP; + HWord from_offs = (HWord)( (UChar*)from__patch_addr + - (UChar*)from_tte->tcptr ); + vg_assert(from_offs < 100000/* let's say */); + ie.from_offs = (UInt)from_offs; + + /* This is the new to_ -> from_ backlink to add. */ + OutEdge oe; + OutEdge__init(&oe); + oe.to_sNo = to_sNo; + oe.to_tteNo = to_tteNo; + oe.from_offs = (UInt)from_offs; + + /* Add .. */ + InEdgeArr__add(&to_tte->in_edges, &ie); + OutEdgeArr__add(&from_tte->out_edges, &oe); +} + + +/* Unchain one patch, as described by the specified InEdge. For + sanity check purposes only (to check that the patched location is + as expected) it also requires the fast and slow entry point + addresses of the destination block (that is, the block that owns + this InEdge). 
*/ +__attribute__((noinline)) +static void unchain_one ( VexArch vex_arch, + InEdge* ie, + void* to_fastEPaddr, void* to_slowEPaddr ) +{ + vg_assert(ie); + TTEntry* tte + = index_tte(ie->from_sNo, ie->from_tteNo); + UChar* place_to_patch + = ((HChar*)tte->tcptr) + ie->from_offs; + UChar* disp_cp_chain_me + = ie->to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP) + : &VG_(disp_cp_chain_me_to_slowEP); + UChar* place_to_jump_to_EXPECTED + = ie->to_fastEP ? to_fastEPaddr : to_slowEPaddr; + + // stay sane: both src and dst for this unchaining are + // in the main code cache + vg_assert( is_in_the_main_TC(place_to_patch) ); // src + vg_assert( is_in_the_main_TC(place_to_jump_to_EXPECTED) ); // dst + // dst check is ok because LibVEX_UnChain checks that + // place_to_jump_to_EXPECTED really is the current dst, and + // asserts if it isn't. + VexInvalRange vir + = LibVEX_UnChain( vex_arch, place_to_patch, + place_to_jump_to_EXPECTED, disp_cp_chain_me ); + VG_(invalidate_icache)( (void*)vir.start, vir.len ); +} + + +/* The specified block is about to be deleted. Update the preds and + succs of its associated blocks accordingly. This includes undoing + any chained jumps to this block. */ +static +void unchain_in_preparation_for_deletion ( VexArch vex_arch, + UInt here_sNo, UInt here_tteNo ) +{ + if (0) + VG_(printf)("QQQ unchain_in_prep %u.%u\n", here_sNo, here_tteNo); + UWord i, j, n, m; + Int evCheckSzB = LibVEX_evCheckSzB(vex_arch); + TTEntry* here_tte = index_tte(here_sNo, here_tteNo); + vg_assert(here_tte->status == InUse); + + /* Visit all InEdges owned by here_tte. */ + n = InEdgeArr__size(&here_tte->in_edges); + for (i = 0; i < n; i++) { + InEdge* ie = InEdgeArr__index(&here_tte->in_edges, i); + // Undo the chaining. + UChar* here_slow_EP = (UChar*)here_tte->tcptr; + UChar* here_fast_EP = here_slow_EP + evCheckSzB; + unchain_one(vex_arch, ie, here_fast_EP, here_slow_EP); + // Find the corresponding entry in the "from" node's out_edges, + // and remove it. + TTEntry* from_tte = index_tte(ie->from_sNo, ie->from_tteNo); + m = OutEdgeArr__size(&from_tte->out_edges); + vg_assert(m > 0); // it must have at least one entry + for (j = 0; j < m; j++) { + OutEdge* oe = OutEdgeArr__index(&from_tte->out_edges, j); + if (oe->to_sNo == here_sNo && oe->to_tteNo == here_tteNo + && oe->from_offs == ie->from_offs) + break; + } + vg_assert(j < m); // "oe must be findable" + OutEdgeArr__deleteIndex(&from_tte->out_edges, j); + } + + /* Visit all OutEdges owned by here_tte. */ + n = OutEdgeArr__size(&here_tte->out_edges); + for (i = 0; i < n; i++) { + OutEdge* oe = OutEdgeArr__index(&here_tte->out_edges, i); + // Find the corresponding entry in the "to" node's in_edges, + // and remove it. + TTEntry* to_tte = index_tte(oe->to_sNo, oe->to_tteNo); + m = InEdgeArr__size(&to_tte->in_edges); + vg_assert(m > 0); // it must have at least one entry + for (j = 0; j < m; j++) { + InEdge* ie = InEdgeArr__index(&to_tte->in_edges, j); + if (ie->from_sNo == here_sNo && ie->from_tteNo == here_tteNo + && ie->from_offs == oe->from_offs) + break; + } + vg_assert(j < m); // "ie must be findable" + InEdgeArr__deleteIndex(&to_tte->in_edges, j); + } + + InEdgeArr__makeEmpty(&here_tte->in_edges); + OutEdgeArr__makeEmpty(&here_tte->out_edges); +} /*-------------------------------------------------------------*/ @@ -398,12 +926,12 @@ UInt addEClassNo ( /*MOD*/Sector* sec, Int ec, UShort tteno ) old_sz = sec->ec2tte_size[ec]; old_ar = sec->ec2tte[ec]; new_sz = old_sz==0 ? 8 : old_sz<64 ? 
2*old_sz : (3*old_sz)/2; - new_ar = VG_(arena_malloc)(VG_AR_TTAUX, "transtab.aECN.1", - new_sz * sizeof(UShort)); + new_ar = ttaux_malloc("transtab.aECN.1", + new_sz * sizeof(UShort)); for (i = 0; i < old_sz; i++) new_ar[i] = old_ar[i]; if (old_ar) - VG_(arena_free)(VG_AR_TTAUX, old_ar); + ttaux_free(old_ar); sec->ec2tte_size[ec] = new_sz; sec->ec2tte[ec] = new_ar; @@ -575,7 +1103,6 @@ static Bool sanity_check_eclasses_in_sector ( Sector* sec ) /* forwards */ static Bool sanity_check_redir_tt_tc ( void ); -static Bool sanity_check_fastcache ( void ); static Bool sanity_check_sector_search_order ( void ) { @@ -630,8 +1157,6 @@ static Bool sanity_check_all_sectors ( void ) } if ( !sanity_check_redir_tt_tc() ) return False; - if ( !sanity_check_fastcache() ) - return False; if ( !sanity_check_sector_search_order() ) return False; return True; @@ -669,13 +1194,11 @@ static inline UInt HASH_TT ( Addr64 key ) return k32 % N_TTES_PER_SECTOR; } -static void setFastCacheEntry ( Addr64 key, ULong* tcptr, UInt* count ) +static void setFastCacheEntry ( Addr64 key, ULong* tcptr ) { UInt cno = (UInt)VG_TT_FAST_HASH(key); VG_(tt_fast)[cno].guest = (Addr)key; VG_(tt_fast)[cno].host = (Addr)tcptr; - if (VG_(clo_profile_flags) > 0) - VG_(tt_fastN)[cno] = count; n_fast_updates++; /* This shouldn't fail. It should be assured by m_translate which should reject any attempt to make translation of code @@ -683,23 +1206,7 @@ static void setFastCacheEntry ( Addr64 key, ULong* tcptr, UInt* count ) vg_assert(VG_(tt_fast)[cno].guest != TRANSTAB_BOGUS_GUEST_ADDR); } -/* Invalidate the fast cache's counter array, VG_(tt_fastN). */ -static void invalidateFastNCache ( void ) -{ - UInt j; - vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0); - for (j = 0; j < VG_TT_FAST_SIZE; j += 4) { - VG_(tt_fastN)[j+0] = NULL; - VG_(tt_fastN)[j+1] = NULL; - VG_(tt_fastN)[j+2] = NULL; - VG_(tt_fastN)[j+3] = NULL; - } - vg_assert(j == VG_TT_FAST_SIZE); -} - -/* Invalidate the fast cache VG_(tt_fast). If profiling, also - invalidate the fast cache's counter array VG_(tt_fastN), otherwise - don't touch it. */ +/* Invalidate the fast cache VG_(tt_fast). */ static void invalidateFastCache ( void ) { UInt j; @@ -713,42 +1220,19 @@ static void invalidateFastCache ( void ) VG_(tt_fast)[j+3].guest = TRANSTAB_BOGUS_GUEST_ADDR; } - if (VG_(clo_profile_flags) > 0) - invalidateFastNCache(); - vg_assert(j == VG_TT_FAST_SIZE); n_fast_flushes++; } -static Bool sanity_check_fastcache ( void ) +/* Returns True if the sector has been used before (hence, if we have + to eject existing code in it), False if it's never been used + before. 
*/ +static Bool initialiseSector ( Int sno ) { - UInt j; - if (0) VG_(printf)("sanity check fastcache\n"); - if (VG_(clo_profile_flags) > 0) { - /* profiling */ - for (j = 0; j < VG_TT_FAST_SIZE; j++) { - if (VG_(tt_fastN)[j] == NULL - && VG_(tt_fast)[j].guest != TRANSTAB_BOGUS_GUEST_ADDR) - return False; - if (VG_(tt_fastN)[j] != NULL - && VG_(tt_fast)[j].guest == TRANSTAB_BOGUS_GUEST_ADDR) - return False; - } - } else { - /* not profiling */ - for (j = 0; j < VG_TT_FAST_SIZE; j++) { - if (VG_(tt_fastN)[j] != NULL) - return False; - } - } - return True; -} - -static void initialiseSector ( Int sno ) -{ - Int i; - SysRes sres; + Int i; + SysRes sres; Sector* sec; + Bool has_been_used_before = False; vg_assert(isValidSector(sno)); { Bool sane = sanity_check_sector_search_order(); @@ -768,6 +1252,7 @@ static void initialiseSector ( Int sno ) vg_assert(sec->ec2tte_used[i] == 0); vg_assert(sec->ec2tte[i] == NULL); } + vg_assert(sec->host_extents == NULL); VG_(debugLog)(1,"transtab", "allocate sector %d\n", sno); @@ -793,6 +1278,12 @@ static void initialiseSector ( Int sno ) sec->tt[i].n_tte2ec = 0; } + /* Set up the host_extents array. */ + sec->host_extents + = VG_(newXA)(ttaux_malloc, "transtab.initialiseSector(host_extents)", + ttaux_free, + sizeof(HostExtent)); + /* Add an entry in the sector_search_order */ for (i = 0; i < N_SECTORS; i++) { if (sector_search_order[i] == -1) @@ -808,11 +1299,16 @@ static void initialiseSector ( Int sno ) /* Sector has been used before. Dump the old contents. */ VG_(debugLog)(1,"transtab", "recycle sector %d\n", sno); + has_been_used_before = True; vg_assert(sec->tt != NULL); vg_assert(sec->tc_next != NULL); n_dump_count += sec->tt_n_inuse; + VexArch vex_arch = VexArch_INVALID; + VG_(machine_get_VexArchInfo)( &vex_arch, NULL ); + /* Visit each just-about-to-be-abandoned translation. */ +VG_(printf)("QQQ unlink-entire-sector: %d START\n", sno); for (i = 0; i < N_TTES_PER_SECTOR; i++) { if (sec->tt[i].status == InUse) { vg_assert(sec->tt[i].n_tte2ec >= 1); @@ -824,12 +1320,14 @@ static void initialiseSector ( Int sno ) sec->tt[i].entry, sec->tt[i].vge ); } + unchain_in_preparation_for_deletion(vex_arch, sno, i); } else { vg_assert(sec->tt[i].n_tte2ec == 0); } sec->tt[i].status = Empty; sec->tt[i].n_tte2ec = 0; } +VG_(printf)("QQQ unlink-entire-sector: %d END\n", sno); /* Free up the eclass structures. */ for (i = 0; i < ECLASS_N; i++) { @@ -838,13 +1336,18 @@ static void initialiseSector ( Int sno ) vg_assert(sec->ec2tte[i] == NULL); } else { vg_assert(sec->ec2tte[i] != NULL); - VG_(arena_free)(VG_AR_TTAUX, sec->ec2tte[i]); + ttaux_free(sec->ec2tte[i]); sec->ec2tte[i] = NULL; sec->ec2tte_size[i] = 0; sec->ec2tte_used[i] = 0; } } + /* Empty out the host extents array. */ + vg_assert(sec->host_extents != NULL); + VG_(dropTailXA)(sec->host_extents, VG_(sizeXA)(sec->host_extents)); + vg_assert(VG_(sizeXA)(sec->host_extents) == 0); + /* Sanity check: ensure it is already in sector_search_order[]. */ for (i = 0; i < N_SECTORS; i++) { @@ -865,54 +1368,8 @@ static void initialiseSector ( Int sno ) { Bool sane = sanity_check_sector_search_order(); vg_assert(sane); } -} - -static void invalidate_icache ( void *ptr, Int nbytes ) -{ -# if defined(VGA_ppc32) || defined(VGA_ppc64) - Addr startaddr = (Addr) ptr; - Addr endaddr = startaddr + nbytes; - Addr cls; - Addr addr; - VexArchInfo vai; - - if (nbytes == 0) return; - vg_assert(nbytes > 0); - - VG_(machine_get_VexArchInfo)( NULL, &vai ); - cls = vai.ppc_cache_line_szB; - - /* Stay sane .. 
*/ - vg_assert(cls == 32 || cls == 64 || cls == 128); - - startaddr &= ~(cls - 1); - for (addr = startaddr; addr < endaddr; addr += cls) { - __asm__ __volatile__("dcbst 0,%0" : : "r" (addr)); - } - __asm__ __volatile__("sync"); - for (addr = startaddr; addr < endaddr; addr += cls) { - __asm__ __volatile__("icbi 0,%0" : : "r" (addr)); - } - __asm__ __volatile__("sync; isync"); - -# elif defined(VGA_x86) - /* no need to do anything, hardware provides coherence */ - -# elif defined(VGA_amd64) - /* no need to do anything, hardware provides coherence */ - -# elif defined(VGA_s390x) - /* no need to do anything, hardware provides coherence */ - -# elif defined(VGP_arm_linux) - /* ARM cache flushes are privileged, so we must defer to the kernel. */ - Addr startaddr = (Addr) ptr; - Addr endaddr = startaddr + nbytes; - VG_(do_syscall2)(__NR_ARM_cacheflush, startaddr, endaddr); -# else -# error "Unknown ARCH" -# endif + return has_been_used_before; } @@ -921,18 +1378,28 @@ static void invalidate_icache ( void *ptr, Int nbytes ) pre: youngest_sector points to a valid (although possibly full) sector. + + Returns True if the call caused any existing translation(s) to get + thrown away in order to make space for this one. */ -void VG_(add_to_transtab)( VexGuestExtents* vge, +Bool VG_(add_to_transtab)( VexGuestExtents* vge, Addr64 entry, AddrH code, UInt code_len, - Bool is_self_checking ) + Bool is_self_checking, + Int offs_profInc, + VexArch arch_host ) { Int tcAvailQ, reqdQ, y, i; ULong *tcptr, *tcptr2; UChar* srcP; UChar* dstP; + /* We need to tell the caller whether this call caused any code to + be thrown away due to the TC becoming full, and hence the oldest + Sector to be emptied out and recycled. */ + Bool caused_code_discarding = False; + vg_assert(init_done); vg_assert(vge->n_used >= 1 && vge->n_used <= 3); @@ -952,8 +1419,10 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, y = youngest_sector; vg_assert(isValidSector(y)); - if (sectors[y].tc == NULL) - initialiseSector(y); + if (sectors[y].tc == NULL) { + Bool used_before = initialiseSector(y); + vg_assert(!used_before); + } /* Try putting the translation in this sector. */ reqdQ = (code_len + 7) >> 3; @@ -983,7 +1452,8 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, if (youngest_sector >= N_SECTORS) youngest_sector = 0; y = youngest_sector; - initialiseSector(y); + caused_code_discarding = initialiseSector(y); + } /* Be sure ... */ @@ -1002,13 +1472,10 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, dstP = (UChar*)tcptr; srcP = (UChar*)code; - for (i = 0; i < code_len; i++) - dstP[i] = srcP[i]; + VG_(memcpy)(dstP, srcP, code_len); sectors[y].tc_next += reqdQ; sectors[y].tt_n_inuse++; - invalidate_icache( dstP, code_len ); - /* more paranoia */ tcptr2 = sectors[y].tc_next; vg_assert(tcptr2 >= §ors[y].tc[0]); @@ -1027,6 +1494,7 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, i = 0; } + TTEntry__init(§ors[y].tt[i]); sectors[y].tt[i].status = InUse; sectors[y].tt[i].tcptr = tcptr; sectors[y].tt[i].count = 0; @@ -1034,11 +1502,42 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, sectors[y].tt[i].vge = *vge; sectors[y].tt[i].entry = entry; + /* Patch in the profile counter location, if necessary. 
*/ + if (offs_profInc != -1) { + vg_assert(offs_profInc >= 0 && offs_profInc < code_len); + VexInvalRange vir + = LibVEX_PatchProfInc( arch_host, + dstP + offs_profInc, + §ors[y].tt[i].count ); + VG_(invalidate_icache)( (void*)vir.start, vir.len ); + } + + VG_(invalidate_icache)( dstP, code_len ); + + /* Add this entry to the host_extents map, checking that we're + adding in order. */ + { HostExtent hx; + hx.start = (UChar*)tcptr; + hx.len = code_len; + hx.tteNo = i; + vg_assert(hx.len > 0); /* bsearch fails w/ zero length entries */ + XArray* hx_array = sectors[y].host_extents; + vg_assert(hx_array); + Word n = VG_(sizeXA)(hx_array); + if (n > 0) { + HostExtent* hx_prev = (HostExtent*)VG_(indexXA)(hx_array, n-1); + vg_assert(hx_prev->start + hx_prev->len <= hx.start); + } + VG_(addToXA)(hx_array, &hx); + } + /* Update the fast-cache. */ - setFastCacheEntry( entry, tcptr, §ors[y].tt[i].count ); + setFastCacheEntry( entry, tcptr ); /* Note the eclass numbers for this translation. */ upd_eclasses_after_add( §ors[y], i ); + + return caused_code_discarding; } @@ -1046,7 +1545,9 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, requested, a successful search can also cause the fast-caches to be updated. */ -Bool VG_(search_transtab) ( /*OUT*/AddrH* result, +Bool VG_(search_transtab) ( /*OUT*/AddrH* res_hcode, + /*OUT*/UInt* res_sNo, + /*OUT*/UInt* res_tteNo, Addr64 guest_addr, Bool upd_cache ) { @@ -1076,10 +1577,13 @@ Bool VG_(search_transtab) ( /*OUT*/AddrH* result, /* found it */ if (upd_cache) setFastCacheEntry( - guest_addr, sectors[sno].tt[k].tcptr, - §ors[sno].tt[k].count ); - if (result) - *result = (AddrH)sectors[sno].tt[k].tcptr; + guest_addr, sectors[sno].tt[k].tcptr ); + if (res_hcode) + *res_hcode = (AddrH)sectors[sno].tt[k].tcptr; + if (res_sNo) + *res_sNo = sno; + if (res_tteNo) + *res_tteNo = k; /* pull this one one step closer to the front. For large apps this more or less halves the number of required probes. */ @@ -1147,16 +1651,23 @@ Bool overlaps ( Addr64 start, ULong range, VexGuestExtents* vge ) /* Delete a tt entry, and update all the eclass data accordingly. */ -static void delete_tte ( /*MOD*/Sector* sec, Int tteno ) +static void delete_tte ( /*MOD*/Sector* sec, UInt secNo, Int tteno, + VexArch vex_arch ) { Int i, ec_num, ec_idx; TTEntry* tte; + /* sec and secNo are mutually redundant; cross-check. */ + vg_assert(sec == §ors[secNo]); + vg_assert(tteno >= 0 && tteno < N_TTES_PER_SECTOR); tte = &sec->tt[tteno]; vg_assert(tte->status == InUse); vg_assert(tte->n_tte2ec >= 1 && tte->n_tte2ec <= 3); + /* Unchain .. */ + unchain_in_preparation_for_deletion(vex_arch, secNo, tteno); + /* Deal with the ec-to-tte links first. */ for (i = 0; i < tte->n_tte2ec; i++) { ec_num = (Int)tte->tte2ec_ec[i]; @@ -1192,9 +1703,10 @@ static void delete_tte ( /*MOD*/Sector* sec, Int tteno ) only consider translations in the specified eclass. */ static -Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, +Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, UInt secNo, Addr64 guest_start, ULong range, - Int ec ) + Int ec, + VexArch vex_arch ) { Int i; UShort tteno; @@ -1218,7 +1730,7 @@ Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, if (overlaps( guest_start, range, &tte->vge )) { anyDeld = True; - delete_tte( sec, (Int)tteno ); + delete_tte( sec, secNo, (Int)tteno, vex_arch ); } } @@ -1231,8 +1743,9 @@ Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, slow way, by inspecting all translations in sec. 
*/ static -Bool delete_translations_in_sector ( /*MOD*/Sector* sec, - Addr64 guest_start, ULong range ) +Bool delete_translations_in_sector ( /*MOD*/Sector* sec, UInt secNo, + Addr64 guest_start, ULong range, + VexArch vex_arch ) { Int i; Bool anyDeld = False; @@ -1241,7 +1754,7 @@ Bool delete_translations_in_sector ( /*MOD*/Sector* sec, if (sec->tt[i].status == InUse && overlaps( guest_start, range, &sec->tt[i].vge )) { anyDeld = True; - delete_tte( sec, i ); + delete_tte( sec, secNo, i, vex_arch ); } } @@ -1271,6 +1784,9 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, if (range == 0) return; + VexArch vex_arch = VexArch_INVALID; + VG_(machine_get_VexArchInfo)( &vex_arch, NULL ); + /* There are two different ways to do this. If the range fits within a single address-range equivalence @@ -1310,9 +1826,13 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, if (sec->tc == NULL) continue; anyDeleted |= delete_translations_in_sector_eclass( - sec, guest_start, range, ec ); + sec, sno, guest_start, range, ec, + vex_arch + ); anyDeleted |= delete_translations_in_sector_eclass( - sec, guest_start, range, ECLASS_MISC ); + sec, sno, guest_start, range, ECLASS_MISC, + vex_arch + ); } } else { @@ -1327,7 +1847,7 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, if (sec->tc == NULL) continue; anyDeleted |= delete_translations_in_sector( - sec, guest_start, range ); + sec, sno, guest_start, range, vex_arch ); } } @@ -1483,7 +2003,7 @@ void VG_(add_to_unredir_transtab)( VexGuestExtents* vge, for (j = 0; j < code_len; j++) dstP[j] = srcP[j]; - invalidate_icache( dstP, code_len ); + VG_(invalidate_icache)( dstP, code_len ); unredir_tt[i].inUse = True; unredir_tt[i].vge = *vge; @@ -1573,18 +2093,15 @@ void VG_(init_tt_tc) ( void ) sectors[i].ec2tte_used[j] = 0; sectors[i].ec2tte[j] = NULL; } + sectors[i].host_extents = NULL; } /* Initialise the sector_search_order hint table. */ for (i = 0; i < N_SECTORS; i++) sector_search_order[i] = -1; - /* Initialise the fast caches. If not profiling (the usual case), - we have to explicitly invalidate the fastN cache as - invalidateFastCache() won't do that for us. */ + /* Initialise the fast cache. */ invalidateFastCache(); - if (VG_(clo_profile_flags) == 0) - invalidateFastNCache(); /* and the unredir tt/tc */ init_unredir_tt_tc(); diff --git a/coregrind/m_xarray.c b/coregrind/m_xarray.c index 8859cec2e6..e9461ef7a2 100644 --- a/coregrind/m_xarray.c +++ b/coregrind/m_xarray.c @@ -311,6 +311,20 @@ void VG_(dropHeadXA) ( XArray* xao, Word n ) xa->usedsizeE -= n; } +void VG_(removeIndexXA)( XArray* xao, Word n ) +{ + struct _XArray* xa = (struct _XArray*)xao; + vg_assert(xa); + vg_assert(n >= 0); + vg_assert(n < xa->usedsizeE); + if (n+1 < xa->usedsizeE) { + VG_(memmove)( ((char*)xa->arr) + (n+0) * xa->elemSzB, + ((char*)xa->arr) + (n+1) * xa->elemSzB, + (xa->usedsizeE - n - 1) * xa->elemSzB ); + } + xa->usedsizeE--; +} + void VG_(getContentsXA_UNSAFE)( XArray* xao, /*OUT*/void** ctsP, /*OUT*/Word* usedP ) diff --git a/coregrind/pub_core_dispatch.h b/coregrind/pub_core_dispatch.h index 6de7fcf323..08cc3f29f6 100644 --- a/coregrind/pub_core_dispatch.h +++ b/coregrind/pub_core_dispatch.h @@ -41,56 +41,38 @@ #include "pub_core_dispatch_asm.h" -/* This subroutine is called from the C world. It is passed - a pointer to the VEX guest state (arch.vex). It must run code - from the instruction pointer in the guest state, and exit when - VG_(dispatch_ctr) reaches zero, or we need to defer to the scheduler. 
+/* Run translations, with the given guest state, and starting by + running the host code at 'host_addr'. It is almost always the case + that host_addr is the translation for guest_state.guest_IP, that + is, host_addr is what it would be if we looked up the address of + the translation corresponding to guest_state.guest_IP. + + The only case where this isn't true is where we're running a + no-redir translation. In this case host_addr is the address of the + alternative (non-redirected) translation for guest_state.guest_IP. + The return value must indicate why it returned back to the scheduler. It can also be exited if the executing code throws a non-resumable signal, for example SIGSEGV, in which case control longjmp()s back past here. - If do_profiling is nonzero, the profile counters arrays should be - updated for each translation run. - - This code simply handles the common case fast -- when the translation - address is found in the translation cache. For anything else, the - scheduler does the work. - - NOTE, VG_(run_innerloop) MUST NOT BE USED for noredir translations. - Instead use VG_(run_a_noredir_translation). -*/ -extern -UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); -#if defined(VGA_x86) || defined(VGA_amd64) -/* We need to locate a couple of labels inside VG_(run_innerloop), so - that Vex can add branches to them from generated code. Hence the - following somewhat bogus decls. At least on x86 and amd64. ppc32 - and ppc64 use straightforward bl-blr to get from dispatcher to - translation and back and so do not need these labels. */ -extern Addr VG_(run_innerloop__dispatch_unassisted_unprofiled); -extern Addr VG_(run_innerloop__dispatch_assisted_unprofiled); -extern Addr VG_(run_innerloop__dispatch_unassisted_profiled); -extern Addr VG_(run_innerloop__dispatch_assisted_profiled); -#endif - - -/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args - and 2 to carry results: - 0: input: ptr to translation - 1: input: ptr to guest state - 2: output: next guest PC - 3: output: guest state pointer afterwards (== thread return code) - MUST NOT BE USED for non-noredir (normal) translations. + two_words holds the return values (two words). First is + a TRC value. Second is generally unused, except in the case + where we have to return a chain-me request. */ -extern void VG_(run_a_noredir_translation) ( volatile UWord* argblock ); -#if defined(VGA_x86) || defined(VGA_amd64) -/* We need to a label inside VG_(run_a_noredir_translation), so that - Vex can add branches to them from generated code. Hence the - following somewhat bogus decl. */ -extern Addr VG_(run_a_noredir_translation__return_point); -#endif - +HWord VG_(disp_run_translations)( HWord* two_words, + void* guest_state, + Addr host_addr ); + +/* We need to know addresses of the continuation-point (cp_) labels so + we can tell VEX what they are. They will get baked into the code + VEX generates. The UChar is entirely mythical, but we need to + state _some_ type, so as to keep gcc happy. */ +UChar VG_(disp_cp_chain_me_to_slowEP); +UChar VG_(disp_cp_chain_me_to_fastEP); +UChar VG_(disp_cp_xindir); +UChar VG_(disp_cp_xassisted); +UChar VG_(disp_cp_evcheck_fail); #endif // __PUB_CORE_DISPATCH_H diff --git a/coregrind/pub_core_dispatch_asm.h b/coregrind/pub_core_dispatch_asm.h index 3e7b4a20d7..31d2f59a70 100644 --- a/coregrind/pub_core_dispatch_asm.h +++ b/coregrind/pub_core_dispatch_asm.h @@ -43,16 +43,20 @@ /* And some more of our own. 
These must not have the same values as those from libvex_trc_values.h. (viz, 60 or below is safe). + (The following comment is no longer relevant, but is retained + for historical purposes.) These values *must* be odd (have bit 0 set) because the dispatchers (coregrind/m_dispatch/dispatch-*-*.S) use this fact to distinguish a TRC value from the unchanged baseblock pointer -- which has 0 as its lowest bit. */ -#define VG_TRC_BORING 29 /* no event; just keep going */ -#define VG_TRC_INNER_FASTMISS 37 /* TRC only; means fast-cache miss. */ -#define VG_TRC_INNER_COUNTERZERO 41 /* TRC only; means bb ctr == 0 */ -#define VG_TRC_FAULT_SIGNAL 43 /* TRC only; got sigsegv/sigbus */ -#define VG_TRC_INVARIANT_FAILED 47 /* TRC only; invariant violation */ +#define VG_TRC_BORING 29 /* no event; just keep going */ +#define VG_TRC_INNER_FASTMISS 37 /* TRC only; means fast-cache miss. */ +#define VG_TRC_INNER_COUNTERZERO 41 /* TRC only; means bb ctr == 0 */ +#define VG_TRC_FAULT_SIGNAL 43 /* TRC only; got sigsegv/sigbus */ +#define VG_TRC_INVARIANT_FAILED 47 /* TRC only; invariant violation */ +#define VG_TRC_CHAIN_ME_TO_SLOW_EP 49 /* TRC only; chain to slow EP */ +#define VG_TRC_CHAIN_ME_TO_FAST_EP 51 /* TRC only; chain to fast EP */ #endif // __PUB_CORE_DISPATCH_ASM_H diff --git a/coregrind/pub_core_libcproc.h b/coregrind/pub_core_libcproc.h index cd9c18a29b..e573fd81a8 100644 --- a/coregrind/pub_core_libcproc.h +++ b/coregrind/pub_core_libcproc.h @@ -84,6 +84,10 @@ extern void VG_(do_atfork_pre) ( ThreadId tid ); extern void VG_(do_atfork_parent) ( ThreadId tid ); extern void VG_(do_atfork_child) ( ThreadId tid ); +// icache invalidation +extern void VG_(invalidate_icache) ( void *ptr, SizeT nbytes ); + + #endif // __PUB_CORE_LIBCPROC_H /*--------------------------------------------------------------------*/ diff --git a/coregrind/pub_core_translate.h b/coregrind/pub_core_translate.h index c6c24055d7..3182f4f796 100644 --- a/coregrind/pub_core_translate.h +++ b/coregrind/pub_core_translate.h @@ -37,12 +37,13 @@ //-------------------------------------------------------------------- extern -Bool VG_(translate) ( ThreadId tid, - Addr64 orig_addr, - Bool debugging_translation, - Int debugging_verbosity, - ULong bbs_done, - Bool allow_redirection ); +Bool VG_(translate) ( /*OUT*/Bool* caused_discardP, + ThreadId tid, + Addr64 orig_addr, + Bool debugging_translation, + Int debugging_verbosity, + ULong bbs_done, + Bool allow_redirection ); extern void VG_(print_translation_stats) ( void ); diff --git a/coregrind/pub_core_transtab.h b/coregrind/pub_core_transtab.h index 34ffee96e6..52dc5a7eee 100644 --- a/coregrind/pub_core_transtab.h +++ b/coregrind/pub_core_transtab.h @@ -39,9 +39,8 @@ #include "pub_core_transtab_asm.h" -/* The fast-cache for tt-lookup, and for finding counters. Unused - entries are denoted by .guest == 1, which is assumed to be a bogus - address for all guest code. */ +/* The fast-cache for tt-lookup. Unused entries are denoted by .guest + == 1, which is assumed to be a bogus address for all guest code. 
*/ typedef struct { Addr guest; @@ -54,18 +53,26 @@ extern __attribute__((aligned(16))) #define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1) -extern UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE]; - extern void VG_(init_tt_tc) ( void ); extern -void VG_(add_to_transtab)( VexGuestExtents* vge, +Bool VG_(add_to_transtab)( VexGuestExtents* vge, Addr64 entry, AddrH code, UInt code_len, - Bool is_self_checking ); + Bool is_self_checking, + Int offs_profInc, + VexArch arch_host ); -extern Bool VG_(search_transtab) ( /*OUT*/AddrH* result, +extern +void VG_(tt_tc_do_chaining) ( void* from__patch_addr, + UInt to_sNo, + UInt to_tteNo, + Bool to_fastEP ); + +extern Bool VG_(search_transtab) ( /*OUT*/AddrH* res_hcode, + /*OUT*/UInt* res_sNo, + /*OUT*/UInt* res_tteNo, Addr64 guest_addr, Bool upd_cache ); diff --git a/coregrind/pub_core_transtab_asm.h b/coregrind/pub_core_transtab_asm.h index 6d43a7ac3c..00adced522 100644 --- a/coregrind/pub_core_transtab_asm.h +++ b/coregrind/pub_core_transtab_asm.h @@ -42,8 +42,9 @@ ever be used. So instead the function is '(address >>u 2)[VG_TT_FAST_BITS-1 : 0]' on those targets. - On ARM we do like ppc32/ppc64, although that will have to be - revisited when we come to implement Thumb. + On ARM we shift by 1, since Thumb insns can be of size 2, hence to + minimise collisions and maximise cache utilisation we need to take + into account all but the least significant bit. On s390x the rightmost bit of an instruction address is zero. For best table utilization shift the address to the right by 1 bit. */ diff --git a/docs/Makefile.am b/docs/Makefile.am index 82fa93ab35..2deeb011c9 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -44,6 +44,7 @@ EXTRA_DIST = \ internals/register-uses.txt \ internals/release-HOWTO.txt \ internals/segments-seginfos.txt \ + internals/t-chaining-notes.txt \ internals/threads-syscalls-signals.txt \ internals/tm-mutexstates.dot \ internals/tm-threadstates.dot \ diff --git a/docs/internals/t-chaining-notes.txt b/docs/internals/t-chaining-notes.txt new file mode 100644 index 0000000000..be2d70bda9 --- /dev/null +++ b/docs/internals/t-chaining-notes.txt @@ -0,0 +1,201 @@ + +DO NOT MERGE +~~~~~~~~~~~ + +Changes memcheck/tests/Makefile.am w.r.t. -mfloat-abi=softfp +Ditto none/tests/arm/Makefile.am + + +Verification todo +~~~~~~~~~~~~~~~~~ +check that illegal insns on all targets don't cause the _toIR.c's to +assert. + +check also with --vex-guest-chase-cond=yes + +check that all targets can run their insn set tests with +--vex-guest-max-insns=1. + + +Cleanups +~~~~~~~~ +host_arm_isel.c and host_arm_defs.c: get rid of global var arm_hwcaps. + +host_x86_defs.c, host_amd64_defs.c: return proper VexInvalRange +records from the patchers, instead of {0,0}, so that transparent +self hosting works properly. + + +Optimisations +~~~~~~~~~~~~~ +all targets: change VG_(stats__n_xindirs) to a 32 bit counter, and +empty out every now and again. + +amd64: XDirect: write const value to guest_RIP using single +insn when the value is < 0x8000'0000 + +arm: chain_XDirect: generate short form jumps when possible + +arm codegen: Generate ORRS for CmpwNEZ32(Or32(x,y)) + +all targets: when nuking an entire sector, don't bother to undo the +patching for any translations within the sector (nor with their +invalidations). + +(somewhat implausible) for jumps to disp_cp_indir, have multiple +copies of disp_cp_indir, one for each of the possible registers that +could have held the target guest address before jumping to the stub. 
+Then disp_cp_indir wouldn't have to reload it from memory each time. +Might also have the effect of spreading out the indirect mispredict +burden somewhat (across the multiple copies.) + + +Implementation notes +~~~~~~~~~~~~~~~~~~~~ +T-chaining changes -- summary + +* The code generators (host_blah_isel.c, host_blah_defs.[ch]) interact + more closely with Valgrind than before. In particular the + instruction selectors must use one of 3 different kinds of + control-transfer instructions: XDirect, XIndir and XAssisted. + All archs must use these the same; no more ad-hoc control transfer + instructions. + (more detail below) + + +* With T-chaining, translations can jump between each other without + going through the dispatcher loop every time. This means that the + event check (counter dec, and exit if negative) the dispatcher loop + previously did now needs to be compiled into each translation. + + +* The assembly dispatcher code (dispatch-arch-os.S) is still + present. It still provides table lookup services for + indirect branches, but it also provides a new feature: + dispatch points, to which the generated code jumps. There + are 5: + + VG_(disp_cp_chain_me_to_slowEP): + VG_(disp_cp_chain_me_to_fastEP): + These are chain-me requests, used for Boring conditional and + unconditional jumps to destinations known at JIT time. The + generated code calls these (doesn't jump to them) and the + stub recovers the return address. These calls never return; + instead the call is done so that the stub knows where the + calling point is. It needs to know this so it can patch + the calling point to the requested destination. + VG_(disp_cp_xindir): + Old-style table lookup and go; used for indirect jumps + VG_(disp_cp_xassisted): + Most general and slowest kind. Can transfer to anywhere, but + first returns to scheduler to do some other event (eg a syscall) + before continuing. + VG_(disp_cp_evcheck_fail): + Code jumps here when the event check fails. + + +* new instructions in backends: XDirect, XIndir and XAssisted. + XDirect is used for chainable jumps. It is compiled into a + call to VG_(disp_cp_chain_me_to_slowEP) or + VG_(disp_cp_chain_me_to_fastEP). + + XIndir is used for indirect jumps. It is compiled into a jump + to VG_(disp_cp_xindir) + + XAssisted is used for "assisted" (do something first, then jump) + transfers. It is compiled into a jump to VG_(disp_cp_xassisted) + + All 3 of these may be conditional. + + More complexity: in some circumstances (no-redir translations) + all transfers must be done with XAssisted. In such cases the + instruction selector will be told this. + + +* Patching: XDirect is compiled basically into + %r11 = &VG_(disp_cp_chain_me_to_{slow,fast}EP) + call *%r11 + Backends must provide a function (eg) chainXDirect_AMD64 + which converts it into a jump to a specified destination + jmp $delta-of-PCs + or + %r11 = 64-bit immediate + jmpq *%r11 + depending on branch distance. + + Backends must provide a function (eg) unchainXDirect_AMD64 + which restores the original call-to-the-stub version. + + +* Event checks. Each translation now has two entry points, + the slow one (slowEP) and fast one (fastEP). Like this: + + slowEP: + counter-- + if (counter < 0) goto VG_(disp_cp_evcheck_fail) + fastEP: + (rest of the translation) + + slowEP is used for control flow transfers that are or might be + a back edge in the control flow graph. Insn selectors are + given the address of the highest guest byte in the block so + they can determine which edges are definitely not back edges. 
+ + The counter is placed in the first 8 bytes of the guest state, + and the address of VG_(disp_cp_evcheck_fail) is placed in + the next 8 bytes. This allows very compact checks on all + targets, since no immediates need to be synthesised, eg: + + decq 0(%baseblock-pointer) + jns fastEP + jmpq *8(baseblock-pointer) + fastEP: + + On amd64 a non-failing check is therefore 2 insns; all 3 occupy + just 8 bytes. + + On amd64 the event check is created by a special single + pseudo-instruction AMD64_EvCheck. + + +* BB profiling (for --profile-flags=). The dispatch assembly + dispatch-arch-os.S no longer deals with this and so is much + simplified. Instead the profile inc is compiled into each + translation, as the insn immediately following the event + check. Again, on amd64 a pseudo-insn AMD64_ProfInc is used. + Counters are now 64 bit even on 32 bit hosts, to avoid overflow. + + One complexity is that at JIT time it is not known where the + address of the counter is. To solve this, VexTranslateResult + now returns the offset of the profile inc in the generated + code. When the counter address is known, VEX can be called + again to patch it in. Backends must supply eg + patchProfInc_AMD64 to make this happen. + + +* Front end changes (guest_blah_toIR.c) + + The way the guest program counter is handled has changed + significantly. Previously, the guest PC was updated (in IR) + at the start of each instruction, except for the first insn + in an IRSB. This is inconsistent and doesn't work with the + new framework. + + Now, each instruction must update the guest PC as its last + IR statement -- not its first. And no special exemption for + the first insn in the block. As before most of these are + optimised out by ir_opt, so no concerns about efficiency. + + As a logical side effect of this, exits (IRStmt_Exit) and the + block-end transfer are both considered to write to the guest state + (the guest PC) and so need to be told the offset of it. + + IR generators (eg disInstr_AMD64) are no longer allowed to set the + IRSB::next, to specify the block-end transfer address. Instead they + now indicate, to the generic steering logic that drives them (iow, + guest_generic_bb_to_IR.c), that the block has ended. This then + generates effectively "goto GET(PC)" (which, again, is optimised + away). What this does mean is that if the IR generator function + ends the IR of the last instruction in the block with an incorrect + assignment to the guest PC, execution will transfer to an incorrect + destination -- making the error obvious quickly. diff --git a/drd/drd_load_store.c b/drd/drd_load_store.c index 996ee61173..3d99112f1f 100644 --- a/drd/drd_load_store.c +++ b/drd/drd_load_store.c @@ -593,6 +593,7 @@ IRSB* DRD_(instrument)(VgCallbackClosure* const closure, bb->tyenv = deepCopyIRTypeEnv(bb_in->tyenv); bb->next = deepCopyIRExpr(bb_in->next); bb->jumpkind = bb_in->jumpkind; + bb->offsIP = bb_in->offsIP; for (i = 0; i < bb_in->stmts_used; i++) { diff --git a/drd/tests/unit_bitmap.c b/drd/tests/unit_bitmap.c index b64c4964cb..6de61acdba 100644 --- a/drd/tests/unit_bitmap.c +++ b/drd/tests/unit_bitmap.c @@ -48,6 +48,8 @@ void* VG_(memset)(void *s, Int c, SizeT sz) { return memset(s, c, sz); } void* VG_(memcpy)(void *d, const void *s, SizeT sz) { return memcpy(d, s, sz); } +void* VG_(memmove)(void *d, const void *s, SizeT sz) +{ return memmove(d, s, sz); } Int VG_(memcmp)(const void* s1, const void* s2, SizeT n) { return memcmp(s1, s2, n); } UInt VG_(printf)(const HChar *format, ...) 
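The drd_load_store.c hunk above and the hg_main.c hunk below add one line each for the same reason: an instrumentation pass that builds a fresh output IRSB must now propagate the new offsIP field along with the other per-superblock metadata, otherwise the block-end transfer no longer knows where the guest PC lives. A minimal sketch of the copying step (the helper name is invented; the real tools do this inline):

   #include "pub_tool_tooliface.h"   /* pulls in the VEX IR types */

   /* Sketch: copy the per-superblock metadata from the input SB to
      the tool's output SB.  offsIP is the field newly required by
      t-chaining. */
   static void copy_sb_metadata ( IRSB* sbOut, IRSB* sbIn )
   {
      sbOut->tyenv    = deepCopyIRTypeEnv(sbIn->tyenv);
      sbOut->next     = deepCopyIRExpr(sbIn->next);
      sbOut->jumpkind = sbIn->jumpkind;
      sbOut->offsIP   = sbIn->offsIP;   /* new with t-chaining */
   }
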
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c index c1324db624..9bc36414b0 100644 --- a/helgrind/hg_main.c +++ b/helgrind/hg_main.c @@ -4317,6 +4317,7 @@ IRSB* hg_instrument ( VgCallbackClosure* closure, bbOut->tyenv = deepCopyIRTypeEnv(bbIn->tyenv); bbOut->next = deepCopyIRExpr(bbIn->next); bbOut->jumpkind = bbIn->jumpkind; + bbOut->offsIP = bbIn->offsIP; // Copy verbatim any IR preamble preceding the first IMark i = 0; diff --git a/include/pub_tool_xarray.h b/include/pub_tool_xarray.h index 9b699874db..cd34e79229 100644 --- a/include/pub_tool_xarray.h +++ b/include/pub_tool_xarray.h @@ -117,6 +117,12 @@ extern void VG_(dropTailXA) ( XArray*, Word ); is the number of elements remaining in the XArray. */ extern void VG_(dropHeadXA) ( XArray*, Word ); +/* Remove the specified element of an XArray, and slide all elements + beyond it back one place. This is an O(N) operation, where N is + the number of elements after the specified element, in the + array. */ +extern void VG_(removeIndexXA)( XArray*, Word ); + /* Make a new, completely independent copy of the given XArray, using the existing allocation function to allocate the new space. Returns NULL if the allocation function didn't manage to allocate diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am index f61e99f5b2..4dbefa246b 100644 --- a/memcheck/tests/Makefile.am +++ b/memcheck/tests/Makefile.am @@ -285,10 +285,10 @@ check_PROGRAMS = \ AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) -if VGCONF_PLATFORMS_INCLUDE_ARM_LINUX -AM_CFLAGS += -mfloat-abi=softfp -AM_CXXFLAGS += -mfloat-abi=softfp -endif +#if VGCONF_PLATFORMS_INCLUDE_ARM_LINUX +#AM_CFLAGS += -mfloat-abi=softfp +#AM_CXXFLAGS += -mfloat-abi=softfp +#endif if VGCONF_OS_IS_DARWIN atomic_incs_CFLAGS = $(AM_CFLAGS) -mdynamic-no-pic diff --git a/memcheck/tests/unit_oset.c b/memcheck/tests/unit_oset.c index 84f5ea25d1..854edf12c1 100644 --- a/memcheck/tests/unit_oset.c +++ b/memcheck/tests/unit_oset.c @@ -27,6 +27,7 @@ #define vgPlain_printf printf #define vgPlain_memset memset #define vgPlain_memcpy memcpy +#define vgPlain_memmove memmove // Crudely replace some functions (in m_xarray.c, but not needed for // this unit test) by (hopefully) failing asserts. diff --git a/none/tests/arm/Makefile.am b/none/tests/arm/Makefile.am index 013215ae80..ea25761c6a 100644 --- a/none/tests/arm/Makefile.am +++ b/none/tests/arm/Makefile.am @@ -39,14 +39,14 @@ v6intThumb_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 -mthumb v6media_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 -mthumb vfp_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \ - -mfpu=neon -mfloat-abi=softfp \ + -mfpu=neon \ -mthumb neon128_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \ - -mfpu=neon -mfloat-abi=softfp \ + -mfpu=neon \ -mthumb neon64_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \ - -mfpu=neon -mfloat-abi=softfp \ + -mfpu=neon \ -mthumb
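A closing note on the XArray change above: the VG_(removeIndexXA) declaration, together with the VG_(memmove) stubs the unit tests now need, suggests the remove-and-slide is a single memmove over the tail of the array. A self-contained sketch over a simplified array descriptor (not the real m_xarray.c code; the struct and names are invented here):

   #include <assert.h>
   #include <string.h>

   /* Invented stand-in for the real (opaque) XArray layout. */
   typedef struct {
      void*  arr;       /* the elements                   */
      size_t elemSzB;   /* size of one element, in bytes  */
      size_t usedE;     /* number of elements in use      */
   } MiniXA;

   /* Remove element 'index' and slide everything after it back one
      place: O(N) in the number of elements beyond 'index'. */
   static void mini_removeIndexXA ( MiniXA* xa, size_t index )
   {
      char* base = (char*)xa->arr;
      assert(index < xa->usedE);
      memmove(base + index       * xa->elemSzB,
              base + (index + 1) * xa->elemSzB,
              (xa->usedE - index - 1) * xa->elemSzB);
      xa->usedE--;
   }

It has to be memmove rather than memcpy because the source and destination regions overlap, which is presumably why the unit-test stubs above had to gain a VG_(memmove) wrapper.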