From: Julian Seward Date: Thu, 15 Dec 2005 15:46:43 +0000 (+0000) Subject: Rewrite amd64 dispatch loop to add performance enhancements as per x86 X-Git-Tag: svn/VALGRIND_3_2_0~509 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=bcc3feca7ece9975632f6b376c533b50c82f7333;p=thirdparty%2Fvalgrind.git Rewrite amd64 dispatch loop to add performance enhancements as per x86 reorganisation of r5345. git-svn-id: svn://svn.valgrind.org/valgrind/trunk@5346 --- diff --git a/coregrind/m_dispatch/dispatch-amd64-linux.S b/coregrind/m_dispatch/dispatch-amd64-linux.S index f3169a594f..5a0d07e802 100644 --- a/coregrind/m_dispatch/dispatch-amd64-linux.S +++ b/coregrind/m_dispatch/dispatch-amd64-linux.S @@ -1,8 +1,8 @@ -##--------------------------------------------------------------------## -##--- The core dispatch loop, for jumping to a code address. ---## -##--- dispatch-amd64.S ---## -##--------------------------------------------------------------------## +/*--------------------------------------------------------------------*/ +/*--- The core dispatch loop, for jumping to a code address. ---*/ +/*--- dispatch-amd64.S ---*/ +/*--------------------------------------------------------------------*/ /* This file is part of Valgrind, a dynamic binary instrumentation @@ -39,11 +39,19 @@ /*--- The dispatch loop. 
---*/ /*------------------------------------------------------------*/ -/* signature: UWord VG_(run_innerloop) ( void* guest_state ) */ +/*----------------------------------------------------*/ +/*--- Preamble (set everything up) ---*/ +/*----------------------------------------------------*/ +/* signature: +UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); +*/ + +.text .globl VG_(run_innerloop) VG_(run_innerloop): /* %rdi holds guest_state */ + /* %rsi holds do_profiling */ /* ----- entry point to VG_(run_innerloop) ----- */ pushq %rbx @@ -59,12 +67,13 @@ VG_(run_innerloop): pushq %r13 pushq %r14 pushq %r15 - pushq %rdi + pushq %rdi /* guest_state */ - movq VG_(dispatch_ctr)@GOTPCREL(%rip), %rsi - pushq (%rsi) + movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15 + movl (%r15), %r15d + pushq %r15 - /* 8(%rsp) holds cached copy of guest_state */ + /* 8(%rsp) holds cached copy of guest_state ptr */ /* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */ /* Set up the guest state pointer */ @@ -90,12 +99,26 @@ VG_(run_innerloop): /* set dir flag to known value */ cld - /* fall into main loop */ + /* fall into main loop (the right one) */ + cmpq $0, %rsi + je VG_(run_innerloop__dispatch_unprofiled) + jmp VG_(run_innerloop__dispatch_profiled) + /*NOTREACHED*/ + +/*----------------------------------------------------*/ +/*--- NO-PROFILING (standard) dispatcher ---*/ +/*----------------------------------------------------*/ - /* Here, %rax is the only live (real) register. The entire - simulated state is saved in the ThreadState. */ +.align 16 +.global VG_(run_innerloop__dispatch_unprofiled) +VG_(run_innerloop__dispatch_unprofiled): + /* AT ENTRY: %rax is next guest addr, %rbp is possibly + modified guest state ptr */ + + /* Has the guest state pointer been messed with? If yes, exit. 
*/ + cmpq 8(%rsp), %rbp + jnz gsp_changed -dispatch_boring: /* save the jump address in the guest state */ movq %rax, OFFSET_amd64_RIP(%rbp) @@ -104,37 +127,96 @@ dispatch_boring: jz counter_is_zero /* try a fast lookup in the translation cache */ - movq %rax, %rbx - andq $VG_TT_FAST_MASK, %rbx - movq VG_(tt_fast)@GOTPCREL(%rip), %rcx - movq (%rcx,%rbx,8), %rcx - cmpq %rax, (%rcx) - jnz fast_lookup_failed - /* increment bb profile counter */ - movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx - movq (%rdx,%rbx,8), %rdx - incl (%rdx) + movq VG_(tt_fast)@GOTPCREL(%rip), %rcx + movq %rax, %rbx + andq $VG_TT_FAST_MASK, %rbx + movq (%rcx,%rbx,8), %rcx + cmpq %rax, (%rcx) + jnz fast_lookup_failed /* Found a match. Call tce[1], which is 8 bytes along, since each tce element is a 64-bit int. */ addq $8, %rcx - call *%rcx + jmp *%rcx + ud2 /* persuade insn decoders not to speculate past here */ + /* generated code should run, then jump back to + VG_(run_innerloop__dispatch_unprofiled). */ + /*NOTREACHED*/ + +/*----------------------------------------------------*/ +/*--- PROFILING dispatcher (can be much slower) ---*/ +/*----------------------------------------------------*/ + +.align 16 +.global VG_(run_innerloop__dispatch_profiled) +VG_(run_innerloop__dispatch_profiled): + /* AT ENTRY: %rax is next guest addr, %rbp is possibly + modified guest state ptr */ + + /* Has the guest state pointer been messed with? If yes, exit. */ + cmpq 8(%rsp), %rbp + jnz gsp_changed - /* - %rax holds destination (original) address. - %rbp indicates further details of the control transfer - requested to the address in %rax. - - If rbp is unchanged (== * 8(%rsp)), just jump next to %rax. + /* save the jump address in the guest state */ + movq %rax, OFFSET_amd64_RIP(%rbp) - Otherwise fall out, back to the scheduler, and let it - figure out what to do next. - */ + /* Are we out of timeslice? If yes, defer to scheduler. 
*/ + subl $1, 0(%rsp) + jz counter_is_zero - cmpq 8(%rsp), %rbp - jz dispatch_boring + /* try a fast lookup in the translation cache */ + movq VG_(tt_fast)@GOTPCREL(%rip), %rcx + movq %rax, %rbx + andq $VG_TT_FAST_MASK, %rbx + movq (%rcx,%rbx,8), %rcx + cmpq %rax, (%rcx) + jnz fast_lookup_failed - jmp dispatch_exceptional + /* increment bb profile counter */ + movq VG_(tt_fastN)@GOTPCREL(%rip), %rdx + movq (%rdx,%rbx,8), %rdx + addl $1, (%rdx) + + /* Found a match. Jump to tce[1], which is 8 bytes along, since + each tce element is a 64-bit int. */ + addq $8, %rcx + jmp *%rcx + ud2 /* persuade insn decoders not to speculate past here */ + /* generated code should run, then jump back to + VG_(run_innerloop__dispatch_profiled). */ + /*NOTREACHED*/ + +/*----------------------------------------------------*/ +/*--- exit points ---*/ +/*----------------------------------------------------*/ + +gsp_changed: + /* Someone messed with the gsp. Have to + defer to scheduler to resolve this. dispatch ctr + is not yet decremented, so no need to increment. */ + /* %RIP is NOT up to date here. First, need to write + %rax back to %RIP, but without trashing %rbp since + that holds the value we want to return to the scheduler. + Hence use %r15 transiently for the guest state pointer. 
*/ + movq 8(%rsp), %r15 + movq %rax, OFFSET_amd64_RIP(%r15) + movq %rbp, %rax + jmp run_innerloop_exit + /*NOTREACHED*/ + +counter_is_zero: + /* %RIP is up to date here */ + /* back out decrement of the dispatch counter */ + addl $1, 0(%rsp) + movq $VG_TRC_INNER_COUNTERZERO, %rax + jmp run_innerloop_exit + +fast_lookup_failed: + /* %RIP is up to date here */ + /* back out decrement of the dispatch counter */ + addl $1, 0(%rsp) + movq $VG_TRC_INNER_FASTMISS, %rax + jmp run_innerloop_exit @@ -150,14 +232,14 @@ run_innerloop_exit: pushq $0 fstcw (%rsp) cmpl $0x027F, (%rsp) - popq %r11 /* get rid of the word without trashing %eflags */ + popq %r15 /* get rid of the word without trashing %eflags */ jnz invariant_violation #endif pushq $0 stmxcsr (%rsp) andl $0xFFFFFFC0, (%rsp) /* mask out status flags */ cmpl $0x1F80, (%rsp) - popq %r11 + popq %r15 jnz invariant_violation /* otherwise we're OK */ jmp run_innerloop_exit_REALLY @@ -167,8 +249,12 @@ invariant_violation: jmp run_innerloop_exit_REALLY run_innerloop_exit_REALLY: - movq VG_(dispatch_ctr)@GOTPCREL(%rip), %rsi - popq (%rsi) + + /* restore VG_(dispatch_ctr) */ + popq %r14 + movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15 + movl %r14d, (%r15) + popq %rdi popq %r15 popq %r14 @@ -190,31 +276,13 @@ run_innerloop_exit_REALLY: /* Other ways of getting out of the inner loop. Placed out-of-line to make it look cleaner. 
*/ -dispatch_exceptional: - /* this is jumped to only, not fallen-through from above */ - - /* save %rax in %RIP and defer to sched */ - movq 8(%rsp), %rdi - movq %rax, OFFSET_amd64_RIP(%rdi) - movq %rbp, %rax - jmp run_innerloop_exit -fast_lookup_failed: - /* %RIP is up to date here since dispatch_boring dominates */ - addl $1, 0(%rsp) - movq $VG_TRC_INNER_FASTMISS, %rax - jmp run_innerloop_exit -counter_is_zero: - /* %RIP is up to date here since dispatch_boring dominates */ - addl $1, 0(%rsp) - movq $VG_TRC_INNER_COUNTERZERO, %rax - jmp run_innerloop_exit /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",@progbits -##--------------------------------------------------------------------## -##--- end ---## -##--------------------------------------------------------------------## +/*--------------------------------------------------------------------*/ +/*--- end ---*/ +/*--------------------------------------------------------------------*/