From: Julian Seward Date: Fri, 25 Jan 2019 08:14:56 +0000 (+0100) Subject: Bug 402781 - Redo the cache used to process indirect branch targets. X-Git-Tag: VALGRIND_3_15_0~96 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=50bb127b1df8d31812141aafa567d325d1fbc1b3;p=thirdparty%2Fvalgrind.git Bug 402781 - Redo the cache used to process indirect branch targets. [This commit contains an implementation for all targets except amd64-solaris and x86-solaris, which will be completed shortly.] In the baseline simulator, jumps to guest code addresses that are not known at JIT time have to be looked up in a guest->host mapping table. That means: indirect branches, indirect calls and most commonly, returns. Since there are huge numbers of these (often 10+ million/second) the mapping mechanism needs to be extremely cheap. Currently, this is implemented using a direct-mapped cache, VG_(tt_fast), with 2^15 (guest_addr, host_addr) pairs. This is queried in handwritten assembly in VG_(disp_cp_xindir) in dispatch--.S. If there is a miss in the cache then we fall back out to C land, and do a slow lookup using VG_(search_transtab). Given that the size of the translation table(s) in recent years has expanded significantly in order to keep pace with increasing application sizes, two bad things have happened: (1) the cost of a miss in the fast cache has risen significantly, and (2) the miss rate on the fast cache has also increased significantly. This means that large (~ one-million-basic-blocks-JITted) applications that run for a long time end up spending a lot of time in VG_(search_transtab). The proposed fix is to increase associativity of the fast cache, from 1 (direct mapped) to 4. Simulations of various cache configurations using indirect-branch traces from a large application show that is the best of various configurations. In an extreme case with 5.7 billion indirect branches: * The increase of associativity from 1 way to 4 way, whilst keeping the overall cache size the same (32k guest/host pairs), reduces the miss rate by around a factor of 3, from 4.02% to 1.30%. * The use of a slightly better hash function than merely slicing off the bottom 15 bits of the address, reduces the miss rate further, from 1.30% to 0.53%. Overall the VG_(tt_fast) miss rate is almost unchanged on small workloads, but reduced by a factor of up to almost 8 on large workloads. By implementing each (4-entry) cache set using a move-to-front scheme in the case of hits in ways 1, 2 or 3, the vast majority of hits can be made to happen in way 0. Hence the cost of having this extra associativity is almost zero in the case of a hit. The improved hash function costs an extra 2 ALU shots (a shift and an xor) but overall this seems performance neutral to a win. --- diff --git a/coregrind/m_dispatch/dispatch-amd64-darwin.S b/coregrind/m_dispatch/dispatch-amd64-darwin.S index d5603065ee..ccf2b91696 100644 --- a/coregrind/m_dispatch/dispatch-amd64-darwin.S +++ b/coregrind/m_dispatch/dispatch-amd64-darwin.S @@ -201,33 +201,98 @@ VG_(disp_cp_chain_me_to_fastEP): jmp postamble /* ------ Indirect but boring jump ------ */ -.globl VG_(disp_cp_xindir) +.global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + /* Where are we going? 
*/ + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - movabsq $VG_(stats__n_xindirs_32), %r10 - addl $1, (%r10) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + movabsq $VG_(stats__n_xIndirs_32), %r8 + addl $1, (%r8) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 5 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + // %r8 (scratch address) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits1_32), %r8 + addl $1, (%r8) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits2_32), %r8 + addl $1, (%r8) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits3_32), %r8 + addl $1, (%r8) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - movabsq $VG_(stats__n_xindir_misses_32), %r10 - addl $1, (%r10) + movabsq $VG_(stats__n_xIndir_misses_32), %r8 + addl $1, (%r8) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-amd64-linux.S b/coregrind/m_dispatch/dispatch-amd64-linux.S index 62717d31d7..007c495f7c 100644 --- a/coregrind/m_dispatch/dispatch-amd64-linux.S +++ b/coregrind/m_dispatch/dispatch-amd64-linux.S @@ -205,28 +205,89 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 4 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-arm-linux.S b/coregrind/m_dispatch/dispatch-arm-linux.S index 3731c2ebd3..b61818c27d 100644 --- a/coregrind/m_dispatch/dispatch-arm-linux.S +++ b/coregrind/m_dispatch/dispatch-arm-linux.S @@ -154,36 +154,114 @@ VG_(disp_cp_xindir): ldr r0, [r8, #OFFSET_arm_R15T] /* stats only */ - movw r1, #:lower16:vgPlain_stats__n_xindirs_32 - movt r1, #:upper16:vgPlain_stats__n_xindirs_32 - ldr r2, [r1, #0] - add r2, r2, #1 - str r2, [r1, #0] + movw r4, #:lower16:VG_(stats__n_xIndirs_32) + movt r4, #:upper16:VG_(stats__n_xIndirs_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + + // LIVE: r8 (guest state ptr), r0 (guest address to go to). + // We use 6 temporaries: + // r6 (to point at the relevant FastCacheSet), + // r1, r2, r3 (scratch, for swapping entries within a set) + // r4, r5 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r6 = VG_TT_FAST_HASH(guest) + lsr r6, r0, #1 // g1 = guest >> 1 + eor r6, r6, r6, LSR #VG_TT_FAST_BITS // (g1 >> VG_TT_FAST_BITS) ^ g1 + ubfx r6, r6, #0, #VG_TT_FAST_BITS // setNo - /* try a fast lookup in the translation cache */ - // r0 = next guest, r1,r2,r3,r4 scratch - movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK + // Compute r6 = &VG_(tt_fast)[r6] movw r4, #:lower16:VG_(tt_fast) - - and r2, r1, r0, LSR #1 // r2 = entry # - movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast) - - add r1, r4, r2, LSL #3 // r1 = &tt_fast[entry#] - - ldrd r4, r5, [r1, #0] // r4 = .guest, r5 = .host - - cmp r4, r0 - - // jump to host if lookup succeeded - bxeq r5 - - /* otherwise the fast lookup failed */ - /* RM ME -- stats only */ - movw r1, #:lower16:vgPlain_stats__n_xindir_misses_32 - movt r1, #:upper16:vgPlain_stats__n_xindir_misses_32 - ldr r2, [r1, #0] - add r2, r2, #1 - str r2, [r1, #0] + movt r4, #:upper16:VG_(tt_fast) + add r6, r4, r6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo] + + // LIVE: r8 (guest state ptr), r0 (guest addr), r6 (cache set) + // try way 0 + ldr r4, [r6, #FCS_g0] // .guest0 + ldr r5, [r6, #FCS_h0] // .host0 + cmp r4, r0 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + bx r5 + /*NOTREACHED*/ + +1: // try way 1 + ldr r4, [r6, #FCS_g1] + cmp r4, r0 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ldr r1, [r6, #FCS_g0] // r1 = old .guest0 + ldr r2, [r6, #FCS_h0] // r2 = old .host0 + ldr r3, [r6, #FCS_h1] // r3 = old .host1 + str r0, [r6, #FCS_g0] // new .guest0 = guest + str r3, [r6, #FCS_h0] // new .host0 = old .host1 + str r1, [r6, #FCS_g1] // new .guest1 = old .guest0 + str r2, [r6, #FCS_h1] // new .host1 = old .host0 + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits1_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits1_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host1 a.k.a. new .host0 + bx r3 + /*NOTREACHED*/ + +2: // try way 2 + ldr r4, [r6, #FCS_g2] + cmp r4, r0 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ldr r1, [r6, #FCS_g1] + ldr r2, [r6, #FCS_h1] + ldr r3, [r6, #FCS_h2] + str r0, [r6, #FCS_g1] + str r3, [r6, #FCS_h1] + str r1, [r6, #FCS_g2] + str r2, [r6, #FCS_h2] + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits2_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits2_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host2 a.k.a. new .host1 + bx r3 + /*NOTREACHED*/ + +3: // try way 3 + ldr r4, [r6, #FCS_g3] + cmp r4, r0 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ldr r1, [r6, #FCS_g2] + ldr r2, [r6, #FCS_h2] + ldr r3, [r6, #FCS_h3] + str r0, [r6, #FCS_g2] + str r3, [r6, #FCS_h2] + str r1, [r6, #FCS_g3] + str r2, [r6, #FCS_h3] + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits3_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits3_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host3 a.k.a. 
new .host2 + bx r3 + /*NOTREACHED*/ + +4: // fast lookup failed + movw r4, #:lower16:VG_(stats__n_xIndir_misses_32) + movt r4, #:upper16:VG_(stats__n_xIndir_misses_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] mov r1, #VG_TRC_INNER_FASTMISS mov r2, #0 diff --git a/coregrind/m_dispatch/dispatch-arm64-linux.S b/coregrind/m_dispatch/dispatch-arm64-linux.S index ee289faf80..554fa9b1fd 100644 --- a/coregrind/m_dispatch/dispatch-arm64-linux.S +++ b/coregrind/m_dispatch/dispatch-arm64-linux.S @@ -173,42 +173,118 @@ VG_(disp_cp_chain_me_to_fastEP): /* ------ Indirect but boring jump ------ */ .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? */ + // Where are we going? ldr x0, [x21, #OFFSET_arm64_PC] - /* stats only */ - adrp x1, VG_(stats__n_xindirs_32) - add x1, x1, :lo12:VG_(stats__n_xindirs_32) - ldr w2, [x1, #0] - add w2, w2, #1 - str w2, [x1, #0] - - /* try a fast lookup in the translation cache */ - // x0 = next guest, x1,x2,x3,x4 scratch - mov x1, #VG_TT_FAST_MASK // x1 = VG_TT_FAST_MASK - and x2, x1, x0, LSR #2 // x2 = entry # = (x1 & (x0 >> 2)) - + // stats only + adrp x4, VG_(stats__n_xIndirs_32) + add x4, x4, :lo12:VG_(stats__n_xIndirs_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + + // LIVE: x21 (guest state ptr), x0 (guest address to go to). + // We use 6 temporaries: + // x6 (to point at the relevant FastCacheSet), + // x1, x2, x3 (scratch, for swapping entries within a set) + // x4, x5 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute x6 = VG_TT_FAST_HASH(guest) + lsr x6, x0, #2 // g2 = guest >> 2 + eor x6, x6, x6, LSR #VG_TT_FAST_BITS // (g2 >> VG_TT_FAST_BITS) ^ g2 + mov x4, #VG_TT_FAST_MASK // VG_TT_FAST_MASK + and x6, x6, x4 // setNo + + // Compute x6 = &VG_(tt_fast)[x6] adrp x4, VG_(tt_fast) - add x4, x4, :lo12:VG_(tt_fast) // x4 = &VG_(tt_fast) - - add x1, x4, x2, LSL #4 // r1 = &tt_fast[entry#] + add x4, x4, :lo12:VG_(tt_fast) // &VG_(tt_fast)[0] + add x6, x4, x6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo] + + // LIVE: x21 (guest state ptr), x0 (guest addr), x6 (cache set) + // try way 0 + ldp x4, x5, [x6, #FCS_g0] // x4 = .guest0, x5 = .host0 + cmp x4, x0 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + br x5 + /*NOTREACHED*/ - ldp x4, x5, [x1, #0] // x4 = .guest, x5 = .host +1: // try way 1 + ldr x4, [x6, #FCS_g1] + cmp x4, x0 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ldr x1, [x6, #FCS_g0] // x1 = old .guest0 + ldr x2, [x6, #FCS_h0] // x2 = old .host0 + ldr x3, [x6, #FCS_h1] // x3 = old .host1 + str x0, [x6, #FCS_g0] // new .guest0 = guest + str x3, [x6, #FCS_h0] // new .host0 = old .host1 + str x1, [x6, #FCS_g1] // new .guest1 = old .guest0 + str x2, [x6, #FCS_h1] // new .host1 = old .host0 + // stats only + adrp x4, VG_(stats__n_xIndir_hits1_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits1_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host1 a.k.a. 
new .host0 + br x3 + /*NOTREACHED*/ - cmp x4, x0 +2: // try way 2 + ldr x4, [x6, #FCS_g2] + cmp x4, x0 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ldr x1, [x6, #FCS_g1] + ldr x2, [x6, #FCS_h1] + ldr x3, [x6, #FCS_h2] + str x0, [x6, #FCS_g1] + str x3, [x6, #FCS_h1] + str x1, [x6, #FCS_g2] + str x2, [x6, #FCS_h2] + // stats only + adrp x4, VG_(stats__n_xIndir_hits2_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits2_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host2 a.k.a. new .host1 + br x3 + /*NOTREACHED*/ - // jump to host if lookup succeeded - bne fast_lookup_failed - br x5 +3: // try way 3 + ldr x4, [x6, #FCS_g3] + cmp x4, x0 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ldr x1, [x6, #FCS_g2] + ldr x2, [x6, #FCS_h2] + ldr x3, [x6, #FCS_h3] + str x0, [x6, #FCS_g2] + str x3, [x6, #FCS_h2] + str x1, [x6, #FCS_g3] + str x2, [x6, #FCS_h3] + // stats only + adrp x4, VG_(stats__n_xIndir_hits3_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits3_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host3 a.k.a. new .host2 + br x3 /*NOTREACHED*/ -fast_lookup_failed: - /* RM ME -- stats only */ - adrp x1, VG_(stats__n_xindir_misses_32) - add x1, x1, :lo12:VG_(stats__n_xindir_misses_32) - ldr w2, [x1, #0] - add w2, w2, #1 - str w2, [x1, #0] +4: // fast lookup failed + adrp x4, VG_(stats__n_xIndir_misses_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_misses_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] mov x1, #VG_TRC_INNER_FASTMISS mov x2, #0 diff --git a/coregrind/m_dispatch/dispatch-mips32-linux.S b/coregrind/m_dispatch/dispatch-mips32-linux.S index 9918403d5e..fdb1e29b00 100644 --- a/coregrind/m_dispatch/dispatch-mips32-linux.S +++ b/coregrind/m_dispatch/dispatch-mips32-linux.S @@ -175,47 +175,116 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - lw $11, OFFSET_mips32_PC($23) - - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - - /* try a fast lookup in the translation cache */ - /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*) - = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */ - - move $14, $11 - li $12, VG_TT_FAST_MASK - srl $14, $14, 2 - and $14, $14, $12 - sll $14, $14, 3 - - /* t2 = (addr of VG_(tt_fast)) + t1 */ - la $13, VG_(tt_fast) - addu $13, $13, $14 - - lw $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */ - addiu $13, $13, 4 - lw $25, 0($13) /* little-endian, so comparing 1st 32bit word */ - nop - -check: - bne $12, $11, fast_lookup_failed - /* run the translation */ - jr $25 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* %PC is up to date */ - /* back out decrement of the dispatch counter */ - /* hold dispatch_ctr in t0 (r8) */ - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - li $2, VG_TRC_INNER_FASTMISS - li $3, 0 - b postamble + lw $10, OFFSET_mips32_PC($23) + + /* stats only */ + lw $15, VG_(stats__n_xIndirs_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndirs_32) + + // LIVE: r23 (guest state ptr), r10 (guest address to go to). + // We use 6 temporaries: + // r16 (to point at the relevant FastCacheSet), + // r11, r12, r13 (scratch, for swapping entries within a set) + // r14, r15 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r16 = VG_TT_FAST_HASH(guest) + srl $16, $10, 2 // g2 = guest >> 2 + srl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2 + li $15, VG_TT_FAST_MASK + and $16, $16, $15 // setNo + + // Compute r16 = &VG_(tt_fast)[r16] + la $15, VG_(tt_fast) + sll $16, $16, VG_FAST_CACHE_SET_BITS + addu $16, $16, $15 + + // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set) + // try way 0 + lw $14, FCS_g0($16) // .guest0 + lw $15, FCS_h0($16) // .host0 + bne $14, $10, 1f // cmp against .guest0 + // hit at way 0 + // goto .host0 + jr $15 + /*NOTREACHED*/ + .long 0x0 + +1: // try way 1 + lw $14, FCS_g1($16) + bne $14, $10, 2f // cmp against .guest1 + // hit at way 1; swap upwards + lw $11, FCS_g0($16) // $11 = old .guest0 + lw $12, FCS_h0($16) // $12 = old .host0 + lw $13, FCS_h1($16) // $13 = old .host1 + sw $10, FCS_g0($16) // new .guest0 = guest + sw $13, FCS_h0($16) // new .host0 = old .host1 + sw $11, FCS_g1($16) // new .guest1 = old .guest0 + sw $12, FCS_h1($16) // new .host1 = old .host0 + // stats only + lw $15, VG_(stats__n_xIndir_hits1_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits1_32) + // goto old .host1 a.k.a. new .host0 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +2: // try way 2 + lw $14, FCS_g2($16) + bne $14, $10, 3f // cmp against .guest2 + // hit at way 2; swap upwards + lw $11, FCS_g1($16) + lw $12, FCS_h1($16) + lw $13, FCS_h2($16) + sw $10, FCS_g1($16) + sw $13, FCS_h1($16) + sw $11, FCS_g2($16) + sw $12, FCS_h2($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits2_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits2_32) + // goto old .host2 a.k.a. new .host1 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +3: // try way 3 + lw $14, FCS_g3($16) + bne $14, $10, 4f // cmp against .guest3 + // hit at way 3; swap upwards + lw $11, FCS_g2($16) + lw $12, FCS_h2($16) + lw $13, FCS_h3($16) + sw $10, FCS_g2($16) + sw $13, FCS_h2($16) + sw $11, FCS_g3($16) + sw $12, FCS_h3($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits3_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits3_32) + // goto old .host3 a.k.a. new .host2 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +4: // fast lookup failed: + /* stats only */ + lw $15, VG_(stats__n_xIndir_misses_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_misses_32) + + li $2, VG_TRC_INNER_FASTMISS + li $3, 0 + b postamble + /*NOTREACHED*/ + .long 0x0 /* ------ Assisted jump ------ */ .global VG_(disp_cp_xassisted) diff --git a/coregrind/m_dispatch/dispatch-mips64-linux.S b/coregrind/m_dispatch/dispatch-mips64-linux.S index 4a2b1b734e..5d1efd622d 100644 --- a/coregrind/m_dispatch/dispatch-mips64-linux.S +++ b/coregrind/m_dispatch/dispatch-mips64-linux.S @@ -182,47 +182,116 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? 
*/ - ld $11, OFFSET_mips64_PC($23) - - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - - /* try a fast lookup in the translation cache */ - /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*) - = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */ - - move $14, $11 - li $12, VG_TT_FAST_MASK - srl $14, $14, 2 - and $14, $14, $12 - sll $14, $14, 3 - - /* t2 = (addr of VG_(tt_fast)) + t1 */ - dla $13, VG_(tt_fast) - daddu $13, $13, $14 - - ld $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */ - daddiu $13, $13, 8 - ld $25, 0($13) /* little-endian, so comparing 1st 32bit word */ - nop - -check: - bne $12, $11, fast_lookup_failed - /* run the translation */ - jr $25 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* %PC is up to date */ - /* back out decrement of the dispatch counter */ - /* hold dispatch_ctr in t0 (r8) */ - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - li $2, VG_TRC_INNER_FASTMISS - li $3, 0 - b postamble + ld $10, OFFSET_mips64_PC($23) + + /* stats only */ + lw $15, VG_(stats__n_xIndirs_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndirs_32) + + // LIVE: r23 (guest state ptr), r10 (guest address to go to). + // We use 6 temporaries: + // r16 (to point at the relevant FastCacheSet), + // r11, r12, r13 (scratch, for swapping entries within a set) + // r14, r15 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r16 = VG_TT_FAST_HASH(guest) + dsrl $16, $10, 2 // g2 = guest >> 2 + dsrl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2 + li $15, VG_TT_FAST_MASK + and $16, $16, $15 // setNo + + // Compute r16 = &VG_(tt_fast)[r16] + dla $15, VG_(tt_fast) + dsll $16, $16, VG_FAST_CACHE_SET_BITS + daddu $16, $16, $15 + + // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set) + // try way 0 + ld $14, FCS_g0($16) // .guest0 + ld $15, FCS_h0($16) // .host0 + bne $14, $10, 1f // cmp against .guest0 + // hit at way 0 + // goto .host0 + jr $15 + /*NOTREACHED*/ + .long 0x0 + +1: // try way 1 + ld $14, FCS_g1($16) + bne $14, $10, 2f // cmp against .guest1 + // hit at way 1; swap upwards + ld $11, FCS_g0($16) // $11 = old .guest0 + ld $12, FCS_h0($16) // $12 = old .host0 + ld $13, FCS_h1($16) // $13 = old .host1 + sd $10, FCS_g0($16) // new .guest0 = guest + sd $13, FCS_h0($16) // new .host0 = old .host1 + sd $11, FCS_g1($16) // new .guest1 = old .guest0 + sd $12, FCS_h1($16) // new .host1 = old .host0 + // stats only + lw $15, VG_(stats__n_xIndir_hits1_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits1_32) + // goto old .host1 a.k.a. new .host0 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +2: // try way 2 + ld $14, FCS_g2($16) + bne $14, $10, 3f // cmp against .guest2 + // hit at way 2; swap upwards + ld $11, FCS_g1($16) + ld $12, FCS_h1($16) + ld $13, FCS_h2($16) + sd $10, FCS_g1($16) + sd $13, FCS_h1($16) + sd $11, FCS_g2($16) + sd $12, FCS_h2($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits2_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits2_32) + // goto old .host2 a.k.a. 
new .host1 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +3: // try way 3 + ld $14, FCS_g3($16) + bne $14, $10, 4f // cmp against .guest3 + // hit at way 3; swap upwards + ld $11, FCS_g2($16) + ld $12, FCS_h2($16) + ld $13, FCS_h3($16) + sd $10, FCS_g2($16) + sd $13, FCS_h2($16) + sd $11, FCS_g3($16) + sd $12, FCS_h3($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits3_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits3_32) + // goto old .host3 a.k.a. new .host2 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +4: // fast lookup failed: + /* stats only */ + lw $15, VG_(stats__n_xIndir_misses_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_misses_32) + + li $2, VG_TRC_INNER_FASTMISS + li $3, 0 + b postamble + /*NOTREACHED*/ + .long 0x0 /* ------ Assisted jump ------ */ .global VG_(disp_cp_xassisted) diff --git a/coregrind/m_dispatch/dispatch-ppc32-linux.S b/coregrind/m_dispatch/dispatch-ppc32-linux.S index 432306bf44..d3ff2d11e9 100644 --- a/coregrind/m_dispatch/dispatch-ppc32-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc32-linux.S @@ -437,44 +437,128 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - lwz 3,OFFSET_ppc32_CIA(31) + lwz 20, OFFSET_ppc32_CIA(31) /* stats only */ - lis 5,VG_(stats__n_xindirs_32)@ha - addi 5,5,VG_(stats__n_xindirs_32)@l - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) + lis 24, VG_(stats__n_xIndirs_32)@ha + addi 24, 24, VG_(stats__n_xIndirs_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srwi 26, 20, 2 // g2 = guest >> 2 + srwi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 26, 26, VG_TT_FAST_MASK // setNo - /* r5 = &VG_(tt_fast) */ - lis 5,VG_(tt_fast)@ha - addi 5,5,VG_(tt_fast)@l /* & VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 3 */ - rlwinm 4,3,1, 29-VG_TT_FAST_BITS, 28 /* entry# * 8 */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - lwz 6,0(5) /* .guest */ - lwz 7,4(5) /* .host */ - cmpw 3,6 - bne fast_lookup_failed - - /* Found a match. Jump to .host. 
*/ - mtctr 7 + // Compute r6 = &VG_(tt_fast)[r6] + lis 25, VG_(tt_fast)@ha + addi 25, 25, VG_(tt_fast)@l + slwi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + lwz 24, FCS_g0(26) // .guest0 + lwz 25, FCS_h0(26) // .host0 + cmpw 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr + /*NOTREACHED*/ + +1: // try way 1 + lwz 24, FCS_g1(26) + cmpw 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + lwz 21, FCS_g0(26) // 21 = old .guest0 + lwz 22, FCS_h0(26) // 22 = old .host0 + lwz 23, FCS_h1(26) // 23 = old .host1 + stw 20, FCS_g0(26) // new .guest0 = guest + stw 23, FCS_h0(26) // new .host0 = old .host1 + stw 21, FCS_g1(26) // new .guest1 = old .guest0 + stw 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + lis 24, VG_(stats__n_xIndir_hits1_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits1_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + lwz 24, FCS_g2(26) + cmpw 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + lwz 21, FCS_g1(26) + lwz 22, FCS_h1(26) + lwz 23, FCS_h2(26) + stw 20, FCS_g1(26) + stw 23, FCS_h1(26) + stw 21, FCS_g2(26) + stw 22, FCS_h2(26) + // stats only + lis 24, VG_(stats__n_xIndir_hits2_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits2_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + lwz 24, FCS_g3(26) + cmpw 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + lwz 21, FCS_g2(26) + lwz 22, FCS_h2(26) + lwz 23, FCS_h3(26) + stw 20, FCS_g2(26) + stw 23, FCS_h2(26) + stw 21, FCS_g3(26) + stw 22, FCS_h3(26) + // stats only + lis 24, VG_(stats__n_xIndir_hits3_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits3_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. 
new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - lis 5,VG_(stats__n_xindir_misses_32)@ha - addi 5,5,VG_(stats__n_xindir_misses_32)@l - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b postamble + lis 24, VG_(stats__n_xIndir_misses_32)@ha + addi 24, 24, VG_(stats__n_xIndir_misses_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + li 6, VG_TRC_INNER_FASTMISS + li 7, 0 + b postamble /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-ppc64be-linux.S b/coregrind/m_dispatch/dispatch-ppc64be-linux.S index 91bd3b236d..c5592d4f31 100644 --- a/coregrind/m_dispatch/dispatch-ppc64be-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc64be-linux.S @@ -45,14 +45,27 @@ .type vgPlain_tt_fast, @object */ .section ".toc","aw" + .tocent__vgPlain_tt_fast: .tc vgPlain_tt_fast[TC],vgPlain_tt_fast -.tocent__vgPlain_stats__n_xindirs_32: - .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32 -.tocent__vgPlain_stats__n_xindir_misses_32: - .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32 + +.tocent__vgPlain_stats__n_xIndirs_32: + .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32 + +.tocent__vgPlain_stats__n_xIndir_hits1_32: + .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32 + +.tocent__vgPlain_stats__n_xIndir_hits2_32: + .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32 + +.tocent__vgPlain_stats__n_xIndir_hits3_32: + .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32 + +.tocent__vgPlain_stats__n_xIndir_misses_32: + .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32 + .tocent__vgPlain_machine_ppc64_has_VMX: - .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX + .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX /*------------------------------------------------------------*/ /*--- ---*/ @@ -454,42 +467,122 @@ VG_(disp_cp_xindir): .globl .VG_(disp_cp_xindir) .VG_(disp_cp_xindir): /* Where are we going? */ - ld 3,OFFSET_ppc64_CIA(31) + ld 20, OFFSET_ppc64_CIA(31) /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - /* r5 = &VG_(tt_fast) */ - ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */ - rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */ - sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - ld 6,0(5) /* .guest */ - ld 7,8(5) /* .host */ - cmpd 3,6 - bne .fast_lookup_failed - - /* Found a match. Jump to .host. */ - mtctr 7 + ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srdi 26, 20, 2 // g2 = guest >> 2 + srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 
26, 26, VG_TT_FAST_MASK // setNo + + // Compute r6 = &VG_(tt_fast)[r6] + ld 25, .tocent__vgPlain_tt_fast@toc(2) + sldi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + ld 24, FCS_g0(26) // .guest0 + ld 25, FCS_h0(26) // .host0 + cmpd 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr + /*NOTREACHED*/ + +1: // try way 1 + ld 24, FCS_g1(26) + cmpd 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ld 21, FCS_g0(26) // 21 = old .guest0 + ld 22, FCS_h0(26) // 22 = old .host0 + ld 23, FCS_h1(26) // 23 = old .host1 + std 20, FCS_g0(26) // new .guest0 = guest + std 23, FCS_h0(26) // new .host0 = old .host1 + std 21, FCS_g1(26) // new .guest1 = old .guest0 + std 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + ld 24, FCS_g2(26) + cmpd 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ld 21, FCS_g1(26) + ld 22, FCS_h1(26) + ld 23, FCS_h2(26) + std 20, FCS_g1(26) + std 23, FCS_h1(26) + std 21, FCS_g2(26) + std 22, FCS_h2(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + ld 24, FCS_g3(26) + cmpd 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ld 21, FCS_g2(26) + ld 22, FCS_h2(26) + ld 23, FCS_h3(26) + std 20, FCS_g2(26) + std 23, FCS_h2(26) + std 21, FCS_g3(26) + std 22, FCS_h3(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. 
new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -.fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b .postamble + ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2) + lwz 25, 0(24) + addi 25 ,25, 1 + stw 25 ,0(24) + + li 6,VG_TRC_INNER_FASTMISS + li 7,0 + b .postamble /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-ppc64le-linux.S b/coregrind/m_dispatch/dispatch-ppc64le-linux.S index 21e43584d8..3e26d7715c 100644 --- a/coregrind/m_dispatch/dispatch-ppc64le-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc64le-linux.S @@ -54,14 +54,27 @@ .type vgPlain_tt_fast, @object */ .section ".toc","aw" + .tocent__vgPlain_tt_fast: .tc vgPlain_tt_fast[TC],vgPlain_tt_fast -.tocent__vgPlain_stats__n_xindirs_32: - .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32 -.tocent__vgPlain_stats__n_xindir_misses_32: - .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32 + +.tocent__vgPlain_stats__n_xIndirs_32: + .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32 + +.tocent__vgPlain_stats__n_xIndir_hits1_32: + .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32 + +.tocent__vgPlain_stats__n_xIndir_hits2_32: + .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32 + +.tocent__vgPlain_stats__n_xIndir_hits3_32: + .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32 + +.tocent__vgPlain_stats__n_xIndir_misses_32: + .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32 + .tocent__vgPlain_machine_ppc64_has_VMX: - .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX + .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX /*------------------------------------------------------------*/ /*--- ---*/ @@ -518,47 +531,127 @@ VG_(disp_cp_xindir): addi 2,2,.TOC.-0b@l .localentry VG_(disp_cp_xindir), .-VG_(disp_cp_xindir) #endif - /* Where are we going? */ - ld 3,OFFSET_ppc64_CIA(31) + /* Where are we going? */ + ld 20, OFFSET_ppc64_CIA(31) /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - /* r5 = &VG_(tt_fast) */ - ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */ - rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */ - sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - ld 6,0(5) /* .guest */ - ld 7,8(5) /* .host */ - cmpd 3,6 - bne .fast_lookup_failed - - /* Found a match. Jump to .host. */ - mtctr 7 + ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srdi 26, 20, 2 // g2 = guest >> 2 + srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 
26, 26, VG_TT_FAST_MASK // setNo + + // Compute r6 = &VG_(tt_fast)[r6] + ld 25, .tocent__vgPlain_tt_fast@toc(2) + sldi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + ld 24, FCS_g0(26) // .guest0 + ld 25, FCS_h0(26) // .host0 + cmpd 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr -#if _CALL_ELF == 2 - .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir) -#endif + /*NOTREACHED*/ + +1: // try way 1 + ld 24, FCS_g1(26) + cmpd 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ld 21, FCS_g0(26) // 21 = old .guest0 + ld 22, FCS_h0(26) // 22 = old .host0 + ld 23, FCS_h1(26) // 23 = old .host1 + std 20, FCS_g0(26) // new .guest0 = guest + std 23, FCS_h0(26) // new .host0 = old .host1 + std 21, FCS_g1(26) // new .guest1 = old .guest0 + std 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + ld 24, FCS_g2(26) + cmpd 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ld 21, FCS_g1(26) + ld 22, FCS_h1(26) + ld 23, FCS_h2(26) + std 20, FCS_g1(26) + std 23, FCS_h1(26) + std 21, FCS_g2(26) + std 22, FCS_h2(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + ld 24, FCS_g3(26) + cmpd 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ld 21, FCS_g2(26) + ld 22, FCS_h2(26) + ld 23, FCS_h3(26) + std 20, FCS_g2(26) + std 23, FCS_h2(26) + std 21, FCS_g3(26) + std 22, FCS_h3(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -.fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b .postamble + ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2) + lwz 25, 0(24) + addi 25 ,25, 1 + stw 25 ,0(24) + + li 6,VG_TRC_INNER_FASTMISS + li 7,0 + b .postamble /*NOTREACHED*/ +#if _CALL_ELF == 2 + .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir) +#endif /* ------ Assisted jump ------ */ .section ".text" diff --git a/coregrind/m_dispatch/dispatch-s390x-linux.S b/coregrind/m_dispatch/dispatch-s390x-linux.S index 83c2e2a1da..c31e32a218 100644 --- a/coregrind/m_dispatch/dispatch-s390x-linux.S +++ b/coregrind/m_dispatch/dispatch-s390x-linux.S @@ -197,54 +197,121 @@ VG_(disp_cp_chain_me_to_fastEP): /* ------ Indirect but boring jump ------ */ .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? 
*/ - lg %r2, OFFSET_s390x_IA(%r13) - - /* Increment VG_(stats__n_xindirs_32) */ - larl %r8, VG_(stats__n_xindirs_32) - l %r10,0(%r8) - ahi %r10,1 - st %r10,0(%r8) - - /* Try a fast lookup in the translation cache: - Compute offset (not index) into VT_(tt_fast): - - offset = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - - with VG_TT_FAST_HASH(addr) == (addr >> 1) & VG_TT_FAST_MASK - and sizeof(FastCacheEntry) == 16 - - offset = ((addr >> 1) & VG_TT_FAST_MASK) << 4 - which is - offset = ((addr & (VG_TT_FAST_MASK << 1) ) << 3 - */ - larl %r8, VG_(tt_fast) - llill %r5,(VG_TT_FAST_MASK << 1) & 0xffff -#if ((( VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 != 0) - iilh %r5,((VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 -#endif - ngr %r5,%r2 - sllg %r7,%r5,3 - lg %r11, 8(%r8,%r7) /* .host */ - cg %r2, 0(%r8,%r7) /* next guest address == .guest ? */ - jne fast_lookup_failed - - /* Found a match. Call .host. - r11 is an address. There we will find the instrumented client code. - That code may modify the guest state register r13. */ - br %r11 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* Increment VG_(stats__n_xindir_misses_32) */ - larl %r8, VG_(stats__n_xindir_misses_32) - l %r10,0(%r8) - ahi %r10,1 - st %r10,0(%r8) - - lghi %r0,VG_TRC_INNER_FASTMISS - lghi %r1,0 + /* Where are we going? */ + lg %r6, OFFSET_s390x_IA(%r13) // "guest" + + /* stats only */ + larl %r11, VG_(stats__n_xIndirs_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + + // LIVE: r13 (guest state ptr), r6 (guest address to go to). + // We use 6 temporaries: + // r7 (to point at the relevant FastCacheSet), + // r8, r9, r10 (scratch, for swapping entries within a set) + // r11, r12 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r7 = VG_TT_FAST_HASH(guest) + srlg %r7, %r6, 1 // g1 = guest >> 1 + srlg %r8, %r6, (VG_TT_FAST_BITS + 1) // (g1 >> VG_TT_FAST_BITS) + xgr %r7, %r8 // (g1 >> VG_TT_FAST_BITS) ^ g1 + llill %r8, VG_TT_FAST_MASK & 0xffff +# if ((VG_TT_FAST_MASK & 0xffff0000) >> 16 != 0) + iilh %r8, (VG_TT_FAST_MASK & 0xffff0000) >> 16 +# endif + ngr %r7, %r8 // setNo + + // Compute %r7 = &VG_(tt_fast)[%r7] + sllg %r7,%r7, VG_FAST_CACHE_SET_BITS // setNo * sizeof(FastCacheSet) + larl %r8, VG_(tt_fast) // &VG_(tt_fast)[0] + agr %r7, %r8 // &VG_(tt_fast)[setNo] + + // LIVE: %r13 (guest state ptr), %r6 (guest addr), %r7 (cache set) + // try way 0 + cg %r6, FCS_g0(%r7) // cmp against .guest0 + lg %r8, FCS_h0(%r7) + jne 1f + // hit at way 0 + // goto .host0 + br %r8 + /*NOTREACHED*/ + .long 0 + +1: // try way 1 + cg %r6, FCS_g1(%r7) // cmp against .guest1 + jne 2f + // hit at way 1; swap upwards + lg %r8, FCS_g0(%r7) // r8 = old .guest0 + lg %r9, FCS_h0(%r7) // r9 = old .host0 + lg %r10, FCS_h1(%r7) // r10 = old .host1 + stg %r6, FCS_g0(%r7) // new .guest0 = guest + stg %r10, FCS_h0(%r7) // new .host0 = old .host1 + stg %r8, FCS_g1(%r7) // new .guest1 = old .guest0 + stg %r9, FCS_h1(%r7) // new .host1 = old .host0 + // stats only + larl %r11, VG_(stats__n_xIndir_hits1_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host1 a.k.a. 
new .host0 + br %r10 + /*NOTREACHED*/ + .long 0 + +2: // try way 2 + cg %r6, FCS_g2(%r7) // cmp against .guest2 + jne 3f + lg %r8, FCS_g1(%r7) + lg %r9, FCS_h1(%r7) + lg %r10, FCS_h2(%r7) + stg %r6, FCS_g1(%r7) + stg %r10, FCS_h1(%r7) + stg %r8, FCS_g2(%r7) + stg %r9, FCS_h2(%r7) + // stats only + larl %r11, VG_(stats__n_xIndir_hits2_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host2 a.k.a. new .host1 + br %r10 + /*NOTREACHED*/ + .long 0 + +3: // try way 3 + cg %r6, FCS_g3(%r7) // cmp against .guest3 + jne 4f + // hit at way 3; swap upwards + lg %r8, FCS_g2(%r7) + lg %r9, FCS_h2(%r7) + lg %r10, FCS_h3(%r7) + stg %r6, FCS_g2(%r7) + stg %r10, FCS_h2(%r7) + stg %r8, FCS_g3(%r7) + stg %r9, FCS_h3(%r7) + // stats only + larl %r11, VG_(stats__n_xIndir_hits3_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host3 a.k.a. new .host2 + br %r10 + .long 0 + +4: // fast lookup failed + larl %r11, VG_(stats__n_xIndir_misses_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + + lghi %r0, VG_TRC_INNER_FASTMISS + lghi %r1, 0 j postamble + /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-x86-darwin.S b/coregrind/m_dispatch/dispatch-x86-darwin.S index 55188e9c58..467d7d62de 100644 --- a/coregrind/m_dispatch/dispatch-x86-darwin.S +++ b/coregrind/m_dispatch/dispatch-x86-darwin.S @@ -194,29 +194,91 @@ VG_(disp_cp_chain_me_to_fastEP): jmp postamble /* ------ Indirect but boring jump ------ */ -.globl VG_(disp_cp_xindir) +.global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movl OFFSET_x86_EIP(%ebp), %eax + movl OFFSET_x86_EIP(%ebp), %eax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %ebp (guest state ptr), %eax (guest address to go to). + // We use 4 temporaries: + // %esi (to point at the relevant FastCacheSet), + // %ebx, %ecx and %edx (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute %esi = VG_TT_FAST_HASH(guest) + movl %eax, %esi // guest + shrl $VG_TT_FAST_BITS, %esi // (guest >> VG_TT_FAST_BITS) + xorl %eax, %esi // (guest >> VG_TT_FAST_BITS) ^ guest + andl $VG_TT_FAST_MASK, %esi // setNo + + // Compute %esi = &VG_(tt_fast)[%esi] + shll $VG_FAST_CACHE_SET_BITS, %esi // setNo * sizeof(FastCacheSet) + leal VG_(tt_fast)(%esi), %esi // &VG_(tt_fast)[setNo] + + // LIVE: %ebp (guest state ptr), %eax (guest addr), %esi (cache set) + // try way 0 + cmpl %eax, FCS_g0(%esi) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%esi) // goto .host0 + ud2 + +1: // try way 1 + cmpl %eax, FCS_g1(%esi) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movl FCS_g0(%esi), %ebx // ebx = old .guest0 + movl FCS_h0(%esi), %ecx // ecx = old .host0 + movl FCS_h1(%esi), %edx // edx = old .host1 + movl %eax, FCS_g0(%esi) // new .guest0 = guest + movl %edx, FCS_h0(%esi) // new .host0 = old .host1 + movl %ebx, FCS_g1(%esi) // new .guest1 = old .guest0 + movl %ecx, FCS_h1(%esi) // new .host1 = old .host0 + jmp *%edx // goto old .host1 a.k.a. new .host0 + ud2 + +2: // try way 2 + cmpl %eax, FCS_g2(%esi) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movl FCS_g1(%esi), %ebx + movl FCS_h1(%esi), %ecx + movl FCS_h2(%esi), %edx + movl %eax, FCS_g1(%esi) + movl %edx, FCS_h1(%esi) + movl %ebx, FCS_g2(%esi) + movl %ecx, FCS_h2(%esi) + jmp *%edx + ud2 + +3: // try way 3 + cmpl %eax, FCS_g3(%esi) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movl FCS_g2(%esi), %ebx + movl FCS_h2(%esi), %ecx + movl FCS_h3(%esi), %edx + movl %eax, FCS_g2(%esi) + movl %edx, FCS_h2(%esi) + movl %ebx, FCS_g3(%esi) + movl %ecx, FCS_h3(%esi) + jmp *%edx + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movl $VG_TRC_INNER_FASTMISS, %eax movl $0, %edx diff --git a/coregrind/m_dispatch/dispatch-x86-linux.S b/coregrind/m_dispatch/dispatch-x86-linux.S index d949f1fd3e..7270744db0 100644 --- a/coregrind/m_dispatch/dispatch-x86-linux.S +++ b/coregrind/m_dispatch/dispatch-x86-linux.S @@ -198,26 +198,88 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movl OFFSET_x86_EIP(%ebp), %eax + movl OFFSET_x86_EIP(%ebp), %eax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %ebp (guest state ptr), %eax (guest address to go to). + // We use 4 temporaries: + // %esi (to point at the relevant FastCacheSet), + // %ebx, %ecx and %edx (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute %esi = VG_TT_FAST_HASH(guest) + movl %eax, %esi // guest + shrl $VG_TT_FAST_BITS, %esi // (guest >> VG_TT_FAST_BITS) + xorl %eax, %esi // (guest >> VG_TT_FAST_BITS) ^ guest + andl $VG_TT_FAST_MASK, %esi // setNo + + // Compute %esi = &VG_(tt_fast)[%esi] + shll $VG_FAST_CACHE_SET_BITS, %esi // setNo * sizeof(FastCacheSet) + leal VG_(tt_fast)(%esi), %esi // &VG_(tt_fast)[setNo] + + // LIVE: %ebp (guest state ptr), %eax (guest addr), %esi (cache set) + // try way 0 + cmpl %eax, FCS_g0(%esi) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%esi) // goto .host0 + ud2 + +1: // try way 1 + cmpl %eax, FCS_g1(%esi) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movl FCS_g0(%esi), %ebx // ebx = old .guest0 + movl FCS_h0(%esi), %ecx // ecx = old .host0 + movl FCS_h1(%esi), %edx // edx = old .host1 + movl %eax, FCS_g0(%esi) // new .guest0 = guest + movl %edx, FCS_h0(%esi) // new .host0 = old .host1 + movl %ebx, FCS_g1(%esi) // new .guest1 = old .guest0 + movl %ecx, FCS_h1(%esi) // new .host1 = old .host0 + jmp *%edx // goto old .host1 a.k.a. new .host0 + ud2 + +2: // try way 2 + cmpl %eax, FCS_g2(%esi) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movl FCS_g1(%esi), %ebx + movl FCS_h1(%esi), %ecx + movl FCS_h2(%esi), %edx + movl %eax, FCS_g1(%esi) + movl %edx, FCS_h1(%esi) + movl %ebx, FCS_g2(%esi) + movl %ecx, FCS_h2(%esi) + jmp *%edx + ud2 + +3: // try way 3 + cmpl %eax, FCS_g3(%esi) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movl FCS_g2(%esi), %ebx + movl FCS_h2(%esi), %ecx + movl FCS_h3(%esi), %edx + movl %eax, FCS_g2(%esi) + movl %edx, FCS_h2(%esi) + movl %ebx, FCS_g3(%esi) + movl %ecx, FCS_h3(%esi) + jmp *%edx + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movl $VG_TRC_INNER_FASTMISS, %eax movl $0, %edx diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c index 68e9590a01..bd266e4f39 100644 --- a/coregrind/m_scheduler/scheduler.c +++ b/coregrind/m_scheduler/scheduler.c @@ -130,16 +130,23 @@ static void mostly_clear_thread_record ( ThreadId tid ); static ULong n_scheduling_events_MINOR = 0; static ULong n_scheduling_events_MAJOR = 0; -/* Stats: number of XIndirs, and number that missed in the fast - cache. */ -static ULong stats__n_xindirs = 0; -static ULong stats__n_xindir_misses = 0; +/* Stats: number of XIndirs looked up in the fast cache, the number of hits in + ways 1, 2 and 3, and the number of misses. The number of hits in way 0 isn't + recorded because it can be computed from these five numbers. */ +static ULong stats__n_xIndirs = 0; +static ULong stats__n_xIndir_hits1 = 0; +static ULong stats__n_xIndir_hits2 = 0; +static ULong stats__n_xIndir_hits3 = 0; +static ULong stats__n_xIndir_misses = 0; /* And 32-bit temp bins for the above, so that 32-bit platforms don't have to do 64 bit incs on the hot path through - VG_(cp_disp_xindir). */ -/*global*/ UInt VG_(stats__n_xindirs_32) = 0; -/*global*/ UInt VG_(stats__n_xindir_misses_32) = 0; + VG_(disp_cp_xindir). 
*/ +/*global*/ UInt VG_(stats__n_xIndirs_32) = 0; +/*global*/ UInt VG_(stats__n_xIndir_hits1_32) = 0; +/*global*/ UInt VG_(stats__n_xIndir_hits2_32) = 0; +/*global*/ UInt VG_(stats__n_xIndir_hits3_32) = 0; +/*global*/ UInt VG_(stats__n_xIndir_misses_32) = 0; /* Sanity checking counts. */ static UInt sanity_fast_count = 0; @@ -149,11 +156,25 @@ void VG_(print_scheduler_stats)(void) { VG_(message)(Vg_DebugMsg, "scheduler: %'llu event checks.\n", bbs_done ); + + const ULong hits0 + = stats__n_xIndirs - stats__n_xIndir_hits1 - stats__n_xIndir_hits2 + - stats__n_xIndir_hits3 - stats__n_xIndir_misses; + VG_(message)(Vg_DebugMsg, + "scheduler: %'llu indir transfers, " + "%'llu misses (1 in %llu) ..\n", + stats__n_xIndirs, stats__n_xIndir_misses, + stats__n_xIndirs / (stats__n_xIndir_misses + ? stats__n_xIndir_misses : 1)); VG_(message)(Vg_DebugMsg, - "scheduler: %'llu indir transfers, %'llu misses (1 in %llu)\n", - stats__n_xindirs, stats__n_xindir_misses, - stats__n_xindirs / (stats__n_xindir_misses - ? stats__n_xindir_misses : 1)); + "scheduler: .. of which: %'llu hit0, %'llu hit1, " + "%'llu hit2, %'llu hit3, %'llu missed\n", + hits0, + stats__n_xIndir_hits1, + stats__n_xIndir_hits2, + stats__n_xIndir_hits3, + stats__n_xIndir_misses); + VG_(message)(Vg_DebugMsg, "scheduler: %'llu/%'llu major/minor sched events.\n", n_scheduling_events_MAJOR, n_scheduling_events_MINOR); @@ -928,8 +949,11 @@ void run_thread_for_a_while ( /*OUT*/HWord* two_words, /* end Paranoia */ /* Futz with the XIndir stats counters. */ - vg_assert(VG_(stats__n_xindirs_32) == 0); - vg_assert(VG_(stats__n_xindir_misses_32) == 0); + vg_assert(VG_(stats__n_xIndirs_32) == 0); + vg_assert(VG_(stats__n_xIndir_hits1_32) == 0); + vg_assert(VG_(stats__n_xIndir_hits2_32) == 0); + vg_assert(VG_(stats__n_xIndir_hits3_32) == 0); + vg_assert(VG_(stats__n_xIndir_misses_32) == 0); /* Clear return area. */ two_words[0] = two_words[1] = 0; @@ -940,10 +964,13 @@ void run_thread_for_a_while ( /*OUT*/HWord* two_words, host_code_addr = alt_host_addr; } else { /* normal case -- redir translation */ - UInt cno = (UInt)VG_TT_FAST_HASH((Addr)tst->arch.vex.VG_INSTR_PTR); - if (LIKELY(VG_(tt_fast)[cno].guest == (Addr)tst->arch.vex.VG_INSTR_PTR)) - host_code_addr = VG_(tt_fast)[cno].host; - else { + Addr host_from_fast_cache = 0; + Bool found_in_fast_cache + = VG_(lookupInFastCache)( &host_from_fast_cache, + (Addr)tst->arch.vex.VG_INSTR_PTR ); + if (found_in_fast_cache) { + host_code_addr = host_from_fast_cache; + } else { Addr res = 0; /* not found in VG_(tt_fast). Searching here the transtab improves the performance compared to returning directly @@ -1027,10 +1054,16 @@ void run_thread_for_a_while ( /*OUT*/HWord* two_words, /* Merge the 32-bit XIndir/miss counters into the 64 bit versions, and zero out the 32-bit ones in preparation for the next run of generated code. 
*/ - stats__n_xindirs += (ULong)VG_(stats__n_xindirs_32); - VG_(stats__n_xindirs_32) = 0; - stats__n_xindir_misses += (ULong)VG_(stats__n_xindir_misses_32); - VG_(stats__n_xindir_misses_32) = 0; + stats__n_xIndirs += (ULong)VG_(stats__n_xIndirs_32); + VG_(stats__n_xIndirs_32) = 0; + stats__n_xIndir_hits1 += (ULong)VG_(stats__n_xIndir_hits1_32); + VG_(stats__n_xIndir_hits1_32) = 0; + stats__n_xIndir_hits2 += (ULong)VG_(stats__n_xIndir_hits2_32); + VG_(stats__n_xIndir_hits2_32) = 0; + stats__n_xIndir_hits3 += (ULong)VG_(stats__n_xIndir_hits3_32); + VG_(stats__n_xIndir_hits3_32) = 0; + stats__n_xIndir_misses += (ULong)VG_(stats__n_xIndir_misses_32); + VG_(stats__n_xIndir_misses_32) = 0; /* Inspect the event counter. */ vg_assert((Int)tst->arch.vex.host_EvC_COUNTER >= -1); diff --git a/coregrind/m_transtab.c b/coregrind/m_transtab.c index ef2e3df863..23ecb11f04 100644 --- a/coregrind/m_transtab.c +++ b/coregrind/m_transtab.c @@ -457,9 +457,10 @@ static Int tc_sector_szQ = 0; static SECno sector_search_order[MAX_N_SECTORS]; -/* Fast helper for the TC. A direct-mapped cache which holds a set of - recently used (guest address, host address) pairs. This array is - referred to directly from m_dispatch/dispatch-.S. +/* Fast helper for the TC. A 4-way set-associative cache, with more-or-less LRU + replacement. It holds a set of recently used (guest address, host address) + pairs. This array is referred to directly from + m_dispatch/dispatch-.S. Entries in tt_fast may refer to any valid TC entry, regardless of which sector it's in. Consequently we must be very careful to @@ -474,13 +475,19 @@ static SECno sector_search_order[MAX_N_SECTORS]; /* typedef struct { - Addr guest; - Addr host; - } - FastCacheEntry; + Addr guest0; + Addr host0; + Addr guest1; + Addr host1; + Addr guest2; + Addr host2; + Addr guest3; + Addr host3; + } + FastCacheSet; */ -/*global*/ __attribute__((aligned(16))) - FastCacheEntry VG_(tt_fast)[VG_TT_FAST_SIZE]; +/*global*/ __attribute__((aligned(64))) + FastCacheSet VG_(tt_fast)[VG_TT_FAST_SETS]; /* Make sure we're not used before initialisation. */ static Bool init_done = False; @@ -1455,36 +1462,40 @@ static inline HTTno HASH_TT ( Addr key ) return (HTTno)(k32 % N_HTTES_PER_SECTOR); } -static void setFastCacheEntry ( Addr key, ULong* tcptr ) -{ - UInt cno = (UInt)VG_TT_FAST_HASH(key); - VG_(tt_fast)[cno].guest = key; - VG_(tt_fast)[cno].host = (Addr)tcptr; - n_fast_updates++; - /* This shouldn't fail. It should be assured by m_translate - which should reject any attempt to make translation of code - starting at TRANSTAB_BOGUS_GUEST_ADDR. */ - vg_assert(VG_(tt_fast)[cno].guest != TRANSTAB_BOGUS_GUEST_ADDR); -} - /* Invalidate the fast cache VG_(tt_fast). */ static void invalidateFastCache ( void ) { - UInt j; - /* This loop is popular enough to make it worth unrolling a - bit, at least on ppc32. 
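A note on the aligned(64) attribute above: on a 64-bit host a FastCacheSet is eight Addr-sized fields, i.e. exactly 64 bytes, so the alignment presumably ensures each set sits within a single cache line on typical hardware and a whole 4-way probe touches only one line. A standalone size check, using a local stand-in type rather than the real FastCacheSet:

   #include <stdint.h>
   /* SketchSet64 is a stand-in for FastCacheSet on an LP64 host. */
   typedef struct { uint64_t g0,h0,g1,h1,g2,h2,g3,h3; } SketchSet64;
   _Static_assert(sizeof(SketchSet64) == 64,
                  "one set == 64 bytes == a typical cache line size");
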
*/ - vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0); - for (j = 0; j < VG_TT_FAST_SIZE; j += 4) { - VG_(tt_fast)[j+0].guest = TRANSTAB_BOGUS_GUEST_ADDR; - VG_(tt_fast)[j+1].guest = TRANSTAB_BOGUS_GUEST_ADDR; - VG_(tt_fast)[j+2].guest = TRANSTAB_BOGUS_GUEST_ADDR; - VG_(tt_fast)[j+3].guest = TRANSTAB_BOGUS_GUEST_ADDR; + for (UWord j = 0; j < VG_TT_FAST_SETS; j++) { + FastCacheSet* set = &VG_(tt_fast)[j]; + set->guest0 = TRANSTAB_BOGUS_GUEST_ADDR; + set->guest1 = TRANSTAB_BOGUS_GUEST_ADDR; + set->guest2 = TRANSTAB_BOGUS_GUEST_ADDR; + set->guest3 = TRANSTAB_BOGUS_GUEST_ADDR; } - - vg_assert(j == VG_TT_FAST_SIZE); n_fast_flushes++; } +static void setFastCacheEntry ( Addr guest, ULong* tcptr ) +{ + /* This shouldn't fail. It should be assured by m_translate + which should reject any attempt to make translation of code + starting at TRANSTAB_BOGUS_GUEST_ADDR. */ + vg_assert(guest != TRANSTAB_BOGUS_GUEST_ADDR); + /* Shift all entries along one, so that the LRU one disappears, and put the + new entry at the MRU position. */ + UWord setNo = (UInt)VG_TT_FAST_HASH(guest); + FastCacheSet* set = &VG_(tt_fast)[setNo]; + set->host3 = set->host2; + set->guest3 = set->guest2; + set->host2 = set->host1; + set->guest2 = set->guest1; + set->host1 = set->host0; + set->guest1 = set->guest0; + set->host0 = (Addr)tcptr; + set->guest0 = guest; + n_fast_updates++; +} + static TTEno get_empty_tt_slot(SECno sNo) { @@ -2432,15 +2443,36 @@ void VG_(init_tt_tc) ( void ) vg_assert(N_HTTES_PER_SECTOR < INV_TTE); vg_assert(N_HTTES_PER_SECTOR < EC2TTE_DELETED); vg_assert(N_HTTES_PER_SECTOR < HTT_EMPTY); - /* check fast cache entries really are 2 words long */ + + /* check fast cache entries really are 8 words long */ vg_assert(sizeof(Addr) == sizeof(void*)); - vg_assert(sizeof(FastCacheEntry) == 2 * sizeof(Addr)); + vg_assert(sizeof(FastCacheSet) == 8 * sizeof(Addr)); /* check fast cache entries are packed back-to-back with no spaces */ vg_assert(sizeof( VG_(tt_fast) ) - == VG_TT_FAST_SIZE * sizeof(FastCacheEntry)); + == VG_TT_FAST_SETS * sizeof(FastCacheSet)); + /* check fast cache entries have the layout that the handwritten assembly + fragments assume. */ + vg_assert(sizeof(FastCacheSet) == (1 << VG_FAST_CACHE_SET_BITS)); + vg_assert(offsetof(FastCacheSet,guest0) == FCS_g0); + vg_assert(offsetof(FastCacheSet,host0) == FCS_h0); + vg_assert(offsetof(FastCacheSet,guest1) == FCS_g1); + vg_assert(offsetof(FastCacheSet,host1) == FCS_h1); + vg_assert(offsetof(FastCacheSet,guest2) == FCS_g2); + vg_assert(offsetof(FastCacheSet,host2) == FCS_h2); + vg_assert(offsetof(FastCacheSet,guest3) == FCS_g3); + vg_assert(offsetof(FastCacheSet,host3) == FCS_h3); + vg_assert(offsetof(FastCacheSet,guest0) == 0 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,host0) == 1 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,guest1) == 2 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,host1) == 3 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,guest2) == 4 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,host2) == 5 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,guest3) == 6 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,host3) == 7 * sizeof(Addr)); + /* check fast cache is aligned as we requested. Not fatal if it isn't, but we might as well make sure. */ - vg_assert(VG_IS_16_ALIGNED( ((Addr) & VG_(tt_fast)[0]) )); + vg_assert(VG_IS_64_ALIGNED( ((Addr) & VG_(tt_fast)[0]) )); /* The TTEntryH size is critical for keeping the LLC miss rate down when doing a lot of discarding. Hence check it here. 
We also diff --git a/coregrind/pub_core_transtab.h b/coregrind/pub_core_transtab.h index 951cbd9496..a77ca3c19e 100644 --- a/coregrind/pub_core_transtab.h +++ b/coregrind/pub_core_transtab.h @@ -41,20 +41,107 @@ #include "pub_tool_transtab.h" #include "libvex.h" // VexGuestExtents -/* The fast-cache for tt-lookup. Unused entries are denoted by .guest - == 1, which is assumed to be a bogus address for all guest code. */ +/* The fast-cache for tt-lookup. Unused entries are denoted by + .guest == TRANSTAB_BOGUS_GUEST_ADDR (viz, 1), which is assumed + to be a bogus address for all guest code. See pub_core_transtab_asm.h + for further description. */ typedef struct { - Addr guest; - Addr host; + Addr guest0; + Addr host0; + Addr guest1; + Addr host1; + Addr guest2; + Addr host2; + Addr guest3; + Addr host3; } - FastCacheEntry; + FastCacheSet; -extern __attribute__((aligned(16))) - FastCacheEntry VG_(tt_fast) [VG_TT_FAST_SIZE]; +STATIC_ASSERT(sizeof(Addr) == sizeof(UWord)); +STATIC_ASSERT(sizeof(FastCacheSet) == sizeof(Addr) * 8); + +extern __attribute__((aligned(64))) + FastCacheSet VG_(tt_fast) [VG_TT_FAST_SETS]; #define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1) +#if defined(VGA_x86) || defined(VGA_amd64) +static inline UWord VG_TT_FAST_HASH ( Addr guest ) { + // There's no minimum insn alignment on these targets. + UWord merged = ((UWord)guest) >> 0; + merged = (merged >> VG_TT_FAST_BITS) ^ merged; + return merged & VG_TT_FAST_MASK; +} + +#elif defined(VGA_s390x) || defined(VGA_arm) +static inline UWord VG_TT_FAST_HASH ( Addr guest ) { + // Instructions are 2-byte aligned. + UWord merged = ((UWord)guest) >> 1; + merged = (merged >> VG_TT_FAST_BITS) ^ merged; + return merged & VG_TT_FAST_MASK; +} + +#elif defined(VGA_ppc32) || defined(VGA_ppc64be) || defined(VGA_ppc64le) \ + || defined(VGA_mips32) || defined(VGA_mips64) || defined(VGA_arm64) +static inline UWord VG_TT_FAST_HASH ( Addr guest ) { + // Instructions are 4-byte aligned. + UWord merged = ((UWord)guest) >> 2; + merged = (merged >> VG_TT_FAST_BITS) ^ merged; + return merged & VG_TT_FAST_MASK; +} + +#else +# error "VG_TT_FAST_HASH: unknown platform" +#endif + +static inline Bool VG_(lookupInFastCache)( /*MB_OUT*/Addr* host, Addr guest ) +{ + UWord setNo = (UInt)VG_TT_FAST_HASH(guest); + FastCacheSet* set = &VG_(tt_fast)[setNo]; + if (LIKELY(set->guest0 == guest)) { + // hit at way 0 + *host = set->host0; + return True; + } + if (LIKELY(set->guest1 == guest)) { + // hit at way 1; swap upwards + Addr tG = guest; + Addr tH = set->host1; + set->guest1 = set->guest0; + set->host1 = set->host0; + set->guest0 = tG; + set->host0 = tH; + *host = tH; + return True; + } + if (LIKELY(set->guest2 == guest)) { + // hit at way 2; swap upwards + Addr tG = guest; + Addr tH = set->host2; + set->guest2 = set->guest1; + set->host2 = set->host1; + set->guest1 = tG; + set->host1 = tH; + *host = tH; + return True; + } + if (LIKELY(set->guest3 == guest)) { + // hit at way 3; swap upwards + Addr tG = guest; + Addr tH = set->host3; + set->guest3 = set->guest2; + set->host3 = set->host2; + set->guest2 = tG; + set->host2 = tH; + *host = tH; + return True; + } + // Not found + *host = 0; + return False; +} + /* Initialises the TC, using VG_(clo_num_transtab_sectors) and VG_(clo_avg_transtab_entry_size). 
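To make the intended set behaviour concrete, here is a small self-contained sketch of the same policy: insertion puts the new pair at way 0 and shifts everything else towards way 3 (as in setFastCacheEntry), and a hit at way N > 0 swaps that entry with way N-1 (as in VG_(lookupInFastCache) and the dispatcher fragments). All types, names and addresses below are made up for illustration; this is not code from the patch.

   #include <assert.h>

   #define BOGUS 1UL   /* plays the role of TRANSTAB_BOGUS_GUEST_ADDR */

   typedef unsigned long W;
   typedef struct { W g[4]; W h[4]; } MiniSet;   /* ways 0 (MRU) .. 3 (LRU) */

   /* Same shape as setFastCacheEntry: shift down, new entry at way 0. */
   static void mini_set ( MiniSet* s, W guest, W host )
   {
      for (int i = 3; i > 0; i--) { s->g[i] = s->g[i-1]; s->h[i] = s->h[i-1]; }
      s->g[0] = guest;  s->h[0] = host;
   }

   /* Same shape as VG_(lookupInFastCache): a hit at way N > 0 swaps N-1 and N. */
   static int mini_lookup ( MiniSet* s, W guest, W* host )
   {
      for (int i = 0; i < 4; i++) {
         if (s->g[i] != guest) continue;
         W hitH = s->h[i];
         if (i > 0) {
            W pg = s->g[i-1], ph = s->h[i-1];
            s->g[i-1] = guest;  s->h[i-1] = hitH;
            s->g[i]   = pg;     s->h[i]   = ph;
         }
         *host = hitH;
         return 1;
      }
      *host = 0;
      return 0;
   }

   int main ( void )
   {
      MiniSet s = { {BOGUS,BOGUS,BOGUS,BOGUS}, {0,0,0,0} };
      W h;
      /* Five insertions into a 4-way set: the oldest entry falls out. */
      mini_set(&s, 0x100, 0xA);  mini_set(&s, 0x200, 0xB);
      mini_set(&s, 0x300, 0xC);  mini_set(&s, 0x400, 0xD);
      mini_set(&s, 0x500, 0xE);
      assert(!mini_lookup(&s, 0x100, &h));             /* evicted from way 3 */
      /* 0x400 now sits at way 1; a hit swaps it up to way 0. */
      assert(mini_lookup(&s, 0x400, &h) && h == 0xD);
      assert(s.g[0] == 0x400 && s.g[1] == 0x500);
      /* 0x200 is at way 3; a hit moves it only one step, to way 2. */
      assert(mini_lookup(&s, 0x200, &h) && h == 0xB);
      assert(s.g[2] == 0x200 && s.g[3] == 0x300);
      return 0;
   }
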
diff --git a/coregrind/pub_core_transtab_asm.h b/coregrind/pub_core_transtab_asm.h index e1e2687c2b..9e85774c74 100644 --- a/coregrind/pub_core_transtab_asm.h +++ b/coregrind/pub_core_transtab_asm.h @@ -31,43 +31,88 @@ #ifndef __PUB_CORE_TRANSTAB_ASM_H #define __PUB_CORE_TRANSTAB_ASM_H -/* Constants for the fast translation lookup cache. It is a direct - mapped cache, with 2^VG_TT_FAST_BITS entries. +/* Constants for the fast translation lookup cache. It is a 4 way associative + cache, with more-or-less LRU replacement. It contains 2^VG_TT_FAST_BITS + sets. + + On all targets, the set number is computed from least significant 2 * + VG_TT_FAST_BITS of the guest address. This is a bit unusual in as much as + it is more normal just to use a VG_TT_FAST_BITS-sized slice of the address + as the set number. Using twice as many bits (the two chunks are xor'd) + spreads entries out (reduces aliasing) and significantly reduces the overall + miss rate. The cost is two extra cycles on the fast lookup path, to perform + an extra shift and an xor. + + For each set there are 4 ways: way0, way1, way2 and way3. way0 is intended + to be the MRU and way3 the LRU. Most lookups hit way0 and involve no + modification of the line. A hit at way1 causes way0 and way1 to be swapped. + A hit at way2 causes way1 and way2 to be swapped; that is, way2 is moved one + step closer to the front. But not all the way to the front. Similarly a + hit at way3 causes way2 and way3 to be swapped. + + See VG_(lookupInFastCache) for a C implementation of this logic and + dispatch-*-*.S, label VG_(disp_cp_xindir), for the handcoded assembly + equivalents for each target. Note that VG_(lookupInFastCache) is used in C + land for some administrative lookups but isn't really performance critical. + The dispatch-*-*.S implementations are used to process all indirect branches + in the simulator and so *are* performance critical. + + Updates to the cache are rare. These are performed by setFastCacheEntry. + New entries are put into way0 and all others are shifted down one slot, so + that the contents of way3 falls out of the cache. On x86/amd64, the cache index is computed as - 'address[VG_TT_FAST_BITS-1 : 0]'. - - On ppc32/ppc64/mips32/mips64/arm64, the bottom two bits of - instruction addresses are zero, which means that function causes - only 1/4 of the entries to ever be used. So instead the function - is '(address >>u 2)[VG_TT_FAST_BITS-1 : 0]' on those targets. - - On ARM we shift by 1, since Thumb insns can be of size 2, hence to - minimise collisions and maximise cache utilisation we need to take - into account all but the least significant bit. - - On s390x the rightmost bit of an instruction address is zero. - For best table utilization shift the address to the right by 1 bit. */ - -#define VG_TT_FAST_BITS 15 -#define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS) -#define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1) - -/* This macro isn't usable in asm land; nevertheless this seems - like a good place to put it. 
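To see why folding in the upper address bits helps, take two made-up amd64 guest addresses whose low 13 bits are identical and which differ only at bit 13: a plain bottom-bits index maps them to the same set, while the xor'd form described above separates them. The self-contained check below is not part of the patch; it hard-codes 13 for VG_TT_FAST_BITS (as defined below) and uses the x86/amd64 hash variant, which has no alignment shift.

   #include <assert.h>

   #define BITS 13                      /* value of VG_TT_FAST_BITS below */
   #define MASK ((1UL << BITS) - 1)     /* value of VG_TT_FAST_MASK below */

   /* Both helpers are illustrative names, not Valgrind functions. */
   static unsigned long plain_index ( unsigned long a ) { return a & MASK; }
   static unsigned long mixed_index ( unsigned long a ) { return ((a >> BITS) ^ a) & MASK; }

   int main ( void )
   {
      assert(plain_index(0x401000UL) == plain_index(0x403000UL));  /* alias */
      assert(mixed_index(0x401000UL) != mixed_index(0x403000UL));  /* split */
      return 0;
   }
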
*/ - -#if defined(VGA_x86) || defined(VGA_amd64) -# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) ) & VG_TT_FAST_MASK) - -#elif defined(VGA_s390x) || defined(VGA_arm) -# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) >> 1) & VG_TT_FAST_MASK) - -#elif defined(VGA_ppc32) || defined(VGA_ppc64be) || defined(VGA_ppc64le) \ - || defined(VGA_mips32) || defined(VGA_mips64) || defined(VGA_arm64) -# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) >> 2) & VG_TT_FAST_MASK) + (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1 : 0]'. + + On ppc32/ppc64/mips32/mips64/arm64, the bottom two bits of instruction + addresses are zero, which means the above function causes only 1/4 of the + sets to ever be used. So instead the function is + (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1+2 : 0+2]'. + + On arm32, the minimum instruction size is 2, so we discard only the least + significant bit of the address, hence: + (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1+1 : 0+1]'. + + On s390x the rightmost bit of an instruction address is zero, so the arm32 + scheme is used. */ + +#define VG_TT_FAST_BITS 13 +#define VG_TT_FAST_SETS (1 << VG_TT_FAST_BITS) +#define VG_TT_FAST_MASK ((VG_TT_FAST_SETS) - 1) + +// Log2(sizeof(FastCacheSet)). This is needed in the handwritten assembly. + +#if defined(VGA_amd64) || defined(VGA_arm64) \ + || defined(VGA_ppc64be) || defined(VGA_ppc64le) || defined(VGA_mips64) \ + || defined(VGA_s390x) + // And all other 64-bit hosts +# define VG_FAST_CACHE_SET_BITS 6 + // These FCS_{g,h}{0,1,2,3} are the values of + // offsetof(FastCacheSet,{guest,host}{0,1,2,3}). +# define FCS_g0 0 +# define FCS_h0 8 +# define FCS_g1 16 +# define FCS_h1 24 +# define FCS_g2 32 +# define FCS_h2 40 +# define FCS_g3 48 +# define FCS_h3 56 + +#elif defined(VGA_x86) || defined(VGA_arm) || defined(VGA_ppc32) \ + || defined(VGA_mips32) + // And all other 32-bit hosts +# define VG_FAST_CACHE_SET_BITS 5 +# define FCS_g0 0 +# define FCS_h0 4 +# define FCS_g1 8 +# define FCS_h1 12 +# define FCS_g2 16 +# define FCS_h2 20 +# define FCS_g3 24 +# define FCS_h3 28 #else -# error "VG_TT_FAST_HASH: unknown platform" +# error "VG_FAST_CACHE_SET_BITS not known" #endif #endif // __PUB_CORE_TRANSTAB_ASM_H diff --git a/include/pub_tool_libcbase.h b/include/pub_tool_libcbase.h index f68579a8fe..476272cdbc 100644 --- a/include/pub_tool_libcbase.h +++ b/include/pub_tool_libcbase.h @@ -193,6 +193,7 @@ static void VG_(bzero_inline) ( void* s, SizeT sz ) #define VG_IS_8_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x7))) #define VG_IS_16_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0xf))) #define VG_IS_32_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x1f))) +#define VG_IS_64_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x3f))) #define VG_IS_WORD_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)(sizeof(Addr)-1)))) #define VG_IS_PAGE_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)(VKI_PAGE_SIZE-1))))
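
The per-word-size blocks above encode nothing more than the layout of a FastCacheSet: FCS_gN and FCS_hN are the byte offsets of the 2N-th and (2N+1)-th Addr-sized fields, and VG_FAST_CACHE_SET_BITS is log2 of the whole set (8 fields of 8 bytes = 64 = 2^6 on 64-bit hosts; 8 fields of 4 bytes = 32 = 2^5 on 32-bit hosts), which the offsetof assertions added to VG_(init_tt_tc) pin down at startup. A standalone cross-check with local stand-in types, not code from the patch:

   #include <stddef.h>
   #include <stdint.h>
   /* SketchSet64 / SketchSet32 are stand-ins for FastCacheSet on 64- and
      32-bit hosts respectively. */
   typedef struct { uint64_t g0,h0,g1,h1,g2,h2,g3,h3; } SketchSet64;
   typedef struct { uint32_t g0,h0,g1,h1,g2,h2,g3,h3; } SketchSet32;
   _Static_assert(sizeof(SketchSet64) == (1u << 6) && offsetof(SketchSet64, h3) == 56,
                  "matches VG_FAST_CACHE_SET_BITS == 6 and FCS_h3 == 56");
   _Static_assert(sizeof(SketchSet32) == (1u << 5) && offsetof(SketchSet32, h3) == 28,
                  "matches VG_FAST_CACHE_SET_BITS == 5 and FCS_h3 == 28");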