jmp postamble
/* ------ Indirect but boring jump ------ */
-.globl VG_(disp_cp_xindir)
+.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
- /* Where are we going? */
- movq OFFSET_amd64_RIP(%rbp), %rax
+ /* Where are we going? */
+ movq OFFSET_amd64_RIP(%rbp), %rax // "guest"
/* stats only */
- movabsq $VG_(stats__n_xindirs_32), %r10
- addl $1, (%r10)
-
- /* try a fast lookup in the translation cache */
- movabsq $VG_(tt_fast), %rcx
- movq %rax, %rbx /* next guest addr */
- andq $VG_TT_FAST_MASK, %rbx /* entry# */
- shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
- movq 0(%rcx,%rbx,1), %r10 /* .guest */
- movq 8(%rcx,%rbx,1), %r11 /* .host */
- cmpq %rax, %r10
- jnz fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- jmp *%r11
- ud2 /* persuade insn decoders not to speculate past here */
-
-fast_lookup_failed:
+ movabsq $VG_(stats__n_xIndirs_32), %r8
+ addl $1, (%r8)
+
+ // LIVE: %rbp (guest state ptr), %rax (guest address to go to).
+ // We use 5 temporaries:
+ // %r9 (to point at the relevant FastCacheSet),
+   //   %r10, %r11 and %r12 (scratch), and
+   //   %r8 (scratch address)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute %r9 = VG_TT_FAST_HASH(guest)
+ movq %rax, %r9 // guest
+ shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS)
+ xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest
+ andq $VG_TT_FAST_MASK, %r9 // setNo
+
+ // Compute %r9 = &VG_(tt_fast)[%r9]
+ shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet)
+ movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0]
+ leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo]
+
+ // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set)
+ // try way 0
+ cmpq %rax, FCS_g0(%r9) // cmp against .guest0
+ jnz 1f
+ // hit at way 0
+ jmp *FCS_h0(%r9) // goto .host0
+ ud2
+
+1: // try way 1
+ cmpq %rax, FCS_g1(%r9) // cmp against .guest1
+ jnz 2f
+ // hit at way 1; swap upwards
+ /* stats only */
+ movabsq $VG_(stats__n_xIndir_hits1_32), %r8
+ addl $1, (%r8)
+ movq FCS_g0(%r9), %r10 // r10 = old .guest0
+ movq FCS_h0(%r9), %r11 // r11 = old .host0
+ movq FCS_h1(%r9), %r12 // r12 = old .host1
+ movq %rax, FCS_g0(%r9) // new .guest0 = guest
+ movq %r12, FCS_h0(%r9) // new .host0 = old .host1
+ movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0
+ movq %r11, FCS_h1(%r9) // new .host1 = old .host0
+ jmp *%r12 // goto old .host1 a.k.a. new .host0
+ ud2
+
+2: // try way 2
+ cmpq %rax, FCS_g2(%r9) // cmp against .guest2
+ jnz 3f
+ // hit at way 2; swap upwards
+ /* stats only */
+ movabsq $VG_(stats__n_xIndir_hits2_32), %r8
+ addl $1, (%r8)
+ movq FCS_g1(%r9), %r10
+ movq FCS_h1(%r9), %r11
+ movq FCS_h2(%r9), %r12
+ movq %rax, FCS_g1(%r9)
+ movq %r12, FCS_h1(%r9)
+ movq %r10, FCS_g2(%r9)
+ movq %r11, FCS_h2(%r9)
+ jmp *%r12
+ ud2
+
+3: // try way 3
+ cmpq %rax, FCS_g3(%r9) // cmp against .guest3
+ jnz 4f
+ // hit at way 3; swap upwards
+ /* stats only */
+ movabsq $VG_(stats__n_xIndir_hits3_32), %r8
+ addl $1, (%r8)
+ movq FCS_g2(%r9), %r10
+ movq FCS_h2(%r9), %r11
+ movq FCS_h3(%r9), %r12
+ movq %rax, FCS_g2(%r9)
+ movq %r12, FCS_h2(%r9)
+ movq %r10, FCS_g3(%r9)
+ movq %r11, FCS_h3(%r9)
+ jmp *%r12
+ ud2
+
+4: // fast lookup failed
/* stats only */
- movabsq $VG_(stats__n_xindir_misses_32), %r10
- addl $1, (%r10)
+ movabsq $VG_(stats__n_xIndir_misses_32), %r8
+ addl $1, (%r8)
movq $VG_TRC_INNER_FASTMISS, %rax
movq $0, %rdx
.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
/* Where are we going? */
- movq OFFSET_amd64_RIP(%rbp), %rax
+ movq OFFSET_amd64_RIP(%rbp), %rax // "guest"
/* stats only */
- addl $1, VG_(stats__n_xindirs_32)
-
- /* try a fast lookup in the translation cache */
- movabsq $VG_(tt_fast), %rcx
- movq %rax, %rbx /* next guest addr */
- andq $VG_TT_FAST_MASK, %rbx /* entry# */
- shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */
- movq 0(%rcx,%rbx,1), %r10 /* .guest */
- movq 8(%rcx,%rbx,1), %r11 /* .host */
- cmpq %rax, %r10
- jnz fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- jmp *%r11
- ud2 /* persuade insn decoders not to speculate past here */
-
-fast_lookup_failed:
+ addl $1, VG_(stats__n_xIndirs_32)
+
+ // LIVE: %rbp (guest state ptr), %rax (guest address to go to).
+ // We use 4 temporaries:
+ // %r9 (to point at the relevant FastCacheSet),
+ // %r10, %r11 and %r12 (scratch).
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute %r9 = VG_TT_FAST_HASH(guest)
+ movq %rax, %r9 // guest
+ shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS)
+ xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest
+ andq $VG_TT_FAST_MASK, %r9 // setNo
+
+ // Compute %r9 = &VG_(tt_fast)[%r9]
+ shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet)
+ movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0]
+ leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo]
+
+ // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set)
+ // try way 0
+ cmpq %rax, FCS_g0(%r9) // cmp against .guest0
+ jnz 1f
+ // hit at way 0
+ jmp *FCS_h0(%r9) // goto .host0
+ ud2
+
+1: // try way 1
+ cmpq %rax, FCS_g1(%r9) // cmp against .guest1
+ jnz 2f
+ // hit at way 1; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits1_32)
+ movq FCS_g0(%r9), %r10 // r10 = old .guest0
+ movq FCS_h0(%r9), %r11 // r11 = old .host0
+ movq FCS_h1(%r9), %r12 // r12 = old .host1
+ movq %rax, FCS_g0(%r9) // new .guest0 = guest
+ movq %r12, FCS_h0(%r9) // new .host0 = old .host1
+ movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0
+ movq %r11, FCS_h1(%r9) // new .host1 = old .host0
+ jmp *%r12 // goto old .host1 a.k.a. new .host0
+ ud2
+
+2: // try way 2
+ cmpq %rax, FCS_g2(%r9) // cmp against .guest2
+ jnz 3f
+ // hit at way 2; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits2_32)
+ movq FCS_g1(%r9), %r10
+ movq FCS_h1(%r9), %r11
+ movq FCS_h2(%r9), %r12
+ movq %rax, FCS_g1(%r9)
+ movq %r12, FCS_h1(%r9)
+ movq %r10, FCS_g2(%r9)
+ movq %r11, FCS_h2(%r9)
+ jmp *%r12
+ ud2
+
+3: // try way 3
+ cmpq %rax, FCS_g3(%r9) // cmp against .guest3
+ jnz 4f
+ // hit at way 3; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits3_32)
+ movq FCS_g2(%r9), %r10
+ movq FCS_h2(%r9), %r11
+ movq FCS_h3(%r9), %r12
+ movq %rax, FCS_g2(%r9)
+ movq %r12, FCS_h2(%r9)
+ movq %r10, FCS_g3(%r9)
+ movq %r11, FCS_h3(%r9)
+ jmp *%r12
+ ud2
+
+4: // fast lookup failed
/* stats only */
- addl $1, VG_(stats__n_xindir_misses_32)
+ addl $1, VG_(stats__n_xIndir_misses_32)
movq $VG_TRC_INNER_FASTMISS, %rax
movq $0, %rdx
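
For orientation while reading the per-architecture fragments above and below: each one hand-codes the same lookup, which in rough C terms is the sketch shown here (for readability only -- the authoritative version is the VG_(lookupInFastCache) inline added to pub_core_transtab.h later in this patch; ways 2 and 3 are elided because they repeat the way-1 pattern one slot further down):

   static inline Addr fast_lookup_sketch ( Addr guest )
   {
      // Hash the guest address down to a set number, then probe the four
      // ways of that set.  A hit at way 1 (or 2, or 3) promotes the entry
      // one way towards way 0, giving approximately-LRU replacement.
      UWord setNo = VG_TT_FAST_HASH(guest);
      FastCacheSet* set = &VG_(tt_fast)[setNo];
      if (set->guest0 == guest)
         return set->host0;                  // hit at way 0
      if (set->guest1 == guest) {            // hit at way 1; swap upwards
         Addr h = set->host1;
         set->guest1 = set->guest0;  set->host1 = set->host0;
         set->guest0 = guest;        set->host0 = h;
         return h;
      }
      /* ... ways 2 and 3 are handled analogously ... */
      return 0;                              // miss; caller takes the slow path
   }
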
ldr r0, [r8, #OFFSET_arm_R15T]
/* stats only */
- movw r1, #:lower16:vgPlain_stats__n_xindirs_32
- movt r1, #:upper16:vgPlain_stats__n_xindirs_32
- ldr r2, [r1, #0]
- add r2, r2, #1
- str r2, [r1, #0]
+ movw r4, #:lower16:VG_(stats__n_xIndirs_32)
+ movt r4, #:upper16:VG_(stats__n_xIndirs_32)
+ ldr r5, [r4, #0]
+ add r5, r5, #1
+ str r5, [r4, #0]
+
+ // LIVE: r8 (guest state ptr), r0 (guest address to go to).
+ // We use 6 temporaries:
+ // r6 (to point at the relevant FastCacheSet),
+ // r1, r2, r3 (scratch, for swapping entries within a set)
+ // r4, r5 (other scratch)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute r6 = VG_TT_FAST_HASH(guest)
+ lsr r6, r0, #1 // g1 = guest >> 1
+ eor r6, r6, r6, LSR #VG_TT_FAST_BITS // (g1 >> VG_TT_FAST_BITS) ^ g1
+ ubfx r6, r6, #0, #VG_TT_FAST_BITS // setNo
- /* try a fast lookup in the translation cache */
- // r0 = next guest, r1,r2,r3,r4 scratch
- movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK
+ // Compute r6 = &VG_(tt_fast)[r6]
movw r4, #:lower16:VG_(tt_fast)
-
- and r2, r1, r0, LSR #1 // r2 = entry #
- movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast)
-
- add r1, r4, r2, LSL #3 // r1 = &tt_fast[entry#]
-
- ldrd r4, r5, [r1, #0] // r4 = .guest, r5 = .host
-
- cmp r4, r0
-
- // jump to host if lookup succeeded
- bxeq r5
-
- /* otherwise the fast lookup failed */
- /* RM ME -- stats only */
- movw r1, #:lower16:vgPlain_stats__n_xindir_misses_32
- movt r1, #:upper16:vgPlain_stats__n_xindir_misses_32
- ldr r2, [r1, #0]
- add r2, r2, #1
- str r2, [r1, #0]
+ movt r4, #:upper16:VG_(tt_fast)
+ add r6, r4, r6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo]
+
+ // LIVE: r8 (guest state ptr), r0 (guest addr), r6 (cache set)
+ // try way 0
+ ldr r4, [r6, #FCS_g0] // .guest0
+ ldr r5, [r6, #FCS_h0] // .host0
+ cmp r4, r0 // cmp against .guest0
+ bne 1f
+ // hit at way 0
+ // goto .host0
+ bx r5
+ /*NOTREACHED*/
+
+1: // try way 1
+ ldr r4, [r6, #FCS_g1]
+ cmp r4, r0 // cmp against .guest1
+ bne 2f
+ // hit at way 1; swap upwards
+ ldr r1, [r6, #FCS_g0] // r1 = old .guest0
+ ldr r2, [r6, #FCS_h0] // r2 = old .host0
+ ldr r3, [r6, #FCS_h1] // r3 = old .host1
+ str r0, [r6, #FCS_g0] // new .guest0 = guest
+ str r3, [r6, #FCS_h0] // new .host0 = old .host1
+ str r1, [r6, #FCS_g1] // new .guest1 = old .guest0
+ str r2, [r6, #FCS_h1] // new .host1 = old .host0
+ // stats only
+ movw r4, #:lower16:VG_(stats__n_xIndir_hits1_32)
+ movt r4, #:upper16:VG_(stats__n_xIndir_hits1_32)
+ ldr r5, [r4, #0]
+ add r5, r5, #1
+ str r5, [r4, #0]
+ // goto old .host1 a.k.a. new .host0
+ bx r3
+ /*NOTREACHED*/
+
+2: // try way 2
+ ldr r4, [r6, #FCS_g2]
+ cmp r4, r0 // cmp against .guest2
+ bne 3f
+ // hit at way 2; swap upwards
+ ldr r1, [r6, #FCS_g1]
+ ldr r2, [r6, #FCS_h1]
+ ldr r3, [r6, #FCS_h2]
+ str r0, [r6, #FCS_g1]
+ str r3, [r6, #FCS_h1]
+ str r1, [r6, #FCS_g2]
+ str r2, [r6, #FCS_h2]
+ // stats only
+ movw r4, #:lower16:VG_(stats__n_xIndir_hits2_32)
+ movt r4, #:upper16:VG_(stats__n_xIndir_hits2_32)
+ ldr r5, [r4, #0]
+ add r5, r5, #1
+ str r5, [r4, #0]
+ // goto old .host2 a.k.a. new .host1
+ bx r3
+ /*NOTREACHED*/
+
+3: // try way 3
+ ldr r4, [r6, #FCS_g3]
+ cmp r4, r0 // cmp against .guest3
+ bne 4f
+ // hit at way 3; swap upwards
+ ldr r1, [r6, #FCS_g2]
+ ldr r2, [r6, #FCS_h2]
+ ldr r3, [r6, #FCS_h3]
+ str r0, [r6, #FCS_g2]
+ str r3, [r6, #FCS_h2]
+ str r1, [r6, #FCS_g3]
+ str r2, [r6, #FCS_h3]
+ // stats only
+ movw r4, #:lower16:VG_(stats__n_xIndir_hits3_32)
+ movt r4, #:upper16:VG_(stats__n_xIndir_hits3_32)
+ ldr r5, [r4, #0]
+ add r5, r5, #1
+ str r5, [r4, #0]
+ // goto old .host3 a.k.a. new .host2
+ bx r3
+ /*NOTREACHED*/
+
+4: // fast lookup failed
+ movw r4, #:lower16:VG_(stats__n_xIndir_misses_32)
+ movt r4, #:upper16:VG_(stats__n_xIndir_misses_32)
+ ldr r5, [r4, #0]
+ add r5, r5, #1
+ str r5, [r4, #0]
mov r1, #VG_TRC_INNER_FASTMISS
mov r2, #0
/* ------ Indirect but boring jump ------ */
.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
- /* Where are we going? */
+ // Where are we going?
ldr x0, [x21, #OFFSET_arm64_PC]
- /* stats only */
- adrp x1, VG_(stats__n_xindirs_32)
- add x1, x1, :lo12:VG_(stats__n_xindirs_32)
- ldr w2, [x1, #0]
- add w2, w2, #1
- str w2, [x1, #0]
-
- /* try a fast lookup in the translation cache */
- // x0 = next guest, x1,x2,x3,x4 scratch
- mov x1, #VG_TT_FAST_MASK // x1 = VG_TT_FAST_MASK
- and x2, x1, x0, LSR #2 // x2 = entry # = (x1 & (x0 >> 2))
-
+ // stats only
+ adrp x4, VG_(stats__n_xIndirs_32)
+ add x4, x4, :lo12:VG_(stats__n_xIndirs_32)
+ ldr w5, [x4, #0]
+ add w5, w5, #1
+ str w5, [x4, #0]
+
+ // LIVE: x21 (guest state ptr), x0 (guest address to go to).
+ // We use 6 temporaries:
+ // x6 (to point at the relevant FastCacheSet),
+ // x1, x2, x3 (scratch, for swapping entries within a set)
+ // x4, x5 (other scratch)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute x6 = VG_TT_FAST_HASH(guest)
+ lsr x6, x0, #2 // g2 = guest >> 2
+ eor x6, x6, x6, LSR #VG_TT_FAST_BITS // (g2 >> VG_TT_FAST_BITS) ^ g2
+ mov x4, #VG_TT_FAST_MASK // VG_TT_FAST_MASK
+ and x6, x6, x4 // setNo
+
+ // Compute x6 = &VG_(tt_fast)[x6]
adrp x4, VG_(tt_fast)
- add x4, x4, :lo12:VG_(tt_fast) // x4 = &VG_(tt_fast)
-
- add x1, x4, x2, LSL #4 // r1 = &tt_fast[entry#]
+ add x4, x4, :lo12:VG_(tt_fast) // &VG_(tt_fast)[0]
+ add x6, x4, x6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo]
+
+ // LIVE: x21 (guest state ptr), x0 (guest addr), x6 (cache set)
+ // try way 0
+ ldp x4, x5, [x6, #FCS_g0] // x4 = .guest0, x5 = .host0
+ cmp x4, x0 // cmp against .guest0
+ bne 1f
+ // hit at way 0
+ // goto .host0
+ br x5
+ /*NOTREACHED*/
- ldp x4, x5, [x1, #0] // x4 = .guest, x5 = .host
+1: // try way 1
+ ldr x4, [x6, #FCS_g1]
+ cmp x4, x0 // cmp against .guest1
+ bne 2f
+ // hit at way 1; swap upwards
+ ldr x1, [x6, #FCS_g0] // x1 = old .guest0
+ ldr x2, [x6, #FCS_h0] // x2 = old .host0
+ ldr x3, [x6, #FCS_h1] // x3 = old .host1
+ str x0, [x6, #FCS_g0] // new .guest0 = guest
+ str x3, [x6, #FCS_h0] // new .host0 = old .host1
+ str x1, [x6, #FCS_g1] // new .guest1 = old .guest0
+ str x2, [x6, #FCS_h1] // new .host1 = old .host0
+ // stats only
+ adrp x4, VG_(stats__n_xIndir_hits1_32)
+ add x4, x4, :lo12:VG_(stats__n_xIndir_hits1_32)
+ ldr w5, [x4, #0]
+ add w5, w5, #1
+ str w5, [x4, #0]
+ // goto old .host1 a.k.a. new .host0
+ br x3
+ /*NOTREACHED*/
- cmp x4, x0
+2: // try way 2
+ ldr x4, [x6, #FCS_g2]
+ cmp x4, x0 // cmp against .guest2
+ bne 3f
+ // hit at way 2; swap upwards
+ ldr x1, [x6, #FCS_g1]
+ ldr x2, [x6, #FCS_h1]
+ ldr x3, [x6, #FCS_h2]
+ str x0, [x6, #FCS_g1]
+ str x3, [x6, #FCS_h1]
+ str x1, [x6, #FCS_g2]
+ str x2, [x6, #FCS_h2]
+ // stats only
+ adrp x4, VG_(stats__n_xIndir_hits2_32)
+ add x4, x4, :lo12:VG_(stats__n_xIndir_hits2_32)
+ ldr w5, [x4, #0]
+ add w5, w5, #1
+ str w5, [x4, #0]
+ // goto old .host2 a.k.a. new .host1
+ br x3
+ /*NOTREACHED*/
- // jump to host if lookup succeeded
- bne fast_lookup_failed
- br x5
+3: // try way 3
+ ldr x4, [x6, #FCS_g3]
+ cmp x4, x0 // cmp against .guest3
+ bne 4f
+ // hit at way 3; swap upwards
+ ldr x1, [x6, #FCS_g2]
+ ldr x2, [x6, #FCS_h2]
+ ldr x3, [x6, #FCS_h3]
+ str x0, [x6, #FCS_g2]
+ str x3, [x6, #FCS_h2]
+ str x1, [x6, #FCS_g3]
+ str x2, [x6, #FCS_h3]
+ // stats only
+ adrp x4, VG_(stats__n_xIndir_hits3_32)
+ add x4, x4, :lo12:VG_(stats__n_xIndir_hits3_32)
+ ldr w5, [x4, #0]
+ add w5, w5, #1
+ str w5, [x4, #0]
+ // goto old .host3 a.k.a. new .host2
+ br x3
/*NOTREACHED*/
-fast_lookup_failed:
- /* RM ME -- stats only */
- adrp x1, VG_(stats__n_xindir_misses_32)
- add x1, x1, :lo12:VG_(stats__n_xindir_misses_32)
- ldr w2, [x1, #0]
- add w2, w2, #1
- str w2, [x1, #0]
+4: // fast lookup failed
+ adrp x4, VG_(stats__n_xIndir_misses_32)
+ add x4, x4, :lo12:VG_(stats__n_xIndir_misses_32)
+ ldr w5, [x4, #0]
+ add w5, w5, #1
+ str w5, [x4, #0]
mov x1, #VG_TRC_INNER_FASTMISS
mov x2, #0
.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
/* Where are we going? */
- lw $11, OFFSET_mips32_PC($23)
-
- lw $13, vgPlain_stats__n_xindirs_32
- addiu $13, $13, 0x1
- sw $13, vgPlain_stats__n_xindirs_32
-
- /* try a fast lookup in the translation cache */
- /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*)
- = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */
-
- move $14, $11
- li $12, VG_TT_FAST_MASK
- srl $14, $14, 2
- and $14, $14, $12
- sll $14, $14, 3
-
- /* t2 = (addr of VG_(tt_fast)) + t1 */
- la $13, VG_(tt_fast)
- addu $13, $13, $14
-
- lw $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */
- addiu $13, $13, 4
- lw $25, 0($13) /* little-endian, so comparing 1st 32bit word */
- nop
-
-check:
- bne $12, $11, fast_lookup_failed
- /* run the translation */
- jr $25
- .long 0x0 /* persuade insn decoders not to speculate past here */
-
-fast_lookup_failed:
- /* %PC is up to date */
- /* back out decrement of the dispatch counter */
- /* hold dispatch_ctr in t0 (r8) */
- lw $13, vgPlain_stats__n_xindirs_32
- addiu $13, $13, 0x1
- sw $13, vgPlain_stats__n_xindirs_32
- li $2, VG_TRC_INNER_FASTMISS
- li $3, 0
- b postamble
+ lw $10, OFFSET_mips32_PC($23)
+
+ /* stats only */
+ lw $15, VG_(stats__n_xIndirs_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndirs_32)
+
+ // LIVE: r23 (guest state ptr), r10 (guest address to go to).
+ // We use 6 temporaries:
+ // r16 (to point at the relevant FastCacheSet),
+ // r11, r12, r13 (scratch, for swapping entries within a set)
+ // r14, r15 (other scratch)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute r16 = VG_TT_FAST_HASH(guest)
+ srl $16, $10, 2 // g2 = guest >> 2
+ srl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS)
+ xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2
+ li $15, VG_TT_FAST_MASK
+ and $16, $16, $15 // setNo
+
+ // Compute r16 = &VG_(tt_fast)[r16]
+ la $15, VG_(tt_fast)
+ sll $16, $16, VG_FAST_CACHE_SET_BITS
+ addu $16, $16, $15
+
+ // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set)
+ // try way 0
+ lw $14, FCS_g0($16) // .guest0
+ lw $15, FCS_h0($16) // .host0
+ bne $14, $10, 1f // cmp against .guest0
+ // hit at way 0
+ // goto .host0
+ jr $15
+ /*NOTREACHED*/
+ .long 0x0
+
+1: // try way 1
+ lw $14, FCS_g1($16)
+ bne $14, $10, 2f // cmp against .guest1
+ // hit at way 1; swap upwards
+ lw $11, FCS_g0($16) // $11 = old .guest0
+ lw $12, FCS_h0($16) // $12 = old .host0
+ lw $13, FCS_h1($16) // $13 = old .host1
+ sw $10, FCS_g0($16) // new .guest0 = guest
+ sw $13, FCS_h0($16) // new .host0 = old .host1
+ sw $11, FCS_g1($16) // new .guest1 = old .guest0
+ sw $12, FCS_h1($16) // new .host1 = old .host0
+ // stats only
+ lw $15, VG_(stats__n_xIndir_hits1_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndir_hits1_32)
+ // goto old .host1 a.k.a. new .host0
+ jr $13
+ /*NOTREACHED*/
+ .long 0x0
+
+2: // try way 2
+ lw $14, FCS_g2($16)
+ bne $14, $10, 3f // cmp against .guest2
+ // hit at way 2; swap upwards
+ lw $11, FCS_g1($16)
+ lw $12, FCS_h1($16)
+ lw $13, FCS_h2($16)
+ sw $10, FCS_g1($16)
+ sw $13, FCS_h1($16)
+ sw $11, FCS_g2($16)
+ sw $12, FCS_h2($16)
+ // stats only
+ lw $15, VG_(stats__n_xIndir_hits2_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndir_hits2_32)
+ // goto old .host2 a.k.a. new .host1
+ jr $13
+ /*NOTREACHED*/
+ .long 0x0
+
+3: // try way 3
+ lw $14, FCS_g3($16)
+ bne $14, $10, 4f // cmp against .guest3
+ // hit at way 3; swap upwards
+ lw $11, FCS_g2($16)
+ lw $12, FCS_h2($16)
+ lw $13, FCS_h3($16)
+ sw $10, FCS_g2($16)
+ sw $13, FCS_h2($16)
+ sw $11, FCS_g3($16)
+ sw $12, FCS_h3($16)
+ // stats only
+ lw $15, VG_(stats__n_xIndir_hits3_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndir_hits3_32)
+ // goto old .host3 a.k.a. new .host2
+ jr $13
+ /*NOTREACHED*/
+ .long 0x0
+
+4: // fast lookup failed:
+ /* stats only */
+ lw $15, VG_(stats__n_xIndir_misses_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndir_misses_32)
+
+ li $2, VG_TRC_INNER_FASTMISS
+ li $3, 0
+ b postamble
+ /*NOTREACHED*/
+ .long 0x0
/* ------ Assisted jump ------ */
.global VG_(disp_cp_xassisted)
.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
/* Where are we going? */
- ld $11, OFFSET_mips64_PC($23)
-
- lw $13, vgPlain_stats__n_xindirs_32
- addiu $13, $13, 0x1
- sw $13, vgPlain_stats__n_xindirs_32
-
- /* try a fast lookup in the translation cache */
- /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*)
- = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */
-
- move $14, $11
- li $12, VG_TT_FAST_MASK
- srl $14, $14, 2
- and $14, $14, $12
- sll $14, $14, 3
-
- /* t2 = (addr of VG_(tt_fast)) + t1 */
- dla $13, VG_(tt_fast)
- daddu $13, $13, $14
-
- ld $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */
- daddiu $13, $13, 8
- ld $25, 0($13) /* little-endian, so comparing 1st 32bit word */
- nop
-
-check:
- bne $12, $11, fast_lookup_failed
- /* run the translation */
- jr $25
- .long 0x0 /* persuade insn decoders not to speculate past here */
-
-fast_lookup_failed:
- /* %PC is up to date */
- /* back out decrement of the dispatch counter */
- /* hold dispatch_ctr in t0 (r8) */
- lw $13, vgPlain_stats__n_xindirs_32
- addiu $13, $13, 0x1
- sw $13, vgPlain_stats__n_xindirs_32
- li $2, VG_TRC_INNER_FASTMISS
- li $3, 0
- b postamble
+ ld $10, OFFSET_mips64_PC($23)
+
+ /* stats only */
+ lw $15, VG_(stats__n_xIndirs_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndirs_32)
+
+ // LIVE: r23 (guest state ptr), r10 (guest address to go to).
+ // We use 6 temporaries:
+ // r16 (to point at the relevant FastCacheSet),
+ // r11, r12, r13 (scratch, for swapping entries within a set)
+ // r14, r15 (other scratch)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute r16 = VG_TT_FAST_HASH(guest)
+ dsrl $16, $10, 2 // g2 = guest >> 2
+ dsrl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS)
+ xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2
+ li $15, VG_TT_FAST_MASK
+ and $16, $16, $15 // setNo
+
+ // Compute r16 = &VG_(tt_fast)[r16]
+ dla $15, VG_(tt_fast)
+ dsll $16, $16, VG_FAST_CACHE_SET_BITS
+ daddu $16, $16, $15
+
+ // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set)
+ // try way 0
+ ld $14, FCS_g0($16) // .guest0
+ ld $15, FCS_h0($16) // .host0
+ bne $14, $10, 1f // cmp against .guest0
+ // hit at way 0
+ // goto .host0
+ jr $15
+ /*NOTREACHED*/
+ .long 0x0
+
+1: // try way 1
+ ld $14, FCS_g1($16)
+ bne $14, $10, 2f // cmp against .guest1
+ // hit at way 1; swap upwards
+ ld $11, FCS_g0($16) // $11 = old .guest0
+ ld $12, FCS_h0($16) // $12 = old .host0
+ ld $13, FCS_h1($16) // $13 = old .host1
+ sd $10, FCS_g0($16) // new .guest0 = guest
+ sd $13, FCS_h0($16) // new .host0 = old .host1
+ sd $11, FCS_g1($16) // new .guest1 = old .guest0
+ sd $12, FCS_h1($16) // new .host1 = old .host0
+ // stats only
+ lw $15, VG_(stats__n_xIndir_hits1_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndir_hits1_32)
+ // goto old .host1 a.k.a. new .host0
+ jr $13
+ /*NOTREACHED*/
+ .long 0x0
+
+2: // try way 2
+ ld $14, FCS_g2($16)
+ bne $14, $10, 3f // cmp against .guest2
+ // hit at way 2; swap upwards
+ ld $11, FCS_g1($16)
+ ld $12, FCS_h1($16)
+ ld $13, FCS_h2($16)
+ sd $10, FCS_g1($16)
+ sd $13, FCS_h1($16)
+ sd $11, FCS_g2($16)
+ sd $12, FCS_h2($16)
+ // stats only
+ lw $15, VG_(stats__n_xIndir_hits2_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndir_hits2_32)
+ // goto old .host2 a.k.a. new .host1
+ jr $13
+ /*NOTREACHED*/
+ .long 0x0
+
+3: // try way 3
+ ld $14, FCS_g3($16)
+ bne $14, $10, 4f // cmp against .guest3
+ // hit at way 3; swap upwards
+ ld $11, FCS_g2($16)
+ ld $12, FCS_h2($16)
+ ld $13, FCS_h3($16)
+ sd $10, FCS_g2($16)
+ sd $13, FCS_h2($16)
+ sd $11, FCS_g3($16)
+ sd $12, FCS_h3($16)
+ // stats only
+ lw $15, VG_(stats__n_xIndir_hits3_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndir_hits3_32)
+ // goto old .host3 a.k.a. new .host2
+ jr $13
+ /*NOTREACHED*/
+ .long 0x0
+
+4: // fast lookup failed:
+ /* stats only */
+ lw $15, VG_(stats__n_xIndir_misses_32)
+ addiu $15, $15, 1
+ sw $15, VG_(stats__n_xIndir_misses_32)
+
+ li $2, VG_TRC_INNER_FASTMISS
+ li $3, 0
+ b postamble
+ /*NOTREACHED*/
+ .long 0x0
/* ------ Assisted jump ------ */
.global VG_(disp_cp_xassisted)
.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
/* Where are we going? */
- lwz 3,OFFSET_ppc32_CIA(31)
+ lwz 20, OFFSET_ppc32_CIA(31)
/* stats only */
- lis 5,VG_(stats__n_xindirs_32)@ha
- addi 5,5,VG_(stats__n_xindirs_32)@l
- lwz 6,0(5)
- addi 6,6,1
- stw 6,0(5)
+ lis 24, VG_(stats__n_xIndirs_32)@ha
+ addi 24, 24, VG_(stats__n_xIndirs_32)@l
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+
+ // LIVE: r31 (guest state ptr), r20 (guest address to go to).
+ // We use 6 temporaries:
+ // r26 (to point at the relevant FastCacheSet),
+ // r21, r22, r23 (scratch, for swapping entries within a set)
+ // r24, r25 (other scratch)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute r26 = VG_TT_FAST_HASH(guest)
+ srwi 26, 20, 2 // g2 = guest >> 2
+ srwi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS)
+ xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2
+ andi. 26, 26, VG_TT_FAST_MASK // setNo
- /* r5 = &VG_(tt_fast) */
- lis 5,VG_(tt_fast)@ha
- addi 5,5,VG_(tt_fast)@l /* & VG_(tt_fast) */
-
- /* try a fast lookup in the translation cache */
- /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry)
- = ((r3 >>u 2) & VG_TT_FAST_MASK) << 3 */
- rlwinm 4,3,1, 29-VG_TT_FAST_BITS, 28 /* entry# * 8 */
- add 5,5,4 /* & VG_(tt_fast)[entry#] */
- lwz 6,0(5) /* .guest */
- lwz 7,4(5) /* .host */
- cmpw 3,6
- bne fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- mtctr 7
+   // Compute r26 = &VG_(tt_fast)[r26]
+ lis 25, VG_(tt_fast)@ha
+ addi 25, 25, VG_(tt_fast)@l
+ slwi 26, 26, VG_FAST_CACHE_SET_BITS
+ add 26, 26, 25
+
+ // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set)
+ // try way 0
+ lwz 24, FCS_g0(26) // .guest0
+ lwz 25, FCS_h0(26) // .host0
+ cmpw 24, 20 // cmp against .guest0
+ bne 1f
+ // hit at way 0
+ // goto .host0
+ mtctr 25
bctr
+ /*NOTREACHED*/
+
+1: // try way 1
+ lwz 24, FCS_g1(26)
+ cmpw 24, 20 // cmp against .guest1
+ bne 2f
+ // hit at way 1; swap upwards
+ lwz 21, FCS_g0(26) // 21 = old .guest0
+ lwz 22, FCS_h0(26) // 22 = old .host0
+ lwz 23, FCS_h1(26) // 23 = old .host1
+ stw 20, FCS_g0(26) // new .guest0 = guest
+ stw 23, FCS_h0(26) // new .host0 = old .host1
+ stw 21, FCS_g1(26) // new .guest1 = old .guest0
+ stw 22, FCS_h1(26) // new .host1 = old .host0
+ // stats only
+ lis 24, VG_(stats__n_xIndir_hits1_32)@ha
+ addi 24, 24, VG_(stats__n_xIndir_hits1_32)@l
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host1 a.k.a. new .host0
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
+
+2: // try way 2
+ lwz 24, FCS_g2(26)
+ cmpw 24, 20 // cmp against .guest2
+ bne 3f
+ // hit at way 2; swap upwards
+ lwz 21, FCS_g1(26)
+ lwz 22, FCS_h1(26)
+ lwz 23, FCS_h2(26)
+ stw 20, FCS_g1(26)
+ stw 23, FCS_h1(26)
+ stw 21, FCS_g2(26)
+ stw 22, FCS_h2(26)
+ // stats only
+ lis 24, VG_(stats__n_xIndir_hits2_32)@ha
+ addi 24, 24, VG_(stats__n_xIndir_hits2_32)@l
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host2 a.k.a. new .host1
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
+
+3: // try way 3
+ lwz 24, FCS_g3(26)
+ cmpw 24, 20 // cmp against .guest3
+ bne 4f
+ // hit at way 3; swap upwards
+ lwz 21, FCS_g2(26)
+ lwz 22, FCS_h2(26)
+ lwz 23, FCS_h3(26)
+ stw 20, FCS_g2(26)
+ stw 23, FCS_h2(26)
+ stw 21, FCS_g3(26)
+ stw 22, FCS_h3(26)
+ // stats only
+ lis 24, VG_(stats__n_xIndir_hits3_32)@ha
+ addi 24, 24, VG_(stats__n_xIndir_hits3_32)@l
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host3 a.k.a. new .host2
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
-fast_lookup_failed:
+4: // fast lookup failed:
/* stats only */
- lis 5,VG_(stats__n_xindir_misses_32)@ha
- addi 5,5,VG_(stats__n_xindir_misses_32)@l
- lwz 6,0(5)
- addi 6,6,1
- stw 6,0(5)
-
- li 6,VG_TRC_INNER_FASTMISS
- li 7,0
- b postamble
+ lis 24, VG_(stats__n_xIndir_misses_32)@ha
+ addi 24, 24, VG_(stats__n_xIndir_misses_32)@l
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+
+ li 6, VG_TRC_INNER_FASTMISS
+ li 7, 0
+ b postamble
/*NOTREACHED*/
/* ------ Assisted jump ------ */
.type vgPlain_tt_fast, @object
*/
.section ".toc","aw"
+
.tocent__vgPlain_tt_fast:
.tc vgPlain_tt_fast[TC],vgPlain_tt_fast
-.tocent__vgPlain_stats__n_xindirs_32:
- .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32
-.tocent__vgPlain_stats__n_xindir_misses_32:
- .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32
+
+.tocent__vgPlain_stats__n_xIndirs_32:
+ .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32
+
+.tocent__vgPlain_stats__n_xIndir_hits1_32:
+ .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32
+
+.tocent__vgPlain_stats__n_xIndir_hits2_32:
+ .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32
+
+.tocent__vgPlain_stats__n_xIndir_hits3_32:
+ .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32
+
+.tocent__vgPlain_stats__n_xIndir_misses_32:
+ .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32
+
.tocent__vgPlain_machine_ppc64_has_VMX:
- .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX
+ .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX
/*------------------------------------------------------------*/
/*--- ---*/
.globl .VG_(disp_cp_xindir)
.VG_(disp_cp_xindir):
/* Where are we going? */
- ld 3,OFFSET_ppc64_CIA(31)
+ ld 20, OFFSET_ppc64_CIA(31)
/* stats only */
- ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2)
- lwz 6,0(5)
- addi 6,6,1
- stw 6,0(5)
-
- /* r5 = &VG_(tt_fast) */
- ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */
-
- /* try a fast lookup in the translation cache */
- /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry)
- = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */
- rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */
- sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */
- add 5,5,4 /* & VG_(tt_fast)[entry#] */
- ld 6,0(5) /* .guest */
- ld 7,8(5) /* .host */
- cmpd 3,6
- bne .fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- mtctr 7
+ ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2)
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+
+ // LIVE: r31 (guest state ptr), r20 (guest address to go to).
+ // We use 6 temporaries:
+ // r26 (to point at the relevant FastCacheSet),
+ // r21, r22, r23 (scratch, for swapping entries within a set)
+ // r24, r25 (other scratch)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute r26 = VG_TT_FAST_HASH(guest)
+ srdi 26, 20, 2 // g2 = guest >> 2
+ srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS)
+ xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2
+ andi. 26, 26, VG_TT_FAST_MASK // setNo
+
+   // Compute r26 = &VG_(tt_fast)[r26]
+ ld 25, .tocent__vgPlain_tt_fast@toc(2)
+ sldi 26, 26, VG_FAST_CACHE_SET_BITS
+ add 26, 26, 25
+
+ // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set)
+ // try way 0
+ ld 24, FCS_g0(26) // .guest0
+ ld 25, FCS_h0(26) // .host0
+ cmpd 24, 20 // cmp against .guest0
+ bne 1f
+ // hit at way 0
+ // goto .host0
+ mtctr 25
bctr
+ /*NOTREACHED*/
+
+1: // try way 1
+ ld 24, FCS_g1(26)
+ cmpd 24, 20 // cmp against .guest1
+ bne 2f
+ // hit at way 1; swap upwards
+ ld 21, FCS_g0(26) // 21 = old .guest0
+ ld 22, FCS_h0(26) // 22 = old .host0
+ ld 23, FCS_h1(26) // 23 = old .host1
+ std 20, FCS_g0(26) // new .guest0 = guest
+ std 23, FCS_h0(26) // new .host0 = old .host1
+ std 21, FCS_g1(26) // new .guest1 = old .guest0
+ std 22, FCS_h1(26) // new .host1 = old .host0
+ // stats only
+ ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2)
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host1 a.k.a. new .host0
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
+
+2: // try way 2
+ ld 24, FCS_g2(26)
+ cmpd 24, 20 // cmp against .guest2
+ bne 3f
+ // hit at way 2; swap upwards
+ ld 21, FCS_g1(26)
+ ld 22, FCS_h1(26)
+ ld 23, FCS_h2(26)
+ std 20, FCS_g1(26)
+ std 23, FCS_h1(26)
+ std 21, FCS_g2(26)
+ std 22, FCS_h2(26)
+ // stats only
+ ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2)
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host2 a.k.a. new .host1
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
+
+3: // try way 3
+ ld 24, FCS_g3(26)
+ cmpd 24, 20 // cmp against .guest3
+ bne 4f
+ // hit at way 3; swap upwards
+ ld 21, FCS_g2(26)
+ ld 22, FCS_h2(26)
+ ld 23, FCS_h3(26)
+ std 20, FCS_g2(26)
+ std 23, FCS_h2(26)
+ std 21, FCS_g3(26)
+ std 22, FCS_h3(26)
+ // stats only
+ ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2)
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host3 a.k.a. new .host2
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
-.fast_lookup_failed:
+4: // fast lookup failed:
/* stats only */
- ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2)
- lwz 6,0(5)
- addi 6,6,1
- stw 6,0(5)
-
- li 6,VG_TRC_INNER_FASTMISS
- li 7,0
- b .postamble
+ ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2)
+ lwz 25, 0(24)
+   addi 25, 25, 1
+   stw 25, 0(24)
+
+   li 6, VG_TRC_INNER_FASTMISS
+   li 7, 0
+ b .postamble
/*NOTREACHED*/
/* ------ Assisted jump ------ */
.type vgPlain_tt_fast, @object
*/
.section ".toc","aw"
+
.tocent__vgPlain_tt_fast:
.tc vgPlain_tt_fast[TC],vgPlain_tt_fast
-.tocent__vgPlain_stats__n_xindirs_32:
- .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32
-.tocent__vgPlain_stats__n_xindir_misses_32:
- .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32
+
+.tocent__vgPlain_stats__n_xIndirs_32:
+ .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32
+
+.tocent__vgPlain_stats__n_xIndir_hits1_32:
+ .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32
+
+.tocent__vgPlain_stats__n_xIndir_hits2_32:
+ .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32
+
+.tocent__vgPlain_stats__n_xIndir_hits3_32:
+ .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32
+
+.tocent__vgPlain_stats__n_xIndir_misses_32:
+ .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32
+
.tocent__vgPlain_machine_ppc64_has_VMX:
- .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX
+ .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX
/*------------------------------------------------------------*/
/*--- ---*/
addi 2,2,.TOC.-0b@l
.localentry VG_(disp_cp_xindir), .-VG_(disp_cp_xindir)
#endif
- /* Where are we going? */
- ld 3,OFFSET_ppc64_CIA(31)
+ /* Where are we going? */
+ ld 20, OFFSET_ppc64_CIA(31)
/* stats only */
- ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2)
- lwz 6,0(5)
- addi 6,6,1
- stw 6,0(5)
-
- /* r5 = &VG_(tt_fast) */
- ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */
-
- /* try a fast lookup in the translation cache */
- /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry)
- = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */
- rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */
- sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */
- add 5,5,4 /* & VG_(tt_fast)[entry#] */
- ld 6,0(5) /* .guest */
- ld 7,8(5) /* .host */
- cmpd 3,6
- bne .fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- mtctr 7
+ ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2)
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+
+ // LIVE: r31 (guest state ptr), r20 (guest address to go to).
+ // We use 6 temporaries:
+ // r26 (to point at the relevant FastCacheSet),
+ // r21, r22, r23 (scratch, for swapping entries within a set)
+ // r24, r25 (other scratch)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute r26 = VG_TT_FAST_HASH(guest)
+ srdi 26, 20, 2 // g2 = guest >> 2
+ srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS)
+ xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2
+ andi. 26, 26, VG_TT_FAST_MASK // setNo
+
+   // Compute r26 = &VG_(tt_fast)[r26]
+ ld 25, .tocent__vgPlain_tt_fast@toc(2)
+ sldi 26, 26, VG_FAST_CACHE_SET_BITS
+ add 26, 26, 25
+
+ // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set)
+ // try way 0
+ ld 24, FCS_g0(26) // .guest0
+ ld 25, FCS_h0(26) // .host0
+ cmpd 24, 20 // cmp against .guest0
+ bne 1f
+ // hit at way 0
+ // goto .host0
+ mtctr 25
bctr
-#if _CALL_ELF == 2
- .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir)
-#endif
+ /*NOTREACHED*/
+
+1: // try way 1
+ ld 24, FCS_g1(26)
+ cmpd 24, 20 // cmp against .guest1
+ bne 2f
+ // hit at way 1; swap upwards
+ ld 21, FCS_g0(26) // 21 = old .guest0
+ ld 22, FCS_h0(26) // 22 = old .host0
+ ld 23, FCS_h1(26) // 23 = old .host1
+ std 20, FCS_g0(26) // new .guest0 = guest
+ std 23, FCS_h0(26) // new .host0 = old .host1
+ std 21, FCS_g1(26) // new .guest1 = old .guest0
+ std 22, FCS_h1(26) // new .host1 = old .host0
+ // stats only
+ ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2)
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host1 a.k.a. new .host0
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
+
+2: // try way 2
+ ld 24, FCS_g2(26)
+ cmpd 24, 20 // cmp against .guest2
+ bne 3f
+ // hit at way 2; swap upwards
+ ld 21, FCS_g1(26)
+ ld 22, FCS_h1(26)
+ ld 23, FCS_h2(26)
+ std 20, FCS_g1(26)
+ std 23, FCS_h1(26)
+ std 21, FCS_g2(26)
+ std 22, FCS_h2(26)
+ // stats only
+ ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2)
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host2 a.k.a. new .host1
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
+
+3: // try way 3
+ ld 24, FCS_g3(26)
+ cmpd 24, 20 // cmp against .guest3
+ bne 4f
+ // hit at way 3; swap upwards
+ ld 21, FCS_g2(26)
+ ld 22, FCS_h2(26)
+ ld 23, FCS_h3(26)
+ std 20, FCS_g2(26)
+ std 23, FCS_h2(26)
+ std 21, FCS_g3(26)
+ std 22, FCS_h3(26)
+ // stats only
+ ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2)
+ lwz 25, 0(24)
+ addi 25, 25, 1
+ stw 25, 0(24)
+ // goto old .host3 a.k.a. new .host2
+ mtctr 23
+ bctr
+ /*NOTREACHED*/
-.fast_lookup_failed:
+4: // fast lookup failed:
/* stats only */
- ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2)
- lwz 6,0(5)
- addi 6,6,1
- stw 6,0(5)
-
- li 6,VG_TRC_INNER_FASTMISS
- li 7,0
- b .postamble
+ ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2)
+ lwz 25, 0(24)
+   addi 25, 25, 1
+   stw 25, 0(24)
+
+   li 6, VG_TRC_INNER_FASTMISS
+   li 7, 0
+ b .postamble
/*NOTREACHED*/
+#if _CALL_ELF == 2
+ .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir)
+#endif
/* ------ Assisted jump ------ */
.section ".text"
/* ------ Indirect but boring jump ------ */
.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
- /* Where are we going? */
- lg %r2, OFFSET_s390x_IA(%r13)
-
- /* Increment VG_(stats__n_xindirs_32) */
- larl %r8, VG_(stats__n_xindirs_32)
- l %r10,0(%r8)
- ahi %r10,1
- st %r10,0(%r8)
-
- /* Try a fast lookup in the translation cache:
- Compute offset (not index) into VT_(tt_fast):
-
- offset = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry)
-
- with VG_TT_FAST_HASH(addr) == (addr >> 1) & VG_TT_FAST_MASK
- and sizeof(FastCacheEntry) == 16
-
- offset = ((addr >> 1) & VG_TT_FAST_MASK) << 4
- which is
- offset = ((addr & (VG_TT_FAST_MASK << 1) ) << 3
- */
- larl %r8, VG_(tt_fast)
- llill %r5,(VG_TT_FAST_MASK << 1) & 0xffff
-#if ((( VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 != 0)
- iilh %r5,((VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16
-#endif
- ngr %r5,%r2
- sllg %r7,%r5,3
- lg %r11, 8(%r8,%r7) /* .host */
- cg %r2, 0(%r8,%r7) /* next guest address == .guest ? */
- jne fast_lookup_failed
-
- /* Found a match. Call .host.
- r11 is an address. There we will find the instrumented client code.
- That code may modify the guest state register r13. */
- br %r11
- .long 0x0 /* persuade insn decoders not to speculate past here */
-
-fast_lookup_failed:
- /* Increment VG_(stats__n_xindir_misses_32) */
- larl %r8, VG_(stats__n_xindir_misses_32)
- l %r10,0(%r8)
- ahi %r10,1
- st %r10,0(%r8)
-
- lghi %r0,VG_TRC_INNER_FASTMISS
- lghi %r1,0
+ /* Where are we going? */
+ lg %r6, OFFSET_s390x_IA(%r13) // "guest"
+
+ /* stats only */
+ larl %r11, VG_(stats__n_xIndirs_32)
+ l %r12, 0(%r11)
+ ahi %r12, 1
+ st %r12, 0(%r11)
+
+ // LIVE: r13 (guest state ptr), r6 (guest address to go to).
+ // We use 6 temporaries:
+ // r7 (to point at the relevant FastCacheSet),
+ // r8, r9, r10 (scratch, for swapping entries within a set)
+ // r11, r12 (other scratch)
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute %r7 = VG_TT_FAST_HASH(guest)
+ srlg %r7, %r6, 1 // g1 = guest >> 1
+ srlg %r8, %r6, (VG_TT_FAST_BITS + 1) // (g1 >> VG_TT_FAST_BITS)
+ xgr %r7, %r8 // (g1 >> VG_TT_FAST_BITS) ^ g1
+ llill %r8, VG_TT_FAST_MASK & 0xffff
+# if ((VG_TT_FAST_MASK & 0xffff0000) >> 16 != 0)
+ iilh %r8, (VG_TT_FAST_MASK & 0xffff0000) >> 16
+# endif
+ ngr %r7, %r8 // setNo
+
+ // Compute %r7 = &VG_(tt_fast)[%r7]
+ sllg %r7,%r7, VG_FAST_CACHE_SET_BITS // setNo * sizeof(FastCacheSet)
+ larl %r8, VG_(tt_fast) // &VG_(tt_fast)[0]
+ agr %r7, %r8 // &VG_(tt_fast)[setNo]
+
+ // LIVE: %r13 (guest state ptr), %r6 (guest addr), %r7 (cache set)
+ // try way 0
+ cg %r6, FCS_g0(%r7) // cmp against .guest0
+ lg %r8, FCS_h0(%r7)
+ jne 1f
+ // hit at way 0
+ // goto .host0
+ br %r8
+ /*NOTREACHED*/
+ .long 0
+
+1: // try way 1
+ cg %r6, FCS_g1(%r7) // cmp against .guest1
+ jne 2f
+ // hit at way 1; swap upwards
+ lg %r8, FCS_g0(%r7) // r8 = old .guest0
+ lg %r9, FCS_h0(%r7) // r9 = old .host0
+ lg %r10, FCS_h1(%r7) // r10 = old .host1
+ stg %r6, FCS_g0(%r7) // new .guest0 = guest
+ stg %r10, FCS_h0(%r7) // new .host0 = old .host1
+ stg %r8, FCS_g1(%r7) // new .guest1 = old .guest0
+ stg %r9, FCS_h1(%r7) // new .host1 = old .host0
+ // stats only
+ larl %r11, VG_(stats__n_xIndir_hits1_32)
+ l %r12, 0(%r11)
+ ahi %r12, 1
+ st %r12, 0(%r11)
+ // goto old .host1 a.k.a. new .host0
+ br %r10
+ /*NOTREACHED*/
+ .long 0
+
+2: // try way 2
+ cg %r6, FCS_g2(%r7) // cmp against .guest2
+ jne 3f
+        // hit at way 2; swap upwards
+        lg   %r8, FCS_g1(%r7)
+ lg %r9, FCS_h1(%r7)
+ lg %r10, FCS_h2(%r7)
+ stg %r6, FCS_g1(%r7)
+ stg %r10, FCS_h1(%r7)
+ stg %r8, FCS_g2(%r7)
+ stg %r9, FCS_h2(%r7)
+ // stats only
+ larl %r11, VG_(stats__n_xIndir_hits2_32)
+ l %r12, 0(%r11)
+ ahi %r12, 1
+ st %r12, 0(%r11)
+ // goto old .host2 a.k.a. new .host1
+ br %r10
+ /*NOTREACHED*/
+ .long 0
+
+3: // try way 3
+ cg %r6, FCS_g3(%r7) // cmp against .guest3
+ jne 4f
+ // hit at way 3; swap upwards
+ lg %r8, FCS_g2(%r7)
+ lg %r9, FCS_h2(%r7)
+ lg %r10, FCS_h3(%r7)
+ stg %r6, FCS_g2(%r7)
+ stg %r10, FCS_h2(%r7)
+ stg %r8, FCS_g3(%r7)
+ stg %r9, FCS_h3(%r7)
+ // stats only
+ larl %r11, VG_(stats__n_xIndir_hits3_32)
+ l %r12, 0(%r11)
+ ahi %r12, 1
+ st %r12, 0(%r11)
+ // goto old .host3 a.k.a. new .host2
+ br %r10
+        /*NOTREACHED*/
+        .long 0
+
+4: // fast lookup failed
+ larl %r11, VG_(stats__n_xIndir_misses_32)
+ l %r12, 0(%r11)
+ ahi %r12, 1
+ st %r12, 0(%r11)
+
+ lghi %r0, VG_TRC_INNER_FASTMISS
+ lghi %r1, 0
j postamble
+ /*NOTREACHED*/
/* ------ Assisted jump ------ */
jmp postamble
/* ------ Indirect but boring jump ------ */
-.globl VG_(disp_cp_xindir)
+.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
/* Where are we going? */
- movl OFFSET_x86_EIP(%ebp), %eax
+ movl OFFSET_x86_EIP(%ebp), %eax // "guest"
/* stats only */
- addl $1, VG_(stats__n_xindirs_32)
-
- /* try a fast lookup in the translation cache */
- movl %eax, %ebx /* next guest addr */
- andl $VG_TT_FAST_MASK, %ebx /* entry# */
- movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */
- movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */
- cmpl %eax, %esi
- jnz fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- jmp *%edi
- ud2 /* persuade insn decoders not to speculate past here */
-
-fast_lookup_failed:
+ addl $1, VG_(stats__n_xIndirs_32)
+
+ // LIVE: %ebp (guest state ptr), %eax (guest address to go to).
+ // We use 4 temporaries:
+ // %esi (to point at the relevant FastCacheSet),
+ // %ebx, %ecx and %edx (scratch).
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute %esi = VG_TT_FAST_HASH(guest)
+ movl %eax, %esi // guest
+ shrl $VG_TT_FAST_BITS, %esi // (guest >> VG_TT_FAST_BITS)
+ xorl %eax, %esi // (guest >> VG_TT_FAST_BITS) ^ guest
+ andl $VG_TT_FAST_MASK, %esi // setNo
+
+ // Compute %esi = &VG_(tt_fast)[%esi]
+ shll $VG_FAST_CACHE_SET_BITS, %esi // setNo * sizeof(FastCacheSet)
+ leal VG_(tt_fast)(%esi), %esi // &VG_(tt_fast)[setNo]
+
+ // LIVE: %ebp (guest state ptr), %eax (guest addr), %esi (cache set)
+ // try way 0
+ cmpl %eax, FCS_g0(%esi) // cmp against .guest0
+ jnz 1f
+ // hit at way 0
+ jmp *FCS_h0(%esi) // goto .host0
+ ud2
+
+1: // try way 1
+ cmpl %eax, FCS_g1(%esi) // cmp against .guest1
+ jnz 2f
+ // hit at way 1; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits1_32)
+ movl FCS_g0(%esi), %ebx // ebx = old .guest0
+ movl FCS_h0(%esi), %ecx // ecx = old .host0
+ movl FCS_h1(%esi), %edx // edx = old .host1
+ movl %eax, FCS_g0(%esi) // new .guest0 = guest
+ movl %edx, FCS_h0(%esi) // new .host0 = old .host1
+ movl %ebx, FCS_g1(%esi) // new .guest1 = old .guest0
+ movl %ecx, FCS_h1(%esi) // new .host1 = old .host0
+ jmp *%edx // goto old .host1 a.k.a. new .host0
+ ud2
+
+2: // try way 2
+ cmpl %eax, FCS_g2(%esi) // cmp against .guest2
+ jnz 3f
+ // hit at way 2; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits2_32)
+ movl FCS_g1(%esi), %ebx
+ movl FCS_h1(%esi), %ecx
+ movl FCS_h2(%esi), %edx
+ movl %eax, FCS_g1(%esi)
+ movl %edx, FCS_h1(%esi)
+ movl %ebx, FCS_g2(%esi)
+ movl %ecx, FCS_h2(%esi)
+ jmp *%edx
+ ud2
+
+3: // try way 3
+ cmpl %eax, FCS_g3(%esi) // cmp against .guest3
+ jnz 4f
+ // hit at way 3; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits3_32)
+ movl FCS_g2(%esi), %ebx
+ movl FCS_h2(%esi), %ecx
+ movl FCS_h3(%esi), %edx
+ movl %eax, FCS_g2(%esi)
+ movl %edx, FCS_h2(%esi)
+ movl %ebx, FCS_g3(%esi)
+ movl %ecx, FCS_h3(%esi)
+ jmp *%edx
+ ud2
+
+4: // fast lookup failed
/* stats only */
- addl $1, VG_(stats__n_xindir_misses_32)
+ addl $1, VG_(stats__n_xIndir_misses_32)
movl $VG_TRC_INNER_FASTMISS, %eax
movl $0, %edx
.global VG_(disp_cp_xindir)
VG_(disp_cp_xindir):
/* Where are we going? */
- movl OFFSET_x86_EIP(%ebp), %eax
+ movl OFFSET_x86_EIP(%ebp), %eax // "guest"
/* stats only */
- addl $1, VG_(stats__n_xindirs_32)
-
- /* try a fast lookup in the translation cache */
- movl %eax, %ebx /* next guest addr */
- andl $VG_TT_FAST_MASK, %ebx /* entry# */
- movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */
- movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */
- cmpl %eax, %esi
- jnz fast_lookup_failed
-
- /* Found a match. Jump to .host. */
- jmp *%edi
- ud2 /* persuade insn decoders not to speculate past here */
-
-fast_lookup_failed:
+ addl $1, VG_(stats__n_xIndirs_32)
+
+ // LIVE: %ebp (guest state ptr), %eax (guest address to go to).
+ // We use 4 temporaries:
+ // %esi (to point at the relevant FastCacheSet),
+ // %ebx, %ecx and %edx (scratch).
+
+ /* Try a fast lookup in the translation cache. This is pretty much
+ a handcoded version of VG_(lookupInFastCache). */
+
+ // Compute %esi = VG_TT_FAST_HASH(guest)
+ movl %eax, %esi // guest
+ shrl $VG_TT_FAST_BITS, %esi // (guest >> VG_TT_FAST_BITS)
+ xorl %eax, %esi // (guest >> VG_TT_FAST_BITS) ^ guest
+ andl $VG_TT_FAST_MASK, %esi // setNo
+
+ // Compute %esi = &VG_(tt_fast)[%esi]
+ shll $VG_FAST_CACHE_SET_BITS, %esi // setNo * sizeof(FastCacheSet)
+ leal VG_(tt_fast)(%esi), %esi // &VG_(tt_fast)[setNo]
+
+ // LIVE: %ebp (guest state ptr), %eax (guest addr), %esi (cache set)
+ // try way 0
+ cmpl %eax, FCS_g0(%esi) // cmp against .guest0
+ jnz 1f
+ // hit at way 0
+ jmp *FCS_h0(%esi) // goto .host0
+ ud2
+
+1: // try way 1
+ cmpl %eax, FCS_g1(%esi) // cmp against .guest1
+ jnz 2f
+ // hit at way 1; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits1_32)
+ movl FCS_g0(%esi), %ebx // ebx = old .guest0
+ movl FCS_h0(%esi), %ecx // ecx = old .host0
+ movl FCS_h1(%esi), %edx // edx = old .host1
+ movl %eax, FCS_g0(%esi) // new .guest0 = guest
+ movl %edx, FCS_h0(%esi) // new .host0 = old .host1
+ movl %ebx, FCS_g1(%esi) // new .guest1 = old .guest0
+ movl %ecx, FCS_h1(%esi) // new .host1 = old .host0
+ jmp *%edx // goto old .host1 a.k.a. new .host0
+ ud2
+
+2: // try way 2
+ cmpl %eax, FCS_g2(%esi) // cmp against .guest2
+ jnz 3f
+ // hit at way 2; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits2_32)
+ movl FCS_g1(%esi), %ebx
+ movl FCS_h1(%esi), %ecx
+ movl FCS_h2(%esi), %edx
+ movl %eax, FCS_g1(%esi)
+ movl %edx, FCS_h1(%esi)
+ movl %ebx, FCS_g2(%esi)
+ movl %ecx, FCS_h2(%esi)
+ jmp *%edx
+ ud2
+
+3: // try way 3
+ cmpl %eax, FCS_g3(%esi) // cmp against .guest3
+ jnz 4f
+ // hit at way 3; swap upwards
+ /* stats only */
+ addl $1, VG_(stats__n_xIndir_hits3_32)
+ movl FCS_g2(%esi), %ebx
+ movl FCS_h2(%esi), %ecx
+ movl FCS_h3(%esi), %edx
+ movl %eax, FCS_g2(%esi)
+ movl %edx, FCS_h2(%esi)
+ movl %ebx, FCS_g3(%esi)
+ movl %ecx, FCS_h3(%esi)
+ jmp *%edx
+ ud2
+
+4: // fast lookup failed
/* stats only */
- addl $1, VG_(stats__n_xindir_misses_32)
+ addl $1, VG_(stats__n_xIndir_misses_32)
movl $VG_TRC_INNER_FASTMISS, %eax
movl $0, %edx
static ULong n_scheduling_events_MINOR = 0;
static ULong n_scheduling_events_MAJOR = 0;
-/* Stats: number of XIndirs, and number that missed in the fast
- cache. */
-static ULong stats__n_xindirs = 0;
-static ULong stats__n_xindir_misses = 0;
+/* Stats: number of XIndirs looked up in the fast cache, the number of hits in
+ ways 1, 2 and 3, and the number of misses. The number of hits in way 0 isn't
+ recorded because it can be computed from these five numbers. */
+static ULong stats__n_xIndirs = 0;
+static ULong stats__n_xIndir_hits1 = 0;
+static ULong stats__n_xIndir_hits2 = 0;
+static ULong stats__n_xIndir_hits3 = 0;
+static ULong stats__n_xIndir_misses = 0;
/* And 32-bit temp bins for the above, so that 32-bit platforms don't
have to do 64 bit incs on the hot path through
- VG_(cp_disp_xindir). */
-/*global*/ UInt VG_(stats__n_xindirs_32) = 0;
-/*global*/ UInt VG_(stats__n_xindir_misses_32) = 0;
+ VG_(disp_cp_xindir). */
+/*global*/ UInt VG_(stats__n_xIndirs_32) = 0;
+/*global*/ UInt VG_(stats__n_xIndir_hits1_32) = 0;
+/*global*/ UInt VG_(stats__n_xIndir_hits2_32) = 0;
+/*global*/ UInt VG_(stats__n_xIndir_hits3_32) = 0;
+/*global*/ UInt VG_(stats__n_xIndir_misses_32) = 0;
/* Sanity checking counts. */
static UInt sanity_fast_count = 0;
{
VG_(message)(Vg_DebugMsg,
"scheduler: %'llu event checks.\n", bbs_done );
+
+ const ULong hits0
+ = stats__n_xIndirs - stats__n_xIndir_hits1 - stats__n_xIndir_hits2
+ - stats__n_xIndir_hits3 - stats__n_xIndir_misses;
+ VG_(message)(Vg_DebugMsg,
+ "scheduler: %'llu indir transfers, "
+ "%'llu misses (1 in %llu) ..\n",
+ stats__n_xIndirs, stats__n_xIndir_misses,
+ stats__n_xIndirs / (stats__n_xIndir_misses
+ ? stats__n_xIndir_misses : 1));
VG_(message)(Vg_DebugMsg,
- "scheduler: %'llu indir transfers, %'llu misses (1 in %llu)\n",
- stats__n_xindirs, stats__n_xindir_misses,
- stats__n_xindirs / (stats__n_xindir_misses
- ? stats__n_xindir_misses : 1));
+ "scheduler: .. of which: %'llu hit0, %'llu hit1, "
+ "%'llu hit2, %'llu hit3, %'llu missed\n",
+ hits0,
+ stats__n_xIndir_hits1,
+ stats__n_xIndir_hits2,
+ stats__n_xIndir_hits3,
+ stats__n_xIndir_misses);
+
VG_(message)(Vg_DebugMsg,
"scheduler: %'llu/%'llu major/minor sched events.\n",
n_scheduling_events_MAJOR, n_scheduling_events_MINOR);
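
To illustrate the reporting arithmetic with made-up numbers: if stats__n_xIndirs is 1,000,000 and the counters record 30,000 way-1 hits, 10,000 way-2 hits, 5,000 way-3 hits and 5,000 misses, then hits0 = 1,000,000 - 30,000 - 10,000 - 5,000 - 5,000 = 950,000, and the first message above reports the miss rate as "1 in 200" (1,000,000 / 5,000).
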
/* end Paranoia */
/* Futz with the XIndir stats counters. */
- vg_assert(VG_(stats__n_xindirs_32) == 0);
- vg_assert(VG_(stats__n_xindir_misses_32) == 0);
+ vg_assert(VG_(stats__n_xIndirs_32) == 0);
+ vg_assert(VG_(stats__n_xIndir_hits1_32) == 0);
+ vg_assert(VG_(stats__n_xIndir_hits2_32) == 0);
+ vg_assert(VG_(stats__n_xIndir_hits3_32) == 0);
+ vg_assert(VG_(stats__n_xIndir_misses_32) == 0);
/* Clear return area. */
two_words[0] = two_words[1] = 0;
host_code_addr = alt_host_addr;
} else {
/* normal case -- redir translation */
- UInt cno = (UInt)VG_TT_FAST_HASH((Addr)tst->arch.vex.VG_INSTR_PTR);
- if (LIKELY(VG_(tt_fast)[cno].guest == (Addr)tst->arch.vex.VG_INSTR_PTR))
- host_code_addr = VG_(tt_fast)[cno].host;
- else {
+ Addr host_from_fast_cache = 0;
+ Bool found_in_fast_cache
+ = VG_(lookupInFastCache)( &host_from_fast_cache,
+ (Addr)tst->arch.vex.VG_INSTR_PTR );
+ if (found_in_fast_cache) {
+ host_code_addr = host_from_fast_cache;
+ } else {
Addr res = 0;
/* not found in VG_(tt_fast). Searching here the transtab
improves the performance compared to returning directly
/* Merge the 32-bit XIndir/miss counters into the 64 bit versions,
and zero out the 32-bit ones in preparation for the next run of
generated code. */
- stats__n_xindirs += (ULong)VG_(stats__n_xindirs_32);
- VG_(stats__n_xindirs_32) = 0;
- stats__n_xindir_misses += (ULong)VG_(stats__n_xindir_misses_32);
- VG_(stats__n_xindir_misses_32) = 0;
+ stats__n_xIndirs += (ULong)VG_(stats__n_xIndirs_32);
+ VG_(stats__n_xIndirs_32) = 0;
+ stats__n_xIndir_hits1 += (ULong)VG_(stats__n_xIndir_hits1_32);
+ VG_(stats__n_xIndir_hits1_32) = 0;
+ stats__n_xIndir_hits2 += (ULong)VG_(stats__n_xIndir_hits2_32);
+ VG_(stats__n_xIndir_hits2_32) = 0;
+ stats__n_xIndir_hits3 += (ULong)VG_(stats__n_xIndir_hits3_32);
+ VG_(stats__n_xIndir_hits3_32) = 0;
+ stats__n_xIndir_misses += (ULong)VG_(stats__n_xIndir_misses_32);
+ VG_(stats__n_xIndir_misses_32) = 0;
/* Inspect the event counter. */
vg_assert((Int)tst->arch.vex.host_EvC_COUNTER >= -1);
static SECno sector_search_order[MAX_N_SECTORS];
-/* Fast helper for the TC. A direct-mapped cache which holds a set of
- recently used (guest address, host address) pairs. This array is
- referred to directly from m_dispatch/dispatch-<platform>.S.
+/* Fast helper for the TC. A 4-way set-associative cache, with more-or-less LRU
+ replacement. It holds a set of recently used (guest address, host address)
+ pairs. This array is referred to directly from
+ m_dispatch/dispatch-<platform>.S.
Entries in tt_fast may refer to any valid TC entry, regardless of
which sector it's in. Consequently we must be very careful to
/*
typedef
struct {
- Addr guest;
- Addr host;
- }
- FastCacheEntry;
+ Addr guest0;
+ Addr host0;
+ Addr guest1;
+ Addr host1;
+ Addr guest2;
+ Addr host2;
+ Addr guest3;
+ Addr host3;
+ }
+ FastCacheSet;
*/
-/*global*/ __attribute__((aligned(16)))
- FastCacheEntry VG_(tt_fast)[VG_TT_FAST_SIZE];
+/*global*/ __attribute__((aligned(64)))
+ FastCacheSet VG_(tt_fast)[VG_TT_FAST_SETS];
/* Make sure we're not used before initialisation. */
static Bool init_done = False;
return (HTTno)(k32 % N_HTTES_PER_SECTOR);
}
-static void setFastCacheEntry ( Addr key, ULong* tcptr )
-{
- UInt cno = (UInt)VG_TT_FAST_HASH(key);
- VG_(tt_fast)[cno].guest = key;
- VG_(tt_fast)[cno].host = (Addr)tcptr;
- n_fast_updates++;
- /* This shouldn't fail. It should be assured by m_translate
- which should reject any attempt to make translation of code
- starting at TRANSTAB_BOGUS_GUEST_ADDR. */
- vg_assert(VG_(tt_fast)[cno].guest != TRANSTAB_BOGUS_GUEST_ADDR);
-}
-
/* Invalidate the fast cache VG_(tt_fast). */
static void invalidateFastCache ( void )
{
- UInt j;
- /* This loop is popular enough to make it worth unrolling a
- bit, at least on ppc32. */
- vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0);
- for (j = 0; j < VG_TT_FAST_SIZE; j += 4) {
- VG_(tt_fast)[j+0].guest = TRANSTAB_BOGUS_GUEST_ADDR;
- VG_(tt_fast)[j+1].guest = TRANSTAB_BOGUS_GUEST_ADDR;
- VG_(tt_fast)[j+2].guest = TRANSTAB_BOGUS_GUEST_ADDR;
- VG_(tt_fast)[j+3].guest = TRANSTAB_BOGUS_GUEST_ADDR;
+ for (UWord j = 0; j < VG_TT_FAST_SETS; j++) {
+ FastCacheSet* set = &VG_(tt_fast)[j];
+ set->guest0 = TRANSTAB_BOGUS_GUEST_ADDR;
+ set->guest1 = TRANSTAB_BOGUS_GUEST_ADDR;
+ set->guest2 = TRANSTAB_BOGUS_GUEST_ADDR;
+ set->guest3 = TRANSTAB_BOGUS_GUEST_ADDR;
}
-
- vg_assert(j == VG_TT_FAST_SIZE);
n_fast_flushes++;
}
+static void setFastCacheEntry ( Addr guest, ULong* tcptr )
+{
+ /* This shouldn't fail. It should be assured by m_translate
+ which should reject any attempt to make translation of code
+ starting at TRANSTAB_BOGUS_GUEST_ADDR. */
+ vg_assert(guest != TRANSTAB_BOGUS_GUEST_ADDR);
+ /* Shift all entries along one, so that the LRU one disappears, and put the
+ new entry at the MRU position. */
+ UWord setNo = (UInt)VG_TT_FAST_HASH(guest);
+ FastCacheSet* set = &VG_(tt_fast)[setNo];
+ set->host3 = set->host2;
+ set->guest3 = set->guest2;
+ set->host2 = set->host1;
+ set->guest2 = set->guest1;
+ set->host1 = set->host0;
+ set->guest1 = set->guest0;
+ set->host0 = (Addr)tcptr;
+ set->guest0 = guest;
+ n_fast_updates++;
+}
+
static TTEno get_empty_tt_slot(SECno sNo)
{
vg_assert(N_HTTES_PER_SECTOR < INV_TTE);
vg_assert(N_HTTES_PER_SECTOR < EC2TTE_DELETED);
vg_assert(N_HTTES_PER_SECTOR < HTT_EMPTY);
- /* check fast cache entries really are 2 words long */
+
+   /* check fast cache sets really are 8 words long */
vg_assert(sizeof(Addr) == sizeof(void*));
- vg_assert(sizeof(FastCacheEntry) == 2 * sizeof(Addr));
+ vg_assert(sizeof(FastCacheSet) == 8 * sizeof(Addr));
/* check fast cache entries are packed back-to-back with no spaces */
vg_assert(sizeof( VG_(tt_fast) )
- == VG_TT_FAST_SIZE * sizeof(FastCacheEntry));
+ == VG_TT_FAST_SETS * sizeof(FastCacheSet));
+ /* check fast cache entries have the layout that the handwritten assembly
+ fragments assume. */
+ vg_assert(sizeof(FastCacheSet) == (1 << VG_FAST_CACHE_SET_BITS));
+ vg_assert(offsetof(FastCacheSet,guest0) == FCS_g0);
+ vg_assert(offsetof(FastCacheSet,host0) == FCS_h0);
+ vg_assert(offsetof(FastCacheSet,guest1) == FCS_g1);
+ vg_assert(offsetof(FastCacheSet,host1) == FCS_h1);
+ vg_assert(offsetof(FastCacheSet,guest2) == FCS_g2);
+ vg_assert(offsetof(FastCacheSet,host2) == FCS_h2);
+ vg_assert(offsetof(FastCacheSet,guest3) == FCS_g3);
+ vg_assert(offsetof(FastCacheSet,host3) == FCS_h3);
+ vg_assert(offsetof(FastCacheSet,guest0) == 0 * sizeof(Addr));
+ vg_assert(offsetof(FastCacheSet,host0) == 1 * sizeof(Addr));
+ vg_assert(offsetof(FastCacheSet,guest1) == 2 * sizeof(Addr));
+ vg_assert(offsetof(FastCacheSet,host1) == 3 * sizeof(Addr));
+ vg_assert(offsetof(FastCacheSet,guest2) == 4 * sizeof(Addr));
+ vg_assert(offsetof(FastCacheSet,host2) == 5 * sizeof(Addr));
+ vg_assert(offsetof(FastCacheSet,guest3) == 6 * sizeof(Addr));
+ vg_assert(offsetof(FastCacheSet,host3) == 7 * sizeof(Addr));
+
/* check fast cache is aligned as we requested. Not fatal if it
isn't, but we might as well make sure. */
- vg_assert(VG_IS_16_ALIGNED( ((Addr) & VG_(tt_fast)[0]) ));
+ vg_assert(VG_IS_64_ALIGNED( ((Addr) & VG_(tt_fast)[0]) ));
/* The TTEntryH size is critical for keeping the LLC miss rate down
when doing a lot of discarding. Hence check it here. We also
#include "pub_tool_transtab.h"
#include "libvex.h" // VexGuestExtents
-/* The fast-cache for tt-lookup. Unused entries are denoted by .guest
- == 1, which is assumed to be a bogus address for all guest code. */
+/* The fast-cache for tt-lookup. Unused entries are denoted by
+ .guest0 .. .guest3 == TRANSTAB_BOGUS_GUEST_ADDR (viz, 1), which is
+ assumed to be a bogus address for all guest code. See
+ pub_core_transtab_asm.h for further description. */
typedef
struct {
- Addr guest;
- Addr host;
+ Addr guest0;
+ Addr host0;
+ Addr guest1;
+ Addr host1;
+ Addr guest2;
+ Addr host2;
+ Addr guest3;
+ Addr host3;
}
- FastCacheEntry;
+ FastCacheSet;
-extern __attribute__((aligned(16)))
- FastCacheEntry VG_(tt_fast) [VG_TT_FAST_SIZE];
+STATIC_ASSERT(sizeof(Addr) == sizeof(UWord));
+STATIC_ASSERT(sizeof(FastCacheSet) == sizeof(Addr) * 8);
+
+extern __attribute__((aligned(64)))
+ FastCacheSet VG_(tt_fast) [VG_TT_FAST_SETS];
#define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1)
+#if defined(VGA_x86) || defined(VGA_amd64)
+static inline UWord VG_TT_FAST_HASH ( Addr guest ) {
+ // There's no minimum insn alignment on these targets.
+ UWord merged = ((UWord)guest) >> 0;
+ merged = (merged >> VG_TT_FAST_BITS) ^ merged;
+ return merged & VG_TT_FAST_MASK;
+}
+
+#elif defined(VGA_s390x) || defined(VGA_arm)
+static inline UWord VG_TT_FAST_HASH ( Addr guest ) {
+ // Instructions are 2-byte aligned.
+ UWord merged = ((UWord)guest) >> 1;
+ merged = (merged >> VG_TT_FAST_BITS) ^ merged;
+ return merged & VG_TT_FAST_MASK;
+}
+
+#elif defined(VGA_ppc32) || defined(VGA_ppc64be) || defined(VGA_ppc64le) \
+ || defined(VGA_mips32) || defined(VGA_mips64) || defined(VGA_arm64)
+static inline UWord VG_TT_FAST_HASH ( Addr guest ) {
+ // Instructions are 4-byte aligned.
+ UWord merged = ((UWord)guest) >> 2;
+ merged = (merged >> VG_TT_FAST_BITS) ^ merged;
+ return merged & VG_TT_FAST_MASK;
+}
+
+#else
+# error "VG_TT_FAST_HASH: unknown platform"
+#endif
+
+static inline Bool VG_(lookupInFastCache)( /*MB_OUT*/Addr* host, Addr guest )
+{
+ UWord setNo = (UInt)VG_TT_FAST_HASH(guest);
+ FastCacheSet* set = &VG_(tt_fast)[setNo];
+ if (LIKELY(set->guest0 == guest)) {
+ // hit at way 0
+ *host = set->host0;
+ return True;
+ }
+ if (LIKELY(set->guest1 == guest)) {
+ // hit at way 1; swap upwards
+ Addr tG = guest;
+ Addr tH = set->host1;
+ set->guest1 = set->guest0;
+ set->host1 = set->host0;
+ set->guest0 = tG;
+ set->host0 = tH;
+ *host = tH;
+ return True;
+ }
+ if (LIKELY(set->guest2 == guest)) {
+ // hit at way 2; swap upwards
+ Addr tG = guest;
+ Addr tH = set->host2;
+ set->guest2 = set->guest1;
+ set->host2 = set->host1;
+ set->guest1 = tG;
+ set->host1 = tH;
+ *host = tH;
+ return True;
+ }
+ if (LIKELY(set->guest3 == guest)) {
+ // hit at way 3; swap upwards
+ Addr tG = guest;
+ Addr tH = set->host3;
+ set->guest3 = set->guest2;
+ set->host3 = set->host2;
+ set->guest2 = tG;
+ set->host2 = tH;
+ *host = tH;
+ return True;
+ }
+ // Not found
+ *host = 0;
+ return False;
+}
+
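As a minimal standalone sketch of the replacement policy described above (not part of the patch itself; the DemoSet type and demo_* functions are invented for illustration and do not exist in Valgrind), the following model shows the same two operations: a hit at way N > 0 swaps ways N-1 and N, and an insert goes into way 0 with everything else shifted down one slot.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* One 4-way set, modelled on FastCacheSet but with array fields so the
   demo can loop over ways. */
typedef struct { uintptr_t guest[4], host[4]; } DemoSet;

/* Lookup with the same promotion rule as VG_(lookupInFastCache): a hit
   at way N > 0 swaps ways N-1 and N, moving the entry one step towards
   the MRU position (way 0). */
static bool demo_lookup ( DemoSet* s, uintptr_t guest, uintptr_t* host )
{
   for (int w = 0; w < 4; w++) {
      if (s->guest[w] != guest) continue;
      if (w > 0) {
         uintptr_t tg = s->guest[w-1], th = s->host[w-1];
         s->guest[w-1] = s->guest[w];  s->host[w-1] = s->host[w];
         s->guest[w]   = tg;           s->host[w]   = th;
         w--;                          /* hit entry now lives at w */
      }
      *host = s->host[w];
      return true;
   }
   *host = 0;
   return false;
}

/* Insert with the same rule as setFastCacheEntry: the new entry goes into
   way 0, everything else shifts down one, and the old way 3 is discarded. */
static void demo_insert ( DemoSet* s, uintptr_t guest, uintptr_t host )
{
   for (int w = 3; w > 0; w--) {
      s->guest[w] = s->guest[w-1];
      s->host[w]  = s->host[w-1];
   }
   s->guest[0] = guest;
   s->host[0]  = host;
}

int main ( void )
{
   DemoSet s = { .guest = {10, 20, 30, 40}, .host = {1, 2, 3, 4} };
   uintptr_t h;
   demo_lookup(&s, 30, &h);   /* hit at way 2: ways become 10,30,20,40 */
   demo_lookup(&s, 40, &h);   /* hit at way 3: ways become 10,30,40,20 */
   demo_insert(&s, 50, 5);    /* 20 (the LRU) falls out: 50,10,30,40 */
   for (int w = 0; w < 4; w++)
      printf("way%d: guest=%lu\n", w, (unsigned long)s.guest[w]);
   return 0;
}

Note how promoting only one step per hit leaves the common hit-at-way-0 case with no stores at all, which matches the behaviour described in the design comment.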
/* Initialises the TC, using VG_(clo_num_transtab_sectors)
and VG_(clo_avg_transtab_entry_size).
#ifndef __PUB_CORE_TRANSTAB_ASM_H
#define __PUB_CORE_TRANSTAB_ASM_H
-/* Constants for the fast translation lookup cache. It is a direct
- mapped cache, with 2^VG_TT_FAST_BITS entries.
+/* Constants for the fast translation lookup cache. It is a 4-way
+ set-associative cache with more-or-less LRU replacement. It contains
+ 2^VG_TT_FAST_BITS sets.
+
+ On all targets, the set number is computed from the least significant
+ 2 * VG_TT_FAST_BITS bits of the guest address. This is a bit unusual,
+ inasmuch as it is more normal to use just a VG_TT_FAST_BITS-sized slice
+ of the address as the set number. Using twice as many bits (the two
+ chunks are xor'd together) spreads entries out (reduces aliasing) and
+ significantly reduces the overall miss rate. The cost is two extra cycles
+ on the fast lookup path, to perform an extra shift and an xor.
+
+ For each set there are 4 ways: way0, way1, way2 and way3. way0 is intended
+ to be the MRU and way3 the LRU. Most lookups hit way0 and involve no
+ modification of the line. A hit at way1 causes way0 and way1 to be swapped.
+ A hit at way2 causes way1 and way2 to be swapped; that is, way2 is moved one
+ step closer to the front. But not all the way to the front. Similarly a
+ hit at way3 causes way2 and way3 to be swapped.
+
+ See VG_(lookupInFastCache) for a C implementation of this logic and
+ dispatch-*-*.S, label VG_(disp_cp_xindir), for the handcoded assembly
+ equivalents for each target. Note that VG_(lookupInFastCache) is used in C
+ land for some administrative lookups but isn't really performance critical.
+ The dispatch-*-*.S implementations are used to process all indirect branches
+ in the simulator and so *are* performance critical.
+
+ Updates to the cache are rare. These are performed by setFastCacheEntry.
+ New entries are put into way0 and all others are shifted down one slot, so
+ that the previous contents of way3 fall out of the cache.
On x86/amd64, the cache index is computed as
- 'address[VG_TT_FAST_BITS-1 : 0]'.
-
- On ppc32/ppc64/mips32/mips64/arm64, the bottom two bits of
- instruction addresses are zero, which means that function causes
- only 1/4 of the entries to ever be used. So instead the function
- is '(address >>u 2)[VG_TT_FAST_BITS-1 : 0]' on those targets.
-
- On ARM we shift by 1, since Thumb insns can be of size 2, hence to
- minimise collisions and maximise cache utilisation we need to take
- into account all but the least significant bit.
-
- On s390x the rightmost bit of an instruction address is zero.
- For best table utilization shift the address to the right by 1 bit. */
-
-#define VG_TT_FAST_BITS 15
-#define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS)
-#define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1)
-
-/* This macro isn't usable in asm land; nevertheless this seems
- like a good place to put it. */
-
-#if defined(VGA_x86) || defined(VGA_amd64)
-# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) ) & VG_TT_FAST_MASK)
-
-#elif defined(VGA_s390x) || defined(VGA_arm)
-# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) >> 1) & VG_TT_FAST_MASK)
-
-#elif defined(VGA_ppc32) || defined(VGA_ppc64be) || defined(VGA_ppc64le) \
- || defined(VGA_mips32) || defined(VGA_mips64) || defined(VGA_arm64)
-# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) >> 2) & VG_TT_FAST_MASK)
+ '(address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1 : 0]'.
+
+ On ppc32/ppc64/mips32/mips64/arm64, the bottom two bits of instruction
+ addresses are zero, which means the above function causes only 1/4 of the
+ sets to ever be used. So instead the function is
+ '(address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1+2 : 0+2]'.
+
+ On arm32, the minimum instruction size is 2, so we discard only the least
+ significant bit of the address, hence:
+ '(address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1+1 : 0+1]'.
+
+ On s390x the rightmost bit of an instruction address is zero, so the arm32
+ scheme is used. */
+
+#define VG_TT_FAST_BITS 13
+#define VG_TT_FAST_SETS (1 << VG_TT_FAST_BITS)
+#define VG_TT_FAST_MASK ((VG_TT_FAST_SETS) - 1)
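To make the aliasing argument above concrete, here is a small self-contained comparison (a sketch, not part of the patch; FAST_BITS, FAST_MASK, plain_slice and xor_folded are names local to the demo): two addresses that agree in their low VG_TT_FAST_BITS bits always collide under a plain slice, but are normally separated by the xor fold.

#include <stdio.h>
#include <stdint.h>

#define FAST_BITS 13                        /* same value as VG_TT_FAST_BITS above */
#define FAST_MASK ((1u << FAST_BITS) - 1)

static uint64_t plain_slice ( uint64_t a ) { return a & FAST_MASK; }
static uint64_t xor_folded  ( uint64_t a ) { return (a ^ (a >> FAST_BITS)) & FAST_MASK; }

int main ( void )
{
   /* 'b' differs from 'a' only at bit 13 and above, so the two share
      their low 13 bits. */
   uint64_t a = 0x10002468ULL;
   uint64_t b = a + (1ULL << FAST_BITS);
   printf("plain: %#llx vs %#llx\n",
          (unsigned long long)plain_slice(a), (unsigned long long)plain_slice(b));
   printf("fold : %#llx vs %#llx\n",
          (unsigned long long)xor_folded(a), (unsigned long long)xor_folded(b));
   /* prints:  plain: 0x468 vs 0x468   (collision)
               fold : 0x469 vs 0x46a   (different sets) */
   return 0;
}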
+
+// Log2(sizeof(FastCacheSet)). This is needed in the handwritten assembly.
+
+#if defined(VGA_amd64) || defined(VGA_arm64) \
+ || defined(VGA_ppc64be) || defined(VGA_ppc64le) || defined(VGA_mips64) \
+ || defined(VGA_s390x)
+ // And all other 64-bit hosts
+# define VG_FAST_CACHE_SET_BITS 6
+ // These FCS_{g,h}{0,1,2,3} are the values of
+ // offsetof(FastCacheSet,{guest,host}{0,1,2,3}).
+# define FCS_g0 0
+# define FCS_h0 8
+# define FCS_g1 16
+# define FCS_h1 24
+# define FCS_g2 32
+# define FCS_h2 40
+# define FCS_g3 48
+# define FCS_h3 56
+
+#elif defined(VGA_x86) || defined(VGA_arm) || defined(VGA_ppc32) \
+ || defined(VGA_mips32)
+ // And all other 32-bit hosts
+# define VG_FAST_CACHE_SET_BITS 5
+# define FCS_g0 0
+# define FCS_h0 4
+# define FCS_g1 8
+# define FCS_h1 12
+# define FCS_g2 16
+# define FCS_h2 20
+# define FCS_g3 24
+# define FCS_h3 28
#else
-# error "VG_TT_FAST_HASH: unknown platform"
+# error "VG_FAST_CACHE_SET_BITS not known"
#endif
#endif // __PUB_CORE_TRANSTAB_ASM_H
#define VG_IS_8_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x7)))
#define VG_IS_16_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0xf)))
#define VG_IS_32_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x1f)))
+#define VG_IS_64_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x3f)))
#define VG_IS_WORD_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)(sizeof(Addr)-1))))
#define VG_IS_PAGE_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)(VKI_PAGE_SIZE-1))))