From: Julian Seward Date: Fri, 25 Jan 2019 08:14:56 +0000 (+0100) Subject: Bug 402781 - Redo the cache used to process indirect branch targets. X-Git-Tag: VALGRIND_3_15_0~96 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=50bb127b1df8d31812141aafa567d325d1fbc1b3;p=thirdparty%2Fvalgrind.git Bug 402781 - Redo the cache used to process indirect branch targets. [This commit contains an implementation for all targets except amd64-solaris and x86-solaris, which will be completed shortly.] In the baseline simulator, jumps to guest code addresses that are not known at JIT time have to be looked up in a guest->host mapping table. That means: indirect branches, indirect calls and most commonly, returns. Since there are huge numbers of these (often 10+ million/second) the mapping mechanism needs to be extremely cheap. Currently, this is implemented using a direct-mapped cache, VG_(tt_fast), with 2^15 (guest_addr, host_addr) pairs. This is queried in handwritten assembly in VG_(disp_cp_xindir) in dispatch--.S. If there is a miss in the cache then we fall back out to C land, and do a slow lookup using VG_(search_transtab). Given that the size of the translation table(s) in recent years has expanded significantly in order to keep pace with increasing application sizes, two bad things have happened: (1) the cost of a miss in the fast cache has risen significantly, and (2) the miss rate on the fast cache has also increased significantly. This means that large (~ one-million-basic-blocks-JITted) applications that run for a long time end up spending a lot of time in VG_(search_transtab). The proposed fix is to increase associativity of the fast cache, from 1 (direct mapped) to 4. Simulations of various cache configurations using indirect-branch traces from a large application show that is the best of various configurations. In an extreme case with 5.7 billion indirect branches: * The increase of associativity from 1 way to 4 way, whilst keeping the overall cache size the same (32k guest/host pairs), reduces the miss rate by around a factor of 3, from 4.02% to 1.30%. * The use of a slightly better hash function than merely slicing off the bottom 15 bits of the address, reduces the miss rate further, from 1.30% to 0.53%. Overall the VG_(tt_fast) miss rate is almost unchanged on small workloads, but reduced by a factor of up to almost 8 on large workloads. By implementing each (4-entry) cache set using a move-to-front scheme in the case of hits in ways 1, 2 or 3, the vast majority of hits can be made to happen in way 0. Hence the cost of having this extra associativity is almost zero in the case of a hit. The improved hash function costs an extra 2 ALU shots (a shift and an xor) but overall this seems performance neutral to a win. --- diff --git a/coregrind/m_dispatch/dispatch-amd64-darwin.S b/coregrind/m_dispatch/dispatch-amd64-darwin.S index d5603065ee..ccf2b91696 100644 --- a/coregrind/m_dispatch/dispatch-amd64-darwin.S +++ b/coregrind/m_dispatch/dispatch-amd64-darwin.S @@ -201,33 +201,98 @@ VG_(disp_cp_chain_me_to_fastEP): jmp postamble /* ------ Indirect but boring jump ------ */ -.globl VG_(disp_cp_xindir) +.global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + /* Where are we going? 
*/ + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - movabsq $VG_(stats__n_xindirs_32), %r10 - addl $1, (%r10) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + movabsq $VG_(stats__n_xIndirs_32), %r8 + addl $1, (%r8) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 5 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + // %r8 (scratch address) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits1_32), %r8 + addl $1, (%r8) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits2_32), %r8 + addl $1, (%r8) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + movabsq $VG_(stats__n_xIndir_hits3_32), %r8 + addl $1, (%r8) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - movabsq $VG_(stats__n_xindir_misses_32), %r10 - addl $1, (%r10) + movabsq $VG_(stats__n_xIndir_misses_32), %r8 + addl $1, (%r8) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-amd64-linux.S b/coregrind/m_dispatch/dispatch-amd64-linux.S index 62717d31d7..007c495f7c 100644 --- a/coregrind/m_dispatch/dispatch-amd64-linux.S +++ b/coregrind/m_dispatch/dispatch-amd64-linux.S @@ -205,28 +205,89 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movq OFFSET_amd64_RIP(%rbp), %rax + movq OFFSET_amd64_RIP(%rbp), %rax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %rbp (guest state ptr), %rax (guest address to go to). + // We use 4 temporaries: + // %r9 (to point at the relevant FastCacheSet), + // %r10, %r11 and %r12 (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r9 = VG_TT_FAST_HASH(guest) + movq %rax, %r9 // guest + shrq $VG_TT_FAST_BITS, %r9 // (guest >> VG_TT_FAST_BITS) + xorq %rax, %r9 // (guest >> VG_TT_FAST_BITS) ^ guest + andq $VG_TT_FAST_MASK, %r9 // setNo + + // Compute %r9 = &VG_(tt_fast)[%r9] + shlq $VG_FAST_CACHE_SET_BITS, %r9 // setNo * sizeof(FastCacheSet) + movabsq $VG_(tt_fast), %r10 // &VG_(tt_fast)[0] + leaq (%r10, %r9), %r9 // &VG_(tt_fast)[setNo] + + // LIVE: %rbp (guest state ptr), %rax (guest addr), %r9 (cache set) + // try way 0 + cmpq %rax, FCS_g0(%r9) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%r9) // goto .host0 + ud2 + +1: // try way 1 + cmpq %rax, FCS_g1(%r9) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movq FCS_g0(%r9), %r10 // r10 = old .guest0 + movq FCS_h0(%r9), %r11 // r11 = old .host0 + movq FCS_h1(%r9), %r12 // r12 = old .host1 + movq %rax, FCS_g0(%r9) // new .guest0 = guest + movq %r12, FCS_h0(%r9) // new .host0 = old .host1 + movq %r10, FCS_g1(%r9) // new .guest1 = old .guest0 + movq %r11, FCS_h1(%r9) // new .host1 = old .host0 + jmp *%r12 // goto old .host1 a.k.a. 
new .host0 + ud2 + +2: // try way 2 + cmpq %rax, FCS_g2(%r9) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movq FCS_g1(%r9), %r10 + movq FCS_h1(%r9), %r11 + movq FCS_h2(%r9), %r12 + movq %rax, FCS_g1(%r9) + movq %r12, FCS_h1(%r9) + movq %r10, FCS_g2(%r9) + movq %r11, FCS_h2(%r9) + jmp *%r12 + ud2 + +3: // try way 3 + cmpq %rax, FCS_g3(%r9) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movq FCS_g2(%r9), %r10 + movq FCS_h2(%r9), %r11 + movq FCS_h3(%r9), %r12 + movq %rax, FCS_g2(%r9) + movq %r12, FCS_h2(%r9) + movq %r10, FCS_g3(%r9) + movq %r11, FCS_h3(%r9) + jmp *%r12 + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movq $VG_TRC_INNER_FASTMISS, %rax movq $0, %rdx diff --git a/coregrind/m_dispatch/dispatch-arm-linux.S b/coregrind/m_dispatch/dispatch-arm-linux.S index 3731c2ebd3..b61818c27d 100644 --- a/coregrind/m_dispatch/dispatch-arm-linux.S +++ b/coregrind/m_dispatch/dispatch-arm-linux.S @@ -154,36 +154,114 @@ VG_(disp_cp_xindir): ldr r0, [r8, #OFFSET_arm_R15T] /* stats only */ - movw r1, #:lower16:vgPlain_stats__n_xindirs_32 - movt r1, #:upper16:vgPlain_stats__n_xindirs_32 - ldr r2, [r1, #0] - add r2, r2, #1 - str r2, [r1, #0] + movw r4, #:lower16:VG_(stats__n_xIndirs_32) + movt r4, #:upper16:VG_(stats__n_xIndirs_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + + // LIVE: r8 (guest state ptr), r0 (guest address to go to). + // We use 6 temporaries: + // r6 (to point at the relevant FastCacheSet), + // r1, r2, r3 (scratch, for swapping entries within a set) + // r4, r5 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r6 = VG_TT_FAST_HASH(guest) + lsr r6, r0, #1 // g1 = guest >> 1 + eor r6, r6, r6, LSR #VG_TT_FAST_BITS // (g1 >> VG_TT_FAST_BITS) ^ g1 + ubfx r6, r6, #0, #VG_TT_FAST_BITS // setNo - /* try a fast lookup in the translation cache */ - // r0 = next guest, r1,r2,r3,r4 scratch - movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK + // Compute r6 = &VG_(tt_fast)[r6] movw r4, #:lower16:VG_(tt_fast) - - and r2, r1, r0, LSR #1 // r2 = entry # - movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast) - - add r1, r4, r2, LSL #3 // r1 = &tt_fast[entry#] - - ldrd r4, r5, [r1, #0] // r4 = .guest, r5 = .host - - cmp r4, r0 - - // jump to host if lookup succeeded - bxeq r5 - - /* otherwise the fast lookup failed */ - /* RM ME -- stats only */ - movw r1, #:lower16:vgPlain_stats__n_xindir_misses_32 - movt r1, #:upper16:vgPlain_stats__n_xindir_misses_32 - ldr r2, [r1, #0] - add r2, r2, #1 - str r2, [r1, #0] + movt r4, #:upper16:VG_(tt_fast) + add r6, r4, r6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo] + + // LIVE: r8 (guest state ptr), r0 (guest addr), r6 (cache set) + // try way 0 + ldr r4, [r6, #FCS_g0] // .guest0 + ldr r5, [r6, #FCS_h0] // .host0 + cmp r4, r0 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + bx r5 + /*NOTREACHED*/ + +1: // try way 1 + ldr r4, [r6, #FCS_g1] + cmp r4, r0 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ldr r1, [r6, #FCS_g0] // r1 = old .guest0 + ldr r2, [r6, #FCS_h0] // r2 = old .host0 + ldr r3, [r6, #FCS_h1] // r3 = old .host1 + str r0, [r6, #FCS_g0] // new .guest0 = guest + str r3, [r6, #FCS_h0] // new .host0 = old .host1 + str r1, [r6, #FCS_g1] // new .guest1 = old .guest0 + str r2, [r6, #FCS_h1] // new .host1 = old .host0 + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits1_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits1_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host1 a.k.a. new .host0 + bx r3 + /*NOTREACHED*/ + +2: // try way 2 + ldr r4, [r6, #FCS_g2] + cmp r4, r0 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ldr r1, [r6, #FCS_g1] + ldr r2, [r6, #FCS_h1] + ldr r3, [r6, #FCS_h2] + str r0, [r6, #FCS_g1] + str r3, [r6, #FCS_h1] + str r1, [r6, #FCS_g2] + str r2, [r6, #FCS_h2] + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits2_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits2_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host2 a.k.a. new .host1 + bx r3 + /*NOTREACHED*/ + +3: // try way 3 + ldr r4, [r6, #FCS_g3] + cmp r4, r0 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ldr r1, [r6, #FCS_g2] + ldr r2, [r6, #FCS_h2] + ldr r3, [r6, #FCS_h3] + str r0, [r6, #FCS_g2] + str r3, [r6, #FCS_h2] + str r1, [r6, #FCS_g3] + str r2, [r6, #FCS_h3] + // stats only + movw r4, #:lower16:VG_(stats__n_xIndir_hits3_32) + movt r4, #:upper16:VG_(stats__n_xIndir_hits3_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] + // goto old .host3 a.k.a. 
new .host2 + bx r3 + /*NOTREACHED*/ + +4: // fast lookup failed + movw r4, #:lower16:VG_(stats__n_xIndir_misses_32) + movt r4, #:upper16:VG_(stats__n_xIndir_misses_32) + ldr r5, [r4, #0] + add r5, r5, #1 + str r5, [r4, #0] mov r1, #VG_TRC_INNER_FASTMISS mov r2, #0 diff --git a/coregrind/m_dispatch/dispatch-arm64-linux.S b/coregrind/m_dispatch/dispatch-arm64-linux.S index ee289faf80..554fa9b1fd 100644 --- a/coregrind/m_dispatch/dispatch-arm64-linux.S +++ b/coregrind/m_dispatch/dispatch-arm64-linux.S @@ -173,42 +173,118 @@ VG_(disp_cp_chain_me_to_fastEP): /* ------ Indirect but boring jump ------ */ .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? */ + // Where are we going? ldr x0, [x21, #OFFSET_arm64_PC] - /* stats only */ - adrp x1, VG_(stats__n_xindirs_32) - add x1, x1, :lo12:VG_(stats__n_xindirs_32) - ldr w2, [x1, #0] - add w2, w2, #1 - str w2, [x1, #0] - - /* try a fast lookup in the translation cache */ - // x0 = next guest, x1,x2,x3,x4 scratch - mov x1, #VG_TT_FAST_MASK // x1 = VG_TT_FAST_MASK - and x2, x1, x0, LSR #2 // x2 = entry # = (x1 & (x0 >> 2)) - + // stats only + adrp x4, VG_(stats__n_xIndirs_32) + add x4, x4, :lo12:VG_(stats__n_xIndirs_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + + // LIVE: x21 (guest state ptr), x0 (guest address to go to). + // We use 6 temporaries: + // x6 (to point at the relevant FastCacheSet), + // x1, x2, x3 (scratch, for swapping entries within a set) + // x4, x5 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute x6 = VG_TT_FAST_HASH(guest) + lsr x6, x0, #2 // g2 = guest >> 2 + eor x6, x6, x6, LSR #VG_TT_FAST_BITS // (g2 >> VG_TT_FAST_BITS) ^ g2 + mov x4, #VG_TT_FAST_MASK // VG_TT_FAST_MASK + and x6, x6, x4 // setNo + + // Compute x6 = &VG_(tt_fast)[x6] adrp x4, VG_(tt_fast) - add x4, x4, :lo12:VG_(tt_fast) // x4 = &VG_(tt_fast) - - add x1, x4, x2, LSL #4 // r1 = &tt_fast[entry#] + add x4, x4, :lo12:VG_(tt_fast) // &VG_(tt_fast)[0] + add x6, x4, x6, LSL #VG_FAST_CACHE_SET_BITS // &VG_(tt_fast)[setNo] + + // LIVE: x21 (guest state ptr), x0 (guest addr), x6 (cache set) + // try way 0 + ldp x4, x5, [x6, #FCS_g0] // x4 = .guest0, x5 = .host0 + cmp x4, x0 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + br x5 + /*NOTREACHED*/ - ldp x4, x5, [x1, #0] // x4 = .guest, x5 = .host +1: // try way 1 + ldr x4, [x6, #FCS_g1] + cmp x4, x0 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ldr x1, [x6, #FCS_g0] // x1 = old .guest0 + ldr x2, [x6, #FCS_h0] // x2 = old .host0 + ldr x3, [x6, #FCS_h1] // x3 = old .host1 + str x0, [x6, #FCS_g0] // new .guest0 = guest + str x3, [x6, #FCS_h0] // new .host0 = old .host1 + str x1, [x6, #FCS_g1] // new .guest1 = old .guest0 + str x2, [x6, #FCS_h1] // new .host1 = old .host0 + // stats only + adrp x4, VG_(stats__n_xIndir_hits1_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits1_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host1 a.k.a. 
new .host0 + br x3 + /*NOTREACHED*/ - cmp x4, x0 +2: // try way 2 + ldr x4, [x6, #FCS_g2] + cmp x4, x0 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ldr x1, [x6, #FCS_g1] + ldr x2, [x6, #FCS_h1] + ldr x3, [x6, #FCS_h2] + str x0, [x6, #FCS_g1] + str x3, [x6, #FCS_h1] + str x1, [x6, #FCS_g2] + str x2, [x6, #FCS_h2] + // stats only + adrp x4, VG_(stats__n_xIndir_hits2_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits2_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host2 a.k.a. new .host1 + br x3 + /*NOTREACHED*/ - // jump to host if lookup succeeded - bne fast_lookup_failed - br x5 +3: // try way 3 + ldr x4, [x6, #FCS_g3] + cmp x4, x0 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ldr x1, [x6, #FCS_g2] + ldr x2, [x6, #FCS_h2] + ldr x3, [x6, #FCS_h3] + str x0, [x6, #FCS_g2] + str x3, [x6, #FCS_h2] + str x1, [x6, #FCS_g3] + str x2, [x6, #FCS_h3] + // stats only + adrp x4, VG_(stats__n_xIndir_hits3_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_hits3_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] + // goto old .host3 a.k.a. new .host2 + br x3 /*NOTREACHED*/ -fast_lookup_failed: - /* RM ME -- stats only */ - adrp x1, VG_(stats__n_xindir_misses_32) - add x1, x1, :lo12:VG_(stats__n_xindir_misses_32) - ldr w2, [x1, #0] - add w2, w2, #1 - str w2, [x1, #0] +4: // fast lookup failed + adrp x4, VG_(stats__n_xIndir_misses_32) + add x4, x4, :lo12:VG_(stats__n_xIndir_misses_32) + ldr w5, [x4, #0] + add w5, w5, #1 + str w5, [x4, #0] mov x1, #VG_TRC_INNER_FASTMISS mov x2, #0 diff --git a/coregrind/m_dispatch/dispatch-mips32-linux.S b/coregrind/m_dispatch/dispatch-mips32-linux.S index 9918403d5e..fdb1e29b00 100644 --- a/coregrind/m_dispatch/dispatch-mips32-linux.S +++ b/coregrind/m_dispatch/dispatch-mips32-linux.S @@ -175,47 +175,116 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - lw $11, OFFSET_mips32_PC($23) - - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - - /* try a fast lookup in the translation cache */ - /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*) - = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */ - - move $14, $11 - li $12, VG_TT_FAST_MASK - srl $14, $14, 2 - and $14, $14, $12 - sll $14, $14, 3 - - /* t2 = (addr of VG_(tt_fast)) + t1 */ - la $13, VG_(tt_fast) - addu $13, $13, $14 - - lw $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */ - addiu $13, $13, 4 - lw $25, 0($13) /* little-endian, so comparing 1st 32bit word */ - nop - -check: - bne $12, $11, fast_lookup_failed - /* run the translation */ - jr $25 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* %PC is up to date */ - /* back out decrement of the dispatch counter */ - /* hold dispatch_ctr in t0 (r8) */ - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - li $2, VG_TRC_INNER_FASTMISS - li $3, 0 - b postamble + lw $10, OFFSET_mips32_PC($23) + + /* stats only */ + lw $15, VG_(stats__n_xIndirs_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndirs_32) + + // LIVE: r23 (guest state ptr), r10 (guest address to go to). + // We use 6 temporaries: + // r16 (to point at the relevant FastCacheSet), + // r11, r12, r13 (scratch, for swapping entries within a set) + // r14, r15 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute r16 = VG_TT_FAST_HASH(guest) + srl $16, $10, 2 // g2 = guest >> 2 + srl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2 + li $15, VG_TT_FAST_MASK + and $16, $16, $15 // setNo + + // Compute r16 = &VG_(tt_fast)[r16] + la $15, VG_(tt_fast) + sll $16, $16, VG_FAST_CACHE_SET_BITS + addu $16, $16, $15 + + // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set) + // try way 0 + lw $14, FCS_g0($16) // .guest0 + lw $15, FCS_h0($16) // .host0 + bne $14, $10, 1f // cmp against .guest0 + // hit at way 0 + // goto .host0 + jr $15 + /*NOTREACHED*/ + .long 0x0 + +1: // try way 1 + lw $14, FCS_g1($16) + bne $14, $10, 2f // cmp against .guest1 + // hit at way 1; swap upwards + lw $11, FCS_g0($16) // $11 = old .guest0 + lw $12, FCS_h0($16) // $12 = old .host0 + lw $13, FCS_h1($16) // $13 = old .host1 + sw $10, FCS_g0($16) // new .guest0 = guest + sw $13, FCS_h0($16) // new .host0 = old .host1 + sw $11, FCS_g1($16) // new .guest1 = old .guest0 + sw $12, FCS_h1($16) // new .host1 = old .host0 + // stats only + lw $15, VG_(stats__n_xIndir_hits1_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits1_32) + // goto old .host1 a.k.a. new .host0 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +2: // try way 2 + lw $14, FCS_g2($16) + bne $14, $10, 3f // cmp against .guest2 + // hit at way 2; swap upwards + lw $11, FCS_g1($16) + lw $12, FCS_h1($16) + lw $13, FCS_h2($16) + sw $10, FCS_g1($16) + sw $13, FCS_h1($16) + sw $11, FCS_g2($16) + sw $12, FCS_h2($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits2_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits2_32) + // goto old .host2 a.k.a. new .host1 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +3: // try way 3 + lw $14, FCS_g3($16) + bne $14, $10, 4f // cmp against .guest3 + // hit at way 3; swap upwards + lw $11, FCS_g2($16) + lw $12, FCS_h2($16) + lw $13, FCS_h3($16) + sw $10, FCS_g2($16) + sw $13, FCS_h2($16) + sw $11, FCS_g3($16) + sw $12, FCS_h3($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits3_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits3_32) + // goto old .host3 a.k.a. new .host2 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +4: // fast lookup failed: + /* stats only */ + lw $15, VG_(stats__n_xIndir_misses_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_misses_32) + + li $2, VG_TRC_INNER_FASTMISS + li $3, 0 + b postamble + /*NOTREACHED*/ + .long 0x0 /* ------ Assisted jump ------ */ .global VG_(disp_cp_xassisted) diff --git a/coregrind/m_dispatch/dispatch-mips64-linux.S b/coregrind/m_dispatch/dispatch-mips64-linux.S index 4a2b1b734e..5d1efd622d 100644 --- a/coregrind/m_dispatch/dispatch-mips64-linux.S +++ b/coregrind/m_dispatch/dispatch-mips64-linux.S @@ -182,47 +182,116 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? 
*/ - ld $11, OFFSET_mips64_PC($23) - - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - - /* try a fast lookup in the translation cache */ - /* t1 = VG_TT_FAST_HASH(addr) * sizeof(ULong*) - = (t8 >> 2 & VG_TT_FAST_MASK) << 3 */ - - move $14, $11 - li $12, VG_TT_FAST_MASK - srl $14, $14, 2 - and $14, $14, $12 - sll $14, $14, 3 - - /* t2 = (addr of VG_(tt_fast)) + t1 */ - dla $13, VG_(tt_fast) - daddu $13, $13, $14 - - ld $12, 0($13) /* t3 = VG_(tt_fast)[hash] :: ULong* */ - daddiu $13, $13, 8 - ld $25, 0($13) /* little-endian, so comparing 1st 32bit word */ - nop - -check: - bne $12, $11, fast_lookup_failed - /* run the translation */ - jr $25 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* %PC is up to date */ - /* back out decrement of the dispatch counter */ - /* hold dispatch_ctr in t0 (r8) */ - lw $13, vgPlain_stats__n_xindirs_32 - addiu $13, $13, 0x1 - sw $13, vgPlain_stats__n_xindirs_32 - li $2, VG_TRC_INNER_FASTMISS - li $3, 0 - b postamble + ld $10, OFFSET_mips64_PC($23) + + /* stats only */ + lw $15, VG_(stats__n_xIndirs_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndirs_32) + + // LIVE: r23 (guest state ptr), r10 (guest address to go to). + // We use 6 temporaries: + // r16 (to point at the relevant FastCacheSet), + // r11, r12, r13 (scratch, for swapping entries within a set) + // r14, r15 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r16 = VG_TT_FAST_HASH(guest) + dsrl $16, $10, 2 // g2 = guest >> 2 + dsrl $15, $10, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor $16, $16, $15 // (g2 >> VG_TT_FAST_BITS) ^ g2 + li $15, VG_TT_FAST_MASK + and $16, $16, $15 // setNo + + // Compute r16 = &VG_(tt_fast)[r16] + dla $15, VG_(tt_fast) + dsll $16, $16, VG_FAST_CACHE_SET_BITS + daddu $16, $16, $15 + + // LIVE: r23 (guest state ptr), r10 (guest addr), r16 (cache set) + // try way 0 + ld $14, FCS_g0($16) // .guest0 + ld $15, FCS_h0($16) // .host0 + bne $14, $10, 1f // cmp against .guest0 + // hit at way 0 + // goto .host0 + jr $15 + /*NOTREACHED*/ + .long 0x0 + +1: // try way 1 + ld $14, FCS_g1($16) + bne $14, $10, 2f // cmp against .guest1 + // hit at way 1; swap upwards + ld $11, FCS_g0($16) // $11 = old .guest0 + ld $12, FCS_h0($16) // $12 = old .host0 + ld $13, FCS_h1($16) // $13 = old .host1 + sd $10, FCS_g0($16) // new .guest0 = guest + sd $13, FCS_h0($16) // new .host0 = old .host1 + sd $11, FCS_g1($16) // new .guest1 = old .guest0 + sd $12, FCS_h1($16) // new .host1 = old .host0 + // stats only + lw $15, VG_(stats__n_xIndir_hits1_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits1_32) + // goto old .host1 a.k.a. new .host0 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +2: // try way 2 + ld $14, FCS_g2($16) + bne $14, $10, 3f // cmp against .guest2 + // hit at way 2; swap upwards + ld $11, FCS_g1($16) + ld $12, FCS_h1($16) + ld $13, FCS_h2($16) + sd $10, FCS_g1($16) + sd $13, FCS_h1($16) + sd $11, FCS_g2($16) + sd $12, FCS_h2($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits2_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits2_32) + // goto old .host2 a.k.a. 
new .host1 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +3: // try way 3 + ld $14, FCS_g3($16) + bne $14, $10, 4f // cmp against .guest3 + // hit at way 3; swap upwards + ld $11, FCS_g2($16) + ld $12, FCS_h2($16) + ld $13, FCS_h3($16) + sd $10, FCS_g2($16) + sd $13, FCS_h2($16) + sd $11, FCS_g3($16) + sd $12, FCS_h3($16) + // stats only + lw $15, VG_(stats__n_xIndir_hits3_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_hits3_32) + // goto old .host3 a.k.a. new .host2 + jr $13 + /*NOTREACHED*/ + .long 0x0 + +4: // fast lookup failed: + /* stats only */ + lw $15, VG_(stats__n_xIndir_misses_32) + addiu $15, $15, 1 + sw $15, VG_(stats__n_xIndir_misses_32) + + li $2, VG_TRC_INNER_FASTMISS + li $3, 0 + b postamble + /*NOTREACHED*/ + .long 0x0 /* ------ Assisted jump ------ */ .global VG_(disp_cp_xassisted) diff --git a/coregrind/m_dispatch/dispatch-ppc32-linux.S b/coregrind/m_dispatch/dispatch-ppc32-linux.S index 432306bf44..d3ff2d11e9 100644 --- a/coregrind/m_dispatch/dispatch-ppc32-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc32-linux.S @@ -437,44 +437,128 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - lwz 3,OFFSET_ppc32_CIA(31) + lwz 20, OFFSET_ppc32_CIA(31) /* stats only */ - lis 5,VG_(stats__n_xindirs_32)@ha - addi 5,5,VG_(stats__n_xindirs_32)@l - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) + lis 24, VG_(stats__n_xIndirs_32)@ha + addi 24, 24, VG_(stats__n_xIndirs_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srwi 26, 20, 2 // g2 = guest >> 2 + srwi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 26, 26, VG_TT_FAST_MASK // setNo - /* r5 = &VG_(tt_fast) */ - lis 5,VG_(tt_fast)@ha - addi 5,5,VG_(tt_fast)@l /* & VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 3 */ - rlwinm 4,3,1, 29-VG_TT_FAST_BITS, 28 /* entry# * 8 */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - lwz 6,0(5) /* .guest */ - lwz 7,4(5) /* .host */ - cmpw 3,6 - bne fast_lookup_failed - - /* Found a match. Jump to .host. 
*/ - mtctr 7 + // Compute r6 = &VG_(tt_fast)[r6] + lis 25, VG_(tt_fast)@ha + addi 25, 25, VG_(tt_fast)@l + slwi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + lwz 24, FCS_g0(26) // .guest0 + lwz 25, FCS_h0(26) // .host0 + cmpw 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr + /*NOTREACHED*/ + +1: // try way 1 + lwz 24, FCS_g1(26) + cmpw 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + lwz 21, FCS_g0(26) // 21 = old .guest0 + lwz 22, FCS_h0(26) // 22 = old .host0 + lwz 23, FCS_h1(26) // 23 = old .host1 + stw 20, FCS_g0(26) // new .guest0 = guest + stw 23, FCS_h0(26) // new .host0 = old .host1 + stw 21, FCS_g1(26) // new .guest1 = old .guest0 + stw 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + lis 24, VG_(stats__n_xIndir_hits1_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits1_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + lwz 24, FCS_g2(26) + cmpw 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + lwz 21, FCS_g1(26) + lwz 22, FCS_h1(26) + lwz 23, FCS_h2(26) + stw 20, FCS_g1(26) + stw 23, FCS_h1(26) + stw 21, FCS_g2(26) + stw 22, FCS_h2(26) + // stats only + lis 24, VG_(stats__n_xIndir_hits2_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits2_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + lwz 24, FCS_g3(26) + cmpw 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + lwz 21, FCS_g2(26) + lwz 22, FCS_h2(26) + lwz 23, FCS_h3(26) + stw 20, FCS_g2(26) + stw 23, FCS_h2(26) + stw 21, FCS_g3(26) + stw 22, FCS_h3(26) + // stats only + lis 24, VG_(stats__n_xIndir_hits3_32)@ha + addi 24, 24, VG_(stats__n_xIndir_hits3_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. 
new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - lis 5,VG_(stats__n_xindir_misses_32)@ha - addi 5,5,VG_(stats__n_xindir_misses_32)@l - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b postamble + lis 24, VG_(stats__n_xIndir_misses_32)@ha + addi 24, 24, VG_(stats__n_xIndir_misses_32)@l + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + li 6, VG_TRC_INNER_FASTMISS + li 7, 0 + b postamble /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-ppc64be-linux.S b/coregrind/m_dispatch/dispatch-ppc64be-linux.S index 91bd3b236d..c5592d4f31 100644 --- a/coregrind/m_dispatch/dispatch-ppc64be-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc64be-linux.S @@ -45,14 +45,27 @@ .type vgPlain_tt_fast, @object */ .section ".toc","aw" + .tocent__vgPlain_tt_fast: .tc vgPlain_tt_fast[TC],vgPlain_tt_fast -.tocent__vgPlain_stats__n_xindirs_32: - .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32 -.tocent__vgPlain_stats__n_xindir_misses_32: - .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32 + +.tocent__vgPlain_stats__n_xIndirs_32: + .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32 + +.tocent__vgPlain_stats__n_xIndir_hits1_32: + .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32 + +.tocent__vgPlain_stats__n_xIndir_hits2_32: + .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32 + +.tocent__vgPlain_stats__n_xIndir_hits3_32: + .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32 + +.tocent__vgPlain_stats__n_xIndir_misses_32: + .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32 + .tocent__vgPlain_machine_ppc64_has_VMX: - .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX + .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX /*------------------------------------------------------------*/ /*--- ---*/ @@ -454,42 +467,122 @@ VG_(disp_cp_xindir): .globl .VG_(disp_cp_xindir) .VG_(disp_cp_xindir): /* Where are we going? */ - ld 3,OFFSET_ppc64_CIA(31) + ld 20, OFFSET_ppc64_CIA(31) /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - /* r5 = &VG_(tt_fast) */ - ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */ - rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */ - sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - ld 6,0(5) /* .guest */ - ld 7,8(5) /* .host */ - cmpd 3,6 - bne .fast_lookup_failed - - /* Found a match. Jump to .host. */ - mtctr 7 + ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srdi 26, 20, 2 // g2 = guest >> 2 + srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 
26, 26, VG_TT_FAST_MASK // setNo + + // Compute r6 = &VG_(tt_fast)[r6] + ld 25, .tocent__vgPlain_tt_fast@toc(2) + sldi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + ld 24, FCS_g0(26) // .guest0 + ld 25, FCS_h0(26) // .host0 + cmpd 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr + /*NOTREACHED*/ + +1: // try way 1 + ld 24, FCS_g1(26) + cmpd 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ld 21, FCS_g0(26) // 21 = old .guest0 + ld 22, FCS_h0(26) // 22 = old .host0 + ld 23, FCS_h1(26) // 23 = old .host1 + std 20, FCS_g0(26) // new .guest0 = guest + std 23, FCS_h0(26) // new .host0 = old .host1 + std 21, FCS_g1(26) // new .guest1 = old .guest0 + std 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + ld 24, FCS_g2(26) + cmpd 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ld 21, FCS_g1(26) + ld 22, FCS_h1(26) + ld 23, FCS_h2(26) + std 20, FCS_g1(26) + std 23, FCS_h1(26) + std 21, FCS_g2(26) + std 22, FCS_h2(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + ld 24, FCS_g3(26) + cmpd 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ld 21, FCS_g2(26) + ld 22, FCS_h2(26) + ld 23, FCS_h3(26) + std 20, FCS_g2(26) + std 23, FCS_h2(26) + std 21, FCS_g3(26) + std 22, FCS_h3(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. 
new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -.fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b .postamble + ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2) + lwz 25, 0(24) + addi 25 ,25, 1 + stw 25 ,0(24) + + li 6,VG_TRC_INNER_FASTMISS + li 7,0 + b .postamble /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-ppc64le-linux.S b/coregrind/m_dispatch/dispatch-ppc64le-linux.S index 21e43584d8..3e26d7715c 100644 --- a/coregrind/m_dispatch/dispatch-ppc64le-linux.S +++ b/coregrind/m_dispatch/dispatch-ppc64le-linux.S @@ -54,14 +54,27 @@ .type vgPlain_tt_fast, @object */ .section ".toc","aw" + .tocent__vgPlain_tt_fast: .tc vgPlain_tt_fast[TC],vgPlain_tt_fast -.tocent__vgPlain_stats__n_xindirs_32: - .tc vgPlain_stats__n_xindirs_32[TC],vgPlain_stats__n_xindirs_32 -.tocent__vgPlain_stats__n_xindir_misses_32: - .tc vgPlain_stats__n_xindir_misses_32[TC],vgPlain_stats__n_xindir_misses_32 + +.tocent__vgPlain_stats__n_xIndirs_32: + .tc vgPlain_stats__n_xIndirs_32[TC], vgPlain_stats__n_xIndirs_32 + +.tocent__vgPlain_stats__n_xIndir_hits1_32: + .tc vgPlain_stats__n_xIndir_hits1_32[TC], vgPlain_stats__n_xIndir_hits1_32 + +.tocent__vgPlain_stats__n_xIndir_hits2_32: + .tc vgPlain_stats__n_xIndir_hits2_32[TC], vgPlain_stats__n_xIndir_hits2_32 + +.tocent__vgPlain_stats__n_xIndir_hits3_32: + .tc vgPlain_stats__n_xIndir_hits3_32[TC], vgPlain_stats__n_xIndir_hits3_32 + +.tocent__vgPlain_stats__n_xIndir_misses_32: + .tc vgPlain_stats__n_xIndir_misses_32[TC], vgPlain_stats__n_xIndir_misses_32 + .tocent__vgPlain_machine_ppc64_has_VMX: - .tc vgPlain_machine_ppc64_has_VMX[TC],vgPlain_machine_ppc64_has_VMX + .tc vgPlain_machine_ppc64_has_VMX[TC], vgPlain_machine_ppc64_has_VMX /*------------------------------------------------------------*/ /*--- ---*/ @@ -518,47 +531,127 @@ VG_(disp_cp_xindir): addi 2,2,.TOC.-0b@l .localentry VG_(disp_cp_xindir), .-VG_(disp_cp_xindir) #endif - /* Where are we going? */ - ld 3,OFFSET_ppc64_CIA(31) + /* Where are we going? */ + ld 20, OFFSET_ppc64_CIA(31) /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindirs_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - /* r5 = &VG_(tt_fast) */ - ld 5, .tocent__vgPlain_tt_fast@toc(2) /* &VG_(tt_fast) */ - - /* try a fast lookup in the translation cache */ - /* r4 = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - = ((r3 >>u 2) & VG_TT_FAST_MASK) << 4 */ - rldicl 4,3, 62, 64-VG_TT_FAST_BITS /* entry# */ - sldi 4,4,4 /* entry# * sizeof(FastCacheEntry) */ - add 5,5,4 /* & VG_(tt_fast)[entry#] */ - ld 6,0(5) /* .guest */ - ld 7,8(5) /* .host */ - cmpd 3,6 - bne .fast_lookup_failed - - /* Found a match. Jump to .host. */ - mtctr 7 + ld 24, .tocent__vgPlain_stats__n_xIndirs_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + + // LIVE: r31 (guest state ptr), r20 (guest address to go to). + // We use 6 temporaries: + // r26 (to point at the relevant FastCacheSet), + // r21, r22, r23 (scratch, for swapping entries within a set) + // r24, r25 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute r26 = VG_TT_FAST_HASH(guest) + srdi 26, 20, 2 // g2 = guest >> 2 + srdi 25, 20, (VG_TT_FAST_BITS + 2) // (g2 >> VG_TT_FAST_BITS) + xor 26, 26, 25 // (g2 >> VG_TT_FAST_BITS) ^ g2 + andi. 
26, 26, VG_TT_FAST_MASK // setNo + + // Compute r6 = &VG_(tt_fast)[r6] + ld 25, .tocent__vgPlain_tt_fast@toc(2) + sldi 26, 26, VG_FAST_CACHE_SET_BITS + add 26, 26, 25 + + // LIVE: r31 (guest state ptr), r20 (guest addr), r26 (cache set) + // try way 0 + ld 24, FCS_g0(26) // .guest0 + ld 25, FCS_h0(26) // .host0 + cmpd 24, 20 // cmp against .guest0 + bne 1f + // hit at way 0 + // goto .host0 + mtctr 25 bctr -#if _CALL_ELF == 2 - .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir) -#endif + /*NOTREACHED*/ + +1: // try way 1 + ld 24, FCS_g1(26) + cmpd 24, 20 // cmp against .guest1 + bne 2f + // hit at way 1; swap upwards + ld 21, FCS_g0(26) // 21 = old .guest0 + ld 22, FCS_h0(26) // 22 = old .host0 + ld 23, FCS_h1(26) // 23 = old .host1 + std 20, FCS_g0(26) // new .guest0 = guest + std 23, FCS_h0(26) // new .host0 = old .host1 + std 21, FCS_g1(26) // new .guest1 = old .guest0 + std 22, FCS_h1(26) // new .host1 = old .host0 + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits1_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host1 a.k.a. new .host0 + mtctr 23 + bctr + /*NOTREACHED*/ + +2: // try way 2 + ld 24, FCS_g2(26) + cmpd 24, 20 // cmp against .guest2 + bne 3f + // hit at way 2; swap upwards + ld 21, FCS_g1(26) + ld 22, FCS_h1(26) + ld 23, FCS_h2(26) + std 20, FCS_g1(26) + std 23, FCS_h1(26) + std 21, FCS_g2(26) + std 22, FCS_h2(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits2_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host2 a.k.a. new .host1 + mtctr 23 + bctr + /*NOTREACHED*/ + +3: // try way 3 + ld 24, FCS_g3(26) + cmpd 24, 20 // cmp against .guest3 + bne 4f + // hit at way 3; swap upwards + ld 21, FCS_g2(26) + ld 22, FCS_h2(26) + ld 23, FCS_h3(26) + std 20, FCS_g2(26) + std 23, FCS_h2(26) + std 21, FCS_g3(26) + std 22, FCS_h3(26) + // stats only + ld 24, .tocent__vgPlain_stats__n_xIndir_hits3_32@toc(2) + lwz 25, 0(24) + addi 25, 25, 1 + stw 25, 0(24) + // goto old .host3 a.k.a. new .host2 + mtctr 23 + bctr + /*NOTREACHED*/ -.fast_lookup_failed: +4: // fast lookup failed: /* stats only */ - ld 5, .tocent__vgPlain_stats__n_xindir_misses_32@toc(2) - lwz 6,0(5) - addi 6,6,1 - stw 6,0(5) - - li 6,VG_TRC_INNER_FASTMISS - li 7,0 - b .postamble + ld 24, .tocent__vgPlain_stats__n_xIndir_misses_32@toc(2) + lwz 25, 0(24) + addi 25 ,25, 1 + stw 25 ,0(24) + + li 6,VG_TRC_INNER_FASTMISS + li 7,0 + b .postamble /*NOTREACHED*/ +#if _CALL_ELF == 2 + .size VG_(disp_cp_xindir),.-VG_(disp_cp_xindir) +#endif /* ------ Assisted jump ------ */ .section ".text" diff --git a/coregrind/m_dispatch/dispatch-s390x-linux.S b/coregrind/m_dispatch/dispatch-s390x-linux.S index 83c2e2a1da..c31e32a218 100644 --- a/coregrind/m_dispatch/dispatch-s390x-linux.S +++ b/coregrind/m_dispatch/dispatch-s390x-linux.S @@ -197,54 +197,121 @@ VG_(disp_cp_chain_me_to_fastEP): /* ------ Indirect but boring jump ------ */ .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): - /* Where are we going? 
*/ - lg %r2, OFFSET_s390x_IA(%r13) - - /* Increment VG_(stats__n_xindirs_32) */ - larl %r8, VG_(stats__n_xindirs_32) - l %r10,0(%r8) - ahi %r10,1 - st %r10,0(%r8) - - /* Try a fast lookup in the translation cache: - Compute offset (not index) into VT_(tt_fast): - - offset = VG_TT_FAST_HASH(addr) * sizeof(FastCacheEntry) - - with VG_TT_FAST_HASH(addr) == (addr >> 1) & VG_TT_FAST_MASK - and sizeof(FastCacheEntry) == 16 - - offset = ((addr >> 1) & VG_TT_FAST_MASK) << 4 - which is - offset = ((addr & (VG_TT_FAST_MASK << 1) ) << 3 - */ - larl %r8, VG_(tt_fast) - llill %r5,(VG_TT_FAST_MASK << 1) & 0xffff -#if ((( VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 != 0) - iilh %r5,((VG_TT_FAST_MASK << 1) & 0xffff0000) >> 16 -#endif - ngr %r5,%r2 - sllg %r7,%r5,3 - lg %r11, 8(%r8,%r7) /* .host */ - cg %r2, 0(%r8,%r7) /* next guest address == .guest ? */ - jne fast_lookup_failed - - /* Found a match. Call .host. - r11 is an address. There we will find the instrumented client code. - That code may modify the guest state register r13. */ - br %r11 - .long 0x0 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: - /* Increment VG_(stats__n_xindir_misses_32) */ - larl %r8, VG_(stats__n_xindir_misses_32) - l %r10,0(%r8) - ahi %r10,1 - st %r10,0(%r8) - - lghi %r0,VG_TRC_INNER_FASTMISS - lghi %r1,0 + /* Where are we going? */ + lg %r6, OFFSET_s390x_IA(%r13) // "guest" + + /* stats only */ + larl %r11, VG_(stats__n_xIndirs_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + + // LIVE: r13 (guest state ptr), r6 (guest address to go to). + // We use 6 temporaries: + // r7 (to point at the relevant FastCacheSet), + // r8, r9, r10 (scratch, for swapping entries within a set) + // r11, r12 (other scratch) + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). */ + + // Compute %r7 = VG_TT_FAST_HASH(guest) + srlg %r7, %r6, 1 // g1 = guest >> 1 + srlg %r8, %r6, (VG_TT_FAST_BITS + 1) // (g1 >> VG_TT_FAST_BITS) + xgr %r7, %r8 // (g1 >> VG_TT_FAST_BITS) ^ g1 + llill %r8, VG_TT_FAST_MASK & 0xffff +# if ((VG_TT_FAST_MASK & 0xffff0000) >> 16 != 0) + iilh %r8, (VG_TT_FAST_MASK & 0xffff0000) >> 16 +# endif + ngr %r7, %r8 // setNo + + // Compute %r7 = &VG_(tt_fast)[%r7] + sllg %r7,%r7, VG_FAST_CACHE_SET_BITS // setNo * sizeof(FastCacheSet) + larl %r8, VG_(tt_fast) // &VG_(tt_fast)[0] + agr %r7, %r8 // &VG_(tt_fast)[setNo] + + // LIVE: %r13 (guest state ptr), %r6 (guest addr), %r7 (cache set) + // try way 0 + cg %r6, FCS_g0(%r7) // cmp against .guest0 + lg %r8, FCS_h0(%r7) + jne 1f + // hit at way 0 + // goto .host0 + br %r8 + /*NOTREACHED*/ + .long 0 + +1: // try way 1 + cg %r6, FCS_g1(%r7) // cmp against .guest1 + jne 2f + // hit at way 1; swap upwards + lg %r8, FCS_g0(%r7) // r8 = old .guest0 + lg %r9, FCS_h0(%r7) // r9 = old .host0 + lg %r10, FCS_h1(%r7) // r10 = old .host1 + stg %r6, FCS_g0(%r7) // new .guest0 = guest + stg %r10, FCS_h0(%r7) // new .host0 = old .host1 + stg %r8, FCS_g1(%r7) // new .guest1 = old .guest0 + stg %r9, FCS_h1(%r7) // new .host1 = old .host0 + // stats only + larl %r11, VG_(stats__n_xIndir_hits1_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host1 a.k.a. 
new .host0 + br %r10 + /*NOTREACHED*/ + .long 0 + +2: // try way 2 + cg %r6, FCS_g2(%r7) // cmp against .guest2 + jne 3f + lg %r8, FCS_g1(%r7) + lg %r9, FCS_h1(%r7) + lg %r10, FCS_h2(%r7) + stg %r6, FCS_g1(%r7) + stg %r10, FCS_h1(%r7) + stg %r8, FCS_g2(%r7) + stg %r9, FCS_h2(%r7) + // stats only + larl %r11, VG_(stats__n_xIndir_hits2_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host2 a.k.a. new .host1 + br %r10 + /*NOTREACHED*/ + .long 0 + +3: // try way 3 + cg %r6, FCS_g3(%r7) // cmp against .guest3 + jne 4f + // hit at way 3; swap upwards + lg %r8, FCS_g2(%r7) + lg %r9, FCS_h2(%r7) + lg %r10, FCS_h3(%r7) + stg %r6, FCS_g2(%r7) + stg %r10, FCS_h2(%r7) + stg %r8, FCS_g3(%r7) + stg %r9, FCS_h3(%r7) + // stats only + larl %r11, VG_(stats__n_xIndir_hits3_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + // goto old .host3 a.k.a. new .host2 + br %r10 + .long 0 + +4: // fast lookup failed + larl %r11, VG_(stats__n_xIndir_misses_32) + l %r12, 0(%r11) + ahi %r12, 1 + st %r12, 0(%r11) + + lghi %r0, VG_TRC_INNER_FASTMISS + lghi %r1, 0 j postamble + /*NOTREACHED*/ /* ------ Assisted jump ------ */ diff --git a/coregrind/m_dispatch/dispatch-x86-darwin.S b/coregrind/m_dispatch/dispatch-x86-darwin.S index 55188e9c58..467d7d62de 100644 --- a/coregrind/m_dispatch/dispatch-x86-darwin.S +++ b/coregrind/m_dispatch/dispatch-x86-darwin.S @@ -194,29 +194,91 @@ VG_(disp_cp_chain_me_to_fastEP): jmp postamble /* ------ Indirect but boring jump ------ */ -.globl VG_(disp_cp_xindir) +.global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movl OFFSET_x86_EIP(%ebp), %eax + movl OFFSET_x86_EIP(%ebp), %eax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %ebp (guest state ptr), %eax (guest address to go to). + // We use 4 temporaries: + // %esi (to point at the relevant FastCacheSet), + // %ebx, %ecx and %edx (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute %esi = VG_TT_FAST_HASH(guest) + movl %eax, %esi // guest + shrl $VG_TT_FAST_BITS, %esi // (guest >> VG_TT_FAST_BITS) + xorl %eax, %esi // (guest >> VG_TT_FAST_BITS) ^ guest + andl $VG_TT_FAST_MASK, %esi // setNo + + // Compute %esi = &VG_(tt_fast)[%esi] + shll $VG_FAST_CACHE_SET_BITS, %esi // setNo * sizeof(FastCacheSet) + leal VG_(tt_fast)(%esi), %esi // &VG_(tt_fast)[setNo] + + // LIVE: %ebp (guest state ptr), %eax (guest addr), %esi (cache set) + // try way 0 + cmpl %eax, FCS_g0(%esi) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%esi) // goto .host0 + ud2 + +1: // try way 1 + cmpl %eax, FCS_g1(%esi) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movl FCS_g0(%esi), %ebx // ebx = old .guest0 + movl FCS_h0(%esi), %ecx // ecx = old .host0 + movl FCS_h1(%esi), %edx // edx = old .host1 + movl %eax, FCS_g0(%esi) // new .guest0 = guest + movl %edx, FCS_h0(%esi) // new .host0 = old .host1 + movl %ebx, FCS_g1(%esi) // new .guest1 = old .guest0 + movl %ecx, FCS_h1(%esi) // new .host1 = old .host0 + jmp *%edx // goto old .host1 a.k.a. new .host0 + ud2 + +2: // try way 2 + cmpl %eax, FCS_g2(%esi) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movl FCS_g1(%esi), %ebx + movl FCS_h1(%esi), %ecx + movl FCS_h2(%esi), %edx + movl %eax, FCS_g1(%esi) + movl %edx, FCS_h1(%esi) + movl %ebx, FCS_g2(%esi) + movl %ecx, FCS_h2(%esi) + jmp *%edx + ud2 + +3: // try way 3 + cmpl %eax, FCS_g3(%esi) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movl FCS_g2(%esi), %ebx + movl FCS_h2(%esi), %ecx + movl FCS_h3(%esi), %edx + movl %eax, FCS_g2(%esi) + movl %edx, FCS_h2(%esi) + movl %ebx, FCS_g3(%esi) + movl %ecx, FCS_h3(%esi) + jmp *%edx + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movl $VG_TRC_INNER_FASTMISS, %eax movl $0, %edx diff --git a/coregrind/m_dispatch/dispatch-x86-linux.S b/coregrind/m_dispatch/dispatch-x86-linux.S index d949f1fd3e..7270744db0 100644 --- a/coregrind/m_dispatch/dispatch-x86-linux.S +++ b/coregrind/m_dispatch/dispatch-x86-linux.S @@ -198,26 +198,88 @@ VG_(disp_cp_chain_me_to_fastEP): .global VG_(disp_cp_xindir) VG_(disp_cp_xindir): /* Where are we going? */ - movl OFFSET_x86_EIP(%ebp), %eax + movl OFFSET_x86_EIP(%ebp), %eax // "guest" /* stats only */ - addl $1, VG_(stats__n_xindirs_32) - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - -fast_lookup_failed: + addl $1, VG_(stats__n_xIndirs_32) + + // LIVE: %ebp (guest state ptr), %eax (guest address to go to). + // We use 4 temporaries: + // %esi (to point at the relevant FastCacheSet), + // %ebx, %ecx and %edx (scratch). + + /* Try a fast lookup in the translation cache. This is pretty much + a handcoded version of VG_(lookupInFastCache). 
*/ + + // Compute %esi = VG_TT_FAST_HASH(guest) + movl %eax, %esi // guest + shrl $VG_TT_FAST_BITS, %esi // (guest >> VG_TT_FAST_BITS) + xorl %eax, %esi // (guest >> VG_TT_FAST_BITS) ^ guest + andl $VG_TT_FAST_MASK, %esi // setNo + + // Compute %esi = &VG_(tt_fast)[%esi] + shll $VG_FAST_CACHE_SET_BITS, %esi // setNo * sizeof(FastCacheSet) + leal VG_(tt_fast)(%esi), %esi // &VG_(tt_fast)[setNo] + + // LIVE: %ebp (guest state ptr), %eax (guest addr), %esi (cache set) + // try way 0 + cmpl %eax, FCS_g0(%esi) // cmp against .guest0 + jnz 1f + // hit at way 0 + jmp *FCS_h0(%esi) // goto .host0 + ud2 + +1: // try way 1 + cmpl %eax, FCS_g1(%esi) // cmp against .guest1 + jnz 2f + // hit at way 1; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits1_32) + movl FCS_g0(%esi), %ebx // ebx = old .guest0 + movl FCS_h0(%esi), %ecx // ecx = old .host0 + movl FCS_h1(%esi), %edx // edx = old .host1 + movl %eax, FCS_g0(%esi) // new .guest0 = guest + movl %edx, FCS_h0(%esi) // new .host0 = old .host1 + movl %ebx, FCS_g1(%esi) // new .guest1 = old .guest0 + movl %ecx, FCS_h1(%esi) // new .host1 = old .host0 + jmp *%edx // goto old .host1 a.k.a. new .host0 + ud2 + +2: // try way 2 + cmpl %eax, FCS_g2(%esi) // cmp against .guest2 + jnz 3f + // hit at way 2; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits2_32) + movl FCS_g1(%esi), %ebx + movl FCS_h1(%esi), %ecx + movl FCS_h2(%esi), %edx + movl %eax, FCS_g1(%esi) + movl %edx, FCS_h1(%esi) + movl %ebx, FCS_g2(%esi) + movl %ecx, FCS_h2(%esi) + jmp *%edx + ud2 + +3: // try way 3 + cmpl %eax, FCS_g3(%esi) // cmp against .guest3 + jnz 4f + // hit at way 3; swap upwards + /* stats only */ + addl $1, VG_(stats__n_xIndir_hits3_32) + movl FCS_g2(%esi), %ebx + movl FCS_h2(%esi), %ecx + movl FCS_h3(%esi), %edx + movl %eax, FCS_g2(%esi) + movl %edx, FCS_h2(%esi) + movl %ebx, FCS_g3(%esi) + movl %ecx, FCS_h3(%esi) + jmp *%edx + ud2 + +4: // fast lookup failed /* stats only */ - addl $1, VG_(stats__n_xindir_misses_32) + addl $1, VG_(stats__n_xIndir_misses_32) movl $VG_TRC_INNER_FASTMISS, %eax movl $0, %edx diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c index 68e9590a01..bd266e4f39 100644 --- a/coregrind/m_scheduler/scheduler.c +++ b/coregrind/m_scheduler/scheduler.c @@ -130,16 +130,23 @@ static void mostly_clear_thread_record ( ThreadId tid ); static ULong n_scheduling_events_MINOR = 0; static ULong n_scheduling_events_MAJOR = 0; -/* Stats: number of XIndirs, and number that missed in the fast - cache. */ -static ULong stats__n_xindirs = 0; -static ULong stats__n_xindir_misses = 0; +/* Stats: number of XIndirs looked up in the fast cache, the number of hits in + ways 1, 2 and 3, and the number of misses. The number of hits in way 0 isn't + recorded because it can be computed from these five numbers. */ +static ULong stats__n_xIndirs = 0; +static ULong stats__n_xIndir_hits1 = 0; +static ULong stats__n_xIndir_hits2 = 0; +static ULong stats__n_xIndir_hits3 = 0; +static ULong stats__n_xIndir_misses = 0; /* And 32-bit temp bins for the above, so that 32-bit platforms don't have to do 64 bit incs on the hot path through - VG_(cp_disp_xindir). */ -/*global*/ UInt VG_(stats__n_xindirs_32) = 0; -/*global*/ UInt VG_(stats__n_xindir_misses_32) = 0; + VG_(disp_cp_xindir). 
*/ +/*global*/ UInt VG_(stats__n_xIndirs_32) = 0; +/*global*/ UInt VG_(stats__n_xIndir_hits1_32) = 0; +/*global*/ UInt VG_(stats__n_xIndir_hits2_32) = 0; +/*global*/ UInt VG_(stats__n_xIndir_hits3_32) = 0; +/*global*/ UInt VG_(stats__n_xIndir_misses_32) = 0; /* Sanity checking counts. */ static UInt sanity_fast_count = 0; @@ -149,11 +156,25 @@ void VG_(print_scheduler_stats)(void) { VG_(message)(Vg_DebugMsg, "scheduler: %'llu event checks.\n", bbs_done ); + + const ULong hits0 + = stats__n_xIndirs - stats__n_xIndir_hits1 - stats__n_xIndir_hits2 + - stats__n_xIndir_hits3 - stats__n_xIndir_misses; + VG_(message)(Vg_DebugMsg, + "scheduler: %'llu indir transfers, " + "%'llu misses (1 in %llu) ..\n", + stats__n_xIndirs, stats__n_xIndir_misses, + stats__n_xIndirs / (stats__n_xIndir_misses + ? stats__n_xIndir_misses : 1)); VG_(message)(Vg_DebugMsg, - "scheduler: %'llu indir transfers, %'llu misses (1 in %llu)\n", - stats__n_xindirs, stats__n_xindir_misses, - stats__n_xindirs / (stats__n_xindir_misses - ? stats__n_xindir_misses : 1)); + "scheduler: .. of which: %'llu hit0, %'llu hit1, " + "%'llu hit2, %'llu hit3, %'llu missed\n", + hits0, + stats__n_xIndir_hits1, + stats__n_xIndir_hits2, + stats__n_xIndir_hits3, + stats__n_xIndir_misses); + VG_(message)(Vg_DebugMsg, "scheduler: %'llu/%'llu major/minor sched events.\n", n_scheduling_events_MAJOR, n_scheduling_events_MINOR); @@ -928,8 +949,11 @@ void run_thread_for_a_while ( /*OUT*/HWord* two_words, /* end Paranoia */ /* Futz with the XIndir stats counters. */ - vg_assert(VG_(stats__n_xindirs_32) == 0); - vg_assert(VG_(stats__n_xindir_misses_32) == 0); + vg_assert(VG_(stats__n_xIndirs_32) == 0); + vg_assert(VG_(stats__n_xIndir_hits1_32) == 0); + vg_assert(VG_(stats__n_xIndir_hits2_32) == 0); + vg_assert(VG_(stats__n_xIndir_hits3_32) == 0); + vg_assert(VG_(stats__n_xIndir_misses_32) == 0); /* Clear return area. */ two_words[0] = two_words[1] = 0; @@ -940,10 +964,13 @@ void run_thread_for_a_while ( /*OUT*/HWord* two_words, host_code_addr = alt_host_addr; } else { /* normal case -- redir translation */ - UInt cno = (UInt)VG_TT_FAST_HASH((Addr)tst->arch.vex.VG_INSTR_PTR); - if (LIKELY(VG_(tt_fast)[cno].guest == (Addr)tst->arch.vex.VG_INSTR_PTR)) - host_code_addr = VG_(tt_fast)[cno].host; - else { + Addr host_from_fast_cache = 0; + Bool found_in_fast_cache + = VG_(lookupInFastCache)( &host_from_fast_cache, + (Addr)tst->arch.vex.VG_INSTR_PTR ); + if (found_in_fast_cache) { + host_code_addr = host_from_fast_cache; + } else { Addr res = 0; /* not found in VG_(tt_fast). Searching here the transtab improves the performance compared to returning directly @@ -1027,10 +1054,16 @@ void run_thread_for_a_while ( /*OUT*/HWord* two_words, /* Merge the 32-bit XIndir/miss counters into the 64 bit versions, and zero out the 32-bit ones in preparation for the next run of generated code. 
*/ - stats__n_xindirs += (ULong)VG_(stats__n_xindirs_32); - VG_(stats__n_xindirs_32) = 0; - stats__n_xindir_misses += (ULong)VG_(stats__n_xindir_misses_32); - VG_(stats__n_xindir_misses_32) = 0; + stats__n_xIndirs += (ULong)VG_(stats__n_xIndirs_32); + VG_(stats__n_xIndirs_32) = 0; + stats__n_xIndir_hits1 += (ULong)VG_(stats__n_xIndir_hits1_32); + VG_(stats__n_xIndir_hits1_32) = 0; + stats__n_xIndir_hits2 += (ULong)VG_(stats__n_xIndir_hits2_32); + VG_(stats__n_xIndir_hits2_32) = 0; + stats__n_xIndir_hits3 += (ULong)VG_(stats__n_xIndir_hits3_32); + VG_(stats__n_xIndir_hits3_32) = 0; + stats__n_xIndir_misses += (ULong)VG_(stats__n_xIndir_misses_32); + VG_(stats__n_xIndir_misses_32) = 0; /* Inspect the event counter. */ vg_assert((Int)tst->arch.vex.host_EvC_COUNTER >= -1); diff --git a/coregrind/m_transtab.c b/coregrind/m_transtab.c index ef2e3df863..23ecb11f04 100644 --- a/coregrind/m_transtab.c +++ b/coregrind/m_transtab.c @@ -457,9 +457,10 @@ static Int tc_sector_szQ = 0; static SECno sector_search_order[MAX_N_SECTORS]; -/* Fast helper for the TC. A direct-mapped cache which holds a set of - recently used (guest address, host address) pairs. This array is - referred to directly from m_dispatch/dispatch-.S. +/* Fast helper for the TC. A 4-way set-associative cache, with more-or-less LRU + replacement. It holds a set of recently used (guest address, host address) + pairs. This array is referred to directly from + m_dispatch/dispatch-.S. Entries in tt_fast may refer to any valid TC entry, regardless of which sector it's in. Consequently we must be very careful to @@ -474,13 +475,19 @@ static SECno sector_search_order[MAX_N_SECTORS]; /* typedef struct { - Addr guest; - Addr host; - } - FastCacheEntry; + Addr guest0; + Addr host0; + Addr guest1; + Addr host1; + Addr guest2; + Addr host2; + Addr guest3; + Addr host3; + } + FastCacheSet; */ -/*global*/ __attribute__((aligned(16))) - FastCacheEntry VG_(tt_fast)[VG_TT_FAST_SIZE]; +/*global*/ __attribute__((aligned(64))) + FastCacheSet VG_(tt_fast)[VG_TT_FAST_SETS]; /* Make sure we're not used before initialisation. */ static Bool init_done = False; @@ -1455,36 +1462,40 @@ static inline HTTno HASH_TT ( Addr key ) return (HTTno)(k32 % N_HTTES_PER_SECTOR); } -static void setFastCacheEntry ( Addr key, ULong* tcptr ) -{ - UInt cno = (UInt)VG_TT_FAST_HASH(key); - VG_(tt_fast)[cno].guest = key; - VG_(tt_fast)[cno].host = (Addr)tcptr; - n_fast_updates++; - /* This shouldn't fail. It should be assured by m_translate - which should reject any attempt to make translation of code - starting at TRANSTAB_BOGUS_GUEST_ADDR. */ - vg_assert(VG_(tt_fast)[cno].guest != TRANSTAB_BOGUS_GUEST_ADDR); -} - /* Invalidate the fast cache VG_(tt_fast). */ static void invalidateFastCache ( void ) { - UInt j; - /* This loop is popular enough to make it worth unrolling a - bit, at least on ppc32. 
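A note on the aligned(64) attribute above: on a 64-bit host a FastCacheSet is eight Addr-sized fields, i.e. exactly 64 bytes, so the alignment presumably ensures each set sits within a single cache line on typical hardware and a whole 4-way probe touches only one line. A standalone size check, using a local stand-in type rather than the real FastCacheSet:

   #include <stdint.h>
   /* SketchSet64 is a stand-in for FastCacheSet on an LP64 host. */
   typedef struct { uint64_t g0,h0,g1,h1,g2,h2,g3,h3; } SketchSet64;
   _Static_assert(sizeof(SketchSet64) == 64,
                  "one set == 64 bytes == a typical cache line size");
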
*/ - vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0); - for (j = 0; j < VG_TT_FAST_SIZE; j += 4) { - VG_(tt_fast)[j+0].guest = TRANSTAB_BOGUS_GUEST_ADDR; - VG_(tt_fast)[j+1].guest = TRANSTAB_BOGUS_GUEST_ADDR; - VG_(tt_fast)[j+2].guest = TRANSTAB_BOGUS_GUEST_ADDR; - VG_(tt_fast)[j+3].guest = TRANSTAB_BOGUS_GUEST_ADDR; + for (UWord j = 0; j < VG_TT_FAST_SETS; j++) { + FastCacheSet* set = &VG_(tt_fast)[j]; + set->guest0 = TRANSTAB_BOGUS_GUEST_ADDR; + set->guest1 = TRANSTAB_BOGUS_GUEST_ADDR; + set->guest2 = TRANSTAB_BOGUS_GUEST_ADDR; + set->guest3 = TRANSTAB_BOGUS_GUEST_ADDR; } - - vg_assert(j == VG_TT_FAST_SIZE); n_fast_flushes++; } +static void setFastCacheEntry ( Addr guest, ULong* tcptr ) +{ + /* This shouldn't fail. It should be assured by m_translate + which should reject any attempt to make translation of code + starting at TRANSTAB_BOGUS_GUEST_ADDR. */ + vg_assert(guest != TRANSTAB_BOGUS_GUEST_ADDR); + /* Shift all entries along one, so that the LRU one disappears, and put the + new entry at the MRU position. */ + UWord setNo = (UInt)VG_TT_FAST_HASH(guest); + FastCacheSet* set = &VG_(tt_fast)[setNo]; + set->host3 = set->host2; + set->guest3 = set->guest2; + set->host2 = set->host1; + set->guest2 = set->guest1; + set->host1 = set->host0; + set->guest1 = set->guest0; + set->host0 = (Addr)tcptr; + set->guest0 = guest; + n_fast_updates++; +} + static TTEno get_empty_tt_slot(SECno sNo) { @@ -2432,15 +2443,36 @@ void VG_(init_tt_tc) ( void ) vg_assert(N_HTTES_PER_SECTOR < INV_TTE); vg_assert(N_HTTES_PER_SECTOR < EC2TTE_DELETED); vg_assert(N_HTTES_PER_SECTOR < HTT_EMPTY); - /* check fast cache entries really are 2 words long */ + + /* check fast cache entries really are 8 words long */ vg_assert(sizeof(Addr) == sizeof(void*)); - vg_assert(sizeof(FastCacheEntry) == 2 * sizeof(Addr)); + vg_assert(sizeof(FastCacheSet) == 8 * sizeof(Addr)); /* check fast cache entries are packed back-to-back with no spaces */ vg_assert(sizeof( VG_(tt_fast) ) - == VG_TT_FAST_SIZE * sizeof(FastCacheEntry)); + == VG_TT_FAST_SETS * sizeof(FastCacheSet)); + /* check fast cache entries have the layout that the handwritten assembly + fragments assume. */ + vg_assert(sizeof(FastCacheSet) == (1 << VG_FAST_CACHE_SET_BITS)); + vg_assert(offsetof(FastCacheSet,guest0) == FCS_g0); + vg_assert(offsetof(FastCacheSet,host0) == FCS_h0); + vg_assert(offsetof(FastCacheSet,guest1) == FCS_g1); + vg_assert(offsetof(FastCacheSet,host1) == FCS_h1); + vg_assert(offsetof(FastCacheSet,guest2) == FCS_g2); + vg_assert(offsetof(FastCacheSet,host2) == FCS_h2); + vg_assert(offsetof(FastCacheSet,guest3) == FCS_g3); + vg_assert(offsetof(FastCacheSet,host3) == FCS_h3); + vg_assert(offsetof(FastCacheSet,guest0) == 0 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,host0) == 1 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,guest1) == 2 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,host1) == 3 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,guest2) == 4 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,host2) == 5 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,guest3) == 6 * sizeof(Addr)); + vg_assert(offsetof(FastCacheSet,host3) == 7 * sizeof(Addr)); + /* check fast cache is aligned as we requested. Not fatal if it isn't, but we might as well make sure. */ - vg_assert(VG_IS_16_ALIGNED( ((Addr) & VG_(tt_fast)[0]) )); + vg_assert(VG_IS_64_ALIGNED( ((Addr) & VG_(tt_fast)[0]) )); /* The TTEntryH size is critical for keeping the LLC miss rate down when doing a lot of discarding. Hence check it here. 
We also diff --git a/coregrind/pub_core_transtab.h b/coregrind/pub_core_transtab.h index 951cbd9496..a77ca3c19e 100644 --- a/coregrind/pub_core_transtab.h +++ b/coregrind/pub_core_transtab.h @@ -41,20 +41,107 @@ #include "pub_tool_transtab.h" #include "libvex.h" // VexGuestExtents -/* The fast-cache for tt-lookup. Unused entries are denoted by .guest - == 1, which is assumed to be a bogus address for all guest code. */ +/* The fast-cache for tt-lookup. Unused entries are denoted by + .guest == TRANSTAB_BOGUS_GUEST_ADDR (viz, 1), which is assumed + to be a bogus address for all guest code. See pub_core_transtab_asm.h + for further description. */ typedef struct { - Addr guest; - Addr host; + Addr guest0; + Addr host0; + Addr guest1; + Addr host1; + Addr guest2; + Addr host2; + Addr guest3; + Addr host3; } - FastCacheEntry; + FastCacheSet; -extern __attribute__((aligned(16))) - FastCacheEntry VG_(tt_fast) [VG_TT_FAST_SIZE]; +STATIC_ASSERT(sizeof(Addr) == sizeof(UWord)); +STATIC_ASSERT(sizeof(FastCacheSet) == sizeof(Addr) * 8); + +extern __attribute__((aligned(64))) + FastCacheSet VG_(tt_fast) [VG_TT_FAST_SETS]; #define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1) +#if defined(VGA_x86) || defined(VGA_amd64) +static inline UWord VG_TT_FAST_HASH ( Addr guest ) { + // There's no minimum insn alignment on these targets. + UWord merged = ((UWord)guest) >> 0; + merged = (merged >> VG_TT_FAST_BITS) ^ merged; + return merged & VG_TT_FAST_MASK; +} + +#elif defined(VGA_s390x) || defined(VGA_arm) +static inline UWord VG_TT_FAST_HASH ( Addr guest ) { + // Instructions are 2-byte aligned. + UWord merged = ((UWord)guest) >> 1; + merged = (merged >> VG_TT_FAST_BITS) ^ merged; + return merged & VG_TT_FAST_MASK; +} + +#elif defined(VGA_ppc32) || defined(VGA_ppc64be) || defined(VGA_ppc64le) \ + || defined(VGA_mips32) || defined(VGA_mips64) || defined(VGA_arm64) +static inline UWord VG_TT_FAST_HASH ( Addr guest ) { + // Instructions are 4-byte aligned. + UWord merged = ((UWord)guest) >> 2; + merged = (merged >> VG_TT_FAST_BITS) ^ merged; + return merged & VG_TT_FAST_MASK; +} + +#else +# error "VG_TT_FAST_HASH: unknown platform" +#endif + +static inline Bool VG_(lookupInFastCache)( /*MB_OUT*/Addr* host, Addr guest ) +{ + UWord setNo = (UInt)VG_TT_FAST_HASH(guest); + FastCacheSet* set = &VG_(tt_fast)[setNo]; + if (LIKELY(set->guest0 == guest)) { + // hit at way 0 + *host = set->host0; + return True; + } + if (LIKELY(set->guest1 == guest)) { + // hit at way 1; swap upwards + Addr tG = guest; + Addr tH = set->host1; + set->guest1 = set->guest0; + set->host1 = set->host0; + set->guest0 = tG; + set->host0 = tH; + *host = tH; + return True; + } + if (LIKELY(set->guest2 == guest)) { + // hit at way 2; swap upwards + Addr tG = guest; + Addr tH = set->host2; + set->guest2 = set->guest1; + set->host2 = set->host1; + set->guest1 = tG; + set->host1 = tH; + *host = tH; + return True; + } + if (LIKELY(set->guest3 == guest)) { + // hit at way 3; swap upwards + Addr tG = guest; + Addr tH = set->host3; + set->guest3 = set->guest2; + set->host3 = set->host2; + set->guest2 = tG; + set->host2 = tH; + *host = tH; + return True; + } + // Not found + *host = 0; + return False; +} + /* Initialises the TC, using VG_(clo_num_transtab_sectors) and VG_(clo_avg_transtab_entry_size). 
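To make the intended set behaviour concrete, here is a small self-contained sketch of the same policy: insertion puts the new pair at way 0 and shifts everything else towards way 3 (as in setFastCacheEntry), and a hit at way N > 0 swaps that entry with way N-1 (as in VG_(lookupInFastCache) and the dispatcher fragments). All types, names and addresses below are made up for illustration; this is not code from the patch.

   #include <assert.h>

   #define BOGUS 1UL   /* plays the role of TRANSTAB_BOGUS_GUEST_ADDR */

   typedef unsigned long W;
   typedef struct { W g[4]; W h[4]; } MiniSet;   /* ways 0 (MRU) .. 3 (LRU) */

   /* Same shape as setFastCacheEntry: shift down, new entry at way 0. */
   static void mini_set ( MiniSet* s, W guest, W host )
   {
      for (int i = 3; i > 0; i--) { s->g[i] = s->g[i-1]; s->h[i] = s->h[i-1]; }
      s->g[0] = guest;  s->h[0] = host;
   }

   /* Same shape as VG_(lookupInFastCache): a hit at way N > 0 swaps N-1 and N. */
   static int mini_lookup ( MiniSet* s, W guest, W* host )
   {
      for (int i = 0; i < 4; i++) {
         if (s->g[i] != guest) continue;
         W hitH = s->h[i];
         if (i > 0) {
            W pg = s->g[i-1], ph = s->h[i-1];
            s->g[i-1] = guest;  s->h[i-1] = hitH;
            s->g[i]   = pg;     s->h[i]   = ph;
         }
         *host = hitH;
         return 1;
      }
      *host = 0;
      return 0;
   }

   int main ( void )
   {
      MiniSet s = { {BOGUS,BOGUS,BOGUS,BOGUS}, {0,0,0,0} };
      W h;
      /* Five insertions into a 4-way set: the oldest entry falls out. */
      mini_set(&s, 0x100, 0xA);  mini_set(&s, 0x200, 0xB);
      mini_set(&s, 0x300, 0xC);  mini_set(&s, 0x400, 0xD);
      mini_set(&s, 0x500, 0xE);
      assert(!mini_lookup(&s, 0x100, &h));             /* evicted from way 3 */
      /* 0x400 now sits at way 1; a hit swaps it up to way 0. */
      assert(mini_lookup(&s, 0x400, &h) && h == 0xD);
      assert(s.g[0] == 0x400 && s.g[1] == 0x500);
      /* 0x200 is at way 3; a hit moves it only one step, to way 2. */
      assert(mini_lookup(&s, 0x200, &h) && h == 0xB);
      assert(s.g[2] == 0x200 && s.g[3] == 0x300);
      return 0;
   }
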
diff --git a/coregrind/pub_core_transtab_asm.h b/coregrind/pub_core_transtab_asm.h index e1e2687c2b..9e85774c74 100644 --- a/coregrind/pub_core_transtab_asm.h +++ b/coregrind/pub_core_transtab_asm.h @@ -31,43 +31,88 @@ #ifndef __PUB_CORE_TRANSTAB_ASM_H #define __PUB_CORE_TRANSTAB_ASM_H -/* Constants for the fast translation lookup cache. It is a direct - mapped cache, with 2^VG_TT_FAST_BITS entries. +/* Constants for the fast translation lookup cache. It is a 4 way associative + cache, with more-or-less LRU replacement. It contains 2^VG_TT_FAST_BITS + sets. + + On all targets, the set number is computed from least significant 2 * + VG_TT_FAST_BITS of the guest address. This is a bit unusual in as much as + it is more normal just to use a VG_TT_FAST_BITS-sized slice of the address + as the set number. Using twice as many bits (the two chunks are xor'd) + spreads entries out (reduces aliasing) and significantly reduces the overall + miss rate. The cost is two extra cycles on the fast lookup path, to perform + an extra shift and an xor. + + For each set there are 4 ways: way0, way1, way2 and way3. way0 is intended + to be the MRU and way3 the LRU. Most lookups hit way0 and involve no + modification of the line. A hit at way1 causes way0 and way1 to be swapped. + A hit at way2 causes way1 and way2 to be swapped; that is, way2 is moved one + step closer to the front. But not all the way to the front. Similarly a + hit at way3 causes way2 and way3 to be swapped. + + See VG_(lookupInFastCache) for a C implementation of this logic and + dispatch-*-*.S, label VG_(disp_cp_xindir), for the handcoded assembly + equivalents for each target. Note that VG_(lookupInFastCache) is used in C + land for some administrative lookups but isn't really performance critical. + The dispatch-*-*.S implementations are used to process all indirect branches + in the simulator and so *are* performance critical. + + Updates to the cache are rare. These are performed by setFastCacheEntry. + New entries are put into way0 and all others are shifted down one slot, so + that the contents of way3 falls out of the cache. On x86/amd64, the cache index is computed as - 'address[VG_TT_FAST_BITS-1 : 0]'. - - On ppc32/ppc64/mips32/mips64/arm64, the bottom two bits of - instruction addresses are zero, which means that function causes - only 1/4 of the entries to ever be used. So instead the function - is '(address >>u 2)[VG_TT_FAST_BITS-1 : 0]' on those targets. - - On ARM we shift by 1, since Thumb insns can be of size 2, hence to - minimise collisions and maximise cache utilisation we need to take - into account all but the least significant bit. - - On s390x the rightmost bit of an instruction address is zero. - For best table utilization shift the address to the right by 1 bit. */ - -#define VG_TT_FAST_BITS 15 -#define VG_TT_FAST_SIZE (1 << VG_TT_FAST_BITS) -#define VG_TT_FAST_MASK ((VG_TT_FAST_SIZE) - 1) - -/* This macro isn't usable in asm land; nevertheless this seems - like a good place to put it. 
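To see why folding in the upper address bits helps, take two made-up amd64 guest addresses whose low 13 bits are identical and which differ only at bit 13: a plain bottom-bits index maps them to the same set, while the xor'd form described above separates them. The self-contained check below is not part of the patch; it hard-codes 13 for VG_TT_FAST_BITS (as defined below) and uses the x86/amd64 hash variant, which has no alignment shift.

   #include <assert.h>

   #define BITS 13                      /* value of VG_TT_FAST_BITS below */
   #define MASK ((1UL << BITS) - 1)     /* value of VG_TT_FAST_MASK below */

   /* Both helpers are illustrative names, not Valgrind functions. */
   static unsigned long plain_index ( unsigned long a ) { return a & MASK; }
   static unsigned long mixed_index ( unsigned long a ) { return ((a >> BITS) ^ a) & MASK; }

   int main ( void )
   {
      assert(plain_index(0x401000UL) == plain_index(0x403000UL));  /* alias */
      assert(mixed_index(0x401000UL) != mixed_index(0x403000UL));  /* split */
      return 0;
   }
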
*/ - -#if defined(VGA_x86) || defined(VGA_amd64) -# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) ) & VG_TT_FAST_MASK) - -#elif defined(VGA_s390x) || defined(VGA_arm) -# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) >> 1) & VG_TT_FAST_MASK) - -#elif defined(VGA_ppc32) || defined(VGA_ppc64be) || defined(VGA_ppc64le) \ - || defined(VGA_mips32) || defined(VGA_mips64) || defined(VGA_arm64) -# define VG_TT_FAST_HASH(_addr) ((((UWord)(_addr)) >> 2) & VG_TT_FAST_MASK) + (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1 : 0]'. + + On ppc32/ppc64/mips32/mips64/arm64, the bottom two bits of instruction + addresses are zero, which means the above function causes only 1/4 of the + sets to ever be used. So instead the function is + (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1+2 : 0+2]'. + + On arm32, the minimum instruction size is 2, so we discard only the least + significant bit of the address, hence: + (address ^ (address >>u VG_TT_FAST_BITS))[VG_TT_FAST_BITS-1+1 : 0+1]'. + + On s390x the rightmost bit of an instruction address is zero, so the arm32 + scheme is used. */ + +#define VG_TT_FAST_BITS 13 +#define VG_TT_FAST_SETS (1 << VG_TT_FAST_BITS) +#define VG_TT_FAST_MASK ((VG_TT_FAST_SETS) - 1) + +// Log2(sizeof(FastCacheSet)). This is needed in the handwritten assembly. + +#if defined(VGA_amd64) || defined(VGA_arm64) \ + || defined(VGA_ppc64be) || defined(VGA_ppc64le) || defined(VGA_mips64) \ + || defined(VGA_s390x) + // And all other 64-bit hosts +# define VG_FAST_CACHE_SET_BITS 6 + // These FCS_{g,h}{0,1,2,3} are the values of + // offsetof(FastCacheSet,{guest,host}{0,1,2,3}). +# define FCS_g0 0 +# define FCS_h0 8 +# define FCS_g1 16 +# define FCS_h1 24 +# define FCS_g2 32 +# define FCS_h2 40 +# define FCS_g3 48 +# define FCS_h3 56 + +#elif defined(VGA_x86) || defined(VGA_arm) || defined(VGA_ppc32) \ + || defined(VGA_mips32) + // And all other 32-bit hosts +# define VG_FAST_CACHE_SET_BITS 5 +# define FCS_g0 0 +# define FCS_h0 4 +# define FCS_g1 8 +# define FCS_h1 12 +# define FCS_g2 16 +# define FCS_h2 20 +# define FCS_g3 24 +# define FCS_h3 28 #else -# error "VG_TT_FAST_HASH: unknown platform" +# error "VG_FAST_CACHE_SET_BITS not known" #endif #endif // __PUB_CORE_TRANSTAB_ASM_H diff --git a/include/pub_tool_libcbase.h b/include/pub_tool_libcbase.h index f68579a8fe..476272cdbc 100644 --- a/include/pub_tool_libcbase.h +++ b/include/pub_tool_libcbase.h @@ -193,6 +193,7 @@ static void VG_(bzero_inline) ( void* s, SizeT sz ) #define VG_IS_8_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x7))) #define VG_IS_16_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0xf))) #define VG_IS_32_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x1f))) +#define VG_IS_64_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)0x3f))) #define VG_IS_WORD_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)(sizeof(Addr)-1)))) #define VG_IS_PAGE_ALIGNED(aaa_p) (0 == (((Addr)(aaa_p)) & ((Addr)(VKI_PAGE_SIZE-1))))
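
The per-word-size blocks above encode nothing more than the layout of a FastCacheSet: FCS_gN and FCS_hN are the byte offsets of the 2N-th and (2N+1)-th Addr-sized fields, and VG_FAST_CACHE_SET_BITS is log2 of the whole set (8 fields of 8 bytes = 64 = 2^6 on 64-bit hosts; 8 fields of 4 bytes = 32 = 2^5 on 32-bit hosts), which the offsetof assertions added to VG_(init_tt_tc) pin down at startup. A standalone cross-check with local stand-in types, not code from the patch:

   #include <stddef.h>
   #include <stdint.h>
   /* SketchSet64 / SketchSet32 are stand-ins for FastCacheSet on 64- and
      32-bit hosts respectively. */
   typedef struct { uint64_t g0,h0,g1,h1,g2,h2,g3,h3; } SketchSet64;
   typedef struct { uint32_t g0,h0,g1,h1,g2,h2,g3,h3; } SketchSet32;
   _Static_assert(sizeof(SketchSet64) == (1u << 6) && offsetof(SketchSet64, h3) == 56,
                  "matches VG_FAST_CACHE_SET_BITS == 6 and FCS_h3 == 56");
   _Static_assert(sizeof(SketchSet32) == (1u << 5) && offsetof(SketchSet32, h3) == 28,
                  "matches VG_FAST_CACHE_SET_BITS == 5 and FCS_h3 == 28");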