From: Julian Seward Date: Mon, 2 Apr 2012 21:56:03 +0000 (+0000) Subject: Add translation chaining support for amd64, x86 and ARM X-Git-Tag: svn/VALGRIND_3_8_0~350^2~11 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8b6f93641ca2f6f8caccdf40c29c4271f6799489;p=thirdparty%2Fvalgrind.git Add translation chaining support for amd64, x86 and ARM (Valgrind side). See #296422. git-svn-id: svn://svn.valgrind.org/valgrind/branches/TCHAIN@12484 --- diff --git a/Makefile.all.am b/Makefile.all.am index fe6fccd201..fdf4ab99aa 100644 --- a/Makefile.all.am +++ b/Makefile.all.am @@ -92,7 +92,9 @@ AM_CFLAGS_BASE = \ -Wmissing-declarations \ @FLAG_W_NO_FORMAT_ZERO_LENGTH@ \ -fno-strict-aliasing \ - -fno-builtin + -fno-builtin \ + \ + -O # These flags are used for building the preload shared objects. # The aim is to give reasonable performance but also to have good diff --git a/coregrind/m_dispatch/dispatch-amd64-linux.S b/coregrind/m_dispatch/dispatch-amd64-linux.S index a3e22d5a83..459c44708c 100644 --- a/coregrind/m_dispatch/dispatch-amd64-linux.S +++ b/coregrind/m_dispatch/dispatch-amd64-linux.S @@ -39,30 +39,36 @@ /*------------------------------------------------------------*/ /*--- ---*/ -/*--- The dispatch loop. VG_(run_innerloop) is used to ---*/ -/*--- run all translations except no-redir ones. ---*/ +/*--- The dispatch loop. VG_(disp_run_translations) is ---*/ +/*--- used to run all translations, ---*/ +/*--- including no-redir ones. ---*/ /*--- ---*/ /*------------------------------------------------------------*/ /*----------------------------------------------------*/ -/*--- Preamble (set everything up) ---*/ +/*--- Entry and preamble (set everything up) ---*/ /*----------------------------------------------------*/ /* signature: -UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); +UWord VG_(disp_run_translations)( UWord* two_words, + void* guest_state, + Addr host_addr ); */ - .text -.globl VG_(run_innerloop) -.type VG_(run_innerloop), @function -VG_(run_innerloop): - /* %rdi holds guest_state */ - /* %rsi holds do_profiling */ - - /* ----- entry point to VG_(run_innerloop) ----- */ +.globl VG_(disp_run_translations) +.type VG_(disp_run_translations), @function +VG_(disp_run_translations): + /* %rdi holds two_words */ + /* %rsi holds guest_state */ + /* %rdx holds host_addr */ + + /* The preamble */ + + /* Save integer registers, since this is a pseudo-function. */ + pushq %rax pushq %rbx pushq %rcx - pushq %rdx + pushq %rdx pushq %rsi pushq %rbp pushq %r8 @@ -73,20 +79,10 @@ VG_(run_innerloop): pushq %r13 pushq %r14 pushq %r15 - pushq %rdi /* guest_state */ - - movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15 - movl (%r15), %r15d - pushq %r15 + /* %rdi must be saved last */ + pushq %rdi - /* 8(%rsp) holds cached copy of guest_state ptr */ - /* 0(%rsp) holds cached copy of VG_(dispatch_ctr) */ - - /* Set up the guest state pointer */ - movq %rdi, %rbp - - /* fetch %RIP into %rax */ - movq OFFSET_amd64_RIP(%rbp), %rax + /* Get the host CPU in the state expected by generated code. */ /* set host FPU control word to the default mode expected by VEX-generated code. 
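For orientation, this is how the new entry point is driven from the C side; the fragment is condensed from the m_scheduler/scheduler.c hunk later in this patch (run_thread_for_a_while), so the names come from there rather than being invented here.

    HWord two_words[2];
    two_words[0] = two_words[1] = 0;
    VG_(disp_run_translations)( two_words,
                                (void*)&tst->arch.vex,  /* guest state          */
                                host_code_addr );       /* host code to run     */
    /* two_words[0] holds a VG_TRC_* value.  two_words[1] is only
       meaningful for VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP, where it is the
       address of the call site that needs patching. */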
See comments in libvex.h for @@ -105,158 +101,37 @@ VG_(run_innerloop): /* set dir flag to known value */ cld - /* fall into main loop (the right one) */ - cmpq $0, %rsi - je VG_(run_innerloop__dispatch_unassisted_unprofiled) - jmp VG_(run_innerloop__dispatch_unassisted_profiled) - /*NOTREACHED*/ - -/*----------------------------------------------------*/ -/*--- NO-PROFILING (standard) dispatcher ---*/ -/*----------------------------------------------------*/ - -.align 16 -.global VG_(run_innerloop__dispatch_unassisted_unprofiled) -VG_(run_innerloop__dispatch_unassisted_unprofiled): - /* AT ENTRY: %rax is next guest addr, %rbp is the - unmodified guest state ptr */ - - /* save the jump address in the guest state */ - movq %rax, OFFSET_amd64_RIP(%rbp) - - /* Are we out of timeslice? If yes, defer to scheduler. */ - subl $1, 0(%rsp) - jz counter_is_zero - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx /* next guest addr */ - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - /* generated code should run, then jump back to either - VG_(run_innerloop__dispatch_unassisted_unprofiled) - VG_(run_innerloop__dispatch_assisted_unprofiled). */ - /*NOTREACHED*/ - -.align 16 -.global VG_(run_innerloop__dispatch_assisted_unprofiled) -VG_(run_innerloop__dispatch_assisted_unprofiled): - /* AT ENTRY: %rax is next guest addr, %rbp is the - modified guest state ptr */ - /* We know the guest state pointer has been modified. - So jump directly to gsp_changed. */ - jmp gsp_changed - ud2 - /*NOTREACHED*/ - -/*----------------------------------------------------*/ -/*--- PROFILING dispatcher (can be much slower) ---*/ -/*----------------------------------------------------*/ - -.align 16 -.global VG_(run_innerloop__dispatch_unassisted_profiled) -VG_(run_innerloop__dispatch_unassisted_profiled): - /* AT ENTRY: %rax is next guest addr, %rbp is the - unmodified guest state ptr */ - - /* save the jump address in the guest state */ - movq %rax, OFFSET_amd64_RIP(%rbp) - - /* Are we out of timeslice? If yes, defer to scheduler. */ - subl $1, 0(%rsp) - jz counter_is_zero - - /* try a fast lookup in the translation cache */ - movabsq $VG_(tt_fast), %rcx - movq %rax, %rbx - andq $VG_TT_FAST_MASK, %rbx /* entry# */ - shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ - movq 0(%rcx,%rbx,1), %r10 /* .guest */ - movq 8(%rcx,%rbx,1), %r11 /* .host */ - cmpq %rax, %r10 - jnz fast_lookup_failed - - /* increment bb profile counter */ - movabsq $VG_(tt_fastN), %rdx - shrq $1, %rbx /* entry# * sizeof(UInt*) */ - movq (%rdx,%rbx,1), %rdx - addl $1, (%rdx) + /* Set up the guest state pointer */ + movq %rsi, %rbp - /* Found a match. Jump to .host. */ - jmp *%r11 - ud2 /* persuade insn decoders not to speculate past here */ - /* generated code should run, then jump back to either - VG_(run_innerloop__dispatch_unassisted_profiled) - VG_(run_innerloop__dispatch_assisted_profiled). */ - /*NOTREACHED*/ - -.align 16 -.global VG_(run_innerloop__dispatch_assisted_profiled) -VG_(run_innerloop__dispatch_assisted_profiled): - /* AT ENTRY: %rax is next guest addr, %rbp is the - modified guest state ptr */ - - /* Well, we know the guest state pointer has been modified. - So jump directly to gsp_changed. 
*/ - jmp gsp_changed - ud2 - /*NOTREACHED*/ + /* and jump into the code cache. Chained translations in + the code cache run, until for whatever reason, they can't + continue. When that happens, the translation in question + will jump (or call) to one of the continuation points + VG_(cp_...) below. */ + jmpq *%rdx + /*NOTREACHED*/ /*----------------------------------------------------*/ -/*--- exit points ---*/ +/*--- Postamble and exit. ---*/ /*----------------------------------------------------*/ -gsp_changed: - /* Someone messed with the gsp. Have to - defer to scheduler to resolve this. dispatch ctr - is not yet decremented, so no need to increment. */ - /* %RIP is NOT up to date here. First, need to write - %rax back to %RIP, but without trashing %rbp since - that holds the value we want to return to the scheduler. - Hence use %r15 transiently for the guest state pointer. */ - movq 8(%rsp), %r15 - movq %rax, OFFSET_amd64_RIP(%r15) - movq %rbp, %rax - jmp run_innerloop_exit - /*NOTREACHED*/ - -counter_is_zero: - /* %RIP is up to date here */ - /* back out decrement of the dispatch counter */ - addl $1, 0(%rsp) - movq $VG_TRC_INNER_COUNTERZERO, %rax - jmp run_innerloop_exit - -fast_lookup_failed: - /* %RIP is up to date here */ - /* back out decrement of the dispatch counter */ - addl $1, 0(%rsp) - movq $VG_TRC_INNER_FASTMISS, %rax - jmp run_innerloop_exit - - - -/* All exits from the dispatcher go through here. %rax holds - the return value. -*/ -run_innerloop_exit: - /* We're leaving. Check that nobody messed with - %mxcsr or %fpucw. We can't mess with %rax here as it - holds the tentative return value, but any other is OK. */ +postamble: + /* At this point, %rax and %rdx contain two + words to be returned to the caller. %rax + holds a TRC value, and %rdx optionally may + hold another word (for CHAIN_ME exits, the + address of the place to patch.) */ + + /* We're leaving. Check that nobody messed with %mxcsr + or %fpucw. We can't mess with %rax or %rdx here as they + hold the tentative return values, but any others are OK. */ #if !defined(ENABLE_INNER) /* This check fails for self-hosting, so skip in that case */ pushq $0 fstcw (%rsp) cmpl $0x027F, (%rsp) - popq %r15 /* get rid of the word without trashing %eflags */ + popq %r15 /* get rid of the word without trashing %rflags */ jnz invariant_violation #endif pushq $0 @@ -266,20 +141,17 @@ run_innerloop_exit: popq %r15 jnz invariant_violation /* otherwise we're OK */ - jmp run_innerloop_exit_REALLY - + jmp remove_frame invariant_violation: movq $VG_TRC_INVARIANT_FAILED, %rax - jmp run_innerloop_exit_REALLY - -run_innerloop_exit_REALLY: - - /* restore VG_(dispatch_ctr) */ - popq %r14 - movq VG_(dispatch_ctr)@GOTPCREL(%rip), %r15 - movl %r14d, (%r15) + movq $0, %rdx +remove_frame: + /* Pop %rdi, stash return values */ popq %rdi + movq %rax, 0(%rdi) + movq %rdx, 8(%rdi) + /* Now pop everything else */ popq %r15 popq %r14 popq %r13 @@ -293,61 +165,89 @@ run_innerloop_exit_REALLY: popq %rdx popq %rcx popq %rbx + popq %rax ret -.size VG_(run_innerloop), .-VG_(run_innerloop) + +/*----------------------------------------------------*/ +/*--- Continuation points ---*/ +/*----------------------------------------------------*/ - -/*------------------------------------------------------------*/ -/*--- ---*/ -/*--- A special dispatcher, for running no-redir ---*/ -/*--- translations. Just runs the given translation once. 
---*/ -/*--- ---*/ -/*------------------------------------------------------------*/ +/* ------ Chain me to slow entry point ------ */ +.global VG_(disp_cp_chain_me_to_slowEP) +VG_(disp_cp_chain_me_to_slowEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_S, RA) */ + movq $VG_TRC_CHAIN_ME_TO_SLOW_EP, %rax + popq %rdx + /* 10 = movabsq $VG_(disp_chain_me_to_slowEP), %r11; + 3 = call *%r11 */ + subq $10+3, %rdx + jmp postamble + +/* ------ Chain me to fast entry point ------ */ +.global VG_(disp_cp_chain_me_to_fastEP) +VG_(disp_cp_chain_me_to_fastEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_F, RA) */ + movq $VG_TRC_CHAIN_ME_TO_FAST_EP, %rax + popq %rdx + /* 10 = movabsq $VG_(disp_chain_me_to_fastEP), %r11; + 3 = call *%r11 */ + subq $10+3, %rdx + jmp postamble + +/* ------ Indirect but boring jump ------ */ +.global VG_(disp_cp_xindir) +VG_(disp_cp_xindir): + /* Where are we going? */ + movq OFFSET_amd64_RIP(%rbp), %rax -/* signature: -void VG_(run_a_noredir_translation) ( UWord* argblock ); -*/ + /* RM ME -- stats only */ + addq $1, vgPlain_stats__n_xindirs + + /* try a fast lookup in the translation cache */ + movabsq $VG_(tt_fast), %rcx + movq %rax, %rbx /* next guest addr */ + andq $VG_TT_FAST_MASK, %rbx /* entry# */ + shlq $4, %rbx /* entry# * sizeof(FastCacheEntry) */ + movq 0(%rcx,%rbx,1), %r10 /* .guest */ + movq 8(%rcx,%rbx,1), %r11 /* .host */ + cmpq %rax, %r10 + jnz fast_lookup_failed -/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args - and 2 to carry results: - 0: input: ptr to translation - 1: input: ptr to guest state - 2: output: next guest PC - 3: output: guest state pointer afterwards (== thread return code) -*/ -.align 16 -.global VG_(run_a_noredir_translation) -.type VG_(run_a_noredir_translation), @function -VG_(run_a_noredir_translation): - /* Save callee-saves regs */ - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - pushq %rdi /* we will need it after running the translation */ - movq 8(%rdi), %rbp - jmp *0(%rdi) - /*NOTREACHED*/ - ud2 - /* If the translation has been correctly constructed, we - should resume at the the following label. */ -.global VG_(run_a_noredir_translation__return_point) -VG_(run_a_noredir_translation__return_point): - popq %rdi - movq %rax, 16(%rdi) - movq %rbp, 24(%rdi) - - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - ret -.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation) + /* Found a match. Jump to .host. 
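The two CHAIN_ME continuation points above recover the patch site from their own return address. Expressed in C (a sketch; return_addr stands for the word popped off the stack above), the arithmetic and the resulting two-word return are:

    /* 10 bytes: movabsq of the chain-me helper's address into %r11
        3 bytes: call *%r11                                          */
    Addr patch_site = return_addr - (10 + 3);      /* start of that sequence    */
    two_words[0]    = VG_TRC_CHAIN_ME_TO_SLOW_EP;  /* or .._FAST_EP             */
    two_words[1]    = patch_site;                  /* handed to handle_chain_me */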
*/ + jmp *%r11 + ud2 /* persuade insn decoders not to speculate past here */ + +fast_lookup_failed: + /* RM ME -- stats only */ + addq $1, vgPlain_stats__n_xindir_misses + + movq $VG_TRC_INNER_FASTMISS, %rax + movq $0, %rdx + jmp postamble + +/* ------ Assisted jump ------ */ +.global VG_(disp_cp_xassisted) +VG_(disp_cp_xassisted): + /* %rbp contains the TRC */ + movq %rbp, %rax + movq $0, %rdx + jmp postamble + +/* ------ Event check failed ------ */ +.global VG_(disp_cp_evcheck_fail) +VG_(disp_cp_evcheck_fail): + movq $VG_TRC_INNER_COUNTERZERO, %rax + movq $0, %rdx + jmp postamble + + +.size VG_(disp_run_translations), .-VG_(disp_run_translations) /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",@progbits diff --git a/coregrind/m_dispatch/dispatch-arm-linux.S b/coregrind/m_dispatch/dispatch-arm-linux.S index 9e2334957e..4833a75bcc 100644 --- a/coregrind/m_dispatch/dispatch-arm-linux.S +++ b/coregrind/m_dispatch/dispatch-arm-linux.S @@ -1,3 +1,4 @@ + /*--------------------------------------------------------------------*/ /*--- The core dispatch loop, for jumping to a code address. ---*/ /*--- dispatch-arm-linux.S ---*/ @@ -45,121 +46,121 @@ /*------------------------------------------------------------*/ /*----------------------------------------------------*/ -/*--- Preamble (set everything up) ---*/ +/*--- Entry and preamble (set everything up) ---*/ /*----------------------------------------------------*/ /* signature: -UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); +UWord VG_(disp_run_translations)( UWord* two_words, + void* guest_state, + Addr host_addr ); */ .text -.globl VG_(run_innerloop) -VG_(run_innerloop): - push {r0, r1, r4, r5, r6, r7, r8, r9, fp, lr} +.global VG_(disp_run_translations) +VG_(disp_run_translations): + /* r0 holds two_words + r1 holds guest_state + r2 holds host_addr + */ + /* The number of regs in this list needs to be even, in + order to keep the stack 8-aligned. */ + push {r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} /* set FPSCR to vex-required default value */ mov r4, #0 fmxr fpscr, r4 - /* r0 (hence also [sp,#0]) holds guest_state */ - /* r1 holds do_profiling */ - mov r8, r0 - ldr r0, [r8, #OFFSET_arm_R15T] - - /* fall into main loop (the right one) */ - cmp r1, #0 /* do_profiling */ - beq VG_(run_innerloop__dispatch_unprofiled) - b VG_(run_innerloop__dispatch_profiled) - + /* Set up the guest state pointer */ + mov r8, r1 + /* and jump into the code cache. Chained translations in + the code cache run, until for whatever reason, they can't + continue. When that happens, the translation in question + will jump (or call) to one of the continuation points + VG_(cp_...) below. */ + bx r2 + /* NOTREACHED */ + /*----------------------------------------------------*/ -/*--- NO-PROFILING (standard) dispatcher ---*/ +/*--- Postamble and exit. ---*/ /*----------------------------------------------------*/ -/* Pairing of insns below is my guesstimate of how dual dispatch would - work on an A8. JRS, 2011-May-28 */ - -.global VG_(run_innerloop__dispatch_unprofiled) -VG_(run_innerloop__dispatch_unprofiled): - - /* AT ENTRY: r0 is next guest addr, r8 is possibly - modified guest state ptr */ - - /* Has the guest state pointer been messed with? If yes, exit. */ - movw r3, #:lower16:VG_(dispatch_ctr) - tst r8, #1 - - movt r3, #:upper16:VG_(dispatch_ctr) - - bne gsp_changed - - /* save the jump address in the guest state */ - str r0, [r8, #OFFSET_arm_R15T] - - /* Are we out of timeslice? 
If yes, defer to scheduler. */ - ldr r2, [r3] - - subs r2, r2, #1 - - str r2, [r3] - - beq counter_is_zero - - /* try a fast lookup in the translation cache */ - // r0 = next guest, r1,r2,r3,r4 scratch - movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK - movw r4, #:lower16:VG_(tt_fast) - - and r2, r1, r0, LSR #1 // r2 = entry # - movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast) - - add r1, r4, r2, LSL #3 // r1 = &tt_fast[entry#] - - ldrd r4, r5, [r1, #0] // r4 = .guest, r5 = .host - - cmp r4, r0 +postamble: + /* At this point, r1 and r2 contain two + words to be returned to the caller. r1 + holds a TRC value, and r2 optionally may + hold another word (for CHAIN_ME exits, the + address of the place to patch.) */ - bne fast_lookup_failed - // r5: next-host r8: live, gsp - // r4: next-guest - // r2: entry # - // LIVE: r5, r8; all others dead - - /* Found a match. Jump to .host. */ - blx r5 - b VG_(run_innerloop__dispatch_unprofiled) -.ltorg - /*NOTREACHED*/ + /* We're leaving. Check that nobody messed with + FPSCR in ways we don't expect. */ + fmrx r4, fpscr + bic r4, #0xF8000000 /* mask out NZCV and QC */ + bic r4, #0x0000009F /* mask out IDC,IXC,UFC,OFC,DZC,IOC */ + cmp r4, #0 + beq remove_frame /* we're OK */ + /* otherwise we have an invariant violation */ + movw r1, #VG_TRC_INVARIANT_FAILED + movw r2, #0 + /* fall through */ + +remove_frame: + /* Restore int regs, including importantly r0 (two_words) */ + pop {r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + /* Stash return values */ + str r1, [r0, #0] + str r2, [r0, #4] + bx lr /*----------------------------------------------------*/ -/*--- PROFILING dispatcher (can be much slower) ---*/ +/*--- Continuation points ---*/ /*----------------------------------------------------*/ -.global VG_(run_innerloop__dispatch_profiled) -VG_(run_innerloop__dispatch_profiled): - - /* AT ENTRY: r0 is next guest addr, r8 is possibly - modified guest state ptr */ - - /* Has the guest state pointer been messed with? If yes, exit. */ - movw r3, #:lower16:VG_(dispatch_ctr) - tst r8, #1 - - movt r3, #:upper16:VG_(dispatch_ctr) - - bne gsp_changed - - /* save the jump address in the guest state */ - str r0, [r8, #OFFSET_arm_R15T] - - /* Are we out of timeslice? If yes, defer to scheduler. */ - ldr r2, [r3] - - subs r2, r2, #1 - - str r2, [r3] - - beq counter_is_zero - +/* ------ Chain me to slow entry point ------ */ +.global VG_(disp_cp_chain_me_to_slowEP) +VG_(disp_cp_chain_me_to_slowEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_S, RA) */ + mov r1, #VG_TRC_CHAIN_ME_TO_SLOW_EP + mov r2, lr + /* 4 = movw r12, lo16(disp_cp_chain_me_to_slowEP) + 4 = movt r12, hi16(disp_cp_chain_me_to_slowEP) + 4 = blx r12 */ + sub r2, r2, #4+4+4 + b postamble + +/* ------ Chain me to fast entry point ------ */ +.global VG_(disp_cp_chain_me_to_fastEP) +VG_(disp_cp_chain_me_to_fastEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_F, RA) */ + mov r1, #VG_TRC_CHAIN_ME_TO_FAST_EP + mov r2, lr + /* 4 = movw r12, lo16(disp_cp_chain_me_to_fastEP) + 4 = movt r12, hi16(disp_cp_chain_me_to_fastEP) + 4 = blx r12 */ + sub r2, r2, #4+4+4 + b postamble + +/* ------ Indirect but boring jump ------ */ +.global VG_(disp_cp_xindir) +VG_(disp_cp_xindir): + /* Where are we going? 
*/ + ldr r0, [r8, #OFFSET_arm_R15T] + + /* RM ME -- stats only */ + movw r1, #:lower16:vgPlain_stats__n_xindirs + movt r1, #:upper16:vgPlain_stats__n_xindirs + ldr r2, [r1, #0] + adds r2, r2, #1 + str r2, [r1, #0] + ldr r2, [r1, #4] + adc r2, r2, #0 + str r2, [r1, #4] + /* try a fast lookup in the translation cache */ // r0 = next guest, r1,r2,r3,r4 scratch movw r1, #VG_TT_FAST_MASK // r1 = VG_TT_FAST_MASK @@ -174,121 +175,41 @@ VG_(run_innerloop__dispatch_profiled): cmp r4, r0 - bne fast_lookup_failed - // r5: next-host r8: live, gsp - // r4: next-guest - // r2: entry # - // LIVE: r5, r8; all others dead - - /* increment bb profile counter */ - movw r0, #:lower16:VG_(tt_fastN) - movt r0, #:upper16:VG_(tt_fastN) // r0 = &tt_fastN[0] - ldr r0, [r0, r2, LSL #2] // r0 = tt_fast[entry #] - ldr r3, [r0] // *r0 ++ - add r3, r3, #1 - str r3, [r0] - - /* Found a match. Jump to .host. */ - blx r5 - b VG_(run_innerloop__dispatch_profiled) - /*NOTREACHED*/ - -/*----------------------------------------------------*/ -/*--- exit points ---*/ -/*----------------------------------------------------*/ - -gsp_changed: - // r0 = next guest addr (R15T), r8 = modified gsp - /* Someone messed with the gsp. Have to - defer to scheduler to resolve this. dispatch ctr - is not yet decremented, so no need to increment. */ - /* R15T is NOT up to date here. First, need to write - r0 back to R15T, but without trashing r8 since - that holds the value we want to return to the scheduler. - Hence use r1 transiently for the guest state pointer. */ - ldr r1, [sp, #0] - str r0, [r1, #OFFSET_arm_R15T] - mov r0, r8 // "return modified gsp" - b run_innerloop_exit - /*NOTREACHED*/ - -counter_is_zero: - /* R15T is up to date here */ - /* Back out increment of the dispatch ctr */ - ldr r1, =VG_(dispatch_ctr) - ldr r2, [r1] - add r2, r2, #1 - str r2, [r1] - mov r0, #VG_TRC_INNER_COUNTERZERO - b run_innerloop_exit - /*NOTREACHED*/ - -fast_lookup_failed: - /* R15T is up to date here */ - /* Back out increment of the dispatch ctr */ - ldr r1, =VG_(dispatch_ctr) - ldr r2, [r1] - add r2, r2, #1 - str r2, [r1] - mov r0, #VG_TRC_INNER_FASTMISS - b run_innerloop_exit - /*NOTREACHED*/ - -/* All exits from the dispatcher go through here. %r0 holds - the return value. -*/ -run_innerloop_exit: - /* We're leaving. Check that nobody messed with - FPSCR in ways we don't expect. */ - fmrx r4, fpscr - bic r4, #0xF8000000 /* mask out NZCV and QC */ - bic r4, #0x0000009F /* mask out IDC,IXC,UFC,OFC,DZC,IOC */ - cmp r4, #0 - bne invariant_violation - b run_innerloop_exit_REALLY - -invariant_violation: - mov r0, #VG_TRC_INVARIANT_FAILED - b run_innerloop_exit_REALLY - -run_innerloop_exit_REALLY: - add sp, sp, #8 - pop {r4, r5, r6, r7, r8, r9, fp, pc} - -.size VG_(run_innerloop), .-VG_(run_innerloop) - - -/*------------------------------------------------------------*/ -/*--- ---*/ -/*--- A special dispatcher, for running no-redir ---*/ -/*--- translations. Just runs the given translation once. ---*/ -/*--- ---*/ -/*------------------------------------------------------------*/ - -/* signature: -void VG_(run_a_noredir_translation) ( UWord* argblock ); -*/ - -/* Run a no-redir translation. 
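The amd64 and ARM VG_(disp_cp_xindir) above (and the x86 one later in this patch) perform the same fast translation-cache probe. As a C sketch: the two-word entry layout is inferred from the .guest/.host comments and the shl $4 / LSL #3 / scale-8 addressing, and the helper names are illustrative only.

    typedef struct { Addr guest; Addr host; } FastCacheEntry;  /* assumed layout */
    extern FastCacheEntry VG_(tt_fast)[];

    static void xindir_sketch ( Addr next_guest )
    {
       /* amd64/x86 index with the address itself; ARM first shifts out
          the Thumb bit (LSR #1), as in the code above. */
       UWord e = (UWord)next_guest & VG_TT_FAST_MASK;
       if (VG_(tt_fast)[e].guest == next_guest)
          jump_to_host( VG_(tt_fast)[e].host );          /* hit: stay in the cache */
       else
          return_two_words( VG_TRC_INNER_FASTMISS, 0 );  /* miss: back to C        */
    }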
argblock points to 4 UWords, 2 to carry args - and 2 to carry results: - 0: input: ptr to translation - 1: input: ptr to guest state - 2: output: next guest PC - 3: output: guest state pointer afterwards (== thread return code) -*/ -.global VG_(run_a_noredir_translation) -VG_(run_a_noredir_translation): - push {r0,r1 /* EABI compliance */, r4-r12, lr} - ldr r8, [r0, #4] - mov lr, pc - ldr pc, [r0, #0] - - pop {r1} - str r0, [r1, #8] - str r8, [r1, #12] - pop {r1/*EABI compliance*/,r4-r12, pc} - -.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation) + // jump to host if lookup succeeded + bxeq r5 + + /* otherwise the fast lookup failed */ + /* RM ME -- stats only */ + movw r1, #:lower16:vgPlain_stats__n_xindir_misses + movt r1, #:upper16:vgPlain_stats__n_xindir_misses + ldr r2, [r1, #0] + adds r2, r2, #1 + str r2, [r1, #0] + ldr r2, [r1, #4] + adc r2, r2, #0 + str r2, [r1, #4] + + mov r1, #VG_TRC_INNER_FASTMISS + mov r2, #0 + b postamble + +/* ------ Assisted jump ------ */ +.global VG_(disp_cp_xassisted) +VG_(disp_cp_xassisted): + /* r8 contains the TRC */ + mov r1, r8 + mov r2, #0 + b postamble + +/* ------ Event check failed ------ */ +.global VG_(disp_cp_evcheck_fail) +VG_(disp_cp_evcheck_fail): + mov r1, #VG_TRC_INNER_COUNTERZERO + mov r2, #0 + b postamble + + +.size VG_(disp_run_translations), .-VG_(disp_run_translations) /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",%progbits diff --git a/coregrind/m_dispatch/dispatch-x86-linux.S b/coregrind/m_dispatch/dispatch-x86-linux.S index 3e13ba65e9..7db1de1e2a 100644 --- a/coregrind/m_dispatch/dispatch-x86-linux.S +++ b/coregrind/m_dispatch/dispatch-x86-linux.S @@ -45,20 +45,27 @@ /*------------------------------------------------------------*/ /*----------------------------------------------------*/ -/*--- Preamble (set everything up) ---*/ +/*--- Entry and preamble (set everything up) ---*/ /*----------------------------------------------------*/ /* signature: -UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); +UWord VG_(disp_run_translations)( UWord* two_words, + void* guest_state, + Addr host_addr ); */ .text -.globl VG_(run_innerloop) -.type VG_(run_innerloop), @function -VG_(run_innerloop): - /* 4(%esp) holds guest_state */ - /* 8(%esp) holds do_profiling */ - - /* ----- entry point to VG_(run_innerloop) ----- */ +.globl VG_(disp_run_translations) +.type VG_(disp_run_translations), @function +VG_(disp_run_translations): + /* 0(%esp) holds our return address. */ + /* 4(%esp) holds two_words */ + /* 8(%esp) holds guest_state */ + /* 12(%esp) holds host_addr */ + + /* The preamble */ + + /* Save integer registers, since this is a pseudo-function. */ + pushl %eax pushl %ebx pushl %ecx pushl %edx @@ -66,14 +73,11 @@ VG_(run_innerloop): pushl %edi pushl %ebp - /* 28(%esp) holds guest_state */ - /* 32(%esp) holds do_profiling */ + /* 28+4(%esp) holds two_words */ + /* 28+8(%esp) holds guest_state */ + /* 28+12(%esp) holds host_addr */ - /* Set up the guest state pointer */ - movl 28(%esp), %ebp - - /* fetch %EIP into %eax */ - movl OFFSET_x86_EIP(%ebp), %eax + /* Get the host CPU in the state expected by generated code. */ /* set host FPU control word to the default mode expected by VEX-generated code. 
See comments in libvex.h for @@ -93,151 +97,32 @@ VG_(run_innerloop): L1: /* set dir flag to known value */ cld - - /* fall into main loop (the right one) */ - cmpl $0, 32(%esp) /* do_profiling */ - je VG_(run_innerloop__dispatch_unassisted_unprofiled) - jmp VG_(run_innerloop__dispatch_unassisted_profiled) - /*NOTREACHED*/ -/*----------------------------------------------------*/ -/*--- NO-PROFILING (standard) dispatcher ---*/ -/*----------------------------------------------------*/ - -.align 16 -.global VG_(run_innerloop__dispatch_unassisted_unprofiled) -VG_(run_innerloop__dispatch_unassisted_unprofiled): - /* AT ENTRY: %eax is next guest addr, %ebp is the - unmodified guest state ptr */ - - /* save the jump address in the guest state */ - movl %eax, OFFSET_x86_EIP(%ebp) - - /* Are we out of timeslice? If yes, defer to scheduler. */ - subl $1, VG_(dispatch_ctr) - jz counter_is_zero - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $ VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - /* generated code should run, then jump back to either - VG_(run_innerloop__dispatch_unassisted_unprofiled) or - VG_(run_innerloop__dispatch_assisted_unprofiled). */ - /*NOTREACHED*/ - -.align 16 -.global VG_(run_innerloop__dispatch_assisted_unprofiled) -VG_(run_innerloop__dispatch_assisted_unprofiled): - /* AT ENTRY: %eax is next guest addr, %ebp is the - modified guest state ptr */ - /* We know the guest state pointer has been modified. - So jump directly to gsp_changed. */ - jmp gsp_changed - ud2 - /*NOTREACHED*/ - -/*----------------------------------------------------*/ -/*--- PROFILING dispatcher (can be much slower) ---*/ -/*----------------------------------------------------*/ - -.align 16 -.global VG_(run_innerloop__dispatch_unassisted_profiled) -VG_(run_innerloop__dispatch_unassisted_profiled): - /* AT ENTRY: %eax is next guest addr, %ebp is the - unmodified guest state ptr */ - - /* save the jump address in the guest state */ - movl %eax, OFFSET_x86_EIP(%ebp) - - /* Are we out of timeslice? If yes, defer to scheduler. */ - subl $1, VG_(dispatch_ctr) - jz counter_is_zero - - /* try a fast lookup in the translation cache */ - movl %eax, %ebx /* next guest addr */ - andl $ VG_TT_FAST_MASK, %ebx /* entry# */ - movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ - movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ - cmpl %eax, %esi - jnz fast_lookup_failed - - /* increment bb profile counter */ - /* note: innocuous as this sounds, it causes a huge amount more - stress on D1 and significantly slows everything down. */ - movl VG_(tt_fastN)(,%ebx,4), %edx - /* Use "addl $1", not "incl", to avoid partial-flags stall on P4 */ - addl $1, (%edx) - - /* Found a match. Jump to .host. */ - jmp *%edi - ud2 /* persuade insn decoders not to speculate past here */ - /* generated code should run, then jump back to either - VG_(run_innerloop__dispatch_unassisted_profiled) or - VG_(run_innerloop__dispatch_assisted_profiled). */ - /*NOTREACHED*/ - -.align 16 -.global VG_(run_innerloop__dispatch_assisted_profiled) -VG_(run_innerloop__dispatch_assisted_profiled): - /* AT ENTRY: %eax is next guest addr, %ebp is the - modified guest state ptr */ - /* We know the guest state pointer has been modified. - So jump directly to gsp_changed. 
*/ - jmp gsp_changed - ud2 + /* Set up the guest state pointer */ + movl 28+8(%esp), %ebp + + /* and jump into the code cache. Chained translations in + the code cache run, until for whatever reason, they can't + continue. When that happens, the translation in question + will jump (or call) to one of the continuation points + VG_(cp_...) below. */ + jmpl *28+12(%esp) /*NOTREACHED*/ /*----------------------------------------------------*/ -/*--- exit points ---*/ +/*--- Postamble and exit. ---*/ /*----------------------------------------------------*/ -gsp_changed: - /* Someone messed with the gsp. Have to - defer to scheduler to resolve this. dispatch ctr - is not yet decremented, so no need to increment. */ - /* %EIP is NOT up to date here. First, need to write - %eax back to %EIP, but without trashing %ebp since - that holds the value we want to return to the scheduler. - Hence use %esi transiently for the guest state pointer. */ - movl 28(%esp), %esi - movl %eax, OFFSET_x86_EIP(%esi) - movl %ebp, %eax - jmp run_innerloop_exit - /*NOTREACHED*/ - -counter_is_zero: - /* %EIP is up to date here */ - /* back out decrement of the dispatch counter */ - addl $1, VG_(dispatch_ctr) - movl $ VG_TRC_INNER_COUNTERZERO, %eax - jmp run_innerloop_exit - /*NOTREACHED*/ - -fast_lookup_failed: - /* %EIP is up to date here */ - /* back out decrement of the dispatch counter */ - addl $1, VG_(dispatch_ctr) - movl $ VG_TRC_INNER_FASTMISS, %eax - jmp run_innerloop_exit - /*NOTREACHED*/ +postamble: + /* At this point, %eax and %edx contain two + words to be returned to the caller. %eax + holds a TRC value, and %edx optionally may + hold another word (for CHAIN_ME exits, the + address of the place to patch.) */ - - -/* All exits from the dispatcher go through here. %eax holds - the return value. -*/ -run_innerloop_exit: - /* We're leaving. Check that nobody messed with - %mxcsr or %fpucw. We can't mess with %eax here as it - holds the tentative return value, but any other is OK. */ + /* We're leaving. Check that nobody messed with %mxcsr + or %fpucw. We can't mess with %eax or %edx here as they + holds the tentative return value, but any others are OK. */ #if !defined(ENABLE_INNER) /* This check fails for self-hosting, so skip in that case */ pushl $0 @@ -246,7 +131,7 @@ run_innerloop_exit: popl %esi /* get rid of the word without trashing %eflags */ jnz invariant_violation #endif - cmpl $0, VG_(machine_x86_have_mxcsr) +# cmpl $0, VG_(machine_x86_have_mxcsr) jz L2 pushl $0 stmxcsr (%esp) @@ -255,72 +140,107 @@ run_innerloop_exit: popl %esi jnz invariant_violation L2: /* otherwise we're OK */ - jmp run_innerloop_exit_REALLY - + jmp remove_frame invariant_violation: - movl $ VG_TRC_INVARIANT_FAILED, %eax - jmp run_innerloop_exit_REALLY - -run_innerloop_exit_REALLY: + movl $VG_TRC_INVARIANT_FAILED, %eax + movl $0, %edx + +remove_frame: + /* Stash return values */ + movl 28+4(%esp), %edi /* two_words */ + movl %eax, 0(%edi) + movl %edx, 4(%edi) + /* Restore int regs and return. */ popl %ebp popl %edi popl %esi popl %edx popl %ecx popl %ebx + popl %eax ret -.size VG_(run_innerloop), .-VG_(run_innerloop) + +/*----------------------------------------------------*/ +/*--- Continuation points ---*/ +/*----------------------------------------------------*/ +/* ------ Chain me to slow entry point ------ */ +.global VG_(disp_cp_chain_me_to_slowEP) +VG_(disp_cp_chain_me_to_slowEP): + /* We got called. The return address indicates + where the patching needs to happen. 
Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_S, RA) */ + movl $VG_TRC_CHAIN_ME_TO_SLOW_EP, %eax + popl %edx + /* 5 = movl $VG_(disp_chain_me_to_slowEP), %edx; + 2 = call *%edx */ + subl $5+2, %edx + jmp postamble + +/* ------ Chain me to fast entry point ------ */ +.global VG_(disp_cp_chain_me_to_fastEP) +VG_(disp_cp_chain_me_to_fastEP): + /* We got called. The return address indicates + where the patching needs to happen. Collect + the return address and, exit back to C land, + handing the caller the pair (Chain_me_F, RA) */ + movl $VG_TRC_CHAIN_ME_TO_FAST_EP, %eax + popl %edx + /* 5 = movl $VG_(disp_chain_me_to_fastEP), %edx; + 2 = call *%edx */ + subl $5+2, %edx + jmp postamble + +/* ------ Indirect but boring jump ------ */ +.global VG_(disp_cp_xindir) +VG_(disp_cp_xindir): + /* Where are we going? */ + movl OFFSET_x86_EIP(%ebp), %eax -/*------------------------------------------------------------*/ -/*--- ---*/ -/*--- A special dispatcher, for running no-redir ---*/ -/*--- translations. Just runs the given translation once. ---*/ -/*--- ---*/ -/*------------------------------------------------------------*/ + /* RM ME -- stats only */ + addl $1, vgPlain_stats__n_xindirs + adcl $0, vgPlain_stats__n_xindirs+4 + + /* try a fast lookup in the translation cache */ + movl %eax, %ebx /* next guest addr */ + andl $VG_TT_FAST_MASK, %ebx /* entry# */ + movl 0+VG_(tt_fast)(,%ebx,8), %esi /* .guest */ + movl 4+VG_(tt_fast)(,%ebx,8), %edi /* .host */ + cmpl %eax, %esi + jnz fast_lookup_failed + + /* Found a match. Jump to .host. */ + jmp *%edi + ud2 /* persuade insn decoders not to speculate past here */ -/* signature: -void VG_(run_a_noredir_translation) ( UWord* argblock ); -*/ +fast_lookup_failed: + /* RM ME -- stats only */ + addl $1, vgPlain_stats__n_xindir_misses + adcl $0, vgPlain_stats__n_xindir_misses+4 + + movl $VG_TRC_INNER_FASTMISS, %eax + movl $0, %edx + jmp postamble + +/* ------ Assisted jump ------ */ +.global VG_(disp_cp_xassisted) +VG_(disp_cp_xassisted): + /* %ebp contains the TRC */ + movl %ebp, %eax + movl $0, %edx + jmp postamble + +/* ------ Event check failed ------ */ +.global VG_(disp_cp_evcheck_fail) +VG_(disp_cp_evcheck_fail): + movl $VG_TRC_INNER_COUNTERZERO, %eax + movl $0, %edx + jmp postamble + + +.size VG_(disp_run_translations), .-VG_(disp_run_translations) -/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args - and 2 to carry results: - 0: input: ptr to translation - 1: input: ptr to guest state - 2: output: next guest PC - 3: output: guest state pointer afterwards (== thread return code) -*/ -.align 16 -.global VG_(run_a_noredir_translation) -.type VG_(run_a_noredir_translation), @function -VG_(run_a_noredir_translation): - /* Save callee-saves regs */ - pushl %esi - pushl %edi - pushl %ebp - pushl %ebx - - movl 20(%esp), %edi /* %edi = argblock */ - movl 4(%edi), %ebp /* argblock[1] */ - jmp *0(%edi) /* argblock[0] */ - /*NOTREACHED*/ - ud2 - /* If the translation has been correctly constructed, we - should resume at the the following label. 
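Each port steps its return address back over a fixed-size "load helper address, then call it" sequence. Collecting the three constants used in the continuation points above (the macro names here are illustrative, not part of the patch):

    #define CHAIN_ME_SZB_AMD64  (10 + 3)     /* movabsq $helper, %r11 ; call *%r11 */
    #define CHAIN_ME_SZB_X86    ( 5 + 2)     /* movl    $helper, %edx ; call *%edx */
    #define CHAIN_ME_SZB_ARM    (4 + 4 + 4)  /* movw/movt r12, helper ; blx  r12   */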
*/ -.global VG_(run_a_noredir_translation__return_point) -VG_(run_a_noredir_translation__return_point): - movl 20(%esp), %edi - movl %eax, 8(%edi) /* argblock[2] */ - movl %ebp, 12(%edi) /* argblock[3] */ - - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.size VG_(run_a_noredir_translation), .-VG_(run_a_noredir_translation) - - /* Let the linker know we don't need an executable stack */ .section .note.GNU-stack,"",@progbits diff --git a/coregrind/m_errormgr.c b/coregrind/m_errormgr.c index 44976a804b..788041b443 100644 --- a/coregrind/m_errormgr.c +++ b/coregrind/m_errormgr.c @@ -966,7 +966,8 @@ void VG_(show_all_errors) ( Int verbosity, Bool xml ) if ((i+1 == VG_(clo_dump_error))) { StackTrace ips = VG_(get_ExeContext_StackTrace)(p_min->where); - VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/, + VG_(translate) ( NULL/*caused_discardP*/, + 0 /* dummy ThreadId; irrelevant due to debugging*/, ips[0], /*debugging*/True, 0xFE/*verbosity*/, /*bbs_done*/0, /*allow redir?*/True); diff --git a/coregrind/m_gdbserver/server.c b/coregrind/m_gdbserver/server.c index 2736419c8c..8e58589919 100644 --- a/coregrind/m_gdbserver/server.c +++ b/coregrind/m_gdbserver/server.c @@ -310,7 +310,8 @@ int handle_gdb_valgrind_command (char* mon, OutputSink* sink_wanted_at_return) address = thumb_pc (address); # endif - VG_(translate) ( 0 /* dummy ThreadId; irrelevant due to debugging*/, + VG_(translate) ( NULL/*caused_discardP*/, + 0 /* dummy ThreadId; irrelevant due to debugging*/, address, /*debugging*/True, (Int) vex_verbosity, diff --git a/coregrind/m_libcproc.c b/coregrind/m_libcproc.c index 4462b6a2fc..de16c53bde 100644 --- a/coregrind/m_libcproc.c +++ b/coregrind/m_libcproc.c @@ -716,6 +716,59 @@ void VG_(do_atfork_child)(ThreadId tid) } +/* --------------------------------------------------------------------- + icache invalidation + ------------------------------------------------------------------ */ + +void VG_(invalidate_icache) ( void *ptr, SizeT nbytes ) +{ +# if defined(VGA_ppc32) || defined(VGA_ppc64) + Addr startaddr = (Addr) ptr; + Addr endaddr = startaddr + nbytes; + Addr cls; + Addr addr; + VexArchInfo vai; + + if (nbytes == 0) return; + vg_assert(nbytes > 0); + + VG_(machine_get_VexArchInfo)( NULL, &vai ); + cls = vai.ppc_cache_line_szB; + + /* Stay sane .. */ + vg_assert(cls == 32 || cls == 64 || cls == 128); + + startaddr &= ~(cls - 1); + for (addr = startaddr; addr < endaddr; addr += cls) { + __asm__ __volatile__("dcbst 0,%0" : : "r" (addr)); + } + __asm__ __volatile__("sync"); + for (addr = startaddr; addr < endaddr; addr += cls) { + __asm__ __volatile__("icbi 0,%0" : : "r" (addr)); + } + __asm__ __volatile__("sync; isync"); + +# elif defined(VGA_x86) + /* no need to do anything, hardware provides coherence */ + +# elif defined(VGA_amd64) + /* no need to do anything, hardware provides coherence */ + +# elif defined(VGA_s390x) + /* no need to do anything, hardware provides coherence */ + +# elif defined(VGP_arm_linux) + /* ARM cache flushes are privileged, so we must defer to the kernel. 
*/ + Addr startaddr = (Addr) ptr; + Addr endaddr = startaddr + nbytes; + VG_(do_syscall2)(__NR_ARM_cacheflush, startaddr, endaddr); + +# else +# error "Unknown ARCH" +# endif +} + + /*--------------------------------------------------------------------*/ /*--- end ---*/ /*--------------------------------------------------------------------*/ diff --git a/coregrind/m_main.c b/coregrind/m_main.c index 094e884dfb..3bddb47926 100644 --- a/coregrind/m_main.c +++ b/coregrind/m_main.c @@ -1373,7 +1373,8 @@ void show_BB_profile ( BBProfEntry tops[], UInt n_tops, ULong score_total ) score_here, buf_here, tops[r].addr, name ); VG_(printf)("\n"); VG_(discard_translations)(tops[r].addr, 1, "bb profile"); - VG_(translate)(0, tops[r].addr, True, VG_(clo_profile_flags), 0, True); + VG_(translate)(NULL/*caused_discardP*/, + 0, tops[r].addr, True, VG_(clo_profile_flags), 0, True); VG_(printf)("=-=-=-=-=-=-=-=-=-=-=-=-=-= end BB rank %d " "=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n", r); } @@ -1881,13 +1882,13 @@ Int valgrind_main ( Int argc, HChar **argv, HChar **envp ) VG_(printf)("pid=%d, entering delay loop\n", VG_(getpid)()); # if defined(VGP_x86_linux) - iters = 5; + iters = 10; # elif defined(VGP_amd64_linux) || defined(VGP_ppc64_linux) iters = 10; # elif defined(VGP_ppc32_linux) iters = 5; # elif defined(VGP_arm_linux) - iters = 1; + iters = 5; # elif defined(VGP_s390x_linux) iters = 10; # elif defined(VGO_darwin) diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c index be2ae59066..349b56b355 100644 --- a/coregrind/m_scheduler/scheduler.c +++ b/coregrind/m_scheduler/scheduler.c @@ -55,7 +55,23 @@ the OS handles threading and signalling are abstracted away and implemented elsewhere. [Some of the functions have worked their way back for the moment, until we do an OS port in earnest...] - */ +*/ + +/* FIXME tchaining tests: + - extensive spinrounds + - with sched quantum = 1 -- check that handle_noredir_jump + doesn't return with INNER_COUNTERZERO + other: + - out of date comment w.r.t. bit 0 set in libvex_trc_values.h + - can VG_TRC_BORING still happen? if not, rm + - memory leaks in m_transtab (InEdgeArr/OutEdgeArr leaking?) + - move do_cacheflush out of m_transtab + - more economical unchaining when nuking an entire sector + - ditto w.r.t. cache flushes + - add comments about caused_discard to handle_chain_me() + - verify case of 2 paths from A to B + - check -- is IP_AT_SYSCALL still right? +*/ #include "pub_core_basics.h" #include "pub_core_debuglog.h" @@ -108,9 +124,6 @@ /* If False, a fault is Valgrind-internal (ie, a bug) */ Bool VG_(in_generated_code) = False; -/* Counts downwards in VG_(run_innerloop). */ -UInt VG_(dispatch_ctr); - /* 64-bit counter for the number of basic blocks done. */ static ULong bbs_done = 0; @@ -130,6 +143,9 @@ static void mostly_clear_thread_record ( ThreadId tid ); static ULong n_scheduling_events_MINOR = 0; static ULong n_scheduling_events_MAJOR = 0; +ULong VG_(stats__n_xindirs) = 0; +ULong VG_(stats__n_xindir_misses) = 0; + /* Sanity checking counts. 
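VG_(invalidate_icache) above gives the core a single place to make freshly written host code visible to instruction fetch. The call sites are not part of this hunk (see the FIXME item about moving do_cacheflush out of m_transtab), so the following usage is only a sketch with illustrative names:

    /* After rewriting host code in place, e.g. when (un)chaining a
       patch site, flush the modified range.  On x86/amd64/s390x this
       is a no-op because the hardware keeps I- and D-sides coherent. */
    patch_in_place( place_to_patch, new_bytes, len );   /* hypothetical patcher */
    VG_(invalidate_icache)( place_to_patch, len );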
*/ static UInt sanity_fast_count = 0; static UInt sanity_slow_count = 0; @@ -137,7 +153,12 @@ static UInt sanity_slow_count = 0; void VG_(print_scheduler_stats)(void) { VG_(message)(Vg_DebugMsg, - "scheduler: %'llu jumps (bb entries).\n", bbs_done ); + "scheduler: %'llu event checks.\n", bbs_done ); + VG_(message)(Vg_DebugMsg, + "scheduler: %'llu indir transfers, %'llu misses (1 in %llu)\n", + VG_(stats__n_xindirs), VG_(stats__n_xindir_misses), + VG_(stats__n_xindirs) / (VG_(stats__n_xindir_misses) + ? VG_(stats__n_xindir_misses) : 1)); VG_(message)(Vg_DebugMsg, "scheduler: %'llu/%'llu major/minor sched events.\n", n_scheduling_events_MAJOR, n_scheduling_events_MINOR); @@ -700,14 +721,34 @@ static void do_pre_run_checks ( ThreadState* tst ) vg_assert(sz_spill == LibVEX_N_SPILL_BYTES); vg_assert(a_vex + 3 * sz_vex == a_spill); +# if defined(VGA_x86) + /* x86 XMM regs must form an array, ie, have no holes in + between. */ + vg_assert( + (offsetof(VexGuestX86State,guest_XMM7) + - offsetof(VexGuestX86State,guest_XMM0)) + == (8/*#regs*/-1) * 16/*bytes per reg*/ + ); + vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestX86State,guest_XMM0))); + vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestX86State,guest_FPREG))); + vg_assert(8 == offsetof(VexGuestX86State,guest_EAX)); + vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EAX))); + vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EIP))); +# endif + # if defined(VGA_amd64) - /* x86/amd64 XMM regs must form an array, ie, have no - holes in between. */ + /* amd64 XMM regs must form an array, ie, have no holes in + between. */ vg_assert( (offsetof(VexGuestAMD64State,guest_XMM16) - offsetof(VexGuestAMD64State,guest_XMM0)) == (17/*#regs*/-1) * 16/*bytes per reg*/ ); + vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestAMD64State,guest_XMM0))); + vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_FPREG))); + vg_assert(16 == offsetof(VexGuestAMD64State,guest_RAX)); + vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RAX))); + vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RIP))); # endif # if defined(VGA_ppc32) || defined(VGA_ppc64) @@ -724,10 +765,10 @@ static void do_pre_run_checks ( ThreadState* tst ) # if defined(VGA_arm) /* arm guest_state VFP regs must be 8 byte aligned for - loads/stores. */ - vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D0)); - vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D0)); - vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow2.guest_D0)); + loads/stores. Let's use 16 just to be on the safe side. */ + vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_D0)); + vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_D0)); + vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_D0)); /* be extra paranoid .. */ vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D1)); vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D1)); @@ -755,30 +796,82 @@ void VG_(force_vgdb_poll) ( void ) } /* Run the thread tid for a while, and return a VG_TRC_* value - indicating why VG_(run_innerloop) stopped. */ -static UInt run_thread_for_a_while ( ThreadId tid ) + indicating why VG_(disp_run_translations) stopped, and possibly an + auxiliary word. Also, only allow the thread to run for at most + *dispatchCtrP events. If (as is the normal case) use_alt_host_addr + is False, we are running ordinary redir'd translations, and we + should therefore start by looking up the guest next IP in TT. 
If + it is True then we ignore the guest next IP and just run from + alt_host_addr, which presumably points at host code for a no-redir + translation. + + Return results are placed in two_words. two_words[0] is set to the + TRC. In the case where that is VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP, + the address to patch is placed in two_words[1]. +*/ +static +void run_thread_for_a_while ( /*OUT*/HWord* two_words, + /*MOD*/Int* dispatchCtrP, + ThreadId tid, + HWord alt_host_addr, + Bool use_alt_host_addr ) { - volatile UWord jumped; - volatile ThreadState* tst = NULL; /* stop gcc complaining */ - volatile UInt trc; - volatile Int dispatch_ctr_SAVED; - volatile Int done_this_time; + volatile HWord jumped = 0; + volatile ThreadState* tst = NULL; /* stop gcc complaining */ + volatile UInt trc = 0; + volatile Int done_this_time = 0; + volatile HWord host_code_addr = 0; /* Paranoia */ vg_assert(VG_(is_valid_tid)(tid)); vg_assert(VG_(is_running_thread)(tid)); vg_assert(!VG_(is_exiting)(tid)); + vg_assert(*dispatchCtrP > 0); tst = VG_(get_ThreadState)(tid); do_pre_run_checks( (ThreadState*)tst ); /* end Paranoia */ - trc = 0; - dispatch_ctr_SAVED = VG_(dispatch_ctr); + /* Clear return area. */ + two_words[0] = two_words[1] = 0; + + /* Figure out where we're starting from. */ + if (use_alt_host_addr) { + /* unusual case -- no-redir translation */ + host_code_addr = alt_host_addr; + vg_assert(host_code_addr != 0); /* implausible */ + } else { + /* normal case -- redir translation */ + AddrH res = 0; + Bool found = VG_(search_transtab)( + &res, NULL, NULL, + (Addr64)tst->arch.vex.VG_INSTR_PTR, + True/*upd cache -- necessary?*/ + ); + if (found) { + host_code_addr = res; + vg_assert(host_code_addr != 0); /* implausible */ + } else { + host_code_addr = 0; + } + } + + /* At this point, either host_code_addr is nonzero, in which case + we're OK, or it's zero, in which case we know that we intended + to start at a normal redir translation, but it was not found. + In which case we can return now claiming it's not findable. */ + if (host_code_addr == 0) { + two_words[0] = VG_TRC_INNER_FASTMISS; /* hmm, is that right? */ + return; + } /* there should be no undealt-with signals */ //vg_assert(VG_(threads)[tid].siginfo.si_signo == 0); + /* Set up event counter stuff for the run. */ + tst->arch.vex.host_EvC_COUNTER = *dispatchCtrP; + tst->arch.vex.host_EvC_FAILADDR = (HWord)&VG_(disp_cp_evcheck_fail); + if (0) { vki_sigset_t m; Int i, err = VG_(sigprocmask)(VKI_SIG_SETMASK, NULL, &m); @@ -790,6 +883,8 @@ static UInt run_thread_for_a_while ( ThreadId tid ) VG_(printf)("\n"); } + /* Set up return-value area. */ + // Tell the tool this thread is about to run client code VG_TRACK( start_client_code, tid, bbs_done ); @@ -799,26 +894,37 @@ static UInt run_thread_for_a_while ( ThreadId tid ) SCHEDSETJMP( tid, jumped, - trc = (UInt)VG_(run_innerloop)( (void*)&tst->arch.vex, - VG_(clo_profile_flags) > 0 ? 1 : 0 ) + trc = (UInt)VG_(disp_run_translations)( + two_words, + (void*)&tst->arch.vex, + host_code_addr + ) ); vg_assert(VG_(in_generated_code) == True); VG_(in_generated_code) = False; - if (jumped != (UWord)0) { + if (jumped != (HWord)0) { /* We get here if the client took a fault that caused our signal handler to longjmp. 
*/ vg_assert(trc == 0); - trc = VG_TRC_FAULT_SIGNAL; + two_words[0] = VG_TRC_FAULT_SIGNAL; + two_words[1] = 0; block_signals(); } - done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 0; + vg_assert((Int)tst->arch.vex.host_EvC_COUNTER >= -1); + vg_assert(tst->arch.vex.host_EvC_FAILADDR + == (HWord)&VG_(disp_cp_evcheck_fail)); + + done_this_time = *dispatchCtrP - ((Int)tst->arch.vex.host_EvC_COUNTER + 1); vg_assert(done_this_time >= 0); bbs_done += (ULong)done_this_time; + *dispatchCtrP -= done_this_time; + vg_assert(*dispatchCtrP >= 0); + // Tell the tool this thread has stopped running client code VG_TRACK( stop_client_code, tid, bbs_done ); @@ -832,89 +938,16 @@ static UInt run_thread_for_a_while ( ThreadId tid ) VG_(gdbserver) (tid); } - return trc; -} - - -/* Run a no-redir translation just once, and return the resulting - VG_TRC_* value. */ -static UInt run_noredir_translation ( Addr hcode, ThreadId tid ) -{ - volatile UWord jumped; - volatile ThreadState* tst; - volatile UWord argblock[4]; - volatile UInt retval; - - /* Paranoia */ - vg_assert(VG_(is_valid_tid)(tid)); - vg_assert(VG_(is_running_thread)(tid)); - vg_assert(!VG_(is_exiting)(tid)); - - tst = VG_(get_ThreadState)(tid); - do_pre_run_checks( (ThreadState*)tst ); - /* end Paranoia */ - -# if defined(VGA_ppc32) || defined(VGA_ppc64) - /* I don't think we need to clear this thread's guest_RESVN here, - because we can only get here if run_thread_for_a_while() has - been used immediately before, on this same thread. */ -# endif - - /* There can be 3 outcomes from VG_(run_a_noredir_translation): - - - a signal occurred and the sighandler longjmp'd. Then both [2] - and [3] are unchanged - hence zero. - - - translation ran normally, set [2] (next guest IP) and set [3] - to whatever [1] was beforehand, indicating a normal (boring) - jump to the next block. - - - translation ran normally, set [2] (next guest IP) and set [3] - to something different from [1] beforehand, which indicates a - TRC_ value. - */ - argblock[0] = (UWord)hcode; - argblock[1] = (UWord)&VG_(threads)[tid].arch.vex; - argblock[2] = 0; /* next guest IP is written here */ - argblock[3] = 0; /* guest state ptr afterwards is written here */ - - // Tell the tool this thread is about to run client code - VG_TRACK( start_client_code, tid, bbs_done ); - - vg_assert(VG_(in_generated_code) == False); - VG_(in_generated_code) = True; - - SCHEDSETJMP( - tid, - jumped, - VG_(run_a_noredir_translation)( &argblock[0] ) - ); - - VG_(in_generated_code) = False; - - if (jumped != (UWord)0) { - /* We get here if the client took a fault that caused our signal - handler to longjmp. */ - vg_assert(argblock[2] == 0); /* next guest IP was not written */ - vg_assert(argblock[3] == 0); /* trc was not written */ - block_signals(); - retval = VG_TRC_FAULT_SIGNAL; + /* TRC value and possible auxiliary patch-address word are already + in two_words[0] and [1] respectively, as a result of the call to + VG_(run_innerloop). */ + /* Stay sane .. 
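The generated-code half of the event-check protocol is emitted by VEX and does not appear in this diff; from the two fields set up before the run, it presumably reduces to something like the following at the head of each translation:

    if (--guest_state->host_EvC_COUNTER < 0)       /* one event check per block */
       jump_to( guest_state->host_EvC_FAILADDR );  /* == VG_(disp_cp_evcheck_fail),
                                                      which returns
                                                      VG_TRC_INNER_COUNTERZERO */

That would explain why the counter is allowed to come back as -1 and why done_this_time above is computed with a +1 correction.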
*/ + if (two_words[0] == VG_TRC_CHAIN_ME_TO_SLOW_EP + || two_words[0] == VG_TRC_CHAIN_ME_TO_FAST_EP) { + vg_assert(two_words[1] != 0); /* we have a legit patch addr */ } else { - /* store away the guest program counter */ - VG_(set_IP)( tid, argblock[2] ); - if (argblock[3] == argblock[1]) - /* the guest state pointer afterwards was unchanged */ - retval = VG_TRC_BORING; - else - retval = (UInt)argblock[3]; + vg_assert(two_words[1] == 0); /* nobody messed with it */ } - - bbs_done++; - - // Tell the tool this thread has stopped running client code - VG_TRACK( stop_client_code, tid, bbs_done ); - - return retval; } @@ -929,13 +962,16 @@ static void handle_tt_miss ( ThreadId tid ) /* Trivial event. Miss in the fast-cache. Do a full lookup for it. */ - found = VG_(search_transtab)( NULL, ip, True/*upd_fast_cache*/ ); + found = VG_(search_transtab)( NULL, NULL, NULL, + ip, True/*upd_fast_cache*/ ); if (UNLIKELY(!found)) { /* Not found; we need to request a translation. */ - if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/, + if (VG_(translate)( NULL/*caused_discardP*/, + tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done, True/*allow redirection*/ )) { - found = VG_(search_transtab)( NULL, ip, True ); - vg_assert2(found, "VG_TRC_INNER_FASTMISS: missing tt_fast entry"); + found = VG_(search_transtab)( NULL, NULL, NULL, + ip, True ); + vg_assert2(found, "handle_tt_miss: missing tt_fast entry"); } else { // If VG_(translate)() fails, it's because it had to throw a @@ -947,6 +983,46 @@ static void handle_tt_miss ( ThreadId tid ) } } +static +void handle_chain_me ( ThreadId tid, void* place_to_chain, Bool toFastEP ) +{ + Bool found = False; + Addr ip = VG_(get_IP)(tid); + UInt to_sNo = (UInt)-1; + UInt to_tteNo = (UInt)-1; + Bool caused_discard = False; + + found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo, + ip, False/*dont_upd_fast_cache*/ ); + if (!found) { + /* Not found; we need to request a translation. */ + if (VG_(translate)( &caused_discard, + tid, ip, /*debug*/False, 0/*not verbose*/, + bbs_done, True/*allow redirection*/ )) { + found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo, + ip, False ); + vg_assert2(found, "handle_chain_me: missing tt_fast entry"); + } else { + // If VG_(translate)() fails, it's because it had to throw a + // signal because the client jumped to a bad address. That + // means that either a signal has been set up for delivery, + // or the thread has been marked for termination. Either + // way, we just need to go back into the scheduler loop. + return; + } + } + vg_assert(found); + vg_assert(to_sNo != -1); + vg_assert(to_tteNo != -1); + + /* So, finally we know where to patch through to. Do the patching + and update the various admin tables that allow it to be undone + in the case that the destination block gets deleted. */ + if (!caused_discard) + VG_(tt_tc_do_chaining)( place_to_chain, + to_sNo, to_tteNo, toFastEP ); +} + static void handle_syscall(ThreadId tid, UInt trc) { ThreadState * volatile tst = VG_(get_ThreadState)(tid); @@ -978,28 +1054,35 @@ static void handle_syscall(ThreadId tid, UInt trc) /* tid just requested a jump to the noredir version of its current program counter. So make up that translation if needed, run it, - and return the resulting thread return code. */ -static UInt/*trc*/ handle_noredir_jump ( ThreadId tid ) + and return the resulting thread return code in two_words[]. */ +static +void handle_noredir_jump ( /*OUT*/HWord* two_words, + /*MOD*/Int* dispatchCtrP, + ThreadId tid ) { + /* Clear return area. 
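The FIXME list at the top of this file asks for a comment on the caused_discard guard in handle_chain_me() above; the point, presumably, is that the guarded call is only safe while the patch site still exists:

    /* If creating the destination's translation forced code to be
       dumped from the translation cache, the block containing
       place_to_chain may itself be gone, so patching that address
       now could write into memory that no longer holds it.  Hence: */
    if (!caused_discard)
       VG_(tt_tc_do_chaining)( place_to_chain, to_sNo, to_tteNo, toFastEP );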
*/ + two_words[0] = two_words[1] = 0; + AddrH hcode = 0; Addr ip = VG_(get_IP)(tid); Bool found = VG_(search_unredir_transtab)( &hcode, ip ); if (!found) { /* Not found; we need to request a translation. */ - if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done, + if (VG_(translate)( NULL/*caused_discardP*/, + tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done, False/*NO REDIRECTION*/ )) { found = VG_(search_unredir_transtab)( &hcode, ip ); vg_assert2(found, "unredir translation missing after creation?!"); - } else { // If VG_(translate)() fails, it's because it had to throw a // signal because the client jumped to a bad address. That // means that either a signal has been set up for delivery, // or the thread has been marked for termination. Either // way, we just need to go back into the scheduler loop. - return VG_TRC_BORING; + two_words[0] = VG_TRC_BORING; + return; } } @@ -1007,8 +1090,10 @@ static UInt/*trc*/ handle_noredir_jump ( ThreadId tid ) vg_assert(found); vg_assert(hcode != 0); - /* Otherwise run it and return the resulting VG_TRC_* value. */ - return run_noredir_translation( hcode, tid ); + /* Otherwise run it and return the resulting VG_TRC_* value. */ + vg_assert(*dispatchCtrP > 0); /* so as to guarantee progress */ + run_thread_for_a_while( two_words, dispatchCtrP, tid, + hcode, True/*use hcode*/ ); } @@ -1020,7 +1105,9 @@ static UInt/*trc*/ handle_noredir_jump ( ThreadId tid ) */ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) { - UInt trc; + /* Holds the remaining size of this thread's "timeslice". */ + Int dispatch_ctr = 0; + ThreadState *tst = VG_(get_ThreadState)(tid); static Bool vgdb_startup_action_done = False; @@ -1079,11 +1166,12 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) vg_assert(VG_(is_running_thread)(tid)); - VG_(dispatch_ctr) = SCHEDULING_QUANTUM + 1; + dispatch_ctr = SCHEDULING_QUANTUM; while (!VG_(is_exiting)(tid)) { - if (VG_(dispatch_ctr) == 1) { + vg_assert(dispatch_ctr >= 0); + if (dispatch_ctr == 0) { /* Our slice is done, so yield the CPU to another thread. On Linux, this doesn't sleep between sleeping and running, @@ -1130,7 +1218,8 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) exceed zero before entering the innerloop. Also also, the decrement is done before the bb is actually run, so you always get at least one decrement even if nothing happens. */ - VG_(dispatch_ctr) = SCHEDULING_QUANTUM + 1; + // FIXME is this right? + dispatch_ctr = SCHEDULING_QUANTUM; /* paranoia ... */ vg_assert(tst->tid == tid); @@ -1142,17 +1231,20 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) if (0) VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs\n", - tid, VG_(dispatch_ctr) - 1 ); + tid, dispatch_ctr - 1 ); - trc = run_thread_for_a_while ( tid ); + HWord trc[2]; /* "two_words" */ + run_thread_for_a_while( &trc[0], + &dispatch_ctr, + tid, 0/*ignored*/, False ); if (VG_(clo_trace_sched) && VG_(clo_verbosity) > 2) { - Char buf[50]; - VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc)); + HChar buf[50]; + VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc[0])); print_sched_event(tid, buf); } - if (trc == VEX_TRC_JMP_NOREDIR) { + if (trc[0] == VEX_TRC_JMP_NOREDIR) { /* If we got a request to run a no-redir version of something, do so now -- handle_noredir_jump just (creates and) runs that one translation. The flip side is that the @@ -1160,20 +1252,61 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) request -- that would be nonsensical. It can, however, return VG_TRC_BORING, which just means keep going as normal. 
*/ - trc = handle_noredir_jump(tid); - vg_assert(trc != VEX_TRC_JMP_NOREDIR); + /* Note that the fact that we need to continue with a + no-redir jump is not recorded anywhere else in this + thread's state. So we *must* execute the block right now + -- we can't fail to execute it and later resume with it, + because by then we'll have forgotten the fact that it + should be run as no-redir, but will get run as a normal + potentially-redir'd, hence screwing up. This really ought + to be cleaned up, by noting in the guest state that the + next block to be executed should be no-redir. Then we can + suspend and resume at any point, which isn't the case at + the moment. */ + handle_noredir_jump( &trc[0], + &dispatch_ctr, + tid ); + vg_assert(trc[0] != VEX_TRC_JMP_NOREDIR); + + /* This can't be allowed to happen, since it means the block + didn't execute, and we have no way to resume-as-noredir + after we get more timeslice. But I don't think it ever + can, since handle_noredir_jump will assert if the counter + is zero on entry. */ + vg_assert(trc[0] != VG_TRC_INNER_COUNTERZERO); + + /* A no-redir translation can't return with a chain-me + request, since chaining in the no-redir cache is too + complex. */ + vg_assert(trc[0] != VG_TRC_CHAIN_ME_TO_SLOW_EP + && trc[0] != VG_TRC_CHAIN_ME_TO_FAST_EP); } - switch (trc) { + switch (trc[0]) { + case VEX_TRC_JMP_BORING: + /* assisted dispatch, no event. Used by no-redir + translations to force return to the scheduler. */ case VG_TRC_BORING: /* no special event, just keep going. */ break; case VG_TRC_INNER_FASTMISS: - vg_assert(VG_(dispatch_ctr) > 1); + vg_assert(dispatch_ctr > 0); handle_tt_miss(tid); break; - + + case VG_TRC_CHAIN_ME_TO_SLOW_EP: { + if (0) VG_(printf)("sched: CHAIN_TO_SLOW_EP: %p\n", (void*)trc[1] ); + handle_chain_me(tid, (void*)trc[1], False); + break; + } + + case VG_TRC_CHAIN_ME_TO_FAST_EP: { + if (0) VG_(printf)("sched: CHAIN_TO_FAST_EP: %p\n", (void*)trc[1] ); + handle_chain_me(tid, (void*)trc[1], True); + break; + } + case VEX_TRC_JMP_CLIENTREQ: do_client_request(tid); break; @@ -1182,7 +1315,7 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) case VEX_TRC_JMP_SYS_INT129: /* x86-darwin */ case VEX_TRC_JMP_SYS_INT130: /* x86-darwin */ case VEX_TRC_JMP_SYS_SYSCALL: /* amd64-linux, ppc32-linux, amd64-darwin */ - handle_syscall(tid, trc); + handle_syscall(tid, trc[0]); if (VG_(clo_sanity_level) > 2) VG_(sanity_check_general)(True); /* sanity-check every syscall */ break; @@ -1195,13 +1328,13 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) before swapping to another. That means that short term spins waiting for hardware to poke memory won't cause a thread swap. */ - if (VG_(dispatch_ctr) > 2000) - VG_(dispatch_ctr) = 2000; + if (dispatch_ctr > 2000) + dispatch_ctr = 2000; break; case VG_TRC_INNER_COUNTERZERO: /* Timeslice is out. Let a new thread be scheduled. 
*/ - vg_assert(VG_(dispatch_ctr) == 1); + vg_assert(dispatch_ctr == 0); break; case VG_TRC_FAULT_SIGNAL: @@ -1346,7 +1479,7 @@ VgSchedReturnCode VG_(scheduler) ( ThreadId tid ) default: vg_assert2(0, "VG_(scheduler), phase 3: " - "unexpected thread return code (%u)", trc); + "unexpected thread return code (%u)", trc[0]); /* NOTREACHED */ break; diff --git a/coregrind/m_translate.c b/coregrind/m_translate.c index cd0d269713..795c000e84 100644 --- a/coregrind/m_translate.c +++ b/coregrind/m_translate.c @@ -280,6 +280,7 @@ IRSB* vg_SP_update_pass ( void* closureV, bb->tyenv = deepCopyIRTypeEnv(sb_in->tyenv); bb->next = deepCopyIRExpr(sb_in->next); bb->jumpkind = sb_in->jumpkind; + bb->offsIP = sb_in->offsIP; delta = 0; @@ -1259,14 +1260,18 @@ typedef instead of the normal one. TID is the identity of the thread requesting this translation. -*/ -Bool VG_(translate) ( ThreadId tid, - Addr64 nraddr, - Bool debugging_translation, - Int debugging_verbosity, - ULong bbs_done, - Bool allow_redirection ) + *caused_discardP returns whether or not this translation resulting + in code being dumped from the main translation cache in order to + make space for the new translation. +*/ +Bool VG_(translate) ( /*OUT*/Bool* caused_discardP, + ThreadId tid, + Addr64 nraddr, + Bool debugging_translation, + Int debugging_verbosity, + ULong bbs_done, + Bool allow_redirection ) { Addr64 addr; T_Kind kind; @@ -1280,8 +1285,9 @@ Bool VG_(translate) ( ThreadId tid, VexTranslateResult tres; VgCallbackClosure closure; - /* Make sure Vex is initialised right. */ + if (caused_discardP) *caused_discardP = False; + /* Make sure Vex is initialised right. */ static Bool vex_init_done = False; if (!vex_init_done) { @@ -1348,7 +1354,7 @@ Bool VG_(translate) ( ThreadId tid, } vg_assert(objname); VG_(printf)( - "==== SB %d (exec'd %lld) [tid %d] 0x%llx %s %s+0x%llx\n", + "==== SB %d (evchecks %lld) [tid %d] 0x%llx %s %s+0x%llx\n", VG_(get_bbs_translated)(), bbs_done, (Int)tid, addr, fnname, objname, (ULong)objoff ); @@ -1461,11 +1467,10 @@ Bool VG_(translate) ( ThreadId tid, vta.arch_host = vex_arch; vta.archinfo_host = vex_archinfo; vta.abiinfo_both = vex_abiinfo; + vta.callback_opaque = (void*)&closure; vta.guest_bytes = (UChar*)ULong_to_Ptr(addr); vta.guest_bytes_addr = (Addr64)addr; - vta.callback_opaque = (void*)&closure; vta.chase_into_ok = chase_into_ok; - vta.preamble_function = preamble_fn; vta.guest_extents = &vge; vta.host_bytes = tmpbuf; vta.host_bytes_size = N_TMPBUF; @@ -1486,22 +1491,47 @@ Bool VG_(translate) ( ThreadId tid, IRSB*,VexGuestLayout*,VexGuestExtents*, IRType,IRType) = (IRSB*(*)(void*,IRSB*,VexGuestLayout*,VexGuestExtents*,IRType,IRType))f; - vta.instrument1 = g; + vta.instrument1 = g; } /* No need for type kludgery here. */ - vta.instrument2 = need_to_handle_SP_assignment() - ? vg_SP_update_pass - : NULL; - vta.finaltidy = VG_(needs).final_IR_tidy_pass - ? VG_(tdict).tool_final_IR_tidy_pass - : NULL; - vta.needs_self_check = needs_self_check; - vta.traceflags = verbosity; - - /* Set up the dispatch-return info. For archs without a link - register, vex generates a jump back to the specified dispatch - address. Else, it just generates a branch-to-LR. */ + vta.instrument2 = need_to_handle_SP_assignment() + ? vg_SP_update_pass + : NULL; + vta.finaltidy = VG_(needs).final_IR_tidy_pass + ? 
VG_(tdict).tool_final_IR_tidy_pass + : NULL; + vta.needs_self_check = needs_self_check; + vta.preamble_function = preamble_fn; + vta.traceflags = verbosity; + vta.addProfInc = VG_(clo_profile_flags) > 0 + && kind != T_NoRedir; + + /* Set up the dispatch continuation-point info. If this is a + no-redir translation then it cannot be chained, and the chain-me + points are set to NULL to indicate that. The indir point must + also be NULL, since we can't allow this translation to do an + indir transfer -- that would take it back into the main + translation cache too. + + All this is because no-redir translations live outside the main + translation cache (in a secondary one) and chaining them would + involve more adminstrative complexity that isn't worth the + hassle, because we don't expect them to get used often. So + don't bother. */ + if (allow_redirection) { + vta.disp_cp_chain_me_to_slowEP = (void*) &VG_(disp_cp_chain_me_to_slowEP); + vta.disp_cp_chain_me_to_fastEP = (void*) &VG_(disp_cp_chain_me_to_fastEP); + vta.disp_cp_xindir = (void*) &VG_(disp_cp_xindir); + } else { + vta.disp_cp_chain_me_to_slowEP = NULL; + vta.disp_cp_chain_me_to_fastEP = NULL; + vta.disp_cp_xindir = NULL; + } + /* Thins doesn't involve chaining and so is always allowable. */ + vta.disp_cp_xassisted = (void*) &VG_(disp_cp_xassisted); +#if 0 + // FIXME tidy this up and make profiling work again # if defined(VGA_x86) || defined(VGA_amd64) if (!allow_redirection) { /* It's a no-redir translation. Will be run with the @@ -1539,6 +1569,7 @@ Bool VG_(translate) ( ThreadId tid, # else # error "Unknown arch" # endif +#endif /* 0 */ /* Sheesh. Finally, actually _do_ the translation! */ tres = LibVEX_Translate ( &vta ); @@ -1577,12 +1608,18 @@ Bool VG_(translate) ( ThreadId tid, // Note that we use nraddr (the non-redirected address), not // addr, which might have been changed by the redirection - VG_(add_to_transtab)( &vge, - nraddr, - (Addr)(&tmpbuf[0]), - tmpbuf_used, - tres.n_sc_extents > 0 ); + Bool caused_discard + = VG_(add_to_transtab)( &vge, + nraddr, + (Addr)(&tmpbuf[0]), + tmpbuf_used, + tres.n_sc_extents > 0, + tres.offs_profInc, + vex_arch ); + if (caused_discardP) + *caused_discardP = caused_discard; } else { + vg_assert(tres.offs_profInc == -1); /* -1 == unset */ VG_(add_to_unredir_transtab)( &vge, nraddr, (Addr)(&tmpbuf[0]), diff --git a/coregrind/m_transtab.c b/coregrind/m_transtab.c index fac0b1f994..3c2439ddc1 100644 --- a/coregrind/m_transtab.c +++ b/coregrind/m_transtab.c @@ -31,8 +31,10 @@ #include "pub_core_basics.h" #include "pub_core_debuglog.h" -#include "pub_core_machine.h" // For VG(machine_get_VexArchInfo) +#include "pub_core_machine.h" // For VG_(machine_get_VexArchInfo) #include "pub_core_libcbase.h" +#include "pub_core_vki.h" // to keep pub_core_libproc.h happy, sigh +#include "pub_core_libcproc.h" // VG_(invalidate_icache) #include "pub_core_libcassert.h" #include "pub_core_libcprint.h" #include "pub_core_options.h" @@ -40,12 +42,8 @@ #include "pub_core_transtab.h" #include "pub_core_aspacemgr.h" #include "pub_core_mallocfree.h" // VG_(out_of_memory_NORETURN) - -// JRS FIXME get rid of this somehow -#if defined(VGP_arm_linux) -# include "pub_core_vkiscnums.h" // __ARM_NR_cacheflush -# include "pub_core_syscall.h" // VG_(do_syscallN) -#endif +#include "pub_core_xarray.h" +#include "pub_core_dispatch.h" // For VG_(disp_cp*) addresses /* #define DEBUG_TRANSTAB */ @@ -67,6 +65,7 @@ 'deleted') and it is strongly recommended not to change this. 65521 is the largest prime <= 65535. 
*/ #define N_TTES_PER_SECTOR /*30011*/ /*40009*/ 65521 +//DEBUG-ONLY: #define N_TTES_PER_SECTOR 10007 /* Because each sector contains a hash table of TTEntries, we need to specify the maximum allowable loading, after which the sector is @@ -91,6 +90,46 @@ /*------------------ TYPES ------------------*/ +/* In edges ("to-me") in the graph created by chaining. */ +typedef + struct { + UInt from_sNo; /* sector number */ + UInt from_tteNo; /* TTE number in given sector */ + UInt from_offs; /* code offset from TCEntry::tcptr where the patch is */ + Bool to_fastEP; /* Is the patch to a fast or slow entry point? */ + } + InEdge; + + +/* Out edges ("from-me") in the graph created by chaining. */ +typedef + struct { + UInt to_sNo; /* sector number */ + UInt to_tteNo; /* TTE number in given sector */ + UInt from_offs; /* code offset in owning translation where patch is */ + } + OutEdge; + + +#define N_FIXED_IN_EDGE_ARR 3 +typedef + struct { + UInt n_fixed; /* 0 .. N_FIXED_IN_EDGE_ARR */ + InEdge fixed[N_FIXED_IN_EDGE_ARR]; + XArray* var; /* XArray* of InEdgeArr */ + } + InEdgeArr; + +#define N_FIXED_OUT_EDGE_ARR 2 +typedef + struct { + UInt n_fixed; /* 0 .. N_FIXED_OUT_EDGE_ARR */ + OutEdge fixed[N_FIXED_OUT_EDGE_ARR]; + XArray* var; /* XArray* of OutEdgeArr */ + } + OutEdgeArr; + + /* A translation-table entry. This indicates precisely which areas of guest code are included in the translation, and contains all other auxiliary info too. */ @@ -102,7 +141,7 @@ typedef Count is an entry count for the translation and is incremented by 1 every time the translation is used, if we are profiling. */ - UInt count; + ULong count; UShort weight; /* Status of the slot. Note, we need to be able to do lazy @@ -143,15 +182,70 @@ typedef // sec->ec2tte[ tte2ec_ec[i] ][ tte2ec_ix[i] ] // should be the index // of this TTEntry in the containing Sector's tt array. + + /* Admin information for chaining. 'in_edges' is a set of the + patch points which jump to this translation -- hence are + predecessors in the control flow graph. 'out_edges' points + to successors in the control flow graph -- translations to + which this one has a patched jump. In short these are just + backwards and forwards edges in the graph of patched-together + blocks. The 'in_edges' contain slightly more info, enough + that we can undo the chaining of each mentioned patch point. + The 'out_edges' list exists only so that we can visit the + 'in_edges' entries of all blocks we're patched through to, in + order to remove ourselves from then when we're deleted. */ + + /* It is possible, although very unlikely, that a block A has + more than one patched jump to block B. This could happen if + (eg) A finishes "jcond B; jmp B". + + This means in turn that B's in_edges set can list A more than + once (twice in this example). However, each such entry must + have a different from_offs, since a patched jump can only + jump to one place at once (it's meaningless for it to have + multiple destinations.) IOW, the successor and predecessor + edges in the graph are not uniquely determined by a + TTEntry --> TTEntry pair, but rather by a + (TTEntry,offset) --> TTEntry triple. + + If A has multiple edges to B then B will mention A multiple + times in its in_edges. To make things simpler, we then + require that A mentions B exactly the same number of times in + its out_edges. Furthermore, a matching out-in pair must have + the same offset (from_offs). 
This facilitates sanity + checking, and it facilitates establishing the invariant that + a out_edges set may not have duplicates when using the + equality defined by (TTEntry,offset). Hence the out_edges + and in_edges sets really do have both have set semantics. + + eg if A has been patched to B at offsets 42 and 87 (in A) + then A.out_edges = { (B,42), (B,87) } (in any order) + and B.in_edges = { (A,42), (A,87) } (in any order) + + Hence for each node pair P->Q in the graph, there's a 1:1 + mapping between P.out_edges and Q.in_edges. + */ + InEdgeArr in_edges; + OutEdgeArr out_edges; } TTEntry; +/* A structure used for mapping host code addresses back to the + relevant TTEntry. Used when doing chaining, for finding the + TTEntry to which some arbitrary patch address belongs. */ +typedef + struct { + UChar* start; + UInt len; + UInt tteNo; + } + HostExtent; + /* Finally, a sector itself. Each sector contains an array of TCEntries, which hold code, and an array of TTEntries, containing all required administrative info. Profiling is supported using the - TTEntry .count and .weight fields, if required. Each sector is - independent in that no cross-sector references are allowed. + TTEntry .count and .weight fields, if required. If the sector is not in use, all three pointers are NULL and tt_n_inuse is zero. @@ -181,6 +275,11 @@ typedef Int ec2tte_size[ECLASS_N]; Int ec2tte_used[ECLASS_N]; UShort* ec2tte[ECLASS_N]; + + /* The host extents. The [start, +len) ranges are constructed + in strictly non-overlapping order, so we can binary search + them at any time. */ + XArray* host_extents; /* XArray* of HostExtent */ } Sector; @@ -238,30 +337,6 @@ typedef */ /*global*/ __attribute__((aligned(16))) FastCacheEntry VG_(tt_fast)[VG_TT_FAST_SIZE]; -/* -#define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1) -*/ - -/* For profiling, we have a parallel array of pointers to .count - fields in TT entries. Again, these pointers must be invalidated - when translations disappear. A NULL pointer suffices to indicate - an unused slot. - - When not profiling (the normal case, VG_(clo_profile_flags) == 0), - all tt_fastN entries are set to NULL at startup and never read nor - written after that. - - When profiling (VG_(clo_profile_flags) > 0), tt_fast and tt_fastN - change together: if tt_fast[i].guest is TRANSTAB_BOGUS_GUEST_ADDR - then the corresponding tt_fastN[i] must be null. If - tt_fast[i].guest is any other value, then tt_fastN[i] *must* point - to the .count field of the corresponding TT entry. - - tt_fast and tt_fastN are referred to from assembly code - (dispatch.S). -*/ -/*global*/ UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE]; - /* Make sure we're not used before initialisation. */ static Bool init_done = False; @@ -270,27 +345,480 @@ static Bool init_done = False; /*------------------ STATS DECLS ------------------*/ /* Number of fast-cache updates and flushes done. */ -ULong n_fast_flushes = 0; -ULong n_fast_updates = 0; +static ULong n_fast_flushes = 0; +static ULong n_fast_updates = 0; /* Number of full lookups done. */ -ULong n_full_lookups = 0; -ULong n_lookup_probes = 0; +static ULong n_full_lookups = 0; +static ULong n_lookup_probes = 0; /* Number/osize/tsize of translations entered; also the number of those for which self-checking was requested. 
*/ -ULong n_in_count = 0; -ULong n_in_osize = 0; -ULong n_in_tsize = 0; -ULong n_in_sc_count = 0; +static ULong n_in_count = 0; +static ULong n_in_osize = 0; +static ULong n_in_tsize = 0; +static ULong n_in_sc_count = 0; /* Number/osize of translations discarded due to lack of space. */ -ULong n_dump_count = 0; -ULong n_dump_osize = 0; +static ULong n_dump_count = 0; +static ULong n_dump_osize = 0; /* Number/osize of translations discarded due to requests to do so. */ -ULong n_disc_count = 0; -ULong n_disc_osize = 0; +static ULong n_disc_count = 0; +static ULong n_disc_osize = 0; + + +/*-------------------------------------------------------------*/ +/*--- Misc ---*/ +/*-------------------------------------------------------------*/ + +static void* ttaux_malloc ( HChar* tag, SizeT n ) +{ + return VG_(arena_malloc)(VG_AR_TTAUX, tag, n); +} + +static void ttaux_free ( void* p ) +{ + VG_(arena_free)(VG_AR_TTAUX, p); +} + + +/*-------------------------------------------------------------*/ +/*--- Chaining support ---*/ +/*-------------------------------------------------------------*/ + +static inline TTEntry* index_tte ( UInt sNo, UInt tteNo ) +{ + vg_assert(sNo < N_SECTORS); + vg_assert(tteNo < N_TTES_PER_SECTOR); + Sector* s = §ors[sNo]; + vg_assert(s->tt); + TTEntry* tte = &s->tt[tteNo]; + vg_assert(tte->status == InUse); + return tte; +} + +static void InEdge__init ( InEdge* ie ) +{ + ie->from_sNo = -1; /* invalid */ + ie->from_tteNo = 0; + ie->from_offs = 0; + ie->to_fastEP = False; +} + +static void OutEdge__init ( OutEdge* oe ) +{ + oe->to_sNo = -1; /* invalid */ + oe->to_tteNo = 0; + oe->from_offs = 0; +} + +static void TTEntry__init ( TTEntry* tte ) +{ + VG_(memset)(tte, 0, sizeof(*tte)); +} + +static UWord InEdgeArr__size ( InEdgeArr* iea ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + return VG_(sizeXA)(iea->var); + } else { + vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR); + return iea->n_fixed; + } +} + +static void InEdgeArr__makeEmpty ( InEdgeArr* iea ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + VG_(deleteXA)(iea->var); + iea->var = NULL; + } else { + vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR); + iea->n_fixed = 0; + } +} + +static +InEdge* InEdgeArr__index ( InEdgeArr* iea, UWord i ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + return (InEdge*)VG_(indexXA)(iea->var, i); + } else { + vg_assert(i < iea->n_fixed); + return &iea->fixed[i]; + } +} + +static +void InEdgeArr__deleteIndex ( InEdgeArr* iea, UWord i ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + VG_(removeIndexXA)(iea->var, i); + } else { + vg_assert(i < iea->n_fixed); + for (; i+1 < iea->n_fixed; i++) { + iea->fixed[i] = iea->fixed[i+1]; + } + iea->n_fixed--; + } +} + +static +void InEdgeArr__add ( InEdgeArr* iea, InEdge* ie ) +{ + if (iea->var) { + vg_assert(iea->n_fixed == 0); + VG_(addToXA)(iea->var, ie); + } else { + vg_assert(iea->n_fixed <= N_FIXED_IN_EDGE_ARR); + if (iea->n_fixed == N_FIXED_IN_EDGE_ARR) { + /* The fixed array is full, so we have to initialise an + XArray and copy the fixed array into it. */ + iea->var = VG_(newXA)(ttaux_malloc, "transtab.IEA__add", + ttaux_free, + sizeof(InEdge)); + UWord i; + for (i = 0; i < iea->n_fixed; i++) { + VG_(addToXA)(iea->var, &iea->fixed[i]); + } + VG_(addToXA)(iea->var, ie); + iea->n_fixed = 0; + } else { + /* Just add to the fixed array. 
*/ + iea->fixed[iea->n_fixed++] = *ie; + } + } +} + +static UWord OutEdgeArr__size ( OutEdgeArr* oea ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + return VG_(sizeXA)(oea->var); + } else { + vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR); + return oea->n_fixed; + } +} + +static void OutEdgeArr__makeEmpty ( OutEdgeArr* oea ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + VG_(deleteXA)(oea->var); + oea->var = NULL; + } else { + vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR); + oea->n_fixed = 0; + } +} + +static +OutEdge* OutEdgeArr__index ( OutEdgeArr* oea, UWord i ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + return (OutEdge*)VG_(indexXA)(oea->var, i); + } else { + vg_assert(i < oea->n_fixed); + return &oea->fixed[i]; + } +} + +static +void OutEdgeArr__deleteIndex ( OutEdgeArr* oea, UWord i ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + VG_(removeIndexXA)(oea->var, i); + } else { + vg_assert(i < oea->n_fixed); + for (; i+1 < oea->n_fixed; i++) { + oea->fixed[i] = oea->fixed[i+1]; + } + oea->n_fixed--; + } +} + +static +void OutEdgeArr__add ( OutEdgeArr* oea, OutEdge* oe ) +{ + if (oea->var) { + vg_assert(oea->n_fixed == 0); + VG_(addToXA)(oea->var, oe); + } else { + vg_assert(oea->n_fixed <= N_FIXED_OUT_EDGE_ARR); + if (oea->n_fixed == N_FIXED_OUT_EDGE_ARR) { + /* The fixed array is full, so we have to initialise an + XArray and copy the fixed array into it. */ + oea->var = VG_(newXA)(ttaux_malloc, "transtab.OEA__add", + ttaux_free, + sizeof(OutEdge)); + UWord i; + for (i = 0; i < oea->n_fixed; i++) { + VG_(addToXA)(oea->var, &oea->fixed[i]); + } + VG_(addToXA)(oea->var, oe); + oea->n_fixed = 0; + } else { + /* Just add to the fixed array. */ + oea->fixed[oea->n_fixed++] = *oe; + } + } +} + +static +Int HostExtent__cmpOrd ( void* v1, void* v2 ) +{ + HostExtent* hx1 = (HostExtent*)v1; + HostExtent* hx2 = (HostExtent*)v2; + if (hx1->start + hx1->len <= hx2->start) return -1; + if (hx2->start + hx2->len <= hx1->start) return 1; + return 0; /* partial overlap */ +} + +static __attribute__((noinline)) +Bool find_TTEntry_from_hcode( /*OUT*/UInt* from_sNo, + /*OUT*/UInt* from_tteNo, + void* hcode ) +{ + Int i; + + /* Search order logic copied from VG_(search_transtab). */ + for (i = 0; i < N_SECTORS; i++) { + Int sno = sector_search_order[i]; + if (UNLIKELY(sno == -1)) + return False; /* run out of sectors to search */ + + Sector* sec = §ors[sno]; + XArray* /* of HostExtent */ host_extents = sec->host_extents; + vg_assert(host_extents); + + HostExtent key; + VG_(memset)(&key, 0, sizeof(key)); + key.start = hcode; + key.len = 1; + Word firstW = -1, lastW = -1; + Bool found = VG_(lookupXA_UNSAFE)( + host_extents, &key, &firstW, &lastW, + (Int(*)(void*,void*))HostExtent__cmpOrd + ); + vg_assert(firstW == lastW); // always true, even if not found + if (found) { + HostExtent* hx = VG_(indexXA)(host_extents, firstW); + UInt tteNo = hx->tteNo; + /* Do some additional sanity checks. */ + vg_assert(tteNo <= N_TTES_PER_SECTOR); + vg_assert(sec->tt[tteNo].status == InUse); + /* Can only half check that the found TTEntry contains hcode, + due to not having a length value for the hcode in the + TTEntry. */ + vg_assert((UChar*)sec->tt[tteNo].tcptr <= (UChar*)hcode); + /* Looks plausible */ + *from_sNo = sno; + *from_tteNo = (UInt)tteNo; + return True; + } + } + return False; +} + + +/* Figure out whether or not hcode is jitted code present in the main + code cache (but not in the no-redir cache). Used for sanity + checking. 
*/ +static Bool is_in_the_main_TC ( void* hcode ) +{ + Int i, sno; + for (i = 0; i < N_SECTORS; i++) { + sno = sector_search_order[i]; + if (sno == -1) + break; /* run out of sectors to search */ + if ((UChar*)hcode >= (UChar*)sectors[sno].tc + && (UChar*)hcode <= (UChar*)sectors[sno].tc_next + + sizeof(ULong) - 1) + return True; + } + return False; +} + + +/* Fulfill a chaining request, and record admin info so we + can undo it later, if required. +*/ +void VG_(tt_tc_do_chaining) ( void* from__patch_addr, + UInt to_sNo, + UInt to_tteNo, + Bool to_fastEP ) +{ + /* Get the CPU info established at startup. */ + VexArch vex_arch = VexArch_INVALID; + VG_(machine_get_VexArchInfo)( &vex_arch, NULL ); + + // host_code is where we're patching to. So it needs to + // take into account, whether we're jumping to the slow + // or fast entry point. By definition, the fast entry point + // is exactly one event check's worth of code along from + // the slow (tcptr) entry point. + TTEntry* to_tte = index_tte(to_sNo, to_tteNo); + void* host_code = ((UChar*)to_tte->tcptr) + + (to_fastEP ? LibVEX_evCheckSzB(vex_arch) : 0); + + // stay sane -- the patch point (dst) is in this sector's code cache + vg_assert( (UChar*)host_code >= (UChar*)sectors[to_sNo].tc ); + vg_assert( (UChar*)host_code <= (UChar*)sectors[to_sNo].tc_next + + sizeof(ULong) - 1 ); + // stay sane -- the patch src is in some sector's code cache + vg_assert( is_in_the_main_TC(from__patch_addr) ); + + /* Get VEX to do the patching itself. We have to hand it off + since it is host-dependent. */ + VexInvalRange vir + = LibVEX_Chain( vex_arch, + from__patch_addr, + to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP) + : &VG_(disp_cp_chain_me_to_slowEP), + (void*)host_code ); + VG_(invalidate_icache)( (void*)vir.start, vir.len ); + + /* Now do the tricky bit -- update the ch_succs and ch_preds info + for the two translations involved, so we can undo the chaining + later, which we will have to do if the to_ block gets removed + for whatever reason. */ + /* Find the TTEntry for the from__ code. This isn't simple since + we only know the patch address, which is going to be somewhere + inside the from_ block. */ + UInt from_sNo = (UInt)-1; + UInt from_tteNo = (UInt)-1; + Bool from_found + = find_TTEntry_from_hcode( &from_sNo, &from_tteNo, + from__patch_addr ); + vg_assert(from_found); + TTEntry* from_tte = index_tte(from_sNo, from_tteNo); + + /* This is the new from_ -> to_ link to add. */ + InEdge ie; + InEdge__init(&ie); + ie.from_sNo = from_sNo; + ie.from_tteNo = from_tteNo; + ie.to_fastEP = to_fastEP; + HWord from_offs = (HWord)( (UChar*)from__patch_addr + - (UChar*)from_tte->tcptr ); + vg_assert(from_offs < 100000/* let's say */); + ie.from_offs = (UInt)from_offs; + + /* This is the new to_ -> from_ backlink to add. */ + OutEdge oe; + OutEdge__init(&oe); + oe.to_sNo = to_sNo; + oe.to_tteNo = to_tteNo; + oe.from_offs = (UInt)from_offs; + + /* Add .. */ + InEdgeArr__add(&to_tte->in_edges, &ie); + OutEdgeArr__add(&from_tte->out_edges, &oe); +} + + +/* Unchain one patch, as described by the specified InEdge. For + sanity check purposes only (to check that the patched location is + as expected) it also requires the fast and slow entry point + addresses of the destination block (that is, the block that owns + this InEdge). 
*/ +__attribute__((noinline)) +static void unchain_one ( VexArch vex_arch, + InEdge* ie, + void* to_fastEPaddr, void* to_slowEPaddr ) +{ + vg_assert(ie); + TTEntry* tte + = index_tte(ie->from_sNo, ie->from_tteNo); + UChar* place_to_patch + = ((HChar*)tte->tcptr) + ie->from_offs; + UChar* disp_cp_chain_me + = ie->to_fastEP ? &VG_(disp_cp_chain_me_to_fastEP) + : &VG_(disp_cp_chain_me_to_slowEP); + UChar* place_to_jump_to_EXPECTED + = ie->to_fastEP ? to_fastEPaddr : to_slowEPaddr; + + // stay sane: both src and dst for this unchaining are + // in the main code cache + vg_assert( is_in_the_main_TC(place_to_patch) ); // src + vg_assert( is_in_the_main_TC(place_to_jump_to_EXPECTED) ); // dst + // dst check is ok because LibVEX_UnChain checks that + // place_to_jump_to_EXPECTED really is the current dst, and + // asserts if it isn't. + VexInvalRange vir + = LibVEX_UnChain( vex_arch, place_to_patch, + place_to_jump_to_EXPECTED, disp_cp_chain_me ); + VG_(invalidate_icache)( (void*)vir.start, vir.len ); +} + + +/* The specified block is about to be deleted. Update the preds and + succs of its associated blocks accordingly. This includes undoing + any chained jumps to this block. */ +static +void unchain_in_preparation_for_deletion ( VexArch vex_arch, + UInt here_sNo, UInt here_tteNo ) +{ + if (0) + VG_(printf)("QQQ unchain_in_prep %u.%u\n", here_sNo, here_tteNo); + UWord i, j, n, m; + Int evCheckSzB = LibVEX_evCheckSzB(vex_arch); + TTEntry* here_tte = index_tte(here_sNo, here_tteNo); + vg_assert(here_tte->status == InUse); + + /* Visit all InEdges owned by here_tte. */ + n = InEdgeArr__size(&here_tte->in_edges); + for (i = 0; i < n; i++) { + InEdge* ie = InEdgeArr__index(&here_tte->in_edges, i); + // Undo the chaining. + UChar* here_slow_EP = (UChar*)here_tte->tcptr; + UChar* here_fast_EP = here_slow_EP + evCheckSzB; + unchain_one(vex_arch, ie, here_fast_EP, here_slow_EP); + // Find the corresponding entry in the "from" node's out_edges, + // and remove it. + TTEntry* from_tte = index_tte(ie->from_sNo, ie->from_tteNo); + m = OutEdgeArr__size(&from_tte->out_edges); + vg_assert(m > 0); // it must have at least one entry + for (j = 0; j < m; j++) { + OutEdge* oe = OutEdgeArr__index(&from_tte->out_edges, j); + if (oe->to_sNo == here_sNo && oe->to_tteNo == here_tteNo + && oe->from_offs == ie->from_offs) + break; + } + vg_assert(j < m); // "oe must be findable" + OutEdgeArr__deleteIndex(&from_tte->out_edges, j); + } + + /* Visit all OutEdges owned by here_tte. */ + n = OutEdgeArr__size(&here_tte->out_edges); + for (i = 0; i < n; i++) { + OutEdge* oe = OutEdgeArr__index(&here_tte->out_edges, i); + // Find the corresponding entry in the "to" node's in_edges, + // and remove it. + TTEntry* to_tte = index_tte(oe->to_sNo, oe->to_tteNo); + m = InEdgeArr__size(&to_tte->in_edges); + vg_assert(m > 0); // it must have at least one entry + for (j = 0; j < m; j++) { + InEdge* ie = InEdgeArr__index(&to_tte->in_edges, j); + if (ie->from_sNo == here_sNo && ie->from_tteNo == here_tteNo + && ie->from_offs == oe->from_offs) + break; + } + vg_assert(j < m); // "ie must be findable" + InEdgeArr__deleteIndex(&to_tte->in_edges, j); + } + + InEdgeArr__makeEmpty(&here_tte->in_edges); + OutEdgeArr__makeEmpty(&here_tte->out_edges); +} /*-------------------------------------------------------------*/ @@ -398,12 +926,12 @@ UInt addEClassNo ( /*MOD*/Sector* sec, Int ec, UShort tteno ) old_sz = sec->ec2tte_size[ec]; old_ar = sec->ec2tte[ec]; new_sz = old_sz==0 ? 8 : old_sz<64 ? 
2*old_sz : (3*old_sz)/2; - new_ar = VG_(arena_malloc)(VG_AR_TTAUX, "transtab.aECN.1", - new_sz * sizeof(UShort)); + new_ar = ttaux_malloc("transtab.aECN.1", + new_sz * sizeof(UShort)); for (i = 0; i < old_sz; i++) new_ar[i] = old_ar[i]; if (old_ar) - VG_(arena_free)(VG_AR_TTAUX, old_ar); + ttaux_free(old_ar); sec->ec2tte_size[ec] = new_sz; sec->ec2tte[ec] = new_ar; @@ -575,7 +1103,6 @@ static Bool sanity_check_eclasses_in_sector ( Sector* sec ) /* forwards */ static Bool sanity_check_redir_tt_tc ( void ); -static Bool sanity_check_fastcache ( void ); static Bool sanity_check_sector_search_order ( void ) { @@ -630,8 +1157,6 @@ static Bool sanity_check_all_sectors ( void ) } if ( !sanity_check_redir_tt_tc() ) return False; - if ( !sanity_check_fastcache() ) - return False; if ( !sanity_check_sector_search_order() ) return False; return True; @@ -669,13 +1194,11 @@ static inline UInt HASH_TT ( Addr64 key ) return k32 % N_TTES_PER_SECTOR; } -static void setFastCacheEntry ( Addr64 key, ULong* tcptr, UInt* count ) +static void setFastCacheEntry ( Addr64 key, ULong* tcptr ) { UInt cno = (UInt)VG_TT_FAST_HASH(key); VG_(tt_fast)[cno].guest = (Addr)key; VG_(tt_fast)[cno].host = (Addr)tcptr; - if (VG_(clo_profile_flags) > 0) - VG_(tt_fastN)[cno] = count; n_fast_updates++; /* This shouldn't fail. It should be assured by m_translate which should reject any attempt to make translation of code @@ -683,23 +1206,7 @@ static void setFastCacheEntry ( Addr64 key, ULong* tcptr, UInt* count ) vg_assert(VG_(tt_fast)[cno].guest != TRANSTAB_BOGUS_GUEST_ADDR); } -/* Invalidate the fast cache's counter array, VG_(tt_fastN). */ -static void invalidateFastNCache ( void ) -{ - UInt j; - vg_assert(VG_TT_FAST_SIZE > 0 && (VG_TT_FAST_SIZE % 4) == 0); - for (j = 0; j < VG_TT_FAST_SIZE; j += 4) { - VG_(tt_fastN)[j+0] = NULL; - VG_(tt_fastN)[j+1] = NULL; - VG_(tt_fastN)[j+2] = NULL; - VG_(tt_fastN)[j+3] = NULL; - } - vg_assert(j == VG_TT_FAST_SIZE); -} - -/* Invalidate the fast cache VG_(tt_fast). If profiling, also - invalidate the fast cache's counter array VG_(tt_fastN), otherwise - don't touch it. */ +/* Invalidate the fast cache VG_(tt_fast). */ static void invalidateFastCache ( void ) { UInt j; @@ -713,42 +1220,19 @@ static void invalidateFastCache ( void ) VG_(tt_fast)[j+3].guest = TRANSTAB_BOGUS_GUEST_ADDR; } - if (VG_(clo_profile_flags) > 0) - invalidateFastNCache(); - vg_assert(j == VG_TT_FAST_SIZE); n_fast_flushes++; } -static Bool sanity_check_fastcache ( void ) +/* Returns True if the sector has been used before (hence, if we have + to eject existing code in it), False if it's never been used + before. 
*/ +static Bool initialiseSector ( Int sno ) { - UInt j; - if (0) VG_(printf)("sanity check fastcache\n"); - if (VG_(clo_profile_flags) > 0) { - /* profiling */ - for (j = 0; j < VG_TT_FAST_SIZE; j++) { - if (VG_(tt_fastN)[j] == NULL - && VG_(tt_fast)[j].guest != TRANSTAB_BOGUS_GUEST_ADDR) - return False; - if (VG_(tt_fastN)[j] != NULL - && VG_(tt_fast)[j].guest == TRANSTAB_BOGUS_GUEST_ADDR) - return False; - } - } else { - /* not profiling */ - for (j = 0; j < VG_TT_FAST_SIZE; j++) { - if (VG_(tt_fastN)[j] != NULL) - return False; - } - } - return True; -} - -static void initialiseSector ( Int sno ) -{ - Int i; - SysRes sres; + Int i; + SysRes sres; Sector* sec; + Bool has_been_used_before = False; vg_assert(isValidSector(sno)); { Bool sane = sanity_check_sector_search_order(); @@ -768,6 +1252,7 @@ static void initialiseSector ( Int sno ) vg_assert(sec->ec2tte_used[i] == 0); vg_assert(sec->ec2tte[i] == NULL); } + vg_assert(sec->host_extents == NULL); VG_(debugLog)(1,"transtab", "allocate sector %d\n", sno); @@ -793,6 +1278,12 @@ static void initialiseSector ( Int sno ) sec->tt[i].n_tte2ec = 0; } + /* Set up the host_extents array. */ + sec->host_extents + = VG_(newXA)(ttaux_malloc, "transtab.initialiseSector(host_extents)", + ttaux_free, + sizeof(HostExtent)); + /* Add an entry in the sector_search_order */ for (i = 0; i < N_SECTORS; i++) { if (sector_search_order[i] == -1) @@ -808,11 +1299,16 @@ static void initialiseSector ( Int sno ) /* Sector has been used before. Dump the old contents. */ VG_(debugLog)(1,"transtab", "recycle sector %d\n", sno); + has_been_used_before = True; vg_assert(sec->tt != NULL); vg_assert(sec->tc_next != NULL); n_dump_count += sec->tt_n_inuse; + VexArch vex_arch = VexArch_INVALID; + VG_(machine_get_VexArchInfo)( &vex_arch, NULL ); + /* Visit each just-about-to-be-abandoned translation. */ +VG_(printf)("QQQ unlink-entire-sector: %d START\n", sno); for (i = 0; i < N_TTES_PER_SECTOR; i++) { if (sec->tt[i].status == InUse) { vg_assert(sec->tt[i].n_tte2ec >= 1); @@ -824,12 +1320,14 @@ static void initialiseSector ( Int sno ) sec->tt[i].entry, sec->tt[i].vge ); } + unchain_in_preparation_for_deletion(vex_arch, sno, i); } else { vg_assert(sec->tt[i].n_tte2ec == 0); } sec->tt[i].status = Empty; sec->tt[i].n_tte2ec = 0; } +VG_(printf)("QQQ unlink-entire-sector: %d END\n", sno); /* Free up the eclass structures. */ for (i = 0; i < ECLASS_N; i++) { @@ -838,13 +1336,18 @@ static void initialiseSector ( Int sno ) vg_assert(sec->ec2tte[i] == NULL); } else { vg_assert(sec->ec2tte[i] != NULL); - VG_(arena_free)(VG_AR_TTAUX, sec->ec2tte[i]); + ttaux_free(sec->ec2tte[i]); sec->ec2tte[i] = NULL; sec->ec2tte_size[i] = 0; sec->ec2tte_used[i] = 0; } } + /* Empty out the host extents array. */ + vg_assert(sec->host_extents != NULL); + VG_(dropTailXA)(sec->host_extents, VG_(sizeXA)(sec->host_extents)); + vg_assert(VG_(sizeXA)(sec->host_extents) == 0); + /* Sanity check: ensure it is already in sector_search_order[]. */ for (i = 0; i < N_SECTORS; i++) { @@ -865,54 +1368,8 @@ static void initialiseSector ( Int sno ) { Bool sane = sanity_check_sector_search_order(); vg_assert(sane); } -} - -static void invalidate_icache ( void *ptr, Int nbytes ) -{ -# if defined(VGA_ppc32) || defined(VGA_ppc64) - Addr startaddr = (Addr) ptr; - Addr endaddr = startaddr + nbytes; - Addr cls; - Addr addr; - VexArchInfo vai; - - if (nbytes == 0) return; - vg_assert(nbytes > 0); - - VG_(machine_get_VexArchInfo)( NULL, &vai ); - cls = vai.ppc_cache_line_szB; - - /* Stay sane .. 
*/ - vg_assert(cls == 32 || cls == 64 || cls == 128); - - startaddr &= ~(cls - 1); - for (addr = startaddr; addr < endaddr; addr += cls) { - __asm__ __volatile__("dcbst 0,%0" : : "r" (addr)); - } - __asm__ __volatile__("sync"); - for (addr = startaddr; addr < endaddr; addr += cls) { - __asm__ __volatile__("icbi 0,%0" : : "r" (addr)); - } - __asm__ __volatile__("sync; isync"); - -# elif defined(VGA_x86) - /* no need to do anything, hardware provides coherence */ - -# elif defined(VGA_amd64) - /* no need to do anything, hardware provides coherence */ - -# elif defined(VGA_s390x) - /* no need to do anything, hardware provides coherence */ - -# elif defined(VGP_arm_linux) - /* ARM cache flushes are privileged, so we must defer to the kernel. */ - Addr startaddr = (Addr) ptr; - Addr endaddr = startaddr + nbytes; - VG_(do_syscall2)(__NR_ARM_cacheflush, startaddr, endaddr); -# else -# error "Unknown ARCH" -# endif + return has_been_used_before; } @@ -921,18 +1378,28 @@ static void invalidate_icache ( void *ptr, Int nbytes ) pre: youngest_sector points to a valid (although possibly full) sector. + + Returns True if the call caused any existing translation(s) to get + thrown away in order to make space for this one. */ -void VG_(add_to_transtab)( VexGuestExtents* vge, +Bool VG_(add_to_transtab)( VexGuestExtents* vge, Addr64 entry, AddrH code, UInt code_len, - Bool is_self_checking ) + Bool is_self_checking, + Int offs_profInc, + VexArch arch_host ) { Int tcAvailQ, reqdQ, y, i; ULong *tcptr, *tcptr2; UChar* srcP; UChar* dstP; + /* We need to tell the caller whether this call caused any code to + be thrown away due to the TC becoming full, and hence the oldest + Sector to be emptied out and recycled. */ + Bool caused_code_discarding = False; + vg_assert(init_done); vg_assert(vge->n_used >= 1 && vge->n_used <= 3); @@ -952,8 +1419,10 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, y = youngest_sector; vg_assert(isValidSector(y)); - if (sectors[y].tc == NULL) - initialiseSector(y); + if (sectors[y].tc == NULL) { + Bool used_before = initialiseSector(y); + vg_assert(!used_before); + } /* Try putting the translation in this sector. */ reqdQ = (code_len + 7) >> 3; @@ -983,7 +1452,8 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, if (youngest_sector >= N_SECTORS) youngest_sector = 0; y = youngest_sector; - initialiseSector(y); + caused_code_discarding = initialiseSector(y); + } /* Be sure ... */ @@ -1002,13 +1472,10 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, dstP = (UChar*)tcptr; srcP = (UChar*)code; - for (i = 0; i < code_len; i++) - dstP[i] = srcP[i]; + VG_(memcpy)(dstP, srcP, code_len); sectors[y].tc_next += reqdQ; sectors[y].tt_n_inuse++; - invalidate_icache( dstP, code_len ); - /* more paranoia */ tcptr2 = sectors[y].tc_next; vg_assert(tcptr2 >= §ors[y].tc[0]); @@ -1027,6 +1494,7 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, i = 0; } + TTEntry__init(§ors[y].tt[i]); sectors[y].tt[i].status = InUse; sectors[y].tt[i].tcptr = tcptr; sectors[y].tt[i].count = 0; @@ -1034,11 +1502,42 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, sectors[y].tt[i].vge = *vge; sectors[y].tt[i].entry = entry; + /* Patch in the profile counter location, if necessary. 
*/ + if (offs_profInc != -1) { + vg_assert(offs_profInc >= 0 && offs_profInc < code_len); + VexInvalRange vir + = LibVEX_PatchProfInc( arch_host, + dstP + offs_profInc, + §ors[y].tt[i].count ); + VG_(invalidate_icache)( (void*)vir.start, vir.len ); + } + + VG_(invalidate_icache)( dstP, code_len ); + + /* Add this entry to the host_extents map, checking that we're + adding in order. */ + { HostExtent hx; + hx.start = (UChar*)tcptr; + hx.len = code_len; + hx.tteNo = i; + vg_assert(hx.len > 0); /* bsearch fails w/ zero length entries */ + XArray* hx_array = sectors[y].host_extents; + vg_assert(hx_array); + Word n = VG_(sizeXA)(hx_array); + if (n > 0) { + HostExtent* hx_prev = (HostExtent*)VG_(indexXA)(hx_array, n-1); + vg_assert(hx_prev->start + hx_prev->len <= hx.start); + } + VG_(addToXA)(hx_array, &hx); + } + /* Update the fast-cache. */ - setFastCacheEntry( entry, tcptr, §ors[y].tt[i].count ); + setFastCacheEntry( entry, tcptr ); /* Note the eclass numbers for this translation. */ upd_eclasses_after_add( §ors[y], i ); + + return caused_code_discarding; } @@ -1046,7 +1545,9 @@ void VG_(add_to_transtab)( VexGuestExtents* vge, requested, a successful search can also cause the fast-caches to be updated. */ -Bool VG_(search_transtab) ( /*OUT*/AddrH* result, +Bool VG_(search_transtab) ( /*OUT*/AddrH* res_hcode, + /*OUT*/UInt* res_sNo, + /*OUT*/UInt* res_tteNo, Addr64 guest_addr, Bool upd_cache ) { @@ -1076,10 +1577,13 @@ Bool VG_(search_transtab) ( /*OUT*/AddrH* result, /* found it */ if (upd_cache) setFastCacheEntry( - guest_addr, sectors[sno].tt[k].tcptr, - §ors[sno].tt[k].count ); - if (result) - *result = (AddrH)sectors[sno].tt[k].tcptr; + guest_addr, sectors[sno].tt[k].tcptr ); + if (res_hcode) + *res_hcode = (AddrH)sectors[sno].tt[k].tcptr; + if (res_sNo) + *res_sNo = sno; + if (res_tteNo) + *res_tteNo = k; /* pull this one one step closer to the front. For large apps this more or less halves the number of required probes. */ @@ -1147,16 +1651,23 @@ Bool overlaps ( Addr64 start, ULong range, VexGuestExtents* vge ) /* Delete a tt entry, and update all the eclass data accordingly. */ -static void delete_tte ( /*MOD*/Sector* sec, Int tteno ) +static void delete_tte ( /*MOD*/Sector* sec, UInt secNo, Int tteno, + VexArch vex_arch ) { Int i, ec_num, ec_idx; TTEntry* tte; + /* sec and secNo are mutually redundant; cross-check. */ + vg_assert(sec == §ors[secNo]); + vg_assert(tteno >= 0 && tteno < N_TTES_PER_SECTOR); tte = &sec->tt[tteno]; vg_assert(tte->status == InUse); vg_assert(tte->n_tte2ec >= 1 && tte->n_tte2ec <= 3); + /* Unchain .. */ + unchain_in_preparation_for_deletion(vex_arch, secNo, tteno); + /* Deal with the ec-to-tte links first. */ for (i = 0; i < tte->n_tte2ec; i++) { ec_num = (Int)tte->tte2ec_ec[i]; @@ -1192,9 +1703,10 @@ static void delete_tte ( /*MOD*/Sector* sec, Int tteno ) only consider translations in the specified eclass. */ static -Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, +Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, UInt secNo, Addr64 guest_start, ULong range, - Int ec ) + Int ec, + VexArch vex_arch ) { Int i; UShort tteno; @@ -1218,7 +1730,7 @@ Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, if (overlaps( guest_start, range, &tte->vge )) { anyDeld = True; - delete_tte( sec, (Int)tteno ); + delete_tte( sec, secNo, (Int)tteno, vex_arch ); } } @@ -1231,8 +1743,9 @@ Bool delete_translations_in_sector_eclass ( /*MOD*/Sector* sec, slow way, by inspecting all translations in sec. 
*/ static -Bool delete_translations_in_sector ( /*MOD*/Sector* sec, - Addr64 guest_start, ULong range ) +Bool delete_translations_in_sector ( /*MOD*/Sector* sec, UInt secNo, + Addr64 guest_start, ULong range, + VexArch vex_arch ) { Int i; Bool anyDeld = False; @@ -1241,7 +1754,7 @@ Bool delete_translations_in_sector ( /*MOD*/Sector* sec, if (sec->tt[i].status == InUse && overlaps( guest_start, range, &sec->tt[i].vge )) { anyDeld = True; - delete_tte( sec, i ); + delete_tte( sec, secNo, i, vex_arch ); } } @@ -1271,6 +1784,9 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, if (range == 0) return; + VexArch vex_arch = VexArch_INVALID; + VG_(machine_get_VexArchInfo)( &vex_arch, NULL ); + /* There are two different ways to do this. If the range fits within a single address-range equivalence @@ -1310,9 +1826,13 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, if (sec->tc == NULL) continue; anyDeleted |= delete_translations_in_sector_eclass( - sec, guest_start, range, ec ); + sec, sno, guest_start, range, ec, + vex_arch + ); anyDeleted |= delete_translations_in_sector_eclass( - sec, guest_start, range, ECLASS_MISC ); + sec, sno, guest_start, range, ECLASS_MISC, + vex_arch + ); } } else { @@ -1327,7 +1847,7 @@ void VG_(discard_translations) ( Addr64 guest_start, ULong range, if (sec->tc == NULL) continue; anyDeleted |= delete_translations_in_sector( - sec, guest_start, range ); + sec, sno, guest_start, range, vex_arch ); } } @@ -1483,7 +2003,7 @@ void VG_(add_to_unredir_transtab)( VexGuestExtents* vge, for (j = 0; j < code_len; j++) dstP[j] = srcP[j]; - invalidate_icache( dstP, code_len ); + VG_(invalidate_icache)( dstP, code_len ); unredir_tt[i].inUse = True; unredir_tt[i].vge = *vge; @@ -1573,18 +2093,15 @@ void VG_(init_tt_tc) ( void ) sectors[i].ec2tte_used[j] = 0; sectors[i].ec2tte[j] = NULL; } + sectors[i].host_extents = NULL; } /* Initialise the sector_search_order hint table. */ for (i = 0; i < N_SECTORS; i++) sector_search_order[i] = -1; - /* Initialise the fast caches. If not profiling (the usual case), - we have to explicitly invalidate the fastN cache as - invalidateFastCache() won't do that for us. */ + /* Initialise the fast cache. */ invalidateFastCache(); - if (VG_(clo_profile_flags) == 0) - invalidateFastNCache(); /* and the unredir tt/tc */ init_unredir_tt_tc(); diff --git a/coregrind/m_xarray.c b/coregrind/m_xarray.c index 8859cec2e6..e9461ef7a2 100644 --- a/coregrind/m_xarray.c +++ b/coregrind/m_xarray.c @@ -311,6 +311,20 @@ void VG_(dropHeadXA) ( XArray* xao, Word n ) xa->usedsizeE -= n; } +void VG_(removeIndexXA)( XArray* xao, Word n ) +{ + struct _XArray* xa = (struct _XArray*)xao; + vg_assert(xa); + vg_assert(n >= 0); + vg_assert(n < xa->usedsizeE); + if (n+1 < xa->usedsizeE) { + VG_(memmove)( ((char*)xa->arr) + (n+0) * xa->elemSzB, + ((char*)xa->arr) + (n+1) * xa->elemSzB, + (xa->usedsizeE - n - 1) * xa->elemSzB ); + } + xa->usedsizeE--; +} + void VG_(getContentsXA_UNSAFE)( XArray* xao, /*OUT*/void** ctsP, /*OUT*/Word* usedP ) diff --git a/coregrind/pub_core_dispatch.h b/coregrind/pub_core_dispatch.h index 6de7fcf323..08cc3f29f6 100644 --- a/coregrind/pub_core_dispatch.h +++ b/coregrind/pub_core_dispatch.h @@ -41,56 +41,38 @@ #include "pub_core_dispatch_asm.h" -/* This subroutine is called from the C world. It is passed - a pointer to the VEX guest state (arch.vex). It must run code - from the instruction pointer in the guest state, and exit when - VG_(dispatch_ctr) reaches zero, or we need to defer to the scheduler. 
+/* Run translations, with the given guest state, and starting by + running the host code at 'host_addr'. It is almost always the case + that host_addr is the translation for guest_state.guest_IP, that + is, host_addr is what it would be if we looked up the address of + the translation corresponding to guest_state.guest_IP. + + The only case where this isn't true is where we're running a + no-redir translation. In this case host_addr is the address of the + alternative (non-redirected) translation for guest_state.guest_IP. + The return value must indicate why it returned back to the scheduler. It can also be exited if the executing code throws a non-resumable signal, for example SIGSEGV, in which case control longjmp()s back past here. - If do_profiling is nonzero, the profile counters arrays should be - updated for each translation run. - - This code simply handles the common case fast -- when the translation - address is found in the translation cache. For anything else, the - scheduler does the work. - - NOTE, VG_(run_innerloop) MUST NOT BE USED for noredir translations. - Instead use VG_(run_a_noredir_translation). -*/ -extern -UWord VG_(run_innerloop) ( void* guest_state, UWord do_profiling ); -#if defined(VGA_x86) || defined(VGA_amd64) -/* We need to locate a couple of labels inside VG_(run_innerloop), so - that Vex can add branches to them from generated code. Hence the - following somewhat bogus decls. At least on x86 and amd64. ppc32 - and ppc64 use straightforward bl-blr to get from dispatcher to - translation and back and so do not need these labels. */ -extern Addr VG_(run_innerloop__dispatch_unassisted_unprofiled); -extern Addr VG_(run_innerloop__dispatch_assisted_unprofiled); -extern Addr VG_(run_innerloop__dispatch_unassisted_profiled); -extern Addr VG_(run_innerloop__dispatch_assisted_profiled); -#endif - - -/* Run a no-redir translation. argblock points to 4 UWords, 2 to carry args - and 2 to carry results: - 0: input: ptr to translation - 1: input: ptr to guest state - 2: output: next guest PC - 3: output: guest state pointer afterwards (== thread return code) - MUST NOT BE USED for non-noredir (normal) translations. + two_words holds the return values (two words). First is + a TRC value. Second is generally unused, except in the case + where we have to return a chain-me request. */ -extern void VG_(run_a_noredir_translation) ( volatile UWord* argblock ); -#if defined(VGA_x86) || defined(VGA_amd64) -/* We need to a label inside VG_(run_a_noredir_translation), so that - Vex can add branches to them from generated code. Hence the - following somewhat bogus decl. */ -extern Addr VG_(run_a_noredir_translation__return_point); -#endif - +HWord VG_(disp_run_translations)( HWord* two_words, + void* guest_state, + Addr host_addr ); + +/* We need to know addresses of the continuation-point (cp_) labels so + we can tell VEX what they are. They will get baked into the code + VEX generates. The UChar is entirely mythical, but we need to + state _some_ type, so as to keep gcc happy. */ +UChar VG_(disp_cp_chain_me_to_slowEP); +UChar VG_(disp_cp_chain_me_to_fastEP); +UChar VG_(disp_cp_xindir); +UChar VG_(disp_cp_xassisted); +UChar VG_(disp_cp_evcheck_fail); #endif // __PUB_CORE_DISPATCH_H diff --git a/coregrind/pub_core_dispatch_asm.h b/coregrind/pub_core_dispatch_asm.h index 3e7b4a20d7..31d2f59a70 100644 --- a/coregrind/pub_core_dispatch_asm.h +++ b/coregrind/pub_core_dispatch_asm.h @@ -43,16 +43,20 @@ /* And some more of our own. 
These must not have the same values as those from libvex_trc_values.h. (viz, 60 or below is safe). + (The following comment is no longer relevant, but is retained + for historical purposes.) These values *must* be odd (have bit 0 set) because the dispatchers (coregrind/m_dispatch/dispatch-*-*.S) use this fact to distinguish a TRC value from the unchanged baseblock pointer -- which has 0 as its lowest bit. */ -#define VG_TRC_BORING 29 /* no event; just keep going */ -#define VG_TRC_INNER_FASTMISS 37 /* TRC only; means fast-cache miss. */ -#define VG_TRC_INNER_COUNTERZERO 41 /* TRC only; means bb ctr == 0 */ -#define VG_TRC_FAULT_SIGNAL 43 /* TRC only; got sigsegv/sigbus */ -#define VG_TRC_INVARIANT_FAILED 47 /* TRC only; invariant violation */ +#define VG_TRC_BORING 29 /* no event; just keep going */ +#define VG_TRC_INNER_FASTMISS 37 /* TRC only; means fast-cache miss. */ +#define VG_TRC_INNER_COUNTERZERO 41 /* TRC only; means bb ctr == 0 */ +#define VG_TRC_FAULT_SIGNAL 43 /* TRC only; got sigsegv/sigbus */ +#define VG_TRC_INVARIANT_FAILED 47 /* TRC only; invariant violation */ +#define VG_TRC_CHAIN_ME_TO_SLOW_EP 49 /* TRC only; chain to slow EP */ +#define VG_TRC_CHAIN_ME_TO_FAST_EP 51 /* TRC only; chain to fast EP */ #endif // __PUB_CORE_DISPATCH_ASM_H diff --git a/coregrind/pub_core_libcproc.h b/coregrind/pub_core_libcproc.h index cd9c18a29b..e573fd81a8 100644 --- a/coregrind/pub_core_libcproc.h +++ b/coregrind/pub_core_libcproc.h @@ -84,6 +84,10 @@ extern void VG_(do_atfork_pre) ( ThreadId tid ); extern void VG_(do_atfork_parent) ( ThreadId tid ); extern void VG_(do_atfork_child) ( ThreadId tid ); +// icache invalidation +extern void VG_(invalidate_icache) ( void *ptr, SizeT nbytes ); + + #endif // __PUB_CORE_LIBCPROC_H /*--------------------------------------------------------------------*/ diff --git a/coregrind/pub_core_translate.h b/coregrind/pub_core_translate.h index c6c24055d7..3182f4f796 100644 --- a/coregrind/pub_core_translate.h +++ b/coregrind/pub_core_translate.h @@ -37,12 +37,13 @@ //-------------------------------------------------------------------- extern -Bool VG_(translate) ( ThreadId tid, - Addr64 orig_addr, - Bool debugging_translation, - Int debugging_verbosity, - ULong bbs_done, - Bool allow_redirection ); +Bool VG_(translate) ( /*OUT*/Bool* caused_discardP, + ThreadId tid, + Addr64 orig_addr, + Bool debugging_translation, + Int debugging_verbosity, + ULong bbs_done, + Bool allow_redirection ); extern void VG_(print_translation_stats) ( void ); diff --git a/coregrind/pub_core_transtab.h b/coregrind/pub_core_transtab.h index 34ffee96e6..52dc5a7eee 100644 --- a/coregrind/pub_core_transtab.h +++ b/coregrind/pub_core_transtab.h @@ -39,9 +39,8 @@ #include "pub_core_transtab_asm.h" -/* The fast-cache for tt-lookup, and for finding counters. Unused - entries are denoted by .guest == 1, which is assumed to be a bogus - address for all guest code. */ +/* The fast-cache for tt-lookup. Unused entries are denoted by .guest + == 1, which is assumed to be a bogus address for all guest code. 
*/ typedef struct { Addr guest; @@ -54,18 +53,26 @@ extern __attribute__((aligned(16))) #define TRANSTAB_BOGUS_GUEST_ADDR ((Addr)1) -extern UInt* VG_(tt_fastN)[VG_TT_FAST_SIZE]; - extern void VG_(init_tt_tc) ( void ); extern -void VG_(add_to_transtab)( VexGuestExtents* vge, +Bool VG_(add_to_transtab)( VexGuestExtents* vge, Addr64 entry, AddrH code, UInt code_len, - Bool is_self_checking ); + Bool is_self_checking, + Int offs_profInc, + VexArch arch_host ); -extern Bool VG_(search_transtab) ( /*OUT*/AddrH* result, +extern +void VG_(tt_tc_do_chaining) ( void* from__patch_addr, + UInt to_sNo, + UInt to_tteNo, + Bool to_fastEP ); + +extern Bool VG_(search_transtab) ( /*OUT*/AddrH* res_hcode, + /*OUT*/UInt* res_sNo, + /*OUT*/UInt* res_tteNo, Addr64 guest_addr, Bool upd_cache ); diff --git a/coregrind/pub_core_transtab_asm.h b/coregrind/pub_core_transtab_asm.h index 6d43a7ac3c..00adced522 100644 --- a/coregrind/pub_core_transtab_asm.h +++ b/coregrind/pub_core_transtab_asm.h @@ -42,8 +42,9 @@ ever be used. So instead the function is '(address >>u 2)[VG_TT_FAST_BITS-1 : 0]' on those targets. - On ARM we do like ppc32/ppc64, although that will have to be - revisited when we come to implement Thumb. + On ARM we shift by 1, since Thumb insns can be of size 2, hence to + minimise collisions and maximise cache utilisation we need to take + into account all but the least significant bit. On s390x the rightmost bit of an instruction address is zero. For best table utilization shift the address to the right by 1 bit. */ diff --git a/docs/Makefile.am b/docs/Makefile.am index 82fa93ab35..2deeb011c9 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -44,6 +44,7 @@ EXTRA_DIST = \ internals/register-uses.txt \ internals/release-HOWTO.txt \ internals/segments-seginfos.txt \ + internals/t-chaining-notes.txt \ internals/threads-syscalls-signals.txt \ internals/tm-mutexstates.dot \ internals/tm-threadstates.dot \ diff --git a/docs/internals/t-chaining-notes.txt b/docs/internals/t-chaining-notes.txt new file mode 100644 index 0000000000..be2d70bda9 --- /dev/null +++ b/docs/internals/t-chaining-notes.txt @@ -0,0 +1,201 @@ + +DO NOT MERGE +~~~~~~~~~~~ + +Changes memcheck/tests/Makefile.am w.r.t. -mfloat-abi=softfp +Ditto none/tests/arm/Makefile.am + + +Verification todo +~~~~~~~~~~~~~~~~~ +check that illegal insns on all targets don't cause the _toIR.c's to +assert. + +check also with --vex-guest-chase-cond=yes + +check that all targets can run their insn set tests with +--vex-guest-max-insns=1. + + +Cleanups +~~~~~~~~ +host_arm_isel.c and host_arm_defs.c: get rid of global var arm_hwcaps. + +host_x86_defs.c, host_amd64_defs.c: return proper VexInvalRange +records from the patchers, instead of {0,0}, so that transparent +self hosting works properly. + + +Optimisations +~~~~~~~~~~~~~ +all targets: change VG_(stats__n_xindirs) to a 32 bit counter, and +empty out every now and again. + +amd64: XDirect: write const value to guest_RIP using single +insn when the value is < 0x8000'0000 + +arm: chain_XDirect: generate short form jumps when possible + +arm codegen: Generate ORRS for CmpwNEZ32(Or32(x,y)) + +all targets: when nuking an entire sector, don't bother to undo the +patching for any translations within the sector (nor with their +invalidations). + +(somewhat implausible) for jumps to disp_cp_indir, have multiple +copies of disp_cp_indir, one for each of the possible registers that +could have held the target guest address before jumping to the stub. 
+Then disp_cp_indir wouldn't have to reload it from memory each time. +Might also have the effect of spreading out the indirect mispredict +burden somewhat (across the multiple copies.) + + +Implementation notes +~~~~~~~~~~~~~~~~~~~~ +T-chaining changes -- summary + +* The code generators (host_blah_isel.c, host_blah_defs.[ch]) interact + more closely with Valgrind than before. In particular the + instruction selectors must use one of 3 different kinds of + control-transfer instructions: XDirect, XIndir and XAssisted. + All archs must use these the same; no more ad-hoc control transfer + instructions. + (more detail below) + + +* With T-chaining, translations can jump between each other without + going through the dispatcher loop every time. This means that the + event check (counter dec, and exit if negative) the dispatcher loop + previously did now needs to be compiled into each translation. + + +* The assembly dispatcher code (dispatch-arch-os.S) is still + present. It still provides table lookup services for + indirect branches, but it also provides a new feature: + dispatch points, to which the generated code jumps. There + are 5: + + VG_(disp_cp_chain_me_to_slowEP): + VG_(disp_cp_chain_me_to_fastEP): + These are chain-me requests, used for Boring conditional and + unconditional jumps to destinations known at JIT time. The + generated code calls these (doesn't jump to them) and the + stub recovers the return address. These calls never return; + instead the call is done so that the stub knows where the + calling point is. It needs to know this so it can patch + the calling point to the requested destination. + VG_(disp_cp_xindir): + Old-style table lookup and go; used for indirect jumps + VG_(disp_cp_xassisted): + Most general and slowest kind. Can transfer to anywhere, but + first returns to scheduler to do some other event (eg a syscall) + before continuing. + VG_(disp_cp_evcheck_fail): + Code jumps here when the event check fails. + + +* new instructions in backends: XDirect, XIndir and XAssisted. + XDirect is used for chainable jumps. It is compiled into a + call to VG_(disp_cp_chain_me_to_slowEP) or + VG_(disp_cp_chain_me_to_fastEP). + + XIndir is used for indirect jumps. It is compiled into a jump + to VG_(disp_cp_xindir) + + XAssisted is used for "assisted" (do something first, then jump) + transfers. It is compiled into a jump to VG_(disp_cp_xassisted) + + All 3 of these may be conditional. + + More complexity: in some circumstances (no-redir translations) + all transfers must be done with XAssisted. In such cases the + instruction selector will be told this. + + +* Patching: XDirect is compiled basically into + %r11 = &VG_(disp_cp_chain_me_to_{slow,fast}EP) + call *%r11 + Backends must provide a function (eg) chainXDirect_AMD64 + which converts it into a jump to a specified destination + jmp $delta-of-PCs + or + %r11 = 64-bit immediate + jmpq *%r11 + depending on branch distance. + + Backends must provide a function (eg) unchainXDirect_AMD64 + which restores the original call-to-the-stub version. + + +* Event checks. Each translation now has two entry points, + the slow one (slowEP) and fast one (fastEP). Like this: + + slowEP: + counter-- + if (counter < 0) goto VG_(disp_cp_evcheck_fail) + fastEP: + (rest of the translation) + + slowEP is used for control flow transfers that are or might be + a back edge in the control flow graph. Insn selectors are + given the address of the highest guest byte in the block so + they can determine which edges are definitely not back edges. 
+ + The counter is placed in the first 8 bytes of the guest state, + and the address of VG_(disp_cp_evcheck_fail) is placed in + the next 8 bytes. This allows very compact checks on all + targets, since no immediates need to be synthesised, eg: + + decq 0(%baseblock-pointer) + jns fastEP + jmpq *8(baseblock-pointer) + fastEP: + + On amd64 a non-failing check is therefore 2 insns; all 3 occupy + just 8 bytes. + + On amd64 the event check is created by a special single + pseudo-instruction AMD64_EvCheck. + + +* BB profiling (for --profile-flags=). The dispatch assembly + dispatch-arch-os.S no longer deals with this and so is much + simplified. Instead the profile inc is compiled into each + translation, as the insn immediately following the event + check. Again, on amd64 a pseudo-insn AMD64_ProfInc is used. + Counters are now 64 bit even on 32 bit hosts, to avoid overflow. + + One complexity is that at JIT time it is not known where the + address of the counter is. To solve this, VexTranslateResult + now returns the offset of the profile inc in the generated + code. When the counter address is known, VEX can be called + again to patch it in. Backends must supply eg + patchProfInc_AMD64 to make this happen. + + +* Front end changes (guest_blah_toIR.c) + + The way the guest program counter is handled has changed + significantly. Previously, the guest PC was updated (in IR) + at the start of each instruction, except for the first insn + in an IRSB. This is inconsistent and doesn't work with the + new framework. + + Now, each instruction must update the guest PC as its last + IR statement -- not its first. And no special exemption for + the first insn in the block. As before most of these are + optimised out by ir_opt, so no concerns about efficiency. + + As a logical side effect of this, exits (IRStmt_Exit) and the + block-end transfer are both considered to write to the guest state + (the guest PC) and so need to be told the offset of it. + + IR generators (eg disInstr_AMD64) are no longer allowed to set the + IRSB::next, to specify the block-end transfer address. Instead they + now indicate, to the generic steering logic that drives them (iow, + guest_generic_bb_to_IR.c), that the block has ended. This then + generates effectively "goto GET(PC)" (which, again, is optimised + away). What this does mean is that if the IR generator function + ends the IR of the last instruction in the block with an incorrect + assignment to the guest PC, execution will transfer to an incorrect + destination -- making the error obvious quickly. diff --git a/drd/drd_load_store.c b/drd/drd_load_store.c index 996ee61173..3d99112f1f 100644 --- a/drd/drd_load_store.c +++ b/drd/drd_load_store.c @@ -593,6 +593,7 @@ IRSB* DRD_(instrument)(VgCallbackClosure* const closure, bb->tyenv = deepCopyIRTypeEnv(bb_in->tyenv); bb->next = deepCopyIRExpr(bb_in->next); bb->jumpkind = bb_in->jumpkind; + bb->offsIP = bb_in->offsIP; for (i = 0; i < bb_in->stmts_used; i++) { diff --git a/drd/tests/unit_bitmap.c b/drd/tests/unit_bitmap.c index b64c4964cb..6de61acdba 100644 --- a/drd/tests/unit_bitmap.c +++ b/drd/tests/unit_bitmap.c @@ -48,6 +48,8 @@ void* VG_(memset)(void *s, Int c, SizeT sz) { return memset(s, c, sz); } void* VG_(memcpy)(void *d, const void *s, SizeT sz) { return memcpy(d, s, sz); } +void* VG_(memmove)(void *d, const void *s, SizeT sz) +{ return memmove(d, s, sz); } Int VG_(memcmp)(const void* s1, const void* s2, SizeT n) { return memcmp(s1, s2, n); } UInt VG_(printf)(const HChar *format, ...) 
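The drd_load_store.c hunk above and the hg_main.c hunk below add one line each for the same reason: an instrumentation pass that builds a fresh output IRSB must now propagate the new offsIP field along with the other per-superblock metadata, otherwise the block-end transfer no longer knows where the guest PC lives. A minimal sketch of the copying step (the helper name is invented; the real tools do this inline):

   #include "pub_tool_tooliface.h"   /* pulls in the VEX IR types */

   /* Sketch: copy the per-superblock metadata from the input SB to
      the tool's output SB.  offsIP is the field newly required by
      t-chaining. */
   static void copy_sb_metadata ( IRSB* sbOut, IRSB* sbIn )
   {
      sbOut->tyenv    = deepCopyIRTypeEnv(sbIn->tyenv);
      sbOut->next     = deepCopyIRExpr(sbIn->next);
      sbOut->jumpkind = sbIn->jumpkind;
      sbOut->offsIP   = sbIn->offsIP;   /* new with t-chaining */
   }
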
diff --git a/helgrind/hg_main.c b/helgrind/hg_main.c index c1324db624..9bc36414b0 100644 --- a/helgrind/hg_main.c +++ b/helgrind/hg_main.c @@ -4317,6 +4317,7 @@ IRSB* hg_instrument ( VgCallbackClosure* closure, bbOut->tyenv = deepCopyIRTypeEnv(bbIn->tyenv); bbOut->next = deepCopyIRExpr(bbIn->next); bbOut->jumpkind = bbIn->jumpkind; + bbOut->offsIP = bbIn->offsIP; // Copy verbatim any IR preamble preceding the first IMark i = 0; diff --git a/include/pub_tool_xarray.h b/include/pub_tool_xarray.h index 9b699874db..cd34e79229 100644 --- a/include/pub_tool_xarray.h +++ b/include/pub_tool_xarray.h @@ -117,6 +117,12 @@ extern void VG_(dropTailXA) ( XArray*, Word ); is the number of elements remaining in the XArray. */ extern void VG_(dropHeadXA) ( XArray*, Word ); +/* Remove the specified element of an XArray, and slide all elements + beyond it back one place. This is an O(N) operation, where N is + the number of elements after the specified element, in the + array. */ +extern void VG_(removeIndexXA)( XArray*, Word ); + /* Make a new, completely independent copy of the given XArray, using the existing allocation function to allocate the new space. Returns NULL if the allocation function didn't manage to allocate diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am index f61e99f5b2..4dbefa246b 100644 --- a/memcheck/tests/Makefile.am +++ b/memcheck/tests/Makefile.am @@ -285,10 +285,10 @@ check_PROGRAMS = \ AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) -if VGCONF_PLATFORMS_INCLUDE_ARM_LINUX -AM_CFLAGS += -mfloat-abi=softfp -AM_CXXFLAGS += -mfloat-abi=softfp -endif +#if VGCONF_PLATFORMS_INCLUDE_ARM_LINUX +#AM_CFLAGS += -mfloat-abi=softfp +#AM_CXXFLAGS += -mfloat-abi=softfp +#endif if VGCONF_OS_IS_DARWIN atomic_incs_CFLAGS = $(AM_CFLAGS) -mdynamic-no-pic diff --git a/memcheck/tests/unit_oset.c b/memcheck/tests/unit_oset.c index 84f5ea25d1..854edf12c1 100644 --- a/memcheck/tests/unit_oset.c +++ b/memcheck/tests/unit_oset.c @@ -27,6 +27,7 @@ #define vgPlain_printf printf #define vgPlain_memset memset #define vgPlain_memcpy memcpy +#define vgPlain_memmove memmove // Crudely replace some functions (in m_xarray.c, but not needed for // this unit test) by (hopefully) failing asserts. diff --git a/none/tests/arm/Makefile.am b/none/tests/arm/Makefile.am index 013215ae80..ea25761c6a 100644 --- a/none/tests/arm/Makefile.am +++ b/none/tests/arm/Makefile.am @@ -39,14 +39,14 @@ v6intThumb_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 -mthumb v6media_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 -mthumb vfp_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \ - -mfpu=neon -mfloat-abi=softfp \ + -mfpu=neon \ -mthumb neon128_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \ - -mfpu=neon -mfloat-abi=softfp \ + -mfpu=neon \ -mthumb neon64_CFLAGS = $(AM_CFLAGS) -g -O0 -mcpu=cortex-a8 \ - -mfpu=neon -mfloat-abi=softfp \ + -mfpu=neon \ -mthumb
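A closing note on the XArray change above: the VG_(removeIndexXA) declaration, together with the VG_(memmove) stubs the unit tests now need, suggests the remove-and-slide is a single memmove over the tail of the array. A self-contained sketch over a simplified array descriptor (not the real m_xarray.c code; the struct and names are invented here):

   #include <assert.h>
   #include <string.h>

   /* Invented stand-in for the real (opaque) XArray layout. */
   typedef struct {
      void*  arr;       /* the elements                   */
      size_t elemSzB;   /* size of one element, in bytes  */
      size_t usedE;     /* number of elements in use      */
   } MiniXA;

   /* Remove element 'index' and slide everything after it back one
      place: O(N) in the number of elements beyond 'index'. */
   static void mini_removeIndexXA ( MiniXA* xa, size_t index )
   {
      char* base = (char*)xa->arr;
      assert(index < xa->usedE);
      memmove(base + index       * xa->elemSzB,
              base + (index + 1) * xa->elemSzB,
              (xa->usedE - index - 1) * xa->elemSzB);
      xa->usedE--;
   }

It has to be memmove rather than memcpy because the source and destination regions overlap, which is presumably why the unit-test stubs above had to gain a VG_(memmove) wrapper.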