git.ipfire.org Git - thirdparty/valgrind.git/commitdiff
libvex_BackEnd: lift the assembler out into its own function, for tidyness. No funct...
author    Julian Seward <jseward@acm.org>
Mon, 2 Oct 2017 16:43:22 +0000 (18:43 +0200)
committer Julian Seward <jseward@acm.org>
Mon, 2 Oct 2017 16:43:22 +0000 (18:43 +0200)
VEX/priv/main_main.c

index 8caaca26e2121f68f95f73ef3167630589cf0996..0629c1559895d5f354df3b4d6a471704dce2b9c0 100644 (file)
@@ -891,760 +891,779 @@ AssemblyBufferOffset emitSimpleInsn ( /*MB_MOD*/Int* offs_profInc,
 }
 
 
-/* ---- The back end proper ---- */
+/* ---- The assembler ---- */
 
-/* Back end of the compilation pipeline.  Is not exported. */
+/* Assemble RCODE, writing the resulting machine code into the buffer
+   specified by VTA->host_bytes of size VTA->host_bytes_size.  When done,
+   store the number of bytes written at the location specified by
+   VTA->host_bytes_used.  RES->offs_profInc may be modified as a result.  No
+   other fields of RES are changed.
 
-static void libvex_BackEnd ( const VexTranslateArgs *vta,
-                             /*MOD*/ VexTranslateResult* res,
-                             /*MOD*/ IRSB* irsb,
-                             VexRegisterUpdates pxControl )
+   Returns True for OK, False for 'ran out of buffer space'.
+*/
+static
+Bool theAssembler ( /*MOD*/VexTranslateResult* res,
+                    const VexTranslateArgs* vta,
+                    HInstrIfThenElse* (*isIfThenElse)( const HInstr* ),
+                    const Bool mode64,
+                    const HInstrSB* rcode )
 {
-   /* This the bundle of functions we need to do the back-end stuff
-      (insn selection, reg-alloc, assembly) whilst being insulated
-      from the target instruction set. */
-   Bool         (*isMove)       ( const HInstr*, HReg*, HReg* );
-   void         (*getRegUsage)  ( HRegUsage*, const HInstr*, Bool );
-   void         (*mapRegs)      ( HRegRemap*, HInstr*, Bool );
-   HInstrIfThenElse* (*isIfThenElse)( const HInstr* );
-   void         (*genSpill)     ( HInstr**, HInstr**, HReg, Int, Bool );
-   void         (*genReload)    ( HInstr**, HInstr**, HReg, Int, Bool );
-   HInstr*      (*genMove)      ( HReg, HReg, Bool );
-   HInstr*      (*genHInstrITE) ( HInstrIfThenElse* );
-   HInstr*      (*directReload) ( HInstr*, HReg, Short );
-   void         (*ppInstr)      ( const HInstr*, Bool );
-   void         (*ppCondCode)   ( HCondCode );
-   UInt         (*ppReg)        ( HReg );
-   HInstrSB*    (*iselSB)       ( const IRSB*, VexArch, const VexArchInfo*,
-                                  const VexAbiInfo*, Int, Int, Bool, Bool,
-                                  Addr );
-   Int          (*emit)         ( /*MB_MOD*/Bool*,
-                                  UChar*, Int, const HInstr*, Bool, VexEndness,
-                                  const void*, const void*, const void*,
-                                  const void* );
-   Bool (*preciseMemExnsFn) ( Int, Int, VexRegisterUpdates );
-
-   const RRegUniverse* rRegUniv = NULL;
+   // QElems are work Queue elements.  The work Queue is the top level data
+   // structure for the emitter.  It is initialised with the HInstrVec* of
+   // the overall HInstrSB.  Every OOL HInstrVec* in the tree will at some
+   // point be present in the Queue.  IL HInstrVec*s are never present in
+   // the Queue because the inner emitter loop processes them in-line, using
+   // a Stack (see below) to keep track of its nesting level.
+   //
+   // The Stack (see below) is empty before and after every Queue element is
+   // processed.  In other words, the Stack only holds state needed during
+   // the processing of a single Queue element.
+   //
+   // The ordering of elements in the Queue is irrelevant -- correct code
+   // will be emitted even with set semantics (arbitrary order).  However,
+   // the FIFOness of the queue is believed to generate code in which
+   // colder and colder code (more deeply nested OOLs) is placed further
+   // and further from the start of the emitted machine code, which sounds
+   // like a layout which should minimise icache misses.
+   //
+   // QElems also contain two pieces of jump-fixup information.  When we
+   // finally come to process a QElem, we need to know:
+   //
+   // * |jumpToOOLpoint|: the place which wants to jump to the start of the
+   //   emitted insns for this QElem.  We must have already emitted that,
+   //   since it will be the conditional jump that leads to this QElem (OOL
+   //   block).
+   //
+   // * |resumePoint|: the place we should jump back to after the QElem is
+   //   finished (the "resume point"), which is the emitted code of the
+   //   HInstr immediately following the HInstrIfThenElse that has this
+   //   QElem as its OOL block.
+   //
+   // When the QElem is processed, we know both the |jumpToOOLpoint| and
+   // the |resumePoint|, and so the first can be patched, and the second
+   // we generate an instruction to jump to.
+   //
+   // There are three complications with patching:
+   //
+   // (1) per comments on Stack elems, we do not know the |resumePoint| when
+   //     creating a QElem.  That will only be known when processing of the
+   //     corresponding IL block is completed.
+   //
+   // (2) The top level HInstrVec* has neither a |jumpToOOLpoint| nor a
+   //     |resumePoint|.
+   //
+   // (3) Non-top-level OOLs may not have a valid |resumePoint| if they do
+   //     an unconditional IR-level Exit.  We can generate the resume point
+   //     branch, but it will never be used.
+   typedef
+      struct {
+         // The HInstrs for this OOL.
+         HInstrVec* oolVec;
+         // Where we should patch to jump to the OOL ("how do we get here?")
+         Bool       jumpToOOLpoint_valid;
+         Relocation jumpToOOLpoint;
+         // Resume point offset, in bytes from start of output buffer
+         // ("where do we go after this block is completed?")
+         Bool                 resumePoint_valid;
+         AssemblyBufferOffset resumePoint;
+      }
+      QElem;
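+   // As an illustrative sketch (not derived from any particular guest
+   // block): for a top-level block [i1, ITE, i3] whose IfThenElse has hot
+   // (IL) path [i2] and cold (OOL) path [c1], the emitted buffer is laid
+   // out roughly as
+   //
+   //      i1
+   //      Jcc -> c1     <- |jumpToOOLpoint| of c1's QElem; patched once
+   //      i2               c1's offset in the buffer is known
+   //      i3            <- |resumePoint| of c1's QElem
+   //      c1
+   //      jmp -> i3     <- jump back to the resume point
+   //
+   // so the conditional branch is patched when the OOL block comes to be
+   // emitted, and the OOL block ends with a jump back to the resume point.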
 
-   Bool            mode64, chainingAllowed;
-   Int             out_used;
-   Int guest_sizeB;
-   Int offB_HOST_EvC_COUNTER;
-   Int offB_HOST_EvC_FAILADDR;
-   Addr            max_ga;
-   HInstrSB*       vcode;
-   HInstrSB*       rcode;
 
-   isMove                  = NULL;
-   getRegUsage             = NULL;
-   mapRegs                 = NULL;
-   isIfThenElse            = NULL;
-   genSpill                = NULL;
-   genReload               = NULL;
-   genMove                 = NULL;
-   genHInstrITE            = NULL;
-   directReload            = NULL;
-   ppInstr                 = NULL;
-   ppCondCode              = NULL;
-   ppReg                   = NULL;
-   iselSB                  = NULL;
-   emit                    = NULL;
+   // SElems are stack elements.  When we suspend processing a HInstrVec* in
+   // order to process an IL path in an IfThenElse, we push the HInstrVec*
+   // and the next index to process on the stack, so that we know where to
+   // resume when the nested IL sequence is completed.  |vec| and |vec_next|
+   // record the resume HInstr.
+   //
+   // A second effect of processing a nested IL sequence is that we will
+   // have to (later) process the corresponding OOL sequence.  And that OOL
+   // sequence will have to finish with a jump back to the "resume point"
+   // (the emitted instruction immediately following the IfThenElse).  We
+   // only know the offset of the resume point instruction in the output
+   // buffer when we actually resume emitting from there -- that is, when the
+   // entry we pushed is popped.  So, when we pop, we must mark the
+   // corresponding OOL entry in the Queue to record there the resume point
+   // offset.  For this reason we also carry |ool_qindex|, which is the
+   // index of the corresponding OOL entry in the Queue.
+   typedef
+      struct {
+         HInstrVec* vec;        // resume point HInstr vector
+         UInt       vec_next;   // resume point HInstr vector index
+         Int        ool_qindex; // index in Queue of OOL to mark when we resume
+      }
+      SElem;
 
-   mode64                 = False;
-   chainingAllowed        = False;
-   guest_sizeB            = 0;
-   offB_HOST_EvC_COUNTER  = 0;
-   offB_HOST_EvC_FAILADDR = 0;
-   preciseMemExnsFn       = NULL;
+   // The Stack.  The stack depth is bounded by the maximum number of nested
+   // hot (IL) sections, so in practice it is going to be very small.
+   const Int nSTACK = 4;
 
-   vassert(vex_initdone);
-   vassert(vta->disp_cp_xassisted != NULL);
+   SElem stack[nSTACK];
+   Int   stackPtr; // points to most recently pushed entry <=> "-1 means empty"
 
-   vex_traceflags = vta->traceflags;
+   // The Queue.  The queue size is bounded by the number of cold (OOL)
+   // sections in the entire HInstrSB, so it's also going to be pretty
+   // small.
+   const Int nQUEUE = 8;
 
-   /* Both the chainers and the indir are either NULL or non-NULL. */
-   if (vta->disp_cp_chain_me_to_slowEP        != NULL) {
-      vassert(vta->disp_cp_chain_me_to_fastEP != NULL);
-      vassert(vta->disp_cp_xindir             != NULL);
-      chainingAllowed = True;
-   } else {
-      vassert(vta->disp_cp_chain_me_to_fastEP == NULL);
-      vassert(vta->disp_cp_xindir             == NULL);
-   }
+   QElem queue[nQUEUE];
+   Int   queueOldest; // index of oldest entry, initially 0
+   Int   queueNewest; // index of newest entry,
+                      // initially -1, otherwise must be >= queueOldest
 
-   switch (vta->arch_guest) {
+   ///////////////////////////////////////////////////////
 
-      case VexArchX86:
-         preciseMemExnsFn       
-            = X86FN(guest_x86_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestX86State);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestX86State,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestX86State,host_EvC_FAILADDR);
-         break;
+   const Bool verbose_asm = (vex_traceflags & VEX_TRACE_ASM) != 0;
 
-      case VexArchAMD64:
-         preciseMemExnsFn       
-            = AMD64FN(guest_amd64_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestAMD64State);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestAMD64State,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestAMD64State,host_EvC_FAILADDR);
-         break;
+   const EmitConstants emitConsts
+      = { .mode64                     = mode64,
+          .endness_host               = vta->archinfo_host.endness,
+          .disp_cp_chain_me_to_slowEP = vta->disp_cp_chain_me_to_slowEP,
+          .disp_cp_chain_me_to_fastEP = vta->disp_cp_chain_me_to_fastEP,
+          .disp_cp_xindir             = vta->disp_cp_xindir,
+          .disp_cp_xassisted          = vta->disp_cp_xassisted };
 
-      case VexArchPPC32:
-         preciseMemExnsFn       
-            = PPC32FN(guest_ppc32_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestPPC32State);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestPPC32State,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestPPC32State,host_EvC_FAILADDR);
-         break;
+   AssemblyBufferOffset cursor = 0;
+   AssemblyBufferOffset cursor_limit = vta->host_bytes_size;
 
-      case VexArchPPC64:
-         preciseMemExnsFn       
-            = PPC64FN(guest_ppc64_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestPPC64State);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestPPC64State,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestPPC64State,host_EvC_FAILADDR);
-         break;
+   *(vta->host_bytes_used) = 0;
 
-      case VexArchS390X:
-         preciseMemExnsFn 
-            = S390FN(guest_s390x_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestS390XState);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestS390XState,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestS390XState,host_EvC_FAILADDR);
-         break;
+   queueOldest = 0;
+   queueNewest = -1;
 
-      case VexArchARM:
-         preciseMemExnsFn       
-            = ARMFN(guest_arm_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestARMState);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestARMState,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestARMState,host_EvC_FAILADDR);
-         break;
+   vassert(queueNewest < nQUEUE);
+   queueNewest++;
+   {
+      QElem* qe = &queue[queueNewest];
+      vex_bzero(qe, sizeof(*qe));
+      qe->oolVec = rcode->insns;
+      qe->jumpToOOLpoint_valid = False;
+      qe->resumePoint_valid = False;
+   }
+   vassert(queueNewest == 0);
 
-      case VexArchARM64:
-         preciseMemExnsFn     
-            = ARM64FN(guest_arm64_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestARM64State);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestARM64State,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestARM64State,host_EvC_FAILADDR);
-         break;
+   /* Main loop, processing Queue entries, until there are no more. */
+   while (queueOldest <= queueNewest) {
 
-      case VexArchMIPS32:
-         preciseMemExnsFn       
-            = MIPS32FN(guest_mips32_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestMIPS32State);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestMIPS32State,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestMIPS32State,host_EvC_FAILADDR);
-         break;
+      Int qCur = queueOldest;
+      if (UNLIKELY(verbose_asm))
+         vex_printf("BEGIN queue[%d]\n", qCur);
 
-      case VexArchMIPS64:
-         preciseMemExnsFn       
-            = MIPS64FN(guest_mips64_state_requires_precise_mem_exns);
-         guest_sizeB            = sizeof(VexGuestMIPS64State);
-         offB_HOST_EvC_COUNTER  = offsetof(VexGuestMIPS64State,host_EvC_COUNTER);
-         offB_HOST_EvC_FAILADDR = offsetof(VexGuestMIPS64State,host_EvC_FAILADDR);
-         break;
+      // Take the oldest entry in the queue
+      QElem* qe = &queue[queueOldest];
+      queueOldest++;
 
-      default:
-         vpanic("LibVEX_Codegen: unsupported guest insn set");
-   }
+      // Stay sane.  Only the top level block has no branch to it and no
+      // resume point.
+      if (qe->oolVec == rcode->insns) {
+         // This is the top level block
+         vassert(!qe->jumpToOOLpoint_valid);
+         vassert(!qe->resumePoint_valid);
+      } else {
+         vassert(qe->jumpToOOLpoint_valid);
+         vassert(qe->resumePoint_valid);
+         // In the future, we might be able to allow the resume point to be
+         // invalid for non-top-level blocks, if the block contains an
+         // unconditional exit.  Currently the IR can't represent that, so
+         // the assertion is valid.
+      }
 
+      // Processing |qe|
+      if (qe->jumpToOOLpoint_valid) {
+         // patch qe->jmpToOOLpoint to jump to |here|
+         if (UNLIKELY(verbose_asm)) {
+            vex_printf("  -- APPLY ");
+            ppRelocation(qe->jumpToOOLpoint);
+            vex_printf("\n");
+         }
+         applyRelocation(qe->jumpToOOLpoint, &vta->host_bytes[0],
+                         cursor, cursor, vta->archinfo_host.endness,
+                         verbose_asm);
+      }
 
-   switch (vta->arch_host) {
+      // Initialise the stack, for processing of |qe|.
+      stackPtr = 0; // "contains one element"
 
-      case VexArchX86:
-         mode64       = False;
-         rRegUniv     = X86FN(getRRegUniverse_X86());
-         isMove       = CAST_TO_TYPEOF(isMove) X86FN(isMove_X86Instr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) X86FN(getRegUsage_X86Instr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) X86FN(mapRegs_X86Instr);
-         isIfThenElse = CAST_TO_TYPEOF(isIfThenElse) X86FN(isIfThenElse_X86Instr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) X86FN(genSpill_X86);
-         genReload    = CAST_TO_TYPEOF(genReload) X86FN(genReload_X86);
-         genMove      = CAST_TO_TYPEOF(genMove) X86FN(genMove_X86);
-         genHInstrITE = CAST_TO_TYPEOF(genHInstrITE) X86FN(X86Instr_IfThenElse);
-         directReload = CAST_TO_TYPEOF(directReload) X86FN(directReload_X86);
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) X86FN(ppX86Instr);
-         ppCondCode   = CAST_TO_TYPEOF(ppCondCode) X86FN(ppX86CondCode);
-         ppReg        = CAST_TO_TYPEOF(ppReg) X86FN(ppHRegX86);
-         iselSB       = X86FN(iselSB_X86);
-         emit         = CAST_TO_TYPEOF(emit) X86FN(emit_X86Instr);
-         vassert(vta->archinfo_host.endness == VexEndnessLE);
-         break;
+      stack[stackPtr].vec        = qe->oolVec;
+      stack[stackPtr].vec_next   = 0;
+      stack[stackPtr].ool_qindex = -1; // INVALID
 
-      case VexArchAMD64:
-         mode64       = True;
-         rRegUniv     = AMD64FN(getRRegUniverse_AMD64());
-         isMove       = CAST_TO_TYPEOF(isMove) AMD64FN(isMove_AMD64Instr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) AMD64FN(getRegUsage_AMD64Instr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) AMD64FN(mapRegs_AMD64Instr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) AMD64FN(genSpill_AMD64);
-         genReload    = CAST_TO_TYPEOF(genReload) AMD64FN(genReload_AMD64);
-         genMove      = CAST_TO_TYPEOF(genMove) AMD64FN(genMove_AMD64);
-         directReload = CAST_TO_TYPEOF(directReload) AMD64FN(directReload_AMD64);
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) AMD64FN(ppAMD64Instr);
-         ppReg        = CAST_TO_TYPEOF(ppReg) AMD64FN(ppHRegAMD64);
-         iselSB       = AMD64FN(iselSB_AMD64);
-         emit         = CAST_TO_TYPEOF(emit) AMD64FN(emit_AMD64Instr);
-         vassert(vta->archinfo_host.endness == VexEndnessLE);
-         break;
+      // Iterate till the stack is empty.  This effectively does a
+      // depth-first traversal of the hot-path (IL) tree reachable from
+      // here, and at the same time adds any encountered cold-path (OOL)
+      // blocks to the Queue for later processing.  This is the heart of the
+      // flattening algorithm.
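+      //
+      // A sketch of how this plays out for the layout example near the
+      // QElem definition above (top-level block [i1, ITE, i3], hot path
+      // [i2], cold path [c1]): emit i1; at the ITE, enqueue [c1] as a new
+      // Queue entry, emit the conditional branch to it, push (top-level
+      // vec, index of i3, that Queue index) and descend into [i2]; emit i2;
+      // when [i2] is exhausted, pop the stack, record the current cursor as
+      // the resume point of [c1]'s Queue entry, and carry on with i3.  The
+      // [c1] entry itself is handled by a later iteration of the outer
+      // Queue loop.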
+      while (stackPtr >= 0) {
 
-      case VexArchPPC32:
-         mode64       = False;
-         rRegUniv     = PPC32FN(getRRegUniverse_PPC(mode64));
-         isMove       = CAST_TO_TYPEOF(isMove) PPC32FN(isMove_PPCInstr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) PPC32FN(getRegUsage_PPCInstr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) PPC32FN(mapRegs_PPCInstr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) PPC32FN(genSpill_PPC);
-         genReload    = CAST_TO_TYPEOF(genReload) PPC32FN(genReload_PPC);
-         genMove      = CAST_TO_TYPEOF(genMove) PPC32FN(genMove_PPC);
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) PPC32FN(ppPPCInstr);
-         ppReg        = CAST_TO_TYPEOF(ppReg) PPC32FN(ppHRegPPC);
-         iselSB       = PPC32FN(iselSB_PPC);
-         emit         = CAST_TO_TYPEOF(emit) PPC32FN(emit_PPCInstr);
-         vassert(vta->archinfo_host.endness == VexEndnessBE);
-         break;
+         if (UNLIKELY(verbose_asm))
+            vex_printf("  -- CONSIDER stack[%d]\n", stackPtr);
 
-      case VexArchPPC64:
-         mode64       = True;
-         rRegUniv     = PPC64FN(getRRegUniverse_PPC(mode64));
-         isMove       = CAST_TO_TYPEOF(isMove) PPC64FN(isMove_PPCInstr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) PPC64FN(getRegUsage_PPCInstr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) PPC64FN(mapRegs_PPCInstr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) PPC64FN(genSpill_PPC);
-         genReload    = CAST_TO_TYPEOF(genReload) PPC64FN(genReload_PPC);
-         genMove      = CAST_TO_TYPEOF(genMove) PPC64FN(genMove_PPC);
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) PPC64FN(ppPPCInstr);
-         ppReg        = CAST_TO_TYPEOF(ppReg) PPC64FN(ppHRegPPC);
-         iselSB       = PPC64FN(iselSB_PPC);
-         emit         = CAST_TO_TYPEOF(emit) PPC64FN(emit_PPCInstr);
-         vassert(vta->archinfo_host.endness == VexEndnessBE ||
-                 vta->archinfo_host.endness == VexEndnessLE );
-         break;
+         HInstrVec* vec        = stack[stackPtr].vec;
+         UInt       vec_next   = stack[stackPtr].vec_next;
+         Int        ool_qindex = stack[stackPtr].ool_qindex;
+         stackPtr--;
 
-      case VexArchS390X:
-         mode64       = True;
-         rRegUniv     = S390FN(getRRegUniverse_S390());
-         isMove       = CAST_TO_TYPEOF(isMove) S390FN(isMove_S390Instr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) S390FN(getRegUsage_S390Instr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) S390FN(mapRegs_S390Instr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) S390FN(genSpill_S390);
-         genReload    = CAST_TO_TYPEOF(genReload) S390FN(genReload_S390);
-         genMove      = CAST_TO_TYPEOF(genMove) S390FN(genMove_S390);
-         // fixs390: consider implementing directReload_S390
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) S390FN(ppS390Instr);
-         ppReg        = CAST_TO_TYPEOF(ppReg) S390FN(ppHRegS390);
-         iselSB       = S390FN(iselSB_S390);
-         emit         = CAST_TO_TYPEOF(emit) S390FN(emit_S390Instr);
-         vassert(vta->archinfo_host.endness == VexEndnessBE);
-         break;
+         if (vec_next > 0) {
+            // We're resuming the current IL block having just finished
+            // processing a nested IL.  The OOL counterpart to the nested IL
+            // we just finished processing will have to jump back to here.
+            // So we'll need to mark its Queue entry to record that fact.
 
-      case VexArchARM:
-         mode64       = False;
-         rRegUniv     = ARMFN(getRRegUniverse_ARM());
-         isMove       = CAST_TO_TYPEOF(isMove) ARMFN(isMove_ARMInstr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) ARMFN(getRegUsage_ARMInstr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) ARMFN(mapRegs_ARMInstr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) ARMFN(genSpill_ARM);
-         genReload    = CAST_TO_TYPEOF(genReload) ARMFN(genReload_ARM);
-         genMove      = CAST_TO_TYPEOF(genMove) ARMFN(genMove_ARM);
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) ARMFN(ppARMInstr);
-         ppReg        = CAST_TO_TYPEOF(ppReg) ARMFN(ppHRegARM);
-         iselSB       = ARMFN(iselSB_ARM);
-         emit         = CAST_TO_TYPEOF(emit) ARMFN(emit_ARMInstr);
-         vassert(vta->archinfo_host.endness == VexEndnessLE);
-         break;
+            // First assert that the OOL actually *is* in the Queue (it
+            // must be, since we can't have processed it yet).
+            vassert(queueOldest <= queueNewest); // "at least 1 entry in Q"
+            vassert(queueOldest <= ool_qindex && ool_qindex <= queueNewest);
 
-      case VexArchARM64:
-         mode64       = True;
-         rRegUniv     = ARM64FN(getRRegUniverse_ARM64());
-         isMove       = CAST_TO_TYPEOF(isMove) ARM64FN(isMove_ARM64Instr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) ARM64FN(getRegUsage_ARM64Instr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) ARM64FN(mapRegs_ARM64Instr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) ARM64FN(genSpill_ARM64);
-         genReload    = CAST_TO_TYPEOF(genReload) ARM64FN(genReload_ARM64);
-         genMove      = CAST_TO_TYPEOF(genMove) ARM64FN(genMove_ARM64);
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) ARM64FN(ppARM64Instr);
-         ppReg        = CAST_TO_TYPEOF(ppReg) ARM64FN(ppHRegARM64);
-         iselSB       = ARM64FN(iselSB_ARM64);
-         emit         = CAST_TO_TYPEOF(emit) ARM64FN(emit_ARM64Instr);
-         vassert(vta->archinfo_host.endness == VexEndnessLE);
-         break;
+            vassert(!queue[ool_qindex].resumePoint_valid);
+            queue[ool_qindex].resumePoint       = cursor;
+            queue[ool_qindex].resumePoint_valid = True;
+            if (UNLIKELY(verbose_asm))
+               vex_printf("  -- RESUME previous IL\n");
+         } else {
+            // We're starting a new IL.  Due to the tail-recursive nature of
+            // entering ILs, this means we can actually only be starting the
+            // outermost (top level) block for this particular Queue entry.
+            vassert(ool_qindex == -1);
+            vassert(vec == qe->oolVec);
+            if (UNLIKELY(verbose_asm))
+               vex_printf("  -- START new IL\n");
+         }
 
-      case VexArchMIPS32:
-         mode64       = False;
-         rRegUniv     = MIPS32FN(getRRegUniverse_MIPS(mode64));
-         isMove       = CAST_TO_TYPEOF(isMove) MIPS32FN(isMove_MIPSInstr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) MIPS32FN(getRegUsage_MIPSInstr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) MIPS32FN(mapRegs_MIPSInstr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) MIPS32FN(genSpill_MIPS);
-         genReload    = CAST_TO_TYPEOF(genReload) MIPS32FN(genReload_MIPS);
-         genMove      = CAST_TO_TYPEOF(genMove) MIPS32FN(genMove_MIPS);
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) MIPS32FN(ppMIPSInstr);
-         ppReg        = CAST_TO_TYPEOF(ppReg) MIPS32FN(ppHRegMIPS);
-         iselSB       = MIPS32FN(iselSB_MIPS);
-         emit         = CAST_TO_TYPEOF(emit) MIPS32FN(emit_MIPSInstr);
-         vassert(vta->archinfo_host.endness == VexEndnessLE
-                 || vta->archinfo_host.endness == VexEndnessBE);
-         break;
+         // Repeatedly process "zero or more simple HInstrs followed by (an
+         // IfThenElse or end-of-block)"
+         while (True) {
 
-      case VexArchMIPS64:
-         mode64       = True;
-         rRegUniv     = MIPS64FN(getRRegUniverse_MIPS(mode64));
-         isMove       = CAST_TO_TYPEOF(isMove) MIPS64FN(isMove_MIPSInstr);
-         getRegUsage  
-            = CAST_TO_TYPEOF(getRegUsage) MIPS64FN(getRegUsage_MIPSInstr);
-         mapRegs      = CAST_TO_TYPEOF(mapRegs) MIPS64FN(mapRegs_MIPSInstr);
-         genSpill     = CAST_TO_TYPEOF(genSpill) MIPS64FN(genSpill_MIPS);
-         genReload    = CAST_TO_TYPEOF(genReload) MIPS64FN(genReload_MIPS);
-         genMove      = CAST_TO_TYPEOF(genMove) MIPS64FN(genMove_MIPS);
-         ppInstr      = CAST_TO_TYPEOF(ppInstr) MIPS64FN(ppMIPSInstr);
-         ppReg        = CAST_TO_TYPEOF(ppReg) MIPS64FN(ppHRegMIPS);
-         iselSB       = MIPS64FN(iselSB_MIPS);
-         emit         = CAST_TO_TYPEOF(emit) MIPS64FN(emit_MIPSInstr);
-         vassert(vta->archinfo_host.endness == VexEndnessLE
-                 || vta->archinfo_host.endness == VexEndnessBE);
-         break;
+            // Process "zero or more simple HInstrs"
+            while (vec_next < vec->insns_used
+                   && !isIfThenElse(vec->insns[vec_next])) {
+               AssemblyBufferOffset cursor_next
+                  = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0],
+                                    cursor, cursor_limit, vec->insns[vec_next],
+                                    &emitConsts, vta );
+               if (UNLIKELY(cursor_next == cursor)) {
+                  // We ran out of output space.  Give up.
+                  return False;
+               }
+               vec_next++;
+               cursor = cursor_next;
+            }
+
+            // Now we've either got to the end of the hot path, or we have
+            // an IfThenElse.
+            if (vec_next >= vec->insns_used)
+               break;
+
+            // So we have an IfThenElse.
+            HInstrIfThenElse* hite = isIfThenElse(vec->insns[vec_next]);
+            vassert(hite);
+            vassert(hite->n_phis == 0); // the regalloc will have removed them
+
+            // Put |hite|'s OOL block in the Queue.  We'll deal with it
+            // later.  Also, generate the (skeleton) conditional branch to it,
+            // and collect enough information so that we can patch the
+            // branch later, once we know where the destination is.
+            vassert(queueNewest < nQUEUE-1); // else out of Queue space
+            queueNewest++;
+            queue[queueNewest].oolVec = hite->outOfLine;
+            queue[queueNewest].resumePoint_valid = False; // not yet known
+            queue[queueNewest].resumePoint = -1; // invalid
+
+            HInstr* cond_branch
+               = X86Instr_JmpCond(hite->ccOOL,
+                                  queueNewest/*FOR DEBUG PRINTING ONLY*/);
+            AssemblyBufferOffset cursor_next
+               = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0],
+                                 cursor, cursor_limit, cond_branch,
+                                 &emitConsts, vta );
+            if (UNLIKELY(cursor_next == cursor)) {
+               // We ran out of output space.  Give up.
+               return False;
+            }
+            queue[queueNewest].jumpToOOLpoint_valid = True;
+            queue[queueNewest].jumpToOOLpoint
+               = collectRelocInfo_X86(cursor, cond_branch);
 
-      default:
-         vpanic("LibVEX_Translate: unsupported host insn set");
-   }
+            cursor = cursor_next;
 
-   // Are the host's hardware capabilities feasible. The function will
-   // not return if hwcaps are infeasible in some sense.
-   check_hwcaps(vta->arch_host, vta->archinfo_host.hwcaps);
+            // Now we descend into |hite|'s IL block.  So we need to save
+            // where we are in this block, so we can resume when the inner
+            // one is done.
+            vassert(stackPtr < nSTACK-1); // else out of Stack space
+            stackPtr++;
+            stack[stackPtr].vec        = vec;
+            stack[stackPtr].vec_next   = vec_next+1;
+            stack[stackPtr].ool_qindex = queueNewest;
 
+            // And now descend into the inner block.  We could have just
+            // pushed its details on the stack and immediately popped it, but
+            // it seems simpler to update |vec| and |vec_next| and continue
+            // directly.
+            if (UNLIKELY(verbose_asm)) {
+               vex_printf("  -- START inner IL\n");
+            }
+            vec      = hite->fallThrough;
+            vec_next = 0;
 
-   /* Turn it into virtual-registerised code.  Build trees -- this
-      also throws away any dead bindings. */
-   max_ga = ado_treebuild_BB( irsb, preciseMemExnsFn, pxControl );
+            // And continue with "Repeatedly process ..."
+         }
 
-   if (vta->finaltidy) {
-      irsb = vta->finaltidy(irsb);
-   }
+         // Getting here means we've completed an inner IL and now want to
+         // resume the parent IL.  That is, pop a saved context off the
+         // stack.
+      }
 
-   vexAllocSanityCheck();
+      // Hot path is complete.  Now, probably, we have to add a jump
+      // back to the resume point.
+      if (qe->resumePoint_valid) {
+         if (0)
+            vex_printf("  // Generate jump to resume point [%03u]\n",
+                       qe->resumePoint);
+         HInstr* jmp = X86Instr_Jmp(cursor, qe->resumePoint);
+         AssemblyBufferOffset cursor_next
+            = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0],
+                              cursor, cursor_limit, jmp,
+                              &emitConsts, vta );
+         if (UNLIKELY(cursor_next == cursor)) {
+            // We ran out of output space.  Give up.
+            return False;
+         }
+         cursor = cursor_next;
+      }
 
-   if (vex_traceflags & VEX_TRACE_TREES) {
-      vex_printf("\n------------------------" 
-                   "  After tree-building "
-                   "------------------------\n\n");
-      ppIRSB ( irsb );
-      vex_printf("\n");
+      if (UNLIKELY(verbose_asm))
+         vex_printf("END queue[%d]\n\n", qCur);
+      // Finished with this Queue entry.
    }
+   // Queue empty, all blocks processed
 
-   /* HACK */
-   if (0) {
-      *(vta->host_bytes_used) = 0;
-      res->status = VexTransOK; return;
-   }
-   /* end HACK */
+   *(vta->host_bytes_used) = cursor;
 
-   if (vex_traceflags & VEX_TRACE_VCODE)
-      vex_printf("\n------------------------" 
-                   " Instruction selection "
-                   "------------------------\n");
+   return True; // OK
+}
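+/* A sketch of how the new function might be invoked from libvex_BackEnd
+   (illustrative only; the failure status value used below is an assumption,
+   not taken from this change):
+
+      Bool assembled_ok
+         = theAssembler( res, vta, isIfThenElse, mode64, rcode );
+      if (!assembled_ok) {
+         // Ran out of output buffer space.
+         res->status = VexTransOutputFull;
+         return;
+      }
+*/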
 
-   /* No guest has its IP field at offset zero.  If this fails it
-      means some transformation pass somewhere failed to update/copy
-      irsb->offsIP properly. */
-   vassert(irsb->offsIP >= 16);
 
-   vcode = iselSB ( irsb, vta->arch_host,
-                    &vta->archinfo_host, 
-                    &vta->abiinfo_both,
-                    offB_HOST_EvC_COUNTER,
-                    offB_HOST_EvC_FAILADDR,
-                    chainingAllowed,
-                    vta->addProfInc,
-                    max_ga );
+/* ---- The back end proper ---- */
 
-   vexAllocSanityCheck();
+/* Back end of the compilation pipeline.  Is not exported. */
 
-   if (vex_traceflags & VEX_TRACE_VCODE)
-      vex_printf("\n");
+static void libvex_BackEnd ( const VexTranslateArgs* vta,
+                             /*MOD*/ VexTranslateResult* res,
+                             /*MOD*/ IRSB* irsb,
+                             VexRegisterUpdates pxControl )
+{
+   /* This is the bundle of functions we need to do the back-end stuff
+      (insn selection, reg-alloc, assembly) whilst being insulated
+      from the target instruction set. */
+   Bool         (*isMove)       ( const HInstr*, HReg*, HReg* );
+   void         (*getRegUsage)  ( HRegUsage*, const HInstr*, Bool );
+   void         (*mapRegs)      ( HRegRemap*, HInstr*, Bool );
+   HInstrIfThenElse* (*isIfThenElse)( const HInstr* );
+   void         (*genSpill)     ( HInstr**, HInstr**, HReg, Int, Bool );
+   void         (*genReload)    ( HInstr**, HInstr**, HReg, Int, Bool );
+   HInstr*      (*genMove)      ( HReg, HReg, Bool );
+   HInstr*      (*genHInstrITE) ( HInstrIfThenElse* );
+   HInstr*      (*directReload) ( HInstr*, HReg, Short );
+   void         (*ppInstr)      ( const HInstr*, Bool );
+   void         (*ppCondCode)   ( HCondCode );
+   UInt         (*ppReg)        ( HReg );
+   HInstrSB*    (*iselSB)       ( const IRSB*, VexArch, const VexArchInfo*,
+                                  const VexAbiInfo*, Int, Int, Bool, Bool,
+                                  Addr );
+   Int          (*emit)         ( /*MB_MOD*/Bool*,
+                                  UChar*, Int, const HInstr*, Bool, VexEndness,
+                                  const void*, const void*, const void*,
+                                  const void* );
+   Bool (*preciseMemExnsFn) ( Int, Int, VexRegisterUpdates );
 
-   if (vex_traceflags & VEX_TRACE_VCODE) {
-      ppHInstrSB(vcode, isIfThenElse, ppInstr, ppCondCode, mode64);
-   }
+   const RRegUniverse* rRegUniv = NULL;
 
-   /* Register allocate. */
-   RegAllocControl con = {
-      .univ = rRegUniv, .isMove = isMove, .getRegUsage = getRegUsage,
-      .mapRegs = mapRegs, .isIfThenElse = isIfThenElse, .genSpill = genSpill,
-      .genReload = genReload, .genMove = genMove, .genHInstrITE = genHInstrITE,
-      .directReload = directReload, .guest_sizeB = guest_sizeB,
-      .ppInstr = ppInstr, .ppCondCode = ppCondCode, .ppReg = ppReg,
-      .mode64 = mode64};
-   rcode = doRegisterAllocation(vcode, &con);
+   Bool      mode64, chainingAllowed;
+   Int       guest_sizeB;
+   Int       offB_HOST_EvC_COUNTER;
+   Int       offB_HOST_EvC_FAILADDR;
+   Addr      max_ga;
+   HInstrSB* vcode;
+   HInstrSB* rcode;
 
-   vexAllocSanityCheck();
+   isMove                  = NULL;
+   getRegUsage             = NULL;
+   mapRegs                 = NULL;
+   isIfThenElse            = NULL;
+   genSpill                = NULL;
+   genReload               = NULL;
+   genMove                 = NULL;
+   genHInstrITE            = NULL;
+   directReload            = NULL;
+   ppInstr                 = NULL;
+   ppCondCode              = NULL;
+   ppReg                   = NULL;
+   iselSB                  = NULL;
+   emit                    = NULL;
 
-   if (vex_traceflags & VEX_TRACE_RCODE) {
-      vex_printf("\n------------------------" 
-                   " Register-allocated code "
-                   "------------------------\n\n");
-      ppHInstrSB(rcode, isIfThenElse, ppInstr, ppCondCode, mode64);
-      vex_printf("\n");
-   }
+   mode64                 = False;
+   chainingAllowed        = False;
+   guest_sizeB            = 0;
+   offB_HOST_EvC_COUNTER  = 0;
+   offB_HOST_EvC_FAILADDR = 0;
+   preciseMemExnsFn       = NULL;
 
-   /* HACK */
-   if (0) { 
-      *(vta->host_bytes_used) = 0;
-      res->status = VexTransOK; return;
-   }
-   /* end HACK */
+   vassert(vex_initdone);
+   vassert(vta->disp_cp_xassisted != NULL);
 
-   /* Assemble */
-   if (vex_traceflags & VEX_TRACE_ASM) {
-      vex_printf("\n------------------------" 
-                   " Assembly "
-                   "------------------------\n\n");
+   vex_traceflags = vta->traceflags;
+
+   /* Both the chainers and the indir are either NULL or non-NULL. */
+   if (vta->disp_cp_chain_me_to_slowEP        != NULL) {
+      vassert(vta->disp_cp_chain_me_to_fastEP != NULL);
+      vassert(vta->disp_cp_xindir             != NULL);
+      chainingAllowed = True;
+   } else {
+      vassert(vta->disp_cp_chain_me_to_fastEP == NULL);
+      vassert(vta->disp_cp_xindir             == NULL);
    }
 
-   ////////////////////////////////////////////////////////
-   //// BEGIN the assembler
+   switch (vta->arch_guest) {
 
-   // QElem are work Queue elements.  The work Queue is the top level data
-   // structure for the emitter.  It is initialised with the HInstrVec* of
-   // the overall HInstrSB.  Every OOL HInstrVec* in the tree will at some
-   // point be present in the Queue.  IL HInstrVec*s are never present in
-   // the Queue because the inner emitter loop processes them in-line, using
-   // a Stack (see below) to keep track of its nesting level.
-   //
-   // The Stack (see below) is empty before and after every Queue element is
-   // processed.  In other words, the Stack only holds state needed during
-   // the processing of a single Queue element.
-   //
-   // The ordering of elements in the Queue is irrelevant -- correct code
-   // will be emitted even with set semantics (arbitrary order).  However,
-   // the FIFOness of the queue is believed to generate code in which
-   // colder and colder code (more deeply nested OOLs) is placed further
-   // and further from the start of the emitted machine code, which sounds
-   // like a layout which should minimise icache misses.
-   //
-   // QElems also contain two pieces of jump-fixup information.  When we
-   // finally come to process a QElem, we need to know:
-   //
-   // * |jumpToOOLpoint|: the place which wants to jump to the start of the
-   //   emitted insns for this QElem.  We must have already emitted that,
-   //   since it will be the conditional jump that leads to this QElem (OOL
-   //   block).
-   //
-   // * |resumePoint|: the place we should jump back to after the QElem is
-   //   finished (the "resume point"), which is the emitted code of the
-   //   HInstr immediately following the HInstrIfThenElse that has this
-   //   QElem as its OOL block.
-   //
-   // When the QElem is processed, we know both the |jumpToOOLpoint| and
-   // the |resumePoint|, and so the first can be patched, and the second
-   // we generate an instruction to jump to.
-   //
-   // There are three complications with patching:
-   //
-   // (1) per comments on Stack elems, we do not know the |resumePoint| when
-   //     creating a QElem.  That will only be known when processing of the
-   //     corresponding IL block is completed.
-   //
-   // (2) The top level HInstrVec* has neither a |jumpToOOLpoint| nor a
-   //     |resumePoint|.
-   //
-   // (3) Non-top-level OOLs may not have a valid |resumePoint| if they do
-   //     an unconditional IR-level Exit.  We can generate the resume point
-   //     branch, but it will be never be used.
-   typedef
-      struct {
-         // The HInstrs for this OOL.
-         HInstrVec* oolVec;
-         // Where we should patch to jump to the OOL ("how do we get here?")
-         Bool       jumpToOOLpoint_valid;
-         Relocation jumpToOOLpoint;
-         // Resume point offset, in bytes from start of output buffer
-         // ("where do we go after this block is completed?")
-         Bool                 resumePoint_valid;
-         AssemblyBufferOffset resumePoint;
-      }
-      QElem;
+      case VexArchX86:
+         preciseMemExnsFn       
+            = X86FN(guest_x86_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestX86State);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestX86State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestX86State,host_EvC_FAILADDR);
+         break;
+
+      case VexArchAMD64:
+         preciseMemExnsFn       
+            = AMD64FN(guest_amd64_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestAMD64State);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestAMD64State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestAMD64State,host_EvC_FAILADDR);
+         break;
 
+      case VexArchPPC32:
+         preciseMemExnsFn       
+            = PPC32FN(guest_ppc32_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestPPC32State);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestPPC32State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestPPC32State,host_EvC_FAILADDR);
+         break;
 
-   // SElem are stack elements.  When we suspend processing a HInstrVec* in
-   // order to process an IL path in an IfThenElse, we push the HInstrVec*
-   // and the next index to process on the stack, so that we know where to
-   // resume when the nested IL sequence is completed.  |vec| and |vec_next|
-   // record the resume HInstr.
-   //
-   // A second effect of processing a nested IL sequence is that we will
-   // have to (later) process the corresponding OOL sequence.  And that OOL
-   // sequence will have to finish with a jump back to the "resume point"
-   // (the emitted instruction immediately following the IfThenElse).  We
-   // only know the offset of the resume point instruction in the output
-   // buffer when we actually resume emitted from there -- that is, when the
-   // entry we pushed, is popped.  So, when we pop, we must mark the
-   // corresponding OOL entry in the Queue to record there the resume point
-   // offset.  For this reason we also carry |ool_qindex|, which is the
-   // index of the corresponding OOL entry in the Queue.
-   typedef
-      struct {
-         HInstrVec* vec;        // resume point HInstr vector
-         UInt       vec_next;   // resume point HInstr vector index
-         Int        ool_qindex; // index in Queue of OOL to mark when we resume
-      }
-      SElem;
+      case VexArchPPC64:
+         preciseMemExnsFn       
+            = PPC64FN(guest_ppc64_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestPPC64State);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestPPC64State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestPPC64State,host_EvC_FAILADDR);
+         break;
 
-   // The Stack.  The stack depth is bounded by maximum number of nested
-   // hot (IL) sections, so in practice it is going to be very small.
-   const Int nSTACK = 4;
+      case VexArchS390X:
+         preciseMemExnsFn 
+            = S390FN(guest_s390x_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestS390XState);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestS390XState,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestS390XState,host_EvC_FAILADDR);
+         break;
 
-   SElem stack[nSTACK];
-   Int   stackPtr; // points to most recently pushed entry <=> "-1 means empty"
+      case VexArchARM:
+         preciseMemExnsFn       
+            = ARMFN(guest_arm_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestARMState);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestARMState,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestARMState,host_EvC_FAILADDR);
+         break;
 
-   // The Queue.  The queue size is bounded by the number of cold (OOL)
-   // sections in the entire HInstrSB, so it's also going to be pretty
-   // small.
-   const Int nQUEUE = 8;
+      case VexArchARM64:
+         preciseMemExnsFn     
+            = ARM64FN(guest_arm64_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestARM64State);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestARM64State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestARM64State,host_EvC_FAILADDR);
+         break;
 
-   QElem queue[nQUEUE];
-   Int   queueOldest; // index of oldest entry, initially 0
-   Int   queueNewest; // index of newest entry,
-                      // initially -1, otherwise must be >= queueOldest
+      case VexArchMIPS32:
+         preciseMemExnsFn       
+            = MIPS32FN(guest_mips32_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestMIPS32State);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestMIPS32State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestMIPS32State,host_EvC_FAILADDR);
+         break;
 
-   ///////////////////////////////////////////////////////
+      case VexArchMIPS64:
+         preciseMemExnsFn       
+            = MIPS64FN(guest_mips64_state_requires_precise_mem_exns);
+         guest_sizeB            = sizeof(VexGuestMIPS64State);
+         offB_HOST_EvC_COUNTER  = offsetof(VexGuestMIPS64State,host_EvC_COUNTER);
+         offB_HOST_EvC_FAILADDR = offsetof(VexGuestMIPS64State,host_EvC_FAILADDR);
+         break;
 
-   const Bool verbose_asm = (vex_traceflags & VEX_TRACE_ASM) != 0;
+      default:
+         vpanic("LibVEX_Codegen: unsupported guest insn set");
+   }
 
-   const EmitConstants emitConsts
-      = { .mode64                     = mode64,
-          .endness_host               = vta->archinfo_host.endness,
-          .disp_cp_chain_me_to_slowEP = vta->disp_cp_chain_me_to_slowEP,
-          .disp_cp_chain_me_to_fastEP = vta->disp_cp_chain_me_to_fastEP,
-          .disp_cp_xindir             = vta->disp_cp_xindir,
-          .disp_cp_xassisted          = vta->disp_cp_xassisted };
 
-   AssemblyBufferOffset cursor = 0;
-   AssemblyBufferOffset cursor_limit = vta->host_bytes_size;
+   switch (vta->arch_host) {
 
-   queueOldest = 0;
-   queueNewest = -1;
+      case VexArchX86:
+         mode64       = False;
+         rRegUniv     = X86FN(getRRegUniverse_X86());
+         isMove       = CAST_TO_TYPEOF(isMove) X86FN(isMove_X86Instr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) X86FN(getRegUsage_X86Instr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) X86FN(mapRegs_X86Instr);
+         isIfThenElse = CAST_TO_TYPEOF(isIfThenElse) X86FN(isIfThenElse_X86Instr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) X86FN(genSpill_X86);
+         genReload    = CAST_TO_TYPEOF(genReload) X86FN(genReload_X86);
+         genMove      = CAST_TO_TYPEOF(genMove) X86FN(genMove_X86);
+         genHInstrITE = CAST_TO_TYPEOF(genHInstrITE) X86FN(X86Instr_IfThenElse);
+         directReload = CAST_TO_TYPEOF(directReload) X86FN(directReload_X86);
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) X86FN(ppX86Instr);
+         ppCondCode   = CAST_TO_TYPEOF(ppCondCode) X86FN(ppX86CondCode);
+         ppReg        = CAST_TO_TYPEOF(ppReg) X86FN(ppHRegX86);
+         iselSB       = X86FN(iselSB_X86);
+         emit         = CAST_TO_TYPEOF(emit) X86FN(emit_X86Instr);
+         vassert(vta->archinfo_host.endness == VexEndnessLE);
+         break;
 
-   vassert(queueNewest < nQUEUE);
-   queueNewest++;
-   {
-      QElem* qe = &queue[queueNewest];
-      vex_bzero(qe, sizeof(*qe));
-      qe->oolVec = rcode->insns;
-      qe->jumpToOOLpoint_valid = False;
-      qe->resumePoint_valid = False;
-   }
-   vassert(queueNewest == 0);
+      case VexArchAMD64:
+         mode64       = True;
+         rRegUniv     = AMD64FN(getRRegUniverse_AMD64());
+         isMove       = CAST_TO_TYPEOF(isMove) AMD64FN(isMove_AMD64Instr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) AMD64FN(getRegUsage_AMD64Instr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) AMD64FN(mapRegs_AMD64Instr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) AMD64FN(genSpill_AMD64);
+         genReload    = CAST_TO_TYPEOF(genReload) AMD64FN(genReload_AMD64);
+         genMove      = CAST_TO_TYPEOF(genMove) AMD64FN(genMove_AMD64);
+         directReload = CAST_TO_TYPEOF(directReload) AMD64FN(directReload_AMD64);
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) AMD64FN(ppAMD64Instr);
+         ppReg        = CAST_TO_TYPEOF(ppReg) AMD64FN(ppHRegAMD64);
+         iselSB       = AMD64FN(iselSB_AMD64);
+         emit         = CAST_TO_TYPEOF(emit) AMD64FN(emit_AMD64Instr);
+         vassert(vta->archinfo_host.endness == VexEndnessLE);
+         break;
 
-   /* Main loop, processing Queue entries, until there are no more. */
-   while (queueOldest <= queueNewest) {
+      case VexArchPPC32:
+         mode64       = False;
+         rRegUniv     = PPC32FN(getRRegUniverse_PPC(mode64));
+         isMove       = CAST_TO_TYPEOF(isMove) PPC32FN(isMove_PPCInstr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) PPC32FN(getRegUsage_PPCInstr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) PPC32FN(mapRegs_PPCInstr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) PPC32FN(genSpill_PPC);
+         genReload    = CAST_TO_TYPEOF(genReload) PPC32FN(genReload_PPC);
+         genMove      = CAST_TO_TYPEOF(genMove) PPC32FN(genMove_PPC);
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) PPC32FN(ppPPCInstr);
+         ppReg        = CAST_TO_TYPEOF(ppReg) PPC32FN(ppHRegPPC);
+         iselSB       = PPC32FN(iselSB_PPC);
+         emit         = CAST_TO_TYPEOF(emit) PPC32FN(emit_PPCInstr);
+         vassert(vta->archinfo_host.endness == VexEndnessBE);
+         break;
 
-      Int qCur = queueOldest;
-      if (UNLIKELY(verbose_asm))
-         vex_printf("BEGIN queue[%d]\n", qCur);
+      case VexArchPPC64:
+         mode64       = True;
+         rRegUniv     = PPC64FN(getRRegUniverse_PPC(mode64));
+         isMove       = CAST_TO_TYPEOF(isMove) PPC64FN(isMove_PPCInstr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) PPC64FN(getRegUsage_PPCInstr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) PPC64FN(mapRegs_PPCInstr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) PPC64FN(genSpill_PPC);
+         genReload    = CAST_TO_TYPEOF(genReload) PPC64FN(genReload_PPC);
+         genMove      = CAST_TO_TYPEOF(genMove) PPC64FN(genMove_PPC);
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) PPC64FN(ppPPCInstr);
+         ppReg        = CAST_TO_TYPEOF(ppReg) PPC64FN(ppHRegPPC);
+         iselSB       = PPC64FN(iselSB_PPC);
+         emit         = CAST_TO_TYPEOF(emit) PPC64FN(emit_PPCInstr);
+         vassert(vta->archinfo_host.endness == VexEndnessBE ||
+                 vta->archinfo_host.endness == VexEndnessLE );
+         break;
 
-      // Take the oldest entry in the queue
-      QElem* qe = &queue[queueOldest];
-      queueOldest++;
+      case VexArchS390X:
+         mode64       = True;
+         rRegUniv     = S390FN(getRRegUniverse_S390());
+         isMove       = CAST_TO_TYPEOF(isMove) S390FN(isMove_S390Instr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) S390FN(getRegUsage_S390Instr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) S390FN(mapRegs_S390Instr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) S390FN(genSpill_S390);
+         genReload    = CAST_TO_TYPEOF(genReload) S390FN(genReload_S390);
+         genMove      = CAST_TO_TYPEOF(genMove) S390FN(genMove_S390);
+         // fixs390: consider implementing directReload_S390
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) S390FN(ppS390Instr);
+         ppReg        = CAST_TO_TYPEOF(ppReg) S390FN(ppHRegS390);
+         iselSB       = S390FN(iselSB_S390);
+         emit         = CAST_TO_TYPEOF(emit) S390FN(emit_S390Instr);
+         vassert(vta->archinfo_host.endness == VexEndnessBE);
+         break;
 
-      // Stay sane.  Only the top level block has no branch to it and no
-      // resume point.
-      if (qe->oolVec == rcode->insns) {
-         // This is the top level block
-         vassert(!qe->jumpToOOLpoint_valid);
-         vassert(!qe->resumePoint_valid);
-      } else {
-         vassert(qe->jumpToOOLpoint_valid);
-         vassert(qe->resumePoint_valid);
-         // In the future, we might be able to allow the resume point to be
-         // invalid for non-top-level blocks, if the block contains an
-         // unconditional exit.  Currently the IR can't represent that, so
-         // the assertion is valid.
-      }
+      case VexArchARM:
+         mode64       = False;
+         rRegUniv     = ARMFN(getRRegUniverse_ARM());
+         isMove       = CAST_TO_TYPEOF(isMove) ARMFN(isMove_ARMInstr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) ARMFN(getRegUsage_ARMInstr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) ARMFN(mapRegs_ARMInstr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) ARMFN(genSpill_ARM);
+         genReload    = CAST_TO_TYPEOF(genReload) ARMFN(genReload_ARM);
+         genMove      = CAST_TO_TYPEOF(genMove) ARMFN(genMove_ARM);
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) ARMFN(ppARMInstr);
+         ppReg        = CAST_TO_TYPEOF(ppReg) ARMFN(ppHRegARM);
+         iselSB       = ARMFN(iselSB_ARM);
+         emit         = CAST_TO_TYPEOF(emit) ARMFN(emit_ARMInstr);
+         vassert(vta->archinfo_host.endness == VexEndnessLE);
+         break;
 
-      // Processing |qe|
-      if (qe->jumpToOOLpoint_valid) {
-         // patch qe->jmpToOOLpoint to jump to |here|
-         if (UNLIKELY(verbose_asm)) {
-            vex_printf("  -- APPLY ");
-            ppRelocation(qe->jumpToOOLpoint);
-            vex_printf("\n");
-         }
-         applyRelocation(qe->jumpToOOLpoint, &vta->host_bytes[0],
-                         cursor, cursor, vta->archinfo_host.endness,
-                         verbose_asm);
-      }
+      case VexArchARM64:
+         mode64       = True;
+         rRegUniv     = ARM64FN(getRRegUniverse_ARM64());
+         isMove       = CAST_TO_TYPEOF(isMove) ARM64FN(isMove_ARM64Instr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) ARM64FN(getRegUsage_ARM64Instr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) ARM64FN(mapRegs_ARM64Instr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) ARM64FN(genSpill_ARM64);
+         genReload    = CAST_TO_TYPEOF(genReload) ARM64FN(genReload_ARM64);
+         genMove      = CAST_TO_TYPEOF(genMove) ARM64FN(genMove_ARM64);
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) ARM64FN(ppARM64Instr);
+         ppReg        = CAST_TO_TYPEOF(ppReg) ARM64FN(ppHRegARM64);
+         iselSB       = ARM64FN(iselSB_ARM64);
+         emit         = CAST_TO_TYPEOF(emit) ARM64FN(emit_ARM64Instr);
+         vassert(vta->archinfo_host.endness == VexEndnessLE);
+         break;
 
-      // Initialise the stack, for processing of |qe|.
-      stackPtr = 0; // "contains one element"
+      case VexArchMIPS32:
+         mode64       = False;
+         rRegUniv     = MIPS32FN(getRRegUniverse_MIPS(mode64));
+         isMove       = CAST_TO_TYPEOF(isMove) MIPS32FN(isMove_MIPSInstr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) MIPS32FN(getRegUsage_MIPSInstr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) MIPS32FN(mapRegs_MIPSInstr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) MIPS32FN(genSpill_MIPS);
+         genReload    = CAST_TO_TYPEOF(genReload) MIPS32FN(genReload_MIPS);
+         genMove      = CAST_TO_TYPEOF(genMove) MIPS32FN(genMove_MIPS);
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) MIPS32FN(ppMIPSInstr);
+         ppReg        = CAST_TO_TYPEOF(ppReg) MIPS32FN(ppHRegMIPS);
+         iselSB       = MIPS32FN(iselSB_MIPS);
+         emit         = CAST_TO_TYPEOF(emit) MIPS32FN(emit_MIPSInstr);
+         vassert(vta->archinfo_host.endness == VexEndnessLE
+                 || vta->archinfo_host.endness == VexEndnessBE);
+         break;
 
-      stack[stackPtr].vec        = qe->oolVec;
-      stack[stackPtr].vec_next   = 0;
-      stack[stackPtr].ool_qindex = -1; // INVALID
+      case VexArchMIPS64:
+         mode64       = True;
+         rRegUniv     = MIPS64FN(getRRegUniverse_MIPS(mode64));
+         isMove       = CAST_TO_TYPEOF(isMove) MIPS64FN(isMove_MIPSInstr);
+         getRegUsage  
+            = CAST_TO_TYPEOF(getRegUsage) MIPS64FN(getRegUsage_MIPSInstr);
+         mapRegs      = CAST_TO_TYPEOF(mapRegs) MIPS64FN(mapRegs_MIPSInstr);
+         genSpill     = CAST_TO_TYPEOF(genSpill) MIPS64FN(genSpill_MIPS);
+         genReload    = CAST_TO_TYPEOF(genReload) MIPS64FN(genReload_MIPS);
+         genMove      = CAST_TO_TYPEOF(genMove) MIPS64FN(genMove_MIPS);
+         ppInstr      = CAST_TO_TYPEOF(ppInstr) MIPS64FN(ppMIPSInstr);
+         ppReg        = CAST_TO_TYPEOF(ppReg) MIPS64FN(ppHRegMIPS);
+         iselSB       = MIPS64FN(iselSB_MIPS);
+         emit         = CAST_TO_TYPEOF(emit) MIPS64FN(emit_MIPSInstr);
+         vassert(vta->archinfo_host.endness == VexEndnessLE
+                 || vta->archinfo_host.endness == VexEndnessBE);
+         break;
 
-      // Iterate till the stack is empty.  This effectively does a
-      // depth-first traversal of the hot-path (IL) tree reachable from
-      // here, and at the same time adds any encountered cold-path (OOL)
-      // blocks to the Queue for later processing.  This is the heart of the
-      // flattening algorithm.
-      while (stackPtr >= 0) {
+      default:
+         vpanic("LibVEX_Translate: unsupported host insn set");
+   }
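   /* A note on the dispatch above (a sketch; CAST_TO_TYPEOF's definition is
      assumed, not shown here): each target's functions operate on that
      target's concrete instruction type (ARMInstr, MIPSInstr, ...), whereas
      the generic hook pointers operate on the opaque HInstr.  Assuming
      CAST_TO_TYPEOF(x) amounts to (__typeof__(x)), a line such as

         isMove = CAST_TO_TYPEOF(isMove) ARMFN(isMove_ARMInstr);

      is simply a function-pointer cast to the generic hook's type, so the
      rest of the back end never has to name a concrete instruction type. */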
 
-         if (UNLIKELY(verbose_asm))
-            vex_printf("  -- CONSIDER stack[%d]\n", stackPtr);
+   // Check that the host's hardware capabilities are feasible.  This
+   // function does not return if the hwcaps are infeasible.
+   check_hwcaps(vta->arch_host, vta->archinfo_host.hwcaps);
 
-         HInstrVec* vec        = stack[stackPtr].vec;
-         UInt       vec_next   = stack[stackPtr].vec_next;
-         Int        ool_qindex = stack[stackPtr].ool_qindex;
-         stackPtr--;
 
-         if (vec_next > 0) {
-            // We're resuming the current IL block having just finished
-            // processing a nested IL.  The OOL counterpart to the nested IL
-            // we just finished processing will have to jump back to here.
-            // So we'll need to mark its Queue entry to record that fact.
+   /* Turn it into virtual-registerised code.  Build trees -- this
+      also throws away any dead bindings. */
+   max_ga = ado_treebuild_BB( irsb, preciseMemExnsFn, pxControl );
 
-            // First assert that the OOL actually *is* in the Queue (it
-            // must be, since we can't have processed it yet).
-            vassert(queueOldest <= queueNewest); // "at least 1 entry in Q"
-            vassert(queueOldest <= ool_qindex && ool_qindex <= queueNewest);
+   if (vta->finaltidy) {
+      irsb = vta->finaltidy(irsb);
+   }
 
-            vassert(!queue[ool_qindex].resumePoint_valid);
-            queue[ool_qindex].resumePoint       = cursor;
-            queue[ool_qindex].resumePoint_valid = True;
-            if (UNLIKELY(verbose_asm))
-               vex_printf("  -- RESUME previous IL\n");
-         } else {
-            // We're starting a new IL.  Due to the tail-recursive nature of
-            // entering ILs, this means we can actually only be starting the
-            // outermost (top level) block for this particular Queue entry.
-            vassert(ool_qindex == -1);
-            vassert(vec == qe->oolVec);
-            if (UNLIKELY(verbose_asm))
-               vex_printf("  -- START new IL\n");
-         }
+   vexAllocSanityCheck();
 
-         // Repeatedly process "zero or more simple HInstrs followed by (an
-         // IfThenElse or end-of-block)"
-         while (True) {
+   if (vex_traceflags & VEX_TRACE_TREES) {
+      vex_printf("\n------------------------" 
+                   "  After tree-building "
+                   "------------------------\n\n");
+      ppIRSB ( irsb );
+      vex_printf("\n");
+   }
 
-            // Process "zero or more simple HInstrs"
-            while (vec_next < vec->insns_used
-                   && !isIfThenElse(vec->insns[vec_next])) {
-               AssemblyBufferOffset cursor_next
-                  = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0],
-                                    cursor, cursor_limit, vec->insns[vec_next],
-                                    &emitConsts, vta );
-               if (UNLIKELY(cursor_next == cursor)) {
-                  // We ran out of output space.  Give up.
-                  goto out_of_buffer_space;
-               }
-               vec_next++;
-               cursor = cursor_next;
-            }
+   /* HACK */
+   if (0) {
+      *(vta->host_bytes_used) = 0;
+      res->status = VexTransOK; return;
+   }
+   /* end HACK */
 
-            // Now we've either got to the end of the hot path, or we have
-            // an IfThenElse.
-            if (vec_next >= vec->insns_used)
-               break;
+   if (vex_traceflags & VEX_TRACE_VCODE)
+      vex_printf("\n------------------------" 
+                   " Instruction selection "
+                   "------------------------\n");
 
-            // So we have an IfThenElse.
-            HInstrIfThenElse* hite = isIfThenElse(vec->insns[vec_next]);
-            vassert(hite);
-            vassert(hite->n_phis == 0); // the regalloc will have removed them
+   /* No guest has its IP field at offset zero.  If this fails it
+      means some transformation pass somewhere failed to update/copy
+      irsb->offsIP properly. */
+   vassert(irsb->offsIP >= 16);
 
-            // Put |hite|'s OOL block in the Queue.  We'll deal with it
-            // later.  Also, generate the (skeleton) conditional branch to
-            // it, and collect enough information that we can patch the
-            // branch later, once we know where the destination is.
-            vassert(queueNewest < nQUEUE-1); // else out of Queue space
-            queueNewest++;
-            queue[queueNewest].oolVec = hite->outOfLine;
-            queue[queueNewest].resumePoint_valid = False; // not yet known
-            queue[queueNewest].resumePoint = -1; // invalid
+   vcode = iselSB ( irsb, vta->arch_host,
+                    &vta->archinfo_host, 
+                    &vta->abiinfo_both,
+                    offB_HOST_EvC_COUNTER,
+                    offB_HOST_EvC_FAILADDR,
+                    chainingAllowed,
+                    vta->addProfInc,
+                    max_ga );
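   /* At this point vcode still refers to an arbitrary supply of virtual
      registers; the register allocator below rewrites it in terms of the
      real registers described by rRegUniv. */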
 
-            HInstr* cond_branch
-               = X86Instr_JmpCond(hite->ccOOL,
-                                  queueNewest/*FOR DEBUG PRINTING ONLY*/);
-            AssemblyBufferOffset cursor_next
-               = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0],
-                                 cursor, cursor_limit, cond_branch,
-                                 &emitConsts, vta );
-            if (UNLIKELY(cursor_next == cursor)) {
-               // We ran out of output space.  Give up.
-               goto out_of_buffer_space;
-            }
-            queue[queueNewest].jumpToOOLpoint_valid = True;
-            queue[queueNewest].jumpToOOLpoint
-               = collectRelocInfo_X86(cursor, cond_branch);
+   vexAllocSanityCheck();
 
-            cursor = cursor_next;
+   if (vex_traceflags & VEX_TRACE_VCODE)
+      vex_printf("\n");
 
-            // Now we descend into |hite|'s IL block.  So we need to save
-            // where we are in this block, so we can resume when the inner
-            // one is done.
-            vassert(stackPtr < nSTACK-1); // else out of Stack space
-            stackPtr++;
-            stack[stackPtr].vec        = vec;
-            stack[stackPtr].vec_next   = vec_next+1;
-            stack[stackPtr].ool_qindex = queueNewest;
+   if (vex_traceflags & VEX_TRACE_VCODE) {
+      ppHInstrSB(vcode, isIfThenElse, ppInstr, ppCondCode, mode64);
+   }
 
-            // And now descend into the inner block.  We could have just
-            // pushed its details on the stack and immediately popped them,
-            // but it seems simpler to update |vec| and |vec_next| and continue
-            // directly.
-            if (UNLIKELY(verbose_asm)) {
-               vex_printf("  -- START inner IL\n");
-            }
-            vec      = hite->fallThrough;
-            vec_next = 0;
+   /* Register allocate. */
+   RegAllocControl con = {
+      .univ = rRegUniv, .isMove = isMove, .getRegUsage = getRegUsage,
+      .mapRegs = mapRegs, .isIfThenElse = isIfThenElse, .genSpill = genSpill,
+      .genReload = genReload, .genMove = genMove, .genHInstrITE = genHInstrITE,
+      .directReload = directReload, .guest_sizeB = guest_sizeB,
+      .ppInstr = ppInstr, .ppCondCode = ppCondCode, .ppReg = ppReg,
+      .mode64 = mode64};
+   rcode = doRegisterAllocation(vcode, &con);
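   /* con merely bundles the per-target hooks selected earlier, so that
      doRegisterAllocation itself remains target-independent. */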
 
-            // And continue with "Repeatedly process ..."
-         }
+   vexAllocSanityCheck();
 
-         // Getting here means we've completed an inner IL and now want to
-         // resume the parent IL.  That is, pop a saved context off the
-         // stack.
-      }
+   if (vex_traceflags & VEX_TRACE_RCODE) {
+      vex_printf("\n------------------------" 
+                   " Register-allocated code "
+                   "------------------------\n\n");
+      ppHInstrSB(rcode, isIfThenElse, ppInstr, ppCondCode, mode64);
+      vex_printf("\n");
+   }
 
-      // Hot path is complete.  Now, if a resume point has been recorded,
-      // we have to add a jump back to it.
-      if (qe->resumePoint_valid) {
-         if (0)
-            vex_printf("  // Generate jump to resume point [%03u]\n",
-                       qe->resumePoint);
-         HInstr* jmp = X86Instr_Jmp(cursor, qe->resumePoint);
-         AssemblyBufferOffset cursor_next
-            = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0],
-                              cursor, cursor_limit, jmp,
-                              &emitConsts, vta );
-         if (UNLIKELY(cursor_next == cursor)) {
-            // We ran out of output space.  Give up.
-            goto out_of_buffer_space;
-         }
-         cursor = cursor_next;
-      }
+   /* HACK */
+   if (0) { 
+      *(vta->host_bytes_used) = 0;
+      res->status = VexTransOK; return;
+   }
+   /* end HACK */
 
-      if (UNLIKELY(verbose_asm))
-         vex_printf("END queue[%d]\n\n", qCur);
-      // Finished with this Queue entry.
+   /* Assemble */
+   if (vex_traceflags & VEX_TRACE_ASM) {
+      vex_printf("\n------------------------" 
+                   " Assembly "
+                   "------------------------\n\n");
    }
-   // Queue empty, all blocks processed
 
-   *(vta->host_bytes_used) = cursor;
-   out_used = cursor;
-   ////
-   //// END of the assembler
-   ////////////////////////////////////////////////////////
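   /* In outline, the flattening loop removed above works as follows (a
      sketch only, using the names from the deleted code):

        for each Queue entry qe, oldest first:
           if qe->jumpToOOLpoint_valid:
              patch that skeleton branch so it targets the current cursor
           start the Stack with (qe->oolVec, 0)
           while the Stack is non-empty:
              pop (vec, vec_next); if this resumes a parent IL, record the
                 current cursor as the pending OOL's resume point
              emit simple insns from vec until an IfThenElse or end-of-vec
              on an IfThenElse: enqueue its outOfLine block, emit a skeleton
                 conditional branch to it (collecting a relocation for later
                 patching), push (vec, vec_next+1), and continue with its
                 fallThrough block
           if qe->resumePoint_valid:
              emit an unconditional jump back to qe->resumePoint
   */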
+   Bool assembly_ok = theAssembler( res, vta, isIfThenElse, mode64, rcode );
+   if (!assembly_ok)
+      goto out_of_buffer_space;
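   /* A False return from theAssembler means the generated code did not fit
      into the caller-supplied output buffer, hence the bail-out above. */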
 
    vexAllocSanityCheck();
 
@@ -1657,7 +1676,8 @@ static void libvex_BackEnd ( const VexTranslateArgs *vta,
          j += vta->guest_extents->len[i];
       }
       if (1) vex_printf("VexExpansionRatio %d %d   %d :10\n\n",
-                        j, out_used, (10 * out_used) / (j == 0 ? 1 : j));
+                        j, *(vta->host_bytes_used), 
+                        (10 * *(vta->host_bytes_used)) / (j == 0 ? 1 : j));
    }
 
    vex_traceflags = 0;