From 829549ae33cf40d553e26df28e5cc1e914068c1c Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Tue, 26 Sep 2017 16:30:12 +0200 Subject: [PATCH] First attempt at an assembler for the new IfThenElse stuff. Currently this will only work for x86-linux. --- VEX/priv/host_generic_regs.h | 72 +++++ VEX/priv/host_x86_defs.c | 139 +++++++-- VEX/priv/host_x86_defs.h | 36 ++- VEX/priv/main_main.c | 572 ++++++++++++++++++++++++++++++++--- 4 files changed, 753 insertions(+), 66 deletions(-) diff --git a/VEX/priv/host_generic_regs.h b/VEX/priv/host_generic_regs.h index 4dad207bcb..add30e2e9a 100644 --- a/VEX/priv/host_generic_regs.h +++ b/VEX/priv/host_generic_regs.h @@ -555,6 +555,78 @@ extern HInstrSB* doRegisterAllocation( ); +/*---------------------------------------------------------*/ +/*--- The assembler ---*/ +/*---------------------------------------------------------*/ + +/* A struct that carries constants to the emit_*Instr functions. */ +typedef + struct { + // Are we emitting 64-bit code? Is only relevant on hosts that share + // the same backend for 32- and 64-bit variants: PPC and MIPS. + const Bool mode64; + // What's the endianness of the host? + const VexEndness endness_host; + // Host addresses for various block-exit paths + const void* disp_cp_chain_me_to_slowEP; + const void* disp_cp_chain_me_to_fastEP; + const void* disp_cp_xindir; + const void* disp_cp_xassisted; + } + EmitConstants; + + +// An offset in the assembly buffer. Wrapped up in a struct so we don't +// confuse it with any other kind of number. +//typedef struct { UInt u; } AssemblyBufferOffset; +typedef UInt AssemblyBufferOffset; // for now + + +// Information about a relocation that we will need to do. +// +// What this describes is as follows. Let |where| be an offset in the +// assembly buffer. Let |dst| be some other offset in the assembly buffer. +// +// What this describes is a modification of a 32 bit integer located at +// |where[0..3]|, interpreted in the host's endianness. The 32 bit value at +// that location has bits [bitNoMax .. bitNoMin] inclusive replaced by the +// least significant bits of the following expression +// +// E = (dst - where + sign_extend(bias)) >>signed rshift +// +// and all other bits of that 32 bit value are left unchanged. Observe that +// this description assumes E is signed. If the bits that we copy out of E +// and into the 32 bit word do not sign extend back to E, then the offset is +// too large to be represented and the relocation cannot be performed. +// +// The use of |bitNo{Max,Min}| facilitates writing an arbitrary bitfield in +// the middle of (eg) an ARM branch instruction. For amd64-style 32-bit +// branch offsets we expect the values to be 0 and 31 respectively. +// +// The use of |rshift| facilitates architectures (eg ARM) that use fixed +// length instructions, and hence require the offset to be stated in number +// of instructions instead of (eg on amd64) number of bytes. +// +// The use of |bias| is necessary because on some targets (eg amd64) the +// processor expects the branch offset to be stated relative to the first +// byte of the instruction, but |where| points somewhere further along the +// instruction. Hence we have to add a small negative bias to "back up" +// |where| to the start of the instruction. +// +// Now, we don't actually store |dst|, because we know it "from context" at +// the time we want to perform the relocation: it is the current offset in +// the assembly buffer, for the entire assembly process. 
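+//
+// A concrete example, with illustrative (made up) numbers: suppose an x86
+// "Jcc <32-bit offset>" has its opcode byte at assembly buffer offset 0x40.
+// The offset field then starts 2 bytes into the instruction, so |where| is
+// 0x42, and the next instruction starts at 0x46.  With bitNoMin = 0,
+// bitNoMax = 31, rshift = 0 and bias = -4 (this is exactly the setup that
+// collectRelocInfo_X86 produces), we get
+//
+//    E = dst - 0x42 + (-4)  =  dst - 0x46
+//
+// that is, the branch distance measured from the start of the following
+// instruction, which is what the Jcc encoding requires.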
+typedef + struct { + AssemblyBufferOffset where; // where the relocation should be applied + UChar bitNoMin; // 0 .. 31 inclusive + UChar bitNoMax; // bitNoMin .. 31 inclusive + Int bias; // arbitrary, but in practice very small + UChar rshift; // 0, 1 or 2 only + } + Relocation; + + #endif /* ndef __VEX_HOST_GENERIC_REGS_H */ /*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_x86_defs.c b/VEX/priv/host_x86_defs.c index f4ff04917c..272828ab6f 100644 --- a/VEX/priv/host_x86_defs.c +++ b/VEX/priv/host_x86_defs.c @@ -654,6 +654,21 @@ X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms, vassert(is_sane_RetLoc(rloc)); return i; } +X86Instr* X86Instr_Jmp ( UInt hereOffs, UInt dstOffs ) { + X86Instr* i = LibVEX_Alloc_inline(sizeof(X86Instr)); + i->tag = Xin_Jmp; + i->Xin.Jmp.hereOffs = hereOffs; + i->Xin.Jmp.dstOffs = dstOffs; + return i; +} +X86Instr* X86Instr_JmpCond ( X86CondCode cond, + UInt dst_qentno /* FOR DEBUG PRINTING ONLY */ ) { + X86Instr* i = LibVEX_Alloc_inline(sizeof(X86Instr)); + i->tag = Xin_JmpCond; + i->Xin.JmpCond.cond = cond; + i->Xin.JmpCond.dst_qentno = dst_qentno; + return i; +} X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP, X86CondCode cond, Bool toFastEP ) { X86Instr* i = LibVEX_Alloc_inline(sizeof(X86Instr)); @@ -1001,7 +1016,17 @@ void ppX86Instr ( const X86Instr* i, Bool mode64 ) { i->Xin.Call.regparms); ppRetLoc(i->Xin.Call.rloc); vex_printf("] 0x%x", i->Xin.Call.target); - break; + return; + case Xin_Jmp: + vex_printf("jmp from [%03u] to [%03u] (delta = %d)", + i->Xin.Jmp.hereOffs, i->Xin.Jmp.dstOffs, + (Int)(i->Xin.Jmp.dstOffs) - (Int)(i->Xin.Jmp.hereOffs)); + return; + case Xin_JmpCond: + vex_printf("j%s to queue[%u] (delta currently unknown)", + showX86CondCode(i->Xin.JmpCond.cond), + i->Xin.JmpCond.dst_qentno); + return; case Xin_XDirect: vex_printf("(xDirect) "); vex_printf("if (%%eflags.%s) { ", @@ -1225,7 +1250,7 @@ void ppX86Instr ( const X86Instr* i, Bool mode64 ) { "adcl $0,NotKnownYet+4"); return; case Xin_IfThenElse: - vex_printf("if (!%s) then {...", + vex_printf("if (ccOOL=%s) then IL {...", showX86CondCode(i->Xin.IfThenElse.hite->ccOOL)); return; default: @@ -2162,13 +2187,9 @@ static UChar* push_word_from_tags ( UChar* p, UShort tags ) instruction was a profiler inc, set *is_profInc to True, else leave it unchanged. */ -Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, - UChar* buf, Int nbuf, const X86Instr* i, - Bool mode64, VexEndness endness_host, - const void* disp_cp_chain_me_to_slowEP, - const void* disp_cp_chain_me_to_fastEP, - const void* disp_cp_xindir, - const void* disp_cp_xassisted ) +UInt emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, + UChar* buf, UInt nbuf, const X86Instr* i, + const EmitConstants* emitConsts ) { UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc; @@ -2176,7 +2197,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, UChar* p = &buf[0]; UChar* ptmp; vassert(nbuf >= 32); - vassert(mode64 == False); + vassert(emitConsts->mode64 == False); /* vex_printf("asm ");ppX86Instr(i, mode64); vex_printf("\n"); */ @@ -2484,14 +2505,56 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = toUChar(0xD0 + irno); goto done; + case Xin_Jmp: { + Long deltaLL + = ((Long)(i->Xin.Jmp.hereOffs)) - ((Long)(i->Xin.Jmp.dstOffs)); + /* Stay sane .. */ + vassert(-1000000LL <= deltaLL && deltaLL <= 1000000LL); + Int delta = (Int)deltaLL; + /* The offset that we must encode is actually relative to the start of + the next instruction. 
Also, there are short and long encodings of + this instruction. Try to use the short one if possible. */ + if (delta >= -0x78 && delta <= 0x78) { + delta += 2; + *p++ = toUChar(0xEB); + *p++ = toUChar(delta & 0xFF); + delta >>= 8; + vassert(delta == 0 || delta == -1); + } else { + delta += 5; + *p++ = toUChar(0xE9); + p = emit32(p, (UInt)delta); + } + goto done; + } + + case Xin_JmpCond: { + /* We don't know the destination yet, so just emit this so that it + loops back to itself. The main (host-independent) assembler logic + will later patch it when the destination is known. Until then, + emitting an instruction that jumps to itself seems like a good + idea, since if we mistakenly forget to patch it, the generated code + will spin at this point, and when we attach a debugger, it will be + obvious what has happened. */ + *p++ = toUChar(0x0F); + *p++ = toUChar(0x80 + (UInt)i->Xin.JmpCond.cond); + // FFFFFFFA == -6, which, relative to the next insn, points to + // the start of this one. + *p++ = toUChar(0xFA); + *p++ = toUChar(0xFF); + *p++ = toUChar(0xFF); + *p++ = toUChar(0xFF); + goto done; + } + case Xin_XDirect: { /* NB: what goes on here has to be very closely coordinated with the chainXDirect_X86 and unchainXDirect_X86 below. */ /* We're generating chain-me requests here, so we need to be sure this is actually allowed -- no-redir translations can't use chain-me's. Hence: */ - vassert(disp_cp_chain_me_to_slowEP != NULL); - vassert(disp_cp_chain_me_to_fastEP != NULL); + vassert(emitConsts->disp_cp_chain_me_to_slowEP != NULL); + vassert(emitConsts->disp_cp_chain_me_to_fastEP != NULL); /* Use ptmp for backpatching conditional jumps. */ ptmp = NULL; @@ -2519,8 +2582,9 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, /* movl $disp_cp_chain_me_to_{slow,fast}EP,%edx; */ *p++ = 0xBA; const void* disp_cp_chain_me - = i->Xin.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP - : disp_cp_chain_me_to_slowEP; + = i->Xin.XDirect.toFastEP + ? emitConsts->disp_cp_chain_me_to_fastEP + : emitConsts->disp_cp_chain_me_to_slowEP; p = emit32(p, (UInt)(Addr)disp_cp_chain_me); /* call *%edx */ *p++ = 0xFF; @@ -2543,7 +2607,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, translations without going through the scheduler. That means no XDirects or XIndirs out from no-redir translations. Hence: */ - vassert(disp_cp_xindir != NULL); + vassert(emitConsts->disp_cp_xindir != NULL); /* Use ptmp for backpatching conditional jumps. */ ptmp = NULL; @@ -2563,7 +2627,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, /* movl $disp_indir, %edx */ *p++ = 0xBA; - p = emit32(p, (UInt)(Addr)disp_cp_xindir); + p = emit32(p, (UInt)(Addr)emitConsts->disp_cp_xindir); /* jmp *%edx */ *p++ = 0xFF; *p++ = 0xE2; @@ -2627,7 +2691,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, /* movl $disp_indir, %edx */ *p++ = 0xBA; - p = emit32(p, (UInt)(Addr)disp_cp_xassisted); + p = emit32(p, (UInt)(Addr)emitConsts->disp_cp_xassisted); /* jmp *%edx */ *p++ = 0xFF; *p++ = 0xE2; @@ -3366,7 +3430,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, } bad: - ppX86Instr(i, mode64); + ppX86Instr(i, emitConsts->mode64); vpanic("emit_X86Instr"); /*NOTREACHED*/ @@ -3515,6 +3579,45 @@ VexInvalRange patchProfInc_X86 ( VexEndness endness_host, } +/* Create relocation info needed to patch a branch offset for instruction I + whose first instruction is at WHERE in the assembly buffer. 
*/ +Relocation collectRelocInfo_X86 ( AssemblyBufferOffset where, + X86Instr* i ) +{ + /* Xin_JmpCond produces a conditional branch, of the form + 0F 8x <32-bit-offset> + where 'x' encodes the condition code. + + When we come to patch it so as to jump to a particular destination, we + must be aware that: + + (1) the field we want to patch starts 2 bytes into the instruction. + Hence ".where = where + 2" + + (2) the processor expects the 32-bit-offset value to be relative to + the start of the *next* instruction. The patcher will patch + the location we specified ("where + 2", but that is 4 bytes + before the start of the next instruction. Hence we set the bias + to -4, so that it reduces the computed offset by 4, which makes + it relative to the start of the next instruction. + */ + switch (i->tag) { + case Xin_JmpCond: { + Relocation rel = { .where = where + 2, + .bitNoMin = 0, + .bitNoMax = 31, + .bias = -4, + .rshift = 0 }; + return rel; + } + default: + // We don't expect to be asked to compute relocation information + // for any other kind of instruction. + vpanic("collectRelocInfo_X86"); + } +} + + /*---------------------------------------------------------------*/ /*--- end host_x86_defs.c ---*/ /*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_x86_defs.h b/VEX/priv/host_x86_defs.h index d32ff9862d..ea02c4aa0a 100644 --- a/VEX/priv/host_x86_defs.h +++ b/VEX/priv/host_x86_defs.h @@ -355,6 +355,8 @@ typedef Xin_Sh3232, /* shldl or shrdl */ Xin_Push, /* push (32-bit?) value on stack */ Xin_Call, /* call to address in register */ + Xin_Jmp, /* unconditional branch */ + Xin_JmpCond, /* conditional branch */ Xin_XDirect, /* direct transfer to GA */ Xin_XIndir, /* indirect transfer to GA */ Xin_XAssisted, /* assisted transfer to GA */ @@ -456,6 +458,21 @@ typedef Int regparms; /* 0 .. 3 */ RetLoc rloc; /* where the return value will be */ } Call; + /* Unconditional branch, using the offset (dstOffs - hereOffs). + This could be done with just one offset, but storing two values + makes it easier to understand assembly debug printing. */ + struct { + UInt hereOffs; + UInt dstOffs; + } Jmp; + /* Conditional branch. At the time this is generated, we don't + know the offset and so this merely denotes a conditional branch + with a 32-bit unknown offset. For debug printing ONLY, we also + record the assembler queue entry number. */ + struct { + X86CondCode cond; + UInt dst_qentno; // FOR DEBUG PRINTING ONLY + } JmpCond; /* Update the guest EIP value, then exit requesting to chain to it. May be conditional. Urr, use of Addr32 implicitly assumes that wordsize(guest) == wordsize(host). 
*/ @@ -673,6 +690,9 @@ extern X86Instr* X86Instr_Div ( Bool syned, X86RM* ); extern X86Instr* X86Instr_Sh3232 ( X86ShiftOp, UInt amt, HReg src, HReg dst ); extern X86Instr* X86Instr_Push ( X86RMI* ); extern X86Instr* X86Instr_Call ( X86CondCode, Addr32, Int, RetLoc ); +extern X86Instr* X86Instr_Jmp ( UInt hereOffs, UInt dstOffs ); +extern X86Instr* X86Instr_JmpCond ( X86CondCode, + UInt/*FOR DEBUG PRINTING ONLY*/); extern X86Instr* X86Instr_XDirect ( Addr32 dstGA, X86AMode* amEIP, X86CondCode cond, Bool toFastEP ); extern X86Instr* X86Instr_XIndir ( HReg dstGA, X86AMode* amEIP, @@ -724,14 +744,10 @@ extern void getRegUsage_X86Instr ( HRegUsage*, const X86Instr*, Bool ); extern void mapRegs_X86Instr ( HRegRemap*, X86Instr*, Bool ); extern Bool isMove_X86Instr ( const X86Instr*, HReg*, HReg* ); extern HInstrIfThenElse* isIfThenElse_X86Instr(X86Instr*); -extern Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, - UChar* buf, Int nbuf, const X86Instr* i, - Bool mode64, - VexEndness endness_host, - const void* disp_cp_chain_me_to_slowEP, - const void* disp_cp_chain_me_to_fastEP, - const void* disp_cp_xindir, - const void* disp_cp_xassisted ); +extern UInt emit_X86Instr ( /*MB_MOD*/Bool* is_profInc, + UChar* buf, UInt nbuf, + const X86Instr* i, + const EmitConstants* emitConsts ); extern void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2, HReg rreg, Int offset, Bool ); @@ -775,6 +791,10 @@ extern VexInvalRange patchProfInc_X86 ( VexEndness endness_host, void* place_to_patch, const ULong* location_of_counter ); +/* Create relocation info needed to patch a branch offset for instruction I + whose first instruction is at WHERE in the assembly buffer. */ +extern Relocation collectRelocInfo_X86 ( AssemblyBufferOffset where, + X86Instr* i ); #endif /* ndef __VEX_HOST_X86_DEFS_H */ diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c index 689c2e8cdf..a42e6792e4 100644 --- a/VEX/priv/main_main.c +++ b/VEX/priv/main_main.c @@ -294,7 +294,9 @@ void LibVEX_Init ( } -/* --------- Make a translation. --------- */ +/*---------------------------------------------------------*/ +/*--- The compilation pipeline: front end ---*/ +/*---------------------------------------------------------*/ /* KLUDGE: S390 need to know the hwcaps of the host when generating code. But that info is not passed to emit_S390Instr. Only mode64 is @@ -676,7 +678,7 @@ IRSB* LibVEX_FrontEnd ( /*MOD*/ VexTranslateArgs* vta, True/*must be flat*/, guest_word_type ); /* Do a post-instrumentation cleanup pass. 
*/ - if (vta->instrument1 || vta->instrument2) { + if (1 || vta->instrument1 || vta->instrument2) { do_deadcode_BB( irsb ); irsb = cprop_BB( irsb ); do_deadcode_BB( irsb ); @@ -698,6 +700,199 @@ IRSB* LibVEX_FrontEnd ( /*MOD*/ VexTranslateArgs* vta, } +/*---------------------------------------------------------*/ +/*--- The compilation pipeline: back end ---*/ +/*---------------------------------------------------------*/ + +/* ---- Code patching helpers for the assembler ---- */ + +static UInt loadUnaligned32 ( UChar* where, VexEndness endness ) +{ + UInt w32 = 0; + switch (endness) { + case VexEndnessLE: + w32 = (w32 << 8) | where[3]; + w32 = (w32 << 8) | where[2]; + w32 = (w32 << 8) | where[1]; + w32 = (w32 << 8) | where[0]; + break; + case VexEndnessBE: + w32 = (w32 << 8) | where[0]; + w32 = (w32 << 8) | where[1]; + w32 = (w32 << 8) | where[2]; + w32 = (w32 << 8) | where[3]; + break; + default: + vassert(0); + } + return w32; +} + +static void storeUnaligned32 ( UChar* where, UInt w32, VexEndness endness ) +{ + switch (endness) { + case VexEndnessLE: + where[0] = (w32 & 0xFF); w32 >>= 8; + where[1] = (w32 & 0xFF); w32 >>= 8; + where[2] = (w32 & 0xFF); w32 >>= 8; + where[3] = (w32 & 0xFF); w32 >>= 8; + break; + case VexEndnessBE: + where[3] = (w32 & 0xFF); w32 >>= 8; + where[2] = (w32 & 0xFF); w32 >>= 8; + where[1] = (w32 & 0xFF); w32 >>= 8; + where[0] = (w32 & 0xFF); w32 >>= 8; + break; + default: + vassert(0); + } +} + + +/* ---- Helpers for processing relocations ---- */ + +/* Print a relocation. */ +static +void ppRelocation ( Relocation rel ) +{ + vex_printf("Reloc{where=[%03u], bits=[%u..%u], bias=%d, rshift=%u}", + rel.where, (UInt)rel.bitNoMax, (UInt)rel.bitNoMin, + rel.bias, (UInt)rel.rshift); +} + +/* Apply a relocation, so that the instruction detailed in REL (which is + assumed already to be in ASSEMBLY_BUF) will jump to DST. */ +static +void applyRelocation ( Relocation rel, + UChar* assembly_buf, + AssemblyBufferOffset assembly_buf_used, + AssemblyBufferOffset dst, + VexEndness host_endness, + Bool verbose_asm ) +{ + const Bool debug_reloc = False; + + /* Sanity check the location. "+4" because we will patch a 32 bit + field. */ + vassert(rel.where + 4 <= assembly_buf_used); + + /* Figure out where the relocation should be applied. */ + UChar* whereP = assembly_buf + rel.where; + + /* And where the jump destination is. */ + UChar* dstP = assembly_buf + dst; + + /* Compute the 'E' value, using 64 bit arithmetic. See comment on + definition of type Relocation. */ + vassert(rel.rshift <= 2); + Long E = ((Long)(ULong)(HWord)dstP) + ((Long)rel.bias) + - ((Long)(ULong)(HWord)whereP); + E = E >>/*signed*/rel.rshift; + if (debug_reloc) + vex_printf("E = 0x%llx\n", E); + + /* Figure out how many significant bits of E we need, and check that they + sign extend back to E. If they don't, the relocation is impossible to + perform, and the system is in deep trouble -- we have to abort. It + should never fail in practice because the jump distances are small and + so are representable in all reasonable instruction sets. */ + vassert(rel.bitNoMin <= 31); + vassert(rel.bitNoMax <= 31); + vassert(rel.bitNoMax >= rel.bitNoMin); + UInt nBitsOfE = (UInt)rel.bitNoMax - (UInt)rel.bitNoMin + 1; + vassert(nBitsOfE >= 1 && nBitsOfE <= 32); + if (debug_reloc) + vex_printf("nBitsOfE = %u\n", nBitsOfE); + + /* "The lowest nBitsOfE of E sign extend back to E itself". 
*/ + vassert(E == ((E << (64-nBitsOfE)) >>/*signed*/ (64-nBitsOfE))); + + /* Figure out the 32-bit mask for the location at |where| to be modified. + It contains |nBitsOfE| bits that need to be modified. */ + vassert(nBitsOfE + rel.bitNoMin <= 32); + ULong mask64 = ((1ULL << nBitsOfE) - 1) << rel.bitNoMin; + vassert(0 == (mask64 & 0xFFFFFFFF00000000ULL)); + UInt mask = (UInt)mask64; + if (debug_reloc) + vex_printf("mask = 0x%x\n", mask); + UInt eBits = (((UInt)E) << rel.bitNoMin) & mask; + + /* Actually apply it. */ + UInt w32old = loadUnaligned32(whereP, host_endness); + UInt w32new = (w32old & ~mask) | eBits; + + if (UNLIKELY(verbose_asm)) { + vex_printf(" -- RELOC changing 0x%08x to 0x%08x\n", w32old, w32new); + } + storeUnaligned32(whereP, w32new, host_endness); + + /* We are done. Sheesh. */ +} + + +/* ---- A helper for emitting simple instructions ---- */ + +/* Emit a simple insn INSN into BUF at offset BUF_USED. Returns the new + number of bytes used, which will be > BUF_USED to denote success and == + BUF_USED to denote failure (output buffer full). If INSN is a + profiler-inc, *OFFS_PROFINC will be set to show its offset in the output + buffer, else *OFFS_PROFINC will be unchanged. +*/ +static +AssemblyBufferOffset emitSimpleInsn ( /*MB_MOD*/Int* offs_profInc, + UChar* buf, + AssemblyBufferOffset buf_used, + AssemblyBufferOffset buf_limit, + const HInstr* insn, + const EmitConstants* emitConsts, + const VexTranslateArgs* vta ) +{ + /* Emit into a 128 byte temp buffer */ + UChar insn_bytes[128]; + Bool isProfInc = False; + UInt j = emit_X86Instr(&isProfInc, insn_bytes, sizeof(insn_bytes), + insn, emitConsts); + /* Debug printing? */ + if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) { + const Int min_spacing = 9; + vex_printf(" [%03u] ", buf_used); + for (Int k = 0; k < j; k++) { + vex_printf("%02x ", (UInt)insn_bytes[k]); + } + for (Int k = j; k < min_spacing; k++) { + vex_printf(" . "); + } + vex_printf(" "); + ppX86Instr(insn, False); + vex_printf("\n"); + } + /* Will it fit? */ + if (UNLIKELY(buf_used + j > buf_limit)) { + return buf_used; // denotes FAIL + } + /* Do we need to record the ProfInc offset? */ + if (UNLIKELY(isProfInc)) { + vassert(vta->addProfInc); /* else where did it come from? */ + vassert(*offs_profInc == -1); /* there can be only one (tm) */ + vassert(buf_used >= 0); + *offs_profInc = (Int)buf_used; + } + /* Copy to output. */ + { UChar* dst = &vta->host_bytes[buf_used]; + for (UInt k = 0; k < j; k++) { + dst[k] = insn_bytes[k]; + } + } + /* This needs to return a value greater than the original OFFS, in order + to indicate that this function succeeded. That depends on the + assumption that the emit_*Insn call above will emit at least one byte + for any instruction it is given, which seems reasonable. */ + return buf_used + j; +} + + +/* ---- The back end proper ---- */ + /* Back end of the compilation pipeline. Is not exported. */ static void libvex_BackEnd ( const VexTranslateArgs *vta, @@ -737,7 +932,6 @@ static void libvex_BackEnd ( const VexTranslateArgs *vta, Int offB_HOST_EvC_COUNTER; Int offB_HOST_EvC_FAILADDR; Addr max_ga; - UChar insn_bytes[128]; HInstrSB* vcode; HInstrSB* rcode; @@ -1117,49 +1311,340 @@ static void libvex_BackEnd ( const VexTranslateArgs *vta, "------------------------\n\n"); } - out_used = 0; /* tracks along the host_bytes array */ - /* TODO-JIT: This needs another interface when assembler/flattener - is given whole HInstrSB and also pointer to function - which prints emitted bytes. 
*/ - for (UInt i = 0; i < rcode->insns->insns_used; i++) { - HInstr* hi = rcode->insns->insns[i]; - Bool hi_isProfInc = False; - if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) { - ppInstr(hi, mode64); - vex_printf("\n"); + //////////////////////////////////////////////////////// + //// BEGIN the assembler + + // QElem are work Queue elements. The work Queue is the top level data + // structure for the emitter. It is initialised with the HInstrVec* of + // the overall HInstrSB. Every OOL HInstrVec* in the tree will at some + // point be present in the Queue. IL HInstrVec*s are never present in + // the Queue because the inner emitter loop processes them in-line, using + // a Stack (see below) to keep track of its nesting level. + // + // The Stack (see below) is empty before and after every Queue element is + // processed. In other words, the Stack only holds state needed during + // the processing of a single Queue element. + // + // The ordering of elements in the Queue is irrelevant -- correct code + // will be emitted even with set semantics (arbitrary order). However, + // the FIFOness of the queue is believed to generate code in which + // colder and colder code (more deeply nested OOLs) is placed further + // and further from the start of the emitted machine code, which sounds + // like a layout which should minimise icache misses. + // + // QElems also contain two pieces of jump-fixup information. When we + // finally come to process a QElem, we need to know: + // + // * |jumpToOOLpoint|: the place which wants to jump to the start of the + // emitted insns for this QElem. We must have already emitted that, + // since it will be the conditional jump that leads to this QElem (OOL + // block). + // + // * |resumePoint|: the place we should jump back to after the QElem is + // finished (the "resume point"), which is the emitted code of the + // HInstr immediately following the HInstrIfThenElse that has this + // QElem as its OOL block. + // + // When the QElem is processed, we know both the |jumpToOOLpoint| and + // the |resumePoint|, and so the first can be patched, and the second + // we generate an instruction to jump to. + // + // There are three complications with patching: + // + // (1) per comments on Stack elems, we do not know the |resumePoint| when + // creating a QElem. That will only be known when processing of the + // corresponding IL block is completed. + // + // (2) The top level HInstrVec* has neither a |jumpToOOLpoint| nor a + // |resumePoint|. + // + // (3) Non-top-level OOLs may not have a valid |resumePoint| if they do + // an unconditional IR-level Exit. We can generate the resume point + // branch, but it will be never be used. + typedef + struct { + // The HInstrs for this OOL. + HInstrVec* oolVec; + // Where we should patch to jump to the OOL ("how do we get here?") + Bool jumpToOOLpoint_valid; + Relocation jumpToOOLpoint; + // Resume point offset, in bytes from start of output buffer + // ("where do we go after this block is completed?") + Bool resumePoint_valid; + AssemblyBufferOffset resumePoint; } - Int j = emit(&hi_isProfInc, - insn_bytes, sizeof insn_bytes, hi, - mode64, vta->archinfo_host.endness, - vta->disp_cp_chain_me_to_slowEP, - vta->disp_cp_chain_me_to_fastEP, - vta->disp_cp_xindir, - vta->disp_cp_xassisted); - if (UNLIKELY(vex_traceflags & VEX_TRACE_ASM)) { - for (Int k = 0; k < j; k++) - vex_printf("%02x ", (UInt)insn_bytes[k]); - vex_printf("\n\n"); + QElem; + + + // SElem are stack elements. 
When we suspend processing a HInstrVec* in + // order to process an IL path in an IfThenElse, we push the HInstrVec* + // and the next index to process on the stack, so that we know where to + // resume when the nested IL sequence is completed. |vec| and |vec_next| + // record the resume HInstr. + // + // A second effect of processing a nested IL sequence is that we will + // have to (later) process the corresponding OOL sequence. And that OOL + // sequence will have to finish with a jump back to the "resume point" + // (the emitted instruction immediately following the IfThenElse). We + // only know the offset of the resume point instruction in the output + // buffer when we actually resume emitted from there -- that is, when the + // entry we pushed, is popped. So, when we pop, we must mark the + // corresponding OOL entry in the Queue to record there the resume point + // offset. For this reason we also carry |ool_qindex|, which is the + // index of the corresponding OOL entry in the Queue. + typedef + struct { + HInstrVec* vec; // resume point HInstr vector + UInt vec_next; // resume point HInstr vector index + Int ool_qindex; // index in Queue of OOL to mark when we resume } - if (UNLIKELY(out_used + j > vta->host_bytes_size)) { - vexSetAllocModeTEMP_and_clear(); - vex_traceflags = 0; - res->status = VexTransOutputFull; - return; + SElem; + + // The Stack. The stack depth is bounded by maximum number of nested + // hot (IL) sections, so in practice it is going to be very small. + const Int nSTACK = 4; + + SElem stack[nSTACK]; + Int stackPtr; // points to most recently pushed entry <=> "-1 means empty" + + // The Queue. The queue size is bounded by the number of cold (OOL) + // sections in the entire HInstrSB, so it's also going to be pretty + // small. + const Int nQUEUE = 8; + + QElem queue[nQUEUE]; + Int queueOldest; // index of oldest entry, initially 0 + Int queueNewest; // index of newest entry, + // initially -1, otherwise must be >= queueOldest + + /////////////////////////////////////////////////////// + + const Bool verbose_asm = (vex_traceflags & VEX_TRACE_ASM) != 0; + + const EmitConstants emitConsts + = { .mode64 = mode64, + .endness_host = vta->archinfo_host.endness, + .disp_cp_chain_me_to_slowEP = vta->disp_cp_chain_me_to_slowEP, + .disp_cp_chain_me_to_fastEP = vta->disp_cp_chain_me_to_fastEP, + .disp_cp_xindir = vta->disp_cp_xindir, + .disp_cp_xassisted = vta->disp_cp_xassisted }; + + AssemblyBufferOffset cursor = 0; + AssemblyBufferOffset cursor_limit = vta->host_bytes_size; + + queueOldest = 0; + queueNewest = -1; + + vassert(queueNewest < nQUEUE); + queueNewest++; + { + QElem* qe = &queue[queueNewest]; + vex_bzero(qe, sizeof(*qe)); + qe->oolVec = rcode->insns; + qe->jumpToOOLpoint_valid = False; + qe->resumePoint_valid = False; + } + vassert(queueNewest == 0); + + /* Main loop, processing Queue entries, until there are no more. */ + while (queueOldest <= queueNewest) { + + Int qCur = queueOldest; + if (UNLIKELY(verbose_asm)) + vex_printf("BEGIN queue[%d]\n", qCur); + + // Take the oldest entry in the queue + QElem* qe = &queue[queueOldest]; + queueOldest++; + + // Stay sane. Only the top level block has no branch to it and no + // resume point. 
+ if (qe->oolVec == rcode->insns) { + // This is the top level block + vassert(!qe->jumpToOOLpoint_valid); + vassert(!qe->resumePoint_valid); + } else { + vassert(qe->jumpToOOLpoint_valid); + vassert(qe->resumePoint_valid); + // In the future, we might be able to allow the resume point to be + // invalid for non-top-level blocks, if the block contains an + // unconditional exit. Currently the IR can't represent that, so + // the assertion is valid. + } + + // Processing |qe| + if (qe->jumpToOOLpoint_valid) { + // patch qe->jmpToOOLpoint to jump to |here| + if (UNLIKELY(verbose_asm)) { + vex_printf(" -- APPLY "); + ppRelocation(qe->jumpToOOLpoint); + vex_printf("\n"); + } + applyRelocation(qe->jumpToOOLpoint, &vta->host_bytes[0], + cursor, cursor, vta->archinfo_host.endness, + verbose_asm); } - if (UNLIKELY(hi_isProfInc)) { - vassert(vta->addProfInc); /* else where did it come from? */ - vassert(res->offs_profInc == -1); /* there can be only one (tm) */ - vassert(out_used >= 0); - res->offs_profInc = out_used; + + // Initialise the stack, for processing of |qe|. + stackPtr = 0; // "contains one element" + + stack[stackPtr].vec = qe->oolVec; + stack[stackPtr].vec_next = 0; + stack[stackPtr].ool_qindex = -1; // INVALID + + // Iterate till the stack is empty. This effectively does a + // depth-first traversal of the hot-path (IL) tree reachable from + // here, and at the same time adds any encountered cold-path (OOL) + // blocks to the Queue for later processing. This is the heart of the + // flattening algorithm. + while (stackPtr >= 0) { + + if (UNLIKELY(verbose_asm)) + vex_printf(" -- CONSIDER stack[%d]\n", stackPtr); + + HInstrVec* vec = stack[stackPtr].vec; + UInt vec_next = stack[stackPtr].vec_next; + Int ool_qindex = stack[stackPtr].ool_qindex; + stackPtr--; + + if (vec_next > 0) { + // We're resuming the current IL block having just finished + // processing a nested IL. The OOL counterpart to the nested IL + // we just finished processing will have to jump back to here. + // So we'll need to mark its Queue entry to record that fact. + + // First assert that the OOL actually *is* in the Queue (it + // must be, since we can't have processed it yet). + vassert(queueOldest <= queueNewest); // "at least 1 entry in Q" + vassert(queueOldest <= ool_qindex && ool_qindex <= queueNewest); + + vassert(!queue[ool_qindex].resumePoint_valid); + queue[ool_qindex].resumePoint = cursor; + queue[ool_qindex].resumePoint_valid = True; + if (UNLIKELY(verbose_asm)) + vex_printf(" -- RESUME previous IL\n"); + } else { + // We're starting a new IL. Due to the tail-recursive nature of + // entering ILs, this means we can actually only be starting the + // outermost (top level) block for this particular Queue entry. + vassert(ool_qindex == -1); + vassert(vec == qe->oolVec); + if (UNLIKELY(verbose_asm)) + vex_printf(" -- START new IL\n"); + } + + // Repeatedly process "zero or more simple HInstrs followed by (an + // IfThenElse or end-of-block)" + while (True) { + + // Process "zero or more simple HInstrs" + while (vec_next < vec->insns_used + && !isIfThenElse(vec->insns[vec_next])) { + AssemblyBufferOffset cursor_next + = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0], + cursor, cursor_limit, vec->insns[vec_next], + &emitConsts, vta ); + if (UNLIKELY(cursor_next == cursor)) { + // We ran out of output space. Give up. + goto out_of_buffer_space; + } + vec_next++; + cursor = cursor_next; + } + + // Now we've either got to the end of the hot path, or we have + // an IfThenElse. 
+ if (vec_next >= vec->insns_used) + break; + + // So we have an IfThenElse. + HInstrIfThenElse* hite = isIfThenElse(vec->insns[vec_next]); + vassert(hite); + vassert(hite->n_phis == 0); // the regalloc will have removed them + + // Put |ite|'s OOL block in the Queue. We'll deal with it + // later. Also, generate the (skeleton) conditional branch to it, + // and collect enough information that we can create patch the + // branch later, once we know where the destination is. + vassert(queueNewest < nQUEUE-1); // else out of Queue space + queueNewest++; + queue[queueNewest].oolVec = hite->outOfLine; + queue[queueNewest].resumePoint_valid = False; // not yet known + queue[queueNewest].resumePoint = -1; // invalid + + HInstr* cond_branch + = X86Instr_JmpCond(hite->ccOOL, + queueNewest/*FOR DEBUG PRINTING ONLY*/); + AssemblyBufferOffset cursor_next + = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0], + cursor, cursor_limit, cond_branch, + &emitConsts, vta ); + if (UNLIKELY(cursor_next == cursor)) { + // We ran out of output space. Give up. + goto out_of_buffer_space; + } + queue[queueNewest].jumpToOOLpoint_valid = True; + queue[queueNewest].jumpToOOLpoint + = collectRelocInfo_X86(cursor, cond_branch); + + cursor = cursor_next; + + // Now we descend into |ite's| IL block. So we need to save + // where we are in this block, so we can resume when the inner + // one is done. + vassert(stackPtr < nSTACK-1); // else out of Stack space + stackPtr++; + stack[stackPtr].vec = vec; + stack[stackPtr].vec_next = vec_next+1; + stack[stackPtr].ool_qindex = queueNewest; + + // And now descend into the inner block. We could have just + // pushed its details on the stack and immediately pop it, but + // it seems simpler to update |vec| and |vec_next| and continue + // directly. + if (UNLIKELY(verbose_asm)) { + vex_printf(" -- START inner IL\n"); + } + vec = hite->fallThrough; + vec_next = 0; + + // And continue with "Repeatedly process ..." + } + + // Getting here means we've completed an inner IL and now want to + // resume the parent IL. That is, pop a saved context off the + // stack. } - { UChar* dst = &vta->host_bytes[out_used]; - for (Int k = 0; k < j; k++) { - dst[k] = insn_bytes[k]; - } - out_used += j; + + // Hot path is complete. Now, probably, we have to add a jump + // back to the resume point. + if (qe->resumePoint_valid) { + if (0) + vex_printf(" // Generate jump to resume point [%03u]\n", + qe->resumePoint); + HInstr* jmp = X86Instr_Jmp(cursor, qe->resumePoint); + AssemblyBufferOffset cursor_next + = emitSimpleInsn( &(res->offs_profInc), &vta->host_bytes[0], + cursor, cursor_limit, jmp, + &emitConsts, vta ); + if (UNLIKELY(cursor_next == cursor)) { + // We ran out of output space. Give up. + goto out_of_buffer_space; + } + cursor = cursor_next; } + + if (UNLIKELY(verbose_asm)) + vex_printf("END queue[%d]\n\n", qCur); + // Finished with this Queue entry. } - *(vta->host_bytes_used) = out_used; + // Queue empty, all blocks processed + + *(vta->host_bytes_used) = cursor; + out_used = cursor; + //// + //// END of the assembler + //////////////////////////////////////////////////////// vexAllocSanityCheck(); @@ -1178,6 +1663,13 @@ static void libvex_BackEnd ( const VexTranslateArgs *vta, vex_traceflags = 0; res->status = VexTransOK; return; + + // Return path for when we've run out of output buffer space. + out_of_buffer_space: + vexSetAllocModeTEMP_and_clear(); + vex_traceflags = 0; + res->status = VexTransOutputFull; + return; } -- 2.47.2