From c144a390d18859fe32e946428d9d23b48317a8fd Mon Sep 17 00:00:00 2001
From: Julian Seward <jseward@acm.org>
Date: Thu, 1 May 2008 20:13:04 +0000
Subject: [PATCH] Merge branches/OTRACK_BY_INSTRUMENTATION into the trunk. 
 This provides vex-side support for origin tracking in Memcheck.

git-svn-id: svn://svn.valgrind.org/vex/trunk@1832
---
 VEX/priv/guest-amd64/toIR.c        |  27 +++++---
 VEX/priv/guest-ppc/toIR.c          |  41 +++++++++---
 VEX/priv/guest-x86/toIR.c          |   2 +-
 VEX/priv/host-amd64/isel.c         |  18 +++++
 VEX/priv/host-generic/reg_alloc2.c |  43 +++++++++---
 VEX/priv/host-ppc/hdefs.h          |   2 +-
 VEX/priv/host-ppc/isel.c           |  42 ++++++++++--
 VEX/priv/host-x86/isel.c           |  11 +++
 VEX/priv/ir/irdefs.c               |  19 ++++--
 VEX/priv/ir/iropt.c                |  37 +++++++---
 VEX/pub/libvex.h                   |  16 +++--
 VEX/pub/libvex_guest_amd64.h       |  25 ++++---
 VEX/pub/libvex_guest_ppc32.h       |  67 ++++++++++---------
 VEX/pub/libvex_guest_ppc64.h       |  67 ++++++++++---------
 VEX/pub/libvex_guest_x86.h         |   2 +-
 VEX/pub/libvex_ir.h                |  12 ++--
 VEX/test_main.c                    | 104 ++++++++++++++++-------------
 17 files changed, 355 insertions(+), 180 deletions(-)

diff --git a/VEX/priv/guest-amd64/toIR.c b/VEX/priv/guest-amd64/toIR.c
index 652737aa62..bb6e0718f5 100644
--- a/VEX/priv/guest-amd64/toIR.c
+++ b/VEX/priv/guest-amd64/toIR.c
@@ -1653,7 +1653,7 @@ static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
       may require reading all four thunk fields. */
    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
-   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(res)) );
+   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
 }
 
@@ -1944,12 +1944,15 @@ void jcc_01 ( AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
    }
 }
 
-/* Let new_rsp be the %rsp value after a call/return.  This function
-   generates an AbiHint to say that -128(%rsp) .. -1(%rsp) should now
-   be regarded as uninitialised.
+/* Let new_rsp be the %rsp value after a call/return.  Let nia be the
+   guest address of the next instruction to be executed.
+
+   This function generates an AbiHint to say that -128(%rsp)
+   .. -1(%rsp) should now be regarded as uninitialised.
 */
 static 
-void make_redzone_AbiHint ( VexAbiInfo* vbi, IRTemp new_rsp, HChar* who )
+void make_redzone_AbiHint ( VexAbiInfo* vbi,
+                            IRTemp new_rsp, IRTemp nia, HChar* who )
 {
    Int szB = vbi->guest_stack_redzone_size;
    vassert(szB >= 0);
@@ -1961,10 +1964,12 @@ void make_redzone_AbiHint ( VexAbiInfo* vbi, IRTemp new_rsp, HChar* who )
 
    if (0) vex_printf("AbiHint: %s\n", who);
    vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
+   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
    if (szB > 0)
       stmt( IRStmt_AbiHint( 
                binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)), 
-               szB
+               szB,
+               mkexpr(nia)
             ));
 }
 
@@ -3713,7 +3718,7 @@ ULong dis_Grp5 ( VexAbiInfo* vbi,
             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
             putIReg64(R_RSP, mkexpr(t2));
             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
-            make_redzone_AbiHint(vbi, t2, "call-Ev(reg)");
+            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
             jmp_treg(Ijk_Call,t3);
             dres->whatNext = Dis_StopHere;
             showSz = False;
@@ -3767,7 +3772,7 @@ ULong dis_Grp5 ( VexAbiInfo* vbi,
             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
             putIReg64(R_RSP, mkexpr(t2));
             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
-            make_redzone_AbiHint(vbi, t2, "call-Ev(mem)");
+            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
             jmp_treg(Ijk_Call,t3);
             dres->whatNext = Dis_StopHere;
             showSz = False;
@@ -7679,7 +7684,7 @@ void dis_ret ( VexAbiInfo* vbi, ULong d64 )
    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
    putIReg64(R_RSP, mkexpr(t3));
-   make_redzone_AbiHint(vbi, t3, "ret");
+   make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
    jmp_treg(Ijk_Ret,t2);
 }
 
@@ -13494,7 +13499,9 @@ DisResult disInstr_AMD64_WRK (
       assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
       putIReg64(R_RSP, mkexpr(t1));
       storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
-      make_redzone_AbiHint(vmi, t1, "call-d32");
+      t2 = newTemp(Ity_I64);
+      assign(t2, mkU64((Addr64)d64));
+      make_redzone_AbiHint(vmi, t1, t2/*nia*/, "call-d32");
       if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
          /* follow into the call target. */
          dres.whatNext   = Dis_Resteer;
diff --git a/VEX/priv/guest-ppc/toIR.c b/VEX/priv/guest-ppc/toIR.c
index 6cd5e8dfaf..414c272a67 100644
--- a/VEX/priv/guest-ppc/toIR.c
+++ b/VEX/priv/guest-ppc/toIR.c
@@ -1211,24 +1211,31 @@ static IRExpr* addr_align( IRExpr* addr, UChar align )
 /* Generate AbiHints which mark points at which the ELF or PowerOpen
    ABIs say that the stack red zone (viz, -N(r1) .. -1(r1), for some
    N) becomes undefined.  That is at function calls and returns.  ELF
-   ppc32 doesn't have this "feature" (how fortunate for it).
+   ppc32 doesn't have this "feature" (how fortunate for it).  nia is
+   the address of the next instruction to be executed.
 */
-static void make_redzone_AbiHint ( VexAbiInfo* vbi, HChar* who )
+static void make_redzone_AbiHint ( VexAbiInfo* vbi, 
+                                   IRTemp nia, HChar* who )
 {
    Int szB = vbi->guest_stack_redzone_size;
    if (0) vex_printf("AbiHint: %s\n", who);
    vassert(szB >= 0);
    if (szB > 0) {
-      if (mode64)
+      if (mode64) {
+         vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
          stmt( IRStmt_AbiHint( 
                   binop(Iop_Sub64, getIReg(1), mkU64(szB)), 
-                  szB
+                  szB,
+                  mkexpr(nia)
          ));
-      else
+      } else {
+         vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I32);
          stmt( IRStmt_AbiHint( 
                   binop(Iop_Sub32, getIReg(1), mkU32(szB)), 
-                  szB
+                  szB,
+                  mkexpr(nia)
          ));
+      }
    }
 }
 
@@ -4308,9 +4315,12 @@ static Bool dis_branch ( UInt theInstr,
       if (flag_LK) {
          putGST( PPC_GST_LR, e_nia );
          if (vbi->guest_ppc_zap_RZ_at_bl
-             && vbi->guest_ppc_zap_RZ_at_bl( (ULong)tgt) )
-            make_redzone_AbiHint( vbi, 
+             && vbi->guest_ppc_zap_RZ_at_bl( (ULong)tgt) ) {
+            IRTemp t_tgt = newTemp(ty);
+            assign(t_tgt, mode64 ? mkU64(tgt) : mkU32(tgt) );
+            make_redzone_AbiHint( vbi, t_tgt,
                                   "branch-and-link (unconditional call)" );
+         }
       }
 
       if (resteerOkFn( callback_opaque, tgt )) {
@@ -4379,6 +4389,8 @@ static Bool dis_branch ( UInt theInstr,
          
          assign( cond_ok, branch_cond_ok( BO, BI ) );
 
+         /* FIXME: this is confusing.  lr_old holds the old value
+            of ctr, not lr :-) */
          assign( lr_old, addr_align( getGST( PPC_GST_CTR ), 4 ));
 
          if (flag_LK)
@@ -4388,7 +4400,12 @@ static Bool dis_branch ( UInt theInstr,
                   binop(Iop_CmpEQ32, mkexpr(cond_ok), mkU32(0)),
                   Ijk_Boring,
                   c_nia ));
-         
+
+         if (flag_LK && vbi->guest_ppc_zap_RZ_at_bl) {
+            make_redzone_AbiHint( vbi, lr_old,
+                                  "b-ctr-l (indirect call)" );
+	 }
+
          irsb->jumpkind = flag_LK ? Ijk_Call : Ijk_Boring;
          irsb->next     = mkexpr(lr_old);
          break;
@@ -4424,8 +4441,10 @@ static Bool dis_branch ( UInt theInstr,
                   Ijk_Boring,
                   c_nia ));
 
-	 if (vanilla_return && vbi->guest_ppc_zap_RZ_at_blr)
-            make_redzone_AbiHint( vbi, "branch-to-lr (unconditional return)" );
+         if (vanilla_return && vbi->guest_ppc_zap_RZ_at_blr) {
+            make_redzone_AbiHint( vbi, lr_old,
+                                  "branch-to-lr (unconditional return)" );
+         }
 
          /* blrl is pretty strange; it's like a return that sets the
             return address of its caller to the insn following this
diff --git a/VEX/priv/guest-x86/toIR.c b/VEX/priv/guest-x86/toIR.c
index 26b5af40fa..7e6c1f52ae 100644
--- a/VEX/priv/guest-x86/toIR.c
+++ b/VEX/priv/guest-x86/toIR.c
@@ -958,7 +958,7 @@ static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
       may require reading all four thunk fields. */
    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
    stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
-   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(res)) );
+   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
 }
 
diff --git a/VEX/priv/host-amd64/isel.c b/VEX/priv/host-amd64/isel.c
index 94ee65ca66..a9909f8cb9 100644
--- a/VEX/priv/host-amd64/isel.c
+++ b/VEX/priv/host-amd64/isel.c
@@ -1150,6 +1150,24 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
 
       /* Handle misc other ops. */
 
+      if (e->Iex.Binop.op == Iop_Max32U) {
+         /* This generates a truly rotten piece of code.  Just as well
+            it doesn't happen very often. */
+         HReg src1  = iselIntExpr_R(env, e->Iex.Binop.arg1);
+         HReg src1L = newVRegI(env);
+         HReg src2  = iselIntExpr_R(env, e->Iex.Binop.arg2);
+         HReg src2L = newVRegI(env);
+         HReg dst   = newVRegI(env);
+         addInstr(env, mk_iMOVsd_RR(src1,dst));
+         addInstr(env, mk_iMOVsd_RR(src1,src1L));
+         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, src1L));
+         addInstr(env, mk_iMOVsd_RR(src2,src2L));
+         addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, src2L));
+         addInstr(env, AMD64Instr_Alu64R(Aalu_CMP, AMD64RMI_Reg(src2L), src1L));
+         addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
+         return dst;
+      }
+
       if (e->Iex.Binop.op == Iop_DivModS64to32
           || e->Iex.Binop.op == Iop_DivModU64to32) {
          /* 64 x 32 -> (32(rem),32(div)) division */
diff --git a/VEX/priv/host-generic/reg_alloc2.c b/VEX/priv/host-generic/reg_alloc2.c
index 5959b73552..c64333daac 100644
--- a/VEX/priv/host-generic/reg_alloc2.c
+++ b/VEX/priv/host-generic/reg_alloc2.c
@@ -215,6 +215,17 @@ Int findMostDistantlyMentionedVReg (
 }
 
 
+/* Check that this vreg has been assigned a sane spill offset. */
+static inline void sanity_check_spill_offset ( VRegLR* vreg )
+{
+   if (vreg->reg_class == HRcVec128 || vreg->reg_class == HRcFlt64) {
+      vassert(0 == ((UShort)vreg->spill_offset % 16));
+   } else {
+      vassert(0 == ((UShort)vreg->spill_offset % 8));
+   }
+}
+
+
 /* Double the size of the real-reg live-range array, if needed. */
 static void ensureRRLRspace ( RRegLR** info, Int* size, Int used )
 {
@@ -396,8 +407,9 @@ HInstrArray* doRegisterAllocation (
       not at each insn processed. */
    Bool do_sanity_check;
 
-   vassert(0 == LibVEX_N_SPILL_BYTES % 16);
-   vassert(0 == guest_sizeB % 8);
+   vassert(0 == (guest_sizeB % 16));
+   vassert(0 == (LibVEX_N_SPILL_BYTES % 16));
+   vassert(0 == (N_SPILL64S % 2));
 
    /* The live range numbers are signed shorts, and so limiting the
       number of insns to 10000 comfortably guards against them
@@ -789,6 +801,16 @@ HInstrArray* doRegisterAllocation (
       64 bits to spill (classes Flt64 and Vec128), we have to allocate
       two spill slots.
 
+      For Vec128-class on PowerPC, the spill slot's actual address
+      must be 16-byte aligned.  Since the spill slot's address is
+      computed as an offset from the guest state pointer, and since
+      the user of the generated code must set that pointer to a
+      16-aligned value, we have the residual obligation here of
+      choosing a 16-aligned spill slot offset for Vec128-class values.
+      Since each spill slot is 8 bytes long, that means for
+      Vec128-class values we must allocate a spill slot number which
+      is zero mod 2.
+
       Do a rank-based allocation of vregs to spill slot numbers.  We
       put as few values as possible in spill slots, but nevertheless
       need to have a spill slot available for all vregs, just in case.
@@ -817,16 +839,19 @@ HInstrArray* doRegisterAllocation (
           || vreg_lrs[j].reg_class == HRcFlt64) {
 
          /* Find two adjacent free slots in which between them provide
-            up to 128 bits in which to spill the vreg. */
+            up to 128 bits in which to spill the vreg.  Since we are
+            trying to find an even:odd pair, move along in steps of 2
+            (slots). */
 
-         for (k = 0; k < N_SPILL64S-1; k++)
+         for (k = 0; k < N_SPILL64S-1; k += 2)
             if (ss_busy_until_before[k] <= vreg_lrs[j].live_after
                 && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after)
                break;
-         if (k == N_SPILL64S-1) {
+         if (k >= N_SPILL64S-1) {
             vpanic("LibVEX_N_SPILL_BYTES is too low.  " 
                    "Increase and recompile.");
          }
+         if (0) vex_printf("16-byte spill offset in spill slot %d\n", (Int)k);
          ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
          ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
 
@@ -849,10 +874,12 @@ HInstrArray* doRegisterAllocation (
       }
 
       /* This reflects LibVEX's hard-wired knowledge of the baseBlock
-         layout: the guest state, then an equal sized area following
-         it for shadow state, and then the spill area. */
-      vreg_lrs[j].spill_offset = toShort(guest_sizeB * 2 + k * 8);
+         layout: the guest state, then two equal sized areas following
+         it for two sets of shadow state, and then the spill area. */
+      vreg_lrs[j].spill_offset = toShort(guest_sizeB * 3 + k * 8);
 
+      /* Independent check that we've made a sane choice of slot */
+      sanity_check_spill_offset( &vreg_lrs[j] );
       /* if (j > max_ss_no) */
       /*    max_ss_no = j; */
    }
diff --git a/VEX/priv/host-ppc/hdefs.h b/VEX/priv/host-ppc/hdefs.h
index 68bbcca873..51c3bf1211 100644
--- a/VEX/priv/host-ppc/hdefs.h
+++ b/VEX/priv/host-ppc/hdefs.h
@@ -288,7 +288,7 @@ typedef
    PPCRI;
 
 extern PPCRI* PPCRI_Imm ( ULong );
-extern PPCRI* PPCRI_Reg ( HReg );
+extern PPCRI* PPCRI_Reg( HReg );
 
 extern void ppPPCRI ( PPCRI* );
 
diff --git a/VEX/priv/host-ppc/isel.c b/VEX/priv/host-ppc/isel.c
index 97b61688f3..b0bc3bd21b 100644
--- a/VEX/priv/host-ppc/isel.c
+++ b/VEX/priv/host-ppc/isel.c
@@ -570,8 +570,8 @@ PPCAMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
 
    if (bias < -100 || bias > 100) /* somewhat arbitrarily */
       vpanic("genGuestArrayOffset(ppc host)(3)");
-   if (descr->base < 0 || descr->base > 2000) /* somewhat arbitrarily */
-     vpanic("genGuestArrayOffset(ppc host)(4)");
+   if (descr->base < 0 || descr->base > 4000) /* somewhat arbitrarily */
+      vpanic("genGuestArrayOffset(ppc host)(4)");
 
    /* Compute off into a reg, %off.  Then return:
 
@@ -1367,6 +1367,18 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, IRExpr* e )
          return dst;
       }
 
+      if (e->Iex.Binop.op == Iop_Max32U) {
+         HReg        r1   = iselWordExpr_R(env, e->Iex.Binop.arg1);
+         HReg        r2   = iselWordExpr_R(env, e->Iex.Binop.arg2);
+         HReg        rdst = newVRegI(env);
+         PPCCondCode cc   = mk_PPCCondCode( Pct_TRUE, Pcf_7LT );
+         addInstr(env, mk_iMOVds_RR(rdst, r1));
+         addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+                                    7/*cr*/, rdst, PPCRH_Reg(r2)));
+         addInstr(env, PPCInstr_CMov(cc, rdst, PPCRI_Reg(r2)));
+         return rdst;
+      }
+
       if (e->Iex.Binop.op == Iop_32HLto64) {
          HReg   r_Hi  = iselWordExpr_R(env, e->Iex.Binop.arg1);
          HReg   r_Lo  = iselWordExpr_R(env, e->Iex.Binop.arg2);
@@ -1908,7 +1920,7 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, IRExpr* e )
          addInstr(env, mk_iMOVds_RR(r_dst,rX));
          addInstr(env, PPCInstr_Alu(Palu_AND, r_tmp,
                                     r_cond, PPCRH_Imm(False,0xFF)));
-         addInstr(env, PPCInstr_Cmp(False/*unsined*/, True/*32bit cmp*/,
+         addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
                                     7/*cr*/, r_tmp, PPCRH_Imm(False,0)));
          addInstr(env, PPCInstr_CMov(cc,r_dst,r0));
          return r_dst;
@@ -2672,7 +2684,7 @@ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
             return;
          }
 
-         /* Add64/Sub64 */
+         /* Add64 */
          case Iop_Add64: {
             HReg xLo, xHi, yLo, yHi;
             HReg tLo = newVRegI(env);
@@ -2751,6 +2763,28 @@ static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
          return;
       }
 
+      /* Left64 */
+      case Iop_Left64: {
+         HReg argHi, argLo;
+         HReg zero32 = newVRegI(env);
+         HReg resHi  = newVRegI(env);
+         HReg resLo  = newVRegI(env);
+         iselInt64Expr(&argHi, &argLo, env, e->Iex.Unop.arg);
+         vassert(env->mode64 == False);
+         addInstr(env, PPCInstr_LI(zero32, 0, env->mode64));
+         /* resHi:resLo = - argHi:argLo */
+         addInstr(env, PPCInstr_AddSubC( False/*sub*/, True/*set carry*/,
+                                         resLo, zero32, argLo ));
+         addInstr(env, PPCInstr_AddSubC( False/*sub*/, False/*read carry*/,
+                                         resHi, zero32, argHi ));
+         /* resHi:resLo |= argHi:argLo */
+         addInstr(env, PPCInstr_Alu(Palu_OR, resLo, resLo, PPCRH_Reg(argLo)));
+         addInstr(env, PPCInstr_Alu(Palu_OR, resHi, resHi, PPCRH_Reg(argHi)));
+         *rHi = resHi;
+         *rLo = resLo;
+         return;
+      }
+
       /* 32Sto64(e) */
       case Iop_32Sto64: {
          HReg tHi = newVRegI(env);
diff --git a/VEX/priv/host-x86/isel.c b/VEX/priv/host-x86/isel.c
index 957692d982..0644c5c852 100644
--- a/VEX/priv/host-x86/isel.c
+++ b/VEX/priv/host-x86/isel.c
@@ -926,6 +926,17 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
       }
 
       /* Handle misc other ops. */
+
+      if (e->Iex.Binop.op == Iop_Max32U) {
+         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+         HReg dst  = newVRegI(env);
+         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
+         addInstr(env, mk_iMOVsd_RR(src1,dst));
+         addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
+         addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
+         return dst;
+      }
+
       if (e->Iex.Binop.op == Iop_8HLto16) {
          HReg hi8  = newVRegI(env);
          HReg lo8  = newVRegI(env);
diff --git a/VEX/priv/ir/irdefs.c b/VEX/priv/ir/irdefs.c
index 7279357030..55c5aaa57b 100644
--- a/VEX/priv/ir/irdefs.c
+++ b/VEX/priv/ir/irdefs.c
@@ -210,6 +210,7 @@ void ppIROp ( IROp op )
       case Iop_Left16: vex_printf("Left16"); return;
       case Iop_Left32: vex_printf("Left32"); return;
       case Iop_Left64: vex_printf("Left64"); return;
+      case Iop_Max32U: vex_printf("Max32U"); return;
 
       case Iop_CmpORD32U: vex_printf("CmpORD32U"); return;
       case Iop_CmpORD32S: vex_printf("CmpORD32S"); return;
@@ -768,7 +769,9 @@ void ppIRStmt ( IRStmt* s )
       case Ist_AbiHint:
          vex_printf("====== AbiHint(");
          ppIRExpr(s->Ist.AbiHint.base);
-         vex_printf(", %d) ======", s->Ist.AbiHint.len);
+         vex_printf(", %d, ", s->Ist.AbiHint.len);
+         ppIRExpr(s->Ist.AbiHint.nia);
+         vex_printf(") ======");
          break;
       case Ist_Put:
          vex_printf( "PUT(%d) = ", s->Ist.Put.offset);
@@ -1155,11 +1158,12 @@ IRStmt* IRStmt_IMark ( Addr64 addr, Int len ) {
    s->Ist.IMark.len  = len;
    return s;
 }
-IRStmt* IRStmt_AbiHint ( IRExpr* base, Int len ) {
+IRStmt* IRStmt_AbiHint ( IRExpr* base, Int len, IRExpr* nia ) {
    IRStmt* s           = LibVEX_Alloc(sizeof(IRStmt));
    s->tag              = Ist_AbiHint;
    s->Ist.AbiHint.base = base;
    s->Ist.AbiHint.len  = len;
+   s->Ist.AbiHint.nia  = nia;
    return s;
 }
 IRStmt* IRStmt_Put ( Int off, IRExpr* data ) {
@@ -1383,7 +1387,8 @@ IRStmt* deepCopyIRStmt ( IRStmt* s )
          return IRStmt_NoOp();
       case Ist_AbiHint:
          return IRStmt_AbiHint(deepCopyIRExpr(s->Ist.AbiHint.base),
-                               s->Ist.AbiHint.len);
+                               s->Ist.AbiHint.len,
+                               deepCopyIRExpr(s->Ist.AbiHint.nia));
       case Ist_IMark:
          return IRStmt_IMark(s->Ist.IMark.addr, s->Ist.IMark.len);
       case Ist_Put: 
@@ -1498,6 +1503,7 @@ void typeOfPrimop ( IROp op,
       case Iop_CmpORD32S:
       case Iop_Add32: case Iop_Sub32: case Iop_Mul32:
       case Iop_Or32:  case Iop_And32: case Iop_Xor32:
+      case Iop_Max32U:
          BINARY(Ity_I32,Ity_I32, Ity_I32);
 
       case Iop_Add64: case Iop_Sub64: case Iop_Mul64:
@@ -1982,7 +1988,8 @@ Bool isFlatIRStmt ( IRStmt* st )
 
    switch (st->tag) {
       case Ist_AbiHint:
-         return isIRAtom(st->Ist.AbiHint.base);
+         return isIRAtom(st->Ist.AbiHint.base)
+                && isIRAtom(st->Ist.AbiHint.nia);
       case Ist_Put:
          return isIRAtom(st->Ist.Put.data);
       case Ist_PutI:
@@ -2192,6 +2199,7 @@ void useBeforeDef_Stmt ( IRSB* bb, IRStmt* stmt, Int* def_counts )
          break;
       case Ist_AbiHint:
          useBeforeDef_Expr(bb,stmt,stmt->Ist.AbiHint.base,def_counts);
+         useBeforeDef_Expr(bb,stmt,stmt->Ist.AbiHint.nia,def_counts);
          break;
       case Ist_Put:
          useBeforeDef_Expr(bb,stmt,stmt->Ist.Put.data,def_counts);
@@ -2445,6 +2453,9 @@ void tcStmt ( IRSB* bb, IRStmt* stmt, IRType gWordTy )
          if (typeOfIRExpr(tyenv, stmt->Ist.AbiHint.base) != gWordTy)
             sanityCheckFail(bb,stmt,"IRStmt.AbiHint.base: "
                                     "not :: guest word type");
+         if (typeOfIRExpr(tyenv, stmt->Ist.AbiHint.nia) != gWordTy)
+            sanityCheckFail(bb,stmt,"IRStmt.AbiHint.nia: "
+                                    "not :: guest word type");
          break;
       case Ist_Put:
          tcExpr( bb, stmt, stmt->Ist.Put.data, gWordTy );
diff --git a/VEX/priv/ir/iropt.c b/VEX/priv/ir/iropt.c
index ea0d54c745..a4937af9fb 100644
--- a/VEX/priv/ir/iropt.c
+++ b/VEX/priv/ir/iropt.c
@@ -448,7 +448,8 @@ static void flatten_Stmt ( IRSB* bb, IRStmt* st )
          break;
       case Ist_AbiHint:
          e1 = flatten_Expr(bb, st->Ist.AbiHint.base);
-         addStmtToIRSB(bb, IRStmt_AbiHint(e1, st->Ist.AbiHint.len));
+         e2 = flatten_Expr(bb, st->Ist.AbiHint.nia);
+         addStmtToIRSB(bb, IRStmt_AbiHint(e1, st->Ist.AbiHint.len, e2));
          break;
       case Ist_Exit:
          e1 = flatten_Expr(bb, st->Ist.Exit.guard);
@@ -712,6 +713,7 @@ static void handle_gets_Stmt (
          AbiHints.*/
       case Ist_AbiHint:
          vassert(isIRAtom(st->Ist.AbiHint.base));
+         vassert(isIRAtom(st->Ist.AbiHint.nia));
          /* fall through */
       case Ist_MBE:
       case Ist_Dirty:
@@ -1200,6 +1202,15 @@ static IRExpr* fold_Expr ( IRExpr* e )
                         - e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
                break;
 
+            /* -- Max32U -- */
+            case Iop_Max32U: {
+               UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
+               UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
+               UInt res  = u32a > u32b ? u32a : u32b;
+               e2 = IRExpr_Const(IRConst_U32(res));
+               break;
+            }
+
             /* -- Mul -- */
             case Iop_Mul32:
                e2 = IRExpr_Const(IRConst_U32(
@@ -1421,8 +1432,9 @@ static IRExpr* fold_Expr ( IRExpr* e )
             e2 = e->Iex.Binop.arg1;
          } else
 
-         /* Or32/Add32(x,0) ==> x */
-         if ((e->Iex.Binop.op == Iop_Add32 || e->Iex.Binop.op == Iop_Or32)
+         /* Or32/Add32/Max32U(x,0) ==> x */
+         if ((e->Iex.Binop.op == Iop_Add32 
+              || e->Iex.Binop.op == Iop_Or32 || e->Iex.Binop.op == Iop_Max32U)
              && e->Iex.Binop.arg2->tag == Iex_Const
              && e->Iex.Binop.arg2->Iex.Const.con->Ico.U32 == 0) {
             e2 = e->Iex.Binop.arg1;
@@ -1500,8 +1512,8 @@ static IRExpr* fold_Expr ( IRExpr* e )
             e2 = e->Iex.Binop.arg2;
          } else
 
-         /* Or32(0,x) ==> x */
-         if (e->Iex.Binop.op == Iop_Or32
+         /* Or32/Max32U(0,x) ==> x */
+         if ((e->Iex.Binop.op == Iop_Or32 || e->Iex.Binop.op == Iop_Max32U)
              && e->Iex.Binop.arg1->tag == Iex_Const
              && e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 == 0) {
             e2 = e->Iex.Binop.arg2;
@@ -1516,6 +1528,7 @@ static IRExpr* fold_Expr ( IRExpr* e )
 
          /* Or8/16/32/64(t,t) ==> t, for some IRTemp t */
          /* And8/16/32/64(t,t) ==> t, for some IRTemp t */
+         /* Max32U(t,t) ==> t, for some IRTemp t */
          if (   (e->Iex.Binop.op == Iop_And64
               || e->Iex.Binop.op == Iop_And32
               || e->Iex.Binop.op == Iop_And16
@@ -1523,7 +1536,8 @@ static IRExpr* fold_Expr ( IRExpr* e )
               || e->Iex.Binop.op == Iop_Or64
               || e->Iex.Binop.op == Iop_Or32
               || e->Iex.Binop.op == Iop_Or16
-              || e->Iex.Binop.op == Iop_Or8)
+              || e->Iex.Binop.op == Iop_Or8
+              || e->Iex.Binop.op == Iop_Max32U)
              && sameIRTemps(e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
             e2 = e->Iex.Binop.arg1;
          }
@@ -1697,9 +1711,11 @@ static IRStmt* subst_and_fold_Stmt ( IRExpr** env, IRStmt* st )
    switch (st->tag) {
       case Ist_AbiHint:
          vassert(isIRAtom(st->Ist.AbiHint.base));
+         vassert(isIRAtom(st->Ist.AbiHint.nia));
          return IRStmt_AbiHint(
                    fold_Expr(subst_Expr(env, st->Ist.AbiHint.base)),
-                   st->Ist.AbiHint.len
+                   st->Ist.AbiHint.len,
+                   fold_Expr(subst_Expr(env, st->Ist.AbiHint.nia))
                 );
       case Ist_Put:
          vassert(isIRAtom(st->Ist.Put.data));
@@ -1943,6 +1959,7 @@ static void addUses_Stmt ( Bool* set, IRStmt* st )
    switch (st->tag) {
       case Ist_AbiHint:
          addUses_Expr(set, st->Ist.AbiHint.base);
+         addUses_Expr(set, st->Ist.AbiHint.nia);
          return;
       case Ist_PutI:
          addUses_Expr(set, st->Ist.PutI.ix);
@@ -3211,6 +3228,7 @@ static void deltaIRStmt ( IRStmt* st, Int delta )
          break;
       case Ist_AbiHint:
          deltaIRExpr(st->Ist.AbiHint.base, delta);
+         deltaIRExpr(st->Ist.AbiHint.nia, delta);
          break;
       case Ist_Put:
          deltaIRExpr(st->Ist.Put.data, delta);
@@ -3667,6 +3685,7 @@ static void aoccCount_Stmt ( UShort* uses, IRStmt* st )
    switch (st->tag) {
       case Ist_AbiHint:
          aoccCount_Expr(uses, st->Ist.AbiHint.base);
+         aoccCount_Expr(uses, st->Ist.AbiHint.nia);
          return;
       case Ist_WrTmp: 
          aoccCount_Expr(uses, st->Ist.WrTmp.data); 
@@ -3898,7 +3917,8 @@ static IRStmt* atbSubst_Stmt ( ATmpInfo* env, IRStmt* st )
       case Ist_AbiHint:
          return IRStmt_AbiHint(
                    atbSubst_Expr(env, st->Ist.AbiHint.base),
-                   st->Ist.AbiHint.len
+                   st->Ist.AbiHint.len,
+                   atbSubst_Expr(env, st->Ist.AbiHint.nia)
                 );
       case Ist_Store:
          return IRStmt_Store(
@@ -4231,6 +4251,7 @@ static void considerExpensives ( /*OUT*/Bool* hasGetIorPutI,
       switch (st->tag) {
          case Ist_AbiHint:
             vassert(isIRAtom(st->Ist.AbiHint.base));
+            vassert(isIRAtom(st->Ist.AbiHint.nia));
             break;
          case Ist_PutI: 
             *hasGetIorPutI = True;
diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h
index bf98189433..5cf21ea15f 100644
--- a/VEX/pub/libvex.h
+++ b/VEX/pub/libvex.h
@@ -335,14 +335,18 @@ typedef
 /* A note about guest state layout.
 
    LibVEX defines the layout for the guest state, in the file
-   pub/libvex_guest_<arch>.h.  The struct will have an 8-aligned size.
-   Each translated bb is assumed to be entered with a specified
-   register pointing at such a struct.  Beyond that is a shadow
-   state area with the same size as the struct.  Beyond that is
-   a spill area that LibVEX may spill into.  It must have size
+   pub/libvex_guest_<arch>.h.  The struct will have a 16-aligned
+   size.  Each translated bb is assumed to be entered with a specified
+   register pointing at such a struct.  Beyond that are two copies of
+   the shadow state area with the same size as the struct.  Beyond
+   that is a spill area that LibVEX may spill into.  It must have size
    LibVEX_N_SPILL_BYTES, and this must be a 16-aligned number.
 
-   On entry, the baseblock pointer register must be 8-aligned.
+   On entry, the baseblock pointer register must be 16-aligned.
+
+   There must be no holes in between the primary guest state, its two
+   copies, and the spill area.  In short, all 4 areas must have a
+   16-aligned size and be 16-aligned, and placed back-to-back.
 */
 
 #define LibVEX_N_SPILL_BYTES 2048
diff --git a/VEX/pub/libvex_guest_amd64.h b/VEX/pub/libvex_guest_amd64.h
index 7a648ad915..41c4ad6baa 100644
--- a/VEX/pub/libvex_guest_amd64.h
+++ b/VEX/pub/libvex_guest_amd64.h
@@ -85,8 +85,7 @@ typedef
       /* 144 */ ULong  guest_CC_DEP2;
       /* 152 */ ULong  guest_CC_NDEP;
       /* The D flag is stored here, encoded as either -1 or +1 */
-      /* 160 */ ULong  guest_DFLAG;       /* 48 */
-      /* RIP */
+      /* 160 */ ULong  guest_DFLAG;
       /* 168 */ ULong  guest_RIP;
       /* Probably a lot more stuff too. 
          D,ID flags
@@ -96,16 +95,16 @@ typedef
       */
 
       /* Bit 21 (ID) of eflags stored here, as either 0 or 1. */
-      ULong guest_IDFLAG;
+      /* 176 */ ULong guest_IDFLAG;
 
       /* HACK to make tls on amd64-linux work.  %fs only ever seems to
          hold zero, and so guest_FS_ZERO holds the 64-bit offset
          associated with a %fs value of zero. */
-      ULong guest_FS_ZERO;
+      /* 184 */ ULong guest_FS_ZERO;
 
       /* XMM registers */
-      ULong guest_SSEROUND;
-      U128  guest_XMM0;
+      /* 192 */ULong guest_SSEROUND;
+      /* 200 */U128  guest_XMM0;
       U128  guest_XMM1;
       U128  guest_XMM2;
       U128  guest_XMM3;
@@ -126,14 +125,14 @@ typedef
       /* Note.  Setting guest_FTOP to be ULong messes up the
          delicately-balanced PutI/GetI optimisation machinery.
          Therefore best to leave it as a UInt. */
-      UInt  guest_FTOP;
+      /* 456 */UInt  guest_FTOP;
       ULong guest_FPREG[8];
-      UChar guest_FPTAG[8];
-      ULong guest_FPROUND;
-      ULong guest_FC3210;
+      /* 528 */ UChar guest_FPTAG[8];
+      /* 536 */ ULong guest_FPROUND;
+      /* 544 */ ULong guest_FC3210;
 
       /* Emulation warnings */
-      UInt   guest_EMWARN;
+      /* 552 */ UInt  guest_EMWARN;
 
       /* Translation-invalidation area description.  Not used on amd64
          (there is no invalidate-icache insn), but needed so as to
@@ -153,8 +152,8 @@ typedef
          replace-style ones. */
       ULong guest_NRADDR;
 
-      /* Padding to make it have an 8-aligned size */
-      /* UInt   padding; */
+      /* Padding to make it have a 16-aligned size */
+      ULong padding;
    }
    VexGuestAMD64State;
 
diff --git a/VEX/pub/libvex_guest_ppc32.h b/VEX/pub/libvex_guest_ppc32.h
index a797d10d0d..2163adf162 100644
--- a/VEX/pub/libvex_guest_ppc32.h
+++ b/VEX/pub/libvex_guest_ppc32.h
@@ -128,38 +128,41 @@ typedef
       /* 376 */ ULong guest_FPR31;
 
       // Vector Registers
-      /* 384 */ U128 guest_VR0 __attribute__ ((aligned (16)));
-      /* 400 */ U128 guest_VR1 __attribute__ ((aligned (16)));
-      /* 416 */ U128 guest_VR2 __attribute__ ((aligned (16)));
-      /* 432 */ U128 guest_VR3 __attribute__ ((aligned (16)));
-      /* 448 */ U128 guest_VR4 __attribute__ ((aligned (16)));
-      /* 464 */ U128 guest_VR5 __attribute__ ((aligned (16)));
-      /* 480 */ U128 guest_VR6 __attribute__ ((aligned (16)));
-      /* 496 */ U128 guest_VR7 __attribute__ ((aligned (16)));
-      /* 512 */ U128 guest_VR8 __attribute__ ((aligned (16)));
-      /* 528 */ U128 guest_VR9 __attribute__ ((aligned (16)));
-      /* 544 */ U128 guest_VR10 __attribute__ ((aligned (16)));
-      /* 560 */ U128 guest_VR11 __attribute__ ((aligned (16)));
-      /* 576 */ U128 guest_VR12 __attribute__ ((aligned (16)));
-      /* 592 */ U128 guest_VR13 __attribute__ ((aligned (16)));
-      /* 608 */ U128 guest_VR14 __attribute__ ((aligned (16)));
-      /* 624 */ U128 guest_VR15 __attribute__ ((aligned (16)));
-      /* 640 */ U128 guest_VR16 __attribute__ ((aligned (16)));
-      /* 656 */ U128 guest_VR17 __attribute__ ((aligned (16)));
-      /* 672 */ U128 guest_VR18 __attribute__ ((aligned (16)));
-      /* 688 */ U128 guest_VR19 __attribute__ ((aligned (16)));
-      /* 704 */ U128 guest_VR20 __attribute__ ((aligned (16)));
-      /* 720 */ U128 guest_VR21 __attribute__ ((aligned (16)));
-      /* 736 */ U128 guest_VR22 __attribute__ ((aligned (16)));
-      /* 752 */ U128 guest_VR23 __attribute__ ((aligned (16)));
-      /* 768 */ U128 guest_VR24 __attribute__ ((aligned (16)));
-      /* 784 */ U128 guest_VR25 __attribute__ ((aligned (16)));
-      /* 800 */ U128 guest_VR26 __attribute__ ((aligned (16)));
-      /* 816 */ U128 guest_VR27 __attribute__ ((aligned (16)));
-      /* 832 */ U128 guest_VR28 __attribute__ ((aligned (16)));
-      /* 848 */ U128 guest_VR29 __attribute__ ((aligned (16)));
-      /* 864 */ U128 guest_VR30 __attribute__ ((aligned (16)));
-      /* 880 */ U128 guest_VR31 __attribute__ ((aligned (16)));
+      // IMPORTANT: the user of libvex must place the guest state so as
+      // to ensure that guest_VR{0..31}, and any shadows thereof, are
+      // 16-aligned.
+      /* 384 */ U128 guest_VR0;
+      /* 400 */ U128 guest_VR1;
+      /* 416 */ U128 guest_VR2;
+      /* 432 */ U128 guest_VR3;
+      /* 448 */ U128 guest_VR4;
+      /* 464 */ U128 guest_VR5;
+      /* 480 */ U128 guest_VR6;
+      /* 496 */ U128 guest_VR7;
+      /* 512 */ U128 guest_VR8;
+      /* 528 */ U128 guest_VR9;
+      /* 544 */ U128 guest_VR10;
+      /* 560 */ U128 guest_VR11;
+      /* 576 */ U128 guest_VR12;
+      /* 592 */ U128 guest_VR13;
+      /* 608 */ U128 guest_VR14;
+      /* 624 */ U128 guest_VR15;
+      /* 640 */ U128 guest_VR16;
+      /* 656 */ U128 guest_VR17;
+      /* 672 */ U128 guest_VR18;
+      /* 688 */ U128 guest_VR19;
+      /* 704 */ U128 guest_VR20;
+      /* 720 */ U128 guest_VR21;
+      /* 736 */ U128 guest_VR22;
+      /* 752 */ U128 guest_VR23;
+      /* 768 */ U128 guest_VR24;
+      /* 784 */ U128 guest_VR25;
+      /* 800 */ U128 guest_VR26;
+      /* 816 */ U128 guest_VR27;
+      /* 832 */ U128 guest_VR28;
+      /* 848 */ U128 guest_VR29;
+      /* 864 */ U128 guest_VR30;
+      /* 880 */ U128 guest_VR31;
 
       /* 896 */ UInt guest_CIA;    // IP (no arch visible register)
       /* 900 */ UInt guest_LR;     // Link Register
diff --git a/VEX/pub/libvex_guest_ppc64.h b/VEX/pub/libvex_guest_ppc64.h
index 68c3846136..d03c01db99 100644
--- a/VEX/pub/libvex_guest_ppc64.h
+++ b/VEX/pub/libvex_guest_ppc64.h
@@ -166,38 +166,41 @@ typedef
       /* 504 */ ULong guest_FPR31;
 
       // Vector Registers
-      /*  512 */ U128 guest_VR0 __attribute__ ((aligned (16)));
-      /*  528 */ U128 guest_VR1 __attribute__ ((aligned (16)));
-      /*  544 */ U128 guest_VR2 __attribute__ ((aligned (16)));
-      /*  560 */ U128 guest_VR3 __attribute__ ((aligned (16)));
-      /*  576 */ U128 guest_VR4 __attribute__ ((aligned (16)));
-      /*  592 */ U128 guest_VR5 __attribute__ ((aligned (16)));
-      /*  608 */ U128 guest_VR6 __attribute__ ((aligned (16)));
-      /*  624 */ U128 guest_VR7 __attribute__ ((aligned (16)));
-      /*  640 */ U128 guest_VR8 __attribute__ ((aligned (16)));
-      /*  656 */ U128 guest_VR9 __attribute__ ((aligned (16)));
-      /*  672 */ U128 guest_VR10 __attribute__ ((aligned (16)));
-      /*  688 */ U128 guest_VR11 __attribute__ ((aligned (16)));
-      /*  704 */ U128 guest_VR12 __attribute__ ((aligned (16)));
-      /*  720 */ U128 guest_VR13 __attribute__ ((aligned (16)));
-      /*  736 */ U128 guest_VR14 __attribute__ ((aligned (16)));
-      /*  752 */ U128 guest_VR15 __attribute__ ((aligned (16)));
-      /*  768 */ U128 guest_VR16 __attribute__ ((aligned (16)));
-      /*  784 */ U128 guest_VR17 __attribute__ ((aligned (16)));
-      /*  800 */ U128 guest_VR18 __attribute__ ((aligned (16)));
-      /*  816 */ U128 guest_VR19 __attribute__ ((aligned (16)));
-      /*  832 */ U128 guest_VR20 __attribute__ ((aligned (16)));
-      /*  848 */ U128 guest_VR21 __attribute__ ((aligned (16)));
-      /*  864 */ U128 guest_VR22 __attribute__ ((aligned (16)));
-      /*  880 */ U128 guest_VR23 __attribute__ ((aligned (16)));
-      /*  896 */ U128 guest_VR24 __attribute__ ((aligned (16)));
-      /*  912 */ U128 guest_VR25 __attribute__ ((aligned (16)));
-      /*  928 */ U128 guest_VR26 __attribute__ ((aligned (16)));
-      /*  944 */ U128 guest_VR27 __attribute__ ((aligned (16)));
-      /*  960 */ U128 guest_VR28 __attribute__ ((aligned (16)));
-      /*  976 */ U128 guest_VR29 __attribute__ ((aligned (16)));
-      /*  992 */ U128 guest_VR30 __attribute__ ((aligned (16)));
-      /* 1008 */ U128 guest_VR31 __attribute__ ((aligned (16)));
+      // IMPORTANT: the user of libvex must place the guest state so as
+      // to ensure that guest_VR{0..31}, and any shadows thereof, are
+      // 16-aligned.
+      /*  512 */ U128 guest_VR0;
+      /*  528 */ U128 guest_VR1;
+      /*  544 */ U128 guest_VR2;
+      /*  560 */ U128 guest_VR3;
+      /*  576 */ U128 guest_VR4;
+      /*  592 */ U128 guest_VR5;
+      /*  608 */ U128 guest_VR6;
+      /*  624 */ U128 guest_VR7;
+      /*  640 */ U128 guest_VR8;
+      /*  656 */ U128 guest_VR9;
+      /*  672 */ U128 guest_VR10;
+      /*  688 */ U128 guest_VR11;
+      /*  704 */ U128 guest_VR12;
+      /*  720 */ U128 guest_VR13;
+      /*  736 */ U128 guest_VR14;
+      /*  752 */ U128 guest_VR15;
+      /*  768 */ U128 guest_VR16;
+      /*  784 */ U128 guest_VR17;
+      /*  800 */ U128 guest_VR18;
+      /*  816 */ U128 guest_VR19;
+      /*  832 */ U128 guest_VR20;
+      /*  848 */ U128 guest_VR21;
+      /*  864 */ U128 guest_VR22;
+      /*  880 */ U128 guest_VR23;
+      /*  896 */ U128 guest_VR24;
+      /*  912 */ U128 guest_VR25;
+      /*  928 */ U128 guest_VR26;
+      /*  944 */ U128 guest_VR27;
+      /*  960 */ U128 guest_VR28;
+      /*  976 */ U128 guest_VR29;
+      /*  992 */ U128 guest_VR30;
+      /* 1008 */ U128 guest_VR31;
 
       /* 1024 */ ULong guest_CIA;    // IP (no arch visible register)
       /* 1032 */ ULong guest_LR;     // Link Register
diff --git a/VEX/pub/libvex_guest_x86.h b/VEX/pub/libvex_guest_x86.h
index af93c24dd5..062482e3f5 100644
--- a/VEX/pub/libvex_guest_x86.h
+++ b/VEX/pub/libvex_guest_x86.h
@@ -220,7 +220,7 @@ typedef
          replace-style ones. */
       UInt guest_NRADDR;
 
-      /* Padding to make it have an 8-aligned size */
+      /* Padding to make it have a 16-aligned size */
       UInt padding;
    }
    VexGuestX86State;
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index e89bef119c..6a8dc76322 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -446,6 +446,7 @@ typedef
       Iop_CmpNEZ8, Iop_CmpNEZ16,  Iop_CmpNEZ32,  Iop_CmpNEZ64,
       Iop_CmpwNEZ32, Iop_CmpwNEZ64, /* all-0s -> all-Os; other -> all-1s */
       Iop_Left8, Iop_Left16, Iop_Left32, Iop_Left64, /*  \x -> x | -x */
+      Iop_Max32U, /* unsigned max */
 
       /* PowerPC-style 3-way integer comparisons.  Without them it is
          difficult to simulate PPC efficiently.
@@ -1411,14 +1412,17 @@ typedef
             that a given chunk of address space, [base .. base+len-1],
             has become undefined.  This is used on amd64-linux and
             some ppc variants to pass stack-redzoning hints to whoever
-            wants to see them.
+            wants to see them.  It also indicates the address of the
+            next (dynamic) instruction that will be executed.  This is
+            to help Memcheck do origin tracking.
 
-            ppIRExpr output: ====== AbiHint(<base>, <len>) ======
-                         eg. ====== AbiHint(t1, 16) ======
+            ppIRExpr output: ====== AbiHint(<base>, <len>, <nia>) ======
+                         eg. ====== AbiHint(t1, 16, t2) ======
          */
          struct {
             IRExpr* base;     /* Start  of undefined chunk */
             Int     len;      /* Length of undefined chunk */
+            IRExpr* nia;      /* Address of next (guest) insn */
          } AbiHint;
 
          /* Write a guest register, at a fixed offset in the guest state.
@@ -1505,7 +1509,7 @@ typedef
 /* Statement constructors. */
 extern IRStmt* IRStmt_NoOp    ( void );
 extern IRStmt* IRStmt_IMark   ( Addr64 addr, Int len );
-extern IRStmt* IRStmt_AbiHint ( IRExpr* base, Int len );
+extern IRStmt* IRStmt_AbiHint ( IRExpr* base, Int len, IRExpr* nia );
 extern IRStmt* IRStmt_Put     ( Int off, IRExpr* data );
 extern IRStmt* IRStmt_PutI    ( IRRegArray* descr, IRExpr* ix, Int bias, 
                                 IRExpr* data );
diff --git a/VEX/test_main.c b/VEX/test_main.c
index c67ea97900..6b9cb72d56 100644
--- a/VEX/test_main.c
+++ b/VEX/test_main.c
@@ -48,9 +48,13 @@ static UChar transbuf[N_TRANSBUF];
 static Bool verbose = True;
 
 /* Forwards */
-#if 0 /* UNUSED */
+#if 1 /* formerly UNUSED; now enabled */
 static IRSB* ac_instrument ( IRSB*, VexGuestLayout*, IRType );
-static IRSB* mc_instrument ( IRSB*, VexGuestLayout*, IRType, IRType );
+static
+IRSB* mc_instrument ( void* closureV,
+                      IRSB* bb_in, VexGuestLayout* layout, 
+                      VexGuestExtents* vge,
+                      IRType gWordTy, IRType hWordTy );
 #endif
 
 static Bool chase_into_not_ok ( void* opaque, Addr64 dst ) { return False; }
@@ -167,7 +171,7 @@ int main ( int argc, char** argv )
       vta.host_bytes      = transbuf;
       vta.host_bytes_size = N_TRANSBUF;
       vta.host_bytes_used = &trans_used;
-#if 1 /* no instrumentation */
+#if 0 /* no instrumentation */
       vta.instrument1     = NULL;
       vta.instrument2     = NULL;
 #endif
@@ -175,7 +179,7 @@ int main ( int argc, char** argv )
       vta.instrument1     = ac_instrument;
       vta.instrument2     = NULL;
 #endif
-#if 0 /* memcheck */
+#if 1 /* memcheck */
       vta.instrument1     = mc_instrument;
       vta.instrument2     = NULL;
 #endif
@@ -379,7 +383,15 @@ IRSB* ac_instrument (IRSB* bb_in, VexGuestLayout* layout, IRType hWordTy )
 //////////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////
 
-#if 0 /* UNUSED */
+#if 1 /* formerly UNUSED; now enabled */
+
+static
+__attribute((noreturn))   /* never returns to its caller */
+void panic ( HChar* s )   /* s: message printed after "panic: " */
+{
+  printf("\npanic: %s\n", s);
+  failure_exit();   /* defined outside this hunk; presumably exits -- confirm */
+}
 
 #define tl_assert(xxx) assert(xxx)
 #define VG_(xxxx) xxxx
@@ -560,7 +572,7 @@ static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
 {
    if (a1->tag == Iex_Const)
       return True;
-   if (a1->tag == Iex_Tmp && a1->Iex.Tmp.tmp < mce->n_originalTmps)
+   if (a1->tag == Iex_RdTmp && a1->Iex.RdTmp.tmp < mce->n_originalTmps)
       return True;
    return False;
 }
@@ -571,7 +583,7 @@ static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
 {
    if (a1->tag == Iex_Const)
       return True;
-   if (a1->tag == Iex_Tmp && a1->Iex.Tmp.tmp >= mce->n_originalTmps)
+   if (a1->tag == Iex_RdTmp && a1->Iex.RdTmp.tmp >= mce->n_originalTmps)
       return True;
    return False;
 }
@@ -580,7 +592,7 @@ static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
    are identically-kinded. */
 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
 {
-   if (a1->tag == Iex_Tmp && a1->tag == Iex_Tmp)
+   if (a1->tag == Iex_RdTmp && a1->tag == Iex_RdTmp)
       return True;
    if (a1->tag == Iex_Const && a1->tag == Iex_Const)
       return True;
@@ -634,7 +646,7 @@ static IRExpr* definedOfType ( IRType ty ) {
 
 /* assign value to tmp */
 #define assign(_bb,_tmp,_expr)   \
-   addStmtToIRSB((_bb), IRStmt_Tmp((_tmp),(_expr)))
+   addStmtToIRSB((_bb), IRStmt_WrTmp((_tmp),(_expr)))
 
 /* add stmt to a bb */
 #define stmt(_bb,_stmt)    \
@@ -648,7 +660,7 @@ static IRExpr* definedOfType ( IRType ty ) {
 #define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
 #define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
 #define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
-#define mkexpr(_tmp)             IRExpr_Tmp((_tmp))
+#define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
 
 /* bind the given expression to a new temporary, and return the
    temporary.  This effectively converts an arbitrary expression into
@@ -1029,10 +1041,10 @@ static void complainIfUndefined ( MCEnv* mce, IRAtom* atom )
       getting a new value. */
    tl_assert(isIRAtom(vatom));
    /* sameKindedAtoms ... */
-   if (vatom->tag == Iex_Tmp) {
-      tl_assert(atom->tag == Iex_Tmp);
-      newShadowTmp(mce, atom->Iex.Tmp.tmp);
-      assign(mce->bb, findShadowTmp(mce, atom->Iex.Tmp.tmp), 
+   if (vatom->tag == Iex_RdTmp) {
+      tl_assert(atom->tag == Iex_RdTmp);
+      newShadowTmp(mce, atom->Iex.RdTmp.tmp);
+      assign(mce->bb, findShadowTmp(mce, atom->Iex.RdTmp.tmp), 
                       definedOfType(ty));
    }
 }
@@ -1110,7 +1122,7 @@ void do_shadow_PUT ( MCEnv* mce,  Int offset,
 */
 static
 void do_shadow_PUTI ( MCEnv* mce, 
-                      IRArray* descr, IRAtom* ix, Int bias, IRAtom* atom )
+                      IRRegArray* descr, IRAtom* ix, Int bias, IRAtom* atom )
 {
    IRAtom* vatom;
    IRType  ty, tyS;
@@ -1132,8 +1144,8 @@ void do_shadow_PUTI ( MCEnv* mce,
    } else {
       /* Do a cloned version of the Put that refers to the shadow
          area. */
-      IRArray* new_descr 
-         = mkIRArray( descr->base + mce->layout->total_sizeB, 
+      IRRegArray* new_descr 
+         = mkIRRegArray( descr->base + mce->layout->total_sizeB, 
                       tyS, descr->nElems);
       stmt( mce->bb, IRStmt_PutI( new_descr, ix, bias, vatom ));
    }
@@ -1163,7 +1175,7 @@ IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
    given GETI (passed in in pieces). 
 */
 static
-IRExpr* shadow_GETI ( MCEnv* mce, IRArray* descr, IRAtom* ix, Int bias )
+IRExpr* shadow_GETI ( MCEnv* mce, IRRegArray* descr, IRAtom* ix, Int bias )
 {
    IRType ty   = descr->elemTy;
    IRType tyS  = shadowType(ty);
@@ -1177,8 +1189,8 @@ IRExpr* shadow_GETI ( MCEnv* mce, IRArray* descr, IRAtom* ix, Int bias )
    } else {
       /* return a cloned version of the Get that refers to the shadow
          area. */
-      IRArray* new_descr 
-         = mkIRArray( descr->base + mce->layout->total_sizeB, 
+      IRRegArray* new_descr 
+         = mkIRRegArray( descr->base + mce->layout->total_sizeB, 
                       tyS, descr->nElems);
       return IRExpr_GetI( new_descr, ix, bias );
    }
@@ -1684,7 +1696,7 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
 
       /* Scalar floating point */
 
-      case Iop_RoundF64:
+         //      case Iop_RoundF64:
       case Iop_F64toI64:
       case Iop_I64toF64:
          /* First arg is I32 (rounding mode), second is F64 or I64
@@ -2068,8 +2080,8 @@ IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
          return shadow_GETI( mce, e->Iex.GetI.descr, 
                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
 
-      case Iex_Tmp:
-         return IRExpr_Tmp( findShadowTmp(mce, e->Iex.Tmp.tmp) );
+      case Iex_RdTmp:
+         return IRExpr_RdTmp( findShadowTmp(mce, e->Iex.RdTmp.tmp) );
 
       case Iex_Const:
          return definedOfType(shadowType(typeOfIRExpr(mce->bb->tyenv, e)));
@@ -2084,9 +2096,9 @@ IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
       case Iex_Unop:
          return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
 
-      case Iex_LDle:
-         return expr2vbits_LDle( mce, e->Iex.LDle.ty, 
-                                      e->Iex.LDle.addr, 0/*addr bias*/ );
+      case Iex_Load:
+         return expr2vbits_LDle( mce, e->Iex.Load.ty, 
+                                      e->Iex.Load.addr, 0/*addr bias*/ );
 
       case Iex_CCall:
          return mkLazyN( mce, e->Iex.CCall.args, 
@@ -2154,7 +2166,7 @@ void do_shadow_STle ( MCEnv* mce,
    IRAtom   *vdataLo64, *vdataHi64;
    IRAtom   *eBias, *eBias0, *eBias8;
    void*    helper = NULL;
-   Char*    hname = NULL;
+   HChar*   hname = NULL;
 
    tyAddr = mce->hWordTy;
    mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
@@ -2447,7 +2459,7 @@ static Bool isBogusAtom ( IRAtom* at )
    ULong n = 0;
    IRConst* con;
    tl_assert(isIRAtom(at));
-   if (at->tag == Iex_Tmp)
+   if (at->tag == Iex_RdTmp)
       return False;
    tl_assert(at->tag == Iex_Const);
    con = at->Iex.Const.con;
@@ -2470,11 +2482,11 @@ static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
    Int     i;
    IRExpr* e;
    switch (st->tag) {
-      case Ist_Tmp:
-         e = st->Ist.Tmp.data;
+      case Ist_WrTmp:
+         e = st->Ist.WrTmp.data;
          switch (e->tag) {
             case Iex_Get:
-            case Iex_Tmp:
+            case Iex_RdTmp:
                return False;
             case Iex_Unop: 
                return isBogusAtom(e->Iex.Unop.arg);
@@ -2485,8 +2497,8 @@ static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
                return isBogusAtom(e->Iex.Mux0X.cond)
                       || isBogusAtom(e->Iex.Mux0X.expr0)
                       || isBogusAtom(e->Iex.Mux0X.exprX);
-            case Iex_LDle: 
-               return isBogusAtom(e->Iex.LDle.addr);
+            case Iex_Load: 
+               return isBogusAtom(e->Iex.Load.addr);
             case Iex_CCall:
                for (i = 0; e->Iex.CCall.args[i]; i++)
                   if (isBogusAtom(e->Iex.CCall.args[i]))
@@ -2497,9 +2509,9 @@ static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
          }
       case Ist_Put:
          return isBogusAtom(st->Ist.Put.data);
-      case Ist_STle:
-         return isBogusAtom(st->Ist.STle.addr) 
-                || isBogusAtom(st->Ist.STle.data);
+      case Ist_Store:
+         return isBogusAtom(st->Ist.Store.addr) 
+                || isBogusAtom(st->Ist.Store.data);
       case Ist_Exit:
          return isBogusAtom(st->Ist.Exit.guard);
       default: 
@@ -2509,7 +2521,9 @@ static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
    }
 }
 
-IRSB* mc_instrument ( IRSB* bb_in, VexGuestLayout* layout, 
+IRSB* mc_instrument ( void* closureV,
+                      IRSB* bb_in, VexGuestLayout* layout, 
+                      VexGuestExtents* vge,
                       IRType gWordTy, IRType hWordTy )
 {
    Bool verboze = False; //True; 
@@ -2522,8 +2536,8 @@ IRSB* mc_instrument ( IRSB* bb_in, VexGuestLayout* layout,
 
    /* Set up BB */
    IRSB* bb     = emptyIRSB();
-   bb->tyenv    = dopyIRTypeEnv(bb_in->tyenv);
-   bb->next     = dopyIRExpr(bb_in->next);
+   bb->tyenv    = deepCopyIRTypeEnv(bb_in->tyenv);
+   bb->next     = deepCopyIRExpr(bb_in->next);
    bb->jumpkind = bb_in->jumpkind;
 
    /* Set up the running environment.  Only .bb is modified as we go
@@ -2563,9 +2577,9 @@ IRSB* mc_instrument ( IRSB* bb_in, VexGuestLayout* layout,
 
       switch (st->tag) {
 
-         case Ist_Tmp:
-            assign( bb, findShadowTmp(&mce, st->Ist.Tmp.tmp), 
-                        expr2vbits( &mce, st->Ist.Tmp.data) );
+         case Ist_WrTmp:
+            assign( bb, findShadowTmp(&mce, st->Ist.WrTmp.tmp), 
+                        expr2vbits( &mce, st->Ist.WrTmp.data) );
             break;
 
          case Ist_Put:
@@ -2583,9 +2597,9 @@ IRSB* mc_instrument ( IRSB* bb_in, VexGuestLayout* layout,
                             st->Ist.PutI.data );
             break;
 
-         case Ist_STle:
-            do_shadow_STle( &mce, st->Ist.STle.addr, 0/* addr bias */,
-                                  st->Ist.STle.data,
+         case Ist_Store:
+            do_shadow_STle( &mce, st->Ist.Store.addr, 0/* addr bias */,
+                                  st->Ist.Store.data,
                                   NULL /* shadow data */ );
             break;
 
-- 
2.47.2