From: Julian Seward
Date: Thu, 18 Apr 2013 11:50:58 +0000 (+0000)
Subject: Improved front end translations for Neon V{LD,ST}{1,2} instructions,
X-Git-Tag: svn/VALGRIND_3_9_0^2~87
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=16884ec3299c3107f96e7bbde6f9baa0d1f71ab8;p=thirdparty%2Fvalgrind.git

Improved front end translations for Neon V{LD,ST}{1,2} instructions
that do deinterleaving/interleaving via IROps and so generate far
fewer memory references.  As a side effect, fix incorrect ARM back end
implementation of many of the SIMD lane interleaving/deinterleaving
and concatenation IROps.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2708
---

diff --git a/VEX/priv/guest_arm_toIR.c b/VEX/priv/guest_arm_toIR.c
index ea454becc3..5df32b9270 100644
--- a/VEX/priv/guest_arm_toIR.c
+++ b/VEX/priv/guest_arm_toIR.c
@@ -6596,9 +6596,9 @@ Bool dis_neon_data_2reg_misc ( UInt theInstr, IRTemp condT )
    UInt dreg = get_neon_d_regno(theInstr);
    UInt mreg = get_neon_m_regno(theInstr);
    UInt F = (theInstr >> 10) & 1;
-   IRTemp arg_d;
-   IRTemp arg_m;
-   IRTemp res;
+   IRTemp arg_d = IRTemp_INVALID;
+   IRTemp arg_m = IRTemp_INVALID;
+   IRTemp res = IRTemp_INVALID;
    switch (A) {
       case 0:
          if (Q) {
@@ -7158,36 +7158,36 @@ Bool dis_neon_data_2reg_misc ( UInt theInstr, IRTemp condT )
             return True;
          } else if ((B >> 1) == 1) {
             /* VTRN */
-            IROp op_lo, op_hi;
-            IRTemp res1, res2;
+            IROp op_odd = Iop_INVALID, op_even = Iop_INVALID;
+            IRTemp old_m, old_d, new_d, new_m;
             if (Q) {
-               arg_m = newTemp(Ity_V128);
-               arg_d = newTemp(Ity_V128);
-               res1 = newTemp(Ity_V128);
-               res2 = newTemp(Ity_V128);
-               assign(arg_m, getQReg(mreg));
-               assign(arg_d, getQReg(dreg));
+               old_m = newTemp(Ity_V128);
+               old_d = newTemp(Ity_V128);
+               new_m = newTemp(Ity_V128);
+               new_d = newTemp(Ity_V128);
+               assign(old_m, getQReg(mreg));
+               assign(old_d, getQReg(dreg));
             } else {
-               res1 = newTemp(Ity_I64);
-               res2 = newTemp(Ity_I64);
-               arg_m = newTemp(Ity_I64);
-               arg_d = newTemp(Ity_I64);
-               assign(arg_m, getDRegI64(mreg));
-               assign(arg_d, getDRegI64(dreg));
+               old_m = newTemp(Ity_I64);
+               old_d = newTemp(Ity_I64);
+               new_m = newTemp(Ity_I64);
+               new_d = newTemp(Ity_I64);
+               assign(old_m, getDRegI64(mreg));
+               assign(old_d, getDRegI64(dreg));
             }
             if (Q) {
                switch (size) {
                   case 0:
-                     op_lo = Iop_InterleaveOddLanes8x16;
-                     op_hi = Iop_InterleaveEvenLanes8x16;
+                     op_odd = Iop_InterleaveOddLanes8x16;
+                     op_even = Iop_InterleaveEvenLanes8x16;
                      break;
                   case 1:
-                     op_lo = Iop_InterleaveOddLanes16x8;
-                     op_hi = Iop_InterleaveEvenLanes16x8;
+                     op_odd = Iop_InterleaveOddLanes16x8;
+                     op_even = Iop_InterleaveEvenLanes16x8;
                      break;
                   case 2:
-                     op_lo = Iop_InterleaveOddLanes32x4;
-                     op_hi = Iop_InterleaveEvenLanes32x4;
+                     op_odd = Iop_InterleaveOddLanes32x4;
+                     op_even = Iop_InterleaveEvenLanes32x4;
                      break;
                   case 3:
                      return False;
@@ -7197,16 +7197,16 @@ Bool dis_neon_data_2reg_misc ( UInt theInstr, IRTemp condT )
             } else {
                switch (size) {
                   case 0:
-                     op_lo = Iop_InterleaveOddLanes8x8;
-                     op_hi = Iop_InterleaveEvenLanes8x8;
+                     op_odd = Iop_InterleaveOddLanes8x8;
+                     op_even = Iop_InterleaveEvenLanes8x8;
                      break;
                   case 1:
-                     op_lo = Iop_InterleaveOddLanes16x4;
-                     op_hi = Iop_InterleaveEvenLanes16x4;
+                     op_odd = Iop_InterleaveOddLanes16x4;
+                     op_even = Iop_InterleaveEvenLanes16x4;
                      break;
                   case 2:
-                     op_lo = Iop_InterleaveLO32x2;
-                     op_hi = Iop_InterleaveHI32x2;
+                     op_odd = Iop_InterleaveHI32x2;
+                     op_even = Iop_InterleaveLO32x2;
                      break;
                   case 3:
                      return False;
@@ -7214,65 +7214,65 @@ Bool dis_neon_data_2reg_misc ( UInt theInstr, IRTemp condT )
                      vassert(0);
                }
             }
-            assign(res1, binop(op_lo, mkexpr(arg_m),
mkexpr(arg_d))); - assign(res2, binop(op_hi, mkexpr(arg_m), mkexpr(arg_d))); + assign(new_d, binop(op_even, mkexpr(old_m), mkexpr(old_d))); + assign(new_m, binop(op_odd, mkexpr(old_m), mkexpr(old_d))); if (Q) { - putQReg(dreg, mkexpr(res1), condT); - putQReg(mreg, mkexpr(res2), condT); + putQReg(dreg, mkexpr(new_d), condT); + putQReg(mreg, mkexpr(new_m), condT); } else { - putDRegI64(dreg, mkexpr(res1), condT); - putDRegI64(mreg, mkexpr(res2), condT); + putDRegI64(dreg, mkexpr(new_d), condT); + putDRegI64(mreg, mkexpr(new_m), condT); } DIP("vtrn.%u %c%u, %c%u\n", 8 << size, Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg); return True; } else if ((B >> 1) == 2) { /* VUZP */ - IROp op_lo, op_hi; - IRTemp res1, res2; + IROp op_even, op_odd; + IRTemp old_m, old_d, new_m, new_d; if (!Q && size == 2) return False; if (Q) { - arg_m = newTemp(Ity_V128); - arg_d = newTemp(Ity_V128); - res1 = newTemp(Ity_V128); - res2 = newTemp(Ity_V128); - assign(arg_m, getQReg(mreg)); - assign(arg_d, getQReg(dreg)); + old_m = newTemp(Ity_V128); + old_d = newTemp(Ity_V128); + new_m = newTemp(Ity_V128); + new_d = newTemp(Ity_V128); + assign(old_m, getQReg(mreg)); + assign(old_d, getQReg(dreg)); } else { - res1 = newTemp(Ity_I64); - res2 = newTemp(Ity_I64); - arg_m = newTemp(Ity_I64); - arg_d = newTemp(Ity_I64); - assign(arg_m, getDRegI64(mreg)); - assign(arg_d, getDRegI64(dreg)); + old_m = newTemp(Ity_I64); + old_d = newTemp(Ity_I64); + new_m = newTemp(Ity_I64); + new_d = newTemp(Ity_I64); + assign(old_m, getDRegI64(mreg)); + assign(old_d, getDRegI64(dreg)); } switch (size) { case 0: - op_lo = Q ? Iop_CatOddLanes8x16 : Iop_CatOddLanes8x8; - op_hi = Q ? Iop_CatEvenLanes8x16 : Iop_CatEvenLanes8x8; + op_odd = Q ? Iop_CatOddLanes8x16 : Iop_CatOddLanes8x8; + op_even = Q ? Iop_CatEvenLanes8x16 : Iop_CatEvenLanes8x8; break; case 1: - op_lo = Q ? Iop_CatOddLanes16x8 : Iop_CatOddLanes16x4; - op_hi = Q ? Iop_CatEvenLanes16x8 : Iop_CatEvenLanes16x4; + op_odd = Q ? Iop_CatOddLanes16x8 : Iop_CatOddLanes16x4; + op_even = Q ? Iop_CatEvenLanes16x8 : Iop_CatEvenLanes16x4; break; case 2: - op_lo = Iop_CatOddLanes32x4; - op_hi = Iop_CatEvenLanes32x4; + op_odd = Iop_CatOddLanes32x4; + op_even = Iop_CatEvenLanes32x4; break; case 3: return False; default: vassert(0); } - assign(res1, binop(op_lo, mkexpr(arg_m), mkexpr(arg_d))); - assign(res2, binop(op_hi, mkexpr(arg_m), mkexpr(arg_d))); + assign(new_d, binop(op_even, mkexpr(old_m), mkexpr(old_d))); + assign(new_m, binop(op_odd, mkexpr(old_m), mkexpr(old_d))); if (Q) { - putQReg(dreg, mkexpr(res1), condT); - putQReg(mreg, mkexpr(res2), condT); + putQReg(dreg, mkexpr(new_d), condT); + putQReg(mreg, mkexpr(new_m), condT); } else { - putDRegI64(dreg, mkexpr(res1), condT); - putDRegI64(mreg, mkexpr(res2), condT); + putDRegI64(dreg, mkexpr(new_d), condT); + putDRegI64(mreg, mkexpr(new_m), condT); } DIP("vuzp.%u %c%u, %c%u\n", 8 << size, Q ? 'q' : 'd', dreg, Q ? 
'q' : 'd', mreg); @@ -7280,50 +7280,50 @@ Bool dis_neon_data_2reg_misc ( UInt theInstr, IRTemp condT ) } else if ((B >> 1) == 3) { /* VZIP */ IROp op_lo, op_hi; - IRTemp res1, res2; + IRTemp old_m, old_d, new_m, new_d; if (!Q && size == 2) return False; if (Q) { - arg_m = newTemp(Ity_V128); - arg_d = newTemp(Ity_V128); - res1 = newTemp(Ity_V128); - res2 = newTemp(Ity_V128); - assign(arg_m, getQReg(mreg)); - assign(arg_d, getQReg(dreg)); + old_m = newTemp(Ity_V128); + old_d = newTemp(Ity_V128); + new_m = newTemp(Ity_V128); + new_d = newTemp(Ity_V128); + assign(old_m, getQReg(mreg)); + assign(old_d, getQReg(dreg)); } else { - res1 = newTemp(Ity_I64); - res2 = newTemp(Ity_I64); - arg_m = newTemp(Ity_I64); - arg_d = newTemp(Ity_I64); - assign(arg_m, getDRegI64(mreg)); - assign(arg_d, getDRegI64(dreg)); + old_m = newTemp(Ity_I64); + old_d = newTemp(Ity_I64); + new_m = newTemp(Ity_I64); + new_d = newTemp(Ity_I64); + assign(old_m, getDRegI64(mreg)); + assign(old_d, getDRegI64(dreg)); } switch (size) { case 0: - op_lo = Q ? Iop_InterleaveHI8x16 : Iop_InterleaveHI8x8; - op_hi = Q ? Iop_InterleaveLO8x16 : Iop_InterleaveLO8x8; + op_hi = Q ? Iop_InterleaveHI8x16 : Iop_InterleaveHI8x8; + op_lo = Q ? Iop_InterleaveLO8x16 : Iop_InterleaveLO8x8; break; case 1: - op_lo = Q ? Iop_InterleaveHI16x8 : Iop_InterleaveHI16x4; - op_hi = Q ? Iop_InterleaveLO16x8 : Iop_InterleaveLO16x4; + op_hi = Q ? Iop_InterleaveHI16x8 : Iop_InterleaveHI16x4; + op_lo = Q ? Iop_InterleaveLO16x8 : Iop_InterleaveLO16x4; break; case 2: - op_lo = Iop_InterleaveHI32x4; - op_hi = Iop_InterleaveLO32x4; + op_hi = Iop_InterleaveHI32x4; + op_lo = Iop_InterleaveLO32x4; break; case 3: return False; default: vassert(0); } - assign(res1, binop(op_lo, mkexpr(arg_m), mkexpr(arg_d))); - assign(res2, binop(op_hi, mkexpr(arg_m), mkexpr(arg_d))); + assign(new_d, binop(op_lo, mkexpr(old_m), mkexpr(old_d))); + assign(new_m, binop(op_hi, mkexpr(old_m), mkexpr(old_d))); if (Q) { - putQReg(dreg, mkexpr(res1), condT); - putQReg(mreg, mkexpr(res2), condT); + putQReg(dreg, mkexpr(new_d), condT); + putQReg(mreg, mkexpr(new_m), condT); } else { - putDRegI64(dreg, mkexpr(res1), condT); - putDRegI64(mreg, mkexpr(res2), condT); + putDRegI64(dreg, mkexpr(new_d), condT); + putDRegI64(mreg, mkexpr(new_m), condT); } DIP("vzip.%u %c%u, %c%u\n", 8 << size, Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg); @@ -7900,6 +7900,78 @@ void mk_neon_elem_store_from_one_lane( UInt rD, UInt inc, UInt index, } } +/* Generate 2x64 -> 2x64 deinterleave code, for VLD2. Caller must + make *u0 and *u1 be valid IRTemps before the call. */ +static void math_DEINTERLEAVE_2 (/*OUT*/IRTemp* u0, /*OUT*/IRTemp* u1, + IRTemp i0, IRTemp i1, Int laneszB) +{ + /* The following assumes that the guest is little endian, and hence + that the memory-side (interleaved) data is stored + little-endianly. */ + vassert(u0 && u1); + /* This is pretty easy, since we have primitives directly to + hand. 
*/
+   if (laneszB == 4) {
+      // memLE(128 bits) == A0 B0 A1 B1
+      // i0 == B0 A0, i1 == B1 A1
+      // u0 == A1 A0, u1 == B1 B0
+      assign(*u0, binop(Iop_InterleaveLO32x2, mkexpr(i1), mkexpr(i0)));
+      assign(*u1, binop(Iop_InterleaveHI32x2, mkexpr(i1), mkexpr(i0)));
+   } else if (laneszB == 2) {
+      // memLE(128 bits) == A0 B0 A1 B1 A2 B2 A3 B3
+      // i0 == B1 A1 B0 A0, i1 == B3 A3 B2 A2
+      // u0 == A3 A2 A1 A0, u1 == B3 B2 B1 B0
+      assign(*u0, binop(Iop_CatEvenLanes16x4, mkexpr(i1), mkexpr(i0)));
+      assign(*u1, binop(Iop_CatOddLanes16x4, mkexpr(i1), mkexpr(i0)));
+   } else if (laneszB == 1) {
+      // memLE(128 bits) == A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+      // i0 == B3 A3 B2 A2 B1 A1 B0 A0, i1 == B7 A7 B6 A6 B5 A5 B4 A4
+      // u0 == A7 A6 A5 A4 A3 A2 A1 A0, u1 == B7 B6 B5 B4 B3 B2 B1 B0
+      assign(*u0, binop(Iop_CatEvenLanes8x8, mkexpr(i1), mkexpr(i0)));
+      assign(*u1, binop(Iop_CatOddLanes8x8, mkexpr(i1), mkexpr(i0)));
+   } else {
+      // Can never happen, since VLD2 only has valid lane widths of 32,
+      // 16 or 8 bits.
+      vpanic("math_DEINTERLEAVE_2");
+   }
+}
+
+/* Generate 2x64 -> 2x64 interleave code, for VST2.  Caller must make
+   *i0 and *i1 be valid IRTemps before the call. */
+static void math_INTERLEAVE_2 (/*OUT*/IRTemp* i0, /*OUT*/IRTemp* i1,
+                               IRTemp u0, IRTemp u1, Int laneszB)
+{
+   /* The following assumes that the guest is little endian, and hence
+      that the memory-side (interleaved) data is stored
+      little-endianly. */
+   vassert(i0 && i1);
+   /* This is pretty easy, since we have primitives directly to
+      hand. */
+   if (laneszB == 4) {
+      // memLE(128 bits) == A0 B0 A1 B1
+      // i0 == B0 A0, i1 == B1 A1
+      // u0 == A1 A0, u1 == B1 B0
+      assign(*i0, binop(Iop_InterleaveLO32x2, mkexpr(u1), mkexpr(u0)));
+      assign(*i1, binop(Iop_InterleaveHI32x2, mkexpr(u1), mkexpr(u0)));
+   } else if (laneszB == 2) {
+      // memLE(128 bits) == A0 B0 A1 B1 A2 B2 A3 B3
+      // i0 == B1 A1 B0 A0, i1 == B3 A3 B2 A2
+      // u0 == A3 A2 A1 A0, u1 == B3 B2 B1 B0
+      assign(*i0, binop(Iop_InterleaveLO16x4, mkexpr(u1), mkexpr(u0)));
+      assign(*i1, binop(Iop_InterleaveHI16x4, mkexpr(u1), mkexpr(u0)));
+   } else if (laneszB == 1) {
+      // memLE(128 bits) == A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+      // i0 == B3 A3 B2 A2 B1 A1 B0 A0, i1 == B7 A7 B6 A6 B5 A5 B4 A4
+      // u0 == A7 A6 A5 A4 A3 A2 A1 A0, u1 == B7 B6 B5 B4 B3 B2 B1 B0
+      assign(*i0, binop(Iop_InterleaveLO8x8, mkexpr(u1), mkexpr(u0)));
+      assign(*i1, binop(Iop_InterleaveHI8x8, mkexpr(u1), mkexpr(u0)));
+   } else {
+      // Can never happen, since VST2 only has valid lane widths of 32,
+      // 16 or 8 bits.
+      vpanic("math_INTERLEAVE_2");
+   }
+}
+
 /* A7.7 Advanced SIMD element or structure load/store instructions */
 
 static Bool dis_neon_load_or_store ( UInt theInstr,
@@ -8100,14 +8172,20 @@ Bool dis_neon_load_or_store ( UInt theInstr,
    } else {
       /* ------------ Case (3) ------------
          VSTn / VLDn (multiple n-element structures) */
-      IRTemp tmp;
-      UInt r, elems;
-      if (fB == BITS4(0,0,1,0) || fB == BITS4(0,1,1,0)
-          || fB == BITS4(0,1,1,1) || fB == BITS4(1,0,1,0)) {
-         N = 0;
-      } else if (fB == BITS4(0,0,1,1) || fB == BITS4(1,0,0,0)
-                 || fB == BITS4(1,0,0,1)) {
-         N = 1;
+      UInt r, lanes;
+      if (fB == BITS4(0,0,1,0)       // Dd, Dd+1, Dd+2, Dd+3  inc = 1  regs = 4
+          || fB == BITS4(0,1,1,0)    // Dd, Dd+1, Dd+2        inc = 1  regs = 3
+          || fB == BITS4(0,1,1,1)    // Dd                    inc = 2  regs = 1
+          || fB == BITS4(1,0,1,0)) { // Dd, Dd+1              inc = 1  regs = 2
+         N = 0; // VLD1/VST1.  'inc' does not appear to have any
+                // meaning for the VLD1/VST1 cases.  'regs' is the number of
+                // registers involved.
+ } + else + if (fB == BITS4(0,0,1,1) // Dd, Dd+1, Dd+2, Dd+3 inc=2 regs = 2 + || fB == BITS4(1,0,0,0) // Dd, Dd+1 inc=1 regs = 1 + || fB == BITS4(1,0,0,1)) { // Dd, Dd+2 inc=2 regs = 1 + N = 1; // VLD2/VST2. 'regs' is the number of register-pairs involved } else if (fB == BITS4(0,1,0,0) || fB == BITS4(0,1,0,1)) { N = 2; } else if (fB == BITS4(0,0,0,0) || fB == BITS4(0,0,0,1)) { @@ -8134,7 +8212,7 @@ Bool dis_neon_load_or_store ( UInt theInstr, if (size == 3) return False; - elems = 8 / (1 << size); + lanes = 8 / (1 << size); // go uncond if (condT != IRTemp_INVALID) @@ -8144,38 +8222,142 @@ Bool dis_neon_load_or_store ( UInt theInstr, IRTemp addr = newTemp(Ity_I32); assign(addr, mkexpr(initialRn)); - for (r = 0; r < regs; r++) { - for (i = 0; i < elems; i++) { + if (N == 0 /* No interleaving -- VLD1/VST1 */) { + vassert(regs == 1 || regs == 2 || regs == 3 || regs == 4); + /* inc has no relevance here */ + for (r = 0; r < regs; r++) { if (bL) - mk_neon_elem_load_to_one_lane(rD + r, inc, i, N, size, addr); + putDRegI64(rD+r, loadLE(Ity_I64, mkexpr(addr)), IRTemp_INVALID); else - mk_neon_elem_store_from_one_lane(rD + r, inc, i, N, size, addr); - tmp = newTemp(Ity_I32); - assign(tmp, binop(Iop_Add32, mkexpr(addr), - mkU32((1 << size) * (N + 1)))); + storeLE(mkexpr(addr), getDRegI64(rD+r)); + IRTemp tmp = newTemp(Ity_I32); + assign(tmp, binop(Iop_Add32, mkexpr(addr), mkU32(8))); + addr = tmp; + } + } + else if (N == 1 /* 2-interleaving -- VLD2/VST2 */) { + vassert( (regs == 1 && (inc == 1 || inc == 2)) + || (regs == 2 && inc == 2) ); + // Make 'nregs' be the number of registers and 'regstep' + // equal the actual register-step. The ARM encoding, using 'regs' + // and 'inc', is bizarre. After this, we have: + // Dd, Dd+1 regs = 1, inc = 1, nregs = 2, regstep = 1 + // Dd, Dd+2 regs = 1, inc = 2, nregs = 2, regstep = 2 + // Dd, Dd+1, Dd+2, Dd+3 regs = 2, inc = 2, nregs = 4, regstep = 1 + UInt nregs = 2; + UInt regstep = 1; + if (regs == 1 && inc == 1) { + /* nothing */ + } else if (regs == 1 && inc == 2) { + regstep = 2; + } else if (regs == 2 && inc == 2) { + nregs = 4; + } else { + vassert(0); + } + // 'a' is address, + // 'di' is interleaved data, 'du' is uninterleaved data + if (nregs == 2) { + IRExpr* a0 = binop(Iop_Add32, mkexpr(addr), mkU32(0)); + IRExpr* a1 = binop(Iop_Add32, mkexpr(addr), mkU32(8)); + IRTemp di0 = newTemp(Ity_I64); + IRTemp di1 = newTemp(Ity_I64); + IRTemp du0 = newTemp(Ity_I64); + IRTemp du1 = newTemp(Ity_I64); + if (bL) { + assign(di0, loadLE(Ity_I64, a0)); + assign(di1, loadLE(Ity_I64, a1)); + math_DEINTERLEAVE_2(&du0, &du1, di0, di1, 1 << size); + putDRegI64(rD + 0 * regstep, mkexpr(du0), IRTemp_INVALID); + putDRegI64(rD + 1 * regstep, mkexpr(du1), IRTemp_INVALID); + } else { + assign(du0, getDRegI64(rD + 0 * regstep)); + assign(du1, getDRegI64(rD + 1 * regstep)); + math_INTERLEAVE_2(&di0, &di1, du0, du1, 1 << size); + storeLE(a0, mkexpr(di0)); + storeLE(a1, mkexpr(di1)); + } + IRTemp tmp = newTemp(Ity_I32); + assign(tmp, binop(Iop_Add32, mkexpr(addr), mkU32(16))); + addr = tmp; + } else { + vassert(nregs == 4); + vassert(regstep == 1); + IRExpr* a0 = binop(Iop_Add32, mkexpr(addr), mkU32(0)); + IRExpr* a1 = binop(Iop_Add32, mkexpr(addr), mkU32(8)); + IRExpr* a2 = binop(Iop_Add32, mkexpr(addr), mkU32(16)); + IRExpr* a3 = binop(Iop_Add32, mkexpr(addr), mkU32(24)); + IRTemp di0 = newTemp(Ity_I64); + IRTemp di1 = newTemp(Ity_I64); + IRTemp di2 = newTemp(Ity_I64); + IRTemp di3 = newTemp(Ity_I64); + IRTemp du0 = newTemp(Ity_I64); + IRTemp du1 = newTemp(Ity_I64); + IRTemp 
du2 = newTemp(Ity_I64); + IRTemp du3 = newTemp(Ity_I64); + if (bL) { + assign(di0, loadLE(Ity_I64, a0)); + assign(di1, loadLE(Ity_I64, a1)); + assign(di2, loadLE(Ity_I64, a2)); + assign(di3, loadLE(Ity_I64, a3)); + math_DEINTERLEAVE_2(&du0, &du2, di0, di1, 1 << size); + math_DEINTERLEAVE_2(&du1, &du3, di2, di3, 1 << size); + putDRegI64(rD + 0 * regstep, mkexpr(du0), IRTemp_INVALID); + putDRegI64(rD + 1 * regstep, mkexpr(du1), IRTemp_INVALID); + putDRegI64(rD + 2 * regstep, mkexpr(du2), IRTemp_INVALID); + putDRegI64(rD + 3 * regstep, mkexpr(du3), IRTemp_INVALID); + } else { + assign(du0, getDRegI64(rD + 0 * regstep)); + assign(du1, getDRegI64(rD + 1 * regstep)); + assign(du2, getDRegI64(rD + 2 * regstep)); + assign(du3, getDRegI64(rD + 3 * regstep)); + math_INTERLEAVE_2(&di0, &di1, du0, du2, 1 << size); + math_INTERLEAVE_2(&di2, &di3, du1, du3, 1 << size); + storeLE(a0, mkexpr(di0)); + storeLE(a1, mkexpr(di1)); + storeLE(a2, mkexpr(di2)); + storeLE(a3, mkexpr(di3)); + } + + IRTemp tmp = newTemp(Ity_I32); + assign(tmp, binop(Iop_Add32, mkexpr(addr), mkU32(32))); addr = tmp; } + + } + else { + /* Fallback case */ + for (r = 0; r < regs; r++) { + for (i = 0; i < lanes; i++) { + if (bL) + mk_neon_elem_load_to_one_lane(rD + r, inc, i, N, size, addr); + else + mk_neon_elem_store_from_one_lane(rD + r, + inc, i, N, size, addr); + IRTemp tmp = newTemp(Ity_I32); + assign(tmp, binop(Iop_Add32, mkexpr(addr), + mkU32((1 << size) * (N + 1)))); + addr = tmp; + } + } } + /* Writeback */ if (rM != 15) { + IRExpr* e; if (rM == 13) { - IRExpr* e = binop(Iop_Add32, - mkexpr(initialRn), - mkU32(8 * (N + 1) * regs)); - if (isT) - putIRegT(rN, e, IRTemp_INVALID); - else - putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring); + e = binop(Iop_Add32, mkexpr(initialRn), + mkU32(8 * (N + 1) * regs)); } else { - IRExpr* e = binop(Iop_Add32, - mkexpr(initialRn), - mkexpr(initialRm)); - if (isT) - putIRegT(rN, e, IRTemp_INVALID); - else - putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring); + e = binop(Iop_Add32, mkexpr(initialRn), + mkexpr(initialRm)); } + if (isT) + putIRegT(rN, e, IRTemp_INVALID); + else + putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring); } + DIP("v%s%u.%u {", bL ? "ld" : "st", N + 1, 8 << INSN(7,6)); if ((inc == 1 && regs * (N + 1) > 1) || (inc == 2 && regs > 1 && N > 0)) { @@ -19319,6 +19501,254 @@ int main ( void ) } */ +/* Spare code for doing reference implementations of various 64-bit + SIMD interleaves/deinterleaves/concatenation ops. */ +/* +// Split a 64 bit value into 4 16 bit ones, in 32-bit IRTemps with +// the top halves guaranteed to be zero. +static void break64to16s ( IRTemp* out3, IRTemp* out2, IRTemp* out1, + IRTemp* out0, IRTemp v64 ) +{ + if (out3) *out3 = newTemp(Ity_I32); + if (out2) *out2 = newTemp(Ity_I32); + if (out1) *out1 = newTemp(Ity_I32); + if (out0) *out0 = newTemp(Ity_I32); + IRTemp hi32 = newTemp(Ity_I32); + IRTemp lo32 = newTemp(Ity_I32); + assign(hi32, unop(Iop_64HIto32, mkexpr(v64)) ); + assign(lo32, unop(Iop_64to32, mkexpr(v64)) ); + if (out3) assign(*out3, binop(Iop_Shr32, mkexpr(hi32), mkU8(16))); + if (out2) assign(*out2, binop(Iop_And32, mkexpr(hi32), mkU32(0xFFFF))); + if (out1) assign(*out1, binop(Iop_Shr32, mkexpr(lo32), mkU8(16))); + if (out0) assign(*out0, binop(Iop_And32, mkexpr(lo32), mkU32(0xFFFF))); +} + +// Make a 64 bit value from 4 16 bit ones, each of which is in a 32 bit +// IRTemp. 
+static IRTemp mk64from16s ( IRTemp in3, IRTemp in2, IRTemp in1, IRTemp in0 ) +{ + IRTemp hi32 = newTemp(Ity_I32); + IRTemp lo32 = newTemp(Ity_I32); + assign(hi32, + binop(Iop_Or32, + binop(Iop_Shl32, mkexpr(in3), mkU8(16)), + binop(Iop_And32, mkexpr(in2), mkU32(0xFFFF)))); + assign(lo32, + binop(Iop_Or32, + binop(Iop_Shl32, mkexpr(in1), mkU8(16)), + binop(Iop_And32, mkexpr(in0), mkU32(0xFFFF)))); + IRTemp res = newTemp(Ity_I64); + assign(res, binop(Iop_32HLto64, mkexpr(hi32), mkexpr(lo32))); + return res; +} + +static IRExpr* mk_InterleaveLO16x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a1 b1 a0 b0 + IRTemp a1, a0, b1, b0; + break64to16s(NULL, NULL, &a1, &a0, a3210); + break64to16s(NULL, NULL, &b1, &b0, b3210); + return mkexpr(mk64from16s(a1, b1, a0, b0)); +} + +static IRExpr* mk_InterleaveHI16x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a3 b3 a2 b2 + IRTemp a3, a2, b3, b2; + break64to16s(&a3, &a2, NULL, NULL, a3210); + break64to16s(&b3, &b2, NULL, NULL, b3210); + return mkexpr(mk64from16s(a3, b3, a2, b2)); +} + +static IRExpr* mk_CatEvenLanes16x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a2 a0 b2 b0 + IRTemp a2, a0, b2, b0; + break64to16s(NULL, &a2, NULL, &a0, a3210); + break64to16s(NULL, &b2, NULL, &b0, b3210); + return mkexpr(mk64from16s(a2, a0, b2, b0)); +} + +static IRExpr* mk_CatOddLanes16x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a3 a1 b3 b1 + IRTemp a3, a1, b3, b1; + break64to16s(&a3, NULL, &a1, NULL, a3210); + break64to16s(&b3, NULL, &b1, NULL, b3210); + return mkexpr(mk64from16s(a3, a1, b3, b1)); +} + +static IRExpr* mk_InterleaveOddLanes16x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a3 b3 a1 b1 + IRTemp a3, b3, a1, b1; + break64to16s(&a3, NULL, &a1, NULL, a3210); + break64to16s(&b3, NULL, &b1, NULL, b3210); + return mkexpr(mk64from16s(a3, b3, a1, b1)); +} + +static IRExpr* mk_InterleaveEvenLanes16x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a2 b2 a0 b0 + IRTemp a2, b2, a0, b0; + break64to16s(NULL, &a2, NULL, &a0, a3210); + break64to16s(NULL, &b2, NULL, &b0, b3210); + return mkexpr(mk64from16s(a2, b2, a0, b0)); +} + +static void break64to8s ( IRTemp* out7, IRTemp* out6, IRTemp* out5, + IRTemp* out4, IRTemp* out3, IRTemp* out2, + IRTemp* out1,IRTemp* out0, IRTemp v64 ) +{ + if (out7) *out7 = newTemp(Ity_I32); + if (out6) *out6 = newTemp(Ity_I32); + if (out5) *out5 = newTemp(Ity_I32); + if (out4) *out4 = newTemp(Ity_I32); + if (out3) *out3 = newTemp(Ity_I32); + if (out2) *out2 = newTemp(Ity_I32); + if (out1) *out1 = newTemp(Ity_I32); + if (out0) *out0 = newTemp(Ity_I32); + IRTemp hi32 = newTemp(Ity_I32); + IRTemp lo32 = newTemp(Ity_I32); + assign(hi32, unop(Iop_64HIto32, mkexpr(v64)) ); + assign(lo32, unop(Iop_64to32, mkexpr(v64)) ); + if (out7) + assign(*out7, binop(Iop_And32, + binop(Iop_Shr32, mkexpr(hi32), mkU8(24)), + mkU32(0xFF))); + if (out6) + assign(*out6, binop(Iop_And32, + binop(Iop_Shr32, mkexpr(hi32), mkU8(16)), + mkU32(0xFF))); + if (out5) + assign(*out5, binop(Iop_And32, + binop(Iop_Shr32, mkexpr(hi32), mkU8(8)), + mkU32(0xFF))); + if (out4) + assign(*out4, binop(Iop_And32, mkexpr(hi32), mkU32(0xFF))); + if (out3) + assign(*out3, binop(Iop_And32, + binop(Iop_Shr32, mkexpr(lo32), mkU8(24)), + mkU32(0xFF))); + if (out2) + assign(*out2, binop(Iop_And32, + binop(Iop_Shr32, mkexpr(lo32), mkU8(16)), + mkU32(0xFF))); + if (out1) + assign(*out1, binop(Iop_And32, + binop(Iop_Shr32, mkexpr(lo32), mkU8(8)), + mkU32(0xFF))); + if (out0) + assign(*out0, binop(Iop_And32, mkexpr(lo32), mkU32(0xFF))); +} + +static IRTemp mk64from8s ( IRTemp in7, 
IRTemp in6, IRTemp in5, IRTemp in4, + IRTemp in3, IRTemp in2, IRTemp in1, IRTemp in0 ) +{ + IRTemp hi32 = newTemp(Ity_I32); + IRTemp lo32 = newTemp(Ity_I32); + assign(hi32, + binop(Iop_Or32, + binop(Iop_Or32, + binop(Iop_Shl32, + binop(Iop_And32, mkexpr(in7), mkU32(0xFF)), + mkU8(24)), + binop(Iop_Shl32, + binop(Iop_And32, mkexpr(in6), mkU32(0xFF)), + mkU8(16))), + binop(Iop_Or32, + binop(Iop_Shl32, + binop(Iop_And32, mkexpr(in5), mkU32(0xFF)), mkU8(8)), + binop(Iop_And32, + mkexpr(in4), mkU32(0xFF))))); + assign(lo32, + binop(Iop_Or32, + binop(Iop_Or32, + binop(Iop_Shl32, + binop(Iop_And32, mkexpr(in3), mkU32(0xFF)), + mkU8(24)), + binop(Iop_Shl32, + binop(Iop_And32, mkexpr(in2), mkU32(0xFF)), + mkU8(16))), + binop(Iop_Or32, + binop(Iop_Shl32, + binop(Iop_And32, mkexpr(in1), mkU32(0xFF)), mkU8(8)), + binop(Iop_And32, + mkexpr(in0), mkU32(0xFF))))); + IRTemp res = newTemp(Ity_I64); + assign(res, binop(Iop_32HLto64, mkexpr(hi32), mkexpr(lo32))); + return res; +} + +static IRExpr* mk_InterleaveLO8x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a3 b3 a2 b2 a1 b1 a0 b0 + IRTemp a3, b3, a2, b2, a1, a0, b1, b0; + break64to8s(NULL, NULL, NULL, NULL, &a3, &a2, &a1, &a0, a76543210); + break64to8s(NULL, NULL, NULL, NULL, &b3, &b2, &b1, &b0, b76543210); + return mkexpr(mk64from8s(a3, b3, a2, b2, a1, b1, a0, b0)); +} + +static IRExpr* mk_InterleaveHI8x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a7 b7 a6 b6 a5 b5 a4 b4 + IRTemp a7, b7, a6, b6, a5, b5, a4, b4; + break64to8s(&a7, &a6, &a5, &a4, NULL, NULL, NULL, NULL, a76543210); + break64to8s(&b7, &b6, &b5, &b4, NULL, NULL, NULL, NULL, b76543210); + return mkexpr(mk64from8s(a7, b7, a6, b6, a5, b5, a4, b4)); +} + +static IRExpr* mk_CatEvenLanes8x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a6 a4 a2 a0 b6 b4 b2 b0 + IRTemp a6, a4, a2, a0, b6, b4, b2, b0; + break64to8s(NULL, &a6, NULL, &a4, NULL, &a2, NULL, &a0, a76543210); + break64to8s(NULL, &b6, NULL, &b4, NULL, &b2, NULL, &b0, b76543210); + return mkexpr(mk64from8s(a6, a4, a2, a0, b6, b4, b2, b0)); +} + +static IRExpr* mk_CatOddLanes8x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a7 a5 a3 a1 b7 b5 b3 b1 + IRTemp a7, a5, a3, a1, b7, b5, b3, b1; + break64to8s(&a7, NULL, &a5, NULL, &a3, NULL, &a1, NULL, a76543210); + break64to8s(&b7, NULL, &b5, NULL, &b3, NULL, &b1, NULL, b76543210); + return mkexpr(mk64from8s(a7, a5, a3, a1, b7, b5, b3, b1)); +} + +static IRExpr* mk_InterleaveEvenLanes8x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a6 b6 a4 b4 a2 b2 a0 b0 + IRTemp a6, b6, a4, b4, a2, b2, a0, b0; + break64to8s(NULL, &a6, NULL, &a4, NULL, &a2, NULL, &a0, a76543210); + break64to8s(NULL, &b6, NULL, &b4, NULL, &b2, NULL, &b0, b76543210); + return mkexpr(mk64from8s(a6, b6, a4, b4, a2, b2, a0, b0)); +} + +static IRExpr* mk_InterleaveOddLanes8x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a7 b7 a5 b5 a3 b3 a1 b1 + IRTemp a7, b7, a5, b5, a3, b3, a1, b1; + break64to8s(&a7, NULL, &a5, NULL, &a3, NULL, &a1, NULL, a76543210); + break64to8s(&b7, NULL, &b5, NULL, &b3, NULL, &b1, NULL, b76543210); + return mkexpr(mk64from8s(a7, b7, a5, b5, a3, b3, a1, b1)); +} + +static IRExpr* mk_InterleaveLO32x2 ( IRTemp a10, IRTemp b10 ) +{ + // returns a0 b0 + return binop(Iop_32HLto64, unop(Iop_64to32, mkexpr(a10)), + unop(Iop_64to32, mkexpr(b10))); +} + +static IRExpr* mk_InterleaveHI32x2 ( IRTemp a10, IRTemp b10 ) +{ + // returns a1 b1 + return binop(Iop_32HLto64, unop(Iop_64HIto32, mkexpr(a10)), + unop(Iop_64HIto32, mkexpr(b10))); +} +*/ + 
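/* Editorial sketch, not part of the commit: a standalone host-side
   scalar model of the 2-way deinterleave/interleave that
   math_DEINTERLEAVE_2 and math_INTERLEAVE_2 express with IROps.  All
   names below are hypothetical.  'laneszB' is 1, 2 or 4 exactly as in
   the IR versions, and memory is modelled little-endianly as a 16-byte
   array holding the two 64-bit chunks i0 (bytes 0..7) and i1 (bytes
   8..15).  Useful for spot-checking lane orderings such as
   "u0 == A3 A2 A1 A0, u1 == B3 B2 B1 B0". */
/*
// Deinterleave, as on the VLD2 path: u0 collects the A lanes and u1
// the B lanes of the interleaved memory image.
static void ref_deinterleave_2 ( unsigned char* u0, unsigned char* u1,
                                 const unsigned char* mem, int laneszB )
{
   int k, b;
   int lanes = 8 / laneszB;   // lanes per 64-bit half
   for (k = 0; k < lanes; k++) {
      for (b = 0; b < laneszB; b++) {
         u0[k * laneszB + b] = mem[(2 * k + 0) * laneszB + b];  // A lanes
         u1[k * laneszB + b] = mem[(2 * k + 1) * laneszB + b];  // B lanes
      }
   }
}

// Interleave, as on the VST2 path: the exact inverse.
static void ref_interleave_2 ( unsigned char* mem,
                               const unsigned char* u0,
                               const unsigned char* u1, int laneszB )
{
   int k, b;
   int lanes = 8 / laneszB;
   for (k = 0; k < lanes; k++) {
      for (b = 0; b < laneszB; b++) {
         mem[(2 * k + 0) * laneszB + b] = u0[k * laneszB + b];
         mem[(2 * k + 1) * laneszB + b] = u1[k * laneszB + b];
      }
   }
}
*/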
/*--------------------------------------------------------------------*/ /*--- end guest_arm_toIR.c ---*/ /*--------------------------------------------------------------------*/ diff --git a/VEX/priv/host_arm_isel.c b/VEX/priv/host_arm_isel.c index b2c9edebcc..b744cc380c 100644 --- a/VEX/priv/host_arm_isel.c +++ b/VEX/priv/host_arm_isel.c @@ -2203,112 +2203,83 @@ static HReg iselNeon64Expr_wrk ( ISelEnv* env, IRExpr* e ) res, argL, argR, size, False)); return res; } - case Iop_InterleaveOddLanes8x8: - case Iop_InterleaveOddLanes16x4: + + // These 6 verified 18 Apr 2013 + case Iop_InterleaveHI32x2: case Iop_InterleaveLO32x2: + case Iop_InterleaveOddLanes8x8: case Iop_InterleaveEvenLanes8x8: - case Iop_InterleaveEvenLanes16x4: - case Iop_InterleaveHI32x2: { - HReg tmp = newVRegD(env); - HReg res = newVRegD(env); + case Iop_InterleaveOddLanes16x4: + case Iop_InterleaveEvenLanes16x4: { + HReg rD = newVRegD(env); + HReg rM = newVRegD(env); HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); UInt size; - UInt is_lo; + Bool resRd; // is the result in rD or rM ? switch (e->Iex.Binop.op) { - case Iop_InterleaveOddLanes8x8: is_lo = 1; size = 0; break; - case Iop_InterleaveEvenLanes8x8: is_lo = 0; size = 0; break; - case Iop_InterleaveOddLanes16x4: is_lo = 1; size = 1; break; - case Iop_InterleaveEvenLanes16x4: is_lo = 0; size = 1; break; - case Iop_InterleaveLO32x2: is_lo = 1; size = 2; break; - case Iop_InterleaveHI32x2: is_lo = 0; size = 2; break; + case Iop_InterleaveOddLanes8x8: resRd = False; size = 0; break; + case Iop_InterleaveEvenLanes8x8: resRd = True; size = 0; break; + case Iop_InterleaveOddLanes16x4: resRd = False; size = 1; break; + case Iop_InterleaveEvenLanes16x4: resRd = True; size = 1; break; + case Iop_InterleaveHI32x2: resRd = False; size = 2; break; + case Iop_InterleaveLO32x2: resRd = True; size = 2; break; default: vassert(0); } - if (is_lo) { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argL, 4, False)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argR, 4, False)); - addInstr(env, ARMInstr_NDual(ARMneon_TRN, - res, tmp, size, False)); - } else { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argR, 4, False)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argL, 4, False)); - addInstr(env, ARMInstr_NDual(ARMneon_TRN, - tmp, res, size, False)); - } - return res; + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, False)); + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, False)); + addInstr(env, ARMInstr_NDual(ARMneon_TRN, rD, rM, size, False)); + return resRd ? rD : rM; } + + // These 4 verified 18 Apr 2013 case Iop_InterleaveHI8x8: - case Iop_InterleaveHI16x4: case Iop_InterleaveLO8x8: + case Iop_InterleaveHI16x4: case Iop_InterleaveLO16x4: { - HReg tmp = newVRegD(env); - HReg res = newVRegD(env); + HReg rD = newVRegD(env); + HReg rM = newVRegD(env); HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); UInt size; - UInt is_lo; + Bool resRd; // is the result in rD or rM ? 
switch (e->Iex.Binop.op) { - case Iop_InterleaveHI8x8: is_lo = 1; size = 0; break; - case Iop_InterleaveLO8x8: is_lo = 0; size = 0; break; - case Iop_InterleaveHI16x4: is_lo = 1; size = 1; break; - case Iop_InterleaveLO16x4: is_lo = 0; size = 1; break; + case Iop_InterleaveHI8x8: resRd = False; size = 0; break; + case Iop_InterleaveLO8x8: resRd = True; size = 0; break; + case Iop_InterleaveHI16x4: resRd = False; size = 1; break; + case Iop_InterleaveLO16x4: resRd = True; size = 1; break; default: vassert(0); } - if (is_lo) { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argL, 4, False)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argR, 4, False)); - addInstr(env, ARMInstr_NDual(ARMneon_ZIP, - res, tmp, size, False)); - } else { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argR, 4, False)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argL, 4, False)); - addInstr(env, ARMInstr_NDual(ARMneon_ZIP, - tmp, res, size, False)); - } - return res; + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, False)); + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, False)); + addInstr(env, ARMInstr_NDual(ARMneon_ZIP, rD, rM, size, False)); + return resRd ? rD : rM; } + + // These 4 verified 18 Apr 2013 case Iop_CatOddLanes8x8: - case Iop_CatOddLanes16x4: case Iop_CatEvenLanes8x8: + case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4: { - HReg tmp = newVRegD(env); - HReg res = newVRegD(env); + HReg rD = newVRegD(env); + HReg rM = newVRegD(env); HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1); HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2); UInt size; - UInt is_lo; + Bool resRd; // is the result in rD or rM ? switch (e->Iex.Binop.op) { - case Iop_CatOddLanes8x8: is_lo = 1; size = 0; break; - case Iop_CatEvenLanes8x8: is_lo = 0; size = 0; break; - case Iop_CatOddLanes16x4: is_lo = 1; size = 1; break; - case Iop_CatEvenLanes16x4: is_lo = 0; size = 1; break; + case Iop_CatOddLanes8x8: resRd = False; size = 0; break; + case Iop_CatEvenLanes8x8: resRd = True; size = 0; break; + case Iop_CatOddLanes16x4: resRd = False; size = 1; break; + case Iop_CatEvenLanes16x4: resRd = True; size = 1; break; default: vassert(0); } - if (is_lo) { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argL, 4, False)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argR, 4, False)); - addInstr(env, ARMInstr_NDual(ARMneon_UZP, - res, tmp, size, False)); - } else { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argR, 4, False)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argL, 4, False)); - addInstr(env, ARMInstr_NDual(ARMneon_UZP, - tmp, res, size, False)); - } - return res; + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, False)); + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, False)); + addInstr(env, ARMInstr_NDual(ARMneon_UZP, rD, rM, size, False)); + return resRd ? 
rD : rM; } + case Iop_QAdd8Ux8: case Iop_QAdd16Ux4: case Iop_QAdd32Ux2: @@ -4399,126 +4370,91 @@ static HReg iselNeonExpr_wrk ( ISelEnv* env, IRExpr* e ) res, argL, argR, size, True)); return res; } + + // These 6 verified 18 Apr 2013 case Iop_InterleaveEvenLanes8x16: - case Iop_InterleaveEvenLanes16x8: - case Iop_InterleaveEvenLanes32x4: case Iop_InterleaveOddLanes8x16: + case Iop_InterleaveEvenLanes16x8: case Iop_InterleaveOddLanes16x8: + case Iop_InterleaveEvenLanes32x4: case Iop_InterleaveOddLanes32x4: { - HReg tmp = newVRegV(env); - HReg res = newVRegV(env); + HReg rD = newVRegV(env); + HReg rM = newVRegV(env); HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); UInt size; - UInt is_lo; + Bool resRd; // is the result in rD or rM ? switch (e->Iex.Binop.op) { - case Iop_InterleaveEvenLanes8x16: is_lo = 0; size = 0; break; - case Iop_InterleaveOddLanes8x16: is_lo = 1; size = 0; break; - case Iop_InterleaveEvenLanes16x8: is_lo = 0; size = 1; break; - case Iop_InterleaveOddLanes16x8: is_lo = 1; size = 1; break; - case Iop_InterleaveEvenLanes32x4: is_lo = 0; size = 2; break; - case Iop_InterleaveOddLanes32x4: is_lo = 1; size = 2; break; - default: - ppIROp(e->Iex.Binop.op); - vpanic("Illegal element size in VTRN"); - } - if (is_lo) { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argL, 4, True)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argR, 4, True)); - addInstr(env, ARMInstr_NDual(ARMneon_TRN, - res, tmp, size, True)); - } else { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argR, 4, True)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argL, 4, True)); - addInstr(env, ARMInstr_NDual(ARMneon_TRN, - tmp, res, size, True)); + case Iop_InterleaveOddLanes8x16: resRd = False; size = 0; break; + case Iop_InterleaveEvenLanes8x16: resRd = True; size = 0; break; + case Iop_InterleaveOddLanes16x8: resRd = False; size = 1; break; + case Iop_InterleaveEvenLanes16x8: resRd = True; size = 1; break; + case Iop_InterleaveOddLanes32x4: resRd = False; size = 2; break; + case Iop_InterleaveEvenLanes32x4: resRd = True; size = 2; break; + default: vassert(0); } - return res; + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, True)); + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, True)); + addInstr(env, ARMInstr_NDual(ARMneon_TRN, rD, rM, size, True)); + return resRd ? rD : rM; } + + // These 6 verified 18 Apr 2013 case Iop_InterleaveHI8x16: - case Iop_InterleaveHI16x8: - case Iop_InterleaveHI32x4: case Iop_InterleaveLO8x16: + case Iop_InterleaveHI16x8: case Iop_InterleaveLO16x8: + case Iop_InterleaveHI32x4: case Iop_InterleaveLO32x4: { - HReg tmp = newVRegV(env); - HReg res = newVRegV(env); + HReg rD = newVRegV(env); + HReg rM = newVRegV(env); HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); UInt size; - UInt is_lo; + Bool resRd; // is the result in rD or rM ? 
switch (e->Iex.Binop.op) { - case Iop_InterleaveHI8x16: is_lo = 1; size = 0; break; - case Iop_InterleaveLO8x16: is_lo = 0; size = 0; break; - case Iop_InterleaveHI16x8: is_lo = 1; size = 1; break; - case Iop_InterleaveLO16x8: is_lo = 0; size = 1; break; - case Iop_InterleaveHI32x4: is_lo = 1; size = 2; break; - case Iop_InterleaveLO32x4: is_lo = 0; size = 2; break; - default: - ppIROp(e->Iex.Binop.op); - vpanic("Illegal element size in VZIP"); - } - if (is_lo) { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argL, 4, True)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argR, 4, True)); - addInstr(env, ARMInstr_NDual(ARMneon_ZIP, - res, tmp, size, True)); - } else { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argR, 4, True)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argL, 4, True)); - addInstr(env, ARMInstr_NDual(ARMneon_ZIP, - tmp, res, size, True)); + case Iop_InterleaveHI8x16: resRd = False; size = 0; break; + case Iop_InterleaveLO8x16: resRd = True; size = 0; break; + case Iop_InterleaveHI16x8: resRd = False; size = 1; break; + case Iop_InterleaveLO16x8: resRd = True; size = 1; break; + case Iop_InterleaveHI32x4: resRd = False; size = 2; break; + case Iop_InterleaveLO32x4: resRd = True; size = 2; break; + default: vassert(0); } - return res; + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, True)); + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, True)); + addInstr(env, ARMInstr_NDual(ARMneon_ZIP, rD, rM, size, True)); + return resRd ? rD : rM; } + + // These 6 verified 18 Apr 2013 case Iop_CatOddLanes8x16: - case Iop_CatOddLanes16x8: - case Iop_CatOddLanes32x4: case Iop_CatEvenLanes8x16: + case Iop_CatOddLanes16x8: case Iop_CatEvenLanes16x8: + case Iop_CatOddLanes32x4: case Iop_CatEvenLanes32x4: { - HReg tmp = newVRegV(env); - HReg res = newVRegV(env); + HReg rD = newVRegV(env); + HReg rM = newVRegV(env); HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); UInt size; - UInt is_lo; + Bool resRd; // is the result in rD or rM ? 
switch (e->Iex.Binop.op) { - case Iop_CatOddLanes8x16: is_lo = 1; size = 0; break; - case Iop_CatEvenLanes8x16: is_lo = 0; size = 0; break; - case Iop_CatOddLanes16x8: is_lo = 1; size = 1; break; - case Iop_CatEvenLanes16x8: is_lo = 0; size = 1; break; - case Iop_CatOddLanes32x4: is_lo = 1; size = 2; break; - case Iop_CatEvenLanes32x4: is_lo = 0; size = 2; break; - default: - ppIROp(e->Iex.Binop.op); - vpanic("Illegal element size in VUZP"); - } - if (is_lo) { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argL, 4, True)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argR, 4, True)); - addInstr(env, ARMInstr_NDual(ARMneon_UZP, - res, tmp, size, True)); - } else { - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - tmp, argR, 4, True)); - addInstr(env, ARMInstr_NUnary(ARMneon_COPY, - res, argL, 4, True)); - addInstr(env, ARMInstr_NDual(ARMneon_UZP, - tmp, res, size, True)); + case Iop_CatOddLanes8x16: resRd = False; size = 0; break; + case Iop_CatEvenLanes8x16: resRd = True; size = 0; break; + case Iop_CatOddLanes16x8: resRd = False; size = 1; break; + case Iop_CatEvenLanes16x8: resRd = True; size = 1; break; + case Iop_CatOddLanes32x4: resRd = False; size = 2; break; + case Iop_CatEvenLanes32x4: resRd = True; size = 2; break; + default: vassert(0); } - return res; + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rM, argL, 4, True)); + addInstr(env, ARMInstr_NUnary(ARMneon_COPY, rD, argR, 4, True)); + addInstr(env, ARMInstr_NDual(ARMneon_UZP, rD, rM, size, True)); + return resRd ? rD : rM; } + case Iop_QAdd8Ux16: case Iop_QAdd16Ux8: case Iop_QAdd32Ux4: diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 00a463a353..cc8ea54972 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -953,7 +953,6 @@ typedef Iop_InterleaveOddLanes8x8, Iop_InterleaveEvenLanes8x8, Iop_InterleaveOddLanes16x4, Iop_InterleaveEvenLanes16x4, - /* CONCATENATION -- build a new value by concatenating either the even or odd lanes of both operands. Note that Cat{Odd,Even}Lanes32x2 are identical to Interleave{HI,LO}32x2
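With only two 32-bit lanes per operand, "even lanes of both operands" is just
lane 0 of each and "odd lanes" is lane 1 of each, which is why the comment
above can say that Cat{Odd,Even}Lanes32x2 coincide with Interleave{HI,LO}32x2.
A minimal standalone check of that identity (editorial sketch, not part of
the commit; all names here are hypothetical):

#include <assert.h>
#include <stdint.h>

/* A 64-bit SIMD value with two 32-bit lanes: lane 1 in the high half,
   lane 0 in the low half, matching mk_InterleaveLO32x2/HI32x2 in the
   reference code above. */
static uint64_t mk2x32 ( uint32_t lane1, uint32_t lane0 )
{
   return ((uint64_t)lane1 << 32) | lane0;
}

static uint64_t ileaveLO32x2 ( uint64_t a, uint64_t b ) // a0 b0
{
   return mk2x32((uint32_t)a, (uint32_t)b);
}

static uint64_t ileaveHI32x2 ( uint64_t a, uint64_t b ) // a1 b1
{
   return mk2x32((uint32_t)(a >> 32), (uint32_t)(b >> 32));
}

int main ( void )
{
   uint64_t a = mk2x32(0xA1A1A1A1u, 0xA0A0A0A0u); // lanes a1, a0
   uint64_t b = mk2x32(0xB1B1B1B1u, 0xB0B0B0B0u); // lanes b1, b0
   // CatEvenLanes32x2 would give a0 b0 and CatOddLanes32x2 a1 b1,
   // which is exactly InterleaveLO32x2 resp. InterleaveHI32x2.
   assert(ileaveLO32x2(a, b) == mk2x32(0xA0A0A0A0u, 0xB0B0B0B0u));
   assert(ileaveHI32x2(a, b) == mk2x32(0xA1A1A1A1u, 0xB1B1B1B1u));
   return 0;
}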