From: Julian Seward
Date: Sun, 26 Jan 2014 19:11:14 +0000 (+0000)
Subject: Improve front and back end support for SIMD instructions on Arm64.
X-Git-Tag: svn/VALGRIND_3_10_1^2~160
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c4a62f1f1684669092028b4aca027c6836df3793;p=thirdparty%2Fvalgrind.git

Improve front and back end support for SIMD instructions on Arm64.
Implement the following instructions -- some but not necessarily all
laneage combinations:

   LD1 {vT.2d}, [Xn|SP]
   ST1 {vT.2d}, [Xn|SP]
   LD1 {vT.4s}, [Xn|SP]
   ST1 {vT.4s}, [Xn|SP]
   LD1 {vT.8h}, [Xn|SP]
   ST1 {vT.8h}, [Xn|SP]
   LD1 {vT.16b}, [Xn|SP]
   ST1 {vT.16b}, [Xn|SP]
   LD1 {vT.1d}, [Xn|SP]
   ST1 {vT.1d}, [Xn|SP]
   LD1 {vT.2s}, [Xn|SP]
   ST1 {vT.2s}, [Xn|SP]
   LD1 {vT.4h}, [Xn|SP]
   ST1 {vT.4h}, [Xn|SP]
   LD1 {vT.8b}, [Xn|SP]
   ST1 {vT.8b}, [Xn|SP]
   ST1 {vT.2d}, [xN|SP], #16
   LD1 {vT.2d}, [xN|SP], #16
   ST1 {vT.4s}, [xN|SP], #16
   ST1 {vT.8h}, [xN|SP], #16
   ST1 {vT.2s}, [xN|SP], #8
   SCVTF Vd, Vn
   UCVTF Vd, Vn
   FADD Vd,Vn,Vm  1
   FSUB Vd,Vn,Vm  2
   FMUL Vd,Vn,Vm  3
   FDIV Vd,Vn,Vm  4
   FMLA Vd,Vn,Vm  5
   FMLS Vd,Vn,Vm  6
   ADD Vd.T, Vn.T, Vm.T
   SUB Vd.T, Vn.T, Vm.T
   XTN {,2}
   DUP Vd.T, Vn.Ts[index]

git-svn-id: svn://svn.valgrind.org/vex/trunk@2810
---

diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
index 7fbb6a6ed7..290e1c840c 100644
--- a/VEX/priv/guest_arm64_toIR.c
+++ b/VEX/priv/guest_arm64_toIR.c
@@ -896,7 +896,6 @@ static Int offsetQReg128 ( UInt qregNo )
    }
 }
 
-
 /* Write to a complete Qreg. */
 static void putQReg128 ( UInt qregNo, IRExpr* e )
 {
@@ -929,54 +928,61 @@ static IRType preferredVectorSubTypeFromSize ( UInt szB )
    }
 }
 
-/* Find the offset of the szB'th least significant bytes of the given
-   Qreg.  This requires knowing the endianness of the host. */
-static Int offsetQReg ( UInt szB, UInt qregNo )
+/* Find the offset of the laneNo'th lane of type laneTy in the given
+   Qreg.  Since the host is little-endian, the least significant lane
+   has the lowest offset. */
+static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
 {
    vassert(!host_is_bigendian);
    Int base = offsetQReg128(qregNo);
-   /* Since we're dealing with a little-endian host, all of the
-      sub-parts will have the same offset as the base register.  But
-      we still need to check that szB is valid. */
-   switch (szB) {
-      case 1: case 2: case 4: case 8: case 16: break;
-      default: vassert(0);
+   /* Since the host is little-endian, the least significant lane
+      will be at the lowest address. */
+   /* Restrict this to known types, so as to avoid silently accepting
+      stupid types. */
+   UInt laneSzB = 0;
+   switch (laneTy) {
+      case Ity_F32: case Ity_I32: laneSzB = 4; break;
+      case Ity_F64: case Ity_I64: laneSzB = 8; break;
+      case Ity_V128: laneSzB = 16; break;
+      default: break;
    }
-   return base;
+   vassert(laneSzB > 0);
+   UInt minOff = laneNo * laneSzB;
+   UInt maxOff = minOff + laneSzB - 1;
+   vassert(maxOff < 16);
+   return base + minOff;
 }
 
-static void putQReg ( UInt qregNo, IRExpr* e )
+/* Put to the least significant lane of a Qreg.
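+   Only the low lane is written; the remaining bytes of the register
+   are left unchanged.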
*/ +static void putQRegLO ( UInt qregNo, IRExpr* e ) { IRType ty = typeOfIRExpr(irsb->tyenv, e); - Int off = offsetQReg(sizeofIRType(ty), qregNo); + Int off = offsetQRegLane(qregNo, ty, 0); switch (ty) { - case Ity_I8: break; - case Ity_I16: break; - case Ity_I32: break; - case Ity_F32: break; - case Ity_I64: break; - case Ity_F64: break; - case Ity_V128: break; - default: vassert(0); // Other cases are ATC + case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64: + case Ity_F32: case Ity_F64: case Ity_V128: + break; + default: + vassert(0); // Other cases are probably invalid } stmt(IRStmt_Put(off, e)); } -static IRExpr* getQReg ( IRType ty, UInt qregNo ) +/* Get from the least significant lane of a Qreg. */ +static IRExpr* getQRegLO ( UInt qregNo, IRType ty ) { - Int off = offsetQReg(sizeofIRType(ty), qregNo); + Int off = offsetQRegLane(qregNo, ty, 0); switch (ty) { - case Ity_I32: break; - case Ity_F32: break; - case Ity_I64: break; - case Ity_F64: break; - case Ity_V128: break; - default: vassert(0); // Other cases are ATC + case Ity_I32: case Ity_I64: + case Ity_F32: case Ity_F64: case Ity_V128: + break; + default: + vassert(0); // Other cases are ATC } return IRExpr_Get(off, ty); } -static const HChar* nameQReg ( UInt szB, UInt qregNo ) +static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy ) { static const HChar* namesQ[32] = { "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", @@ -1004,7 +1010,7 @@ static const HChar* nameQReg ( UInt szB, UInt qregNo ) "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23", "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" }; vassert(qregNo < 32); - switch (szB) { + switch (sizeofIRType(laneTy)) { case 1: return namesB[qregNo]; case 2: return namesH[qregNo]; case 4: return namesS[qregNo]; @@ -1015,34 +1021,64 @@ static const HChar* nameQReg ( UInt szB, UInt qregNo ) /*NOTREACHED*/ } +static const HChar* nameQReg128 ( UInt qregNo ) +{ + return nameQRegLO(qregNo, Ity_V128); +} + /* Find the offset of the most significant half (8 bytes) of the given Qreg. This requires knowing the endianness of the host. */ -static Int offsetQReg64HI ( UInt qregNo ) +static Int offsetQRegHI64 ( UInt qregNo ) { - vassert(!host_is_bigendian); - Int base = offsetQReg128(qregNo); - /* Since the host is little endian, the least significant half is - at the lower offset. So add 8 to get the MS half offset. */ - return base+8; + return offsetQRegLane(qregNo, Ity_I64, 1); } -static IRExpr* getQReg64HI ( UInt qregNo ) +static IRExpr* getQRegHI64 ( UInt qregNo ) { - return IRExpr_Get(offsetQReg64HI(qregNo), Ity_I64); + return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64); } -static void putQReg64HI ( UInt qregNo, IRExpr* e ) +static void putQRegHI64 ( UInt qregNo, IRExpr* e ) { IRType ty = typeOfIRExpr(irsb->tyenv, e); - Int off = offsetQReg64HI(qregNo); + Int off = offsetQRegHI64(qregNo); switch (ty) { - case Ity_I64: break; - case Ity_F64: break; - default: vassert(0); // Other cases are plain wrong + case Ity_I64: case Ity_F64: + break; + default: + vassert(0); // Other cases are plain wrong } stmt(IRStmt_Put(off, e)); } +/* Put to a specified lane of a Qreg. */ +static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e ) +{ + IRType laneTy = typeOfIRExpr(irsb->tyenv, e); + Int off = offsetQRegLane(qregNo, laneTy, laneNo); + switch (laneTy) { + case Ity_F64: case Ity_I64: + break; + default: + vassert(0); // Other cases are ATC + } + stmt(IRStmt_Put(off, e)); +} + +/* Get from the least significant lane of a Qreg. 
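+ (Or, more generally, from the laneNo'th lane.)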
*/ +static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy ) +{ + Int off = offsetQRegLane(qregNo, laneTy, laneNo); + switch (laneTy) { + case Ity_I64: case Ity_I32: + break; + default: + vassert(0); // Other cases are ATC + } + return IRExpr_Get(off, laneTy); +} + + //ZZ /* ---------------- Misc registers ---------------- */ //ZZ //ZZ static void putMiscReg32 ( UInt gsoffset, @@ -1533,6 +1569,45 @@ static IRTemp math_BSWAP64 ( IRTemp t1 ) } +/* Duplicates the bits at the bottom of the given word to fill the + whole word. src :: Ity_I64 is assumed to have zeroes everywhere + except for the bottom bits. */ +static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy ) +{ + if (srcTy == Ity_I8) { + IRTemp t16 = newTemp(Ity_I64); + assign(t16, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(8)))); + IRTemp t32 = newTemp(Ity_I64); + assign(t32, binop(Iop_Or64, mkexpr(t16), + binop(Iop_Shl64, mkexpr(t16), mkU8(16)))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(t32), + binop(Iop_Shl64, mkexpr(t32), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I16) { + IRTemp t32 = newTemp(Ity_I64); + assign(t32, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(16)))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(t32), + binop(Iop_Shl64, mkexpr(t32), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I32) { + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_Or64, mkexpr(src), + binop(Iop_Shl64, mkexpr(src), mkU8(32)))); + return t64; + } + if (srcTy == Ity_I64) { + return src; + } + vassert(0); +} + + /*------------------------------------------------------------*/ /*--- FP comparison helpers ---*/ /*------------------------------------------------------------*/ @@ -3535,15 +3610,15 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) } if (isLD) { - putQReg(tt1, - loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0)))); - putQReg(tt2, - loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB)))); + putQRegLO(tt1, + loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0)))); + putQRegLO(tt2, + loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB)))); } else { storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)), - getQReg(ty, tt1)); + getQRegLO(tt1, ty)); storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)), - getQReg(ty, tt2)); + getQRegLO(tt2, ty)); } if (wBack) @@ -3564,7 +3639,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) vassert(0); } DIP(fmt_str, isLD ? 
"ld" : "st", - nameQReg(szB, tt1), nameQReg(szB, tt2), + nameQRegLO(tt1, ty), nameQRegLO(tt2, ty), nameIReg64orSP(nn), simm7); return True; } @@ -3598,43 +3673,43 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) case 0: /* 8 bit */ if (isLD) { putQReg128(tt, mkV128(0x0000)); - putQReg(tt, loadLE(Ity_I8, mkexpr(ea))); - DIP("ldr %s, %s\n", nameQReg(1, tt), dis_buf); + putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf); } else { vassert(0); //ATC - storeLE(mkexpr(ea), getQReg(Ity_I8, tt)); - DIP("str %s, %s\n", nameQReg(1, tt), dis_buf); + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf); } break; case 1: if (isLD) { putQReg128(tt, mkV128(0x0000)); - putQReg(tt, loadLE(Ity_I16, mkexpr(ea))); - DIP("ldr %s, %s\n", nameQReg(2, tt), dis_buf); + putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf); } else { vassert(0); //ATC - storeLE(mkexpr(ea), getQReg(Ity_I16, tt)); - DIP("str %s, %s\n", nameQReg(2, tt), dis_buf); + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf); } break; case 2: /* 32 bit */ if (isLD) { putQReg128(tt, mkV128(0x0000)); - putQReg(tt, loadLE(Ity_I32, mkexpr(ea))); - DIP("ldr %s, %s\n", nameQReg(4, tt), dis_buf); + putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf); } else { - storeLE(mkexpr(ea), getQReg(Ity_I32, tt)); - DIP("str %s, %s\n", nameQReg(4, tt), dis_buf); + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf); } break; case 3: /* 64 bit */ if (isLD) { putQReg128(tt, mkV128(0x0000)); - putQReg(tt, loadLE(Ity_I64, mkexpr(ea))); - DIP("ldr %s, %s\n", nameQReg(8, tt), dis_buf); + putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea))); + DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf); } else { - storeLE(mkexpr(ea), getQReg(Ity_I64, tt)); - DIP("str %s, %s\n", nameQReg(8, tt), dis_buf); + storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64)); + DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf); } break; case 4: return False; //ATC @@ -3727,13 +3802,13 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) if (szLg2 < 4) { putQReg128(tt, mkV128(0x0000)); } - putQReg(tt, loadLE(ty, mkexpr(tEA))); + putQRegLO(tt, loadLE(ty, mkexpr(tEA))); } else { - storeLE(mkexpr(tEA), getQReg(ty, tt)); + storeLE(mkexpr(tEA), getQRegLO(tt, ty)); } DIP("%s %s, [%s, #%u]\n", isLD ? "ldr" : "str", - nameQReg(1 << szLg2, tt), nameIReg64orSP(nn), pimm12); + nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12); return True; } @@ -3778,14 +3853,14 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) if (szLg2 < 4) { putQReg128(tt, mkV128(0x0000)); } - putQReg(tt, loadLE(ty, mkexpr(tTA))); + putQRegLO(tt, loadLE(ty, mkexpr(tTA))); } else { - storeLE(mkexpr(tTA), getQReg(ty, tt)); + storeLE(mkexpr(tTA), getQRegLO(tt, ty)); } putIReg64orSP(nn, mkexpr(tEA)); DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n", isLD ? 
"ldr" : "str", - nameQReg(1 << szLg2, tt), nameIReg64orSP(nn), simm9); + nameQRegLO(tt, ty), nameIReg64orSP(nn), simm9); return True; } @@ -3816,16 +3891,16 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) IRType ty = preferredVectorSubTypeFromSize(1 << szLg2); assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9))); if (isLD) { - if (szLg2 < 4) { - putQReg128(tt, mkV128(0x0000)); - } - putQReg(tt, loadLE(ty, mkexpr(tEA))); + if (szLg2 < 4) { + putQReg128(tt, mkV128(0x0000)); + } + putQRegLO(tt, loadLE(ty, mkexpr(tEA))); } else { - storeLE(mkexpr(tEA), getQReg(ty, tt)); + storeLE(mkexpr(tEA), getQRegLO(tt, ty)); } DIP("%s %s, [%s, #%lld]\n", isLD ? "ldur" : "stur", - nameQReg(1 << szLg2, tt), nameIReg64orSP(nn), (Long)simm9); + nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9); return True; } @@ -3841,49 +3916,98 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) UInt tt = INSN(4,0); ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21); IRType ty = preferredVectorSubTypeFromSize(szB); - putQReg(tt, loadLE(ty, mkU64(ea))); - DIP("ldr %s, 0x%llx (literal)\n", nameQReg(szB, tt), ea); + putQReg128(tt, mkV128(0x0000)); + putQRegLO(tt, loadLE(ty, mkU64(ea))); + DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea); return True; } - /* FIXME Temporary hacks to get through ld.so FIXME */ + /* ---------- LD1/ST1 (single structure, no offset) ---------- */ + /* 31 23 + 0100 1100 0100 0000 0111 11 N T LD1 {vT.2d}, [Xn|SP] + 0100 1100 0000 0000 0111 11 N T ST1 {vT.2d}, [Xn|SP] + 0100 1100 0100 0000 0111 10 N T LD1 {vT.4s}, [Xn|SP] + 0100 1100 0000 0000 0111 10 N T ST1 {vT.4s}, [Xn|SP] + 0100 1100 0100 0000 0111 01 N T LD1 {vT.8h}, [Xn|SP] + 0100 1100 0000 0000 0111 01 N T ST1 {vT.8h}, [Xn|SP] + 0100 1100 0100 0000 0111 00 N T LD1 {vT.16b}, [Xn|SP] + 0100 1100 0000 0000 0111 00 N T ST1 {vT.16b}, [Xn|SP] + FIXME does this assume that the host is little endian? + */ + if ( (insn & 0xFFFFF000) == 0x4C407000 // LD1 cases + || (insn & 0xFFFFF000) == 0x4C007000 // ST1 cases + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "2d", "4s", "8h", "16b" }; + const HChar* name = names[INSN(11,10)]; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + if (isLD) { + putQReg128(vT, loadLE(Ity_V128, mkexpr(tEA))); + } else { + storeLE(mkexpr(tEA), getQReg128(vT)); + } + DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); + return True; + } - /* ------------------ ST1 variants ------------------ */ - /* st1 {vT.2d}, [], #16. - Note that #16 is implied and cannot be set to any - other value. - 0100 1100 1001 1111 0111 11 N T - FIXME doesn't this assume that the host is little endian? + /* 31 23 + 0000 1100 0100 0000 0111 11 N T LD1 {vT.1d}, [Xn|SP] + 0000 1100 0000 0000 0111 11 N T ST1 {vT.1d}, [Xn|SP] + 0000 1100 0100 0000 0111 10 N T LD1 {vT.2s}, [Xn|SP] + 0000 1100 0000 0000 0111 10 N T ST1 {vT.2s}, [Xn|SP] + 0000 1100 0100 0000 0111 01 N T LD1 {vT.4h}, [Xn|SP] + 0000 1100 0000 0000 0111 01 N T ST1 {vT.4h}, [Xn|SP] + 0000 1100 0100 0000 0111 00 N T LD1 {vT.8b}, [Xn|SP] + 0000 1100 0000 0000 0111 00 N T ST1 {vT.8b}, [Xn|SP] + FIXME does this assume that the host is little endian? 
*/ - if ((insn & 0xFFFFFC00) == 0x4C9F7C00) { - UInt rN = INSN(9,5); - UInt vT = INSN(4,0); - IRTemp tEA = newTemp(Ity_I64); + if ( (insn & 0xFFFFF000) == 0x0C407000 // LD1 cases + || (insn & 0xFFFFF000) == 0x0C007000 // ST1 cases + ) { + Bool isLD = INSN(22,22) == 1; + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "1d", "2s", "4h", "8b" }; + const HChar* name = names[INSN(11,10)]; assign(tEA, getIReg64orSP(rN)); if (rN == 31) { /* FIXME generate stack alignment check */ } - storeLE(mkexpr(tEA), getQReg128(vT)); - putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(16))); - DIP("st1 {v%u.2d}, [%s], #16\n", vT, nameIReg64orSP(rN)); + if (isLD) { + putQRegLane(vT, 0, loadLE(Ity_I64, mkexpr(tEA))); + putQRegLane(vT, 1, mkU64(0)); + } else { + storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64)); + } + DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); return True; } - /* ------------------ LD1 variants ------------------ */ + /* ---------- LD1/ST1 (single structure, post index) ---------- */ /* 31 23 - 0100 1100 0100 0000 0111 11 N T LD1 {vT.2d}, [Xn|SP] - 0100 1100 0000 0000 0111 11 N T ST1 {vT.2d}, [Xn|SP] - 0100 1100 0100 0000 0111 00 N T LD1 {vT.16b}, [Xn|SP] - 0100 1100 0000 0000 0111 00 N T ST1 {vT.16b}, [Xn|SP] - FIXME doesn't this assume that the host is little endian? + 0100 1100 1001 1111 0111 11 N T ST1 {vT.2d}, [xN|SP], #16 + 0100 1100 1101 1111 0111 11 N T LD1 {vT.2d}, [xN|SP], #16 + 0100 1100 1001 1111 0111 10 N T ST1 {vT.4s}, [xN|SP], #16 + 0100 1100 1001 1111 0111 01 N T ST1 {vT.8h}, [xN|SP], #16 + Note that #16 is implied and cannot be any other value. + FIXME does this assume that the host is little endian? */ - if ( (insn & 0xFFFFFC00) == 0x4C407C00 // LD1 {vT.2d}, [Xn|SP] - || (insn & 0xFFFFFC00) == 0x4C007C00 // ST1 {vT.2d}, [Xn|SP] - || (insn & 0xFFFFFC00) == 0x4C407000 // LD1 {vT.16b}, [Xn|SP] - || (insn & 0xFFFFFC00) == 0x4C007000 // ST1 {vT.16b}, [Xn|SP] + if ( (insn & 0xFFFFFC00) == 0x4C9F7C00 // ST1 {vT.2d}, [xN|SP], #16 + || (insn & 0xFFFFFC00) == 0x4CDF7C00 // LD1 {vT.2d}, [xN|SP], #16 + || (insn & 0xFFFFFC00) == 0x4C9F7800 // ST1 {vT.4s}, [xN|SP], #16 + || (insn & 0xFFFFFC00) == 0x4C9F7400 // ST1 {vT.8h}, [xN|SP], #16 ) { Bool isLD = INSN(22,22) == 1; UInt rN = INSN(9,5); UInt vT = INSN(4,0); IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "2d", "4s", "8h", "16b" }; + const HChar* name = names[INSN(11,10)]; assign(tEA, getIReg64orSP(rN)); if (rN == 31) { /* FIXME generate stack alignment check */ } if (isLD) { @@ -3891,12 +4015,34 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn) } else { storeLE(mkexpr(tEA), getQReg128(vT)); } - DIP("%s {v%u.%s}, [%s]\n", isLD ? "ld1" : "st1", - vT, INSN(11,10) == BITS2(0,0) ? "16b" : "2d", - nameIReg64orSP(rN)); + putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(16))); + DIP("%s {v%u.%s}, [%s], #16\n", isLD ? "ld1" : "st1", + vT, name, nameIReg64orSP(rN)); + return True; + } + + /* + 0000 1100 1001 1111 0111 10 N T ST1 {vT.2s}, [xN|SP], #8 + Note that #8 is implied and cannot be any other value. + FIXME does this assume that the host is little endian? 
+ */ + if ( (insn & 0xFFFFFC00) == 0x0C9F7800 // st1 {vT.2s}, [xN|SP], #8 + ) { + UInt rN = INSN(9,5); + UInt vT = INSN(4,0); + IRTemp tEA = newTemp(Ity_I64); + const HChar* names[4] = { "1d", "2s", "4h", "8b" }; + const HChar* name = names[INSN(11,10)]; + assign(tEA, getIReg64orSP(rN)); + if (rN == 31) { /* FIXME generate stack alignment check */ } + storeLE(mkexpr(tEA), getQRegLane(vT, 0, Ity_I64)); + putIReg64orSP(rN, binop(Iop_Add64, mkexpr(tEA), mkU64(8))); + DIP("st1 {v%u.%s}, [%s], #8\n", vT, name, nameIReg64orSP(rN)); return True; } + /* FIXME Temporary hacks to get through ld.so FIXME */ + /* -------------------- LD{A}XR -------------------- */ /* FIXME: this is a hack; needs real atomicity stuff. */ /* 31 29 20 19 9 4 @@ -4216,36 +4362,102 @@ Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn) /* Generate N copies of |bit| in the bottom of a ULong. */ static ULong Replicate ( ULong bit, Int N ) { - vassert(bit <= 1 && N >= 1 && N < 64); - if (bit == 0) { - return 0; - } else { - /* Careful. This won't work for N == 64. */ - return (1ULL << N) - 1; - } + vassert(bit <= 1 && N >= 1 && N < 64); + if (bit == 0) { + return 0; + } else { + /* Careful. This won't work for N == 64. */ + return (1ULL << N) - 1; + } } static ULong VFPExpandImm ( ULong imm8, Int N ) { - vassert(imm8 <= 0xFF); - vassert(N == 32 || N == 64); - Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2. - Int F = N - E - 1; - ULong imm8_6 = (imm8 >> 6) & 1; - /* sign: 1 bit */ - /* exp: E bits */ - /* frac: F bits */ - ULong sign = (imm8 >> 7) & 1; - ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1); - ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6); - vassert(sign < (1ULL << 1)); - vassert(exp < (1ULL << E)); - vassert(frac < (1ULL << F)); - vassert(1 + E + F == N); - ULong res = (sign << (E+F)) | (exp << F) | frac; - return res; + vassert(imm8 <= 0xFF); + vassert(N == 32 || N == 64); + Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2. + Int F = N - E - 1; + ULong imm8_6 = (imm8 >> 6) & 1; + /* sign: 1 bit */ + /* exp: E bits */ + /* frac: F bits */ + ULong sign = (imm8 >> 7) & 1; + ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1); + ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6); + vassert(sign < (1ULL << 1)); + vassert(exp < (1ULL << E)); + vassert(frac < (1ULL << F)); + vassert(1 + E + F == N); + ULong res = (sign << (E+F)) | (exp << F) | frac; + return res; } +/* Help a bit for decoding laneage for vector operations that can be + of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q + and SZ bits, typically for vector floating point. 
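+ Any of the OUT parameters may be passed as NULL if the caller has no use for that particular piece of information.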
*/ +static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI, /*OUT*/IRType* tyF, + /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper, + /*OUT*/const HChar** arrSpec, + Bool bitQ, Bool bitSZ ) +{ + vassert(bitQ == True || bitQ == False); + vassert(bitSZ == True || bitSZ == False); + if (bitQ && bitSZ) { // 2x64 + if (tyI) *tyI = Ity_I64; + if (tyF) *tyF = Ity_F64; + if (nLanes) *nLanes = 2; + if (zeroUpper) *zeroUpper = False; + if (arrSpec) *arrSpec = "2d"; + return True; + } + if (bitQ && !bitSZ) { // 4x32 + if (tyI) *tyI = Ity_I32; + if (tyF) *tyF = Ity_F32; + if (nLanes) *nLanes = 4; + if (zeroUpper) *zeroUpper = False; + if (arrSpec) *arrSpec = "4s"; + return True; + } + if (!bitQ && !bitSZ) { // 2x32 + if (tyI) *tyI = Ity_I32; + if (tyF) *tyF = Ity_F32; + if (nLanes) *nLanes = 2; + if (zeroUpper) *zeroUpper = True; + if (arrSpec) *arrSpec = "2s"; + return True; + } + // Else impliedly 1x64, which isn't allowed. + return False; +} + +/* Helper for decoding laneage for simple vector operations, + eg integer add. */ +static Bool getLaneInfo_SIMPLE ( /*OUT*/Bool* zeroUpper, + /*OUT*/const HChar** arrSpec, + Bool bitQ, UInt szBlg2 ) +{ + vassert(bitQ == True || bitQ == False); + vassert(szBlg2 < 4); + Bool zu = False; + const HChar* as = NULL; + switch ((szBlg2 << 1) | (bitQ ? 1 : 0)) { + case 0: zu = True; as = "8b"; break; + case 1: zu = False; as = "16b"; break; + case 2: zu = True; as = "4h"; break; + case 3: zu = False; as = "8h"; break; + case 4: zu = True; as = "2s"; break; + case 5: zu = False; as = "4s"; break; + case 6: return False; // impliedly 1x64 + case 7: zu = False; as = "2d"; break; + default: vassert(0); + } + vassert(as); + if (arrSpec) *arrSpec = as; + if (zeroUpper) *zeroUpper = zu; + return True; +} + + static Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) { @@ -4294,28 +4506,28 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) switch (ix) { case 1: putQReg128(dd, mkV128(0)); - putQReg(dd, getIReg32orZR(nn)); + putQRegLO(dd, getIReg32orZR(nn)); DIP("fmov s%u, w%u\n", dd, nn); break; case 2: putQReg128(dd, mkV128(0)); - putQReg(dd, getIReg64orZR(nn)); + putQRegLO(dd, getIReg64orZR(nn)); DIP("fmov d%u, x%u\n", dd, nn); break; case 3: - putQReg64HI(dd, getIReg64orZR(nn)); + putQRegHI64(dd, getIReg64orZR(nn)); DIP("fmov v%u.d[1], x%u\n", dd, nn); break; case 4: - putIReg32orZR(dd, getQReg(Ity_I32, nn)); + putIReg32orZR(dd, getQRegLO(nn, Ity_I32)); DIP("fmov w%u, s%u\n", dd, nn); break; case 5: - putIReg64orZR(dd, getQReg(Ity_I64, nn)); + putIReg64orZR(dd, getQRegLO(nn, Ity_I64)); DIP("fmov x%u, d%u\n", dd, nn); break; case 6: - putIReg64orZR(dd, getQReg64HI(nn)); + putIReg64orZR(dd, getQRegHI64(nn)); DIP("fmov x%u, v%u.d[1]\n", dd, nn); break; default: @@ -4341,8 +4553,9 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) vassert(0 == (imm & 0xFFFFFFFF00000000ULL)); } putQReg128(dd, mkV128(0)); - putQReg(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL)); - DIP("fmov %s, #0x%llx\n", nameQReg(isD ? 8 : 4, dd), imm); + putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL)); + DIP("fmov %s, #0x%llx\n", + nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm); return True; } @@ -4377,9 +4590,9 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) ? unop(ops[ix], src) : binop(ops[ix], mkexpr(mk_get_IR_rounding_mode()), src); putQReg128(dd, mkV128(0)); - putQReg(dd, res); + putQRegLO(dd, res); DIP("%ccvtf %s, %s\n", - isU ? 'u' : 's', nameQReg(isF64 ? 8 : 4, dd), + isU ? 'u' : 's', nameQRegLO(dd, isF64 ? 
Ity_F64 : Ity_F32), nameIRegOrZR(isI64, nn)); return True; } @@ -4402,7 +4615,6 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt dd = INSN(4,0); IROp iop = Iop_INVALID; IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; Bool neg = False; const HChar* nm = "???"; switch (op) { @@ -4416,13 +4628,13 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) } vassert(iop != Iop_INVALID); IRExpr* resE = triop(iop, mkexpr(mk_get_IR_rounding_mode()), - getQReg(ty, nn), getQReg(ty, mm)); + getQRegLO(nn, ty), getQRegLO(mm, ty)); IRTemp res = newTemp(ty); assign(res, neg ? unop(mkNEGF(ty),resE) : resE); putQReg128(dd, mkV128(0)); - putQReg(dd, mkexpr(res)); + putQRegLO(dd, mkexpr(res)); DIP("%s %s, %s, %s\n", - nm, nameQReg(szB, dd), nameQReg(szB, nn), nameQReg(szB, mm)); + nm, nameQRegLO(dd, ty), nameQRegLO(nn, ty), nameQRegLO(mm, ty)); return True; } @@ -4442,32 +4654,32 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt nn = INSN(9,5); UInt dd = INSN(4,0); IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; IRTemp res = newTemp(ty); if (opc == BITS2(0,0)) { - assign(res, getQReg(ty, nn)); + assign(res, getQRegLO(nn, ty)); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); - DIP("fmov %s, %s\n", nameQReg(szB, dd), nameQReg(szB, nn)); + putQRegLO(dd, mkexpr(res)); + DIP("fmov %s, %s\n", + nameQRegLO(dd, ty), nameQRegLO(nn, ty)); return True; } if (opc == BITS2(1,0) || opc == BITS2(0,1)) { Bool isAbs = opc == BITS2(0,1); IROp op = isAbs ? mkABSF(ty) : mkNEGF(ty); - assign(res, unop(op, getQReg(ty, nn))); + assign(res, unop(op, getQRegLO(nn, ty))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); + putQRegLO(dd, mkexpr(res)); DIP("%s %s, %s\n", isAbs ? "fabs" : "fneg", - nameQReg(szB, dd), nameQReg(szB, nn)); + nameQRegLO(dd, ty), nameQRegLO(nn, ty)); return True; } if (opc == BITS2(1,1)) { assign(res, binop(mkSQRTF(ty), - mkexpr(mk_get_IR_rounding_mode()), getQReg(ty, nn))); + mkexpr(mk_get_IR_rounding_mode()), getQRegLO(nn, ty))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); - DIP("fsqrt %s, %s\n", nameQReg(szB, dd), nameQReg(szB, nn)); + putQRegLO(dd, mkexpr(res)); + DIP("fsqrt %s, %s\n", nameQRegLO(dd, ty), nameQRegLO(nn, ty)); return True; } /* else fall through; other cases are ATC */ @@ -4498,26 +4710,25 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) Bool isCMPE = INSN(4,4) == 1; Bool cmpZero = INSN(3,3) == 1; IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; Bool valid = True; if (cmpZero && mm != 0) valid = False; if (valid) { IRTemp argL = newTemp(ty); IRTemp argR = newTemp(ty); IRTemp irRes = newTemp(Ity_I32); - assign(argL, getQReg(ty, nn)); + assign(argL, getQRegLO(nn, ty)); assign(argR, cmpZero ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0))) - : getQReg(ty, mm)); + : getQRegLO(mm, ty)); assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32, mkexpr(argL), mkexpr(argR))); IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes); IRTemp nzcv_28x0 = newTemp(Ity_I64); assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28))); setFlags_COPY(nzcv_28x0); - DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", - nameQReg(szB, nn), cmpZero ? "#0.0" : nameQReg(szB, mm)); + DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ty), + cmpZero ? 
"#0.0" : nameQRegLO(mm, ty)); return True; } } @@ -4544,15 +4755,14 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt dd = INSN(4,0); UInt ix = (INSN(21,21) << 1) | INSN(15,15); IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; IROp opADD = mkADDF(ty); IROp opSUB = mkSUBF(ty); IROp opMUL = mkMULF(ty); IROp opNEG = mkNEGF(ty); IRTemp res = newTemp(ty); - IRExpr* eA = getQReg(ty, aa); - IRExpr* eN = getQReg(ty, nn); - IRExpr* eM = getQReg(ty, mm); + IRExpr* eA = getQRegLO(aa, ty); + IRExpr* eN = getQRegLO(nn, ty); + IRExpr* eM = getQRegLO(mm, ty); IRExpr* rm = mkexpr(mk_get_IR_rounding_mode()); IRExpr* eNxM = triop(opMUL, rm, eN, eM); switch (ix) { @@ -4563,11 +4773,11 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) default: vassert(0); } putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); + putQRegLO(dd, mkexpr(res)); const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" }; DIP("%s %s, %s, %s, %s\n", - names[ix], nameQReg(szB, dd), nameQReg(szB, nn), - nameQReg(szB, mm), nameQReg(szB, aa)); + names[ix], nameQRegLO(dd, ty), nameQRegLO(nn, ty), + nameQRegLO(mm, ty), nameQRegLO(aa, ty)); return True; } @@ -4642,16 +4852,15 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) } else { return False; } - UInt srcSzB = isF64 ? 8 : 4; IRType srcTy = isF64 ? Ity_F64 : Ity_F32; IRType dstTy = isI64 ? Ity_I64 : Ity_I32; IRTemp src = newTemp(srcTy); IRTemp dst = newTemp(dstTy); - assign(src, getQReg(srcTy, nn)); + assign(src, getQRegLO(nn, srcTy)); assign(dst, binop(op, mkU32(irrm), mkexpr(src))); putIRegOrZR(isI64, dd, mkexpr(dst)); DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's', - nameIRegOrZR(isI64, dd), nameQReg(srcSzB, nn)); + nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy)); return True; } @@ -4677,7 +4886,6 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt nn = INSN(9,5); UInt dd = INSN(4,0); IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; IRExpr* irrmE = NULL; UChar ch = '?'; switch (rm) { @@ -4689,12 +4897,13 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) if (irrmE) { IRTemp src = newTemp(ty); IRTemp dst = newTemp(ty); - assign(src, getQReg(ty, nn)); + assign(src, getQRegLO(nn, ty)); assign(dst, binop(isD ? 
Iop_RoundF64toInt : Iop_RoundF32toInt, irrmE, mkexpr(src))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(dst)); - DIP("frint%c %s, %s\n", ch, nameQReg(szB, dd), nameQReg(szB, nn)); + putQRegLO(dd, mkexpr(dst)); + DIP("frint%c %s, %s\n", + ch, nameQRegLO(dd, ty), nameQRegLO(nn, ty)); return True; } /* else unhandled rounding mode case -- fall through */ @@ -4720,20 +4929,22 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) if (b2322 == BITS2(0,0) && b1615 == BITS2(0,1)) { /* Convert S to D */ IRTemp res = newTemp(Ity_F64); - assign(res, unop(Iop_F32toF64, getQReg(Ity_F32, nn))); + assign(res, unop(Iop_F32toF64, getQRegLO(nn, Ity_F32))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); - DIP("fcvt %s, %s\n", nameQReg(8, dd), nameQReg(4, nn)); + putQRegLO(dd, mkexpr(res)); + DIP("fcvt %s, %s\n", + nameQRegLO(dd, Ity_F64), nameQRegLO(nn, Ity_F32)); return True; } if (b2322 == BITS2(0,1) && b1615 == BITS2(0,0)) { /* Convert D to S */ IRTemp res = newTemp(Ity_F32); assign(res, binop(Iop_F64toF32, mkexpr(mk_get_IR_rounding_mode()), - getQReg(Ity_F64, nn))); + getQRegLO(nn, Ity_F64))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); - DIP("fcvt %s, %s\n", nameQReg(4, dd), nameQReg(8, nn)); + putQRegLO(dd, mkexpr(res)); + DIP("fcvt %s, %s\n", + nameQRegLO(dd, Ity_F32), nameQRegLO(nn, Ity_F64)); return True; } /* else unhandled */ @@ -4751,18 +4962,242 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) UInt nn = INSN(9,5); UInt dd = INSN(4,0); IRType ty = isD ? Ity_F64 : Ity_F32; - UInt szB = isD ? 8 : 4; IRTemp res = newTemp(ty); - assign(res, unop(mkABSF(ty), triop(mkSUBF(ty), - mkexpr(mk_get_IR_rounding_mode()), - getQReg(ty,nn), getQReg(ty,mm)))); + assign(res, unop(mkABSF(ty), + triop(mkSUBF(ty), + mkexpr(mk_get_IR_rounding_mode()), + getQRegLO(nn,ty), getQRegLO(mm,ty)))); putQReg128(dd, mkV128(0x0000)); - putQReg(dd, mkexpr(res)); + putQRegLO(dd, mkexpr(res)); DIP("fabd %s, %s, %s\n", - nameQReg(szB, dd), nameQReg(szB, nn), nameQReg(szB, mm)); + nameQRegLO(dd, ty), nameQRegLO(nn, ty), nameQRegLO(mm, ty)); return True; } + /* -------------- {S,U}CVTF (vector, integer) -------------- */ + /* 31 28 22 21 15 9 4 + 0q0 01110 0 sz 1 00001 110110 n d SCVTF Vd, Vn + 0q1 01110 0 sz 1 00001 110110 n d UCVTF Vd, Vn + with laneage: + case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D + */ + if (INSN(31,31) == 0 && INSN(28,23) == BITS6(0,1,1,1,0,0) + && INSN(21,16) == BITS6(1,0,0,0,0,1) + && INSN(15,10) == BITS6(1,1,0,1,1,0)) { + Bool isQ = INSN(30,30) == 1; + Bool isU = INSN(29,29) == 1; + Bool isF64 = INSN(22,22) == 1; + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + if (isQ || !isF64) { + IRType tyF = Ity_INVALID, tyI = Ity_INVALID; + UInt nLanes = 0; + Bool zeroHI = False; + const HChar* arrSpec = NULL; + Bool ok = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec, + isQ, isF64 ); + IROp op = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32) + : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32); + IRTemp rm = mk_get_IR_rounding_mode(); + UInt i; + vassert(ok); /* the 'if' above should ensure this */ + for (i = 0; i < nLanes; i++) { + putQRegLane(dd, i, + binop(op, mkexpr(rm), getQRegLane(nn, i, tyI))); + } + if (zeroHI) { + putQRegLane(dd, 1, mkU64(0)); + } + DIP("%ccvtf %s.%s, %s.%s\n", isU ? 
'u' : 's', + nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec); + return True; + } + /* else fall through */ + } + + /* ---------- F{ADD,SUB,MUL,DIV,MLA,MLS} (vector) ---------- */ + /* 31 28 22 21 20 15 9 4 case + 0q0 01110 0 sz 1 m 110101 n d FADD Vd,Vn,Vm 1 + 0q0 01110 1 sz 1 m 110101 n d FSUB Vd,Vn,Vm 2 + 0q1 01110 0 sz 1 m 110111 n d FMUL Vd,Vn,Vm 3 + 0q1 01110 0 sz 1 m 111111 n d FDIV Vd,Vn,Vm 4 + 0q0 01110 0 sz 1 m 110011 n d FMLA Vd,Vn,Vm 5 + 0q0 01110 1 sz 1 m 110011 n d FMLS Vd,Vn,Vm 6 + */ + if (INSN(31,31) == 0 + && INSN(28,24) == BITS5(0,1,1,1,0) && INSN(21,21) == 1) { + Bool isQ = INSN(30,30) == 1; + UInt b29 = INSN(29,29); + UInt b23 = INSN(23,23); + Bool isF64 = INSN(22,22) == 1; + UInt mm = INSN(20,16); + UInt b1510 = INSN(15,10); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + UInt ix = 0; + /**/ if (b29 == 0 && b23 == 0 && b1510 == BITS6(1,1,0,1,0,1)) ix = 1; + else if (b29 == 0 && b23 == 1 && b1510 == BITS6(1,1,0,1,0,1)) ix = 2; + else if (b29 == 1 && b23 == 0 && b1510 == BITS6(1,1,0,1,1,1)) ix = 3; + else if (b29 == 1 && b23 == 0 && b1510 == BITS6(1,1,1,1,1,1)) ix = 4; + else if (b29 == 0 && b23 == 0 && b1510 == BITS6(1,1,0,0,1,1)) ix = 5; + else if (b29 == 0 && b23 == 1 && b1510 == BITS6(1,1,0,0,1,1)) ix = 6; + IRType laneTy = Ity_INVALID; + Bool zeroHI = False; + const HChar* arr = "??"; + Bool ok + = getLaneInfo_Q_SZ(NULL, &laneTy, NULL, &zeroHI, &arr, isQ, isF64); + /* Skip MLA/MLS for the time being */ + if (ok && ix >= 1 && ix <= 4) { + const IROp ops64[4] + = { Iop_Add64Fx2, Iop_Sub64Fx2, Iop_Mul64Fx2, Iop_Div64Fx2 }; + const IROp ops32[4] + = { Iop_Add32Fx4, Iop_Sub32Fx4, Iop_Mul32Fx4, Iop_Div32Fx4 }; + const HChar* names[4] + = { "fadd", "fsub", "fmul", "fdiv" }; + IROp op = laneTy==Ity_F64 ? ops64[ix-1] : ops32[ix-1]; + IRTemp rm = mk_get_IR_rounding_mode(); + IRTemp t1 = newTemp(Ity_V128); + IRTemp t2 = newTemp(Ity_V128); + assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm))); + assign(t2, zeroHI ? unop(Iop_ZeroHI64, mkexpr(t1)) : mkexpr(t1)); + putQReg128(dd, mkexpr(t2)); + DIP("%s %s.%s, %s.%s, %s.%s\n", names[ix-1], + nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr); + return True; + } + } + + /* ---------------- ADD/SUB (vector) ---------------- */ + /* 31 28 23 21 20 15 9 4 + 0q0 01110 size 1 m 100001 n d ADD Vd.T, Vn.T, Vm.T + 0q1 01110 size 1 m 100001 n d SUB Vd.T, Vn.T, Vm.T + */ + if (INSN(31,31) == 0 && INSN(28,24) == BITS5(0,1,1,1,0) + && INSN(21,21) == 1 && INSN(15,10) == BITS6(1,0,0,0,0,1)) { + Bool isQ = INSN(30,30) == 1; + UInt szBlg2 = INSN(23,22); + Bool isSUB = INSN(29,29) == 1; + UInt mm = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + Bool zeroHI = False; + const HChar* arrSpec = ""; + Bool ok = getLaneInfo_SIMPLE(&zeroHI, &arrSpec, isQ, szBlg2 ); + if (ok) { + const IROp opADD[4] + = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 }; + const IROp opSUB[4] + = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 }; + vassert(szBlg2 < 4); + IROp op = isSUB ? opSUB[szBlg2] : opADD[szBlg2]; + IRTemp t = newTemp(Ity_V128); + assign(t, binop(op, getQReg128(nn), getQReg128(mm))); + putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64, mkexpr(t)) : mkexpr(t)); + const HChar* nm = isSUB ? 
"sub" : "add"; + DIP("%s %s.%s, %s.%s, %s.%s\n", nm, + nameQReg128(dd), arrSpec, + nameQReg128(nn), arrSpec, nameQReg128(mm), arrSpec); + return True; + } + /* else fall through */ + } + + /* -------------------- XTN{,2} -------------------- */ + /* 31 28 23 21 15 9 4 + 0q0 01110 size 100001 001010 n d + */ + if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,1,0) + && INSN(21,16) == BITS6(1,0,0,0,0,1) + && INSN(15,10) == BITS6(0,0,1,0,1,0)) { + Bool isQ = INSN(30,30) == 1; + UInt size = INSN(23,22); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IROp op = Iop_INVALID; + const HChar* tb = NULL; + const HChar* ta = NULL; + switch ((size << 1) | (isQ ? 1 : 0)) { + case 0: tb = "8b"; ta = "8h"; op = Iop_NarrowUn16to8x8; break; + case 1: tb = "16b"; ta = "8h"; op = Iop_NarrowUn16to8x8; break; + case 2: tb = "4h"; ta = "4s"; op = Iop_NarrowUn32to16x4; break; + case 3: tb = "8h"; ta = "4s"; op = Iop_NarrowUn32to16x4; break; + case 4: tb = "2s"; ta = "2d"; op = Iop_NarrowUn64to32x2; break; + case 5: tb = "4s"; ta = "2d"; op = Iop_NarrowUn64to32x2; break; + case 6: break; + case 7: break; + default: vassert(0); + } + if (op != Iop_INVALID) { + if (!isQ) { + putQRegLane(dd, 1, mkU64(0)); + } + putQRegLane(dd, isQ ? 1 : 0, unop(op, getQReg128(nn))); + DIP("xtn%s %s.%s, %s.%s\n", isQ ? "2" : "", + nameQReg128(dd), tb, nameQReg128(nn), ta); + return True; + } + /* else fall through */ + } + + /* ---------------- DUP (element, vector) ---------------- */ + /* 31 28 20 15 9 4 + 0q0 01110000 imm5 000001 n d DUP Vd.T, Vn.Ts[index] + */ + if (INSN(31,31) == 0 && INSN(29,21) == BITS9(0,0,1,1,1,0,0,0,0) + && INSN(15,10) == BITS6(0,0,0,0,0,1)) { + Bool isQ = INSN(30,30) == 1; + UInt imm5 = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + IRTemp w0 = newTemp(Ity_I64); + const HChar* arT = "??"; + const HChar* arTs = "??"; + IRType laneTy = Ity_INVALID; + UInt laneNo = 16; /* invalid */ + if (imm5 & 1) { + arT = isQ ? "16b" : "8b"; + arTs = "b"; + laneNo = (imm5 >> 1) & 15; + laneTy = Ity_I8; + assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if (imm5 & 2) { + arT = isQ ? "8h" : "4h"; + arTs = "h"; + laneNo = (imm5 >> 2) & 7; + laneTy = Ity_I16; + assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if (imm5 & 4) { + arT = isQ ? "4s" : "2s"; + arTs = "s"; + laneNo = (imm5 >> 3) & 3; + laneTy = Ity_I32; + assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy))); + } + else if ((imm5 & 8) && isQ) { + arT = "2d"; + arTs = "d"; + laneNo = (imm5 >> 4) & 1; + laneTy = Ity_I64; + assign(w0, getQRegLane(nn, laneNo, laneTy)); + } + else { + /* invalid; leave laneTy unchanged. */ + } + /* */ + if (laneTy != Ity_INVALID) { + vassert(laneNo < 16); + IRTemp w1 = math_DUP_TO_64(w0, laneTy); + putQReg128(dd, binop(Iop_64HLtoV128, + isQ ? 
mkexpr(w1) : mkU64(0), mkexpr(w1))); + DIP("dup %s.%s, %s.%s[%u]\n", + nameQReg128(dd), arT, nameQReg128(nn), arTs, laneNo); + return True; + } + /* else fall through */ + } + /* FIXME Temporary hacks to get through ld.so FIXME */ /* ------------------ movi vD.4s, #0x0 ------------------ */ diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index d2061face9..aef98dba91 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -848,6 +848,23 @@ static const HChar* showARM64FpUnaryOp ( ARM64FpUnaryOp op ) { } } +static void showARM64VecBinOp(/*OUT*/const HChar** nm, + /*OUT*/const HChar** ar, ARM64VecBinOp op ) { + switch (op) { + case ARM64vecb_ADD64x2: *nm = "add "; *ar = "2d"; return; + case ARM64vecb_ADD32x4: *nm = "add "; *ar = "4s"; return; + case ARM64vecb_ADD16x8: *nm = "add "; *ar = "8h"; return; + case ARM64vecb_SUB64x2: *nm = "sub "; *ar = "2d"; return; + case ARM64vecb_SUB32x4: *nm = "sub "; *ar = "4s"; return; + case ARM64vecb_SUB16x8: *nm = "sub "; *ar = "8h"; return; + case ARM64vecb_FADD64x2: *nm = "fadd"; *ar = "2d"; return; + case ARM64vecb_FSUB64x2: *nm = "fsub"; *ar = "2d"; return; + case ARM64vecb_FMUL64x2: *nm = "fmul"; *ar = "2d"; return; + case ARM64vecb_FDIV64x2: *nm = "fdiv"; *ar = "2d"; return; + default: vpanic("showARM64VecBinOp"); + } +} + //ZZ const HChar* showARMNeonBinOp ( ARMNeonBinOp op ) { //ZZ switch (op) { //ZZ case ARMneon_VAND: return "vand"; @@ -1512,6 +1529,25 @@ ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ) { i->ARM64in.FPCR.iReg = iReg; return i; } +ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, + HReg dst, HReg argL, HReg argR ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VBinV; + i->ARM64in.VBinV.op = op; + i->ARM64in.VBinV.dst = dst; + i->ARM64in.VBinV.argL = argL; + i->ARM64in.VBinV.argR = argR; + return i; +} +ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VNarrowV; + i->ARM64in.VNarrowV.dszBlg2 = dszBlg2; + i->ARM64in.VNarrowV.dst = dst; + i->ARM64in.VNarrowV.src = src; + vassert(dszBlg2 == 0 || dszBlg2 == 1 || dszBlg2 == 2); + return i; +} //ZZ ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg dst, HReg argL, HReg argR ) { //ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); //ZZ i->tag = ARMin_VAluS; @@ -2104,6 +2140,30 @@ void ppARM64Instr ( ARM64Instr* i ) { vex_printf(", fpcr"); } return; + case ARM64in_VBinV: { + const HChar* nm = "??"; + const HChar* ar = "??"; + showARM64VecBinOp(&nm, &ar, i->ARM64in.VBinV.op); + vex_printf("%s ", nm); + ppHRegARM64(i->ARM64in.VBinV.dst); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VBinV.argL); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VBinV.argR); + vex_printf(".%s", ar); + return; + } + case ARM64in_VNarrowV: { + UInt dszBlg2 = i->ARM64in.VNarrowV.dszBlg2; + const HChar* darr[3] = { "8b", "4h", "2s" }; + const HChar* sarr[3] = { "8h", "4s", "2d" }; + vex_printf("xtn "); + ppHRegARM64(i->ARM64in.VNarrowV.dst); + vex_printf(".%s, ", dszBlg2 < 3 ? darr[dszBlg2] : "??"); + ppHRegARM64(i->ARM64in.VNarrowV.src); + vex_printf(".%s", dszBlg2 < 3 ? 
sarr[dszBlg2] : "??"); + return; + } //ZZ case ARMin_VAluS: //ZZ vex_printf("f%-3ss ", showARMVfpOp(i->ARMin.VAluS.op)); //ZZ ppHRegARM(i->ARMin.VAluS.dst); @@ -2567,6 +2627,15 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, ARM64Instr* i, Bool mode64 ) else addHRegUse(u, HRmWrite, i->ARM64in.FPCR.iReg); return; + case ARM64in_VBinV: + addHRegUse(u, HRmWrite, i->ARM64in.VBinV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VBinV.argL); + addHRegUse(u, HRmRead, i->ARM64in.VBinV.argR); + return; + case ARM64in_VNarrowV: + addHRegUse(u, HRmWrite, i->ARM64in.VNarrowV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VNarrowV.src); + return; //ZZ case ARMin_VAluS: //ZZ addHRegUse(u, HRmWrite, i->ARMin.VAluS.dst); //ZZ addHRegUse(u, HRmRead, i->ARMin.VAluS.argL); @@ -2842,6 +2911,15 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) case ARM64in_FPCR: i->ARM64in.FPCR.iReg = lookupHRegRemap(m, i->ARM64in.FPCR.iReg); return; + case ARM64in_VBinV: + i->ARM64in.VBinV.dst = lookupHRegRemap(m, i->ARM64in.VBinV.dst); + i->ARM64in.VBinV.argL = lookupHRegRemap(m, i->ARM64in.VBinV.argL); + i->ARM64in.VBinV.argR = lookupHRegRemap(m, i->ARM64in.VBinV.argR); + return; + case ARM64in_VNarrowV: + i->ARM64in.VNarrowV.dst = lookupHRegRemap(m, i->ARM64in.VNarrowV.dst); + i->ARM64in.VNarrowV.src = lookupHRegRemap(m, i->ARM64in.VNarrowV.src); + return; //ZZ case ARMin_VAluS: //ZZ i->ARMin.VAluS.dst = lookupHRegRemap(m, i->ARMin.VAluS.dst); //ZZ i->ARMin.VAluS.argL = lookupHRegRemap(m, i->ARMin.VAluS.argL); @@ -3117,15 +3195,16 @@ static inline UChar qregNo ( HReg r ) #define X110 BITS4(0, 1,1,0) #define X111 BITS4(0, 1,1,1) -#define BITS8(zzb7,zzb6,zzb5,zzb4,zzb3,zzb2,zzb1,zzb0) \ - ((BITS4(zzb7,zzb6,zzb5,zzb4) << 4) | BITS4(zzb3,zzb2,zzb1,zzb0)) - #define X0000 BITS4(0,0,0,0) #define X0001 BITS4(0,0,0,1) #define X0010 BITS4(0,0,1,0) #define X0011 BITS4(0,0,1,1) +#define BITS8(zzb7,zzb6,zzb5,zzb4,zzb3,zzb2,zzb1,zzb0) \ + ((BITS4(zzb7,zzb6,zzb5,zzb4) << 4) | BITS4(zzb3,zzb2,zzb1,zzb0)) + #define X00000 BITS8(0,0,0, 0,0,0,0,0) +#define X00001 BITS8(0,0,0, 0,0,0,0,1) #define X00111 BITS8(0,0,0, 0,0,1,1,1) #define X01000 BITS8(0,0,0, 0,1,0,0,0) #define X10000 BITS8(0,0,0, 1,0,0,0,0) @@ -3143,14 +3222,18 @@ static inline UChar qregNo ( HReg r ) #define X010001 BITS8(0,0, 0,1,0,0,0,1) #define X011010 BITS8(0,0, 0,1,1,0,1,0) #define X011111 BITS8(0,0, 0,1,1,1,1,1) +#define X100001 BITS8(0,0, 1,0,0,0,0,1) #define X100100 BITS8(0,0, 1,0,0,1,0,0) #define X100101 BITS8(0,0, 1,0,0,1,0,1) #define X100110 BITS8(0,0, 1,0,0,1,1,0) #define X110000 BITS8(0,0, 1,1,0,0,0,0) #define X110001 BITS8(0,0, 1,1,0,0,0,1) +#define X110101 BITS8(0,0, 1,1,0,1,0,1) +#define X110111 BITS8(0,0, 1,1,0,1,1,1) #define X111000 BITS8(0,0, 1,1,1,0,0,0) #define X111001 BITS8(0,0, 1,1,1,0,0,1) #define X111101 BITS8(0,0, 1,1,1,1,0,1) +#define X111111 BITS8(0,0, 1,1,1,1,1,1) #define X00100000 BITS8(0,0,1,0,0,0,0,0) #define X00100001 BITS8(0,0,1,0,0,0,0,1) @@ -3165,6 +3248,10 @@ static inline UChar qregNo ( HReg r ) #define X01100010 BITS8(0,1,1,0,0,0,1,0) #define X01100011 BITS8(0,1,1,0,0,0,1,1) #define X01110000 BITS8(0,1,1,1,0,0,0,0) +#define X01110001 BITS8(0,1,1,1,0,0,0,1) +#define X01110011 BITS8(0,1,1,1,0,0,1,1) +#define X01110101 BITS8(0,1,1,1,0,1,0,1) +#define X01110111 BITS8(0,1,1,1,0,1,1,1) #define X11000001 BITS8(1,1,0,0,0,0,0,1) #define X11000011 BITS8(1,1,0,0,0,0,1,1) #define X11010100 BITS8(1,1,0,1,0,1,0,0) @@ -4418,7 +4505,7 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, /* 31 28 23 21 20 18 15 9 4 000 11110 00 1 00 010 000000 n d 
SCVTF Sd, Wn 000 11110 01 1 00 010 000000 n d SCVTF Dd, Wn - 100 11110 00 1 00 010 000000 n d SCVTF Sd, Xn x + 100 11110 00 1 00 010 000000 n d SCVTF Sd, Xn 100 11110 01 1 00 010 000000 n d SCVTF Dd, Xn 000 11110 00 1 00 011 000000 n d UCVTF Sd, Wn 000 11110 01 1 00 011 000000 n d UCVTF Dd, Wn @@ -4521,16 +4608,6 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, } goto done; } - case ARM64in_FPCR: { - Bool toFPCR = i->ARM64in.FPCR.toFPCR; - UInt iReg = iregNo(i->ARM64in.FPCR.iReg); - if (toFPCR) { - /* 0xD51B44 000 Rt MSR fpcr, rT */ - *p++ = 0xD51B4400 | (iReg & 0x1F); - goto done; - } - goto bad; // FPCR -> iReg case currently ATC - } case ARM64in_VUnaryD: { /* 31 23 21 16 14 9 4 000,11110 01 1,0000 0,0 10000 n d FMOV Dd, Dn (not handled) @@ -4653,6 +4730,75 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = X_3_8_5_6_5_5(X000, X11110001, sM, X001000, sN, X00000); goto done; } + case ARM64in_FPCR: { + Bool toFPCR = i->ARM64in.FPCR.toFPCR; + UInt iReg = iregNo(i->ARM64in.FPCR.iReg); + if (toFPCR) { + /* 0xD51B44 000 Rt MSR fpcr, rT */ + *p++ = 0xD51B4400 | (iReg & 0x1F); + goto done; + } + goto bad; // FPCR -> iReg case currently ATC + } + case ARM64in_VBinV: { + /* 31 23 20 15 9 4 + 010 01110 11 1 m 100001 n d ADD Vd.2d, Vn.2d, Vm.2d + 010 01110 10 1 m 100001 n d ADD Vd.4s, Vn.4s, Vm.4s + 011 01110 11 1 m 100001 n d SUB Vd.2d, Vn.2d, Vm.2d + 011 01110 10 1 m 100001 n d SUB Vd.4s, Vn.4s, Vm.4s + 011 01110 01 1 m 100001 n d SUB Vd.8h, Vn.8h, Vm.8h + 010 01110 01 1 m 110101 n d FADD Vd.2d, Vn.2d, Vm.2d + 010 01110 11 1 m 110101 n d FSUB Vd.2d, Vn.2d, Vm.2d + 011 01110 01 1 m 110111 n d FMUL Vd.2d, Vn.2d, Vm.2d + 011 01110 01 1 m 111111 n d FDIV Vd.2d, Vn.2d, Vm.2d + */ + UInt vD = qregNo(i->ARM64in.VBinV.dst); + UInt vN = qregNo(i->ARM64in.VBinV.argL); + UInt vM = qregNo(i->ARM64in.VBinV.argR); + switch (i->ARM64in.VBinV.op) { + case ARM64vecb_ADD64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB32x4: + *p++ = X_3_8_5_6_5_5(X011, X01110101, vM, X100001, vN, vD); + break; + case ARM64vecb_SUB16x8: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X100001, vN, vD); + break; + case ARM64vecb_FADD64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X110101, vN, vD); + break; + case ARM64vecb_FSUB64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, vM, X110101, vN, vD); + break; + case ARM64vecb_FMUL64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X110111, vN, vD); + break; + case ARM64vecb_FDIV64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110011, vM, X111111, vN, vD); + break; + default: + goto bad; + } + goto done; + } + case ARM64in_VNarrowV: { + /* 31 23 21 15 9 4 + 000 01110 00 1,00001 001010 n d XTN Vd.8b, Vn.8h + 000 01110 01 1,00001 001010 n d XTN Vd.4h, Vn.4s + 000 01110 10 1,00001 001010 n d XTN Vd.2s, Vn.2d + */ + UInt vD = qregNo(i->ARM64in.VNarrowV.dst); + UInt vN = qregNo(i->ARM64in.VNarrowV.src); + UInt dszBlg2 = i->ARM64in.VNarrowV.dszBlg2; + vassert(dszBlg2 >= 0 && dszBlg2 <= 2); + *p++ = X_3_8_5_6_5_5(X000, X01110001 | (dszBlg2 << 1), + X00001, X001010, vN, vD); + goto done; + } //ZZ case ARMin_VAluS: { //ZZ UInt dN = fregNo(i->ARMin.VAluS.argL); //ZZ UInt dD = fregNo(i->ARMin.VAluS.dst); diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index 6a52377d73..c3a63c8368 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -119,7 +119,7 @@ typedef typedef enum { - ARM64am_RI9=1, /* reg + simm9 */ + 
ARM64am_RI9=10, /* reg + simm9 */ ARM64am_RI12, /* reg + uimm12 * szB (iow, scaled by access size) */ ARM64am_RR /* reg1 + reg2 */ } @@ -155,8 +155,8 @@ extern ARM64AMode* ARM64AMode_RR ( HReg base, HReg index ); typedef enum { - ARM64riA_I12=4, /* uimm12 << 0 or 12 only */ - ARM64riA_R /* reg */ + ARM64riA_I12=20, /* uimm12 << 0 or 12 only */ + ARM64riA_R /* reg */ } ARM64RIATag; @@ -212,7 +212,7 @@ extern ARM64RIL* ARM64RIL_R ( HReg ); typedef enum { - ARM64ri6_I6=8, /* uimm6, 1 .. 63 only */ + ARM64ri6_I6=30, /* uimm6, 1 .. 63 only */ ARM64ri6_R /* reg */ } ARM64RI6Tag; @@ -239,7 +239,7 @@ extern ARM64RI6* ARM64RI6_R ( HReg ); typedef enum { - ARM64lo_AND=10, + ARM64lo_AND=40, ARM64lo_OR, ARM64lo_XOR } @@ -247,7 +247,7 @@ typedef typedef enum { - ARM64sh_SHL=13, + ARM64sh_SHL=50, ARM64sh_SHR, ARM64sh_SAR } @@ -255,7 +255,7 @@ typedef typedef enum { - ARM64un_NEG=16, + ARM64un_NEG=60, ARM64un_NOT, ARM64un_CLZ, } @@ -263,7 +263,7 @@ typedef typedef enum { - ARM64mul_PLAIN=60, /* lo64(64 * 64) */ + ARM64mul_PLAIN=70, /* lo64(64 * 64) */ ARM64mul_ZX, /* hi64(64 *u 64) */ ARM64mul_SX /* hi64(64 *s 64) */ } @@ -273,7 +273,7 @@ typedef /* These characterise an integer-FP conversion, but don't imply any particular direction. */ enum { - ARM64cvt_F32_I32S=65, + ARM64cvt_F32_I32S=80, ARM64cvt_F64_I32S, ARM64cvt_F32_I64S, ARM64cvt_F64_I64S, @@ -287,7 +287,7 @@ typedef typedef enum { - ARM64fpb_ADD=75, + ARM64fpb_ADD=100, ARM64fpb_SUB, ARM64fpb_MUL, ARM64fpb_DIV, @@ -297,7 +297,7 @@ typedef typedef enum { - ARM64fpu_NEG=82, + ARM64fpu_NEG=110, ARM64fpu_ABS, ARM64fpu_SQRT, ARM64fpu_RINT, @@ -305,6 +305,22 @@ typedef } ARM64FpUnaryOp; +typedef + enum { + ARM64vecb_ADD64x2=120, + ARM64vecb_ADD32x4, + ARM64vecb_ADD16x8, + ARM64vecb_SUB64x2, + ARM64vecb_SUB32x4, + ARM64vecb_SUB16x8, + ARM64vecb_FADD64x2, + ARM64vecb_FSUB64x2, + ARM64vecb_FMUL64x2, + ARM64vecb_FDIV64x2, + ARM64vecb_INVALID + } + ARM64VecBinOp; + //ZZ extern const HChar* showARMVfpUnaryOp ( ARMVfpUnaryOp op ); //ZZ //ZZ typedef @@ -470,7 +486,7 @@ typedef ARM64in_Mul, //ZZ ARMin_LdrEX, //ZZ ARMin_StrEX, - /* vector */ + /* ARM64in_V*: scalar ops involving vector registers */ ARM64in_VLdStS, /* 32-bit FP load/store, with imm offset */ ARM64in_VLdStD, /* 64-bit FP load/store, with imm offset */ ARM64in_VLdStQ, @@ -484,6 +500,9 @@ typedef ARM64in_VCmpD, ARM64in_VCmpS, ARM64in_FPCR, + /* ARM64in_V*V: vector ops on vector registers */ + ARM64in_VBinV, + ARM64in_VNarrowV, //ZZ ARMin_VAluS, //ZZ ARMin_VCMovD, //ZZ ARMin_VCMovS, @@ -749,6 +768,20 @@ typedef Bool toFPCR; HReg iReg; } FPCR; + /* binary vector operation on vector registers */ + struct { + ARM64VecBinOp op; + HReg dst; + HReg argL; + HReg argR; + } VBinV; + /* vector narrowing, Q -> Q. Result goes in the bottom half + of dst and the top half is zeroed out. Iow is XTN. 
*/ + struct { + UInt dszBlg2; // 0: 16to8_x8 1: 32to16_x4 2: 64to32_x2 + HReg dst; // Q reg + HReg src; // Q reg + } VNarrowV; //ZZ /* 32-bit FP binary arithmetic */ //ZZ struct { //ZZ ARMVfpOp op; @@ -949,6 +982,8 @@ extern ARM64Instr* ARM64Instr_VBinS ( ARM64FpBinOp op, HReg, HReg, HReg ); extern ARM64Instr* ARM64Instr_VCmpD ( HReg argL, HReg argR ); extern ARM64Instr* ARM64Instr_VCmpS ( HReg argL, HReg argR ); extern ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ); +extern ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg, HReg, HReg ); +extern ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ); //ZZ extern ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg, HReg, HReg ); //ZZ extern ARMInstr* ARMInstr_VCMovD ( ARMCondCode, HReg dst, HReg src ); //ZZ extern ARMInstr* ARMInstr_VCMovS ( ARMCondCode, HReg dst, HReg src ); diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 9852fe3ab5..125497e0c4 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -2118,7 +2118,21 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) ARM64sh_SAR)); return dst; } - + case Iop_NarrowUn32to16x4: + case Iop_NarrowUn64to32x2: { + HReg src = iselV128Expr(env, e->Iex.Unop.arg); + HReg tmp = newVRegV(env); + HReg dst = newVRegI(env); + UInt dszBlg2 = 3; /* illegal */ + switch (e->Iex.Unop.op) { + case Iop_NarrowUn32to16x4: dszBlg2 = 1; break; // 32to16_x4 + case Iop_NarrowUn64to32x2: dszBlg2 = 2; break; // 64to32_x2 + default: vassert(0); + } + addInstr(env, ARM64Instr_VNarrowV(dszBlg2, tmp, src)); + addInstr(env, ARM64Instr_VXfromQ(dst, tmp, 0/*laneNo*/)); + return dst; + } //ZZ case Iop_64HIto32: { //ZZ HReg rHi, rLo; //ZZ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg); @@ -4835,49 +4849,24 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) //ZZ case Iop_Add8x16: //ZZ case Iop_Add16x8: //ZZ case Iop_Add32x4: -//ZZ case Iop_Add64x2: { -//ZZ /* -//ZZ FIXME: remove this if not used -//ZZ DECLARE_PATTERN(p_vrhadd_32sx4); -//ZZ ULong one = (1LL << 32) | 1LL; -//ZZ DEFINE_PATTERN(p_vrhadd_32sx4, -//ZZ binop(Iop_Add32x4, -//ZZ binop(Iop_Add32x4, -//ZZ binop(Iop_SarN32x4, -//ZZ bind(0), -//ZZ mkU8(1)), -//ZZ binop(Iop_SarN32x4, -//ZZ bind(1), -//ZZ mkU8(1))), -//ZZ binop(Iop_SarN32x4, -//ZZ binop(Iop_Add32x4, -//ZZ binop(Iop_Add32x4, -//ZZ binop(Iop_AndV128, -//ZZ bind(0), -//ZZ mkU128(one)), -//ZZ binop(Iop_AndV128, -//ZZ bind(1), -//ZZ mkU128(one))), -//ZZ mkU128(one)), -//ZZ mkU8(1)))); -//ZZ */ -//ZZ HReg res = newVRegV(env); -//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); -//ZZ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2); -//ZZ UInt size; -//ZZ switch (e->Iex.Binop.op) { -//ZZ case Iop_Add8x16: size = 0; break; -//ZZ case Iop_Add16x8: size = 1; break; -//ZZ case Iop_Add32x4: size = 2; break; -//ZZ case Iop_Add64x2: size = 3; break; -//ZZ default: -//ZZ ppIROp(e->Iex.Binop.op); -//ZZ vpanic("Illegal element size in VADD"); -//ZZ } -//ZZ addInstr(env, ARMInstr_NBinary(ARMneon_VADD, -//ZZ res, argL, argR, size, True)); -//ZZ return res; -//ZZ } + case Iop_Add64x2: + case Iop_Sub64x2: + case Iop_Sub32x4: + case Iop_Sub16x8: { + HReg res = newVRegV(env); + HReg argL = iselV128Expr(env, e->Iex.Binop.arg1); + HReg argR = iselV128Expr(env, e->Iex.Binop.arg2); + ARM64VecBinOp op = ARM64vecb_INVALID; + switch (e->Iex.Binop.op) { + case Iop_Add64x2: op = ARM64vecb_ADD64x2; break; + case Iop_Sub64x2: op = ARM64vecb_SUB64x2; break; + case Iop_Sub32x4: op = ARM64vecb_SUB32x4; break; + case Iop_Sub16x8: op = ARM64vecb_SUB16x8; break; 
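+ /* cannot happen: the enclosing case list admits only the four ops handled above */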
+ default: vassert(0); + } + addInstr(env, ARM64Instr_VBinV(op, res, argL, argR)); + return res; + } //ZZ case Iop_Add32Fx4: { //ZZ HReg res = newVRegV(env); //ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); @@ -5750,9 +5739,25 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) } /* switch on the binop */ } /* if (e->tag == Iex_Binop) */ -//ZZ if (e->tag == Iex_Triop) { -//ZZ IRTriop *triop = e->Iex.Triop.details; -//ZZ + if (e->tag == Iex_Triop) { + IRTriop* triop = e->Iex.Triop.details; + ARM64VecBinOp vecbop = ARM64vecb_INVALID; + switch (triop->op) { + case Iop_Add64Fx2: vecbop = ARM64vecb_FADD64x2; break; + case Iop_Sub64Fx2: vecbop = ARM64vecb_FSUB64x2; break; + case Iop_Mul64Fx2: vecbop = ARM64vecb_FMUL64x2; break; + case Iop_Div64Fx2: vecbop = ARM64vecb_FDIV64x2; break; + default: break; + } + if (vecbop != ARM64vecb_INVALID) { + HReg argL = iselV128Expr(env, triop->arg2); + HReg argR = iselV128Expr(env, triop->arg3); + HReg dst = newVRegV(env); + set_FPCR_rounding_mode(env, triop->arg1); + addInstr(env, ARM64Instr_VBinV(vecbop, dst, argL, argR)); + return dst; + } + //ZZ switch (triop->op) { //ZZ case Iop_ExtractV128: { //ZZ HReg res = newVRegV(env); @@ -5776,8 +5781,8 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) //ZZ default: //ZZ break; //ZZ } -//ZZ } -//ZZ + } + //ZZ if (e->tag == Iex_ITE) { // VFD //ZZ ARMCondCode cc; //ZZ HReg r1 = iselNeonExpr(env, e->Iex.ITE.iftrue); diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 33e795fc13..0cbb118ff2 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -694,6 +694,7 @@ void ppIROp ( IROp op ) case Iop_64UtoV128: vex_printf("64UtoV128"); return; case Iop_SetV128lo64: vex_printf("SetV128lo64"); return; + case Iop_ZeroHI64: vex_printf("ZeroHI64"); return; case Iop_32UtoV128: vex_printf("32UtoV128"); return; case Iop_V128to32: vex_printf("V128to32"); return; diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 32936d3c5d..cef10c1c8f 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1363,6 +1363,9 @@ typedef Iop_64UtoV128, Iop_SetV128lo64, + /* Copies lower 64 bits, zeroes out upper 64 bits. */ + Iop_ZeroHI64, // :: V128 -> V128 + /* 32 <-> 128 bit vector */ Iop_32UtoV128, Iop_V128to32, // :: V128 -> I32, lowest lane
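
Two of the guest-side helpers introduced by this patch lend themselves to a
quick standalone check.  The sketch below is plain C, not VEX code, and the
names laneOffset and dup8To64 are illustrative only; it mirrors the
lane-offset rule used by offsetQRegLane and the shift-and-OR doubling scheme
used by math_DUP_TO_64:

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   /* offsetQRegLane's rule: on a little-endian host, lane laneNo of
      size laneSzB bytes lives at byte offset laneNo * laneSzB inside
      the 16-byte Q register, and must fit entirely within it. */
   static unsigned laneOffset ( unsigned laneSzB, unsigned laneNo )
   {
      unsigned minOff = laneNo * laneSzB;
      unsigned maxOff = minOff + laneSzB - 1;
      assert(maxOff < 16);
      return minOff;
   }

   /* math_DUP_TO_64's scheme: replicate an 8-bit lane across 64 bits
      by doubling the populated width at each step (8 -> 16 -> 32 -> 64).
      16- and 32-bit lanes need correspondingly fewer steps. */
   static uint64_t dup8To64 ( uint8_t lane )
   {
      uint64_t t = lane;   /* bits 7:0 populated, the rest zero */
      t |= t << 8;         /* 16 bits populated */
      t |= t << 16;        /* 32 bits populated */
      t |= t << 32;        /* all 64 bits populated */
      return t;
   }

   int main ( void )
   {
      assert(laneOffset(4, 3) == 12);  /* s[3] of a 4s vector */
      assert(laneOffset(8, 1) == 8);   /* d[1] of a 2d vector */
      assert(dup8To64(0xAB) == 0xABABABABABABABABULL);
      printf("all checks passed\n");
      return 0;
   }

Both properties are relied on elsewhere in the patch: the DUP (element,
vector) case builds its 128-bit result from two copies of the
math_DUP_TO_64 output via Iop_64HLtoV128, and the LD1/ST1 and XTN cases
address the two I64 lanes of a Qreg at offsets 0 and 8.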