From: Julian Seward Date: Mon, 17 Feb 2014 11:00:53 +0000 (+0000) Subject: Implement more aarch64 vector insns: X-Git-Tag: svn/VALGRIND_3_10_1^2~150 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c33e63a44e8e403e4001d7f8a7ec3dc5281d116c;p=thirdparty%2Fvalgrind.git Implement more aarch64 vector insns: CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector) {EOR,BSL,BIT,BIF} (vector) {USHR,SSHR} (vector, immediate) {U,S}SHLL{,2} INS (general) FABD Vd,Vn,Vm git-svn-id: svn://svn.valgrind.org/vex/trunk@2820 --- diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 070c7dfb87..e386ca1179 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -4372,12 +4372,21 @@ Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn) /* begin FIXME -- rm temp scaffolding */ static IRExpr* mk_CatEvenLanes64x2 ( IRTemp, IRTemp ); static IRExpr* mk_CatOddLanes64x2 ( IRTemp, IRTemp ); + static IRExpr* mk_CatEvenLanes32x4 ( IRTemp, IRTemp ); static IRExpr* mk_CatOddLanes32x4 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveLO32x4 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveHI32x4 ( IRTemp, IRTemp ); + static IRExpr* mk_CatEvenLanes16x8 ( IRTemp, IRTemp ); static IRExpr* mk_CatOddLanes16x8 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveLO16x8 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveHI16x8 ( IRTemp, IRTemp ); + static IRExpr* mk_CatEvenLanes8x16 ( IRTemp, IRTemp ); static IRExpr* mk_CatOddLanes8x16 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveLO8x16 ( IRTemp, IRTemp ); +static IRExpr* mk_InterleaveHI8x16 ( IRTemp, IRTemp ); /* end FIXME -- rm temp scaffolding */ /* Generate N copies of |bit| in the bottom of a ULong. */ @@ -4583,6 +4592,38 @@ static Bool getLaneInfo_SIMPLE ( /*OUT*/Bool* zeroUpper, } +/* Helper for decoding laneage for shift-style vector operations + that involve an immediate shift amount. */ +static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2, + UInt immh, UInt immb ) +{ + vassert(immh < (1<<4)); + vassert(immb < (1<<3)); + UInt immhb = (immh << 3) | immb; + if (immh & 8) { + if (shift) *shift = 128 - immhb; + if (szBlg2) *szBlg2 = 3; + return True; + } + if (immh & 4) { + if (shift) *shift = 64 - immhb; + if (szBlg2) *szBlg2 = 2; + return True; + } + if (immh & 2) { + if (shift) *shift = 32 - immhb; + if (szBlg2) *szBlg2 = 1; + return True; + } + if (immh & 1) { + if (shift) *shift = 16 - immhb; + if (szBlg2) *szBlg2 = 0; + return True; + } + return False; +} + + /* Generate IR to fold all lanes of the V128 value in 'src' as characterised by the operator 'op', and return the result in the bottom bits of a V128, with all other bits set to zero. 
 */
@@ -5447,6 +5488,7 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
       0q1 01110 0 sz 1  m  111111 n d  FDIV Vd,Vn,Vm   4
       0q0 01110 0 sz 1  m  110011 n d  FMLA Vd,Vn,Vm   5
       0q0 01110 1 sz 1  m  110011 n d  FMLS Vd,Vn,Vm   6
+      0q1 01110 1 sz 1  m  110101 n d  FABD Vd,Vn,Vm   7
    */
    if (INSN(31,31) == 0 && INSN(28,24) == BITS5(0,1,1,1,0)
        && INSN(21,21) == 1) {
@@ -5465,6 +5507,7 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
       else if (b29 == 1 && b23 == 0 && b1510 == BITS6(1,1,1,1,1,1)) ix = 4;
       else if (b29 == 0 && b23 == 0 && b1510 == BITS6(1,1,0,0,1,1)) ix = 5;
       else if (b29 == 0 && b23 == 1 && b1510 == BITS6(1,1,0,0,1,1)) ix = 6;
+      else if (b29 == 1 && b23 == 1 && b1510 == BITS6(1,1,0,1,0,1)) ix = 7;
       IRType laneTy = Ity_INVALID;
       Bool   zeroHI = False;
       const HChar* arr = "??";
@@ -5502,11 +5545,28 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
                          mkexpr(rm), getQReg128(nn), getQReg128(mm)));
          assign(t2, triop(ix == 5 ? opADD : opSUB,
                           mkexpr(rm), getQReg128(dd), mkexpr(t1)));
-         putQReg128(dd, mkexpr(t2));
+         putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(t2))
+                               : mkexpr(t2));
          DIP("%s %s.%s, %s.%s, %s.%s\n", ix == 5 ? "fmla" : "fmls",
              nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
          return True;
       }
+      if (ok && ix == 7) {
+         IROp opSUB = laneTy==Ity_F64 ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
+         IROp opABS = laneTy==Ity_F64 ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
+         IRTemp rm = mk_get_IR_rounding_mode();
+         IRTemp t1 = newTemp(Ity_V128);
+         IRTemp t2 = newTemp(Ity_V128);
+         // FIXME: use Abd primop instead?
+         assign(t1, triop(opSUB,
+                          mkexpr(rm), getQReg128(nn), getQReg128(mm)));
+         assign(t2, unop(opABS, mkexpr(t1)));
+         putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, mkexpr(t2))
+                               : mkexpr(t2));
+         DIP("fabd %s.%s, %s.%s, %s.%s\n",
+             nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+         return True;
+      }
    }
 
    /* ---------------- ADD/SUB (vector) ---------------- */
@@ -5762,6 +5822,274 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
       return True;
    }
 
+   /* ---------- CM{EQ,HI,HS,GE,GT,TST,LE,LT} (vector) ---------- */
+   /* 31  28    23   21 15     9 4                               ix
+      0q1 01110 size 1  m 100011 n d  CMEQ  Vd.T, Vn.T, Vm.T  (1) ==
+      0q0 01110 size 1  m 100011 n d  CMTST Vd.T, Vn.T, Vm.T  (2) &, == 0
+
+      0q1 01110 size 1  m 001101 n d  CMHI Vd.T, Vn.T, Vm.T   (3) >u
+      0q0 01110 size 1  m 001101 n d  CMGT Vd.T, Vn.T, Vm.T   (4) >s
+
+      0q1 01110 size 1  m 001111 n d  CMHS Vd.T, Vn.T, Vm.T   (5) >=u
+      0q0 01110 size 1  m 001111 n d  CMGE Vd.T, Vn.T, Vm.T   (6) >=s
+
+      0q1 01110 size 100000 100010 n d  CMGE Vd.T, Vn.T, #0   (7) >=s 0
+      0q0 01110 size 100000 100010 n d  CMGT Vd.T, Vn.T, #0   (8) >s 0
+
+      0q1 01110 size 100000 100110 n d  CMLE Vd.T, Vn.T, #0   (9) <=s 0
+      0q0 01110 size 100000 100110 n d  CMEQ Vd.T, Vn.T, #0   (10) == 0
+
+      0q0 01110 size 100000 101010 n d  CMLT Vd.T, Vn.T, #0   (11) <s 0
+   */
+         /* x >  y   can be expressed directly
+            x <  y   ==   y > x
+            x <= y   ==   not (x > y)
+            x >= y   ==   not (y > x)
+         */
+         switch (ix) {
+            case 1: res = binop(opsEQ[szBlg2], argL, argR); break;
+            case 2: res = unop(Iop_NotV128,
+                               binop(opsEQ[szBlg2],
+                                     binop(Iop_AndV128, argL, argR),
+                                     mkV128(0x0000)));
+                    break;
+            case 3: res = binop(opsGTU[szBlg2], argL, argR); break;
+            case 4: res = binop(opsGTS[szBlg2], argL, argR); break;
+            case 5: res = unop(Iop_NotV128, binop(opsGTU[szBlg2], argR, argL));
+                    break;
+            case 6: res = unop(Iop_NotV128, binop(opsGTS[szBlg2], argR, argL));
+                    break;
+            case 7: res = unop(Iop_NotV128, binop(opsGTS[szBlg2], argR, argL));
+                    break;
+            case 8: res = binop(opsGTS[szBlg2], argL, argR); break;
+            case 9: res = unop(Iop_NotV128,
+                               binop(opsGTS[szBlg2], argL, argR));
+                    break;
+            case 10: res = binop(opsEQ[szBlg2], argL, argR); break;
+            case 11: res = binop(opsGTS[szBlg2], argR, argL); break;
+            default: vassert(0);
+         }
+         vassert(res);
+         putQReg128(dd, zeroHI ? unop(Iop_ZeroHI64ofV128, res) : res);
+         const HChar* nms[11] = { "eq", "tst", "hi", "gt", "hs", "ge",
+                                  "ge", "gt", "le", "eq", "lt" };
+         if (ix <= 6) {
+            DIP("cm%s %s.%s, %s.%s, %s.%s\n", nms[ix-1],
+                nameQReg128(dd), arrSpec,
+                nameQReg128(nn), arrSpec, nameQReg128(mm), arrSpec);
+         } else {
+            DIP("cm%s %s.%s, %s.%s, #0\n", nms[ix-1],
+                nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec);
+         }
+         return True;
+      }
+      /* else fall through */
+   }
+
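
Aside (illustrative only, not part of the patch): the CM* switch above builds every comparison from just GT/GTU/EQ plus NOT, using the identities quoted in the comment. A minimal standalone C check of those identities on a single signed 8-bit lane -- the helper names below are invented for this sketch -- could look like:

#include <assert.h>
#include <stdint.h>

/* All-ones/all-zeros lane results, as the CM* instructions produce. */
static uint8_t lane_GTS ( int8_t x, int8_t y ) { return x > y  ? 0xFF : 0x00; }
static uint8_t lane_EQ  ( int8_t x, int8_t y ) { return x == y ? 0xFF : 0x00; }

static void check_identities ( int8_t x, int8_t y )
{
   /* x >= y  ==  not (y > x)            -- cases 6 and 7 */
   assert( (uint8_t)(x >= y ? 0xFF : 0) == (uint8_t)~lane_GTS(y, x) );
   /* x <= y  ==  not (x > y)            -- case 9 */
   assert( (uint8_t)(x <= y ? 0xFF : 0) == (uint8_t)~lane_GTS(x, y) );
   /* x <  y  ==  y > x                  -- case 11 */
   assert( (uint8_t)(x <  y ? 0xFF : 0) == lane_GTS(y, x) );
   /* CMTST:  (x & y) != 0  ==  not ((x & y) == 0)   -- case 2 */
   assert( (uint8_t)((x & y) ? 0xFF : 0) == (uint8_t)~lane_EQ((int8_t)(x & y), 0) );
}

int main ( void )
{
   for (int x = -128; x < 128; x++)
      for (int y = -128; y < 128; y++)
         check_identities((int8_t)x, (int8_t)y);
   return 0;
}
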
+   /* -------------- {EOR,BSL,BIT,BIF} (vector) -------------- */
+   /* 31  28    23 20 15     9 4
+      0q1 01110 00 1  m 000111 n d  EOR Vd.T, Vm.T, Vn.T
+      0q1 01110 01 1  m 000111 n d  BSL Vd.T, Vm.T, Vn.T
+      0q1 01110 10 1  m 000111 n d  BIT Vd.T, Vm.T, Vn.T
+      0q1 01110 11 1  m 000111 n d  BIF Vd.T, Vm.T, Vn.T
+   */
+   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(1,0,1,1,1,0)
+       && INSN(21,21) == 1 && INSN(15,10) == BITS6(0,0,0,1,1,1)) {
+      Bool isQ = INSN(30,30) == 1;
+      UInt op  = INSN(23,22);
+      UInt mm  = INSN(20,16);
+      UInt nn  = INSN(9,5);
+      UInt dd  = INSN(4,0);
+      IRTemp argD = newTemp(Ity_V128);
+      IRTemp argN = newTemp(Ity_V128);
+      IRTemp argM = newTemp(Ity_V128);
+      assign(argD, getQReg128(dd));
+      assign(argN, getQReg128(nn));
+      assign(argM, getQReg128(mm));
+      const IROp opXOR = Iop_XorV128;
+      const IROp opAND = Iop_AndV128;
+      const IROp opNOT = Iop_NotV128;
+      IRExpr* res = NULL;
+      switch (op) {
+         case BITS2(0,0):
+            res = binop(opXOR, mkexpr(argM), mkexpr(argN));
+            break;
+         case BITS2(0,1):
+            res = binop(opXOR, mkexpr(argM),
+                        binop(opAND,
+                              binop(opXOR, mkexpr(argM), mkexpr(argN)),
+                              mkexpr(argD)));
+            break;
+         case BITS2(1,0):
+            res = binop(opXOR, mkexpr(argD),
+                        binop(opAND,
+                              binop(opXOR, mkexpr(argD), mkexpr(argN)),
+                              mkexpr(argM)));
+            break;
+         case BITS2(1,1):
+            res = binop(opXOR, mkexpr(argD),
+                        binop(opAND,
+                              binop(opXOR, mkexpr(argD), mkexpr(argN)),
+                              unop(opNOT, mkexpr(argM))));
+            break;
+         default:
+            vassert(0);
+      }
+      vassert(res);
+      putQReg128(dd, isQ ? res : unop(Iop_ZeroHI64ofV128, res));
+      const HChar* nms[4] = { "eor", "bsl", "bit", "bif" };
+      const HChar* arr = isQ ? "16b" : "8b";
+      vassert(op < 4);
+      DIP("%s %s.%s, %s.%s, %s.%s\n", nms[op],
+          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
+      return True;
+   }
+
+   /* ------------ {USHR,SSHR} (vector, immediate) ------------ */
+   /* 31  28     22   18   15     9 4
+      0q1 011110 immh immb 000001 n d  USHR Vd.T, Vn.T, #shift
+      0q0 011110 immh immb 000001 n d  SSHR Vd.T, Vn.T, #shift
+      laneTy, shift = case immh:immb of
+                         0001:xxx -> B, 8-xxx
+                         001x:xxx -> H, 16-xxxx
+                         01xx:xxx -> S, 32-xxxxx
+                         1xxx:xxx -> D, 64-xxxxxx
+                         other    -> invalid
+      As usual the case laneTy==D && q==0 is not allowed.
+   */
+   if (INSN(31,31) == 0 && INSN(28,23) == BITS6(0,1,1,1,1,0)
+       && INSN(15,10) == BITS6(0,0,0,0,0,1)) {
+      Bool isQ  = INSN(30,30) == 1;
+      Bool isU  = INSN(29,29) == 1;
+      UInt immh = INSN(22,19);
+      UInt immb = INSN(18,16);
+      UInt nn   = INSN(9,5);
+      UInt dd   = INSN(4,0);
+      const IROp opsSHRN[4]
+         = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
+      const IROp opsSARN[4]
+         = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
+      UInt szBlg2 = 0;
+      UInt shift  = 0;
+      Bool ok     = getLaneInfo_IMMH_IMMB(&shift, &szBlg2, immh, immb);
+      if (ok && szBlg2 < 4 && shift > 0 && shift < (8 << szBlg2)
+          && !(szBlg2 == 3/*64bit*/ && !isQ)) {
+         IROp    op  = isU ? opsSHRN[szBlg2] : opsSARN[szBlg2];
+         IRExpr* src = getQReg128(nn);
+         IRExpr* res = binop(op, src, mkU8(shift));
+         putQReg128(dd, isQ ? res : unop(Iop_ZeroHI64ofV128, res));
+         HChar laneCh = "bhsd"[szBlg2];
+         UInt  nLanes = (isQ ? 128 : 64) / (8 << szBlg2);
+         DIP("%s %s.%u%c, %s.%u%c, #%u\n", isU ? "ushr" : "sshr",
+             nameQReg128(dd), nLanes, laneCh,
+             nameQReg128(nn), nLanes, laneCh, shift);
+         return True;
+      }
+      /* else fall through */
+   }
+
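
Aside (illustrative only, not part of the patch): the shift decoder above leans on getLaneInfo_IMMH_IMMB, which recovers both the lane width and the shift count from the 7-bit immh:immb field: the highest set bit of immh selects the lane size, and the right-shift amount is 2*lanesize - immh:immb. A standalone sketch of that decoding, with invented names and purely for illustration:

#include <stdio.h>

/* Returns the lane size in bits (8/16/32/64) and writes the right-shift
   amount to *shift; returns 0 for the invalid encoding immh == 0. */
static unsigned decode_immh_immb ( unsigned immh, unsigned immb,
                                   unsigned* shift )
{
   unsigned immhb = (immh << 3) | immb;        /* 7-bit immh:immb field */
   if (immh & 8) { *shift = 128 - immhb; return 64; }
   if (immh & 4) { *shift =  64 - immhb; return 32; }
   if (immh & 2) { *shift =  32 - immhb; return 16; }
   if (immh & 1) { *shift =  16 - immhb; return  8; }
   return 0;
}

int main ( void )
{
   /* e.g. immh:immb = 1111:101 encodes a 64-bit lane shifted right by 3,
      since 128 - 0x7D == 3. */
   unsigned sh = 0;
   unsigned lanebits = decode_immh_immb(0xF, 0x5, &sh);
   printf("lane = %u bits, shift = #%u\n", lanebits, sh);  /* prints 64, 3 */
   return 0;
}
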
+   /* -------------------- {U,S}SHLL{,2} -------------------- */
+   /* 31  28     22   18   15     9 4
+      0q0 011110 immh immb 101001 n d  SSHLL Vd.Ta, Vn.Tb, #sh
+      0q1 011110 immh immb 101001 n d  USHLL Vd.Ta, Vn.Tb, #sh
+      where Ta,Tb,sh
+        = case immh of 0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8   (0..7)
+                       001x -> 4s, 4h(q0)/8h(q1),  immh:immb - 16  (0..15)
+                       01xx -> 2d, 2s(q0)/4s(q1),  immh:immb - 32  (0..31)
+   */
+   if (INSN(31,31) == 0 && INSN(28,23) == BITS6(0,1,1,1,1,0)
+       && INSN(15,10) == BITS6(1,0,1,0,0,1)) {
+      Bool isQ   = INSN(30,30) == 1;
+      Bool isU   = INSN(29,29) == 1;
+      UInt immh  = INSN(22,19);
+      UInt immb  = INSN(18,16);
+      UInt nn    = INSN(9,5);
+      UInt dd    = INSN(4,0);
+      UInt immhb = (immh << 3) | immb;
+      IRTemp  src  = newTemp(Ity_V128);
+      IRTemp  zero = newTemp(Ity_V128);
+      IRExpr* res  = NULL;
+      UInt    sh   = 0;
+      const HChar* ta = "??";
+      const HChar* tb = "??";
+      assign(src, getQReg128(nn));
+      assign(zero, mkV128(0x0000));
+      if (immh & 1) {
+         sh = immhb - 8;
+         vassert(sh < 8); /* so 8-sh is 1..8 */
+         ta = "8h";
+         tb = isQ ? "16b" : "8b";
+         IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
+                           : mk_InterleaveLO8x16(src, zero);
+         res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
+      }
+      else if (immh & 2) {
+         sh = immhb - 16;
+         vassert(sh < 16); /* so 16-sh is 1..16 */
+         ta = "4s";
+         tb = isQ ? "8h" : "4h";
+         IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
+                           : mk_InterleaveLO16x8(src, zero);
+         res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
+      }
+      else if (immh & 4) {
+         sh = immhb - 32;
+         vassert(sh < 32); /* so 32-sh is 1..32 */
+         ta = "2d";
+         tb = isQ ? "4s" : "2s";
+         IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
+                           : mk_InterleaveLO32x4(src, zero);
+         res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
+      }
+      /* */
+      if (res) {
+         putQReg128(dd, res);
+         DIP("%cshll%s %s.%s, %s.%s, #%d\n",
+             isU ? 'u' : 's', isQ ?
"2" : "", + nameQReg128(dd), ta, nameQReg128(nn), tb, sh); + return True; + } + /* else fall through */ + } + /* -------------------- XTN{,2} -------------------- */ /* 31 28 23 21 15 9 4 XTN{,2} Vd.Tb, Vn.Ta 0q0 01110 size 100001 001010 n d @@ -6010,6 +6338,53 @@ Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn) /* else fall through */ } + /* -------------------- INS (general) -------------------- */ + /* 31 28 20 15 9 4 + 010 01110000 imm5 000111 n d INS Vd.Ts[ix], Rn + where Ts,ix = case imm5 of xxxx1 -> B, xxxx + xxx10 -> H, xxx + xx100 -> S, xx + x1000 -> D, x + */ + if (INSN(31,21) == BITS11(0,1,0,0,1,1,1,0,0,0,0) + && INSN(15,10) == BITS6(0,0,0,1,1,1)) { + UInt imm5 = INSN(20,16); + UInt nn = INSN(9,5); + UInt dd = INSN(4,0); + HChar ts = '?'; + UInt laneNo = 16; + IRExpr* src = NULL; + if (imm5 & 1) { + src = unop(Iop_64to8, getIReg64orZR(nn)); + laneNo = (imm5 >> 1) & 15; + ts = 'b'; + } + else if (imm5 & 2) { + src = unop(Iop_64to16, getIReg64orZR(nn)); + laneNo = (imm5 >> 2) & 7; + ts = 'h'; + } + else if (imm5 & 4) { + src = unop(Iop_64to32, getIReg64orZR(nn)); + laneNo = (imm5 >> 3) & 3; + ts = 's'; + } + else if (imm5 & 8) { + src = getIReg64orZR(nn); + laneNo = (imm5 >> 4) & 1; + ts = 'd'; + } + /* */ + if (src) { + vassert(laneNo < 16); + putQRegLane(dd, laneNo, src); + DIP("ins %s.%c[%u], %s\n", + nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn)); + return True; + } + /* else invalid; fall through */ + } + /* FIXME Temporary hacks to get through ld.so FIXME */ /* ------------------ movi vD.4s, #0x0 ------------------ */ @@ -6400,6 +6775,23 @@ static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) return mkexpr(mkV128from32s(a3, a1, b3, b1)); } +static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a1 b1 a0 b0 + IRTemp a1, a0, b1, b0; + breakV128to32s(NULL, NULL, &a1, &a0, a3210); + breakV128to32s(NULL, NULL, &b1, &b0, b3210); + return mkexpr(mkV128from32s(a1, b1, a0, b0)); +} + +static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) +{ + // returns a3 b3 a2 b2 + IRTemp a3, a2, b3, b2; + breakV128to32s(&a3, &a2, NULL, NULL, a3210); + breakV128to32s(&b3, &b2, NULL, NULL, b3210); + return mkexpr(mkV128from32s(a3, b3, a2, b2)); +} //////////////////////////////////////////////////////////////// // 16x8 operations @@ -6509,6 +6901,24 @@ static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) return mkexpr(mkV128from16s(a7, a5, a3, a1, b7, b5, b3, b1)); } +static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a3 b3 a2 b2 a1 b1 a0 b0 + IRTemp a3, b3, a2, b2, a1, a0, b1, b0; + breakV128to16s(NULL, NULL, NULL, NULL, &a3, &a2, &a1, &a0, a76543210); + breakV128to16s(NULL, NULL, NULL, NULL, &b3, &b2, &b1, &b0, b76543210); + return mkexpr(mkV128from16s(a3, b3, a2, b2, a1, b1, a0, b0)); +} + +static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) +{ + // returns a7 b7 a6 b6 a5 b5 a4 b4 + IRTemp a7, b7, a6, b6, a5, b5, a4, b4; + breakV128to16s(&a7, &a6, &a5, &a4, NULL, NULL, NULL, NULL, a76543210); + breakV128to16s(&b7, &b6, &b5, &b4, NULL, NULL, NULL, NULL, b76543210); + return mkexpr(mkV128from16s(a7, b7, a6, b6, a5, b5, a4, b4)); +} + //////////////////////////////////////////////////////////////// // 8x16 operations // @@ -6686,6 +7096,35 @@ static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210, bF, bD, bB, b9, b7, b5, b3, b1)); } +static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210, + IRTemp bFEDCBA9876543210 ) +{ + // returns a7 b7 
a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 + IRTemp a7, b7, a6, b6, a5, b5, a4, b4, a3, b3, a2, b2, a1, b1, a0, b0; + breakV128to8s(NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0, + aFEDCBA9876543210); + breakV128to8s(NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &b7, &b6, &b5, &b4, &b3, &b2, &b1, &b0, + bFEDCBA9876543210); + return mkexpr(mkV128from8s(a7, b7, a6, b6, a5, b5, a4, b4, + a3, b3, a2, b2, a1, b1, a0, b0)); +} + +static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210, + IRTemp bFEDCBA9876543210 ) +{ + // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8 + IRTemp aF, bF, aE, bE, aD, bD, aC, bC, aB, bB, aA, bA, a9, b9, a8, b8; + breakV128to8s(&aF, &aE, &aD, &aC, &aB, &aA, &a9, &a8, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + aFEDCBA9876543210); + breakV128to8s(&bF, &bE, &bD, &bC, &bB, &bA, &b9, &b8, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + bFEDCBA9876543210); + return mkexpr(mkV128from8s(aF, bF, aE, bE, aD, bD, aC, bC, + aB, bB, aA, bA, a9, b9, a8, b8)); +} /*--------------------------------------------------------------------*/ /*--- end guest_arm64_toIR.c ---*/ diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c index accc74ce6d..21e12813a6 100644 --- a/VEX/priv/host_arm64_defs.c +++ b/VEX/priv/host_arm64_defs.c @@ -200,7 +200,6 @@ void getAllocableRegs_ARM64 ( Int* nregs, HReg** arr ) } - /* --------- Condition codes, ARM64 encoding. --------- */ static const HChar* showARM64CondCode ( ARM64CondCode cond ) { @@ -881,6 +880,8 @@ static void showARM64VecBinOp(/*OUT*/const HChar** nm, case ARM64vecb_SMIN8x16: *nm = "smin"; *ar = "16b"; return; case ARM64vecb_AND: *nm = "and "; *ar = "all"; return; case ARM64vecb_ORR: *nm = "orr "; *ar = "all"; return; + case ARM64vecb_XOR: *nm = "eor "; *ar = "all"; return; + case ARM64vecb_CMEQ64x2: *nm = "cmeq"; *ar = "2d"; return; default: vpanic("showARM64VecBinOp"); } } @@ -889,11 +890,23 @@ static void showARM64VecUnaryOp(/*OUT*/const HChar** nm, /*OUT*/const HChar** ar, ARM64VecUnaryOp op ) { switch (op) { - case ARM64vecu_FNEG64x2: *nm = "fneg "; *ar = "2d"; return; + case ARM64vecu_FNEG64x2: *nm = "fneg "; *ar = "2d"; return; case ARM64vecu_FNEG32x4: *nm = "fneg "; *ar = "4s"; return; case ARM64vecu_FABS64x2: *nm = "fabs "; *ar = "2d"; return; case ARM64vecu_FABS32x4: *nm = "fabs "; *ar = "4s"; return; - default: vpanic("showARM64VecBinOp"); + case ARM64vecu_NOT: *nm = "not "; *ar = "all"; return; + default: vpanic("showARM64VecUnaryOp"); + } +} + +static void showARM64VecShiftOp(/*OUT*/const HChar** nm, + /*OUT*/const HChar** ar, + ARM64VecShiftOp op ) +{ + switch (op) { + case ARM64vecsh_USHR64x2: *nm = "ushr "; *ar = "2d"; return; + case ARM64vecsh_SSHR64x2: *nm = "sshr "; *ar = "2d"; return; + default: vpanic("showARM64VecShiftImmOp"); } } @@ -1588,6 +1601,25 @@ ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ) { vassert(dszBlg2 == 0 || dszBlg2 == 1 || dszBlg2 == 2); return i; } +ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op, + HReg dst, HReg src, UInt amt ) { + ARM64Instr* i = LibVEX_Alloc(sizeof(ARM64Instr)); + i->tag = ARM64in_VShiftImmV; + i->ARM64in.VShiftImmV.op = op; + i->ARM64in.VShiftImmV.dst = dst; + i->ARM64in.VShiftImmV.src = src; + i->ARM64in.VShiftImmV.amt = amt; + UInt maxSh = 0; + switch (op) { + case ARM64vecsh_USHR64x2: case ARM64vecsh_SSHR64x2: + maxSh = 63; break; + default: + vassert(0); + } + vassert(maxSh > 0); + vassert(amt > 0 && amt <= maxSh); + return i; +} //ZZ ARMInstr* ARMInstr_VAluS 
( ARMVfpOp op, HReg dst, HReg argL, HReg argR ) { //ZZ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr)); //ZZ i->tag = ARMin_VAluS; @@ -2215,6 +2247,17 @@ void ppARM64Instr ( ARM64Instr* i ) { vex_printf(".%s", dszBlg2 < 3 ? sarr[dszBlg2] : "??"); return; } + case ARM64in_VShiftImmV: { + const HChar* nm = "??"; + const HChar* ar = "??"; + showARM64VecShiftOp(&nm, &ar, i->ARM64in.VShiftImmV.op); + vex_printf("%s ", nm); + ppHRegARM64(i->ARM64in.VShiftImmV.dst); + vex_printf(".%s, ", ar); + ppHRegARM64(i->ARM64in.VShiftImmV.src); + vex_printf(".%s, #%u", ar, i->ARM64in.VShiftImmV.amt); + return; + } //ZZ case ARMin_VAluS: //ZZ vex_printf("f%-3ss ", showARMVfpOp(i->ARMin.VAluS.op)); //ZZ ppHRegARM(i->ARMin.VAluS.dst); @@ -2691,6 +2734,10 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, ARM64Instr* i, Bool mode64 ) addHRegUse(u, HRmWrite, i->ARM64in.VNarrowV.dst); addHRegUse(u, HRmRead, i->ARM64in.VNarrowV.src); return; + case ARM64in_VShiftImmV: + addHRegUse(u, HRmWrite, i->ARM64in.VShiftImmV.dst); + addHRegUse(u, HRmRead, i->ARM64in.VShiftImmV.src); + return; //ZZ case ARMin_VAluS: //ZZ addHRegUse(u, HRmWrite, i->ARMin.VAluS.dst); //ZZ addHRegUse(u, HRmRead, i->ARMin.VAluS.argL); @@ -2979,6 +3026,12 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) i->ARM64in.VNarrowV.dst = lookupHRegRemap(m, i->ARM64in.VNarrowV.dst); i->ARM64in.VNarrowV.src = lookupHRegRemap(m, i->ARM64in.VNarrowV.src); return; + case ARM64in_VShiftImmV: + i->ARM64in.VShiftImmV.dst + = lookupHRegRemap(m, i->ARM64in.VShiftImmV.dst); + i->ARM64in.VShiftImmV.src + = lookupHRegRemap(m, i->ARM64in.VShiftImmV.src); + return; //ZZ case ARMin_VAluS: //ZZ i->ARMin.VAluS.dst = lookupHRegRemap(m, i->ARMin.VAluS.dst); //ZZ i->ARMin.VAluS.argL = lookupHRegRemap(m, i->ARMin.VAluS.argL); @@ -3272,6 +3325,7 @@ static inline UChar qregNo ( HReg r ) #define X11111 BITS8(0,0,0, 1,1,1,1,1) #define X000000 BITS8(0,0, 0,0,0,0,0,0) +#define X000001 BITS8(0,0, 0,0,0,0,0,1) #define X000100 BITS8(0,0, 0,0,0,1,0,0) #define X000111 BITS8(0,0, 0,0,0,1,1,1) #define X001000 BITS8(0,0, 0,0,1,0,0,0) @@ -3280,11 +3334,14 @@ static inline UChar qregNo ( HReg r ) #define X001111 BITS8(0,0, 0,0,1,1,1,1) #define X010000 BITS8(0,0, 0,1,0,0,0,0) #define X010001 BITS8(0,0, 0,1,0,0,0,1) +#define X010110 BITS8(0,0, 0,1,0,1,1,0) #define X011001 BITS8(0,0, 0,1,1,0,0,1) #define X011010 BITS8(0,0, 0,1,1,0,1,0) #define X011011 BITS8(0,0, 0,1,1,0,1,1) +#define X011110 BITS8(0,0, 0,1,1,1,1,0) #define X011111 BITS8(0,0, 0,1,1,1,1,1) #define X100001 BITS8(0,0, 1,0,0,0,0,1) +#define X100011 BITS8(0,0, 1,0,0,0,1,1) #define X100100 BITS8(0,0, 1,0,0,1,0,0) #define X100101 BITS8(0,0, 1,0,0,1,0,1) #define X100110 BITS8(0,0, 1,0,0,1,1,0) @@ -3299,6 +3356,8 @@ static inline UChar qregNo ( HReg r ) #define X111110 BITS8(0,0, 1,1,1,1,1,0) #define X111111 BITS8(0,0, 1,1,1,1,1,1) +#define X1000000 BITS8(0, 1,0,0,0,0,0,0) + #define X00100000 BITS8(0,0,1,0,0,0,0,0) #define X00100001 BITS8(0,0,1,0,0,0,0,1) #define X00100010 BITS8(0,0,1,0,0,0,1,0) @@ -3421,6 +3480,25 @@ static inline UInt X_3_5_8_6_5_5 ( UInt f1, UInt f2, UInt f3, return w; } +static inline UInt X_3_6_7_6_5_5 ( UInt f1, UInt f2, UInt f3, + UInt f4, UInt f5, UInt f6 ) { + vassert(3+6+7+6+5+5 == 32); + vassert(f1 < (1<<3)); + vassert(f2 < (1<<6)); + vassert(f3 < (1<<7)); + vassert(f4 < (1<<6)); + vassert(f5 < (1<<5)); + vassert(f6 < (1<<5)); + UInt w = 0; + w = (w << 3) | f1; + w = (w << 6) | f2; + w = (w << 7) | f3; + w = (w << 6) | f4; + w = (w << 5) | f5; + w = (w << 5) | f6; + return w; +} + /* --- 7 
fields --- */ static inline UInt X_2_6_3_9_2_5_5 ( UInt f1, UInt f2, UInt f3, @@ -4843,6 +4921,11 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, 010 01110 00 1 m 000111 n d AND Vd, Vn, Vm 010 01110 10 1 m 000111 n d ORR Vd, Vn, Vm + 011 01110 00 1 m 000111 n d EOR Vd, Vn, Vm + + 011 01110 11 1 m 100011 n d CMEQ Vd.2d, Vn.2d, Vm.2d + 011 01110 11 1 m 001101 n d CMHI Vd.2d, Vn.2d, Vm.2d >u, ATC + 010 01110 11 1 m 001101 n d CMGT Vd.2d, Vn.2d, Vm.2d >s, ATC */ UInt vD = qregNo(i->ARM64in.VBinV.dst); UInt vN = qregNo(i->ARM64in.VBinV.argL); @@ -4931,12 +5014,20 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, *p++ = X_3_8_5_6_5_5(X010, X01110011, vM, X011011, vN, vD); break; + case ARM64vecb_AND: + *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X000111, vN, vD); + break; case ARM64vecb_ORR: *p++ = X_3_8_5_6_5_5(X010, X01110101, vM, X000111, vN, vD); break; - case ARM64vecb_AND: - *p++ = X_3_8_5_6_5_5(X010, X01110001, vM, X000111, vN, vD); + case ARM64vecb_XOR: + *p++ = X_3_8_5_6_5_5(X011, X01110001, vM, X000111, vN, vD); + break; + + case ARM64vecb_CMEQ64x2: + *p++ = X_3_8_5_6_5_5(X011, X01110111, vM, X100011, vN, vD); break; + default: goto bad; } @@ -4944,17 +5035,24 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, } case ARM64in_VUnaryV: { /* 31 23 20 15 9 4 - 010 01110 11 1 00000 111110 n d FABS Vd.2d, Vn.2d - 010 01110 10 1 00000 111110 n d FABS Vd.4s, Vn.4s - 011 01110 11 1 00000 111110 n d FNEG Vd.2d, Vn.2d - 011 01110 10 1 00000 111110 n d FNEG Vd.4s, Vn.4s + 010 01110 11 1 00000 111110 n d FABS Vd.2d, Vn.2d + 010 01110 10 1 00000 111110 n d FABS Vd.4s, Vn.4s + 011 01110 11 1 00000 111110 n d FNEG Vd.2d, Vn.2d + 011 01110 10 1 00000 111110 n d FNEG Vd.4s, Vn.4s + 011 01110 00 1 00000 010110 n d NOT Vd.16b, Vn.16b */ UInt vD = qregNo(i->ARM64in.VUnaryV.dst); UInt vN = qregNo(i->ARM64in.VUnaryV.arg); switch (i->ARM64in.VUnaryV.op) { + case ARM64vecu_FABS64x2: + *p++ = X_3_8_5_6_5_5(X010, X01110111, X00000, X111110, vN, vD); + break; case ARM64vecu_FNEG64x2: *p++ = X_3_8_5_6_5_5(X011, X01110111, X00000, X111110, vN, vD); break; + case ARM64vecu_NOT: + *p++ = X_3_8_5_6_5_5(X011, X01110001, X00000, X010110, vN, vD); + break; default: goto bad; } @@ -4974,6 +5072,37 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, X00001, X001010, vN, vD); goto done; } + case ARM64in_VShiftImmV: { + /* + 0q1 011110 immh immb 000001 n d USHR Vd.T, Vn.T, #sh + 0q0 011110 immh immb 000001 n d SSHR Vd.T, Vn.T, #sh + where immh:immb + = case T of + 2d | sh in 1..63 -> let xxxxxx = 64-sh in 1xxx:xxx + 4s | sh in 1..31 -> let xxxxx = 32-sh in 01xx:xxx + 8h | sh in 1..15 -> let xxxx = 16-sh in 001x:xxx + 16b | sh in 1..7 -> let xxx = 8-sh in 0001:xxx + */ + UInt vD = qregNo(i->ARM64in.VShiftImmV.dst); + UInt vN = qregNo(i->ARM64in.VShiftImmV.src); + UInt sh = i->ARM64in.VShiftImmV.amt; + ARM64VecShiftOp op = i->ARM64in.VShiftImmV.op; + Bool syned = False; + switch (op) { + case ARM64vecsh_SSHR64x2: syned = True; + case ARM64vecsh_USHR64x2: /* fallthrough */ + if (sh >= 1 && sh <= 63) { + UInt xxxxxx = 64-sh; + *p++ = X_3_6_7_6_5_5(syned ? 
X010 : X011, X011110, + X1000000 | xxxxxx, X000001, vN, vD); + goto done; + } + break; + default: + break; + } + goto bad; + } //ZZ case ARMin_VAluS: { //ZZ UInt dN = fregNo(i->ARMin.VAluS.argL); //ZZ UInt dD = fregNo(i->ARMin.VAluS.dst); diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h index 489b1b05c9..b6e4da8185 100644 --- a/VEX/priv/host_arm64_defs.h +++ b/VEX/priv/host_arm64_defs.h @@ -337,6 +337,8 @@ typedef ARM64vecb_SMIN8x16, ARM64vecb_AND, ARM64vecb_ORR, + ARM64vecb_XOR, + ARM64vecb_CMEQ64x2, ARM64vecb_INVALID } ARM64VecBinOp; @@ -347,10 +349,19 @@ typedef ARM64vecu_FNEG32x4, ARM64vecu_FABS64x2, ARM64vecu_FABS32x4, + ARM64vecu_NOT, ARM64vecu_INVALID } ARM64VecUnaryOp; +typedef + enum { + ARM64vecsh_USHR64x2=350, + ARM64vecsh_SSHR64x2, + ARM64vecsh_INVALID + } + ARM64VecShiftOp; + //ZZ extern const HChar* showARMVfpUnaryOp ( ARMVfpUnaryOp op ); //ZZ //ZZ typedef @@ -534,6 +545,7 @@ typedef ARM64in_VBinV, ARM64in_VUnaryV, ARM64in_VNarrowV, + ARM64in_VShiftImmV, //ZZ ARMin_VAluS, //ZZ ARMin_VCMovD, //ZZ ARMin_VCMovS, @@ -819,6 +831,15 @@ typedef HReg dst; // Q reg HReg src; // Q reg } VNarrowV; + /* Vector shift by immediate. |amt| needs to be > 0 and < + implied lane size of |op|. Zero shifts and out of range + shifts are not allowed. */ + struct { + ARM64VecShiftOp op; + HReg dst; + HReg src; + UInt amt; + } VShiftImmV; //ZZ /* 32-bit FP binary arithmetic */ //ZZ struct { //ZZ ARMVfpOp op; @@ -1022,6 +1043,8 @@ extern ARM64Instr* ARM64Instr_FPCR ( Bool toFPCR, HReg iReg ); extern ARM64Instr* ARM64Instr_VBinV ( ARM64VecBinOp op, HReg, HReg, HReg ); extern ARM64Instr* ARM64Instr_VUnaryV ( ARM64VecUnaryOp op, HReg, HReg ); extern ARM64Instr* ARM64Instr_VNarrowV ( UInt dszBlg2, HReg dst, HReg src ); +extern ARM64Instr* ARM64Instr_VShiftImmV ( ARM64VecShiftOp op, + HReg dst, HReg src, UInt amt ); //ZZ extern ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg, HReg, HReg ); //ZZ extern ARMInstr* ARMInstr_VCMovD ( ARMCondCode, HReg dst, HReg src ); //ZZ extern ARMInstr* ARMInstr_VCMovS ( ARMCondCode, HReg dst, HReg src ); diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c index 3d81c0b7b1..0c8d2315b5 100644 --- a/VEX/priv/host_arm64_isel.c +++ b/VEX/priv/host_arm64_isel.c @@ -4361,11 +4361,15 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) /* Other cases */ switch (e->Iex.Unop.op) { + case Iop_NotV128: + case Iop_Abs64Fx2: case Iop_Neg64Fx2: { HReg res = newVRegV(env); HReg arg = iselV128Expr(env, e->Iex.Unop.arg); ARM64VecUnaryOp op = ARM64vecu_INVALID; switch (e->Iex.Unop.op) { + case Iop_NotV128: op = ARM64vecu_NOT; break; + case Iop_Abs64Fx2: op = ARM64vecu_FABS64x2; break; case Iop_Neg64Fx2: op = ARM64vecu_FNEG64x2; break; default: vassert(0); } @@ -4852,6 +4856,7 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) //ZZ case Iop_Add32x4: case Iop_AndV128: case Iop_OrV128: + case Iop_XorV128: case Iop_Max32Ux4: case Iop_Max16Ux8: case Iop_Max8Ux16: @@ -4869,32 +4874,35 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) case Iop_Sub32x4: case Iop_Sub16x8: case Iop_Mul32x4: - case Iop_Mul16x8: { + case Iop_Mul16x8: + case Iop_CmpEQ64x2: { HReg res = newVRegV(env); HReg argL = iselV128Expr(env, e->Iex.Binop.arg1); HReg argR = iselV128Expr(env, e->Iex.Binop.arg2); ARM64VecBinOp op = ARM64vecb_INVALID; switch (e->Iex.Binop.op) { - case Iop_AndV128: op = ARM64vecb_AND; break; - case Iop_OrV128: op = ARM64vecb_ORR; break; - case Iop_Max32Ux4: op = ARM64vecb_UMAX32x4; break; - case Iop_Max16Ux8: op = ARM64vecb_UMAX16x8; break; - 
case Iop_Max8Ux16: op = ARM64vecb_UMAX8x16; break; - case Iop_Min32Ux4: op = ARM64vecb_UMIN32x4; break; - case Iop_Min16Ux8: op = ARM64vecb_UMIN16x8; break; - case Iop_Min8Ux16: op = ARM64vecb_UMIN8x16; break; - case Iop_Max32Sx4: op = ARM64vecb_SMAX32x4; break; - case Iop_Max16Sx8: op = ARM64vecb_SMAX16x8; break; - case Iop_Min32Sx4: op = ARM64vecb_SMIN32x4; break; - case Iop_Min16Sx8: op = ARM64vecb_SMIN16x8; break; - case Iop_Add64x2: op = ARM64vecb_ADD64x2; break; - case Iop_Add32x4: op = ARM64vecb_ADD32x4; break; - case Iop_Add16x8: op = ARM64vecb_ADD16x8; break; - case Iop_Sub64x2: op = ARM64vecb_SUB64x2; break; - case Iop_Sub32x4: op = ARM64vecb_SUB32x4; break; - case Iop_Sub16x8: op = ARM64vecb_SUB16x8; break; - case Iop_Mul32x4: op = ARM64vecb_MUL32x4; break; - case Iop_Mul16x8: op = ARM64vecb_MUL16x8; break; + case Iop_AndV128: op = ARM64vecb_AND; break; + case Iop_OrV128: op = ARM64vecb_ORR; break; + case Iop_XorV128: op = ARM64vecb_XOR; break; + case Iop_Max32Ux4: op = ARM64vecb_UMAX32x4; break; + case Iop_Max16Ux8: op = ARM64vecb_UMAX16x8; break; + case Iop_Max8Ux16: op = ARM64vecb_UMAX8x16; break; + case Iop_Min32Ux4: op = ARM64vecb_UMIN32x4; break; + case Iop_Min16Ux8: op = ARM64vecb_UMIN16x8; break; + case Iop_Min8Ux16: op = ARM64vecb_UMIN8x16; break; + case Iop_Max32Sx4: op = ARM64vecb_SMAX32x4; break; + case Iop_Max16Sx8: op = ARM64vecb_SMAX16x8; break; + case Iop_Min32Sx4: op = ARM64vecb_SMIN32x4; break; + case Iop_Min16Sx8: op = ARM64vecb_SMIN16x8; break; + case Iop_Add64x2: op = ARM64vecb_ADD64x2; break; + case Iop_Add32x4: op = ARM64vecb_ADD32x4; break; + case Iop_Add16x8: op = ARM64vecb_ADD16x8; break; + case Iop_Sub64x2: op = ARM64vecb_SUB64x2; break; + case Iop_Sub32x4: op = ARM64vecb_SUB32x4; break; + case Iop_Sub16x8: op = ARM64vecb_SUB16x8; break; + case Iop_Mul32x4: op = ARM64vecb_MUL32x4; break; + case Iop_Mul16x8: op = ARM64vecb_MUL16x8; break; + case Iop_CmpEQ64x2: op = ARM64vecb_CMEQ64x2; break; default: vassert(0); } addInstr(env, ARM64Instr_VBinV(op, res, argL, argR)); @@ -5393,27 +5401,32 @@ static HReg iselV128Expr_wrk ( ISelEnv* env, IRExpr* e ) //ZZ case Iop_ShrN8x16: //ZZ case Iop_ShrN16x8: //ZZ case Iop_ShrN32x4: -//ZZ case Iop_ShrN64x2: { -//ZZ HReg res = newVRegV(env); -//ZZ HReg tmp = newVRegV(env); -//ZZ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1); -//ZZ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); -//ZZ HReg argR2 = newVRegI(env); -//ZZ UInt size; -//ZZ switch (e->Iex.Binop.op) { -//ZZ case Iop_ShrN8x16: size = 0; break; -//ZZ case Iop_ShrN16x8: size = 1; break; -//ZZ case Iop_ShrN32x4: size = 2; break; -//ZZ case Iop_ShrN64x2: size = 3; break; -//ZZ default: vassert(0); -//ZZ } -//ZZ addInstr(env, ARMInstr_Unary(ARMun_NEG, argR2, argR)); -//ZZ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, -//ZZ tmp, argR2, 0, True)); -//ZZ addInstr(env, ARMInstr_NShift(ARMneon_VSHL, -//ZZ res, argL, tmp, size, True)); -//ZZ return res; -//ZZ } + case Iop_ShrN64x2: + case Iop_SarN64x2: { + IRExpr* argL = e->Iex.Binop.arg1; + IRExpr* argR = e->Iex.Binop.arg2; + if (argR->tag == Iex_Const && argR->Iex.Const.con->tag == Ico_U8) { + UInt amt = argR->Iex.Const.con->Ico.U8; + UInt limit = 0; + ARM64VecShiftOp op = ARM64vecsh_INVALID; + switch (e->Iex.Binop.op) { + case Iop_ShrN64x2: + op = ARM64vecsh_USHR64x2; limit = 63; break; + case Iop_SarN64x2: + op = ARM64vecsh_SSHR64x2; limit = 63; break; + default: + vassert(0); + } + if (op != ARM64vecsh_INVALID && amt > 0 && amt <= limit) { + HReg src = iselV128Expr(env, argL); + HReg dst = newVRegV(env); + 
addInstr(env, ARM64Instr_VShiftImmV(op, dst, src, amt));
+                  return dst;
+               }
+            }
+            /* else fall out; this is unhandled */
+            break;
+         }
 //ZZ       case Iop_ShlN8x16:
 //ZZ       case Iop_ShlN16x8:
 //ZZ       case Iop_ShlN32x4: