From: Julian Seward Date: Wed, 27 Mar 2013 11:37:33 +0000 (+0000) Subject: AMD64: Add support for AVX2, BMI1, BMI2 and FMA instructions (VEX side). X-Git-Tag: svn/VALGRIND_3_9_0^2~93 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=26fc722aa7a2e9ee4aee659a76cf8014f387f888;p=thirdparty%2Fvalgrind.git AMD64: Add support for AVX2, BMI1, BMI2 and FMA instructions (VEX side). Fixes #305728. (Jakub Jelinek, jakub@redhat.com) git-svn-id: svn://svn.valgrind.org/vex/trunk@2702 --- diff --git a/VEX/priv/guest_amd64_defs.h b/VEX/priv/guest_amd64_defs.h index 487f6f89a9..3bd52d4839 100644 --- a/VEX/priv/guest_amd64_defs.h +++ b/VEX/priv/guest_amd64_defs.h @@ -154,6 +154,9 @@ extern ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo, ULong dHi, ULong dLo, ULong imm_and_return_control_bit ); +extern ULong amd64g_calculate_pext ( ULong, ULong ); +extern ULong amd64g_calculate_pdep ( ULong, ULong ); + /* --- DIRTY HELPERS --- */ extern ULong amd64g_dirtyhelper_loadF80le ( ULong/*addr*/ ); @@ -508,6 +511,18 @@ enum { AMD64G_CC_OP_SMULL, /* 51 */ AMD64G_CC_OP_SMULQ, /* 52 */ + AMD64G_CC_OP_ANDN32, /* 53 */ + AMD64G_CC_OP_ANDN64, /* 54 DEP1 = res, DEP2 = 0, NDEP = unused */ + + AMD64G_CC_OP_BLSI32, /* 55 */ + AMD64G_CC_OP_BLSI64, /* 56 DEP1 = res, DEP2 = arg, NDEP = unused */ + + AMD64G_CC_OP_BLSMSK32,/* 57 */ + AMD64G_CC_OP_BLSMSK64,/* 58 DEP1 = res, DEP2 = arg, NDEP = unused */ + + AMD64G_CC_OP_BLSR32, /* 59 */ + AMD64G_CC_OP_BLSR64, /* 60 DEP1 = res, DEP2 = arg, NDEP = unused */ + AMD64G_CC_OP_NUMBER }; diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c index 488757fe04..c3cf1e20b5 100644 --- a/VEX/priv/guest_amd64_helpers.c +++ b/VEX/priv/guest_amd64_helpers.c @@ -492,6 +492,72 @@ static inline ULong idULong ( ULong x ) } \ } +/*-------------------------------------------------------------*/ + +#define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE) \ +{ \ + PREAMBLE(DATA_BITS); \ + { Long cf, pf, af, zf, sf, of; \ + cf = 0; \ + pf = 0; \ + af = 0; \ + zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ + sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ + of = 0; \ + return cf | pf | af | zf | sf | of; \ + } \ +} + +/*-------------------------------------------------------------*/ + +#define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE) \ +{ \ + PREAMBLE(DATA_BITS); \ + { Long cf, pf, af, zf, sf, of; \ + cf = ((DATA_UTYPE)CC_DEP2 != 0); \ + pf = 0; \ + af = 0; \ + zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ + sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ + of = 0; \ + return cf | pf | af | zf | sf | of; \ + } \ +} + +/*-------------------------------------------------------------*/ + +#define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE) \ +{ \ + PREAMBLE(DATA_BITS); \ + { Long cf, pf, af, zf, sf, of; \ + cf = ((DATA_UTYPE)CC_DEP2 == 0); \ + pf = 0; \ + af = 0; \ + zf = 0; \ + sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ + of = 0; \ + return cf | pf | af | zf | sf | of; \ + } \ +} + +/*-------------------------------------------------------------*/ + +#define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE) \ +{ \ + PREAMBLE(DATA_BITS); \ + { Long cf, pf, af, zf, sf, of; \ + cf = ((DATA_UTYPE)CC_DEP2 == 0); \ + pf = 0; \ + af = 0; \ + zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \ + sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \ + of = 0; \ + return cf | pf | af | zf | sf | of; \ + } \ +} + +/*-------------------------------------------------------------*/ + #if PROFILE_RFLAGS @@ -655,6 +721,18 @@ ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op, case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ; + case AMD64G_CC_OP_ANDN32: 
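// Reference sketch, for exposition only (not part of the patch): how the
// new BMI1 thunk encodings above are intended to be read.  DEP1 holds the
// instruction result, DEP2 the original source; PF, AF and OF are always
// cleared.  Shown for the 64-bit BLSI case, with <stdint.h> types standing
// in for VEX's ULong.
#include <stdint.h>

typedef struct { int cf, zf, sf; } BmiFlags;

static BmiFlags blsi64_flags_reference ( uint64_t arg )
{
   uint64_t res = arg & (0ULL - arg);   // BLSI: isolate lowest set bit
   BmiFlags f;
   f.cf = (arg != 0);                   // CF <- source was non-zero
   f.zf = (res == 0);                   // ZF <- result is zero
   f.sf = (int)((res >> 63) & 1);       // SF <- msb of result
   return f;                            // PF, AF, OF: always zero
}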
ACTIONS_ANDN( 32, UInt ); + case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong ); + + case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt ); + case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong ); + + case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt ); + case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong ); + + case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt ); + case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong ); + default: /* shouldn't really make these calls from generated code */ vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)" @@ -3139,6 +3217,36 @@ ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo, return res; } +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +ULong amd64g_calculate_pext ( ULong src_masked, ULong mask ) +{ + ULong dst = 0; + ULong src_bit; + ULong dst_bit = 1; + for (src_bit = 1; src_bit; src_bit <<= 1) { + if (mask & src_bit) { + if (src_masked & src_bit) dst |= dst_bit; + dst_bit <<= 1; + } + } + return dst; +} + +/* CALLED FROM GENERATED CODE: CLEAN HELPER */ +ULong amd64g_calculate_pdep ( ULong src, ULong mask ) +{ + ULong dst = 0; + ULong dst_bit; + ULong src_bit = 1; + for (dst_bit = 1; dst_bit; dst_bit <<= 1) { + if (mask & dst_bit) { + if (src & src_bit) dst |= dst_bit; + src_bit <<= 1; + } + } + return dst; +} + /*---------------------------------------------------------------*/ /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/ /*---------------------------------------------------------------*/ diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 2b200fc190..481f7531c2 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -1290,6 +1290,38 @@ const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm ) } +static +IRExpr* getIRegV ( Int sz, Prefix pfx ) +{ + if (sz == 4) { + sz = 8; + return unop(Iop_64to32, + IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ), + szToITy(sz) )); + } else { + return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ), + szToITy(sz) ); + } +} + +static +void putIRegV ( Int sz, Prefix pfx, IRExpr* e ) +{ + vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz)); + if (sz == 4) { + e = unop(Iop_32Uto64,e); + } + stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) ); +} + +static +const HChar* nameIRegV ( Int sz, Prefix pfx ) +{ + return nameIReg( sz, getVexNvvvv(pfx), False ); +} + + + /* Produce the guest state offset for a reference to the 'e' register field in a modrm byte, taking into account REX (or its absence), and the size of the access. eregOfRexRM will assert if mod_reg_rm @@ -2677,6 +2709,88 @@ IRTemp disAMode ( /*OUT*/Int* len, } +/* Similarly for VSIB addressing. This returns just the addend, + and fills in *rI and *vscale with the register number of the vector + index and its multiplicand. 
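   (Aside, for exposition only: a worked example of the two bit-deposit and
   bit-extract helpers defined above.  PDEP scatters the low-order bits of
   its first operand into the positions where the mask has 1s; PEXT gathers
   the masked bits back down into contiguous low bits.  uint64_t stands in
   for VEX's ULong, and the program assumes the helpers above are linked in.

   #include <stdio.h>
   #include <stdint.h>

   extern uint64_t amd64g_calculate_pdep ( uint64_t src, uint64_t mask );
   extern uint64_t amd64g_calculate_pext ( uint64_t src_masked, uint64_t mask );

   int main ( void )
   {
      // 0xAB spread into the set nibbles of mask 0xF0F0 gives 0xA0B0 ...
      printf("%llx\n", (unsigned long long)
                       amd64g_calculate_pdep(0xABULL, 0xF0F0ULL));
      // ... and PEXT with the same mask recovers 0xAB.
      printf("%llx\n", (unsigned long long)
                       amd64g_calculate_pext(0xA0B0ULL, 0xF0F0ULL));
      return 0;
   }

   End of aside.)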
*/ +static +IRTemp disAVSIBMode ( /*OUT*/Int* len, + VexAbiInfo* vbi, Prefix pfx, Long delta, + /*OUT*/HChar* buf, /*OUT*/UInt* rI, + IRType ty, /*OUT*/Int* vscale ) +{ + UChar mod_reg_rm = getUChar(delta); + const HChar *vindex; + + *len = 0; + *rI = 0; + *vscale = 0; + buf[0] = (UChar)0; + if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm)) + return IRTemp_INVALID; + + UChar sib = getUChar(delta+1); + UChar scale = toUChar((sib >> 6) & 3); + UChar index_r = toUChar((sib >> 3) & 7); + UChar base_r = toUChar(sib & 7); + Long d = 0; + /* correct since #(R13) == 8 + #(RBP) */ + Bool base_is_BPor13 = toBool(base_r == R_RBP); + delta += 2; + *len = 2; + + *rI = index_r | (getRexX(pfx) << 3); + if (ty == Ity_V128) + vindex = nameXMMReg(*rI); + else + vindex = nameYMMReg(*rI); + *vscale = 1<> 6) { + case 0: + if (base_is_BPor13) { + d = getSDisp32(delta); + *len += 4; + if (scale == 0) { + DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex); + } else { + DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<IR: sbb %%r,%%r optimisation(1)\n"); - putIRegG(size,pfx,rm, mkU(ty,0)); + putIRegG(size,pfx,rm, mkU(ty,0)); } assign( dst0, getIRegG(size,pfx,rm) ); @@ -3734,7 +3848,7 @@ ULong dis_Grp8_Imm ( VexAbiInfo* vbi, /* Write the result back, if non-BT. */ if (gregLO3ofRM(modrm) != 4 /* BT */) { if (epartIsReg(modrm)) { - putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m))); + putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m))); } else { if (pfx & PFX_LOCK) { casLE( mkexpr(t_addr), @@ -3931,7 +4045,7 @@ ULong dis_Grp3 ( VexAbiInfo* vbi, } else { addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /* we have to inform disAMode of any immediate - bytes used */ + bytes used */ gregLO3ofRM(modrm)==0/*TEST*/ ? imin(4,sz) : 0 @@ -4212,9 +4326,9 @@ ULong dis_Grp5 ( VexAbiInfo* vbi, putIReg64(R_RSP, mkexpr(t2) ); storeLE( mkexpr(t2), mkexpr(t3) ); break; - } else { + } else { goto unhandled; /* awaiting test case */ - } + } default: unhandled: *decode_OK = False; @@ -4673,6 +4787,34 @@ static IRTemp gen_LZCNT ( IRType ty, IRTemp src ) } +/* Generate an IR sequence to do a count-trailing-zeroes operation on + the supplied IRTemp, and return a new IRTemp holding the result. + 'ty' may be Ity_I16, Ity_I32 or Ity_I64 only. In the case where + the argument is zero, return the number of bits in the word (the + natural semantics). */ +static IRTemp gen_TZCNT ( IRType ty, IRTemp src ) +{ + vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16); + + IRTemp src64 = newTemp(Ity_I64); + assign(src64, widenUto64( mkexpr(src) )); + + // Ctz64 has undefined semantics when its input is zero, so + // special-case around that. + IRTemp res64 = newTemp(Ity_I64); + assign(res64, + IRExpr_ITE( + binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)), + mkU64(8 * sizeofIRType(ty)), + unop(Iop_Ctz64, mkexpr(src64)) + )); + + IRTemp res = newTemp(ty); + assign(res, narrowTo(ty, mkexpr(res64))); + return res; +} + + /*------------------------------------------------------------*/ /*--- ---*/ /*--- x87 FLOATING POINT INSTRUCTIONS ---*/ @@ -5248,7 +5390,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok, issue. If needed, side-exit to the next insn, reporting the warning, so that Valgrind's dispatcher sees the warning. 
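   (Aside, for exposition only: the VSIB byte consumed by disAVSIBMode above
   splits into scale/index/base exactly like an ordinary SIB byte, except
   that the index field, extended by VEX.X, names a vector register rather
   than a GPR.  A minimal decode, with uint8_t standing in for VEX's UChar:

   #include <stdint.h>

   typedef struct { int scale; int index; int base; } VSIBFields;

   static VSIBFields decode_vsib ( uint8_t sib, int vex_x )
   {
      VSIBFields f;
      f.scale = 1 << ((sib >> 6) & 3);                 // multiplicand: 1, 2, 4 or 8
      f.index = (int)((sib >> 3) & 7) | (vex_x << 3);  // xmm/ymm index register
      f.base  = (int)(sib & 7);                        // GPR base, subject to REX.B
      return f;
   }

   disAVSIBMode performs the same split and additionally folds the base
   register and displacement into the returned addend.  End of aside.)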
*/ - assign(ew, unop(Iop_64to32,mkexpr(w64)) ); + assign(ew, unop(Iop_64to32,mkexpr(w64)) ); put_emwarn( mkexpr(ew) ); stmt( IRStmt_Exit( @@ -7512,7 +7654,7 @@ ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi, binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)), binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)) )); - /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */ + /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */ assign( res64, binop(Iop_Shr64, binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)), @@ -8142,8 +8284,7 @@ ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok, putIRegG(sz, pfx, rm, mkexpr(tmpd)); putIRegE(sz, pfx, rm, mkexpr(tmpt1)); DIP("xadd%c %s, %s\n", - nameISize(sz), nameIRegG(sz,pfx,rm), - nameIRegE(sz,pfx,rm)); + nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm)); *decode_ok = True; return 1+delta0; } @@ -8570,7 +8711,7 @@ static ULong dis_SSEint_E_to_G( } putXMMReg( gregOfRexRM(pfx,rm), eLeft ? binop(op, epart, gpart) - : binop(op, gpart, epart) ); + : binop(op, gpart, epart) ); return delta; } @@ -8743,7 +8884,7 @@ static Long dis_SSE_cmp_E_to_G ( VexAbiInfo* vbi, ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr))) : /*sz==4*/ unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))) - ) + ) ); delta += alen+1; DIP("%s $%d,%s,%s\n", opname, @@ -9267,6 +9408,31 @@ static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) { return math_PABS_XMM(aa, 1); } +/* YMM version of math_PABS_XMM. */ +static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB ) +{ + IRTemp res = newTemp(Ity_V256); + IRTemp aaHi = IRTemp_INVALID; + IRTemp aaLo = IRTemp_INVALID; + breakupV256toV128s(aa, &aaHi, &aaLo); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PABS_XMM(aaHi, laneszB)), + mkexpr(math_PABS_XMM(aaLo, laneszB)))); + return res; +} + +static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) { + return math_PABS_YMM(aa, 4); +} + +static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) { + return math_PABS_YMM(aa, 2); +} + +static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) { + return math_PABS_YMM(aa, 1); +} + static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64, IRTemp lo64, Long byteShift ) { @@ -9634,6 +9800,47 @@ static Long dis_PSHUFD_32x4 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PSHUFD_32x8 ( VexAbiInfo* vbi, Prefix pfx, Long delta ) +{ + Int order; + Int alen = 0; + HChar dis_buf[50]; + IRTemp sV = newTemp(Ity_V256); + UChar modrm = getUChar(delta); + IRTemp addr = IRTemp_INVALID; + UInt rG = gregOfRexRM(pfx,modrm); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + order = (Int)getUChar(delta+1); + delta += 1+1; + DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, + 1/*byte after the amode*/ ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + order = (Int)getUChar(delta+alen); + delta += alen+1; + DIP("vpshufd $%d,%s,%s\n", order, dis_buf, nameYMMReg(rG)); + } + + IRTemp s[8]; + s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID; + breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4], + &s[3], &s[2], &s[1], &s[0] ); + + putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)], + s[4 + ((order>>4)&3)], + s[4 + ((order>>2)&3)], + s[4 + ((order>>0)&3)], + s[0 + ((order>>6)&3)], + s[0 + ((order>>4)&3)], + s[0 + ((order>>2)&3)], + s[0 + ((order>>0)&3)] ) ); + return delta; +} + + static IRTemp math_PSRLDQ ( IRTemp sV, Int imm ) { IRTemp dV = newTemp(Ity_V128); @@ -10280,6 +10487,28 @@ static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PMOVMSKB_256 ( 
VexAbiInfo* vbi, Prefix pfx, + Long delta ) +{ + UChar modrm = getUChar(delta); + vassert(epartIsReg(modrm)); /* ensured by caller */ + UInt rE = eregOfRexRM(pfx,modrm); + UInt rG = gregOfRexRM(pfx,modrm); + IRTemp t0 = newTemp(Ity_V128); + IRTemp t1 = newTemp(Ity_V128); + IRTemp t2 = newTemp(Ity_I16); + IRTemp t3 = newTemp(Ity_I16); + assign(t0, getYMMRegLane128(rE, 0)); + assign(t1, getYMMRegLane128(rE, 1)); + assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0))); + assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1))); + putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2))); + DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG)); + delta += 1; + return delta; +} + + /* FIXME: why not just use InterleaveLO / InterleaveHI? I think the relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */ /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */ @@ -10542,6 +10771,22 @@ static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV ) } +static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV ) +{ + /* This is a really poor translation -- could be improved if + performance critical */ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PMULUDQ_128(sHi, dHi)), + mkexpr(math_PMULUDQ_128(sLo, dLo)))); + return res; +} + + static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV ) { /* This is a really poor translation -- could be improved if @@ -10558,6 +10803,22 @@ static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV ) } +static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV ) +{ + /* This is a really poor translation -- could be improved if + performance critical */ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PMULDQ_128(sHi, dHi)), + mkexpr(math_PMULDQ_128(sLo, dLo)))); + return res; +} + + static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV ) { IRTemp sVhi, sVlo, dVhi, dVlo; @@ -10580,6 +10841,20 @@ static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV ) } +static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV ) +{ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PMADDWD_128(dHi, sHi)), + mkexpr(math_PMADDWD_128(dLo, sLo)))); + return res; +} + + static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV ) { IRTemp addV = newTemp(Ity_V128); @@ -10713,6 +10988,54 @@ static Long dis_PSHUFxW_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handle 256 bit PSHUFLW and PSHUFHW. */ +static Long dis_PSHUFxW_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsH ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt imm8; + IRTemp sV, s[8], sV64[4], dVhi, dVlo; + sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID; + s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID; + sV = newTemp(Ity_V256); + dVhi = newTemp(Ity_I64); + dVlo = newTemp(Ity_I64); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + imm8 = (UInt)getUChar(delta+1); + delta += 1+1; + DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 
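// Reference semantics, for exposition only, of the VPMOVMSKB translation
// above: bit i of the 32-bit result is the most significant bit of byte i
// of the YMM source, with the high 128-bit lane supplying bits 16..31
// (hence the two GetMSBs8x16 ops glued together with 16HLto32).
#include <stdint.h>

static uint32_t vpmovmskb256_reference ( const uint8_t src[32] )
{
   uint32_t mask = 0;
   for (int i = 0; i < 32; i++)
      mask |= (uint32_t)(src[i] >> 7) << i;
   return mask;
}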
'h' : 'l', + imm8, nameYMMReg(rE), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + imm8 = (UInt)getUChar(delta+alen); + delta += alen+1; + DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l', + imm8, dis_buf, nameYMMReg(rG)); + } + + breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] ); + breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] ); + breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] ); + + assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)], + s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) ); + assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)], + s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) ); + putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3], + xIsH ? sV64[2] : dVhi, + xIsH ? dVlo : sV64[1], + xIsH ? sV64[0] : dVlo ) ); + return delta; +} + + static Long dis_PEXTRW_128_EregOnly_toG ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) { @@ -10923,6 +11246,20 @@ static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV ) } +static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV ) +{ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PSADBW_128(dHi, sHi)), + mkexpr(math_PSADBW_128(dLo, sLo)))); + return res; +} + + static Long dis_MASKMOVDQU ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) { @@ -11259,9 +11596,9 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, modrm = getUChar(delta); if (epartIsReg(modrm)) { putXMMReg( eregOfRexRM(pfx,modrm), - getXMMReg( gregOfRexRM(pfx,modrm) ) ); + getXMMReg( gregOfRexRM(pfx,modrm) ) ); DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), - nameXMMReg(eregOfRexRM(pfx,modrm))); + nameXMMReg(eregOfRexRM(pfx,modrm))); delta += 1; } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); @@ -11607,9 +11944,9 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, modrm = getUChar(delta); if (epartIsReg(modrm)) { putXMMReg( eregOfRexRM(pfx,modrm), - getXMMReg( gregOfRexRM(pfx,modrm) ) ); + getXMMReg( gregOfRexRM(pfx,modrm) ) ); DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), - nameXMMReg(eregOfRexRM(pfx,modrm))); + nameXMMReg(eregOfRexRM(pfx,modrm))); delta += 1; } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); @@ -12618,7 +12955,7 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, # define SEL(n) \ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? 
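// Reference semantics, for exposition only, of the 256-bit PSHUFLW/PSHUFHW
// translation above (dis_PSHUFxW_256): each 128-bit lane is handled
// independently; imm8's four 2-bit selectors rebuild one 64-bit half of the
// lane and the other half is copied through unchanged.  Shown for the
// low-half ("PSHUFLW") case on a single lane.
#include <stdint.h>

static void vpshuflw_lane_reference ( uint16_t lane[8], uint8_t imm8 )
{
   uint16_t w[4] = { lane[0], lane[1], lane[2], lane[3] };
   for (int i = 0; i < 4; i++)
      lane[i] = w[(imm8 >> (2*i)) & 3];   // shuffle the low four words
   // lane[4..7] are left untouched; PSHUFHW is the mirror image
}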
s2 : s3))) assign(dV, - mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3), + mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3), SEL((order>>2)&3), SEL((order>>0)&3) ) ); putMMXReg(gregLO3ofRM(modrm), mkexpr(dV)); @@ -12799,12 +13136,12 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK, getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) ); DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), nameIReg32(eregOfRexRM(pfx,modrm))); - } else { + } else { putIReg64( eregOfRexRM(pfx,modrm), getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) ); DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), nameIReg64(eregOfRexRM(pfx,modrm))); - } + } } else { addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); delta += alen; @@ -14451,6 +14788,21 @@ IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ ) } +static +IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ ) +{ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PSHUFB_XMM(dHi, sHi)), + mkexpr(math_PSHUFB_XMM(dLo, sLo)))); + return res; +} + + static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, UChar opc ) { @@ -14490,14 +14842,16 @@ static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, if (epartIsReg(modrm)) { UInt rE = eregOfRexRM(pfx,modrm); assign( sV, getXMMReg(rE) ); - DIP("ph%s %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG)); + DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str, + nameXMMReg(rE), nameXMMReg(rG)); delta += 1; } else { addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); if (!isAvx) gen_SEGV_if_not_16_aligned( addr ); assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); - DIP("ph%s %s,%s\n", str, dis_buf, nameXMMReg(rG)); + DIP("%sph%s %s,%s\n", isAvx ? 
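// Reference semantics, for exposition only, of the per-lane behaviour that
// math_PSHUFB_YMM above relies on: the 256-bit form permutes each 128-bit
// half independently, with the same rule as the existing 128-bit PSHUFB --
// a set top bit in the permutation byte zeroes the destination byte.
#include <stdint.h>

static void pshufb_lane_reference ( uint8_t dst[16],
                                    const uint8_t src[16],
                                    const uint8_t perm[16] )
{
   for (int i = 0; i < 16; i++)
      dst[i] = (perm[i] & 0x80) ? 0 : src[perm[i] & 0x0F];
}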
"v" : "", str, + dis_buf, nameXMMReg(rG)); delta += alen; } @@ -14523,6 +14877,78 @@ static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, } +static Long dis_PHADD_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + const HChar* str = "???"; + IROp opV64 = Iop_INVALID; + IROp opCatO = Iop_CatOddLanes16x4; + IROp opCatE = Iop_CatEvenLanes16x4; + IRTemp sV = newTemp(Ity_V256); + IRTemp dV = newTemp(Ity_V256); + IRTemp s3, s2, s1, s0, d3, d2, d1, d0; + s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + + switch (opc) { + case 0x01: opV64 = Iop_Add16x4; str = "addw"; break; + case 0x02: opV64 = Iop_Add32x2; str = "addd"; break; + case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break; + case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break; + case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break; + case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break; + default: vassert(0); + } + if (opc == 0x02 || opc == 0x06) { + opCatO = Iop_InterleaveHI32x2; + opCatE = Iop_InterleaveLO32x2; + } + + assign( dV, getYMMReg(rV) ); + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG)); + delta += 1; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG)); + delta += alen; + } + + breakupV256to64s( dV, &d3, &d2, &d1, &d0 ); + breakupV256to64s( sV, &s3, &s2, &s1, &s0 ); + + /* This isn't a particularly efficient way to compute the + result, but at least it avoids a proliferation of IROps, + hence avoids complication all the backends. */ + + putYMMReg( rG, + binop(Iop_V128HLtoV256, + binop(Iop_64HLtoV128, + binop(opV64, + binop(opCatE,mkexpr(s3),mkexpr(s2)), + binop(opCatO,mkexpr(s3),mkexpr(s2)) ), + binop(opV64, + binop(opCatE,mkexpr(d3),mkexpr(d2)), + binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ), + binop(Iop_64HLtoV128, + binop(opV64, + binop(opCatE,mkexpr(s1),mkexpr(s0)), + binop(opCatO,mkexpr(s1),mkexpr(s0)) ), + binop(opV64, + binop(opCatE,mkexpr(d1),mkexpr(d0)), + binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) ); + return delta; +} + + static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV ) { IRTemp sVoddsSX = newTemp(Ity_V128); @@ -14549,6 +14975,21 @@ static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV ) } +static +IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV ) +{ + IRTemp sHi, sLo, dHi, dLo; + sHi = sLo = dHi = dLo = IRTemp_INVALID; + breakupV256toV128s( dV, &dHi, &dLo); + breakupV256toV128s( sV, &sHi, &sLo); + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_V128HLtoV256, + mkexpr(math_PMADDUBSW_128(dHi, sHi)), + mkexpr(math_PMADDUBSW_128(dLo, sLo)))); + return res; +} + + __attribute__((noinline)) static Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK, @@ -15257,32 +15698,31 @@ Long dis_ESC_0F__SSE4 ( Bool* decode_OK, } break; - case 0xBD: - /* F3 0F BD -- LZCNT (count leading zeroes. An AMD extension, - which we can only decode if we're sure this is an AMD cpu - that supports LZCNT, since otherwise it's BSR, which behaves - differently. Bizarrely, my Sandy Bridge also accepts these - instructions but produces different results. */ + case 0xBC: + /* F3 0F BC -- TZCNT (count trailing zeroes. 
A BMI extension, + which we can only decode if we're sure this is a BMI1 capable cpu + that supports TZCNT, since otherwise it's BSF, which behaves + differently on zero source. */ if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */ - && (sz == 2 || sz == 4 || sz == 8) - && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) { + && (sz == 2 || sz == 4 || sz == 8) + && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) { /*IRType*/ ty = szToITy(sz); IRTemp src = newTemp(ty); modrm = getUChar(delta); if (epartIsReg(modrm)) { assign(src, getIRegE(sz, pfx, modrm)); delta += 1; - DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm), + DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm), nameIRegG(sz, pfx, modrm)); } else { addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0); assign(src, loadLE(ty, mkexpr(addr))); delta += alen; - DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf, + DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf, nameIRegG(sz, pfx, modrm)); } - IRTemp res = gen_LZCNT(ty, src); + IRTemp res = gen_TZCNT(ty, src); putIRegG(sz, pfx, modrm, mkexpr(res)); // Update flags. This is pretty lame .. perhaps can do better @@ -15318,18 +15758,79 @@ Long dis_ESC_0F__SSE4 ( Bool* decode_OK, } break; - default: - break; - - } - - //decode_failure: - *decode_OK = False; - return deltaIN; - - decode_success: - *decode_OK = True; - return delta; + case 0xBD: + /* F3 0F BD -- LZCNT (count leading zeroes. An AMD extension, + which we can only decode if we're sure this is an AMD cpu + that supports LZCNT, since otherwise it's BSR, which behaves + differently. Bizarrely, my Sandy Bridge also accepts these + instructions but produces different results. */ + if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */ + && (sz == 2 || sz == 4 || sz == 8) + && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) { + /*IRType*/ ty = szToITy(sz); + IRTemp src = newTemp(ty); + modrm = getUChar(delta); + if (epartIsReg(modrm)) { + assign(src, getIRegE(sz, pfx, modrm)); + delta += 1; + DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm), + nameIRegG(sz, pfx, modrm)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0); + assign(src, loadLE(ty, mkexpr(addr))); + delta += alen; + DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf, + nameIRegG(sz, pfx, modrm)); + } + + IRTemp res = gen_LZCNT(ty, src); + putIRegG(sz, pfx, modrm, mkexpr(res)); + + // Update flags. This is pretty lame .. perhaps can do better + // if this turns out to be performance critical. + // O S A P are cleared. Z is set if RESULT == 0. + // C is set if SRC is zero. 
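// Reference sketch, for exposition only, of the rflags value the IR below
// assembles for TZCNT (and, further down, LZCNT): only ZF and CF can be
// set, from the result and the source respectively; O, S, A and P are
// cleared.  The shift amounts correspond to AMD64G_CC_SHIFT_Z and
// AMD64G_CC_SHIFT_C and are passed in rather than restated here.
#include <stdint.h>

static uint64_t tzcnt_flags_reference ( uint64_t src, uint64_t res,
                                        int shift_z, int shift_c )
{
   uint64_t oszacp = 0;
   if (res == 0) oszacp |= 1ULL << shift_z;   // ZF: result is zero
   if (src == 0) oszacp |= 1ULL << shift_c;   // CF: source was zero
   return oszacp;                             // O, S, A, P: zero
}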
+ IRTemp src64 = newTemp(Ity_I64); + IRTemp res64 = newTemp(Ity_I64); + assign(src64, widenUto64(mkexpr(src))); + assign(res64, widenUto64(mkexpr(res))); + + IRTemp oszacp = newTemp(Ity_I64); + assign( + oszacp, + binop(Iop_Or64, + binop(Iop_Shl64, + unop(Iop_1Uto64, + binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))), + mkU8(AMD64G_CC_SHIFT_Z)), + binop(Iop_Shl64, + unop(Iop_1Uto64, + binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))), + mkU8(AMD64G_CC_SHIFT_C)) + ) + ); + + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) )); + + goto decode_success; + } + break; + + default: + break; + + } + + //decode_failure: + *decode_OK = False; + return deltaIN; + + decode_success: + *decode_OK = True; + return delta; } @@ -15722,6 +16223,47 @@ static Long dis_PMOVxXBW_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handles 256 bit versions of PMOVZXBW and PMOVSXBW. */ +static Long dis_PMOVxXBW_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsZ ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UChar how = xIsZ ? 'z' : 's'; + UInt rG = gregOfRexRM(pfx, modrm); + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) ); + delta += alen; + DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) ); + } + + /* First do zero extend. */ + IRExpr* res + = binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI8x16, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ), + binop( Iop_InterleaveLO8x16, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) ); + /* And if needed sign extension as well. */ + if (!xIsZ) + res = binop( Iop_SarN16x16, + binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) ); + + putYMMReg ( rG, res ); + + return delta; +} + + static Long dis_PMOVxXWD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, Bool xIsZ ) { @@ -15761,6 +16303,45 @@ static Long dis_PMOVxXWD_128 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PMOVxXWD_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsZ ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UChar how = xIsZ ? 
'z' : 's'; + UInt rG = gregOfRexRM(pfx, modrm); + + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) ); + delta += alen; + DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) ); + } + + IRExpr* res + = binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI16x8, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ), + binop( Iop_InterleaveLO16x8, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) ); + if (!xIsZ) + res = binop(Iop_SarN32x8, + binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16)); + + putYMMReg ( rG, res ); + + return delta; +} + + static Long dis_PMOVSXWQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) { @@ -15794,6 +16375,41 @@ static Long dis_PMOVSXWQ_128 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PMOVSXWQ_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcBytes = newTemp(Ity_I64); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp s3, s2, s1, s0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + + if ( epartIsReg( modrm ) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcBytes, getXMMRegLane64( rE, 0 ) ); + delta += 1; + DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) ); + delta += alen; + DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) ); + } + + breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 ); + putYMMReg( rG, binop( Iop_V128HLtoV256, + binop( Iop_64HLtoV128, + unop( Iop_16Sto64, mkexpr(s3) ), + unop( Iop_16Sto64, mkexpr(s2) ) ), + binop( Iop_64HLtoV128, + unop( Iop_16Sto64, mkexpr(s1) ), + unop( Iop_16Sto64, mkexpr(s0) ) ) ) ); + return delta; +} + + static Long dis_PMOVZXWQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) { @@ -15830,6 +16446,45 @@ static Long dis_PMOVZXWQ_128 ( VexAbiInfo* vbi, Prefix pfx, } +static Long dis_PMOVZXWQ_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + + if ( epartIsReg( modrm ) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, + unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) ); + delta += alen; + DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) ); + } + + IRTemp zeroVec = newTemp( Ity_V128 ); + assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) ); + + putYMMReg( rG, binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI16x8, + mkexpr(zeroVec), + binop( Iop_InterleaveLO16x8, + mkexpr(zeroVec), mkexpr(srcVec) ) ), + binop( Iop_InterleaveLO16x8, + mkexpr(zeroVec), + binop( Iop_InterleaveLO16x8, + mkexpr(zeroVec), mkexpr(srcVec) ) ) ) ); + return delta; +} + + /* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ. */ static Long dis_PMOVxXDQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, Bool xIsZ ) @@ -15877,6 +16532,59 @@ static Long dis_PMOVxXDQ_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ. 
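   (Aside, for exposition only: the widening pattern shared by these 256-bit
   PMOV{Z,S}X translations.  Zero-extension comes from interleaving the
   source with a zero vector; the signed forms then recover the sign with a
   shift-left / arithmetic-shift-right pair.  Shown for word-to-dword, with
   <stdint.h> types standing in for the IR vectors:

   #include <stdint.h>

   static void pmovxxwd_reference ( int32_t dst[8], const uint16_t src[8],
                                    int sign_extend )
   {
      for (int i = 0; i < 8; i++) {
         // zero-extend: in IR, interleave with a zero V128
         uint32_t zx = (uint32_t)src[i];
         // signed forms: the IR's shl 16 / sar 16 pair has the same effect
         // as the cast chain below
         dst[i] = sign_extend ? (int32_t)(int16_t)src[i] : (int32_t)zx;
      }
   }

   End of aside.)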
*/ +static Long dis_PMOVxXDQ_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsZ ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UChar how = xIsZ ? 'z' : 's'; + UInt rG = gregOfRexRM(pfx, modrm); + /* Compute both srcI64 -- the value to expand -- and srcVec -- same + thing in a V128, with arbitrary junk in the top 64 bits. Use + one or both of them and let iropt clean up afterwards (as + usual). */ + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) ); + delta += alen; + DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) ); + } + + IRExpr* res; + if (xIsZ) + res = binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI32x4, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ), + binop( Iop_InterleaveLO32x4, + IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) ); + else { + IRTemp s3, s2, s1, s0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 ); + res = binop( Iop_V128HLtoV256, + binop( Iop_64HLtoV128, + unop( Iop_32Sto64, mkexpr(s3) ), + unop( Iop_32Sto64, mkexpr(s2) ) ), + binop( Iop_64HLtoV128, + unop( Iop_32Sto64, mkexpr(s1) ), + unop( Iop_32Sto64, mkexpr(s0) ) ) ); + } + + putYMMReg ( rG, res ); + + return delta; +} + + /* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */ static Long dis_PMOVxXBD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx, Bool xIsZ ) @@ -15920,6 +16628,53 @@ static Long dis_PMOVxXBD_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handles 256 bit versions of PMOVZXBD and PMOVSXBD. */ +static Long dis_PMOVxXBD_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool xIsZ ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UChar how = xIsZ ? 'z' : 's'; + UInt rG = gregOfRexRM(pfx, modrm); + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, + unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) ); + delta += alen; + DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) ); + } + + IRTemp zeroVec = newTemp(Ity_V128); + assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) ); + + IRExpr* res + = binop( Iop_V128HLtoV256, + binop(Iop_InterleaveHI8x16, + mkexpr(zeroVec), + binop(Iop_InterleaveLO8x16, + mkexpr(zeroVec), mkexpr(srcVec)) ), + binop(Iop_InterleaveLO8x16, + mkexpr(zeroVec), + binop(Iop_InterleaveLO8x16, + mkexpr(zeroVec), mkexpr(srcVec)) ) ); + if (!xIsZ) + res = binop(Iop_SarN32x8, + binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24)); + + putYMMReg ( rG, res ); + + return delta; +} + + /* Handles 128 bit versions of PMOVSXBQ. */ static Long dis_PMOVSXBQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) @@ -15953,6 +16708,52 @@ static Long dis_PMOVSXBQ_128 ( VexAbiInfo* vbi, Prefix pfx, } +/* Handles 256 bit versions of PMOVSXBQ. 
*/ +static Long dis_PMOVSXBQ_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + IRTemp srcBytes = newTemp(Ity_I32); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcBytes, getXMMRegLane32( rE, 0 ) ); + delta += 1; + DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) ); + delta += alen; + DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) ); + } + + putYMMReg + ( rG, binop( Iop_V128HLtoV256, + binop( Iop_64HLtoV128, + unop( Iop_8Sto64, + unop( Iop_16HIto8, + unop( Iop_32HIto16, + mkexpr(srcBytes) ) ) ), + unop( Iop_8Sto64, + unop( Iop_16to8, + unop( Iop_32HIto16, + mkexpr(srcBytes) ) ) ) ), + binop( Iop_64HLtoV128, + unop( Iop_8Sto64, + unop( Iop_16HIto8, + unop( Iop_32to16, + mkexpr(srcBytes) ) ) ), + unop( Iop_8Sto64, + unop( Iop_16to8, + unop( Iop_32to16, + mkexpr(srcBytes) ) ) ) ) ) ); + return delta; +} + + /* Handles 128 bit versions of PMOVZXBQ. */ static Long dis_PMOVZXBQ_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta, Bool isAvx ) @@ -15992,16 +16793,61 @@ static Long dis_PMOVZXBQ_128 ( VexAbiInfo* vbi, Prefix pfx, } -static Long dis_PHMINPOSUW_128 ( VexAbiInfo* vbi, Prefix pfx, - Long delta, Bool isAvx ) +/* Handles 256 bit versions of PMOVZXBQ. */ +static Long dis_PMOVZXBQ_256 ( VexAbiInfo* vbi, Prefix pfx, + Long delta ) { - IRTemp addr = IRTemp_INVALID; - Int alen = 0; + IRTemp addr = IRTemp_INVALID; + Int alen = 0; HChar dis_buf[50]; - UChar modrm = getUChar(delta); - const HChar* mbV = isAvx ? "v" : ""; - IRTemp sV = newTemp(Ity_V128); - IRTemp sHi = newTemp(Ity_I64); + IRTemp srcVec = newTemp(Ity_V128); + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + if ( epartIsReg(modrm) ) { + UInt rE = eregOfRexRM(pfx, modrm); + assign( srcVec, getXMMReg(rE) ); + delta += 1; + DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( srcVec, + unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ))); + delta += alen; + DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) ); + } + + IRTemp zeroVec = newTemp(Ity_V128); + assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) ); + + putYMMReg + ( rG, binop( Iop_V128HLtoV256, + binop( Iop_InterleaveHI8x16, + mkexpr(zeroVec), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), mkexpr(srcVec) ) ) ), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), + binop( Iop_InterleaveLO8x16, + mkexpr(zeroVec), mkexpr(srcVec) ) ) ) + ) ); + return delta; +} + + +static Long dis_PHMINPOSUW_128 ( VexAbiInfo* vbi, Prefix pfx, + Long delta, Bool isAvx ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + UChar modrm = getUChar(delta); + const HChar* mbV = isAvx ? 
"v" : ""; + IRTemp sV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); IRTemp sLo = newTemp(Ity_I64); IRTemp dLo = newTemp(Ity_I64); UInt rG = gregOfRexRM(pfx,modrm); @@ -19427,7 +20273,7 @@ Long dis_ESC_NONE ( cond = mkAnd1(cond, zbit); break; default: - vassert(0); + vassert(0); } stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) ); @@ -20258,9 +21104,17 @@ Long dis_ESC_0F ( return delta; case 0xBC: /* BSF Gv,Ev */ - if (haveF2(pfx)) goto decode_failure; - delta = dis_bs_E_G ( vbi, pfx, sz, delta, True ); - return delta; + if (!haveF2orF3(pfx) + || (haveF3noF2(pfx) + && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) { + /* no-F2 no-F3 0F BC = BSF + or F3 0F BC = REP; BSF on older CPUs. */ + delta = dis_bs_E_G ( vbi, pfx, sz, delta, True ); + return delta; + } + /* Fall through, since F3 0F BC is TZCNT, and needs to + be handled by dis_ESC_0F__SSE4. */ + break; case 0xBD: /* BSR Gv,Ev */ if (!haveF2orF3(pfx) @@ -20913,6 +21767,192 @@ static ULong dis_AVX128_shiftV_byE ( VexAbiInfo* vbi, } +/* Vector by scalar shift of V by the amount specified at the bottom + of E. */ +static ULong dis_AVX256_shiftV_byE ( VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, IROp op ) +{ + HChar dis_buf[50]; + Int alen, size; + IRTemp addr; + Bool shl, shr, sar; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx);; + IRTemp g0 = newTemp(Ity_V256); + IRTemp g1 = newTemp(Ity_V256); + IRTemp amt = newTemp(Ity_I64); + IRTemp amt8 = newTemp(Ity_I8); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( amt, getXMMRegLane64(rE, 0) ); + DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE), + nameYMMReg(rV), nameYMMReg(rG) ); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( amt, loadLE(Ity_I64, mkexpr(addr)) ); + DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) ); + delta += alen; + } + assign( g0, getYMMReg(rV) ); + assign( amt8, unop(Iop_64to8, mkexpr(amt)) ); + + shl = shr = sar = False; + size = 0; + switch (op) { + case Iop_ShlN16x16: shl = True; size = 32; break; + case Iop_ShlN32x8: shl = True; size = 32; break; + case Iop_ShlN64x4: shl = True; size = 64; break; + case Iop_SarN16x16: sar = True; size = 16; break; + case Iop_SarN32x8: sar = True; size = 32; break; + case Iop_ShrN16x16: shr = True; size = 16; break; + case Iop_ShrN32x8: shr = True; size = 32; break; + case Iop_ShrN64x4: shr = True; size = 64; break; + default: vassert(0); + } + + if (shl || shr) { + assign( + g1, + IRExpr_ITE( + binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)), + binop(op, mkexpr(g0), mkexpr(amt8)), + binop(Iop_V128HLtoV256, mkV128(0), mkV128(0)) + ) + ); + } else + if (sar) { + assign( + g1, + IRExpr_ITE( + binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)), + binop(op, mkexpr(g0), mkexpr(amt8)), + binop(op, mkexpr(g0), mkU8(size-1)) + ) + ); + } else { + vassert(0); + } + + putYMMReg( rG, mkexpr(g1) ); + return delta; +} + + +/* Vector by vector shift of V by the amount specified at the bottom + of E. Vector by vector shifts are defined for all shift amounts, + so not using Iop_S*x* here (and SSE2 doesn't support variable shifts + anyway). */ +static ULong dis_AVX_var_shiftV_byE ( VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, IROp op, Bool isYMM ) +{ + HChar dis_buf[50]; + Int alen, size, i; + IRTemp addr; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx);; + IRTemp sV = isYMM ? 
newTemp(Ity_V256) : newTemp(Ity_V128); + IRTemp amt = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128); + IRTemp amts[8], sVs[8], res[8]; + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) ); + if (isYMM) { + DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE), + nameYMMReg(rV), nameYMMReg(rG) ); + } else { + DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE), + nameXMMReg(rV), nameXMMReg(rG) ); + } + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) ); + if (isYMM) { + DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), + nameYMMReg(rG) ); + } else { + DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), + nameXMMReg(rG) ); + } + delta += alen; + } + assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) ); + + size = 0; + switch (op) { + case Iop_Shl32: size = 32; break; + case Iop_Shl64: size = 64; break; + case Iop_Sar32: size = 32; break; + case Iop_Shr32: size = 32; break; + case Iop_Shr64: size = 64; break; + default: vassert(0); + } + + for (i = 0; i < 8; i++) { + sVs[i] = IRTemp_INVALID; + amts[i] = IRTemp_INVALID; + } + switch (size) { + case 32: + if (isYMM) { + breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4], + &sVs[3], &sVs[2], &sVs[1], &sVs[0] ); + breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4], + &amts[3], &amts[2], &amts[1], &amts[0] ); + } else { + breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] ); + breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] ); + } + break; + case 64: + if (isYMM) { + breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] ); + breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] ); + } else { + breakupV128to64s( sV, &sVs[1], &sVs[0] ); + breakupV128to64s( amt, &amts[1], &amts[0] ); + } + break; + default: vassert(0); + } + for (i = 0; i < 8; i++) + if (sVs[i] != IRTemp_INVALID) { + res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64); + assign( res[i], + IRExpr_ITE( + binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U, + mkexpr(amts[i]), + size == 32 ? mkU32(size) : mkU64(size)), + binop(op, mkexpr(sVs[i]), + unop(size == 32 ? Iop_32to8 : Iop_64to8, + mkexpr(amts[i]))), + op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1)) + : size == 32 ? mkU32(0) : mkU64(0) + )); + } + switch (size) { + case 32: + for (i = 0; i < 8; i++) + putYMMRegLane32( rG, i, (i < 4 || isYMM) + ? mkexpr(res[i]) : mkU32(0) ); + break; + case 64: + for (i = 0; i < 4; i++) + putYMMRegLane64( rG, i, (i < 2 || isYMM) + ? mkexpr(res[i]) : mkU64(0) ); + break; + default: vassert(0); + } + + return delta; +} + + /* Vector by scalar shift of E into V, by an immediate byte. Modified version of dis_SSE_shiftE_imm. */ static @@ -20970,6 +22010,64 @@ Long dis_AVX128_shiftE_to_V_imm( Prefix pfx, } +/* Vector by scalar shift of E into V, by an immediate byte. Modified + version of dis_AVX128_shiftE_to_V_imm. 
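   (Aside, for exposition only: the shift helpers above share one
   out-of-range rule -- a count greater than or equal to the element width
   zeroes the element for logical shifts and fills it with the sign bit for
   arithmetic shifts.  For the variable-count forms (VPSLLVD, VPSRLVD,
   VPSRAVD and the 64-bit variants) each element additionally takes its
   count from the matching element of the second operand:

   #include <stdint.h>

   static void vpsrlvd_reference ( uint32_t dst[8], const uint32_t src[8],
                                   const uint32_t counts[8] )
   {
      for (int i = 0; i < 8; i++)
         dst[i] = (counts[i] >= 32) ? 0u : (src[i] >> counts[i]);
   }

   static void vpsravd_reference ( int32_t dst[8], const int32_t src[8],
                                   const uint32_t counts[8] )
   {
      // assumes '>>' on signed values is an arithmetic shift, as it is on
      // every target VEX supports
      for (int i = 0; i < 8; i++)
         dst[i] = src[i] >> (counts[i] >= 32 ? 31 : counts[i]);
   }

   End of aside.)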
*/ +static +Long dis_AVX256_shiftE_to_V_imm( Prefix pfx, + Long delta, const HChar* opname, IROp op ) +{ + Bool shl, shr, sar; + UChar rm = getUChar(delta); + IRTemp e0 = newTemp(Ity_V256); + IRTemp e1 = newTemp(Ity_V256); + UInt rD = getVexNvvvv(pfx); + UChar amt, size; + vassert(epartIsReg(rm)); + vassert(gregLO3ofRM(rm) == 2 + || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6); + amt = getUChar(delta+1); + delta += 2; + DIP("%s $%d,%s,%s\n", opname, + (Int)amt, + nameYMMReg(eregOfRexRM(pfx,rm)), + nameYMMReg(rD)); + assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) ); + + shl = shr = sar = False; + size = 0; + switch (op) { + case Iop_ShlN16x16: shl = True; size = 16; break; + case Iop_ShlN32x8: shl = True; size = 32; break; + case Iop_ShlN64x4: shl = True; size = 64; break; + case Iop_SarN16x16: sar = True; size = 16; break; + case Iop_SarN32x8: sar = True; size = 32; break; + case Iop_ShrN16x16: shr = True; size = 16; break; + case Iop_ShrN32x8: shr = True; size = 32; break; + case Iop_ShrN64x4: shr = True; size = 64; break; + default: vassert(0); + } + + + if (shl || shr) { + assign( e1, amt >= size + ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0)) + : binop(op, mkexpr(e0), mkU8(amt)) + ); + } else + if (sar) { + assign( e1, amt >= size + ? binop(op, mkexpr(e0), mkU8(size-1)) + : binop(op, mkexpr(e0), mkU8(amt)) + ); + } else { + vassert(0); + } + + putYMMReg( rD, mkexpr(e1) ); + return delta; +} + + /* Lower 64-bit lane only AVX128 binary operation: G[63:0] = V[63:0] `op` E[63:0] G[127:64] = V[127:64] @@ -21483,6 +22581,21 @@ static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv, } +/* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp + for the operation, no inversion of the left arg, and no swapping of + args. */ +static +Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple ( + /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, const HChar* name, + IROp op + ) +{ + return dis_VEX_NDS_256_AnySimdPfx_0F_WIG( + uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False); +} + + /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR generator to compute the result, no inversion of the left arg, and no swapping of args. */ @@ -21499,6 +22612,39 @@ Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex ( } +/* Handles AVX256 unary E-to-G all-lanes operations. */ +static +Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv, + VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, + IRTemp (*opFn)(IRTemp) ) +{ + HChar dis_buf[50]; + Int alen; + IRTemp addr; + IRTemp res = newTemp(Ity_V256); + IRTemp arg = newTemp(Ity_V256); + UChar rm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, rm); + if (epartIsReg(rm)) { + UInt rE = eregOfRexRM(pfx,rm); + assign(arg, getYMMReg(rE)); + delta += 1; + DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign(arg, loadLE(Ity_V256, mkexpr(addr))); + delta += alen; + DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG)); + } + res = opFn(arg); + putYMMReg( rG, mkexpr(res) ); + *uses_vvvv = False; + return delta; +} + + /* Handles AVX256 unary E-to-G all-lanes operations. 
*/ static Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv, @@ -21605,37 +22751,123 @@ static Long dis_CVTPD2PS_256 ( VexAbiInfo* vbi, Prefix pfx, } -__attribute__((noinline)) -static -Long dis_ESC_0F__VEX ( - /*MB_OUT*/DisResult* dres, - /*OUT*/ Bool* uses_vvvv, - Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), - Bool resteerCisOk, - void* callback_opaque, - VexArchInfo* archinfo, - VexAbiInfo* vbi, - Prefix pfx, Int sz, Long deltaIN - ) +static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRType tR, IROp op ) { - IRTemp addr = IRTemp_INVALID; - Int alen = 0; - HChar dis_buf[50]; - Long delta = deltaIN; - UChar opc = getUChar(delta); - delta++; - *uses_vvvv = False; + IRTemp tLhi, tLlo, tRhi, tRlo; + tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID; + IRTemp res = newTemp(Ity_V256); + breakupV256toV128s( tL, &tLhi, &tLlo ); + breakupV256toV128s( tR, &tRhi, &tRlo ); + assign( res, binop( Iop_V128HLtoV256, + binop( op, mkexpr(tRhi), mkexpr(tLhi) ), + binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) ); + return res; +} - switch (opc) { - case 0x10: - /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */ - /* Move 64 bits from E (mem only) to G (lo half xmm). - Bits 255-64 of the dest are zeroed out. */ - if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) { - UChar modrm = getUChar(delta); - addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); - UInt rG = gregOfRexRM(pfx,modrm); +static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 ); +} + + +static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 ); +} + + +static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 ); +} + + +static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 ); +} + + +static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 ); +} + + +static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 ); +} + + +static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 ); +} + + +static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 ); +} + + +static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 ); +} + + +static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 ); +} + + +static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 ); +} + + +static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR ) +{ + return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 ); +} + + +__attribute__((noinline)) +static +Long dis_ESC_0F__VEX ( + /*MB_OUT*/DisResult* dres, + /*OUT*/ Bool* uses_vvvv, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + VexArchInfo* archinfo, + VexAbiInfo* vbi, + Prefix pfx, Int sz, Long deltaIN + ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + Long delta = deltaIN; + UChar opc = getUChar(delta); + delta++; + *uses_vvvv = False; + + switch (opc) { + + case 0x10: + /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */ + /* Move 64 bits from E (mem only) to G (lo half xmm). 
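   (Aside, for exposition only: the math_VPUNPCK_YMM family defined above
   applies the 128-bit interleave or pack operation to the two halves of
   the YMM operands separately, so, as in hardware, no data crosses the
   128-bit lane boundary.  Per-lane reference for VPUNPCKLBW, with src1
   playing the role of the vvvv register and src2 the r/m operand:

   #include <stdint.h>

   static void vpunpcklbw_lane_reference ( uint8_t dst[16],
                                           const uint8_t src1[16],
                                           const uint8_t src2[16] )
   {
      for (int i = 0; i < 8; i++) {      // only the low 8 bytes of each lane
         dst[2*i]     = src1[i];
         dst[2*i + 1] = src2[i];
      }
   }

   End of aside.)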
+ Bits 255-64 of the dest are zeroed out. */ + if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + UInt rG = gregOfRexRM(pfx,modrm); IRTemp z128 = newTemp(Ity_V128); assign(z128, mkV128(0)); putXMMReg( rG, mkexpr(z128) ); @@ -23128,6 +24360,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */ + /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpcklbw", + math_VPUNPCKLBW_YMM ); + goto decode_success; + } break; case 0x61: @@ -23140,6 +24380,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */ + /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpcklwd", + math_VPUNPCKLWD_YMM ); + goto decode_success; + } break; case 0x62: @@ -23152,6 +24400,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */ + /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckldq", + math_VPUNPCKLDQ_YMM ); + goto decode_success; + } break; case 0x63: @@ -23164,6 +24420,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */ + /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpacksswb", + math_VPACKSSWB_YMM ); + goto decode_success; + } break; case 0x64: @@ -23174,6 +24438,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 ); goto decode_success; } + /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */ + /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 ); + goto decode_success; + } break; case 0x65: @@ -23184,6 +24455,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 ); goto decode_success; } + /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */ + /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 ); + goto decode_success; + } break; case 0x66: @@ -23194,6 +24472,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 ); goto decode_success; } + /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */ + /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 ); + goto decode_success; + } break; case 0x67: @@ -23206,6 +24491,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ 
); goto decode_success; } + /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */ + /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpackuswb", + math_VPACKUSWB_YMM ); + goto decode_success; + } break; case 0x68: @@ -23218,6 +24511,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */ + /* VPUNPCKHBW = VEX.NDS.256.0F.WIG 68 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckhbw", + math_VPUNPCKHBW_YMM ); + goto decode_success; + } break; case 0x69: @@ -23230,6 +24531,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */ + /* VPUNPCKHWD = VEX.NDS.256.0F.WIG 69 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckhwd", + math_VPUNPCKHWD_YMM ); + goto decode_success; + } break; case 0x6A: @@ -23242,6 +24551,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */ + /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckhdq", + math_VPUNPCKHDQ_YMM ); + goto decode_success; + } break; case 0x6B: @@ -23254,6 +24571,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */ + /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpackssdw", + math_VPACKSSDW_YMM ); + goto decode_success; + } break; case 0x6C: @@ -23266,6 +24591,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */ + /* VPUNPCKLQDQ = VEX.NDS.256.0F.WIG 6C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpcklqdq", + math_VPUNPCKLQDQ_YMM ); + goto decode_success; + } break; case 0x6D: @@ -23278,6 +24611,14 @@ Long dis_ESC_0F__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */ + /* VPUNPCKHQDQ = VEX.NDS.256.0F.WIG 6D /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpunpckhqdq", + math_VPUNPCKHQDQ_YMM ); + goto decode_success; + } break; case 0x6E: @@ -23392,18 +24733,33 @@ Long dis_ESC_0F__VEX ( delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/); goto decode_success; } + /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PSHUFD_32x8( vbi, pfx, delta); + goto decode_success; + } /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */ if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) { delta = 
dis_PSHUFxW_128( vbi, pfx, delta, True/*isAvx*/, False/*!xIsH*/ ); goto decode_success; } + /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */ + if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ ); + goto decode_success; + } /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */ if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) { delta = dis_PSHUFxW_128( vbi, pfx, delta, True/*isAvx*/, True/*xIsH*/ ); goto decode_success; } + /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */ + if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ ); + goto decode_success; + } break; case 0x71: @@ -23433,6 +24789,32 @@ Long dis_ESC_0F__VEX ( } /* else fall through */ } + /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */ + /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */ + /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsrlw", Iop_ShrN16x16 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsraw", Iop_SarN16x16 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsllw", Iop_ShlN16x16 ); + *uses_vvvv = True; + goto decode_success; + } + /* else fall through */ + } break; case 0x72: @@ -23462,6 +24844,32 @@ Long dis_ESC_0F__VEX ( } /* else fall through */ } + /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */ + /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */ + /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsrld", Iop_ShrN32x8 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsrad", Iop_SarN32x8 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpslld", Iop_ShlN32x8 ); + *uses_vvvv = True; + goto decode_success; + } + /* else fall through */ + } break; case 0x73: @@ -23506,6 +24914,54 @@ Long dis_ESC_0F__VEX ( } /* else fall through */ } + /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */ + /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */ + /* VPSRLQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */ + /* VPSLLQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + Int rS = eregOfRexRM(pfx,getUChar(delta)); + Int rD = getVexNvvvv(pfx); + if (gregLO3ofRM(getUChar(delta)) == 3) { + IRTemp vecS0 = newTemp(Ity_V128); + IRTemp vecS1 = newTemp(Ity_V128); + Int imm = (Int)getUChar(delta+1); + DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD)); + delta += 2; + assign( vecS0, getYMMRegLane128(rS, 0)); + assign( vecS1, getYMMRegLane128(rS, 1)); + putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm ))); + putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, 
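                                                 /* the same byte shift is applied to each 128-bit
                                                    lane independently, per AVX2 semantics */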
imm ))); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 7) { + IRTemp vecS0 = newTemp(Ity_V128); + IRTemp vecS1 = newTemp(Ity_V128); + Int imm = (Int)getUChar(delta+1); + DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD)); + delta += 2; + assign( vecS0, getYMMRegLane128(rS, 0)); + assign( vecS1, getYMMRegLane128(rS, 1)); + putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm ))); + putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm ))); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 2) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsrlq", Iop_ShrN64x4 ); + *uses_vvvv = True; + goto decode_success; + } + if (gregLO3ofRM(getUChar(delta)) == 6) { + delta = dis_AVX256_shiftE_to_V_imm( pfx, delta, + "vpsllq", Iop_ShlN64x4 ); + *uses_vvvv = True; + goto decode_success; + } + /* else fall through */ + } break; case 0x74: @@ -23516,6 +24972,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 ); goto decode_success; } + /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */ + /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 ); + goto decode_success; + } break; case 0x75: @@ -23526,6 +24989,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 ); goto decode_success; } + /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */ + /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 ); + goto decode_success; + } break; case 0x76: @@ -23536,6 +25006,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 ); goto decode_success; } + /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */ + /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 ); + goto decode_success; + } break; case 0x77: @@ -24102,6 +25579,14 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; + } + /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsrlw", Iop_ShrN16x16 ); + *uses_vvvv = True; + goto decode_success; + } break; @@ -24113,6 +25598,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsrld", Iop_ShrN32x8 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xD3: @@ -24123,6 +25615,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsrlq", Iop_ShrN64x4 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xD4: @@ -24133,6 +25632,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 ); goto decode_success; } + /* VPADDQ r/m, rV, r ::: r = rV + r/m */ + /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */ + if (have66noF2noF3(pfx) 
&& 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 ); + goto decode_success; + } break; case 0xD5: @@ -24142,8 +25648,14 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 ); goto decode_success; } - break; - + /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 ); + goto decode_success; + } + break; + case 0xD6: /* I can't even find any Intel docs for this one. */ /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half @@ -24172,6 +25684,11 @@ Long dis_ESC_0F__VEX ( delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVMSKB_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0xD8: @@ -24181,7 +25698,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 ); goto decode_success; } - break; + /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 ); + goto decode_success; + } + break; case 0xD9: /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */ @@ -24190,6 +25713,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 ); goto decode_success; } + /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 ); + goto decode_success; + } break; case 0xDA: @@ -24199,6 +25728,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 ); goto decode_success; } + /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 ); + goto decode_success; + } break; case 0xDB: @@ -24209,6 +25744,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 ); goto decode_success; } + /* VPAND r/m, rV, r ::: r = rV & r/m */ + /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 ); + goto decode_success; + } break; case 0xDC: @@ -24218,6 +25760,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 ); goto decode_success; } + /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 ); + goto decode_success; + } break; case 0xDD: @@ -24227,6 +25775,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 ); goto decode_success; } + /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 ); + goto decode_success; + } break; case 0xDE: @@ -24236,6 +25790,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, 
"vpmaxub", Iop_Max8Ux16 ); goto decode_success; } + /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 ); + goto decode_success; + } break; case 0xDF: @@ -24247,6 +25807,14 @@ Long dis_ESC_0F__VEX ( NULL, True/*invertLeftArg*/, False/*swapArgs*/ ); goto decode_success; } + /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */ + /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG( + uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256, + NULL, True/*invertLeftArg*/, False/*swapArgs*/ ); + goto decode_success; + } break; case 0xE0: @@ -24256,6 +25824,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 ); goto decode_success; } + /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 ); + goto decode_success; + } break; case 0xE1: @@ -24266,6 +25840,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsraw", Iop_SarN16x16 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xE2: @@ -24276,6 +25857,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsrad", Iop_SarN32x8 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xE3: @@ -24285,6 +25873,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 ); goto decode_success; } + /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 ); + goto decode_success; + } break; case 0xE4: @@ -24294,6 +25888,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 ); goto decode_success; } + /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 ); + goto decode_success; + } break; case 0xE5: @@ -24303,6 +25903,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 ); goto decode_success; } + /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 ); + goto decode_success; + } break; case 0xE6: @@ -24378,6 +25984,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 ); goto decode_success; } + /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 ); + goto decode_success; + } break; case 0xE9: @@ -24387,6 +25999,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 ); 
goto decode_success; } + /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 ); + goto decode_success; + } break; case 0xEA: @@ -24397,6 +26015,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 ); goto decode_success; } + /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */ + /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 ); + goto decode_success; + } break; case 0xEB: @@ -24407,6 +26032,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 ); goto decode_success; } + /* VPOR r/m, rV, r ::: r = rV | r/m */ + /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 ); + goto decode_success; + } break; case 0xEC: @@ -24416,6 +26048,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 ); goto decode_success; } + /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 ); + goto decode_success; + } break; case 0xED: @@ -24425,6 +26063,12 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 ); goto decode_success; } + /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_V_to_G( + uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 ); + goto decode_success; + } break; case 0xEE: @@ -24435,6 +26079,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 ); goto decode_success; } + /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */ + /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 ); + goto decode_success; + } break; case 0xEF: @@ -24445,6 +26096,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 ); goto decode_success; } + /* VPXOR r/m, rV, r ::: r = rV ^ r/m */ + /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 ); + goto decode_success; + } break; case 0xF0: @@ -24484,6 +26142,14 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; + } + /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsllw", Iop_ShlN16x16 ); + *uses_vvvv = True; + goto decode_success; + } break; @@ -24495,6 +26161,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpslld", Iop_ShlN32x8 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xF3: @@ -24505,6 +26178,13 @@ Long dis_ESC_0F__VEX ( *uses_vvvv = True; goto decode_success; } + /* 
VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_shiftV_byE( vbi, pfx, delta, + "vpsllq", Iop_ShlN64x4 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0xF4: @@ -24515,6 +26195,13 @@ Long dis_ESC_0F__VEX ( "vpmuludq", math_PMULUDQ_128 ); goto decode_success; } + /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, + "vpmuludq", math_PMULUDQ_256 ); + goto decode_success; + } break; case 0xF5: @@ -24525,6 +26212,13 @@ Long dis_ESC_0F__VEX ( "vpmaddwd", math_PMADDWD_128 ); goto decode_success; } + /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, + "vpmaddwd", math_PMADDWD_256 ); + goto decode_success; + } break; case 0xF6: @@ -24535,6 +26229,13 @@ Long dis_ESC_0F__VEX ( "vpsadbw", math_PSADBW_128 ); goto decode_success; } + /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, + "vpsadbw", math_PSADBW_256 ); + goto decode_success; + } break; case 0xF7: @@ -24554,6 +26255,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 ); goto decode_success; } + /* VPSUBB r/m, rV, r ::: r = rV - r/m */ + /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 ); + goto decode_success; + } break; case 0xF9: @@ -24564,6 +26272,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 ); goto decode_success; } + /* VPSUBW r/m, rV, r ::: r = rV - r/m */ + /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 ); + goto decode_success; + } break; case 0xFA: @@ -24574,6 +26289,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 ); goto decode_success; } + /* VPSUBD r/m, rV, r ::: r = rV - r/m */ + /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 ); + goto decode_success; + } break; case 0xFB: @@ -24584,6 +26306,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 ); goto decode_success; } + /* VPSUBQ r/m, rV, r ::: r = rV - r/m */ + /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 ); + goto decode_success; + } break; case 0xFC: @@ -24594,6 +26323,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 ); goto decode_success; } + /* VPADDB r/m, rV, r ::: r = rV + r/m */ + /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 ); + goto decode_success; + } break; case 0xFD: @@ -24604,6 +26340,13 @@ Long 
dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 ); goto decode_success; } + /* VPADDW r/m, rV, r ::: r = rV + r/m */ + /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 ); + goto decode_success; + } break; case 0xFE: @@ -24614,6 +26357,13 @@ Long dis_ESC_0F__VEX ( uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 ); goto decode_success; } + /* VPADDD r/m, rV, r ::: r = rV + r/m */ + /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 ); + goto decode_success; + } break; default: @@ -24695,81 +26445,504 @@ static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV ) return res; } -__attribute__((noinline)) -static -Long dis_ESC_0F38__VEX ( - /*MB_OUT*/DisResult* dres, - /*OUT*/ Bool* uses_vvvv, - Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), - Bool resteerCisOk, - void* callback_opaque, - VexArchInfo* archinfo, - VexAbiInfo* vbi, - Prefix pfx, Int sz, Long deltaIN - ) +static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV ) { - IRTemp addr = IRTemp_INVALID; - Int alen = 0; - HChar dis_buf[50]; - Long delta = deltaIN; - UChar opc = getUChar(delta); - delta++; - *uses_vvvv = False; + /* In the control vector, zero out all but the bottom three bits of + each 32-bit lane. */ + IRExpr* cv1 = binop(Iop_ShrN32x8, + binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)), + mkU8(29)); + /* And use the resulting cleaned-up control vector as steering + in a Perm operation. */ + IRTemp res = newTemp(Ity_V256); + assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1)); + return res; +} - switch (opc) { +static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv, + VexAbiInfo* vbi, Prefix pfx, Long delta, + const HChar* opname, IROp op8 ) +{ + HChar dis_buf[50]; + Int alen; + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp amt = newTemp(ty); + UChar rm = getUChar(delta); - case 0x00: - /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */ - /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( - uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM ); - goto decode_success; - } - break; + assign( amt, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( src, getIRegE(size,pfx,rm) ); + DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), + nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm)); + delta++; + } else { + IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf, + nameIRegG(size,pfx,rm)); + delta += alen; + } - case 0x01: - case 0x02: - case 0x03: - /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */ - /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */ - /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc ); - *uses_vvvv = True; - goto decode_success; - } - break; + putIRegG( size, pfx, rm, + binop(mkSizedOp(ty,op8), mkexpr(src), + narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt), + mkU(ty,8*size-1)))) ); + /* Flags aren't modified. 
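      (BMI2 SHLX/SHRX/SARX leave rflags untouched, unlike the legacy shift
      instructions; the count has already been masked to the operand width
      by the And above.)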
*/ + *uses_vvvv = True; + return delta; +} - case 0x04: - /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( - uses_vvvv, vbi, pfx, delta, "vpmaddubsw", - math_PMADDUBSW_128 ); - goto decode_success; - } + +static Long dis_FMA ( VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc ) +{ + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rV = getVexNvvvv(pfx); + Bool scalar = (opc & 0xF) > 7 && (opc & 1); + IRType ty = getRexW(pfx) ? Ity_F64 : Ity_F32; + IRType vty = scalar ? ty : getVexL(pfx) ? Ity_V256 : Ity_V128; + IRTemp vX = newTemp(vty); + IRTemp vY = newTemp(vty); + IRTemp vZ = newTemp(vty); + IRExpr *x[8], *y[8], *z[8]; + IRTemp addr = IRTemp_INVALID; + HChar dis_buf[50]; + Int alen = 0; + const HChar *name; + const HChar *suffix; + const HChar *order; + Bool negateRes = False; + Bool negateZeven = False; + Bool negateZodd = False; + Int i, j; + Int count; + static IROp ops[] = { Iop_V256to64_0, Iop_V256to64_1, + Iop_V256to64_2, Iop_V256to64_3, + Iop_V128to64, Iop_V128HIto64 }; + + switch (opc & 0xF) { + case 0x6: + name = "addsub"; + negateZeven = True; + break; + case 0x7: + name = "subadd"; + negateZodd = True; + break; + case 0x8: + case 0x9: + name = "add"; + break; + case 0xA: + case 0xB: + name = "sub"; + negateZeven = True; + negateZodd = True; + break; + case 0xC: + case 0xD: + name = "add"; + negateRes = True; + negateZeven = True; + negateZodd = True; + break; + case 0xE: + case 0xF: + name = "sub"; + negateRes = True; break; - - case 0x05: - case 0x06: - case 0x07: - /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */ - /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */ - /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc ); - *uses_vvvv = True; - goto decode_success; - } + default: + vpanic("dis_FMA(amd64)"); break; + } + switch (opc & 0xF0) { + case 0x90: order = "132"; break; + case 0xA0: order = "213"; break; + case 0xB0: order = "231"; break; + default: vpanic("dis_FMA(amd64)"); break; + } + if (scalar) + suffix = ty == Ity_F64 ? "sd" : "ss"; + else + suffix = ty == Ity_F64 ? "pd" : "ps"; - case 0x08: - case 0x09: - case 0x0A: - /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */ - /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */ - /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */ + if (scalar) { + assign( vX, ty == Ity_F64 + ? getXMMRegLane64F(rG, 0) : getXMMRegLane32F(rG, 0) ); + assign( vZ, ty == Ity_F64 + ? getXMMRegLane64F(rV, 0) : getXMMRegLane32F(rV, 0) ); + } else { + assign( vX, vty == Ity_V256 ? getYMMReg(rG) : getXMMReg(rG) ); + assign( vZ, vty == Ity_V256 ? getYMMReg(rV) : getXMMReg(rV) ); + } + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + if (scalar) + assign( vY, ty == Ity_F64 + ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) ); + else + assign( vY, vty == Ity_V256 ? getYMMReg(rE) : getXMMReg(rE) ); + if (vty == Ity_V256) { + DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "", + name, order, suffix, nameYMMReg(rE), nameYMMReg(rV), + nameYMMReg(rG)); + } else { + DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? 
"n" : "", + name, order, suffix, nameXMMReg(rE), nameXMMReg(rV), + nameXMMReg(rG)); + } + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + assign(vY, loadLE(vty, mkexpr(addr))); + if (vty == Ity_V256) { + DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "", + name, order, suffix, dis_buf, nameYMMReg(rV), + nameYMMReg(rG)); + } else { + DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "", + name, order, suffix, dis_buf, nameXMMReg(rV), + nameXMMReg(rG)); + } + } + + /* vX/vY/vZ now in 132 order. If it is different order, swap the + arguments. */ + if ((opc & 0xF0) != 0x90) { + IRTemp tem = vX; + if ((opc & 0xF0) == 0xA0) { + vX = vZ; + vZ = vY; + vY = tem; + } else { + vX = vZ; + vZ = tem; + } + } + + if (scalar) { + count = 1; + x[0] = mkexpr(vX); + y[0] = mkexpr(vY); + z[0] = mkexpr(vZ); + } else if (ty == Ity_F32) { + count = vty == Ity_V256 ? 8 : 4; + j = vty == Ity_V256 ? 0 : 4; + for (i = 0; i < count; i += 2) { + IRTemp tem = newTemp(Ity_I64); + assign(tem, unop(ops[i / 2 + j], mkexpr(vX))); + x[i] = unop(Iop_64to32, mkexpr(tem)); + x[i + 1] = unop(Iop_64HIto32, mkexpr(tem)); + tem = newTemp(Ity_I64); + assign(tem, unop(ops[i / 2 + j], mkexpr(vY))); + y[i] = unop(Iop_64to32, mkexpr(tem)); + y[i + 1] = unop(Iop_64HIto32, mkexpr(tem)); + tem = newTemp(Ity_I64); + assign(tem, unop(ops[i / 2 + j], mkexpr(vZ))); + z[i] = unop(Iop_64to32, mkexpr(tem)); + z[i + 1] = unop(Iop_64HIto32, mkexpr(tem)); + } + } else { + count = vty == Ity_V256 ? 4 : 2; + j = vty == Ity_V256 ? 0 : 4; + for (i = 0; i < count; i++) { + x[i] = unop(ops[i + j], mkexpr(vX)); + y[i] = unop(ops[i + j], mkexpr(vY)); + z[i] = unop(ops[i + j], mkexpr(vZ)); + } + } + if (!scalar) + for (i = 0; i < count; i++) { + IROp op = ty == Ity_F64 + ? Iop_ReinterpI64asF64 : Iop_ReinterpI32asF32; + x[i] = unop(op, x[i]); + y[i] = unop(op, y[i]); + z[i] = unop(op, z[i]); + } + for (i = 0; i < count; i++) { + if ((i & 1) ? negateZodd : negateZeven) + z[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, z[i]); + x[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32, + get_FAKE_roundingmode(), x[i], y[i], z[i]); + if (negateRes) + x[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, x[i]); + if (ty == Ity_F64) + putYMMRegLane64F( rG, i, x[i] ); + else + putYMMRegLane32F( rG, i, x[i] ); + } + if (vty != Ity_V256) + putYMMRegLane128( rG, 1, mkV128(0) ); + + return delta; +} + + +/* Masked load. */ +static ULong dis_VMASKMOV_load ( Bool *uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, Bool isYMM, IRType ty ) +{ + HChar dis_buf[50]; + Int alen, i; + IRTemp addr; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp res[8], cond; + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + if (isYMM) { + DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) ); + } else { + DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) ); + } + delta += alen; + + for (i = 0; i < 2 * (isYMM ? 2 : 1) * (ty == Ity_I32 ? 2 : 1); i++) { + res[i] = newTemp(ty); + cond = newTemp(Ity_I1); + assign( cond, + binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S, + ty == Ity_I32 ? getYMMRegLane32( rV, i ) + : getYMMRegLane64( rV, i ), + mkU(ty, 0) )); + assign( res[i], + IRExpr_ITE( + mkexpr(cond), + loadLE(ty, IRExpr_ITE( + mkexpr(cond), + binop(Iop_Add64, mkexpr(addr), + mkU64(i*(ty == Ity_I32 ? 
4 : 8))), + getIReg64(R_RSP) + ) + ), + mkU(ty, 0) + ) + ); + } + switch (ty) { + case Ity_I32: + for (i = 0; i < 8; i++) + putYMMRegLane32( rG, i, (i < 4 || isYMM) + ? mkexpr(res[i]) : mkU32(0) ); + break; + case Ity_I64: + for (i = 0; i < 4; i++) + putYMMRegLane64( rG, i, (i < 2 || isYMM) + ? mkexpr(res[i]) : mkU64(0) ); + break; + default: vassert(0); + } + + *uses_vvvv = True; + return delta; +} + + +/* Gather. */ +static ULong dis_VGATHER ( Bool *uses_vvvv, VexAbiInfo* vbi, + Prefix pfx, Long delta, + const HChar* opname, Bool isYMM, + Bool isVM64x, IRType ty ) +{ + HChar dis_buf[50]; + Int alen, i, vscale, count1, count2; + IRTemp addr; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + UInt rI; + IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128; + IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128; + IRTemp cond; + addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI, + idxTy, &vscale ); + if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV) + return delta; + if (dstTy == Ity_V256) { + DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) ); + } else { + DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) ); + } + delta += alen; + + if (ty == Ity_I32) { + count1 = isYMM ? 8 : 4; + count2 = isVM64x ? count1 / 2 : count1; + } else { + count1 = count2 = isYMM ? 4 : 2; + } + + /* First update the mask register to copies of the sign bit. */ + if (ty == Ity_I32) { + if (isYMM) + putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) ); + else + putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) ); + } else { + for (i = 0; i < count1; i++) { + putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ), + mkU8(63)) ); + } + } + + /* Next gather the individual elements. If any fault occurs, the + corresponding mask element will be set and the loop stops. */ + for (i = 0; i < count2; i++) { + IRExpr *expr, *addr_expr; + cond = newTemp(Ity_I1); + assign( cond, + binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S, + ty == Ity_I32 ? getYMMRegLane32( rV, i ) + : getYMMRegLane64( rV, i ), + mkU(ty, 0)) ); + expr = ty == Ity_I32 ? getYMMRegLane32( rG, i ) + : getYMMRegLane64( rG, i ); + addr_expr = isVM64x ? 
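                /* The element index comes from the VSIB index register rI
                   (32-bit indices are sign extended); it is scaled by vscale
                   below, and inactive elements have their load redirected to
                   RSP so that they cannot fault. */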
getYMMRegLane64( rI, i ) + : unop(Iop_32Sto64, getYMMRegLane32( rI, i )); + switch (vscale) { + case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break; + case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break; + case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break; + default: break; + } + addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr); + addr_expr = handleAddrOverrides(vbi, pfx, addr_expr); + addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP)); + expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr); + if (ty == Ity_I32) { + putYMMRegLane32( rG, i, expr ); + putYMMRegLane32( rV, i, mkU32(0) ); + } else { + putYMMRegLane64( rG, i, expr); + putYMMRegLane64( rV, i, mkU64(0) ); + } + } + + if (!isYMM || (ty == Ity_I32 && isVM64x)) { + if (ty == Ity_I64 || isYMM) + putYMMRegLane128( rV, 1, mkV128(0) ); + else if (ty == Ity_I32 && count2 == 2) { + putYMMRegLane64( rV, 1, mkU64(0) ); + putYMMRegLane64( rG, 1, mkU64(0) ); + } + putYMMRegLane128( rG, 1, mkV128(0) ); + } + + *uses_vvvv = True; + return delta; +} + + +__attribute__((noinline)) +static +Long dis_ESC_0F38__VEX ( + /*MB_OUT*/DisResult* dres, + /*OUT*/ Bool* uses_vvvv, + Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ), + Bool resteerCisOk, + void* callback_opaque, + VexArchInfo* archinfo, + VexAbiInfo* vbi, + Prefix pfx, Int sz, Long deltaIN + ) +{ + IRTemp addr = IRTemp_INVALID; + Int alen = 0; + HChar dis_buf[50]; + Long delta = deltaIN; + UChar opc = getUChar(delta); + delta++; + *uses_vvvv = False; + + switch (opc) { + + case 0x00: + /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */ + /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM ); + goto decode_success; + } + /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */ + /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM ); + goto decode_success; + } + break; + + case 0x01: + case 0x02: + case 0x03: + /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */ + /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */ + /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc ); + *uses_vvvv = True; + goto decode_success; + } + /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */ + /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */ + /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PHADD_256( vbi, pfx, delta, opc ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0x04: + /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpmaddubsw", + math_PMADDUBSW_128 ); + goto decode_success; + } + /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpmaddubsw", + math_PMADDUBSW_256 ); + goto decode_success; + } + break; + + case 0x05: + case 
0x06: + case 0x07: + /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */ + /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */ + /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc ); + *uses_vvvv = True; + goto decode_success; + } + /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */ + /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */ + /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PHADD_256( vbi, pfx, delta, opc ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0x08: + case 0x09: + case 0x0A: + /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */ + /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */ + /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { IRTemp sV = newTemp(Ity_V128); IRTemp dV = newTemp(Ity_V128); @@ -24817,6 +26990,63 @@ Long dis_ESC_0F38__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */ + /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */ + /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + IRTemp sV = newTemp(Ity_V256); + IRTemp dV = newTemp(Ity_V256); + IRTemp s3, s2, s1, s0, d3, d2, d1, d0; + s3 = s2 = s1 = s0 = IRTemp_INVALID; + d3 = d2 = d1 = d0 = IRTemp_INVALID; + UChar ch = '?'; + Int laneszB = 0; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + + switch (opc) { + case 0x08: laneszB = 1; ch = 'b'; break; + case 0x09: laneszB = 2; ch = 'w'; break; + case 0x0A: laneszB = 4; ch = 'd'; break; + default: vassert(0); + } + + assign( dV, getYMMReg(rV) ); + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + delta += 1; + DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE), + nameYMMReg(rV), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + delta += alen; + DIP("vpsign%c %s,%s,%s\n", ch, dis_buf, + nameYMMReg(rV), nameYMMReg(rG)); + } + + breakupV256to64s( dV, &d3, &d2, &d1, &d0 ); + breakupV256to64s( sV, &s3, &s2, &s1, &s0 ); + + putYMMReg( + rG, + binop( Iop_V128HLtoV256, + binop(Iop_64HLtoV128, + dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ), + dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB ) + ), + binop(Iop_64HLtoV128, + dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ), + dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB ) + ) + ) + ); + *uses_vvvv = True; + goto decode_success; + } break; case 0x0B: @@ -24859,6 +27089,49 @@ Long dis_ESC_0F38__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + IRTemp sV = newTemp(Ity_V256); + IRTemp dV = newTemp(Ity_V256); + IRTemp s3, s2, s1, s0, d3, d2, d1, d0; + s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID; + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx,modrm); + UInt rV = getVexNvvvv(pfx); + + assign( dV, getYMMReg(rV) ); + + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx,modrm); + assign( sV, getYMMReg(rE) ); + delta += 1; + 
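            /* PMULHRSW: per 16-bit lane, ((src*dst >> 14) + 1) >> 1, i.e. a
               rounded high half of the 16x16->32 signed product; computed in
               64-bit chunks by dis_PMULHRSW_helper below. */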
DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE), + nameYMMReg(rV), nameYMMReg(rG)); + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( sV, loadLE(Ity_V256, mkexpr(addr)) ); + delta += alen; + DIP("vpmulhrsw %s,%s,%s\n", dis_buf, + nameYMMReg(rV), nameYMMReg(rG)); + } + + breakupV256to64s( dV, &d3, &d2, &d1, &d0 ); + breakupV256to64s( sV, &s3, &s2, &s1, &s0 ); + + putYMMReg( + rG, + binop(Iop_V128HLtoV256, + binop(Iop_64HLtoV128, + dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ), + dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ), + binop(Iop_64HLtoV128, + dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ), + dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) ) + ) + ); + *uses_vvvv = True; + goto decode_success; + } break; case 0x0C: @@ -25001,6 +27274,16 @@ Long dis_ESC_0F38__VEX ( } break; + case 0x16: + /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD ); + goto decode_success; + } + break; + case 0x17: /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */ if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { @@ -25050,10 +27333,45 @@ Long dis_ESC_0F38__VEX ( putYMMReg(rG, res); goto decode_success; } - break; - - case 0x19: - /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */ + /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ + && epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + IRTemp t32 = newTemp(Ity_I32); + assign(t32, getXMMRegLane32(rE, 0)); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + delta++; + goto decode_success; + } + /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + IRTemp t32 = newTemp(Ity_I32); + assign(t32, getXMMRegLane32(rE, 0)); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + delta++; + goto decode_success; + } + break; + + case 0x19: + /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */ if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) { @@ -25069,6 +27387,22 @@ Long dis_ESC_0F38__VEX ( putYMMReg(rG, res); goto decode_success; } + /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, getXMMRegLane64(rE, 0)); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + delta++; + goto decode_success; + } break; case 0x1A: @@ -25096,6 +27430,13 @@ Long 
dis_ESC_0F38__VEX ( "vpabsb", math_PABS_XMM_pap1 ); goto decode_success; } + /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_to_G_unary( + uses_vvvv, vbi, pfx, delta, + "vpabsb", math_PABS_YMM_pap1 ); + goto decode_success; + } break; case 0x1D: @@ -25106,6 +27447,13 @@ Long dis_ESC_0F38__VEX ( "vpabsw", math_PABS_XMM_pap2 ); goto decode_success; } + /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_to_G_unary( + uses_vvvv, vbi, pfx, delta, + "vpabsw", math_PABS_YMM_pap2 ); + goto decode_success; + } break; case 0x1E: @@ -25116,6 +27464,13 @@ Long dis_ESC_0F38__VEX ( "vpabsd", math_PABS_XMM_pap4 ); goto decode_success; } + /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_AVX256_E_to_G_unary( + uses_vvvv, vbi, pfx, delta, + "vpabsd", math_PABS_YMM_pap4 ); + goto decode_success; + } break; case 0x20: @@ -25126,6 +27481,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, False/*!xIsZ*/ ); goto decode_success; } + /* VPMOVSXBW xmm2/m128, ymm1 */ + /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ ); + goto decode_success; + } break; case 0x21: @@ -25136,6 +27497,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, False/*!xIsZ*/ ); goto decode_success; } + /* VPMOVSXBD xmm2/m64, ymm1 */ + /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ ); + goto decode_success; + } break; case 0x22: @@ -25145,6 +27512,12 @@ Long dis_ESC_0F38__VEX ( delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VPMOVSXBQ xmm2/m32, ymm1 */ + /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVSXBQ_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0x23: @@ -25154,6 +27527,11 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, False/*!xIsZ*/ ); goto decode_success; } + /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ ); + goto decode_success; + } break; case 0x24: @@ -25162,6 +27540,11 @@ Long dis_ESC_0F38__VEX ( delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVSXWQ_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0x25: @@ -25171,6 +27554,11 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, False/*!xIsZ*/ ); goto decode_success; } + /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ ); + goto decode_success; + } break; case 0x28: @@ -25181,6 +27569,13 @@ Long dis_ESC_0F38__VEX ( "vpmuldq", math_PMULDQ_128 ); goto decode_success; } + /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, + "vpmuldq", math_PMULDQ_256 ); + goto decode_success; + } break; case 0x29: @@ -25191,6 +27586,13 @@ Long dis_ESC_0F38__VEX 
( uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 ); goto decode_success; } + /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */ + /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 ); + goto decode_success; + } break; case 0x2A: @@ -25208,6 +27610,20 @@ Long dis_ESC_0F38__VEX ( putYMMRegLoAndZU(rD, mkexpr(tD)); goto decode_success; } + /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rD = gregOfRexRM(pfx, modrm); + IRTemp tD = newTemp(Ity_V256); + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + gen_SEGV_if_not_32_aligned(addr); + assign(tD, loadLE(Ity_V256, mkexpr(addr))); + DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD)); + putYMMReg(rD, mkexpr(tD)); + goto decode_success; + } break; case 0x2B: @@ -25220,6 +27636,48 @@ Long dis_ESC_0F38__VEX ( False/*!invertLeftArg*/, True/*swapArgs*/ ); goto decode_success; } + /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */ + /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpackusdw", + math_VPACKUSDW_YMM ); + goto decode_success; + } + break; + + case 0x2C: + /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2C /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps", + /*!isYMM*/False, Ity_I32 ); + goto decode_success; + } + /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps", + /*isYMM*/True, Ity_I32 ); + goto decode_success; + } + break; + + case 0x2D: + /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2D /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd", + /*!isYMM*/False, Ity_I64 ); + goto decode_success; + } + /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2D /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd", + /*isYMM*/True, Ity_I64 ); + goto decode_success; + } break; case 0x30: @@ -25230,6 +27688,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, True/*xIsZ*/ ); goto decode_success; } + /* VPMOVZXBW xmm2/m128, ymm1 */ + /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ ); + goto decode_success; + } break; case 0x31: @@ -25240,6 +27704,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, True/*xIsZ*/ ); goto decode_success; } + /* VPMOVZXBD xmm2/m64, ymm1 */ + /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ ); + goto decode_success; + } break; case 0x32: @@ -25249,6 +27719,12 @@ Long dis_ESC_0F38__VEX ( delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VPMOVZXBQ xmm2/m32, ymm1 */ + /* VPMOVZXBQ = 
VEX.256.66.0F38.WIG 32 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVZXBQ_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0x33: @@ -25259,6 +27735,12 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, True/*xIsZ*/ ); goto decode_success; } + /* VPMOVZXWD xmm2/m128, ymm1 */ + /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ ); + goto decode_success; + } break; case 0x34: @@ -25267,6 +27749,11 @@ Long dis_ESC_0F38__VEX ( delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ ); goto decode_success; } + /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVZXWQ_256( vbi, pfx, delta ); + goto decode_success; + } break; case 0x35: @@ -25276,6 +27763,21 @@ Long dis_ESC_0F38__VEX ( True/*isAvx*/, True/*xIsZ*/ ); goto decode_success; } + /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ ); + goto decode_success; + } + break; + + case 0x36: + /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex( + uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD ); + goto decode_success; + } break; case 0x37: @@ -25286,6 +27788,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 ); goto decode_success; } + /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */ + /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 ); + goto decode_success; + } break; case 0x38: @@ -25296,6 +27805,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 ); goto decode_success; } + /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */ + /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 ); + goto decode_success; + } break; case 0x39: @@ -25306,6 +27822,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 ); goto decode_success; } + /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */ + /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 ); + goto decode_success; + } break; case 0x3A: @@ -25316,6 +27839,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 ); goto decode_success; } + /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */ + /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 ); + goto decode_success; + } break; case 0x3B: @@ -25326,6 +27856,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 ); goto decode_success; } + /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */ + /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */ + if (have66noF2noF3(pfx) && 
1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 ); + goto decode_success; + } break; case 0x3C: @@ -25336,6 +27873,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 ); goto decode_success; } + /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */ + /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 ); + goto decode_success; + } break; case 0x3D: @@ -25346,6 +27890,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 ); goto decode_success; } + /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */ + /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 ); + goto decode_success; + } break; case 0x3E: @@ -25356,6 +27907,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 ); goto decode_success; } + /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */ + /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 ); + goto decode_success; + } break; case 0x3F: @@ -25366,6 +27924,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 ); goto decode_success; } + /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */ + /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 ); + goto decode_success; + } break; case 0x40: @@ -25376,6 +27941,13 @@ Long dis_ESC_0F38__VEX ( uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 ); goto decode_success; } + /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */ + /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple( + uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 ); + goto decode_success; + } break; case 0x41: @@ -25386,33 +27958,1017 @@ Long dis_ESC_0F38__VEX ( } break; - case 0xDB: - case 0xDC: - case 0xDD: - case 0xDE: - case 0xDF: - /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */ - /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */ - /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */ - /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */ - /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */ - if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { - delta = dis_AESx( vbi, pfx, delta, True/*!isAvx*/, opc ); - if (opc != 0xDB) *uses_vvvv = True; + case 0x45: + /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */ + /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */ + if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) { + delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd", + Iop_Shr32, 1==getVexL(pfx) ); + *uses_vvvv = True; + goto decode_success; + } + /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */ + /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */ + if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) { + delta 
= dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq", + Iop_Shr64, 1==getVexL(pfx) ); + *uses_vvvv = True; goto decode_success; } break; - default: + case 0x46: + /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */ + /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */ + if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) { + delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd", + Iop_Sar32, 1==getVexL(pfx) ); + *uses_vvvv = True; + goto decode_success; + } break; - } + case 0x47: + /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */ + /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */ + if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) { + delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd", + Iop_Shl32, 1==getVexL(pfx) ); + *uses_vvvv = True; + goto decode_success; + } + /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */ + /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */ + if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) { + delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq", + Iop_Shl64, 1==getVexL(pfx) ); + *uses_vvvv = True; + goto decode_success; + } + break; - //decode_failure: - return deltaIN; + case 0x58: + /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t32 = newTemp(Ity_I32); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta++; + DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + assign(t32, getXMMRegLane32(rE, 0)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG)); + assign(t32, loadLE(Ity_I32, mkexpr(addr))); + } + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + goto decode_success; + } + /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t32 = newTemp(Ity_I32); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta++; + DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + assign(t32, getXMMRegLane32(rE, 0)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG)); + assign(t32, loadLE(Ity_I32, mkexpr(addr))); + } + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + goto decode_success; + } + break; - decode_success: - return delta; + case 0x59: + /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t64 = newTemp(Ity_I64); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta++; + DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + assign(t64, getXMMRegLane64(rE, 0)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastq %s,%s\n", dis_buf, 
nameXMMReg(rG)); + assign(t64, loadLE(Ity_I64, mkexpr(addr))); + } + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + goto decode_success; + } + /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t64 = newTemp(Ity_I64); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta++; + DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + assign(t64, getXMMRegLane64(rE, 0)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG)); + assign(t64, loadLE(Ity_I64, mkexpr(addr))); + } + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + goto decode_success; + } + break; + + case 0x5A: + /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ + && !epartIsReg(getUChar(delta))) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG)); + IRTemp t128 = newTemp(Ity_V128); + assign(t128, loadLE(Ity_V128, mkexpr(addr))); + putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) ); + goto decode_success; + } + break; + + case 0x78: + /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t8 = newTemp(Ity_I8); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG)); + assign(t8, loadLE(Ity_I8, mkexpr(addr))); + } + IRTemp t16 = newTemp(Ity_I16); + assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8))); + IRTemp t32 = newTemp(Ity_I32); + assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + goto decode_success; + } + /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t8 = newTemp(Ity_I8); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG)); + assign(t8, loadLE(Ity_I8, mkexpr(addr))); + } + IRTemp t16 = newTemp(Ity_I16); + assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8))); + IRTemp t32 = newTemp(Ity_I32); + assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + 
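+         /* The broadcast is built by repeated pairwise concatenation of the
+            byte with itself: 8 -> 16 -> 32 -> 64 bits, and then four copies of
+            the resulting 64-bit value are glued into a V256.  The word, dword
+            and qword broadcasts above use the same widening chain, just
+            starting further along. */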
putYMMReg(rG, res); + goto decode_success; + } + break; + + case 0x79: + /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t16 = newTemp(Ity_I16); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG)); + assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG)); + assign(t16, loadLE(Ity_I16, mkexpr(addr))); + } + IRTemp t32 = newTemp(Ity_I32); + assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64)); + putYMMRegLoAndZU(rG, res); + goto decode_success; + } + /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp t16 = newTemp(Ity_I16); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG)); + assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0))); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 ); + delta += alen; + DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG)); + assign(t16, loadLE(Ity_I16, mkexpr(addr))); + } + IRTemp t32 = newTemp(Ity_I32); + assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16))); + IRTemp t64 = newTemp(Ity_I64); + assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32))); + IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64), + mkexpr(t64), mkexpr(t64)); + putYMMReg(rG, res); + goto decode_success; + } + break; + + case 0x8C: + /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd", + /*!isYMM*/False, Ity_I32 ); + goto decode_success; + } + /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd", + /*isYMM*/True, Ity_I32 ); + goto decode_success; + } + /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq", + /*!isYMM*/False, Ity_I64 ); + goto decode_success; + } + /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq", + /*isYMM*/True, Ity_I64 ); + goto decode_success; + } + break; + + case 0x90: + /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd", + /*!isYMM*/False, /*!isVM64x*/False, 
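+                              /* dis_VGATHER presumably hands back its delta
+                                 argument unchanged when it declines the
+                                 encoding, hence the delta != delta0 test below
+                                 instead of an unconditional decode_success. */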
Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd", + /*isYMM*/True, /*!isVM64x*/False, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq", + /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERDQ ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq", + /*isYMM*/True, /*!isVM64x*/False, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + break; + + case 0x91: + /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd", + /*!isYMM*/False, /*isVM64x*/True, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd", + /*isYMM*/True, /*isVM64x*/True, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq", + /*!isYMM*/False, /*isVM64x*/True, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq", + /*isYMM*/True, /*isVM64x*/True, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + break; + + case 0x92: + /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps", + /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps", + /*isYMM*/True, /*!isVM64x*/False, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 
1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd", + /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERDPD ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd", + /*isYMM*/True, /*!isVM64x*/False, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + break; + + case 0x93: + /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps", + /*!isYMM*/False, /*isVM64x*/True, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps", + /*isYMM*/True, /*isVM64x*/True, Ity_I32 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd", + /*!isYMM*/False, /*isVM64x*/True, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) { + Long delta0 = delta; + delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd", + /*isYMM*/True, /*isVM64x*/True, Ity_I64 ); + if (delta != delta0) + goto decode_success; + } + break; + + case 0x96 ... 0x9F: + case 0xA6 ... 0xAF: + case 0xB6 ... 
0xBF: + /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */ + /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */ + /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */ + /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */ + /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */ + /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */ + /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */ + /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */ + /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */ + /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */ + /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */ + /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */ + /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */ + /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */ + /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */ + /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */ + /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */ + /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */ + /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */ + /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */ + /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */ + /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */ + /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */ + /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */ + /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */ + /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */ + /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */ + /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */ + /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */ + /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */ + /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */ + /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */ + /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */ + /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */ + /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */ + /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */ + /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */ + /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */ + /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */ + /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */ + /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */ + /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */ + /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */ + /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */ + /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */ + /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */ + /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */ + /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA 
/r */ + /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */ + /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */ + /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */ + /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */ + /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */ + /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */ + /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */ + /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */ + /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */ + /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */ + /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */ + /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */ + /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */ + /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */ + /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */ + /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */ + /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */ + /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */ + /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */ + /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */ + /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */ + /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */ + /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */ + /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */ + /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */ + /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */ + /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */ + /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */ + /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */ + /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */ + /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */ + /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */ + /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */ + /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */ + /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */ + /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */ + /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */ + /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */ + /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */ + /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */ + /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */ + /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */ + /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */ + /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */ + /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */ + /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */ + /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */ + /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */ + if 
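+         /* All of the encodings listed above funnel into dis_FMA, which is
+            assumed to recover the operand ordering (132/213/231), the
+            add/sub/addsub/negate variant and the PS/PD/SS/SD width from opc
+            and the VEX.W bit. */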
(have66noF2noF3(pfx)) { + delta = dis_FMA( vbi, pfx, delta, opc ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */ + /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */ + /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */ + /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */ + /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) { + delta = dis_AESx( vbi, pfx, delta, True/*!isAvx*/, opc ); + if (opc != 0xDB) *uses_vvvv = True; + goto decode_success; + } + break; + + case 0xF2: + /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */ + /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp dst = newTemp(ty); + IRTemp src1 = newTemp(ty); + IRTemp src2 = newTemp(ty); + UChar rm = getUChar(delta); + + assign( src1, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( src2, getIRegE(size,pfx,rm) ); + DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src2, loadLE(ty, mkexpr(addr)) ); + DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx), + nameIRegG(size,pfx,rm)); + delta += alen; + } + + assign( dst, binop( mkSizedOp(ty,Iop_And8), + unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ), + mkexpr(src2) ) ); + putIRegG( size, pfx, rm, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_ANDN64 + : AMD64G_CC_OP_ANDN32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0xF3: + /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */ + /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ + && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp dst = newTemp(ty); + UChar rm = getUChar(delta); + + if (epartIsReg(rm)) { + assign( src, getIRegE(size,pfx,rm) ); + DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx)); + delta += alen; + } + + assign( dst, binop(mkSizedOp(ty,Iop_And8), + binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0), + mkexpr(src)), mkexpr(src)) ); + putIRegV( size, pfx, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_BLSI64 + : AMD64G_CC_OP_BLSI32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) ); + *uses_vvvv = True; + goto decode_success; + } + /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */ + /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ + && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) { + Int size = getRexW(pfx) ? 
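+         /* For reference, the BMI1 bit tricks used by this group:
+              BLSI   dst = src & -src        (isolate lowest set bit)
+              BLSMSK dst = src ^ (src - 1)   (mask up to and including it)
+              BLSR   dst = src & (src - 1)   (clear it)
+            e.g. src = 0b01100 gives 0b00100, 0b00111 and 0b01000
+            respectively. */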
8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp dst = newTemp(ty); + UChar rm = getUChar(delta); + + if (epartIsReg(rm)) { + assign( src, getIRegE(size,pfx,rm) ); + DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx)); + delta += alen; + } + + assign( dst, binop(mkSizedOp(ty,Iop_Xor8), + binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src), + mkU(ty, 1)), mkexpr(src)) ); + putIRegV( size, pfx, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_BLSMSK64 + : AMD64G_CC_OP_BLSMSK32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) ); + *uses_vvvv = True; + goto decode_success; + } + /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */ + /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ + && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp dst = newTemp(ty); + UChar rm = getUChar(delta); + + if (epartIsReg(rm)) { + assign( src, getIRegE(size,pfx,rm) ); + DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx)); + delta += alen; + } + + assign( dst, binop(mkSizedOp(ty,Iop_And8), + binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src), + mkU(ty, 1)), mkexpr(src)) ); + putIRegV( size, pfx, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_BLSR64 + : AMD64G_CC_OP_BLSR32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) ); + *uses_vvvv = True; + goto decode_success; + } + break; + + case 0xF5: + /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */ + /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 
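+         /* BZHI zeroes all bits of src1 from bit position 'start' upwards,
+            where 'start' is the low byte of the second operand; e.g. with
+            src1 = 0xFF and start = 4 the result is 0x0F.  CF is set when
+            start is >= the operand width, which is what 'cond' below
+            captures. */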
8 : 4; + IRType ty = szToITy(size); + IRTemp dst = newTemp(ty); + IRTemp src1 = newTemp(ty); + IRTemp src2 = newTemp(ty); + IRTemp start = newTemp(Ity_I8); + IRTemp cond = newTemp(Ity_I8); + UChar rm = getUChar(delta); + + assign( src2, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( src1, getIRegE(size,pfx,rm) ); + DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), + nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src1, loadLE(ty, mkexpr(addr)) ); + DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf, + nameIRegG(size,pfx,rm)); + delta += alen; + } + + assign( start, narrowTo( Ity_I8, mkexpr(src2) ) ); + assign( cond, binop(Iop_CmpLT32U, + unop(Iop_8Uto32, mkexpr(start)), + mkU32(8*size)) ); + /* if (start < opsize) { + if (start == 0) + dst = 0; + else + dst = (src1 << (opsize-start)) u>> (opsize-start); + } else { + dst = src1; + } */ + assign( dst, + IRExpr_ITE( + mkexpr(cond), + IRExpr_ITE( + binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)), + mkU(ty, 0), + binop( + mkSizedOp(ty,Iop_Shr8), + binop( + mkSizedOp(ty,Iop_Shl8), + mkexpr(src1), + binop(Iop_Sub8, mkU8(8*size), mkexpr(start)) + ), + binop(Iop_Sub8, mkU8(8*size), mkexpr(start)) + ) + ), + mkexpr(src1) + ) + ); + putIRegG( size, pfx, rm, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_BLSR64 + : AMD64G_CC_OP_BLSR32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) ); + *uses_vvvv = True; + goto decode_success; + } + /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */ + /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp mask = newTemp(ty); + UChar rm = getUChar(delta); + + assign( src, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( mask, getIRegE(size,pfx,rm) ); + DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( mask, loadLE(ty, mkexpr(addr)) ); + DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx), + nameIRegG(size,pfx,rm)); + delta += alen; + } + + IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)), + widenUto64(mkexpr(mask)) ); + putIRegG( size, pfx, rm, + narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/, + "amd64g_calculate_pdep", + &amd64g_calculate_pdep, args)) ); + *uses_vvvv = True; + /* Flags aren't modified. */ + goto decode_success; + } + /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */ + /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */ + if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 
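+         /* PEXT gathers the src bits selected by mask into the low-order bits
+            of the result; PDEP (decoded just above) is the inverse, scattering
+            the low-order bits of src into the positions of the set mask bits.
+            E.g. pdep(0b101, 0b11010) = 0b10010 and
+            pext(0b10010, 0b11010) = 0b101. */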
8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + IRTemp mask = newTemp(ty); + UChar rm = getUChar(delta); + + assign( src, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( mask, getIRegE(size,pfx,rm) ); + DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( mask, loadLE(ty, mkexpr(addr)) ); + DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx), + nameIRegG(size,pfx,rm)); + delta += alen; + } + + /* First mask off bits not set in mask, they are ignored + and it should be fine if they contain undefined values. */ + IRExpr* masked = binop(mkSizedOp(ty,Iop_And8), + mkexpr(src), mkexpr(mask)); + IRExpr** args = mkIRExprVec_2( widenUto64(masked), + widenUto64(mkexpr(mask)) ); + putIRegG( size, pfx, rm, + narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/, + "amd64g_calculate_pext", + &amd64g_calculate_pext, args)) ); + *uses_vvvv = True; + /* Flags aren't modified. */ + goto decode_success; + } + break; + + case 0xF6: + /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */ + /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src1 = newTemp(ty); + IRTemp src2 = newTemp(ty); + IRTemp res = newTemp(size == 8 ? Ity_I128 : Ity_I64); + UChar rm = getUChar(delta); + + assign( src1, getIRegRDX(size) ); + if (epartIsReg(rm)) { + assign( src2, getIRegE(size,pfx,rm) ); + DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm), + nameIRegV(size,pfx), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src2, loadLE(ty, mkexpr(addr)) ); + DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx), + nameIRegG(size,pfx,rm)); + delta += alen; + } + + assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32, + mkexpr(src1), mkexpr(src2)) ); + putIRegV( size, pfx, + unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) ); + putIRegG( size, pfx, rm, + unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32, + mkexpr(res)) ); + *uses_vvvv = True; + /* Flags aren't modified. */ + goto decode_success; + } + break; + + case 0xF7: + /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */ + /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */ + if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 ); + goto decode_success; + } + /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */ + /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */ + if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 ); + goto decode_success; + } + /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */ + /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 ); + goto decode_success; + } + /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */ + /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */ + if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 
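+         /* BEXTR extracts a bit field: start = src2[7:0], len = src2[15:8],
+            dst = (src1 >> start) & ((1 << len) - 1), with out-of-range start
+            or len handled as in the pseudo-code further down.  E.g. src1 =
+            0xABCD, start = 4, len = 8 gives 0xBC. */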
8 : 4; + IRType ty = szToITy(size); + IRTemp dst = newTemp(ty); + IRTemp src1 = newTemp(ty); + IRTemp src2 = newTemp(ty); + IRTemp stle = newTemp(Ity_I16); + IRTemp start = newTemp(Ity_I8); + IRTemp len = newTemp(Ity_I8); + UChar rm = getUChar(delta); + + assign( src2, getIRegV(size,pfx) ); + if (epartIsReg(rm)) { + assign( src1, getIRegE(size,pfx,rm) ); + DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), + nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm)); + delta++; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + assign( src1, loadLE(ty, mkexpr(addr)) ); + DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf, + nameIRegG(size,pfx,rm)); + delta += alen; + } + + assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) ); + assign( start, unop( Iop_16to8, mkexpr(stle) ) ); + assign( len, unop( Iop_16HIto8, mkexpr(stle) ) ); + /* if (start+len < opsize) { + if (len != 0) + dst = (src1 << (opsize-start-len)) u>> (opsize-len); + else + dst = 0; + } else { + if (start < opsize) + dst = src1 u>> start; + else + dst = 0; + } */ + assign( dst, + IRExpr_ITE( + binop(Iop_CmpLT32U, + binop(Iop_Add32, + unop(Iop_8Uto32, mkexpr(start)), + unop(Iop_8Uto32, mkexpr(len))), + mkU32(8*size)), + IRExpr_ITE( + binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)), + mkU(ty, 0), + binop(mkSizedOp(ty,Iop_Shr8), + binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1), + binop(Iop_Sub8, + binop(Iop_Sub8, mkU8(8*size), + mkexpr(start)), + mkexpr(len))), + binop(Iop_Sub8, mkU8(8*size), + mkexpr(len))) + ), + IRExpr_ITE( + binop(Iop_CmpLT32U, + unop(Iop_8Uto32, mkexpr(start)), + mkU32(8*size)), + binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1), + mkexpr(start)), + mkU(ty, 0) + ) + ) + ); + putIRegG( size, pfx, rm, mkexpr(dst) ); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(size == 8 + ? AMD64G_CC_OP_ANDN64 + : AMD64G_CC_OP_ANDN32)) ); + stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) ); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) ); + *uses_vvvv = True; + goto decode_success; + } + break; + + default: + break; + + } + + //decode_failure: + return deltaIN; + + decode_success: + return delta; } @@ -25460,7 +29016,132 @@ Long dis_ESC_0F3A__VEX ( delta++; *uses_vvvv = False; - switch (opc) { + switch (opc) { + + case 0x00: + case 0x01: + /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */ + /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/ + && 1==getRexW(pfx)/*W1*/) { + UChar modrm = getUChar(delta); + UInt imm8 = 0; + UInt rG = gregOfRexRM(pfx, modrm); + IRTemp sV = newTemp(Ity_V256); + const HChar *name = opc == 0 ? 
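+         /* Destination 64-bit lane i is source lane (imm8 >> (2*i)) & 3, as
+            selected by the Qop below; e.g. imm8 == 0x1B reverses the four
+            lanes. */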
"vpermq" : "vpermpd"; + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + imm8 = getUChar(delta); + DIP("%s $%u,%s,%s\n", + name, imm8, nameYMMReg(rE), nameYMMReg(rG)); + assign(sV, getYMMReg(rE)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + delta += alen; + imm8 = getUChar(delta); + DIP("%s $%u,%s,%s\n", + name, imm8, dis_buf, nameYMMReg(rG)); + assign(sV, loadLE(Ity_V256, mkexpr(addr))); + } + delta++; + IRTemp s[4]; + s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID; + breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]); + IRTemp dV = newTemp(Ity_V256); + assign(dV, IRExpr_Qop(Iop_64x4toV256, + mkexpr(s[(imm8 >> 6) & 3]), + mkexpr(s[(imm8 >> 4) & 3]), + mkexpr(s[(imm8 >> 2) & 3]), + mkexpr(s[(imm8 >> 0) & 3]))); + putYMMReg(rG, mkexpr(dV)); + goto decode_success; + } + break; + + case 0x02: + /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */ + if (have66noF2noF3(pfx) + && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt imm8 = 0; + UInt rG = gregOfRexRM(pfx, modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + UInt i; + IRTemp s[4], d[4]; + assign(sV, getXMMReg(rV)); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + imm8 = getUChar(delta); + DIP("vpblendd $%u,%s,%s,%s\n", + imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG)); + assign(dV, getXMMReg(rE)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + delta += alen; + imm8 = getUChar(delta); + DIP("vpblendd $%u,%s,%s,%s\n", + imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG)); + assign(dV, loadLE(Ity_V128, mkexpr(addr))); + } + delta++; + for (i = 0; i < 4; i++) { + s[i] = IRTemp_INVALID; + d[i] = IRTemp_INVALID; + } + breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] ); + breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] ); + for (i = 0; i < 4; i++) + putYMMRegLane32(rG, i, mkexpr((imm8 & (1<> 3) ), + mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) ); + *uses_vvvv = True; + goto decode_success; + } break; case 0x44: @@ -26524,6 +30388,52 @@ Long dis_ESC_0F3A__VEX ( } break; + case 0x46: + /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 46 /r ib */ + if (have66noF2noF3(pfx) + && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) { + UChar modrm = getUChar(delta); + UInt imm8 = 0; + UInt rG = gregOfRexRM(pfx, modrm); + UInt rV = getVexNvvvv(pfx); + IRTemp s00 = newTemp(Ity_V128); + IRTemp s01 = newTemp(Ity_V128); + IRTemp s10 = newTemp(Ity_V128); + IRTemp s11 = newTemp(Ity_V128); + assign(s00, getYMMRegLane128(rV, 0)); + assign(s01, getYMMRegLane128(rV, 1)); + if (epartIsReg(modrm)) { + UInt rE = eregOfRexRM(pfx, modrm); + delta += 1; + imm8 = getUChar(delta); + DIP("vperm2i128 $%u,%s,%s,%s\n", + imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG)); + assign(s10, getYMMRegLane128(rE, 0)); + assign(s11, getYMMRegLane128(rE, 1)); + } else { + addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 ); + delta += alen; + imm8 = getUChar(delta); + DIP("vperm2i128 $%u,%s,%s,%s\n", + imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG)); + assign(s10, loadLE(Ity_V128, binop(Iop_Add64, + mkexpr(addr), mkU64(0)))); + assign(s11, loadLE(Ity_V128, binop(Iop_Add64, + mkexpr(addr), mkU64(16)))); + } + delta++; +# define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \ + : ((_nn)==2) ? 
s10 : s11) + putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3))); + putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3))); +# undef SEL + if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0)); + if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0)); + *uses_vvvv = True; + goto decode_success; + } + break; + case 0x4A: /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4 ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */ @@ -26576,6 +30486,15 @@ Long dis_ESC_0F3A__VEX ( *uses_vvvv = True; goto decode_success; } + /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4 + ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */ + /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */ + if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) { + delta = dis_VBLENDV_256 ( vbi, pfx, delta, + "vpblendvb", 1, Iop_SarN8x16 ); + *uses_vvvv = True; + goto decode_success; + } break; case 0x60: @@ -26605,6 +30524,44 @@ Long dis_ESC_0F3A__VEX ( } break; + case 0xF0: + /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r /i */ + /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r /i */ + if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) { + Int size = getRexW(pfx) ? 8 : 4; + IRType ty = szToITy(size); + IRTemp src = newTemp(ty); + UChar rm = getUChar(delta); + UChar imm8; + + if (epartIsReg(rm)) { + imm8 = getUChar(delta+1); + assign( src, getIRegE(size,pfx,rm) ); + DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm), + nameIRegG(size,pfx,rm)); + delta += 2; + } else { + addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 ); + imm8 = getUChar(delta+alen); + assign( src, loadLE(ty, mkexpr(addr)) ); + DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm)); + delta += alen + 1; + } + imm8 &= 8*size-1; + + /* dst = (src >>u imm8) | (src << (size-imm8)) */ + putIRegG( size, pfx, rm, + imm8 == 0 ? mkexpr(src) + : binop( mkSizedOp(ty,Iop_Or8), + binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src), + mkU8(imm8) ), + binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src), + mkU8(8*size-imm8) ) ) ); + /* Flags aren't modified. */ + goto decode_success; + } + break; + default: break; diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index e7b878eb2f..9f2aa64fa2 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -43,6 +43,8 @@ #include "host_generic_regs.h" #include "host_generic_simd64.h" #include "host_generic_simd128.h" +#include "host_generic_simd256.h" +#include "host_generic_maddf.h" #include "host_amd64_defs.h" @@ -2531,6 +2533,73 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } + if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) { + /* Sigh ... very rough code. Could do much better. */ + /* Get the 128-bit literal 00---0 10---0 into a register + and xor it with the value to be negated. 
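+         The two pushes leave the 16-byte value { 0x0000000080000000, 0 } at
+         (%rsp); XORing that into the operand flips just bit 31, the F32 sign
+         bit, and leaves the rest of the register alone.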
*/ + HReg r1 = newVRegI(env); + HReg dst = newVRegV(env); + HReg tmp = newVRegV(env); + HReg src = iselFltExpr(env, e->Iex.Unop.arg); + AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); + addInstr(env, mk_vMOVsd_RR(src,tmp)); + addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0))); + addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 )); + addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1))); + addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0)); + addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst)); + add_to_rsp(env, 16); + return dst; + } + + if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) { + IRQop *qop = e->Iex.Qop.details; + HReg dst = newVRegV(env); + HReg argX = iselFltExpr(env, qop->arg2); + HReg argY = iselFltExpr(env, qop->arg3); + HReg argZ = iselFltExpr(env, qop->arg4); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + /* subq $16, %rsp -- make a space*/ + sub_from_rsp(env, 16); + /* Prepare 4 arg regs: + leaq 0(%rsp), %rdi + leaq 4(%rsp), %rsi + leaq 8(%rsp), %rdx + leaq 12(%rsp), %rcx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()), + hregAMD64_RDX())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()), + hregAMD64_RCX())); + /* Store the three args, at (%rsi), (%rdx) and (%rcx): + movss %argX, 0(%rsi) + movss %argY, 0(%rdx) + movss %argZ, 0(%rcx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX, + AMD64AMode_IR(0, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY, + AMD64AMode_IR(0, hregAMD64_RDX()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ, + AMD64AMode_IR(0, hregAMD64_RCX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, + (ULong)(HWord)h_generic_calc_MAddF32, + 4, RetLocNone )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. 
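+         (Here that just means 0(%rsp), the slot whose address was passed in
+         %rdi.)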
*/ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst, + AMD64AMode_IR(0, hregAMD64_RSP()))); + /* and finally, clear the space */ + add_to_rsp(env, 16); + return dst; + } + ppIRExpr(e); vpanic("iselFltExpr_wrk"); } @@ -2662,6 +2731,54 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e ) } } + if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) { + IRQop *qop = e->Iex.Qop.details; + HReg dst = newVRegV(env); + HReg argX = iselDblExpr(env, qop->arg2); + HReg argY = iselDblExpr(env, qop->arg3); + HReg argZ = iselDblExpr(env, qop->arg4); + /* XXXROUNDINGFIXME */ + /* set roundingmode here */ + /* subq $32, %rsp -- make a space*/ + sub_from_rsp(env, 32); + /* Prepare 4 arg regs: + leaq 0(%rsp), %rdi + leaq 8(%rsp), %rsi + leaq 16(%rsp), %rdx + leaq 24(%rsp), %rcx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()), + hregAMD64_RDX())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()), + hregAMD64_RCX())); + /* Store the three args, at (%rsi), (%rdx) and (%rcx): + movsd %argX, 0(%rsi) + movsd %argY, 0(%rdx) + movsd %argZ, 0(%rcx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX, + AMD64AMode_IR(0, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY, + AMD64AMode_IR(0, hregAMD64_RDX()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ, + AMD64AMode_IR(0, hregAMD64_RCX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, + (ULong)(HWord)h_generic_calc_MAddF64, + 4, RetLocNone )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. 
*/ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst, + AMD64AMode_IR(0, hregAMD64_RSP()))); + /* and finally, clear the space */ + add_to_rsp(env, 32); + return dst; + } + if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg arg = iselDblExpr(env, e->Iex.Binop.arg2); @@ -3478,6 +3595,7 @@ static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, ISelEnv* env, IRExpr* e ) { + HWord fn = 0; /* address of helper fn, if required */ vassert(e); IRType ty = typeOfIRExpr(env->type_env,e); vassert(ty == Ity_V256); @@ -3599,6 +3717,8 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, } case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector; + case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector; + case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector; do_CmpNEZ_vector: { HReg argHi, argLo; @@ -3673,6 +3793,37 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, case Iop_AndV256: op = Asse_AND; goto do_SseReRg; case Iop_OrV256: op = Asse_OR; goto do_SseReRg; case Iop_XorV256: op = Asse_XOR; goto do_SseReRg; + case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg; + case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg; + case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg; + case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg; + case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg; + case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg; + case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg; + case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg; + case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg; + case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg; + case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg; + case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg; + case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg; + case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg; + case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg; + case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg; + case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg; + case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg; + case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg; + case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg; + case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg; + case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg; + case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg; + case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg; + case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg; + case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg; + case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg; + case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg; + case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg; + case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg; + case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg; do_SseReRg: { HReg argLhi, argLlo, argRhi, argRlo; @@ -3689,12 +3840,198 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, return; } + case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift; + case Iop_ShlN32x8: op = Asse_SHL32; goto do_SseShift; + case Iop_ShlN64x4: op = Asse_SHL64; goto do_SseShift; + case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift; + case Iop_SarN32x8: op = Asse_SAR32; goto do_SseShift; + case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift; + case Iop_ShrN32x8: op = Asse_SHR32; goto do_SseShift; + case 
Iop_ShrN64x4: op = Asse_SHR64; goto do_SseShift; + do_SseShift: { + HReg gregHi, gregLo; + iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1); + AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2); + AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); + HReg ereg = newVRegV(env); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0))); + addInstr(env, AMD64Instr_Push(rmi)); + addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0)); + addInstr(env, mk_vMOVsd_RR(gregHi, dstHi)); + addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi)); + addInstr(env, mk_vMOVsd_RR(gregLo, dstLo)); + addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo)); + add_to_rsp(env, 16); + *rHi = dstHi; + *rLo = dstLo; + return; + } + case Iop_V128HLtoV256: { *rHi = iselVecExpr(env, e->Iex.Binop.arg1); *rLo = iselVecExpr(env, e->Iex.Binop.arg2); return; } + case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4; + goto do_SseAssistedBinary; + case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4; + goto do_SseAssistedBinary; + case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4; + goto do_SseAssistedBinary; + case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4; + goto do_SseAssistedBinary; + case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4; + goto do_SseAssistedBinary; + case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8; + goto do_SseAssistedBinary; + case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8; + goto do_SseAssistedBinary; + case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16; + goto do_SseAssistedBinary; + case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16; + goto do_SseAssistedBinary; + case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2; + goto do_SseAssistedBinary; + case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2; + goto do_SseAssistedBinary; + do_SseAssistedBinary: { + /* RRRufff! RRRufff code is what we're generating here. Oh + well. 
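+         A V256 guest value is carried as two V128 halves, and these ops have
+         no SSE2 equivalent, so the existing 128-bit helper is called twice
+         through a 16-aligned scratch area on the stack: once for the two high
+         halves (result at 0(%r_argp)) and once for the two low halves (result
+         at 48(%r_argp)).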
*/ + vassert(fn != 0); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + HReg argLhi, argLlo, argRhi, argRlo; + iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); + iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); + HReg argp = newVRegI(env); + /* subq $160, %rsp -- make a space*/ + sub_from_rsp(env, 160); + /* leaq 48(%rsp), %r_argp -- point into it */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), + argp)); + /* andq $-16, %r_argp -- 16-align the pointer */ + addInstr(env, AMD64Instr_Alu64R(Aalu_AND, + AMD64RMI_Imm( ~(UInt)15 ), + argp)); + /* Prepare 3 arg regs: + leaq 0(%r_argp), %rdi + leaq 16(%r_argp), %rsi + leaq 32(%r_argp), %rdx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp), + hregAMD64_RDX())); + /* Store the two high args, at (%rsi) and (%rdx): + movupd %argLhi, 0(%rsi) + movupd %argRhi, 0(%rdx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi, + AMD64AMode_IR(0, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi, + AMD64AMode_IR(0, hregAMD64_RDX()))); + /* Store the two low args, at 48(%rsi) and 48(%rdx): + movupd %argLlo, 48(%rsi) + movupd %argRlo, 48(%rdx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo, + AMD64AMode_IR(48, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo, + AMD64AMode_IR(48, hregAMD64_RDX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone )); + /* Prepare 3 arg regs: + leaq 48(%r_argp), %rdi + leaq 64(%r_argp), %rsi + leaq 80(%r_argp), %rdx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp), + hregAMD64_RDX())); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. */ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi, + AMD64AMode_IR(0, argp))); + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo, + AMD64AMode_IR(48, argp))); + /* and finally, clear the space */ + add_to_rsp(env, 160); + *rHi = dstHi; + *rLo = dstLo; + return; + } + + case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8; + goto do_SseAssistedBinary256; + do_SseAssistedBinary256: { + /* RRRufff! RRRufff code is what we're generating here. Oh + well. 
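+         Unlike the case above, Perm32x8 needs to see whole 256-bit operands,
+         so both halves of each argument are stored contiguously (32 bytes
+         apiece) and a single helper call produces the full result, read back
+         from 0(%r_argp) and 16(%r_argp).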
*/ + vassert(fn != 0); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + HReg argLhi, argLlo, argRhi, argRlo; + iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); + iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); + HReg argp = newVRegI(env); + /* subq $160, %rsp -- make a space*/ + sub_from_rsp(env, 160); + /* leaq 48(%rsp), %r_argp -- point into it */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), + argp)); + /* andq $-16, %r_argp -- 16-align the pointer */ + addInstr(env, AMD64Instr_Alu64R(Aalu_AND, + AMD64RMI_Imm( ~(UInt)15 ), + argp)); + /* Prepare 3 arg regs: + leaq 0(%r_argp), %rdi + leaq 32(%r_argp), %rsi + leaq 64(%r_argp), %rdx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp), + hregAMD64_RDX())); + /* Store the two args, at (%rsi) and (%rdx): + movupd %argLlo, 0(%rsi) + movupd %argLhi, 16(%rsi) + movupd %argRlo, 0(%rdx) + movupd %argRhi, 16(%rdx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo, + AMD64AMode_IR(0, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi, + AMD64AMode_IR(16, hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo, + AMD64AMode_IR(0, hregAMD64_RDX()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi, + AMD64AMode_IR(16, hregAMD64_RDX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, RetLocNone )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. */ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo, + AMD64AMode_IR(0, argp))); + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi, + AMD64AMode_IR(16, argp))); + /* and finally, clear the space */ + add_to_rsp(env, 160); + *rHi = dstHi; + *rLo = dstLo; + return; + } + default: break; } /* switch (e->Iex.Binop.op) */ @@ -3725,6 +4062,22 @@ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, return; } + if (e->tag == Iex_ITE) { + HReg r1Hi, r1Lo, r0Hi, r0Lo; + iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue); + iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse); + HReg dstHi = newVRegV(env); + HReg dstLo = newVRegV(env); + addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi)); + addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo)); + AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond); + addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi)); + addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo)); + *rHi = dstHi; + *rLo = dstLo; + return; + } + //avx_fail: vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n", LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps)); @@ -4303,7 +4656,9 @@ HInstrArray* iselSB_AMD64 ( IRSB* bb, | VEX_HWCAPS_AMD64_CX16 | VEX_HWCAPS_AMD64_LZCNT | VEX_HWCAPS_AMD64_AVX - | VEX_HWCAPS_AMD64_RDTSCP))); + | VEX_HWCAPS_AMD64_RDTSCP + | VEX_HWCAPS_AMD64_BMI + | VEX_HWCAPS_AMD64_AVX2))); /* Make up an initial environment to use. 
*/ env = LibVEX_Alloc(sizeof(ISelEnv)); diff --git a/VEX/priv/host_generic_maddf.c b/VEX/priv/host_generic_maddf.c new file mode 100644 index 0000000000..d4e9fb7d60 --- /dev/null +++ b/VEX/priv/host_generic_maddf.c @@ -0,0 +1,320 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_maddf.c ---*/ +/*---------------------------------------------------------------*/ + +/* + Compute x * y + z as ternary operation. + Copyright (C) 2010-2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek , 2010. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . +*/ + +/* Generic helper functions for doing FMA, i.e. compute x * y + z + as ternary operation. + These are purely back-end entities and cannot be seen/referenced + from IR. */ + +#include "libvex_basictypes.h" +#include "host_generic_maddf.h" +#include "main_util.h" + +/* This implementation relies on Double being more than twice as + precise as Float and uses rounding to odd in order to avoid problems + with double rounding. + See a paper by Boldo and Melquiond: + http://www.lri.fr/~melquion/doc/08-tc.pdf */ + +#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X)) + +#if defined(__x86_64__) && defined(__SSE2_MATH__) +# define ENV_TYPE unsigned int +/* Save current rounding mode into ENV, hold exceptions, set rounding + mode to rounding toward zero. */ +# define ROUNDTOZERO(env) \ + do { \ + unsigned int mxcsr; \ + __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \ + (env) = mxcsr; \ + mxcsr = (mxcsr | 0x7f80) & ~0x3f; \ + __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\ + } while (0) +/* Restore exceptions from ENV, return if inexact exception has been raised + since ROUNDTOZERO. */ +# define RESET_TESTINEXACT(env) \ + ({ \ + unsigned int mxcsr, ret; \ + __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \ + ret = (mxcsr >> 5) & 1; \ + mxcsr = (mxcsr & 0x3d) | (env); \ + __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\ + ret; \ + }) +/* Return if inexact exception has been raised since ROUNDTOZERO. */ +# define TESTINEXACT() \ + ({ \ + unsigned int mxcsr; \ + __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \ + (mxcsr >> 5) & 1; \ + }) +#endif + +#define DBL_MANT_DIG 53 +#define IEEE754_DOUBLE_BIAS 0x3ff + +union vg_ieee754_double { + Double d; + + /* This is the IEEE 754 double-precision format. */ + struct { +#ifdef VKI_BIG_ENDIAN + unsigned int negative:1; + unsigned int exponent:11; + unsigned int mantissa0:20; + unsigned int mantissa1:32; +#else + unsigned int mantissa1:32; + unsigned int mantissa0:20; + unsigned int exponent:11; + unsigned int negative:1; +#endif + } ieee; +}; + +void VEX_REGPARM(3) + h_generic_calc_MAddF32 ( /*OUT*/Float* res, + Float* argX, Float* argY, Float* argZ ) +{ +#ifndef ENV_TYPE + /* Lame fallback implementation. */ + *res = *argX * *argY + *argZ; +#else + ENV_TYPE env; + /* Multiplication is always exact. 
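+      Each Float operand carries a 24-bit significand, so the product
+      needs at most 48 bits and fits exactly in a Double's 53-bit
+      significand.  Only the addition below can be inexact: it is done
+      with rounding toward zero, and if it raises the inexact flag while
+      the result's last significand bit is even, that bit is set to 1
+      ("round to odd"), which keeps the final conversion back to Float
+      free of double rounding.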
*/ + Double temp = (Double) *argX * (Double) *argY; + union vg_ieee754_double u; + + ROUNDTOZERO (env); + + /* Perform addition with round to odd. */ + u.d = temp + (Double) *argZ; + /* Ensure the addition is not scheduled after fetestexcept call. */ + FORCE_EVAL (u.d); + + /* Reset rounding mode and test for inexact simultaneously. */ + int j = RESET_TESTINEXACT (env); + + if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) + u.ieee.mantissa1 |= j; + + /* And finally truncation with round to nearest. */ + *res = (Float) u.d; +#endif +} + + +void VEX_REGPARM(3) + h_generic_calc_MAddF64 ( /*OUT*/Double* res, + Double* argX, Double* argY, Double* argZ ) +{ +#ifndef ENV_TYPE + /* Lame fallback implementation. */ + *res = *argX * *argY + *argZ; +#else + Double x = *argX, y = *argY, z = *argZ; + union vg_ieee754_double u, v, w; + int adjust = 0; + u.d = x; + v.d = y; + w.d = z; + if (UNLIKELY (u.ieee.exponent + v.ieee.exponent + >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) + || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) + || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) + || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) + || UNLIKELY (u.ieee.exponent + v.ieee.exponent + <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) { + /* If z is Inf, but x and y are finite, the result should be + z rather than NaN. */ + if (w.ieee.exponent == 0x7ff + && u.ieee.exponent != 0x7ff + && v.ieee.exponent != 0x7ff) { + *res = (z + x) + y; + return; + } + /* If x or y or z is Inf/NaN, or if fma will certainly overflow, + or if x * y is less than half of DBL_DENORM_MIN, + compute as x * y + z. */ + if (u.ieee.exponent == 0x7ff + || v.ieee.exponent == 0x7ff + || w.ieee.exponent == 0x7ff + || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS + || u.ieee.exponent + v.ieee.exponent + < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) { + *res = x * y + z; + return; + } + if (u.ieee.exponent + v.ieee.exponent + >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) { + /* Compute 1p-53 times smaller result and multiply + at the end. */ + if (u.ieee.exponent > v.ieee.exponent) + u.ieee.exponent -= DBL_MANT_DIG; + else + v.ieee.exponent -= DBL_MANT_DIG; + /* If x + y exponent is very large and z exponent is very small, + it doesn't matter if we don't adjust it. */ + if (w.ieee.exponent > DBL_MANT_DIG) + w.ieee.exponent -= DBL_MANT_DIG; + adjust = 1; + } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) { + /* Similarly. + If z exponent is very large and x and y exponents are + very small, it doesn't matter if we don't adjust it. 
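+              As in the branch above, the inputs are scaled down by
+              2^DBL_MANT_DIG so that the intermediate sum stays in
+              range, and 'adjust' records that the final result has to
+              be multiplied back up by 0x1p53.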
*/ + if (u.ieee.exponent > v.ieee.exponent) { + if (u.ieee.exponent > DBL_MANT_DIG) + u.ieee.exponent -= DBL_MANT_DIG; + } else if (v.ieee.exponent > DBL_MANT_DIG) + v.ieee.exponent -= DBL_MANT_DIG; + w.ieee.exponent -= DBL_MANT_DIG; + adjust = 1; + } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) { + u.ieee.exponent -= DBL_MANT_DIG; + if (v.ieee.exponent) + v.ieee.exponent += DBL_MANT_DIG; + else + v.d *= 0x1p53; + } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) { + v.ieee.exponent -= DBL_MANT_DIG; + if (u.ieee.exponent) + u.ieee.exponent += DBL_MANT_DIG; + else + u.d *= 0x1p53; + } else /* if (u.ieee.exponent + v.ieee.exponent + <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ { + if (u.ieee.exponent > v.ieee.exponent) + u.ieee.exponent += 2 * DBL_MANT_DIG; + else + v.ieee.exponent += 2 * DBL_MANT_DIG; + if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) { + if (w.ieee.exponent) + w.ieee.exponent += 2 * DBL_MANT_DIG; + else + w.d *= 0x1p106; + adjust = -1; + } + /* Otherwise x * y should just affect inexact + and nothing else. */ + } + x = u.d; + y = v.d; + z = w.d; + } + /* Multiplication m1 + m2 = x * y using Dekker's algorithm. */ +# define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1) + Double x1 = x * C; + Double y1 = y * C; + Double m1 = x * y; + x1 = (x - x1) + x1; + y1 = (y - y1) + y1; + Double x2 = x - x1; + Double y2 = y - y1; + Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2; +# undef C + + /* Addition a1 + a2 = z + m1 using Knuth's algorithm. */ + Double a1 = z + m1; + Double t1 = a1 - z; + Double t2 = a1 - t1; + t1 = m1 - t1; + t2 = z - t2; + Double a2 = t1 + t2; + + ENV_TYPE env; + ROUNDTOZERO (env); + + /* Perform m2 + a2 addition with round to odd. */ + u.d = a2 + m2; + + if (UNLIKELY (adjust < 0)) { + if ((u.ieee.mantissa1 & 1) == 0) + u.ieee.mantissa1 |= TESTINEXACT (); + v.d = a1 + u.d; + /* Ensure the addition is not scheduled after fetestexcept call. */ + FORCE_EVAL (v.d); + } + + /* Reset rounding mode and test for inexact simultaneously. */ + int j = RESET_TESTINEXACT (env) != 0; + + if (LIKELY (adjust == 0)) { + if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) + u.ieee.mantissa1 |= j; + /* Result is a1 + u.d. */ + *res = a1 + u.d; + } else if (LIKELY (adjust > 0)) { + if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff) + u.ieee.mantissa1 |= j; + /* Result is a1 + u.d, scaled up. */ + *res = (a1 + u.d) * 0x1p53; + } else { + /* If a1 + u.d is exact, the only rounding happens during + scaling down. */ + if (j == 0) { + *res = v.d * 0x1p-106; + return; + } + /* If result rounded to zero is not subnormal, no double + rounding will occur. */ + if (v.ieee.exponent > 106) { + *res = (a1 + u.d) * 0x1p-106; + return; + } + /* If v.d * 0x1p-106 with round to zero is a subnormal above + or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa + down just by 1 bit, which means v.ieee.mantissa1 |= j would + change the round bit, not sticky or guard bit. + v.d * 0x1p-106 never normalizes by shifting up, + so round bit plus sticky bit should be already enough + for proper rounding. */ + if (v.ieee.exponent == 106) { + /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding, + v.ieee.mantissa1 & 1 is the round bit and j is our sticky + bit. In round-to-nearest 001 rounds down like 00, + 011 rounds up, even though 01 rounds down (thus we need + to adjust), 101 rounds down like 10 and 111 rounds up + like 11. 
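+              Since the j == 0 case has already returned above, the
+              sticky bit is known to be set here, so the only pattern
+              that needs fixing is '01'; the nudge by one subnormal ulp
+              (0x1p-1074) below supplies the missing round-up.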
*/ + if ((v.ieee.mantissa1 & 3) == 1) { + v.d *= 0x1p-106; + if (v.ieee.negative) + *res = v.d - 0x1p-1074; + else + *res = v.d + 0x1p-1074; + } else + *res = v.d * 0x1p-106; + return; + } + v.ieee.mantissa1 |= j; + *res = v.d * 0x1p-106; + return; + } +#endif +} + +/*---------------------------------------------------------------*/ +/*--- end host_generic_maddf.c --*/ +/*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_generic_maddf.h b/VEX/priv/host_generic_maddf.h new file mode 100644 index 0000000000..6757f74544 --- /dev/null +++ b/VEX/priv/host_generic_maddf.h @@ -0,0 +1,48 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_maddf.h ---*/ +/*---------------------------------------------------------------*/ + +/* + Compute x * y + z as ternary operation. + Copyright (C) 2010-2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek , 2010. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . +*/ + +/* Generic helper functions for doing FMA, i.e. compute x * y + z + as ternary operation. + These are purely back-end entities and cannot be seen/referenced + from IR. */ + +#ifndef __VEX_HOST_GENERIC_MADDF_H +#define __VEX_HOST_GENERIC_MADDF_H + +#include "libvex_basictypes.h" + +extern VEX_REGPARM(3) + void h_generic_calc_MAddF32 ( /*OUT*/Float*, Float*, Float*, Float* ); + +extern VEX_REGPARM(3) + void h_generic_calc_MAddF64 ( /*OUT*/Double*, Double*, Double*, + Double* ); + +#endif /* ndef __VEX_HOST_GENERIC_MADDF_H */ + +/*---------------------------------------------------------------*/ +/*--- end host_generic_maddf.h --*/ +/*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_generic_simd256.c b/VEX/priv/host_generic_simd256.c new file mode 100644 index 0000000000..93990d22d0 --- /dev/null +++ b/VEX/priv/host_generic_simd256.c @@ -0,0 +1,57 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_simd256.c ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2012 OpenWorks GbR + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +/* Generic helper functions for doing 256-bit SIMD arithmetic in cases + where the instruction selectors cannot generate code in-line. + These are purely back-end entities and cannot be seen/referenced + from IR. */ + +#include "libvex_basictypes.h" +#include "host_generic_simd256.h" + + +void VEX_REGPARM(3) + h_generic_calc_Perm32x8 ( /*OUT*/V256* res, + V256* argL, V256* argR ) +{ + res->w32[0] = argL->w32[ argR->w32[0] & 7 ]; + res->w32[1] = argL->w32[ argR->w32[1] & 7 ]; + res->w32[2] = argL->w32[ argR->w32[2] & 7 ]; + res->w32[3] = argL->w32[ argR->w32[3] & 7 ]; + res->w32[4] = argL->w32[ argR->w32[4] & 7 ]; + res->w32[5] = argL->w32[ argR->w32[5] & 7 ]; + res->w32[6] = argL->w32[ argR->w32[6] & 7 ]; + res->w32[7] = argL->w32[ argR->w32[7] & 7 ]; +} + + +/*---------------------------------------------------------------*/ +/*--- end host_generic_simd256.c ---*/ +/*---------------------------------------------------------------*/ diff --git a/VEX/priv/host_generic_simd256.h b/VEX/priv/host_generic_simd256.h new file mode 100644 index 0000000000..1254316f1d --- /dev/null +++ b/VEX/priv/host_generic_simd256.h @@ -0,0 +1,55 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_simd256.h ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2012 OpenWorks GbR + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +/* Generic helper functions for doing 256-bit SIMD arithmetic in cases + where the instruction selectors cannot generate code in-line. + These are purely back-end entities and cannot be seen/referenced + as clean helper functions from IR. + + These will get called from generated code and therefore should be + well behaved -- no floating point or mmx insns, just straight + integer code. + + Each function implements the correspondingly-named IR primop. 
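+
+   A minimal usage sketch (hypothetical standalone caller, not part of
+   VEX; it assumes only the V256 union from libvex_basictypes.h and the
+   declaration below):
+
+      V256 a, b, r;
+      UInt i;
+      for (i = 0; i < 8; i++) { a.w32[i] = 100 + i; b.w32[i] = 7 - i; }
+      h_generic_calc_Perm32x8(&r, &a, &b);
+
+   Afterwards r.w32[] holds {107,106,105,104,103,102,101,100}: result
+   lane i selects lane (argR.w32[i] & 7) of argL.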
+*/ + +#ifndef __VEX_HOST_GENERIC_SIMD256_H +#define __VEX_HOST_GENERIC_SIMD256_H + +#include "libvex_basictypes.h" + +extern VEX_REGPARM(3) + void h_generic_calc_Perm32x8 ( /*OUT*/V256*, V256*, V256* ); + +#endif /* ndef __VEX_HOST_GENERIC_SIMD256_H */ + +/*---------------------------------------------------------------*/ +/*--- end host_generic_simd256.h ---*/ +/*---------------------------------------------------------------*/ diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 39be267f0f..c3f7bfd593 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -1036,6 +1036,68 @@ void ppIROp ( IROp op ) case Iop_NotV256: vex_printf("NotV256"); return; case Iop_CmpNEZ64x4: vex_printf("CmpNEZ64x4"); return; case Iop_CmpNEZ32x8: vex_printf("CmpNEZ32x8"); return; + case Iop_CmpNEZ16x16: vex_printf("CmpNEZ16x16"); return; + case Iop_CmpNEZ8x32: vex_printf("CmpNEZ8x32"); return; + + case Iop_Add8x32: vex_printf("Add8x32"); return; + case Iop_Add16x16: vex_printf("Add16x16"); return; + case Iop_Add32x8: vex_printf("Add32x8"); return; + case Iop_Add64x4: vex_printf("Add64x4"); return; + case Iop_Sub8x32: vex_printf("Sub8x32"); return; + case Iop_Sub16x16: vex_printf("Sub16x16"); return; + case Iop_Sub32x8: vex_printf("Sub32x8"); return; + case Iop_Sub64x4: vex_printf("Sub64x4"); return; + case Iop_QAdd8Ux32: vex_printf("QAdd8Ux32"); return; + case Iop_QAdd16Ux16: vex_printf("QAdd16Ux16"); return; + case Iop_QAdd8Sx32: vex_printf("QAdd8Sx32"); return; + case Iop_QAdd16Sx16: vex_printf("QAdd16Sx16"); return; + case Iop_QSub8Ux32: vex_printf("QSub8Ux32"); return; + case Iop_QSub16Ux16: vex_printf("QSub16Ux16"); return; + case Iop_QSub8Sx32: vex_printf("QSub8Sx32"); return; + case Iop_QSub16Sx16: vex_printf("QSub16Sx16"); return; + + case Iop_Mul16x16: vex_printf("Mul16x16"); return; + case Iop_Mul32x8: vex_printf("Mul32x8"); return; + case Iop_MulHi16Ux16: vex_printf("MulHi16Ux16"); return; + case Iop_MulHi16Sx16: vex_printf("MulHi16Sx16"); return; + + case Iop_Avg8Ux32: vex_printf("Avg8Ux32"); return; + case Iop_Avg16Ux16: vex_printf("Avg16Ux16"); return; + + case Iop_Max8Sx32: vex_printf("Max8Sx32"); return; + case Iop_Max16Sx16: vex_printf("Max16Sx16"); return; + case Iop_Max32Sx8: vex_printf("Max32Sx8"); return; + case Iop_Max8Ux32: vex_printf("Max8Ux32"); return; + case Iop_Max16Ux16: vex_printf("Max16Ux16"); return; + case Iop_Max32Ux8: vex_printf("Max32Ux8"); return; + + case Iop_Min8Sx32: vex_printf("Min8Sx32"); return; + case Iop_Min16Sx16: vex_printf("Min16Sx16"); return; + case Iop_Min32Sx8: vex_printf("Min32Sx8"); return; + case Iop_Min8Ux32: vex_printf("Min8Ux32"); return; + case Iop_Min16Ux16: vex_printf("Min16Ux16"); return; + case Iop_Min32Ux8: vex_printf("Min32Ux8"); return; + + case Iop_CmpEQ8x32: vex_printf("CmpEQ8x32"); return; + case Iop_CmpEQ16x16: vex_printf("CmpEQ16x16"); return; + case Iop_CmpEQ32x8: vex_printf("CmpEQ32x8"); return; + case Iop_CmpEQ64x4: vex_printf("CmpEQ64x4"); return; + case Iop_CmpGT8Sx32: vex_printf("CmpGT8Sx32"); return; + case Iop_CmpGT16Sx16: vex_printf("CmpGT16Sx16"); return; + case Iop_CmpGT32Sx8: vex_printf("CmpGT32Sx8"); return; + case Iop_CmpGT64Sx4: vex_printf("CmpGT64Sx4"); return; + + case Iop_ShlN16x16: vex_printf("ShlN16x16"); return; + case Iop_ShlN32x8: vex_printf("ShlN32x8"); return; + case Iop_ShlN64x4: vex_printf("ShlN64x4"); return; + case Iop_ShrN16x16: vex_printf("ShrN16x16"); return; + case Iop_ShrN32x8: vex_printf("ShrN32x8"); return; + case Iop_ShrN64x4: vex_printf("ShrN64x4"); return; + case Iop_SarN16x16: 
vex_printf("SarN16x16"); return; + case Iop_SarN32x8: vex_printf("SarN32x8"); return; + + case Iop_Perm32x8: vex_printf("Perm32x8"); return; + default: vpanic("ppIROp(1)"); } @@ -3001,6 +3063,26 @@ void typeOfPrimop ( IROp op, case Iop_XorV256: case Iop_Max32Fx8: case Iop_Min32Fx8: case Iop_Max64Fx4: case Iop_Min64Fx4: + case Iop_Add8x32: case Iop_Add16x16: + case Iop_Add32x8: case Iop_Add64x4: + case Iop_Sub8x32: case Iop_Sub16x16: + case Iop_Sub32x8: case Iop_Sub64x4: + case Iop_Mul16x16: case Iop_Mul32x8: + case Iop_MulHi16Ux16: case Iop_MulHi16Sx16: + case Iop_Avg8Ux32: case Iop_Avg16Ux16: + case Iop_Max8Sx32: case Iop_Max16Sx16: case Iop_Max32Sx8: + case Iop_Max8Ux32: case Iop_Max16Ux16: case Iop_Max32Ux8: + case Iop_Min8Sx32: case Iop_Min16Sx16: case Iop_Min32Sx8: + case Iop_Min8Ux32: case Iop_Min16Ux16: case Iop_Min32Ux8: + case Iop_CmpEQ8x32: case Iop_CmpEQ16x16: + case Iop_CmpEQ32x8: case Iop_CmpEQ64x4: + case Iop_CmpGT8Sx32: case Iop_CmpGT16Sx16: + case Iop_CmpGT32Sx8: case Iop_CmpGT64Sx4: + case Iop_QAdd8Ux32: case Iop_QAdd16Ux16: + case Iop_QAdd8Sx32: case Iop_QAdd16Sx16: + case Iop_QSub8Ux32: case Iop_QSub16Ux16: + case Iop_QSub8Sx32: case Iop_QSub16Sx16: + case Iop_Perm32x8: BINARY(Ity_V256,Ity_V256, Ity_V256); case Iop_V256toV128_1: case Iop_V256toV128_0: @@ -3014,9 +3096,17 @@ void typeOfPrimop ( IROp op, case Iop_Sqrt32Fx8: case Iop_Sqrt64Fx4: case Iop_Recip32Fx8: + case Iop_CmpNEZ8x32: case Iop_CmpNEZ16x16: case Iop_CmpNEZ64x4: case Iop_CmpNEZ32x8: UNARY(Ity_V256, Ity_V256); + case Iop_ShlN16x16: case Iop_ShlN32x8: + case Iop_ShlN64x4: + case Iop_ShrN16x16: case Iop_ShrN32x8: + case Iop_ShrN64x4: + case Iop_SarN16x16: case Iop_SarN32x8: + BINARY(Ity_V256,Ity_I8, Ity_V256); + default: ppIROp(op); vpanic("typeOfPrimop"); diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c index aa45d11efb..149b651690 100644 --- a/VEX/priv/main_main.c +++ b/VEX/priv/main_main.c @@ -1208,11 +1208,16 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps ) orthogonal. */ /* Throw out obviously stupid cases: */ - /* AVX without SSE3 */ Bool have_sse3 = (hwcaps & VEX_HWCAPS_AMD64_SSE3) != 0; Bool have_avx = (hwcaps & VEX_HWCAPS_AMD64_AVX) != 0; + Bool have_bmi = (hwcaps & VEX_HWCAPS_AMD64_BMI) != 0; + Bool have_avx2 = (hwcaps & VEX_HWCAPS_AMD64_AVX2) != 0; + /* AVX without SSE3 */ if (have_avx && !have_sse3) return NULL; + /* AVX2 or BMI without AVX */ + if ((have_avx2 || have_bmi) && !have_avx) + return NULL; /* This isn't threadsafe. We might need to fix it at some point. */ static HChar buf[100] = { 0 }; @@ -1243,6 +1248,12 @@ static const HChar* show_hwcaps_amd64 ( UInt hwcaps ) if (hwcaps & VEX_HWCAPS_AMD64_AVX) { p = p + vex_sprintf(p, "%s", "-avx"); } + if (hwcaps & VEX_HWCAPS_AMD64_AVX2) { + p = p + vex_sprintf(p, "%s", "-avx2"); + } + if (hwcaps & VEX_HWCAPS_AMD64_BMI) { + p = p + vex_sprintf(p, "%s", "-bmi"); + } out: vassert(buf[sizeof(buf)-1] == 0); diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h index b1061fc9a6..5a67349df2 100644 --- a/VEX/pub/libvex.h +++ b/VEX/pub/libvex.h @@ -79,11 +79,13 @@ typedef /* amd64: baseline capability is SSE2, with cmpxchg8b but not cmpxchg16b. 
*/ -#define VEX_HWCAPS_AMD64_SSE3 (1<<5) /* SSE3 support */ -#define VEX_HWCAPS_AMD64_CX16 (1<<6) /* cmpxchg16b support */ -#define VEX_HWCAPS_AMD64_LZCNT (1<<7) /* SSE4a LZCNT insn */ -#define VEX_HWCAPS_AMD64_AVX (1<<8) /* AVX instructions */ -#define VEX_HWCAPS_AMD64_RDTSCP (1<<9) /* RDTSCP instruction */ +#define VEX_HWCAPS_AMD64_SSE3 (1<<5) /* SSE3 support */ +#define VEX_HWCAPS_AMD64_CX16 (1<<6) /* cmpxchg16b support */ +#define VEX_HWCAPS_AMD64_LZCNT (1<<7) /* SSE4a LZCNT insn */ +#define VEX_HWCAPS_AMD64_AVX (1<<8) /* AVX instructions */ +#define VEX_HWCAPS_AMD64_RDTSCP (1<<9) /* RDTSCP instruction */ +#define VEX_HWCAPS_AMD64_BMI (1<<10) /* BMI1 instructions */ +#define VEX_HWCAPS_AMD64_AVX2 (1<<11) /* AVX2 instructions */ /* ppc32: baseline capability is integer only */ #define VEX_HWCAPS_PPC32_F (1<<8) /* basic (non-optional) FP */ diff --git a/VEX/pub/libvex_basictypes.h b/VEX/pub/libvex_basictypes.h index 5335e2d053..1d08206855 100644 --- a/VEX/pub/libvex_basictypes.h +++ b/VEX/pub/libvex_basictypes.h @@ -75,6 +75,16 @@ typedef } V128; +/* A union for doing 256-bit vector primitives conveniently. */ +typedef + union { + UChar w8[32]; + UShort w16[16]; + UInt w32[8]; + ULong w64[4]; + } + V256; + /* Floating point. */ typedef float Float; /* IEEE754 single-precision (32-bit) value */ typedef double Double; /* IEEE754 double-precision (64-bit) value */ diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 627ffd7cf2..00a463a353 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1514,7 +1514,34 @@ typedef Iop_NotV256, /* MISC (vector integer cmp != 0) */ - Iop_CmpNEZ32x8, Iop_CmpNEZ64x4, + Iop_CmpNEZ8x32, Iop_CmpNEZ16x16, Iop_CmpNEZ32x8, Iop_CmpNEZ64x4, + + Iop_Add8x32, Iop_Add16x16, Iop_Add32x8, Iop_Add64x4, + Iop_Sub8x32, Iop_Sub16x16, Iop_Sub32x8, Iop_Sub64x4, + + Iop_CmpEQ8x32, Iop_CmpEQ16x16, Iop_CmpEQ32x8, Iop_CmpEQ64x4, + Iop_CmpGT8Sx32, Iop_CmpGT16Sx16, Iop_CmpGT32Sx8, Iop_CmpGT64Sx4, + + Iop_ShlN16x16, Iop_ShlN32x8, Iop_ShlN64x4, + Iop_ShrN16x16, Iop_ShrN32x8, Iop_ShrN64x4, + Iop_SarN16x16, Iop_SarN32x8, + + Iop_Max8Sx32, Iop_Max16Sx16, Iop_Max32Sx8, + Iop_Max8Ux32, Iop_Max16Ux16, Iop_Max32Ux8, + Iop_Min8Sx32, Iop_Min16Sx16, Iop_Min32Sx8, + Iop_Min8Ux32, Iop_Min16Ux16, Iop_Min32Ux8, + + Iop_Mul16x16, Iop_Mul32x8, + Iop_MulHi16Ux16, Iop_MulHi16Sx16, + + Iop_QAdd8Ux32, Iop_QAdd16Ux16, + Iop_QAdd8Sx32, Iop_QAdd16Sx16, + Iop_QSub8Ux32, Iop_QSub16Ux16, + Iop_QSub8Sx32, Iop_QSub16Sx16, + + Iop_Avg8Ux32, Iop_Avg16Ux16, + + Iop_Perm32x8, /* ------------------ 256-bit SIMD FP. ------------------ */ Iop_Add64Fx4,