From: Julian Seward Date: Wed, 6 Feb 2008 11:42:45 +0000 (+0000) Subject: Add SSSE3 support. Currently only for 64-bit. TODO: X-Git-Tag: svn/VALGRIND_3_4_1^2~43 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8ddb62873b02fce10061acf4f8540b1cf7c8c5eb;p=thirdparty%2Fvalgrind.git Add SSSE3 support. Currently only for 64-bit. TODO: * Check through IR generation * For 128-bit variants accessing memory, generate an exception if effective address is not 128-bit aligned * Change CPUID output to be Core-2 like * Enable for 32-bit code too. * Make Memcheck handle the new IROps * Commit test cases git-svn-id: svn://svn.valgrind.org/vex/trunk@1808 --- diff --git a/VEX/priv/guest-amd64/toIR.c b/VEX/priv/guest-amd64/toIR.c index 0003976228..5c8570d80b 100644 --- a/VEX/priv/guest-amd64/toIR.c +++ b/VEX/priv/guest-amd64/toIR.c @@ -8309,6 +8309,165 @@ static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2, } +/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit + values (aa,bb), computes, for each of the 4 16-bit lanes: + + (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1 +*/ +static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp bb = newTemp(Ity_I64); + IRTemp aahi32s = newTemp(Ity_I64); + IRTemp aalo32s = newTemp(Ity_I64); + IRTemp bbhi32s = newTemp(Ity_I64); + IRTemp bblo32s = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp one32x2 = newTemp(Ity_I64); + assign(aa, aax); + assign(bb, bbx); + assign( aahi32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)), + mkU8(16) )); + assign( aalo32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)), + mkU8(16) )); + assign( bbhi32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)), + mkU8(16) )); + assign( bblo32s, + binop(Iop_SarN32x2, + binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)), + mkU8(16) )); + assign(one32x2, mkU64( (1ULL << 32) + 1 )); + assign( + rHi, + binop( + Iop_ShrN32x2, + binop( + Iop_Add32x2, + binop( + Iop_ShrN32x2, + binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)), + mkU8(14) + ), + mkexpr(one32x2) + ), + mkU8(1) + ) + ); + assign( + rLo, + binop( + Iop_ShrN32x2, + binop( + Iop_Add32x2, + binop( + Iop_ShrN32x2, + binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)), + mkU8(14) + ), + mkexpr(one32x2) + ), + mkU8(1) + ) + ); + return + binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo)); +} + +/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns. 
Given two 64-bit + values (aa,bb), computes, for each lane: + + if aa_lane < 0 then - bb_lane + else if aa_lane > 0 then bb_lane + else 0 +*/ +static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp bb = newTemp(Ity_I64); + IRTemp zero = newTemp(Ity_I64); + IRTemp bbNeg = newTemp(Ity_I64); + IRTemp negMask = newTemp(Ity_I64); + IRTemp posMask = newTemp(Ity_I64); + IROp opSub = Iop_INVALID; + IROp opCmpGTS = Iop_INVALID; + + switch (laneszB) { + case 1: opSub = Iop_Sub8x8; opCmpGTS = Iop_CmpGT8Sx8; break; + case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break; + case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break; + default: vassert(0); + } + + assign( aa, aax ); + assign( bb, bbx ); + assign( zero, mkU64(0) ); + assign( bbNeg, binop(opSub, mkexpr(zero), mkexpr(bb)) ); + assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) ); + assign( posMask, binop(opCmpGTS, mkexpr(aa), mkexpr(zero)) ); + + return + binop(Iop_Or64, + binop(Iop_And64, mkexpr(bb), mkexpr(posMask)), + binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) ); + +} + +/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns. Given a 64-bit + value aa, computes, for each lane + + if aa < 0 then -aa else aa + + Note that the result is interpreted as unsigned, so that the + absolute value of the most negative signed input can be + represented. +*/ +static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB ) +{ + IRTemp aa = newTemp(Ity_I64); + IRTemp zero = newTemp(Ity_I64); + IRTemp aaNeg = newTemp(Ity_I64); + IRTemp negMask = newTemp(Ity_I64); + IRTemp posMask = newTemp(Ity_I64); + IROp opSub = Iop_INVALID; + IROp opSarN = Iop_INVALID; + + switch (laneszB) { + case 1: opSub = Iop_Sub8x8; opSarN = Iop_SarN8x8; break; + case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break; + case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break; + default: vassert(0); + } + + assign( aa, aax ); + assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) ); + assign( posMask, unop(Iop_Not64, mkexpr(negMask)) ); + assign( zero, mkU64(0) ); + assign( aaNeg, binop(opSub, mkexpr(zero), mkexpr(aa)) ); + return + binop(Iop_Or64, + binop(Iop_And64, mkexpr(aa), mkexpr(posMask)), + binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ); +} + +static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64, + IRTemp lo64, Long byteShift ) +{ + vassert(byteShift >= 1 && byteShift <= 7); + return + binop(Iop_Or64, + binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))), + binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift)) + ); +} + + /* Helper for deciding whether a given insn (starting at the opcode byte) may validly be used with a LOCK prefix. The following insns may be used with LOCK when their destination operand is in memory. @@ -12455,6 +12614,830 @@ DisResult disInstr_AMD64_WRK ( /* --- end of the SSE3 decoder. --- */ /* ---------------------------------------------------- */ + /* ---------------------------------------------------- */ + /* --- start of the SSSE3 decoder. 
--- */ + /* ---------------------------------------------------- */ + + /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and + Unsigned Bytes (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + IRTemp sVoddsSX = newTemp(Ity_I64); + IRTemp sVevensSX = newTemp(Ity_I64); + IRTemp dVoddsZX = newTemp(Ity_I64); + IRTemp dVevensZX = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmaddubsw %s,%s\n", dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + /* compute dV unsigned x sV signed */ + assign( sVoddsSX, + binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) ); + assign( sVevensSX, + binop(Iop_SarN16x4, + binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)), + mkU8(8)) ); + assign( dVoddsZX, + binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) ); + assign( dVevensZX, + binop(Iop_ShrN16x4, + binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)), + mkU8(8)) ); + + putMMXReg( + gregLO3ofRM(modrm), + binop(Iop_QAdd16Sx4, + binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), + binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX)) + ) + ); + goto decode_success; + } + + /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and + Unsigned Bytes (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sVoddsSX = newTemp(Ity_V128); + IRTemp sVevensSX = newTemp(Ity_V128); + IRTemp dVoddsZX = newTemp(Ity_V128); + IRTemp dVevensZX = newTemp(Ity_V128); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + /* FIXME: generate trap if addr is not 16-aligned */ + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pmaddubsw %s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + /* compute dV unsigned x sV signed */ + assign( sVoddsSX, + binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) ); + assign( sVevensSX, + binop(Iop_SarN16x8, + binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)), + mkU8(8)) ); + assign( dVoddsZX, + binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) ); + assign( dVevensZX, + binop(Iop_ShrN16x8, + binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)), + mkU8(8)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_QAdd16Sx8, + binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), + binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX)) + ) + ); + goto decode_success; + } + + /* ***--- these are MMX class insns introduced in SSSE3 ---*** */ + /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or + mmx) and G to G (mmx). */ + /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or + mmx) and G to G (mmx). */ + /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G + to G (mmx). 
*/ + /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G + to G (mmx). */ + /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G + to G (mmx). */ + /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G + to G (mmx). */ + + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01 + || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) { + HChar* str = "???"; + IROp opV64 = Iop_INVALID; + IROp opCatO = Iop_CatOddLanes16x4; + IROp opCatE = Iop_CatEvenLanes16x4; + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + + switch (insn[2]) { + case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break; + case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break; + case 0x01: opV64 = Iop_Add16x4; str = "addw"; break; + case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break; + case 0x02: opV64 = Iop_Add32x2; str = "addd"; break; + case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break; + default: vassert(0); + } + if (insn[2] == 0x02 || insn[2] == 0x06) { + opCatO = Iop_InterleaveHI32x2; + opCatE = Iop_InterleaveLO32x2; + } + + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("ph%s %s,%s\n", str, dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + binop(opV64, + binop(opCatE,mkexpr(sV),mkexpr(dV)), + binop(opCatO,mkexpr(sV),mkexpr(dV)) + ) + ); + goto decode_success; + } + + /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or + xmm) and G to G (xmm). */ + /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or + xmm) and G to G (xmm). */ + /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and + G to G (xmm). */ + /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and + G to G (xmm). 
*/
+
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
+           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
+      HChar* str = "???";
+      IROp opV64 = Iop_INVALID;
+      IROp opCatO = Iop_CatOddLanes16x4;
+      IROp opCatE = Iop_CatEvenLanes16x4;
+      IRTemp sV = newTemp(Ity_V128);
+      IRTemp dV = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      IRTemp dHi = newTemp(Ity_I64);
+      IRTemp dLo = newTemp(Ity_I64);
+
+      modrm = insn[3];
+
+      switch (insn[2]) {
+         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
+         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
+         case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
+         case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
+         case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
+         case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
+         default: vassert(0);
+      }
+      if (insn[2] == 0x02 || insn[2] == 0x06) {
+         opCatO = Iop_InterleaveHI32x2;
+         opCatE = Iop_InterleaveLO32x2;
+      }
+
+      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+         DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
+         delta += 3+1;
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         DIP("ph%s %s,%s\n", str, dis_buf,
+                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
+         delta += 3+alen;
+      }
+
+      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+      assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      /* This isn't a particularly efficient way to compute the
+         result, but at least it avoids a proliferation of IROps,
+         hence avoids complicating all the backends.
*/
+      putXMMReg(
+         gregOfRexRM(pfx,modrm),
+         binop(Iop_64HLtoV128,
+               binop(opV64,
+                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
+                     binop(opCatO,mkexpr(sHi),mkexpr(sLo))
+               ),
+               binop(opV64,
+                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
+                     binop(opCatO,mkexpr(dHi),mkexpr(dLo))
+               )
+         )
+      );
+      goto decode_success;
+   }
+
+   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
+      (MMX) */
+   if (haveNo66noF2noF3(pfx)
+       && sz == 4
+       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+      IRTemp sV = newTemp(Ity_I64);
+      IRTemp dV = newTemp(Ity_I64);
+
+      modrm = insn[3];
+      do_MMX_preamble();
+      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+         delta += 3+1;
+         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+                                 nameMMXReg(gregLO3ofRM(modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("pmulhrsw %s,%s\n", dis_buf,
+                                 nameMMXReg(gregLO3ofRM(modrm)));
+      }
+
+      putMMXReg(
+         gregLO3ofRM(modrm),
+         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
+      );
+      goto decode_success;
+   }
+
+   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
+      Scale (XMM) */
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+      IRTemp sV = newTemp(Ity_V128);
+      IRTemp dV = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      IRTemp dHi = newTemp(Ity_I64);
+      IRTemp dLo = newTemp(Ity_I64);
+
+      modrm = insn[3];
+      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+         delta += 3+1;
+         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("pmulhrsw %s,%s\n", dis_buf,
+                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
+      }
+
+      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+      assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      putXMMReg(
+         gregOfRexRM(pfx,modrm),
+         binop(Iop_64HLtoV128,
+               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
+               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
+         )
+      );
+      goto decode_success;
+   }
+
+   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8 (MMX) */
+   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
+   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
+   if (haveNo66noF2noF3(pfx)
+       && sz == 4
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+      IRTemp sV = newTemp(Ity_I64);
+      IRTemp dV = newTemp(Ity_I64);
+      HChar* str = "???";
+      Int laneszB = 0;
+
+      switch (insn[2]) {
+         case 0x08: laneszB = 1; str = "b"; break;
+         case 0x09: laneszB = 2; str = "w"; break;
+         case 0x0A: laneszB = 4; str = "d"; break;
+         default: vassert(0);
+      }
+
+      modrm = insn[3];
+      do_MMX_preamble();
+      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+         delta += 3+1;
+         DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+                                     nameMMXReg(gregLO3ofRM(modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("psign%s %s,%s\n", str, dis_buf,
+
nameMMXReg(gregLO3ofRM(modrm)));
+      }
+
+      putMMXReg(
+         gregLO3ofRM(modrm),
+         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
+      );
+      goto decode_success;
+   }
+
+   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
+   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
+   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+      IRTemp sV = newTemp(Ity_V128);
+      IRTemp dV = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      IRTemp dHi = newTemp(Ity_I64);
+      IRTemp dLo = newTemp(Ity_I64);
+      HChar* str = "???";
+      Int laneszB = 0;
+
+      switch (insn[2]) {
+         case 0x08: laneszB = 1; str = "b"; break;
+         case 0x09: laneszB = 2; str = "w"; break;
+         case 0x0A: laneszB = 4; str = "d"; break;
+         default: vassert(0);
+      }
+
+      modrm = insn[3];
+      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+         delta += 3+1;
+         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("psign%s %s,%s\n", str, dis_buf,
+                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
+      }
+
+      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+      assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      putXMMReg(
+         gregOfRexRM(pfx,modrm),
+         binop(Iop_64HLtoV128,
+               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
+               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
+         )
+      );
+      goto decode_success;
+   }
+
+   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8 (MMX) */
+   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
+   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
+   if (haveNo66noF2noF3(pfx)
+       && sz == 4
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+      IRTemp sV = newTemp(Ity_I64);
+      HChar* str = "???";
+      Int laneszB = 0;
+
+      switch (insn[2]) {
+         case 0x1C: laneszB = 1; str = "b"; break;
+         case 0x1D: laneszB = 2; str = "w"; break;
+         case 0x1E: laneszB = 4; str = "d"; break;
+         default: vassert(0);
+      }
+
+      modrm = insn[3];
+      do_MMX_preamble();
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+         delta += 3+1;
+         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+                                    nameMMXReg(gregLO3ofRM(modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("pabs%s %s,%s\n", str, dis_buf,
+                                    nameMMXReg(gregLO3ofRM(modrm)));
+      }
+
+      putMMXReg(
+         gregLO3ofRM(modrm),
+         dis_PABS_helper( mkexpr(sV), laneszB )
+      );
+      goto decode_success;
+   }
+
+   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
+   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
+   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+      IRTemp sV = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      HChar* str = "???";
+      Int laneszB = 0;
+
+      switch (insn[2]) {
+
case 0x1C: laneszB = 1; str = "b"; break;
+         case 0x1D: laneszB = 2; str = "w"; break;
+         case 0x1E: laneszB = 4; str = "d"; break;
+         default: vassert(0);
+      }
+
+      modrm = insn[3];
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+         delta += 3+1;
+         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         delta += 3+alen;
+         DIP("pabs%s %s,%s\n", str, dis_buf,
+                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
+      }
+
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      putXMMReg(
+         gregOfRexRM(pfx,modrm),
+         binop(Iop_64HLtoV128,
+               dis_PABS_helper( mkexpr(sHi), laneszB ),
+               dis_PABS_helper( mkexpr(sLo), laneszB )
+         )
+      );
+      goto decode_success;
+   }
+
+   /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
+   if (haveNo66noF2noF3(pfx) && sz == 4
+       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+      IRTemp sV  = newTemp(Ity_I64);
+      IRTemp dV  = newTemp(Ity_I64);
+      IRTemp res = newTemp(Ity_I64);
+
+      modrm = insn[3];
+      do_MMX_preamble();
+      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+         d64 = (Long)insn[3+1];
+         delta += 3+1+1;
+         DIP("palignr $%d,%s,%s\n", (Int)d64,
+             nameMMXReg(eregLO3ofRM(modrm)),
+             nameMMXReg(gregLO3ofRM(modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+         d64 = (Long)insn[3+alen];
+         delta += 3+alen+1;
+         DIP("palignr $%d,%s,%s\n", (Int)d64,
+             dis_buf,
+             nameMMXReg(gregLO3ofRM(modrm)));
+      }
+
+      if (d64 == 0) {
+         assign( res, mkexpr(sV) );
+      }
+      else if (d64 >= 1 && d64 <= 7) {
+         assign(res,
+                binop(Iop_Or64,
+                      binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
+                      binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
+                )));
+      }
+      else if (d64 == 8) {
+         assign( res, mkexpr(dV) );
+      }
+      else if (d64 >= 9 && d64 <= 15) {
+         assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
+      }
+      else if (d64 >= 16 && d64 <= 255) {
+         assign( res, mkU64(0) );
+      }
+      else
+         vassert(0);
+
+      putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
+      goto decode_success;
+   }
+
+   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
+   if (have66noF2noF3(pfx)
+       && (sz == 2 || /*redundant REX.W*/ sz == 8)
+       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+      IRTemp sV  = newTemp(Ity_V128);
+      IRTemp dV  = newTemp(Ity_V128);
+      IRTemp sHi = newTemp(Ity_I64);
+      IRTemp sLo = newTemp(Ity_I64);
+      IRTemp dHi = newTemp(Ity_I64);
+      IRTemp dLo = newTemp(Ity_I64);
+      IRTemp rHi = newTemp(Ity_I64);
+      IRTemp rLo = newTemp(Ity_I64);
+
+      modrm = insn[3];
+      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+      if (epartIsReg(modrm)) {
+         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+         d64 = (Long)insn[3+1];
+         delta += 3+1+1;
+         DIP("palignr $%d,%s,%s\n", (Int)d64,
+             nameXMMReg(eregOfRexRM(pfx,modrm)),
+             nameXMMReg(gregOfRexRM(pfx,modrm)));
+      } else {
+         addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 );
+         /* FIXME: generate trap if addr is not 16-aligned */
+         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+         d64 = (Long)insn[3+alen];
+         delta += 3+alen+1;
+         DIP("palignr $%d,%s,%s\n", (Int)d64,
+             dis_buf,
+             nameXMMReg(gregOfRexRM(pfx,modrm)));
+      }
+
+      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+      assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+      assign( sLo,
unop(Iop_V128to64, mkexpr(sV)) ); + + if (d64 == 0) { + assign( rHi, mkexpr(sHi) ); + assign( rLo, mkexpr(sLo) ); + } + else if (d64 >= 1 && d64 <= 7) { + assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d64) ); + assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d64) ); + } + else if (d64 == 8) { + assign( rHi, mkexpr(dLo) ); + assign( rLo, mkexpr(sHi) ); + } + else if (d64 >= 9 && d64 <= 15) { + assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d64-8) ); + assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d64-8) ); + } + else if (d64 == 16) { + assign( rHi, mkexpr(dHi) ); + assign( rLo, mkexpr(dLo) ); + } + else if (d64 >= 17 && d64 <= 23) { + assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-16))) ); + assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d64-16) ); + } + else if (d64 == 24) { + assign( rHi, mkU64(0) ); + assign( rLo, mkexpr(dHi) ); + } + else if (d64 >= 25 && d64 <= 31) { + assign( rHi, mkU64(0) ); + assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-24))) ); + } + else if (d64 >= 32 && d64 <= 255) { + assign( rHi, mkU64(0) ); + assign( rLo, mkU64(0) ); + } + else + vassert(0); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) + ); + goto decode_success; + } + + /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */ + if (haveNo66noF2noF3(pfx) + && sz == 4 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) { + IRTemp sV = newTemp(Ity_I64); + IRTemp dV = newTemp(Ity_I64); + + modrm = insn[3]; + do_MMX_preamble(); + assign( dV, getMMXReg(gregLO3ofRM(modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getMMXReg(eregLO3ofRM(modrm)) ); + delta += 3+1; + DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)), + nameMMXReg(gregLO3ofRM(modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + assign( sV, loadLE(Ity_I64, mkexpr(addr)) ); + delta += 3+alen; + DIP("pshufb %s,%s\n", dis_buf, + nameMMXReg(gregLO3ofRM(modrm))); + } + + putMMXReg( + gregLO3ofRM(modrm), + binop( + Iop_And64, + /* permute the lanes */ + binop( + Iop_Perm8x8, + mkexpr(dV), + binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL)) + ), + /* mask off lanes which have (index & 0x80) == 0x80 */ + unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7))) + ) + ); + goto decode_success; + } + + /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */ + if (have66noF2noF3(pfx) + && (sz == 2 || /*redundant REX.W*/ sz == 8) + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) { + IRTemp sV = newTemp(Ity_V128); + IRTemp dV = newTemp(Ity_V128); + IRTemp sHi = newTemp(Ity_I64); + IRTemp sLo = newTemp(Ity_I64); + IRTemp dHi = newTemp(Ity_I64); + IRTemp dLo = newTemp(Ity_I64); + IRTemp rHi = newTemp(Ity_I64); + IRTemp rLo = newTemp(Ity_I64); + IRTemp sevens = newTemp(Ity_I64); + IRTemp mask0x80hi = newTemp(Ity_I64); + IRTemp mask0x80lo = newTemp(Ity_I64); + IRTemp maskBit3hi = newTemp(Ity_I64); + IRTemp maskBit3lo = newTemp(Ity_I64); + IRTemp sAnd7hi = newTemp(Ity_I64); + IRTemp sAnd7lo = newTemp(Ity_I64); + IRTemp permdHi = newTemp(Ity_I64); + IRTemp permdLo = newTemp(Ity_I64); + + modrm = insn[3]; + assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) ); + + if (epartIsReg(modrm)) { + assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) ); + delta += 3+1; + DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)), + nameXMMReg(gregOfRexRM(pfx,modrm))); + } else { + addr = disAMode ( &alen, pfx, delta+3, dis_buf, 0 ); + /* FIXME: generate trap if addr is not 16-aligned */ + assign( sV, loadLE(Ity_V128, mkexpr(addr)) ); + delta += 3+alen; + DIP("pshufb 
%s,%s\n", dis_buf, + nameXMMReg(gregOfRexRM(pfx,modrm))); + } + + assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) ); + assign( dLo, unop(Iop_V128to64, mkexpr(dV)) ); + assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) ); + assign( sLo, unop(Iop_V128to64, mkexpr(sV)) ); + + assign( sevens, mkU64(0x0707070707070707ULL) ); + + /* + mask0x80hi = Not(SarN8x8(sHi,7)) + maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7) + sAnd7hi = And(sHi,sevens) + permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi), + And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) ) + rHi = And(permdHi,mask0x80hi) + */ + assign( + mask0x80hi, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7)))); + + assign( + maskBit3hi, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)), + mkU8(7))); + + assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens))); + + assign( + permdHi, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)), + mkexpr(maskBit3hi)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)), + unop(Iop_Not64,mkexpr(maskBit3hi))) )); + + assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) ); + + /* And the same for the lower half of the result. What fun. */ + + assign( + mask0x80lo, + unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7)))); + + assign( + maskBit3lo, + binop(Iop_SarN8x8, + binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)), + mkU8(7))); + + assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens))); + + assign( + permdLo, + binop( + Iop_Or64, + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)), + mkexpr(maskBit3lo)), + binop(Iop_And64, + binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)), + unop(Iop_Not64,mkexpr(maskBit3lo))) )); + + assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) ); + + putXMMReg( + gregOfRexRM(pfx,modrm), + binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)) + ); + goto decode_success; + } + + /* ---------------------------------------------------- */ + /* --- end of the SSSE3 decoder. --- */ + /* ---------------------------------------------------- */ + /*after_sse_decoders:*/ /* Get the primary opcode. */ @@ -14699,11 +15682,13 @@ DisResult disInstr_AMD64_WRK ( decode_failure: /* All decode failures end up here. */ vex_printf("vex amd64->IR: unhandled instruction bytes: " - "0x%x 0x%x 0x%x 0x%x\n", + "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", (Int)getUChar(delta_start+0), (Int)getUChar(delta_start+1), (Int)getUChar(delta_start+2), - (Int)getUChar(delta_start+3) ); + (Int)getUChar(delta_start+3), + (Int)getUChar(delta_start+4), + (Int)getUChar(delta_start+5) ); /* Tell the dispatcher that this insn cannot be decoded, and so has not been executed, and (is currently) the next to be executed. 
diff --git a/VEX/priv/host-amd64/isel.c b/VEX/priv/host-amd64/isel.c index c00dbdab91..d49031e7c7 100644 --- a/VEX/priv/host-amd64/isel.c +++ b/VEX/priv/host-amd64/isel.c @@ -1038,6 +1038,12 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) fn = (HWord)h_generic_calc_InterleaveHI32x2; break; case Iop_InterleaveLO32x2: fn = (HWord)h_generic_calc_InterleaveLO32x2; break; + case Iop_CatOddLanes16x4: + fn = (HWord)h_generic_calc_CatOddLanes16x4; break; + case Iop_CatEvenLanes16x4: + fn = (HWord)h_generic_calc_CatEvenLanes16x4; break; + case Iop_Perm8x8: + fn = (HWord)h_generic_calc_Perm8x8; break; case Iop_Max8Ux8: fn = (HWord)h_generic_calc_Max8Ux8; break; @@ -1050,6 +1056,8 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) case Iop_Mul16x4: fn = (HWord)h_generic_calc_Mul16x4; break; + case Iop_Mul32x2: + fn = (HWord)h_generic_calc_Mul32x2; break; case Iop_MulHi16Sx4: fn = (HWord)h_generic_calc_MulHi16Sx4; break; case Iop_MulHi16Ux4: @@ -1095,6 +1103,10 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) fn = (HWord)h_generic_calc_ShlN16x4; second_is_UInt = True; break; + case Iop_ShlN8x8: + fn = (HWord)h_generic_calc_ShlN8x8; + second_is_UInt = True; + break; case Iop_ShrN32x2: fn = (HWord)h_generic_calc_ShrN32x2; second_is_UInt = True; diff --git a/VEX/priv/host-generic/h_generic_simd64.c b/VEX/priv/host-generic/h_generic_simd64.c index b271c0fea1..252fd4814d 100644 --- a/VEX/priv/host-generic/h_generic_simd64.c +++ b/VEX/priv/host-generic/h_generic_simd64.c @@ -142,6 +142,11 @@ static inline UChar sel8x8_0 ( ULong w64 ) { return toUChar(0xFF & (lo32 >> 0)); } +static inline UChar index8x8 ( ULong w64, UChar ix ) { + ix &= 7; + return toUChar((w64 >> (8*ix)) & 0xFF); +} + /* Scalar helpers. */ @@ -213,6 +218,12 @@ static inline Short mul16 ( Short xx, Short yy ) return (Short)t; } +static inline Int mul32 ( Int xx, Int yy ) +{ + Int t = ((Int)xx) * ((Int)yy); + return (Int)t; +} + static inline Short mulhi16S ( Short xx, Short yy ) { Int t = ((Int)xx) * ((Int)yy); @@ -299,6 +310,11 @@ static inline UChar qnarrow16Uto8 ( UShort xx0 ) /* shifts: we don't care about out-of-range ones, since that is dealt with at a higher level. 
*/ +static inline UChar shl8 ( UChar v, UInt n ) +{ + return toUChar(v << n); +} + static inline UChar sar8 ( UChar v, UInt n ) { return toUChar(((Char)v) >> n); @@ -555,6 +571,14 @@ ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy ) ); } +ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy ) +{ + return mk32x2( + mul32( sel32x2_1(xx), sel32x2_1(yy) ), + mul32( sel32x2_0(xx), sel32x2_0(yy) ) + ); +} + ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy ) { return mk16x4( @@ -799,6 +823,42 @@ ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb ) ); } +/* ------------ Concatenation ------------ */ + +ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb ) +{ + return mk16x4( + sel16x4_3(aa), + sel16x4_1(aa), + sel16x4_3(bb), + sel16x4_1(bb) + ); +} + +ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb ) +{ + return mk16x4( + sel16x4_2(aa), + sel16x4_0(aa), + sel16x4_2(bb), + sel16x4_0(bb) + ); +} + +/* misc hack looking for a proper home */ +ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb ) +{ + return mk8x8( + index8x8(aa, sel8x8_7(bb)), + index8x8(aa, sel8x8_6(bb)), + index8x8(aa, sel8x8_5(bb)), + index8x8(aa, sel8x8_4(bb)), + index8x8(aa, sel8x8_3(bb)), + index8x8(aa, sel8x8_2(bb)), + index8x8(aa, sel8x8_1(bb)), + index8x8(aa, sel8x8_0(bb)) + ); +} /* ------------ Shifting ------------ */ /* Note that because these primops are undefined if the shift amount @@ -829,6 +889,22 @@ ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn ) ); } +ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn ) +{ + /* vassert(nn < 8); */ + nn &= 7; + return mk8x8( + shl8( sel8x8_7(xx), nn ), + shl8( sel8x8_6(xx), nn ), + shl8( sel8x8_5(xx), nn ), + shl8( sel8x8_4(xx), nn ), + shl8( sel8x8_3(xx), nn ), + shl8( sel8x8_2(xx), nn ), + shl8( sel8x8_1(xx), nn ), + shl8( sel8x8_0(xx), nn ) + ); +} + ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn ) { /* vassert(nn < 32); */ diff --git a/VEX/priv/host-generic/h_generic_simd64.h b/VEX/priv/host-generic/h_generic_simd64.h index eb00e690cf..e7d4c6927e 100644 --- a/VEX/priv/host-generic/h_generic_simd64.h +++ b/VEX/priv/host-generic/h_generic_simd64.h @@ -83,6 +83,7 @@ extern ULong h_generic_calc_QSub16Ux4 ( ULong, ULong ); extern ULong h_generic_calc_QSub8Ux8 ( ULong, ULong ); extern ULong h_generic_calc_Mul16x4 ( ULong, ULong ); +extern ULong h_generic_calc_Mul32x2 ( ULong, ULong ); extern ULong h_generic_calc_MulHi16Sx4 ( ULong, ULong ); extern ULong h_generic_calc_MulHi16Ux4 ( ULong, ULong ); @@ -108,6 +109,11 @@ extern ULong h_generic_calc_InterleaveLO16x4 ( ULong, ULong ); extern ULong h_generic_calc_InterleaveHI32x2 ( ULong, ULong ); extern ULong h_generic_calc_InterleaveLO32x2 ( ULong, ULong ); +extern ULong h_generic_calc_CatOddLanes16x4 ( ULong, ULong ); +extern ULong h_generic_calc_CatEvenLanes16x4 ( ULong, ULong ); +extern ULong h_generic_calc_Perm8x8 ( ULong, ULong ); + +extern ULong h_generic_calc_ShlN8x8 ( ULong, UInt ); extern ULong h_generic_calc_ShlN16x4 ( ULong, UInt ); extern ULong h_generic_calc_ShlN32x2 ( ULong, UInt ); diff --git a/VEX/priv/ir/irdefs.c b/VEX/priv/ir/irdefs.c index f4d80b88ed..af949c560e 100644 --- a/VEX/priv/ir/irdefs.c +++ b/VEX/priv/ir/irdefs.c @@ -326,6 +326,7 @@ void ppIROp ( IROp op ) case Iop_QSub8Sx8: vex_printf("QSub8Sx8"); return; case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return; case Iop_Mul16x4: vex_printf("Mul16x4"); return; + case Iop_Mul32x2: vex_printf("Mul32x2"); return; case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return; case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return; case Iop_Avg8Ux8: 
vex_printf("Avg8Ux8"); return; @@ -340,6 +341,7 @@ void ppIROp ( IROp op ) case Iop_CmpGT8Sx8: vex_printf("CmpGT8Sx8"); return; case Iop_CmpGT16Sx4: vex_printf("CmpGT16Sx4"); return; case Iop_CmpGT32Sx2: vex_printf("CmpGT32Sx2"); return; + case Iop_ShlN8x8: vex_printf("ShlN8x8"); return; case Iop_ShlN16x4: vex_printf("ShlN16x4"); return; case Iop_ShlN32x2: vex_printf("ShlN32x2"); return; case Iop_ShrN16x4: vex_printf("ShrN16x4"); return; @@ -356,6 +358,9 @@ void ppIROp ( IROp op ) case Iop_InterleaveLO8x8: vex_printf("InterleaveLO8x8"); return; case Iop_InterleaveLO16x4: vex_printf("InterleaveLO16x4"); return; case Iop_InterleaveLO32x2: vex_printf("InterleaveLO32x2"); return; + case Iop_CatOddLanes16x4: vex_printf("CatOddLanes16x4"); return; + case Iop_CatEvenLanes16x4: vex_printf("CatEvenLanes16x4"); return; + case Iop_Perm8x8: vex_printf("Iop_Perm8x8"); return; case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return; case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return; @@ -1506,9 +1511,12 @@ void typeOfPrimop ( IROp op, case Iop_InterleaveHI8x8: case Iop_InterleaveLO8x8: case Iop_InterleaveHI16x4: case Iop_InterleaveLO16x4: case Iop_InterleaveHI32x2: case Iop_InterleaveLO32x2: + case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4: + case Iop_Perm8x8: case Iop_Max8Ux8: case Iop_Max16Sx4: case Iop_Min8Ux8: case Iop_Min16Sx4: - case Iop_Mul16x4: case Iop_MulHi16Sx4: case Iop_MulHi16Ux4: + case Iop_Mul16x4: case Iop_Mul32x2: + case Iop_MulHi16Sx4: case Iop_MulHi16Ux4: case Iop_QAdd8Sx8: case Iop_QAdd16Sx4: case Iop_QAdd8Ux8: case Iop_QAdd16Ux4: case Iop_QNarrow32Sx2: @@ -1518,7 +1526,7 @@ void typeOfPrimop ( IROp op, case Iop_QSub8Ux8: case Iop_QSub16Ux4: BINARY(Ity_I64,Ity_I64, Ity_I64); - case Iop_ShlN32x2: case Iop_ShlN16x4: + case Iop_ShlN32x2: case Iop_ShlN16x4: case Iop_ShlN8x8: case Iop_ShrN32x2: case Iop_ShrN16x4: case Iop_SarN32x2: case Iop_SarN16x4: case Iop_SarN8x8: BINARY(Ity_I64,Ity_I8, Ity_I64); diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 056dd23dbb..4f8ef30771 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -658,7 +658,7 @@ typedef Iop_QSub8Sx8, Iop_QSub16Sx4, /* MULTIPLICATION (normal / high half of signed/unsigned) */ - Iop_Mul16x4, + Iop_Mul16x4, Iop_Mul32x2, Iop_MulHi16Ux4, Iop_MulHi16Sx4, @@ -677,7 +677,7 @@ typedef Iop_CmpGT8Sx8, Iop_CmpGT16Sx4, Iop_CmpGT32Sx2, /* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */ - Iop_ShlN16x4, Iop_ShlN32x2, + Iop_ShlN8x8, Iop_ShlN16x4, Iop_ShlN32x2, Iop_ShrN16x4, Iop_ShrN32x2, Iop_SarN8x8, Iop_SarN16x4, Iop_SarN32x2, @@ -692,6 +692,19 @@ typedef Iop_InterleaveHI8x8, Iop_InterleaveHI16x4, Iop_InterleaveHI32x2, Iop_InterleaveLO8x8, Iop_InterleaveLO16x4, Iop_InterleaveLO32x2, + /* CONCATENATION -- build a new value by concatenating either + the even or odd lanes of both operands. Note that + Cat{Odd,Even}Lanes32x2 are identical to Interleave{HI,LO}32x2 + and so are omitted. */ + Iop_CatOddLanes16x4, Iop_CatEvenLanes16x4, + + /* PERMUTING -- copy src bytes to dst, + as indexed by control vector bytes: + for i in 0 .. 7 . result[i] = argL[ argR[i] ] + argR[i] values may only be in the range 0 .. 7, else behaviour + is undefined. */ + Iop_Perm8x8, + /* ------------------ 128-bit SIMD FP. ------------------ */ /* --- 32x4 vector FP --- */