From b145cd9c3c0a2e27af4aaba00778b12e733b8e34 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Tue, 17 Aug 2010 22:52:08 +0000 Subject: [PATCH] Add a moderately comprehensive implementation of the SSE4.2 string instructions PCMP{I,E}STR{I,M}. They are an absolute nightmare of complexity. Most of the 8-bit data processing variants are supported, but none of the 16-bit variants. Also add support for PINSRB and PTEST. With these changes, I believe Valgrind supports all the SSE4.2 instructions used in glibc-2.11 on x86_64-linux, as well as anything that gcc can emit. So that gives fairly good coverage. Currently these instructions are handled, but CPUID still claims to be an older, non-SSE4 capable Core 2, so that software that correctly checks CPU features should not use them. Following further testing I will enable the relevant SSE4.2 bits in CPUID. git-svn-id: svn://svn.valgrind.org/vex/trunk@2010 --- VEX/priv/guest_amd64_defs.h | 53 +++- VEX/priv/guest_amd64_helpers.c | 144 ++++++++--- VEX/priv/guest_amd64_toIR.c | 263 +++++++++++++++++--- VEX/priv/guest_generic_x87.c | 427 +++++++++++++++++++------------- VEX/priv/guest_generic_x87.h | 17 +- VEX/priv/host_amd64_isel.c | 102 +++----- VEX/priv/host_generic_simd128.h | 12 - VEX/pub/libvex_basictypes.h | 10 + 8 files changed, 692 insertions(+), 336 deletions(-) diff --git a/VEX/priv/guest_amd64_defs.h b/VEX/priv/guest_amd64_defs.h index 7a40ba6e01..3d6e128bd0 100644 --- a/VEX/priv/guest_amd64_defs.h +++ b/VEX/priv/guest_amd64_defs.h @@ -157,14 +157,51 @@ extern void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, extern void amd64g_dirtyhelper_SxDT ( void* address, ULong op /* 0 or 1 */ ); -extern ULong amd64g_dirtyhelper_ISTRI_08 ( VexGuestAMD64State*, - HWord, HWord ); -extern ULong amd64g_dirtyhelper_ISTRI_0C ( VexGuestAMD64State*, - HWord, HWord ); -extern ULong amd64g_dirtyhelper_ISTRI_3A ( VexGuestAMD64State*, - HWord, HWord ); -extern ULong amd64g_dirtyhelper_ISTRI_4A ( VexGuestAMD64State*, - HWord, HWord ); +/* Helps with PCMP{I,E}STR{I,M}. + + CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, + actually it could be a clean helper, but for the fact that we can't + pass by value 2 x V128 to a clean helper, nor have one returned.) + Reads guest state, writes to guest state for the xSTRM cases, no + accesses of memory, is a pure function. + + opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so + the callee knows which I/E and I/M variant it is dealing with and + what the specific operation is. 4th byte of opcode is in the range + 0x60 to 0x63: + istri 66 0F 3A 63 + istrm 66 0F 3A 62 + estri 66 0F 3A 61 + estrm 66 0F 3A 60 + + gstOffL and gstOffR are the guest state offsets for the two XMM + register inputs. We never have to deal with the memory case since + that is handled by pre-loading the relevant value into the fake + XMM16 register. + + For ESTRx variants, edxIN and eaxIN hold the values of those two + registers. + + In all cases, the bottom 16 bits of the result contain the new + OSZACP %rflags values. For xSTRI variants, bits[31:16] of the + result hold the new %ecx value. For xSTRM variants, the helper + writes the result directly to the guest XMM0. + + Declarable side effects: in all cases, reads guest state at + [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes + guest_XMM0. + + Is expected to be called with opc_and_imm combinations which have + actually been validated, and will assert if otherwise. The front + end should ensure we're only called with verified values. 
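The opc4_and_imm packing and the result layout described here can be made concrete with a small standalone sketch. Illustrative only: it uses plain C types in place of VEX's, and the helper result value shown is made up, not from a real run.

   /* Sketch of the calling convention described above. */
   #include <stdio.h>

   int main(void)
   {
      unsigned opc4 = 0x63;                     /* 66 0F 3A 63 = pcmpistri */
      unsigned imm8 = 0x0C;                     /* one byte-size variant */
      unsigned long opc4_and_imm = (opc4 << 8) | imm8;

      /* Recover the fields as the helper does. */
      unsigned opc    = (opc4_and_imm >> 8) & 0xFF;
      int     isISTRx = (opc & 2) != 0;         /* 0x62/0x63: implicit length */
      int     isxSTRM = ((opc & 1) ^ 1) != 0;   /* 0x60/0x62: M-format output */

      /* For an xSTRI variant the result packs ECX above OSZACP. */
      unsigned long res       = 0x000A0841;     /* made-up helper result */
      unsigned      newECX    = (unsigned)((res >> 16) & 0xFFFF);
      unsigned      newOSZACP = (unsigned)(res & 0xFFFF);
      printf("isISTRx=%d isxSTRM=%d ecx=%u flags=%#x\n",
             isISTRx, isxSTRM, newECX, newOSZACP);
      return 0;
   }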
+*/ +extern ULong amd64g_dirtyhelper_PCMPxSTRx ( + VexGuestAMD64State*, + HWord opc4_and_imm, + HWord gstOffL, HWord gstOffR, + HWord edxIN, HWord eaxIN + ); + //extern void amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* ); //extern void amd64g_dirtyhelper_CPUID_sse1 ( VexGuestAMD64State* ); diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c index 48f915cdc7..e3f9d510f7 100644 --- a/VEX/priv/guest_amd64_helpers.c +++ b/VEX/priv/guest_amd64_helpers.c @@ -2514,45 +2514,123 @@ ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ) /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/ /*---------------------------------------------------------------*/ -/* CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, - actually it could be a clean helper, but for the fact that we can't - pass by value 2 x V128 to a clean helper.) Reads guest state, no - writes to guest state, no accesses of memory, is a pure function. - This relies on the property that the XMM regs are laid out - consecutively in the guest state, so we can index into them here. - Returned value (0 .. 16) is in the low 16 bits of the return value. - Returned bits 31:16 hold the result OSZACP value. -*/ -ULong amd64g_dirtyhelper_ISTRI_08 ( VexGuestAMD64State* gst, - HWord gstOffL, HWord gstOffR ) +static UInt zmask_from_V128 ( V128* arg ) { - U128* argL = (U128*)( ((UChar*)gst) + gstOffL ); - U128* argR = (U128*)( ((UChar*)gst) + gstOffR ); - return (HWord) compute_ISTRI_08( argL, argR ); + UInt i, res = 0; + for (i = 0; i < 16; i++) { + res |= ((arg->w8[i] == 0) ? 1 : 0) << i; + } + return res; } -ULong amd64g_dirtyhelper_ISTRI_0C ( VexGuestAMD64State* gst, - HWord gstOffL, HWord gstOffR ) -{ - U128* argL = (U128*)( ((UChar*)gst) + gstOffL ); - U128* argR = (U128*)( ((UChar*)gst) + gstOffR ); - return (HWord) compute_ISTRI_0C( argL, argR ); -} +/* Helps with PCMP{I,E}STR{I,M}. -ULong amd64g_dirtyhelper_ISTRI_3A ( VexGuestAMD64State* gst, - HWord gstOffL, HWord gstOffR ) + CALLED FROM GENERATED CODE: DIRTY HELPER(s). (But not really, + actually it could be a clean helper, but for the fact that we can't + pass by value 2 x V128 to a clean helper, nor have one returned.) + Reads guest state, writes to guest state for the xSTRM cases, no + accesses of memory, is a pure function. + + opc_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so + the callee knows which I/E and I/M variant it is dealing with and + what the specific operation is. 4th byte of opcode is in the range + 0x60 to 0x63: + istri 66 0F 3A 63 + istrm 66 0F 3A 62 + estri 66 0F 3A 61 + estrm 66 0F 3A 60 + + gstOffL and gstOffR are the guest state offsets for the two XMM + register inputs. We never have to deal with the memory case since + that is handled by pre-loading the relevant value into the fake + XMM16 register. + + For ESTRx variants, edxIN and eaxIN hold the values of those two + registers. + + In all cases, the bottom 16 bits of the result contain the new + OSZACP %rflags values. For xSTRI variants, bits[31:16] of the + result hold the new %ecx value. For xSTRM variants, the helper + writes the result directly to the guest XMM0. + + Declarable side effects: in all cases, reads guest state at + [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes + guest_XMM0. + + Is expected to be called with opc_and_imm combinations which have + actually been validated, and will assert if otherwise. The front + end should ensure we're only called with verified values. 
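As a concrete illustration of the mask convention produced by zmask_from_V128 above, here is a standalone model, illustrative only, with unsigned char standing in for VEX's UChar:

   /* Standalone model of zmask_from_V128. */
   #include <stdio.h>
   #include <string.h>

   static unsigned zmask16 ( const unsigned char* p )
   {
      unsigned i, res = 0;
      for (i = 0; i < 16; i++)
         res |= (p[i] == 0 ? 1u : 0u) << i;
      return res;
   }

   int main(void)
   {
      unsigned char v[16];
      memset(v, 0, sizeof v);
      memcpy(v, "abc", 3);                  /* bytes 3..15 are zero */
      printf("zmask = %#x\n", zmask16(v));  /* prints zmask = 0xfff8 */
      return 0;
   }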
+*/ +ULong amd64g_dirtyhelper_PCMPxSTRx ( + VexGuestAMD64State* gst, + HWord opc4_and_imm, + HWord gstOffL, HWord gstOffR, + HWord edxIN, HWord eaxIN + ) { - U128* argL = (U128*)( ((UChar*)gst) + gstOffL ); - U128* argR = (U128*)( ((UChar*)gst) + gstOffR ); - return (HWord) compute_ISTRI_3A( argL, argR ); -} + HWord opc4 = (opc4_and_imm >> 8) & 0xFF; + HWord imm8 = opc4_and_imm & 0xFF; + HWord isISTRx = opc4 & 2; + HWord isxSTRM = (opc4 & 1) ^ 1; + vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */ + vassert((imm8 & 1) == 0); /* we support byte-size cases only */ + + // where the args are + V128* argL = (V128*)( ((UChar*)gst) + gstOffL ); + V128* argR = (V128*)( ((UChar*)gst) + gstOffR ); + + /* Create the arg validity masks, either from the vectors + themselves or from the supplied edx/eax values. */ + // FIXME: this is only right for the 8-bit data cases. + // At least that is asserted above. + UInt zmaskL, zmaskR; + if (isISTRx) { + zmaskL = zmask_from_V128(argL); + zmaskR = zmask_from_V128(argR); + } else { + Int tmp; + tmp = edxIN & 0xFFFFFFFF; + if (tmp < -16) tmp = -16; + if (tmp > 16) tmp = 16; + if (tmp < 0) tmp = -tmp; + vassert(tmp >= 0 && tmp <= 16); + zmaskL = (1 << tmp) & 0xFFFF; + tmp = eaxIN & 0xFFFFFFFF; + if (tmp < -16) tmp = -16; + if (tmp > 16) tmp = 16; + if (tmp < 0) tmp = -tmp; + vassert(tmp >= 0 && tmp <= 16); + zmaskR = (1 << tmp) & 0xFFFF; + } -ULong amd64g_dirtyhelper_ISTRI_4A ( VexGuestAMD64State* gst, - HWord gstOffL, HWord gstOffR ) -{ - U128* argL = (U128*)( ((UChar*)gst) + gstOffL ); - U128* argR = (U128*)( ((UChar*)gst) + gstOffR ); - return (HWord) compute_ISTRI_4A( argL, argR ); + // temp spot for the resulting flags and vector. + V128 resV; + UInt resOSZACP; + + // do the meyaath + Bool ok = compute_PCMPxSTRx ( + &resV, &resOSZACP, argL, argR, + zmaskL, zmaskR, imm8, (Bool)isxSTRM + ); + + // front end shouldn't pass us any imm8 variants we can't + // handle. Hence: + vassert(ok); + + // So, finally we need to get the results back to the caller. + // In all cases, the new OSZACP value is the lowest 16 of + // the return value. 
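The EDX/EAX clamping earlier in this function reduces an explicit length to the same zmask form. A standalone model of just that conversion, illustrative only:

   /* Model of the ESTRx length -> zmask conversion. */
   #include <stdio.h>

   static unsigned zmask_from_len ( int len )
   {
      if (len < -16) len = -16;
      if (len >  16) len =  16;
      if (len <   0) len = -len;
      /* bit |len| marks the first invalid byte; 1 << 16 masks to 0,
         meaning all 16 bytes are valid */
      return (1u << len) & 0xFFFF;
   }

   int main(void)
   {
      printf("%#x %#x %#x\n",
             zmask_from_len(3),     /* 0x8: byte 3 is first invalid byte */
             zmask_from_len(-5),    /* 0x20: negative lengths use |len| */
             zmask_from_len(20));   /* 0: clamped to 16, all bytes valid */
      return 0;
   }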
+ if (isxSTRM) { + /* gst->guest_XMM0 = resV; */ // gcc don't like that + gst->guest_XMM0[0] = resV.w32[0]; + gst->guest_XMM0[1] = resV.w32[1]; + gst->guest_XMM0[2] = resV.w32[2]; + gst->guest_XMM0[3] = resV.w32[3]; + return resOSZACP & 0x8D5; + } else { + UInt newECX = resV.w32[0] & 0xFFFF; + return (newECX << 16) | (resOSZACP & 0x8D5); + } } diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 7ffb433a9d..b7e78a80ab 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -14461,6 +14461,61 @@ DisResult disInstr_AMD64_WRK ( goto decode_success; } + /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8 + Extract byte from r32/m8 and insert into xmm1 */ + if ( have66noF2noF3( pfx ) + && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x20 ) { + + Int imm8; + IRTemp new8 = newTemp(Ity_I64); + + modrm = insn[3]; + + if ( epartIsReg( modrm ) ) { + imm8 = (Int)(insn[3+1] & 0xF); + assign( new8, binop(Iop_And64, + unop(Iop_32Uto64, + getIReg32(eregOfRexRM(pfx,modrm))), + mkU64(0xFF))); + delta += 3+1+1; + DIP( "pinsrb $%d,%s,%s\n", imm8, + nameIReg32( eregOfRexRM(pfx, modrm) ), + nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 ); + imm8 = (Int)(insn[3+alen] & 0xF); + assign( new8, unop(Iop_8Uto64, loadLE( Ity_I8, mkexpr(addr) ))); + delta += 3+alen+1; + DIP( "pinsrb $%d,%s,%s\n", + imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } + + // Create a V128 value which has the selected byte in the + // specified lane, and zeroes everywhere else. + IRTemp tmp128 = newTemp(Ity_V128); + IRTemp halfshift = newTemp(Ity_I64); + assign(halfshift, binop(Iop_Shl64, + mkexpr(new8), mkU8(8 * (imm8 & 7)))); + vassert(imm8 >= 0 && imm8 <= 15); + if (imm8 < 8) { + assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift))); + } else { + assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0))); + } + + UShort mask = ~(1 << imm8); + + putXMMReg( gregOfRexRM(pfx, modrm), + binop( Iop_OrV128, + mkexpr(tmp128), + binop( Iop_AndV128, + getXMMReg( gregOfRexRM(pfx, modrm) ), + mkV128(mask) ) ) ); + + goto decode_success; + } + /* 66 0F 38 37 = PCMPGTQ 64x2 comparison (signed, presumably; the Intel docs don't say :-) */ @@ -15174,14 +15229,19 @@ DisResult disInstr_AMD64_WRK ( } /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1 + 66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1 + 66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1 + 66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1 (selected special cases that actually occur in glibc, not by any means a complete implementation.) */ if (have66noF2noF3(pfx) && sz == 2 && insn[0] == 0x0F && insn[1] == 0x3A - && insn[2] == 0x63) { + && (insn[2] >= 0x60 && insn[2] <= 0x63)) { + UInt isISTRx = insn[2] & 2; + UInt isxSTRM = (insn[2] & 1) ^ 1; UInt regNoL = 0; UInt regNoR = 0; UChar imm = 0; @@ -15208,35 +15268,41 @@ DisResult disInstr_AMD64_WRK ( } /* Now we know the XMM reg numbers for the operands, and the - immediate byte. Is it one we can actually handle? */ - void* fn = NULL; - HChar* nm = NULL; + immediate byte. Is it one we can actually handle? Throw out + any cases for which the helper function has not been + verified. 
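The byte-insertion trick in the PINSRB case above (shift the new byte into place within one 64-bit half, then mask and OR it into the destination) can be checked against a scalar model. This sketch is illustrative only, with two uint64_t halves standing in for a V128:

   /* Scalar model of the PINSRB lane insert: place byte b in lane
      imm8 (0..15), preserving all other lanes. */
   #include <stdint.h>
   #include <stdio.h>

   static void pinsrb_model ( uint64_t v[2], uint8_t b, int imm8 )
   {
      uint64_t halfshift = (uint64_t)b << (8 * (imm8 & 7));
      uint64_t lanemask  = 0xFFULL    << (8 * (imm8 & 7));
      int hi = (imm8 >> 3) & 1;            /* lanes 8..15 live in v[1] */
      v[hi] = (v[hi] & ~lanemask) | halfshift;
   }

   int main(void)
   {
      uint64_t v[2] = { 0x1111111111111111ULL, 0x2222222222222222ULL };
      pinsrb_model(v, 0xAB, 9);            /* lane 9 = byte 1 of high half */
      printf("%016llx %016llx\n",
             (unsigned long long)v[1], (unsigned long long)v[0]);
      /* expect: 222222222222ab22 1111111111111111 */
      return 0;
   }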
*/ switch (imm) { - case 0x08: fn = &amd64g_dirtyhelper_ISTRI_08; - nm = "amd64g_dirtyhelper_ISTRI_08"; break; - case 0x0C: fn = &amd64g_dirtyhelper_ISTRI_0C; - nm = "amd64g_dirtyhelper_ISTRI_0C"; break; - case 0x3A: fn = &amd64g_dirtyhelper_ISTRI_3A; - nm = "amd64g_dirtyhelper_ISTRI_3A"; break; - case 0x4A: fn = &amd64g_dirtyhelper_ISTRI_4A; - nm = "amd64g_dirtyhelper_ISTRI_4A"; break; - default: goto decode_failure; - } - vassert(fn); vassert(nm); + case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12: + case 0x1A: case 0x3A: case 0x44: case 0x4A: + break; + default: + goto decode_failure; + } + + /* Who ya gonna call? Presumably not Ghostbusters. */ + void* fn = &amd64g_dirtyhelper_PCMPxSTRx; + HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx"; + /* Round up the arguments. Note that this is a kludge -- the + use of mkU64 rather than mkIRExpr_HWord implies the + assumption that the host's word size is 64-bit. */ UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL); UInt gstOffR = xmmGuestRegOffset(regNoR); - IRTemp resT = newTemp(Ity_I64); - IRDirty* d - = unsafeIRDirty_1_N( resT, 0/*regparms*/, - nm, fn, - mkIRExprVec_2( mkIRExpr_HWord(gstOffL), - mkIRExpr_HWord(gstOffR)) ); + IRExpr* opc4_and_imm = mkU64((insn[2] << 8) | (imm & 0xFF)); + IRExpr* gstOffLe = mkU64(gstOffL); + IRExpr* gstOffRe = mkU64(gstOffR); + IRExpr* edxIN = isISTRx ? mkU64(0) : getIRegRDX(8); + IRExpr* eaxIN = isISTRx ? mkU64(0) : getIRegRAX(8); + IRExpr** args + = mkIRExprVec_5( opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN ); + + IRTemp resT = newTemp(Ity_I64); + IRDirty* d = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args ); /* It's not really a dirty call, but we can't use the clean helper mechanism here for the very lame reason that we can't - pass 2 x V128s by value to a helper. Hence this roundabout - scheme. */ + pass 2 x V128s by value to a helper, nor get one back. Hence + this roundabout scheme. */ d->needsBBP = True; d->nFxState = 2; d->fxState[0].fx = Ifx_Read; @@ -15245,33 +15311,164 @@ DisResult disInstr_AMD64_WRK ( d->fxState[1].fx = Ifx_Read; d->fxState[1].offset = gstOffR; d->fxState[1].size = sizeof(U128); + if (isxSTRM) { + /* Declare that the helper writes XMM0. */ + d->nFxState = 3; + d->fxState[2].fx = Ifx_Write; + d->fxState[2].offset = xmmGuestRegOffset(0); + d->fxState[2].size = sizeof(U128); + } + stmt( IRStmt_Dirty(d) ); - /* Now resT[15:0] holds what the Intel docs call IntRes2, and - resT[31:16] holds the new OSZACP values. We must park the - resultin ECX and update the condition codes. */ - putIReg64(R_RCX, binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))); + /* Now resT[15:0] holds the new OSZACP values, so the condition + codes must be updated. And for a xSTRI case, resT[31:16] + holds the new ECX value, so stash that too. */ + if (!isxSTRM) { + putIReg64(R_RCX, binop(Iop_And64, + binop(Iop_Shr64, mkexpr(resT), mkU8(16)), + mkU64(0xFFFF))); + } stmt( IRStmt_Put( OFFB_CC_DEP1, - binop(Iop_And64, binop(Iop_Shr64, mkexpr(resT), mkU8(16)), - mkU64(0xFFFF)) + binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF)) )); stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); if (regNoL == 16) { - DIP("pcmpistri $%x,%s,%s\n", + DIP("pcmp%cstr%c $%x,%s,%s\n", + isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i', (UInt)imm, dis_buf, nameXMMReg(regNoR)); } else { - DIP("pcmpistri $%x,%s,%s\n", + DIP("pcmp%cstr%c $%x,%s,%s\n", + isISTRx ? 'i' : 'e', isxSTRM ? 
'm' : 'i', (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR)); } goto decode_success; } + + /* 66 0f 38 17 /r = PTEST xmm1, xmm2/m128 + Logical compare (set ZF and CF from AND/ANDN of the operands) */ + if (have66noF2noF3( pfx ) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x17) { + modrm = insn[3]; + IRTemp vecE = newTemp(Ity_V128); + IRTemp vecG = newTemp(Ity_V128); + + if ( epartIsReg(modrm) ) { + assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm))); + delta += 3+1; + DIP( "ptest %s,%s\n", + nameXMMReg( eregOfRexRM(pfx, modrm) ), + nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 ); + assign(vecE, loadLE( Ity_V128, mkexpr(addr) )); + delta += 3+alen; + DIP( "ptest %s,%s\n", + dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } + + assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm))); + + /* Set Z=1 iff (vecE & vecG) == 0 + Set C=1 iff (vecE & not vecG) == 0 + */ + + /* andV, andnV: vecE & vecG, vecE and not(vecG) */ + IRTemp andV = newTemp(Ity_V128); + IRTemp andnV = newTemp(Ity_V128); + assign(andV, binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG))); + assign(andnV, binop(Iop_AndV128, + mkexpr(vecE), + binop(Iop_XorV128, mkexpr(vecG), + mkV128(0xFFFF)))); + + /* The same, but reduced to 64-bit values, by or-ing the top + and bottom 64-bits together. It relies on this trick: + + InterleaveLO64x2([a,b],[c,d]) == [b,d] hence + + InterleaveLO64x2([a,b],[a,b]) == [b,b] and similarly + InterleaveHI64x2([a,b],[a,b]) == [a,a] + + and so the OR of the above 2 exprs produces + [a OR b, a OR b], from which we simply take the lower half. + */ + IRTemp and64 = newTemp(Ity_I64); + IRTemp andn64 = newTemp(Ity_I64); + + assign( + and64, + unop(Iop_V128to64, + binop(Iop_OrV128, + binop(Iop_InterleaveLO64x2, mkexpr(andV), mkexpr(andV)), + binop(Iop_InterleaveHI64x2, mkexpr(andV), mkexpr(andV)) + ) + ) + ); + + assign( + andn64, + unop(Iop_V128to64, + binop(Iop_OrV128, + binop(Iop_InterleaveLO64x2, mkexpr(andnV), mkexpr(andnV)), + binop(Iop_InterleaveHI64x2, mkexpr(andnV), mkexpr(andnV)) + ) + ) + ); + + /* Now convert and64, andn64 to all-zeroes or all-1s, so we can + slice out the Z and C bits conveniently. We use the standard + trick all-zeroes -> all-zeroes, anything-else -> all-ones + done by "(x | -x) >>s (word-size - 1)". + */ + IRTemp z64 = newTemp(Ity_I64); + IRTemp c64 = newTemp(Ity_I64); + assign(z64, + unop(Iop_Not64, + binop(Iop_Sar64, + binop(Iop_Or64, + binop(Iop_Sub64, mkU64(0), mkexpr(and64)), + mkexpr(and64) + ), + mkU8(63))) + ); + + assign(c64, + unop(Iop_Not64, + binop(Iop_Sar64, + binop(Iop_Or64, + binop(Iop_Sub64, mkU64(0), mkexpr(andn64)), + mkexpr(andn64) + ), + mkU8(63))) + ); + + /* And finally, slice out the Z and C flags and set the flags + thunk to COPY for them. OSAP are set to zero. 
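The PTEST flag semantics being implemented here, together with the (x | -x) >>s 63 nonzero-to-all-ones trick, can be modelled in scalar code. A sketch, illustrative only, with each 128-bit operand held as two uint64_t halves:

   /* Scalar check of PTEST: Z = ((E & G) == 0), C = ((E & ~G) == 0). */
   #include <stdint.h>
   #include <stdio.h>

   static uint64_t all_ones_if_nonzero ( uint64_t x )
   {
      /* 0 -> 0, anything else -> all ones; relies on an arithmetic
         right shift, as the Iop_Sar64-based IR above does. */
      return (uint64_t)( (int64_t)( x | (0 - x) ) >> 63 );
   }

   int main(void)
   {
      uint64_t e[2] = { 0x00FF, 0 };   /* vecE */
      uint64_t g[2] = { 0xFF00, 0 };   /* vecG */

      uint64_t and64  = (e[0] &  g[0]) | (e[1] &  g[1]);
      uint64_t andn64 = (e[0] & ~g[0]) | (e[1] & ~g[1]);

      int zf = all_ones_if_nonzero(and64)  == 0;   /* E & G  == 0 ? */
      int cf = all_ones_if_nonzero(andn64) == 0;   /* E & ~G == 0 ? */
      printf("Z=%d C=%d\n", zf, cf);   /* disjoint operands: Z=1 C=0 */
      return 0;
   }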
*/ + IRTemp newOSZACP = newTemp(Ity_I64); + assign(newOSZACP, + binop(Iop_Or64, + binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)), + binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C)) + ) + ); + + stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP))); + stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) )); + stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) )); + stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) )); + + goto decode_success; + } + + /* ---------------------------------------------------- */ /* --- end of the SSE4 decoder --- */ /* ---------------------------------------------------- */ @@ -17319,8 +17516,8 @@ DisResult disInstr_AMD64_WRK ( fName = "amd64g_dirtyhelper_CPUID_sse3_and_cx16"; fAddr = &amd64g_dirtyhelper_CPUID_sse3_and_cx16; /* This is a Core-2-like machine */ - /* fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16"; */ - /* fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16; */ + //fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16"; + //fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16; /* This is a Core-i5-like machine */ } else { diff --git a/VEX/priv/guest_generic_x87.c b/VEX/priv/guest_generic_x87.c index 0b8294474d..4204893bdd 100644 --- a/VEX/priv/guest_generic_x87.c +++ b/VEX/priv/guest_generic_x87.c @@ -542,9 +542,10 @@ ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp ) /* We need the definitions for OSZACP eflags/rflags offsets. #including guest_{amd64,x86}_defs.h causes chaos, so just copy the - require values directly. They are not going to change in the - future :-) + required values directly. They are not going to change in the + foreseeable future :-) */ + #define SHIFT_O 11 #define SHIFT_S 7 #define SHIFT_Z 6 @@ -591,204 +592,294 @@ static UInt ctz32 ( UInt x ) return 32 - clz32((~x) & (x-1)); } - -/* Do the computations for SSE4.2 ISTRI_XX. Not called directly from - generated code. Pure function, reads *argLU and *argRU, returned - value (0 .. 16) is in the low 16 bits of the return value. - Returned bits 31:16 hold the result OSZACP value. -*/ -UInt compute_ISTRI_08 ( U128* argLU, U128* argRU ) +/* Convert a 4-bit value to a 32-bit value by cloning each bit 8 + times. There's surely a better way to do this, but I don't know + what it is. */ +static UInt bits4_to_bytes4 ( UInt bits4 ) { - /* unsigned bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity + (IntRes2 = IntRes1) - index 0 (want index of ls 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); - } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) - - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; - - // polarity: + - UInt intRes2 = intRes1; - - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2); - - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 
0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] - - return (newFlags << 16) | newECX; + UInt r = 0; + r |= (bits4 & 1) ? 0x000000FF : 0; + r |= (bits4 & 2) ? 0x0000FF00 : 0; + r |= (bits4 & 4) ? 0x00FF0000 : 0; + r |= (bits4 & 8) ? 0xFF000000 : 0; + return r; } -UInt compute_ISTRI_0C ( U128* argLU, U128* argRU ) +/* Given partial results from a pcmpXstrX operation (intRes1, + basically), generate an I- or M-format output value, also the new + OSZACP flags. */ +static +void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV, + /*OUT*/UInt* resOSZACP, + UInt intRes1, + UInt zmaskL, UInt zmaskR, + UInt validL, + UInt pol, UInt idx, + Bool isxSTRM ) { - /* unsigned bytes - equal ordered (substring search) - polarity + (IntRes2 = IntRes1) - index 0 (want index of ls 1 bit) + vassert((pol >> 2) == 0); + vassert((idx >> 1) == 0); + + UInt intRes2 = 0; + switch (pol) { + case 0: intRes2 = intRes1; break; // pol + + case 1: intRes2 = ~intRes1; break; // pol - + case 2: intRes2 = intRes1; break; // pol m+ + case 3: intRes2 = intRes1 ^ validL; break; // pol m- + } + intRes2 &= 0xFFFF; + + if (isxSTRM) { + + // generate M-format output (a bit or byte mask in XMM0) + if (idx) { + resV->w32[0] = bits4_to_bytes4( (intRes2 >> 0) & 0xF ); + resV->w32[1] = bits4_to_bytes4( (intRes2 >> 4) & 0xF ); + resV->w32[2] = bits4_to_bytes4( (intRes2 >> 8) & 0xF ); + resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF ); + } else { + resV->w32[0] = intRes2 & 0xFFFF; + resV->w32[1] = 0; + resV->w32[2] = 0; + resV->w32[3] = 0; + } - argL: haystack, argR: needle - */ - UInt i, hi, ni; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolRes = 0, zmaskL = 0, zmaskR = 0; - UInt keepSearching = 1; - for (i = 0; i < 16; i++) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL >> 1) | (cL == 0 ? (1 << 15) : 0); - zmaskR = (zmaskR >> 1) | (cR == 0 ? (1 << 15) : 0); - - if (argL[i] == 0) { - // run off the end of the haystack. - keepSearching = 0; - } - - UInt m = 1; - if (keepSearching) { - for (ni = 0; ni < 16; ni++) { - if (argR[ni] == 0) break; - hi = ni + i; - if (hi >= 16) break; - if (argL[hi] != argR[ni]) { m = 0; break; } - } + } else { + + // generate I-format output (an index in ECX) + // generate ecx value + UInt newECX = 0; + if (idx) { + // index of ms-1-bit + newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2)); } else { - m = 0; + // index of ls-1-bit + newECX = intRes2 == 0 ? 16 : ctz32(intRes2); } - boolRes = (boolRes >> 1) | (m << 15); + + resV->w32[0] = newECX; + resV->w32[1] = 0; + resV->w32[2] = 0; + resV->w32[3] = 0; } - // boolRes is "pre-invalidated" - UInt intRes1 = boolRes & 0xFFFF; + // generate new flags, common to all ISTRI and ISTRM cases + *resOSZACP // A, P are zero + = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 + | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 + | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 + | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] +} + - // polarity: + - UInt intRes2 = intRes1; +/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M} + variants. - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2); + For xSTRI variants, the new ECX value is placed in the 32 bits + pointed to by *resV, and the top 96 bits are zeroed. For xSTRM + variants, the result is a 128 bit value and is placed at *resV in + the obvious way. 
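The polarity and index handling in compute_PCMPxSTRx_gen_output above can be exercised in isolation. The following sketch mirrors the pol/idx logic of that function; illustrative only, and __builtin_ctz is a gcc/clang builtin standing in for VEX's ctz32:

   /* Worked model of the polarity/index step. */
   #include <stdio.h>

   static unsigned apply_polarity ( unsigned intRes1, unsigned validL,
                                    unsigned pol )
   {
      unsigned r = 0;
      switch (pol) {
         case 0: r = intRes1;          break;   /* pol +  */
         case 1: r = ~intRes1;         break;   /* pol -  */
         case 2: r = intRes1;          break;   /* pol m+ */
         case 3: r = intRes1 ^ validL; break;   /* pol m- */
      }
      return r & 0xFFFF;
   }

   int main(void)
   {
      unsigned intRes1 = 0x0050;   /* matches at positions 4 and 6 */
      unsigned validL  = 0x00FF;   /* first 8 bytes of argL valid */
      unsigned intRes2 = apply_polarity(intRes1, validL, 3 /* m- */);
      /* pol m- flips only the valid lanes: 0x0050 ^ 0x00FF = 0x00AF */
      printf("intRes2 = %#x, lsb index = %d\n", intRes2,
             __builtin_ctz(intRes2));  /* idx==0 case; intRes2 nonzero */
      return 0;
   }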
- // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] + For all variants, the new OSZACP value is placed at *resOSZACP. - return (newFlags << 16) | newECX; -} + argLV and argRV are the vector args. The caller must prepare a + 16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this + must be 1 for each zero byte of of the respective arg. For ESTRx + variants this is derived from the explicit length indication, and + must be 0 in all places except at the bit index corresponding to + the valid length (0 .. 16). If the valid length is 16 then the + mask must be all zeroes. In all cases, bits 31:16 must be zero. + imm8 is the original immediate from the instruction. isSTRM + indicates whether this is a xSTRM or xSTRI variant, which controls + how much of *res is written. -UInt compute_ISTRI_3A ( U128* argLU, U128* argRU ) + If the given imm8 case can be handled, the return value is True. + If not, False is returned, and neither *res not *resOSZACP are + altered. +*/ + +Bool compute_PCMPxSTRx ( /*OUT*/V128* resV, + /*OUT*/UInt* resOSZACP, + V128* argLV, V128* argRV, + UInt zmaskL, UInt zmaskR, + UInt imm8, Bool isxSTRM ) { - /* signed bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity Masked- (IntRes2 = IntRes1 ^ validL) - index 0 (want index of ls 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 1 : 0); + vassert(imm8 < 0x80); + vassert((zmaskL >> 16) == 0); + vassert((zmaskR >> 16) == 0); + + /* Explicitly reject any imm8 values that haven't been validated, + even if they would probably work. Life is too short to have + unvalidated cases in the code base. */ + switch (imm8) { + case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12: + case 0x1A: case 0x3A: case 0x44: case 0x4A: + break; + default: + return False; } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; + UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format + UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn + UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity + UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask + + /*----------------------------------------*/ + /*-- strcmp on byte data --*/ + /*----------------------------------------*/ + + if (agg == 2/*equal each, aka strcmp*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { + Int i; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolResII = 0; + for (i = 15; i >= 0; i--) { + UChar cL = argL[i]; + UChar cR = argR[i]; + boolResII = (boolResII << 1) | (cL == cR ? 
1 : 0); + } + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + + // do invalidation, common to all equal-each cases + UInt intRes1 + = (boolResII & validL & validR) // if both valid, use cmpres + | (~ (validL | validR)); // if both invalid, force 1 + // else force 0 + intRes1 &= 0xFFFF; + + // generate I-format output + compute_PCMPxSTRx_gen_output( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM + ); + + return True; + } - // polarity: Masked- - UInt intRes2 = (intRes1 ^ validL) & 0xFFFF; + /*----------------------------------------*/ + /*-- set membership on byte data --*/ + /*----------------------------------------*/ + + if (agg == 0/*equal any, aka find chars in a set*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { + /* argL: the string, argR: charset */ + UInt si, ci; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + + for (si = 0; si < 16; si++) { + if ((validL & (1 << si)) == 0) + // run off the end of the string. + break; + UInt m = 0; + for (ci = 0; ci < 16; ci++) { + if ((validR & (1 << ci)) == 0) break; + if (argR[ci] == argL[si]) { m = 1; break; } + } + boolRes |= (m << si); + } - // generate ecx value, common to all index-of-ls-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : ctz32(intRes2); + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; + + // generate I-format output + compute_PCMPxSTRx_gen_output( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM + ); - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] + return True; + } - return (newFlags << 16) | newECX; -} + /*----------------------------------------*/ + /*-- substring search on byte data --*/ + /*----------------------------------------*/ + + if (agg == 3/*equal ordered, aka substring search*/ + && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) { + + /* argL: haystack, argR: needle */ + UInt ni, hi; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + for (hi = 0; hi < 16; hi++) { + if ((validL & (1 << hi)) == 0) + // run off the end of the haystack + break; + UInt m = 1; + for (ni = 0; ni < 16; ni++) { + if ((validR & (1 << ni)) == 0) break; + UInt i = ni + hi; + if (i >= 16) break; + if (argL[i] != argR[ni]) { m = 0; break; } + } + boolRes |= (m << hi); + } + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; -UInt compute_ISTRI_4A ( U128* argLU, U128* argRU ) -{ - /* signed bytes (also works for unsigned) - equal each (straightforward parallel compare) - polarity + (IntRes2 = IntRes1) - index 1 (want index of ms 1 bit) - */ - Int i; - UChar* argL = (UChar*)argLU; - UChar* argR = (UChar*)argRU; - UInt boolResII = 0, zmaskL = 0, zmaskR = 0; - for (i = 15; i >= 0; i--) { - UChar cL = argL[i]; - UChar cR = argR[i]; - zmaskL = (zmaskL << 1) | (cL == 0 ? 1 : 0); - zmaskR = (zmaskR << 1) | (cR == 0 ? 1 : 0); - boolResII = (boolResII << 1) | (cL == cR ? 
1 : 0); + // generate I-format output + compute_PCMPxSTRx_gen_output( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM + ); + + return True; } - UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) - UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) - // do invalidation, common to all equal-each cases - UInt intRes1 - = (boolResII & validL & validR) // if both valid, use cmpres - | (~ (validL | validR)); // if both invalid, force 1 - // else force 0 - intRes1 &= 0xFFFF; + /*----------------------------------------*/ + /*-- ranges, unsigned byte data --*/ + /*----------------------------------------*/ + + if (agg == 1/*ranges*/ + && fmt == 0/*ub*/) { + + /* argL: string, argR: range-pairs */ + UInt ri, si; + UChar* argL = (UChar*)argLV; + UChar* argR = (UChar*)argRV; + UInt boolRes = 0; + UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL)) + UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR)) + for (si = 0; si < 16; si++) { + if ((validL & (1 << si)) == 0) + // run off the end of the string + break; + UInt m = 0; + for (ri = 0; ri < 16; ri += 2) { + if ((validR & (3 << ri)) != (3 << ri)) break; + if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) { + m = 1; break; + } + } + boolRes |= (m << si); + } - // polarity - UInt intRes2 = intRes1; + // boolRes is "pre-invalidated" + UInt intRes1 = boolRes & 0xFFFF; - // generate ecx value, common to all index-of-ms-1-bit cases - UInt newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2)); + // generate I-format output + compute_PCMPxSTRx_gen_output( + resV, resOSZACP, + intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM + ); - // generate new flags, common to all ISTRI and ISTRM cases - UInt newFlags // A, P are zero - = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0 - | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0 - | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0 - | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0] + return True; + } - return (newFlags << 16) | newECX; + return False; } diff --git a/VEX/priv/guest_generic_x87.h b/VEX/priv/guest_generic_x87.h index f07b0ea080..9cbe23b38d 100644 --- a/VEX/priv/guest_generic_x87.h +++ b/VEX/priv/guest_generic_x87.h @@ -98,15 +98,14 @@ typedef generated code. CLEAN HELPER. */ extern ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp ); -/* Do the computations for SSE4.2 ISTRI_XX. Not called directly from - generated code. Pure function, reads *argLU and *argRU, returned - value (0 .. 16) is in the low 16 bits of the return value. - Returned bits 31:16 hold the result OSZACP value. */ -extern UInt compute_ISTRI_08 ( U128* argLU, U128* argRU ); -extern UInt compute_ISTRI_0C ( U128* argLU, U128* argRU ); -extern UInt compute_ISTRI_3A ( U128* argLU, U128* argRU ); -extern UInt compute_ISTRI_4A ( U128* argLU, U128* argRU ); - +/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M} + variants. See bigger comment on implementation of this function + for details on call/return conventions. */ +extern Bool compute_PCMPxSTRx ( /*OUT*/V128* resV, + /*OUT*/UInt* resOSZACP, + V128* argLV, V128* argRV, + UInt zmaskL, UInt zmaskR, + UInt imm8, Bool isxSTRM ); #endif /* ndef __VEX_GUEST_GENERIC_X87_H */ diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index cf89d535c0..1e719e7151 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -788,6 +788,21 @@ static HReg do_sse_NotV128 ( ISelEnv* env, HReg src ) } +/* Expand the given byte into a 64-bit word, by cloning each bit + 8 times. 
*/ +static ULong bitmask8_to_bytemask64 ( UShort w8 ) +{ + vassert(w8 == (w8 & 0xFF)); + ULong w64 = 0; + Int i; + for (i = 0; i < 8; i++) { + if (w8 & (1<<i)) + w64 |= (0xFFULL << (8 * i)); + } + return w64; +} [...] switch (e->Iex.Const.con->Ico.V128) { case 0x0000: dst = generate_zeroes_V128(env); - return dst; + break; case 0xFFFF: dst = generate_ones_V128(env); - return dst; - break; - default: - break; - } - AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); - const ULong const_z64 = 0x0000000000000000ULL; - const ULong const_o64 = 0xFFFFFFFFFFFFFFFFULL; - const ULong const_z32o32 = 0x00000000FFFFFFFFULL; - const ULong const_o32z32 = 0xFFFFFFFF00000000ULL; - switch (e->Iex.Const.con->Ico.V128) { - case 0x0000: case 0xFFFF: - vassert(0); /* handled just above */ - /* do push_uimm64 twice, first time for the high-order half. */ - case 0x00F0: - push_uimm64(env, const_z64); - push_uimm64(env, const_o32z32); - break; - case 0x00FF: - push_uimm64(env, const_z64); - push_uimm64(env, const_o64); - break; - case 0x000F: - push_uimm64(env, const_z64); - push_uimm64(env, const_z32o32); break; - case 0x0F00: - push_uimm64(env, const_z32o32); - push_uimm64(env, const_z64); - break; - case 0x0F0F: - push_uimm64(env, const_z32o32); - push_uimm64(env, const_z32o32); - break; - case 0x0FF0: - push_uimm64(env, const_z32o32); - push_uimm64(env, const_o32z32); - break; - case 0x0FFF: - push_uimm64(env, const_z32o32); - push_uimm64(env, const_o64); - break; - case 0xF000: - push_uimm64(env, const_o32z32); - push_uimm64(env, const_z64); - break; - case 0xF00F: - push_uimm64(env, const_o32z32); - push_uimm64(env, const_z32o32); - break; - case 0xF0F0: - push_uimm64(env, const_o32z32); - push_uimm64(env, const_o32z32); - break; - case 0xF0FF: - push_uimm64(env, const_o32z32); - push_uimm64(env, const_o64); - break; - case 0xFF00: - push_uimm64(env, const_o64); - push_uimm64(env, const_z64); - break; - case 0xFF0F: - push_uimm64(env, const_o64); - push_uimm64(env, const_z32o32); - break; - case 0xFFF0: - push_uimm64(env, const_o64); - push_uimm64(env, const_o32z32); + default: { + AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); + /* do push_uimm64 twice, first time for the high-order half. */ + push_uimm64(env, bitmask8_to_bytemask64( + (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF + )); + push_uimm64(env, bitmask8_to_bytemask64( + (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF + )); + addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 )); + add_to_rsp(env, 16); break; - default: - goto vec_fail; + } } - addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 )); - add_to_rsp(env, 16); return dst; } @@ -3723,7 +3679,7 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; } - vec_fail: + //vec_fail: vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n", LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps)); ppIRExpr(e); diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h index 125514a737..53850cbdd2 100644 --- a/VEX/priv/host_generic_simd128.h +++ b/VEX/priv/host_generic_simd128.h @@ -43,18 +43,6 @@ #ifndef __VEX_HOST_GENERIC_SIMD128_H #define __VEX_HOST_GENERIC_SIMD128_H -/* A union for doing 128-bit primitives conveniently. It is not - public and so not placed in pub/. */ -typedef - union { - UChar w8[16]; - UShort w16[8]; - UInt w32[4]; - ULong w64[2]; - } - V128; - - #include "libvex_basictypes.h" /* DO NOT MAKE THESE INTO REGPARM FNS!
THIS WILL BREAK CALLING SEQUENCES GENERATED BY host-x86/isel.c. */ diff --git a/VEX/pub/libvex_basictypes.h b/VEX/pub/libvex_basictypes.h index a996f2e563..a945913547 100644 --- a/VEX/pub/libvex_basictypes.h +++ b/VEX/pub/libvex_basictypes.h @@ -62,7 +62,17 @@ typedef signed long long int Long; /* Always 128 bits. */ typedef UInt U128[4]; +/* A union for doing 128-bit vector primitives conveniently. */ +typedef + union { + UChar w8[16]; + UShort w16[8]; + UInt w32[4]; + ULong w64[2]; + } + V128; +/* Floating point. */ typedef float Float; /* IEEE754 single-precision (32-bit) value */ typedef double Double; /* IEEE754 double-precision (64-bit) value */ -- 2.47.2
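To see the V128 union added above and the isel's bit-to-byte mask expansion working together, a standalone usage sketch; stdint types stand in for VEX's UChar/UInt/ULong, illustrative only:

   /* Usage sketch: the V128 union plus the bit -> byte-mask
      expansion used by bitmask8_to_bytemask64. */
   #include <stdint.h>
   #include <stdio.h>

   typedef union {
      uint8_t  w8[16];
      uint16_t w16[8];
      uint32_t w32[4];
      uint64_t w64[2];
   } V128;

   static uint64_t bitmask8_to_bytemask64 ( uint16_t w8 )
   {
      uint64_t w64 = 0;
      int i;
      for (i = 0; i < 8; i++) {
         if (w8 & (1 << i))
            w64 |= (0xFFULL << (8 * i));
      }
      return w64;
   }

   int main(void)
   {
      V128 v;
      /* A V128 constant like mkV128(0x000F) becomes bytes 0..3 all ones. */
      v.w64[0] = bitmask8_to_bytemask64(0x0F);   /* low 8 lanes  */
      v.w64[1] = bitmask8_to_bytemask64(0x00);   /* high 8 lanes */
      printf("w32[0]=%#x w32[1]=%#x\n",
             (unsigned)v.w32[0], (unsigned)v.w32[1]);
      /* prints w32[0]=0xffffffff w32[1]=0 */
      return 0;
   }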