From: Julian Seward <jseward@acm.org>
Date: Fri, 18 Jun 2010 08:17:41 +0000 (+0000)
Subject: Implement SSE4 instructions: PCMPGTQ PMAXUD PMINUD PMAXSB PMINSB PMULLD
X-Git-Tag: svn/VALGRIND_3_6_1^2~86
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f3517dfd35be26de45197201e30951845b26ac3e;p=thirdparty%2Fvalgrind.git

Implement SSE4 instructions: PCMPGTQ PMAXUD PMINUD PMAXSB PMINSB PMULLD
I believe this covers everything that gcc-4.4 and gcc-4.5 will generate
with "-O3 -msse4.2".  Note, this commit changes the set of IR ops and so
requires a from-scratch rebuild of the tree.


git-svn-id: svn://svn.valgrind.org/vex/trunk@1984
---

diff --git a/VEX/Makefile-gcc b/VEX/Makefile-gcc
index c7d42a3c50..d1e1df4837 100644
--- a/VEX/Makefile-gcc
+++ b/VEX/Makefile-gcc
@@ -17,6 +17,7 @@ PRIV_HEADERS = 	priv/host_x86_defs.h			\
 		priv/host_ppc_defs.h			\
 		priv/host_generic_regs.h	        \
 		priv/host_generic_simd64.h	        \
+		priv/host_generic_simd128.h	        \
 		priv/main_globals.h			\
 		priv/main_util.h			\
 		priv/guest_generic_x87.h               	\
@@ -44,6 +45,7 @@ LIB_OBJS = 	priv/ir_defs.o                          \
 		priv/host_ppc_isel.o			\
 		priv/host_generic_regs.o	        \
 		priv/host_generic_simd64.o	        \
+		priv/host_generic_simd128.o	        \
 		priv/host_generic_reg_alloc2.o		\
 		priv/guest_generic_x87.o	        \
 		priv/guest_generic_bb_to_IR.o		\
@@ -262,6 +264,10 @@ priv/host_generic_simd64.o: $(ALL_HEADERS) priv/host_generic_simd64.c
 	$(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_simd64.o \
 					 -c priv/host_generic_simd64.c
 
+priv/host_generic_simd128.o: $(ALL_HEADERS) priv/host_generic_simd128.c
+	$(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_simd128.o \
+					 -c priv/host_generic_simd128.c
+
 priv/host_generic_reg_alloc2.o: $(ALL_HEADERS) priv/host_generic_reg_alloc2.c
 	$(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_reg_alloc2.o \
 					 -c priv/host_generic_reg_alloc2.c
diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 0b4748fe98..0647b8d0f7 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -14406,121 +14406,85 @@ DisResult disInstr_AMD64_WRK (
       goto decode_success;
    }
 
+   /* 66 0F 38 37 = PCMPGTQ
+      64x2 comparison (signed, presumably; the Intel docs don't say :-)
+   */
+   if ( have66noF2noF3( pfx ) && sz == 2 
+        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x37) {
+      delta = dis_SSEint_E_to_G( vbi, pfx, delta+3, 
+                                 "pcmpgtq", Iop_CmpGT64Sx2, False );
+      goto decode_success;
+   }
 
    /* 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
       Maximum of Packed Signed Double Word Integers (XMM) 
-      --
       66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
       Minimum of Packed Signed Double Word Integers (XMM) */
-   if ( have66noF2noF3( pfx ) 
-        && sz == 2 
+   if ( have66noF2noF3( pfx ) && sz == 2 
         && insn[0] == 0x0F && insn[1] == 0x38
-        && ( (insn[2] == 0x3D) || (insn[2] == 0x39) ) ) {
-
-      IRTemp reg_vec  = newTemp(Ity_V128);
-      IRTemp rom_vec  = newTemp(Ity_V128);
-      IRTemp mask_vec = newTemp(Ity_V128);
-
-      Bool isPMAX     = (insn[2] == 0x3D) ? True : False;
-
-      HChar* str      = isPMAX ? "pmaxsd" : "pminsd";
-
-      modrm = insn[3];
-      assign( reg_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
-
-      if ( epartIsReg( modrm ) ) {
-         assign( rom_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
-         delta += 3+1;
-         DIP( "%s %s,%s\n", str,
-              nameXMMReg( eregOfRexRM(pfx, modrm) ),
-              nameXMMReg( gregOfRexRM(pfx, modrm) ) );    
-      } else {
-         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
-         assign( rom_vec, loadLE( Ity_V128, mkexpr(addr) ) );
-         delta += 3+alen;
-         DIP( "%s %s,%s\n", str, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
-      }
-
-      assign( mask_vec, binop( Iop_CmpGT32Sx4, mkexpr(reg_vec), mkexpr(rom_vec) ) );
-
-      IRTemp max_min_vec = newTemp(Ity_V128);
-      if ( isPMAX ) {
-         assign( max_min_vec,
-                 binop( Iop_OrV128, 
-                        binop( Iop_AndV128, mkexpr(rom_vec),
-                               unop( Iop_NotV128, mkexpr(mask_vec) ) ),
-                        binop( Iop_AndV128, mkexpr(reg_vec), mkexpr(mask_vec) ) ) );
-      } else {
-         assign( max_min_vec, 
-                 binop( Iop_OrV128, 
-                        binop( Iop_AndV128, mkexpr(reg_vec),
-                               unop( Iop_NotV128, mkexpr(mask_vec) ) ), 
-                        binop( Iop_AndV128, mkexpr(rom_vec), mkexpr(mask_vec) ) ) );
-      }
-
-      putXMMReg( gregOfRexRM(pfx, modrm), mkexpr(max_min_vec) );
-
+        && (insn[2] == 0x3D || insn[2] == 0x39)) {
+      Bool isMAX = insn[2] == 0x3D;
+      delta = dis_SSEint_E_to_G(
+                 vbi, pfx, delta+3, 
+                 isMAX ? "pmaxsd" : "pminsd",
+                 isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
+                 False
+              );
       goto decode_success;
    }
 
-
    /* 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
       Maximum of Packed Unsigned Doubleword Integers (XMM)
       66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
       Minimum of Packed Unsigned Doubleword Integers (XMM) */
-   if ( have66noF2noF3( pfx ) 
-        && sz == 2 
+   if ( have66noF2noF3( pfx ) && sz == 2 
         && insn[0] == 0x0F && insn[1] == 0x38
         && (insn[2] == 0x3F || insn[2] == 0x3B)) {
+      Bool isMAX = insn[2] == 0x3F;
+      delta = dis_SSEint_E_to_G(
+                 vbi, pfx, delta+3, 
+                 isMAX ? "pmaxud" : "pminud",
+                 isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
+                 False
+              );
+      goto decode_success;
+   }
 
-      Bool   is_max   = insn[2] == 0x3F;
-      IRTemp reg_vec  = newTemp(Ity_V128);
-      IRTemp rom_vec  = newTemp(Ity_V128);
-      IRTemp mask_vec = newTemp(Ity_V128);
-      IRTemp and_vec  = newTemp(Ity_V128);
-      IRTemp not_vec  = newTemp(Ity_V128);
-
-      modrm = insn[3];
-      assign( reg_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
-
-      if ( epartIsReg( modrm ) ) {
-         assign( rom_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
-         delta += 3+1;
-         DIP( "p%sud %s,%s\n",
-              is_max ? "max" : "min",
-              nameXMMReg( eregOfRexRM(pfx, modrm) ),
-              nameXMMReg( gregOfRexRM(pfx, modrm) ) );    
-      } else {
-         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
-         assign( rom_vec, loadLE( Ity_V128, mkexpr(addr) ) );
-         delta += 3+alen;
-         DIP( "p%sd %s,%s\n",
-               is_max ? "max" : "min",
-              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
-      }
-
-      /* the foll. simulates Iop_CmpGT32Ux4 (not implemented) 
-         c.f. Hacker's Delight, S2-11, p.23 */
-      assign( mask_vec, 
-              binop( Iop_XorV128, 
-                     binop( Iop_XorV128, 
-                            binop( Iop_CmpGT32Sx4, mkexpr(reg_vec), mkexpr(rom_vec) ),
-                            binop( Iop_SarN32x4, mkexpr(reg_vec), mkU8(31) ) ), 
-                     binop( Iop_SarN32x4, mkexpr(rom_vec), mkU8(31) ) ) );
-
-      assign( and_vec,
-              binop( Iop_AndV128, mkexpr(is_max ? reg_vec : rom_vec),
-                     mkexpr(mask_vec) ) );
-      assign( not_vec,
-              binop( Iop_AndV128, mkexpr(is_max ? rom_vec : reg_vec), 
-                     unop( Iop_NotV128, mkexpr(mask_vec) ) ) );
-
-      putXMMReg( gregOfRexRM(pfx, modrm), 
-                 binop( Iop_OrV128, mkexpr(not_vec), mkexpr(and_vec) ) );
- 
+   /* 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
+      Maximum of Packed Unsigned Word Integers (XMM)
+      66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
+      Minimum of Packed Unsigned Word Integers (XMM)
+   */
+   if ( have66noF2noF3( pfx ) && sz == 2 
+        && insn[0] == 0x0F && insn[1] == 0x38
+        && (insn[2] == 0x3E || insn[2] == 0x3A)) {
+      Bool isMAX = insn[2] == 0x3E;
+      delta = dis_SSEint_E_to_G(
+                 vbi, pfx, delta+3, 
+                 isMAX ? "pmaxuw" : "pminuw",
+                 isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
+                 False
+              );
       goto decode_success;
    }
 
+   /* 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128
+      8Sx16 (signed) max
+      66 0F 38 38 /r = PMINSB xmm1, xmm2/m128
+      8Sx16 (signed) min
+   */
+   if ( have66noF2noF3( pfx ) && sz == 2 
+        && insn[0] == 0x0F && insn[1] == 0x38
+        && (insn[2] == 0x3C || insn[2] == 0x38)) {
+      Bool isMAX = insn[2] == 0x3C;
+      delta = dis_SSEint_E_to_G(
+                 vbi, pfx, delta+3, 
+                 isMAX ? "pmaxsb" : "pminsb",
+                 isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
+                 False
+              );
+      goto decode_success;
+   }
 
    /* 66 0f 38 20 /r = PMOVSXBW xmm1, xmm2/m64 
       Packed Move with Sign Extend from Byte to Word (XMM) */
@@ -14964,6 +14928,40 @@ DisResult disInstr_AMD64_WRK (
    }
 
 
+   /* 66 0f 38 40 /r = PMULLD xmm1, xmm2/m128
+      32x4 integer multiply from xmm2/m128 to xmm1 */
+   if ( have66noF2noF3( pfx ) 
+        && sz == 2 
+        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x40 ) {
+  
+      modrm = insn[3];
+
+      IRTemp argL = newTemp(Ity_V128);
+      IRTemp argR = newTemp(Ity_V128);
+
+      if ( epartIsReg(modrm) ) {
+         assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+         delta += 3+1;
+         DIP( "pmulld %s,%s\n",
+              nameXMMReg( eregOfRexRM(pfx, modrm) ),
+              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+      } else {
+         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+         assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
+         delta += 3+alen;
+         DIP( "pmulld %s,%s\n",
+              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+      }
+
+      assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
+
+      putXMMReg( gregOfRexRM(pfx, modrm), 
+                 binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
+
+      goto decode_success;
+   }
+
+
    /* F3 0F B8  = POPCNT{W,L,Q}
       Count the number of 1 bits in a register
     */
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 0a0a3a621b..ddf7503e96 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -42,6 +42,7 @@
 #include "main_globals.h"
 #include "host_generic_regs.h"
 #include "host_generic_simd64.h"
+#include "host_generic_simd128.h"
 #include "host_amd64_defs.h"
 
 
@@ -3158,7 +3159,8 @@ static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
 /* DO NOT CALL THIS DIRECTLY */
 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
 {
-   Bool     arg1isEReg = False;
+   HWord      fn = 0; /* address of helper fn, if required */
+   Bool       arg1isEReg = False;
    AMD64SseOp op = Asse_INVALID;
    IRType     ty = typeOfIRExpr(env->type_env,e);
    vassert(e);
@@ -3614,6 +3616,73 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
          return dst;
       }
 
+      case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
+                           goto do_SseAssistedBinary;
+      case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
+                           goto do_SseAssistedBinary;
+      case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
+                           goto do_SseAssistedBinary;
+      case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
+                           goto do_SseAssistedBinary;
+      case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
+                           goto do_SseAssistedBinary;
+      case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
+                           goto do_SseAssistedBinary;
+      case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
+                           goto do_SseAssistedBinary;
+      case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
+                           goto do_SseAssistedBinary;
+      case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
+                           goto do_SseAssistedBinary;
+      case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
+                           goto do_SseAssistedBinary;
+      do_SseAssistedBinary: {
+         /* RRRufff!  RRRufff code is what we're generating here.  Oh
+            well. */
+         vassert(fn != 0);
+         HReg dst = newVRegV(env);
+         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+         HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+         HReg argp = newVRegI(env);
+         /* subq $112, %rsp         -- make a space*/
+         sub_from_rsp(env, 112);
+         /* leaq 48(%rsp), %r_argp  -- point into it */
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+                                        argp));
+         /* andq $-16, %r_argp      -- 16-align the pointer */
+         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+                                         AMD64RMI_Imm( ~(UInt)15 ), 
+                                         argp));
+         /* Prepare 3 arg regs:
+            leaq 0(%r_argp), %rdi
+            leaq 16(%r_argp), %rsi
+            leaq 32(%r_argp), %rdx
+         */
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+                                        hregAMD64_RDI()));
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
+                                        hregAMD64_RSI()));
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
+                                        hregAMD64_RDX()));
+         /* Store the two args, at (%rsi) and (%rdx):
+            movupd  %argL, 0(%rsi)
+            movupd  %argR, 0(%rdx)
+         */
+         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
+                                          AMD64AMode_IR(0, hregAMD64_RSI())));
+         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
+                                          AMD64AMode_IR(0, hregAMD64_RDX())));
+         /* call the helper */
+         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
+         /* fetch the result from memory, using %r_argp, which the
+            register allocator will keep alive across the call. */
+         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
+                                          AMD64AMode_IR(0, argp)));
+         /* and finally, clear the space */
+         add_to_rsp(env, 112);
+         return dst;
+      }
+
       default:
          break;
    } /* switch (e->Iex.Binop.op) */
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
new file mode 100644
index 0000000000..8ed516609d
--- /dev/null
+++ b/VEX/priv/host_generic_simd128.c
@@ -0,0 +1,220 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin                            host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2010-2010 OpenWorks GbR
+      info@open-works.net
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+   where the instruction selectors cannot generate code in-line.
+   These are purely back-end entities and cannot be seen/referenced
+   from IR. */
+
+#include "libvex_basictypes.h"
+#include "host_generic_simd128.h"
+
+
+/* Primitive helpers always take args of the real type (signed vs
+   unsigned) but return an unsigned result, so there's no conversion
+   weirdness when stuffing results back in the V128 union fields,
+   which are all unsigned. */
+
+static inline UInt mul32 ( Int xx, Int yy )
+{
+   /* Unsigned multiply: same low 32 bits, but no signed-overflow UB. */
+   return toUInt(((UInt)xx) * ((UInt)yy));
+}
+
+static inline UInt max32S ( Int xx, Int yy )
+{
+   return toUInt((yy < xx) ? xx : yy);  /* signed 32-bit max */
+}
+
+static inline UInt min32S ( Int xx, Int yy )
+{
+   return toUInt((yy > xx) ? xx : yy);  /* signed 32-bit min */
+}
+
+static inline UInt max32U ( UInt xx, UInt yy )
+{
+   return toUInt((yy < xx) ? xx : yy);  /* unsigned 32-bit max */
+}
+
+static inline UInt min32U ( UInt xx, UInt yy )
+{
+   return toUInt((yy > xx) ? xx : yy);  /* unsigned 32-bit min */
+}
+
+static inline UShort max16U ( UShort xx, UShort yy )
+{
+   return toUShort((yy < xx) ? xx : yy);  /* unsigned 16-bit max */
+}
+
+static inline UShort min16U ( UShort xx, UShort yy )
+{
+   return toUShort((yy > xx) ? xx : yy);  /* unsigned 16-bit min */
+}
+
+static inline UChar max8S ( Char xx, Char yy )
+{
+   return toUChar((yy < xx) ? xx : yy);  /* signed 8-bit max */
+}
+
+static inline UChar min8S ( Char xx, Char yy )
+{
+   return toUChar((yy > xx) ? xx : yy);  /* signed 8-bit min */
+}
+
+static inline ULong cmpGT64S ( Long xx, Long yy )
+{
+   /* All-ones mask when xx > yy, else zero; args are already Long. */
+   return (xx > yy) ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
+}
+
+void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
+                              V128* argL, V128* argR )
+{
+   /* Lane-wise mul32: res->w32[i] from argL->w32[i] and argR->w32[i]. */
+   Int i;
+   for (i = 0; i < 4; i++)
+      res->w32[i] = mul32(argL->w32[i], argR->w32[i]);
+}
+
+void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   /* Lane-wise max32S: res->w32[i] from argL->w32[i] and argR->w32[i]. */
+   Int i;
+   for (i = 0; i < 4; i++)
+      res->w32[i] = max32S(argL->w32[i], argR->w32[i]);
+}
+
+void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   /* Lane-wise min32S: res->w32[i] from argL->w32[i] and argR->w32[i]. */
+   Int i;
+   for (i = 0; i < 4; i++)
+      res->w32[i] = min32S(argL->w32[i], argR->w32[i]);
+}
+
+void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   /* Lane-wise max32U: res->w32[i] from argL->w32[i] and argR->w32[i]. */
+   Int i;
+   for (i = 0; i < 4; i++)
+      res->w32[i] = max32U(argL->w32[i], argR->w32[i]);
+}
+
+void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   /* Lane-wise min32U: res->w32[i] from argL->w32[i] and argR->w32[i]. */
+   Int i;
+   for (i = 0; i < 4; i++)
+      res->w32[i] = min32U(argL->w32[i], argR->w32[i]);
+}
+
+void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   /* Lane-wise max16U over the 8 w16 lanes: each lane of res is
+      computed from the same-numbered lanes of argL and argR, in
+      ascending lane order. */
+   Int i;
+
+   for (i = 0; i < 8; i++) {
+      res->w16[i] = max16U(argL->w16[i], argR->w16[i]);
+   }
+}
+
+void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   /* Lane-wise min16U over the 8 w16 lanes: each lane of res is
+      computed from the same-numbered lanes of argL and argR, in
+      ascending lane order. */
+   Int i;
+
+   for (i = 0; i < 8; i++) {
+      res->w16[i] = min16U(argL->w16[i], argR->w16[i]);
+   }
+}
+
+void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{  /* Lane-wise max8S over all 16 w8 lanes of argL and argR, into res. */
+   res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
+   res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
+   res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
+   res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
+   res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
+   res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
+   res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
+   res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
+   res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
+   res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
+   res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
+   res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
+   res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
+   res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
+   res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
+   res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
+}
+
+void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{  /* Lane-wise min8S over all 16 w8 lanes of argL and argR, into res. */
+   res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
+   res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
+   res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
+   res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
+   res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
+   res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
+   res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
+   res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
+   res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
+   res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
+   res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
+   res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
+   res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
+   res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
+   res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
+   res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
+}
+
+void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
+                                 V128* argL, V128* argR )
+{  /* Each w64 lane of res gets cmpGT64S of the matching argL/argR lanes. */
+   res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
+   res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end                              host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
new file mode 100644
index 0000000000..125514a737
--- /dev/null
+++ b/VEX/priv/host_generic_simd128.h
@@ -0,0 +1,79 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin                             host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2010-2010 OpenWorks GbR
+      info@open-works.net
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+   where the instruction selectors cannot generate code in-line.
+   These are purely back-end entities and cannot be seen/referenced
+   as clean helper functions from IR.
+
+   These will get called from generated code and therefore should be
+   well behaved -- no floating point or mmx insns, just straight
+   integer code.
+
+   Each function implements the correspondingly-named IR primop.
+*/
+
+#ifndef __VEX_HOST_GENERIC_SIMD128_H
+#define __VEX_HOST_GENERIC_SIMD128_H
+
+/* Needed first: supplies UChar/UShort/UInt/ULong used by V128. */
+#include "libvex_basictypes.h"
+
+/* A union for doing 128-bit primitives conveniently.  It is not
+   public and so not placed in pub/. */
+typedef
+   union {
+      UChar  w8[16];
+      UShort w16[8];
+      UInt   w32[4];
+      ULong  w64[2];
+   }
+   V128;
+
+/* DO NOT MAKE THESE INTO REGPARM FNS!  THIS WILL BREAK CALLING
+   SEQUENCES GENERATED BY THE ISELS (e.g. host_amd64_isel.c). */
+
+extern void h_generic_calc_Mul32x4    ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Sx4   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Sx4   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Ux4   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Ux4   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max16Ux8   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min16Ux8   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max8Sx16   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min8Sx16   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+
+
+#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
+
+/*---------------------------------------------------------------*/
+/*--- end                              host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index 4e0a9773c4..0e6f2edadf 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -336,6 +336,7 @@ void ppIROp ( IROp op )
       case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return;
       case Iop_Mul16x4: vex_printf("Mul16x4"); return;
       case Iop_Mul32x2: vex_printf("Mul32x2"); return;
+      case Iop_Mul32x4: vex_printf("Mul32x4"); return;
       case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return;
       case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return;
       case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return;
@@ -525,6 +526,7 @@ void ppIROp ( IROp op )
       case Iop_CmpGT8Sx16: vex_printf("CmpGT8Sx16"); return;
       case Iop_CmpGT16Sx8: vex_printf("CmpGT16Sx8"); return;
       case Iop_CmpGT32Sx4: vex_printf("CmpGT32Sx4"); return;
+      case Iop_CmpGT64Sx2: vex_printf("CmpGT64Sx2"); return;
       case Iop_CmpGT8Ux16: vex_printf("CmpGT8Ux16"); return;
       case Iop_CmpGT16Ux8: vex_printf("CmpGT16Ux8"); return;
       case Iop_CmpGT32Ux4: vex_printf("CmpGT32Ux4"); return;
@@ -1899,7 +1901,7 @@ void typeOfPrimop ( IROp op,
       case Iop_Sub32x4:   case Iop_Sub64x2:
       case Iop_QSub8Ux16: case Iop_QSub16Ux8: case Iop_QSub32Ux4:
       case Iop_QSub8Sx16: case Iop_QSub16Sx8: case Iop_QSub32Sx4:
-      case Iop_Mul16x8:
+      case Iop_Mul16x8: case Iop_Mul32x4:
       case Iop_MulHi16Ux8: case Iop_MulHi32Ux4: 
       case Iop_MulHi16Sx8: case Iop_MulHi32Sx4: 
       case Iop_MullEven8Ux16: case Iop_MullEven16Ux8:
@@ -1912,6 +1914,7 @@ void typeOfPrimop ( IROp op,
       case Iop_Min8Ux16: case Iop_Min16Ux8: case Iop_Min32Ux4:
       case Iop_CmpEQ8x16:  case Iop_CmpEQ16x8:  case Iop_CmpEQ32x4:
       case Iop_CmpGT8Sx16: case Iop_CmpGT16Sx8: case Iop_CmpGT32Sx4:
+      case Iop_CmpGT64Sx2:
       case Iop_CmpGT8Ux16: case Iop_CmpGT16Ux8: case Iop_CmpGT32Ux4:
       case Iop_Shl8x16: case Iop_Shl16x8: case Iop_Shl32x4:
       case Iop_Shr8x16: case Iop_Shr16x8: case Iop_Shr32x4:
diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c
index 8eec2aff39..962b952e7d 100644
--- a/VEX/priv/main_main.c
+++ b/VEX/priv/main_main.c
@@ -57,6 +57,8 @@
 #include "guest_arm_defs.h"
 #include "guest_ppc_defs.h"
 
+#include "host_generic_simd128.h"
+
 
 /* This file contains the top level interface to the library. */
 
@@ -141,6 +143,7 @@ void LibVEX_Init (
    vassert(4 == sizeof(Addr32));
    vassert(8 == sizeof(Addr64));
    vassert(16 == sizeof(U128));
+   vassert(16 == sizeof(V128));
 
    vassert(sizeof(void*) == 4 || sizeof(void*) == 8);
    vassert(sizeof(void*) == sizeof(int*));
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index bb39becc2c..4b7d628fb4 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -834,7 +834,7 @@ typedef
       Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4,
 
       /* MULTIPLICATION (normal / high half of signed/unsigned) */
-      Iop_Mul16x8,
+      Iop_Mul16x8,    Iop_Mul32x4,
       Iop_MulHi16Ux8, Iop_MulHi32Ux4,
       Iop_MulHi16Sx8, Iop_MulHi32Sx4,
       /* (widening signed/unsigned of even lanes, with lowest lane=zero) */
@@ -853,7 +853,7 @@ typedef
 
       /* COMPARISON */
       Iop_CmpEQ8x16,  Iop_CmpEQ16x8,  Iop_CmpEQ32x4,
-      Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4,
+      Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2,
       Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4,
 
       /* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */