From: Julian Seward Date: Mon, 21 May 2012 10:18:10 +0000 (+0000) Subject: Add initial support for Intel AVX instructions (Valgrind side). X-Git-Tag: svn/VALGRIND_3_8_0~299 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8ebb8c30e5f7f0f36e13a0f3d9a145de0a29d6f4;p=thirdparty%2Fvalgrind.git Add initial support for Intel AVX instructions (Valgrind side). Tracker bug is #273475. git-svn-id: svn://svn.valgrind.org/valgrind/trunk@12569 --- diff --git a/coregrind/m_coredump/coredump-elf.c b/coregrind/m_coredump/coredump-elf.c index 77a8f346fd..745bd7ee69 100644 --- a/coregrind/m_coredump/coredump-elf.c +++ b/coregrind/m_coredump/coredump-elf.c @@ -417,7 +417,8 @@ static void fill_fpu(const ThreadState *tst, vki_elf_fpregset_t *fpu) //:: fpu->mxcsr_mask = ?; //:: fpu->st_space = ?; -# define DO(n) VG_(memcpy)(fpu->xmm_space + n * 4, &arch->vex.guest_XMM##n, sizeof(arch->vex.guest_XMM##n)) +# define DO(n) VG_(memcpy)(fpu->xmm_space + n * 4, \ + &arch->vex.guest_YMM##n[0], 16) DO(0); DO(1); DO(2); DO(3); DO(4); DO(5); DO(6); DO(7); DO(8); DO(9); DO(10); DO(11); DO(12); DO(13); DO(14); DO(15); # undef DO diff --git a/coregrind/m_gdbserver/valgrind-low-amd64.c b/coregrind/m_gdbserver/valgrind-low-amd64.c index 0dc0382af6..5880eeaaf3 100644 --- a/coregrind/m_gdbserver/valgrind-low-amd64.c +++ b/coregrind/m_gdbserver/valgrind-low-amd64.c @@ -251,22 +251,22 @@ void transfer_register (ThreadId tid, int abs_regno, void * buf, case 37: *mod = False; break; // GDBTD ??? equivalent of foseg case 38: *mod = False; break; // GDBTD ??? equivalent of fooff case 39: *mod = False; break; // GDBTD ??? equivalent of fop - case 40: VG_(transfer) (&amd64->guest_XMM0, buf, dir, size, mod); break; - case 41: VG_(transfer) (&amd64->guest_XMM1, buf, dir, size, mod); break; - case 42: VG_(transfer) (&amd64->guest_XMM2, buf, dir, size, mod); break; - case 43: VG_(transfer) (&amd64->guest_XMM3, buf, dir, size, mod); break; - case 44: VG_(transfer) (&amd64->guest_XMM4, buf, dir, size, mod); break; - case 45: VG_(transfer) (&amd64->guest_XMM5, buf, dir, size, mod); break; - case 46: VG_(transfer) (&amd64->guest_XMM6, buf, dir, size, mod); break; - case 47: VG_(transfer) (&amd64->guest_XMM7, buf, dir, size, mod); break; - case 48: VG_(transfer) (&amd64->guest_XMM8, buf, dir, size, mod); break; - case 49: VG_(transfer) (&amd64->guest_XMM9, buf, dir, size, mod); break; - case 50: VG_(transfer) (&amd64->guest_XMM10, buf, dir, size, mod); break; - case 51: VG_(transfer) (&amd64->guest_XMM11, buf, dir, size, mod); break; - case 52: VG_(transfer) (&amd64->guest_XMM12, buf, dir, size, mod); break; - case 53: VG_(transfer) (&amd64->guest_XMM13, buf, dir, size, mod); break; - case 54: VG_(transfer) (&amd64->guest_XMM14, buf, dir, size, mod); break; - case 55: VG_(transfer) (&amd64->guest_XMM15, buf, dir, size, mod); break; + case 40: VG_(transfer) (&amd64->guest_YMM0[0], buf, dir, size, mod); break; + case 41: VG_(transfer) (&amd64->guest_YMM1[0], buf, dir, size, mod); break; + case 42: VG_(transfer) (&amd64->guest_YMM2[0], buf, dir, size, mod); break; + case 43: VG_(transfer) (&amd64->guest_YMM3[0], buf, dir, size, mod); break; + case 44: VG_(transfer) (&amd64->guest_YMM4[0], buf, dir, size, mod); break; + case 45: VG_(transfer) (&amd64->guest_YMM5[0], buf, dir, size, mod); break; + case 46: VG_(transfer) (&amd64->guest_YMM6[0], buf, dir, size, mod); break; + case 47: VG_(transfer) (&amd64->guest_YMM7[0], buf, dir, size, mod); break; + case 48: VG_(transfer) (&amd64->guest_YMM8[0], buf, dir, size, mod); break; + 
case 49: VG_(transfer) (&amd64->guest_YMM9[0], buf, dir, size, mod); break; + case 50: VG_(transfer) (&amd64->guest_YMM10[0], buf, dir, size, mod); break; + case 51: VG_(transfer) (&amd64->guest_YMM11[0], buf, dir, size, mod); break; + case 52: VG_(transfer) (&amd64->guest_YMM12[0], buf, dir, size, mod); break; + case 53: VG_(transfer) (&amd64->guest_YMM13[0], buf, dir, size, mod); break; + case 54: VG_(transfer) (&amd64->guest_YMM14[0], buf, dir, size, mod); break; + case 55: VG_(transfer) (&amd64->guest_YMM15[0], buf, dir, size, mod); break; case 56: if (dir == valgrind_to_gdbserver) { // vex only models the rounding bits (see libvex_guest_x86.h) diff --git a/coregrind/m_scheduler/scheduler.c b/coregrind/m_scheduler/scheduler.c index 4e49873ddf..4c7ef60fcd 100644 --- a/coregrind/m_scheduler/scheduler.c +++ b/coregrind/m_scheduler/scheduler.c @@ -699,15 +699,15 @@ static void do_pre_run_checks ( ThreadState* tst ) (void*)a_vexsh2, sz_vexsh2, (void*)a_spill, sz_spill ); - vg_assert(VG_IS_16_ALIGNED(sz_vex)); - vg_assert(VG_IS_16_ALIGNED(sz_vexsh1)); - vg_assert(VG_IS_16_ALIGNED(sz_vexsh2)); - vg_assert(VG_IS_16_ALIGNED(sz_spill)); + vg_assert(VG_IS_32_ALIGNED(sz_vex)); + vg_assert(VG_IS_32_ALIGNED(sz_vexsh1)); + vg_assert(VG_IS_32_ALIGNED(sz_vexsh2)); + vg_assert(VG_IS_32_ALIGNED(sz_spill)); - vg_assert(VG_IS_16_ALIGNED(a_vex)); - vg_assert(VG_IS_16_ALIGNED(a_vexsh1)); - vg_assert(VG_IS_16_ALIGNED(a_vexsh2)); - vg_assert(VG_IS_16_ALIGNED(a_spill)); + vg_assert(VG_IS_32_ALIGNED(a_vex)); + vg_assert(VG_IS_32_ALIGNED(a_vexsh1)); + vg_assert(VG_IS_32_ALIGNED(a_vexsh2)); + vg_assert(VG_IS_32_ALIGNED(a_spill)); /* Check that the guest state and its two shadows have the same size, and that there are no holes in between. The latter is @@ -739,14 +739,14 @@ static void do_pre_run_checks ( ThreadState* tst ) # endif # if defined(VGA_amd64) - /* amd64 XMM regs must form an array, ie, have no holes in + /* amd64 YMM regs must form an array, ie, have no holes in between. */ vg_assert( - (offsetof(VexGuestAMD64State,guest_XMM16) - - offsetof(VexGuestAMD64State,guest_XMM0)) - == (17/*#regs*/-1) * 16/*bytes per reg*/ + (offsetof(VexGuestAMD64State,guest_YMM16) + - offsetof(VexGuestAMD64State,guest_YMM0)) + == (17/*#regs*/-1) * 32/*bytes per reg*/ ); - vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestAMD64State,guest_XMM0))); + vg_assert(VG_IS_32_ALIGNED(offsetof(VexGuestAMD64State,guest_YMM0))); vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_FPREG))); vg_assert(16 == offsetof(VexGuestAMD64State,guest_RAX)); vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RAX))); diff --git a/coregrind/pub_core_threadstate.h b/coregrind/pub_core_threadstate.h index 4860bade88..ccb41a1b6b 100644 --- a/coregrind/pub_core_threadstate.h +++ b/coregrind/pub_core_threadstate.h @@ -102,19 +102,19 @@ typedef /* Note that for code generation reasons, we require that the guest state area, its two shadows, and the spill area, are - 16-aligned and have 16-aligned sizes, and there are no holes + 32-aligned and have 32-aligned sizes, and there are no holes in between. This is checked by do_pre_run_checks() in scheduler.c. */ /* Saved machine context. */ - VexGuestArchState vex __attribute__((aligned(16))); + VexGuestArchState vex __attribute__((aligned(32))); /* Saved shadow context (2 copies). 
*/ - VexGuestArchState vex_shadow1 __attribute__((aligned(16))); - VexGuestArchState vex_shadow2 __attribute__((aligned(16))); + VexGuestArchState vex_shadow1 __attribute__((aligned(32))); + VexGuestArchState vex_shadow2 __attribute__((aligned(32))); /* Spill area. */ - UChar vex_spill[LibVEX_N_SPILL_BYTES] __attribute__((aligned(16))); + UChar vex_spill[LibVEX_N_SPILL_BYTES] __attribute__((aligned(32))); /* --- END vex-mandated guest state --- */ } diff --git a/docs/Makefile.am b/docs/Makefile.am index 2deeb011c9..95c10691af 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -26,6 +26,7 @@ EXTRA_DIST = \ internals/3_4_BUGSTATUS.txt \ internals/3_5_BUGSTATUS.txt \ internals/arm_thumb_notes_gdbserver.txt \ + internals/avx-notes.txt \ internals/BIG_APP_NOTES.txt \ internals/Darwin-notes.txt \ internals/SPEC-notes.txt \ diff --git a/docs/internals/avx-notes.txt b/docs/internals/avx-notes.txt new file mode 100644 index 0000000000..575b37a434 --- /dev/null +++ b/docs/internals/avx-notes.txt @@ -0,0 +1,28 @@ + +Cleanups +~~~~~~~~ + +FXSAVE/FXRSTOR: can no longer say (w.r.t the guest state +effects declaration) that the SSE regs are written/read +in one single block. Instead need to make a declaration +for each bottom-half independently :-( + +in fact, re-check everything that assumes the XMM regs form +an array, because they no longer do. Done: PCMPISTRI et al, +Also AESENC et al. + +* guest state alignment, all targets -- will probably fail now + +* FXSAVE/FXRSTOR on amd64, as noted above + +* tools other than memcheck -- now fail w/ AVX insns + +* remove regclass HRc256 + +* disable Avx insns in backend (or rm this code, will we + ever need it?) + +* change amd64 getAllocableRegs back to what it was originally + [DONE] + +* fix up none/tests/amd64/avx-1.c diff --git a/memcheck/mc_include.h b/memcheck/mc_include.h index 91921d22c5..017868e5ef 100644 --- a/memcheck/mc_include.h +++ b/memcheck/mc_include.h @@ -549,11 +549,13 @@ VG_REGPARM(2) void MC_(helperc_b_store2) ( Addr a, UWord d32 ); VG_REGPARM(2) void MC_(helperc_b_store4) ( Addr a, UWord d32 ); VG_REGPARM(2) void MC_(helperc_b_store8) ( Addr a, UWord d32 ); VG_REGPARM(2) void MC_(helperc_b_store16)( Addr a, UWord d32 ); +VG_REGPARM(2) void MC_(helperc_b_store32)( Addr a, UWord d32 ); VG_REGPARM(1) UWord MC_(helperc_b_load1) ( Addr a ); VG_REGPARM(1) UWord MC_(helperc_b_load2) ( Addr a ); VG_REGPARM(1) UWord MC_(helperc_b_load4) ( Addr a ); VG_REGPARM(1) UWord MC_(helperc_b_load8) ( Addr a ); VG_REGPARM(1) UWord MC_(helperc_b_load16)( Addr a ); +VG_REGPARM(1) UWord MC_(helperc_b_load32)( Addr a ); /* Functions defined in mc_translate.c */ IRSB* MC_(instrument) ( VgCallbackClosure* closure, diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c index 1f1ae25704..5bee6c2247 100644 --- a/memcheck/mc_machine.c +++ b/memcheck/mc_machine.c @@ -611,23 +611,23 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ) if (o == GOF(FC3210) && szB == 8) return -1; /* XMM registers */ - if (o >= GOF(XMM0) && o+sz <= GOF(XMM0) +SZB(XMM0)) return GOF(XMM0); - if (o >= GOF(XMM1) && o+sz <= GOF(XMM1) +SZB(XMM1)) return GOF(XMM1); - if (o >= GOF(XMM2) && o+sz <= GOF(XMM2) +SZB(XMM2)) return GOF(XMM2); - if (o >= GOF(XMM3) && o+sz <= GOF(XMM3) +SZB(XMM3)) return GOF(XMM3); - if (o >= GOF(XMM4) && o+sz <= GOF(XMM4) +SZB(XMM4)) return GOF(XMM4); - if (o >= GOF(XMM5) && o+sz <= GOF(XMM5) +SZB(XMM5)) return GOF(XMM5); - if (o >= GOF(XMM6) && o+sz <= GOF(XMM6) +SZB(XMM6)) return GOF(XMM6); - if (o >= GOF(XMM7) && o+sz <= GOF(XMM7) +SZB(XMM7)) 
return GOF(XMM7); - if (o >= GOF(XMM8) && o+sz <= GOF(XMM8) +SZB(XMM8)) return GOF(XMM8); - if (o >= GOF(XMM9) && o+sz <= GOF(XMM9) +SZB(XMM9)) return GOF(XMM9); - if (o >= GOF(XMM10) && o+sz <= GOF(XMM10)+SZB(XMM10)) return GOF(XMM10); - if (o >= GOF(XMM11) && o+sz <= GOF(XMM11)+SZB(XMM11)) return GOF(XMM11); - if (o >= GOF(XMM12) && o+sz <= GOF(XMM12)+SZB(XMM12)) return GOF(XMM12); - if (o >= GOF(XMM13) && o+sz <= GOF(XMM13)+SZB(XMM13)) return GOF(XMM13); - if (o >= GOF(XMM14) && o+sz <= GOF(XMM14)+SZB(XMM14)) return GOF(XMM14); - if (o >= GOF(XMM15) && o+sz <= GOF(XMM15)+SZB(XMM15)) return GOF(XMM15); - if (o >= GOF(XMM16) && o+sz <= GOF(XMM16)+SZB(XMM16)) return GOF(XMM16); + if (o >= GOF(YMM0) && o+sz <= GOF(YMM0) +SZB(YMM0)) return GOF(YMM0); + if (o >= GOF(YMM1) && o+sz <= GOF(YMM1) +SZB(YMM1)) return GOF(YMM1); + if (o >= GOF(YMM2) && o+sz <= GOF(YMM2) +SZB(YMM2)) return GOF(YMM2); + if (o >= GOF(YMM3) && o+sz <= GOF(YMM3) +SZB(YMM3)) return GOF(YMM3); + if (o >= GOF(YMM4) && o+sz <= GOF(YMM4) +SZB(YMM4)) return GOF(YMM4); + if (o >= GOF(YMM5) && o+sz <= GOF(YMM5) +SZB(YMM5)) return GOF(YMM5); + if (o >= GOF(YMM6) && o+sz <= GOF(YMM6) +SZB(YMM6)) return GOF(YMM6); + if (o >= GOF(YMM7) && o+sz <= GOF(YMM7) +SZB(YMM7)) return GOF(YMM7); + if (o >= GOF(YMM8) && o+sz <= GOF(YMM8) +SZB(YMM8)) return GOF(YMM8); + if (o >= GOF(YMM9) && o+sz <= GOF(YMM9) +SZB(YMM9)) return GOF(YMM9); + if (o >= GOF(YMM10) && o+sz <= GOF(YMM10)+SZB(YMM10)) return GOF(YMM10); + if (o >= GOF(YMM11) && o+sz <= GOF(YMM11)+SZB(YMM11)) return GOF(YMM11); + if (o >= GOF(YMM12) && o+sz <= GOF(YMM12)+SZB(YMM12)) return GOF(YMM12); + if (o >= GOF(YMM13) && o+sz <= GOF(YMM13)+SZB(YMM13)) return GOF(YMM13); + if (o >= GOF(YMM14) && o+sz <= GOF(YMM14)+SZB(YMM14)) return GOF(YMM14); + if (o >= GOF(YMM15) && o+sz <= GOF(YMM15)+SZB(YMM15)) return GOF(YMM15); + if (o >= GOF(YMM16) && o+sz <= GOF(YMM16)+SZB(YMM16)) return GOF(YMM16); /* MMX accesses to FP regs. Need to allow for 32-bit references due to dirty helpers for frstor etc, which reference the entire diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c index 804285ae60..d0d439b192 100644 --- a/memcheck/mc_main.c +++ b/memcheck/mc_main.c @@ -5860,6 +5860,16 @@ UWord VG_REGPARM(1) MC_(helperc_b_load16)( Addr a ) { return (UWord)oBoth; } +UWord VG_REGPARM(1) MC_(helperc_b_load32)( Addr a ) { + UInt oQ0 = (UInt)MC_(helperc_b_load8)( a + 0 ); + UInt oQ1 = (UInt)MC_(helperc_b_load8)( a + 8 ); + UInt oQ2 = (UInt)MC_(helperc_b_load8)( a + 16 ); + UInt oQ3 = (UInt)MC_(helperc_b_load8)( a + 24 ); + UInt oAll = merge_origins(merge_origins(oQ0, oQ1), + merge_origins(oQ2, oQ3)); + return (UWord)oAll; +} + /*--------------------------------------------*/ /*--- Origin tracking: store handlers ---*/ @@ -5972,6 +5982,13 @@ void VG_REGPARM(2) MC_(helperc_b_store16)( Addr a, UWord d32 ) { MC_(helperc_b_store8)( a + 8, d32 ); } +void VG_REGPARM(2) MC_(helperc_b_store32)( Addr a, UWord d32 ) { + MC_(helperc_b_store8)( a + 0, d32 ); + MC_(helperc_b_store8)( a + 8, d32 ); + MC_(helperc_b_store8)( a + 16, d32 ); + MC_(helperc_b_store8)( a + 24, d32 ); +} + /*--------------------------------------------*/ /*--- Origin tracking: sarp handlers ---*/ diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c index 15d7640e68..378fa581ff 100644 --- a/memcheck/mc_translate.c +++ b/memcheck/mc_translate.c @@ -358,7 +358,7 @@ static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 ) /* Shadow state is always accessed using integer types. 
This returns an integer type with the same size (as per sizeofIRType) as the given type. The only valid shadow types are Bit, I8, I16, I32, - I64, I128, V128. */ + I64, I128, V128, V256. */ static IRType shadowTypeV ( IRType ty ) { @@ -376,6 +376,7 @@ static IRType shadowTypeV ( IRType ty ) case Ity_F128: return Ity_I128; case Ity_D128: return Ity_I128; case Ity_V128: return Ity_V128; + case Ity_V256: return Ity_V256; default: ppIRType(ty); VG_(tool_panic)("memcheck:shadowTypeV"); } @@ -461,14 +462,17 @@ static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e ) /*------------------------------------------------------------*/ /*--- Helper functions for 128-bit ops ---*/ /*------------------------------------------------------------*/ + static IRExpr *i128_const_zero(void) { - return binop(Iop_64HLto128, IRExpr_Const(IRConst_U64(0)), - IRExpr_Const(IRConst_U64(0))); + IRAtom* z64 = IRExpr_Const(IRConst_U64(0)); + return binop(Iop_64HLto128, z64, z64); } -/* There are no 128-bit loads and/or stores. So we do not need to worry - about that in expr2vbits_Load */ +/* There are no I128-bit loads and/or stores [as generated by any + current front ends]. So we do not need to worry about that in + expr2vbits_Load */ + /*------------------------------------------------------------*/ /*--- Constructing definedness primitive ops ---*/ @@ -3716,7 +3720,6 @@ IRAtom* expr2vbits_Load ( MCEnv* mce, IREndness end, IRType ty, IRAtom* addr, UInt bias ) { - IRAtom *v64hi, *v64lo; tl_assert(end == Iend_LE || end == Iend_BE); switch (shadowTypeV(ty)) { case Ity_I8: @@ -3724,17 +3727,33 @@ IRAtom* expr2vbits_Load ( MCEnv* mce, case Ity_I32: case Ity_I64: return expr2vbits_Load_WRK(mce, end, ty, addr, bias); - case Ity_V128: + case Ity_V128: { + IRAtom *v64hi, *v64lo; if (end == Iend_LE) { - v64lo = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias); + v64lo = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+0); v64hi = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+8); } else { - v64hi = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias); + v64hi = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+0); v64lo = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+8); } return assignNew( 'V', mce, Ity_V128, binop(Iop_64HLtoV128, v64hi, v64lo)); + } + case Ity_V256: { + /* V256-bit case -- phrased in terms of 64 bit units (Qs), + with Q3 being the most significant lane. */ + if (end == Iend_BE) goto unhandled; + IRAtom* v64Q0 = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+0); + IRAtom* v64Q1 = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+8); + IRAtom* v64Q2 = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+16); + IRAtom* v64Q3 = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+24); + return assignNew( 'V', mce, + Ity_V256, + IRExpr_Qop(Iop_64x4toV256, + v64Q3, v64Q2, v64Q1, v64Q0)); + } + unhandled: default: VG_(tool_panic)("expr2vbits_Load"); } @@ -3934,7 +3953,8 @@ void do_shadow_Store ( MCEnv* mce, // shadow computation ops that precede it. if (MC_(clo_mc_level) == 1) { switch (ty) { - case Ity_V128: // V128 weirdness + case Ity_V256: // V256 weirdness -- used four times + case Ity_V128: // V128 weirdness -- used twice c = IRConst_V128(V_BITS16_DEFINED); break; case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break; case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break; @@ -3953,6 +3973,7 @@ void do_shadow_Store ( MCEnv* mce, bits into shadow memory. 
*/ if (end == Iend_LE) { switch (ty) { + case Ity_V256: /* we'll use the helper four times */ case Ity_V128: /* we'll use the helper twice */ case Ity_I64: helper = &MC_(helperc_STOREV64le); hname = "MC_(helperc_STOREV64le)"; @@ -3983,11 +4004,81 @@ void do_shadow_Store ( MCEnv* mce, case Ity_I8: helper = &MC_(helperc_STOREV8); hname = "MC_(helperc_STOREV8)"; break; + /* Note, no V256 case here, because no big-endian target that + we support, has 256 vectors. */ default: VG_(tool_panic)("memcheck:do_shadow_Store(BE)"); } } - if (ty == Ity_V128) { + if (UNLIKELY(ty == Ity_V256)) { + + /* V256-bit case -- phrased in terms of 64 bit units (Qs), with + Q3 being the most significant lane. */ + /* These are the offsets of the Qs in memory. */ + Int offQ0, offQ1, offQ2, offQ3; + + /* Various bits for constructing the 4 lane helper calls */ + IRDirty *diQ0, *diQ1, *diQ2, *diQ3; + IRAtom *addrQ0, *addrQ1, *addrQ2, *addrQ3; + IRAtom *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3; + IRAtom *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3; + + if (end == Iend_LE) { + offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24; + } else { + offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24; + } + + eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0); + addrQ0 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) ); + vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata)); + diQ0 = unsafeIRDirty_0_N( + 1/*regparms*/, + hname, VG_(fnptr_to_fnentry)( helper ), + mkIRExprVec_2( addrQ0, vdataQ0 ) + ); + + eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1); + addrQ1 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) ); + vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata)); + diQ1 = unsafeIRDirty_0_N( + 1/*regparms*/, + hname, VG_(fnptr_to_fnentry)( helper ), + mkIRExprVec_2( addrQ1, vdataQ1 ) + ); + + eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2); + addrQ2 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) ); + vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata)); + diQ2 = unsafeIRDirty_0_N( + 1/*regparms*/, + hname, VG_(fnptr_to_fnentry)( helper ), + mkIRExprVec_2( addrQ2, vdataQ2 ) + ); + + eBiasQ3 = tyAddr==Ity_I32 ? 
mkU32(bias+offQ3) : mkU64(bias+offQ3); + addrQ3 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) ); + vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata)); + diQ3 = unsafeIRDirty_0_N( + 1/*regparms*/, + hname, VG_(fnptr_to_fnentry)( helper ), + mkIRExprVec_2( addrQ3, vdataQ3 ) + ); + + if (guard) + diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard; + + setHelperAnns( mce, diQ0 ); + setHelperAnns( mce, diQ1 ); + setHelperAnns( mce, diQ2 ); + setHelperAnns( mce, diQ3 ); + stmt( 'V', mce, IRStmt_Dirty(diQ0) ); + stmt( 'V', mce, IRStmt_Dirty(diQ1) ); + stmt( 'V', mce, IRStmt_Dirty(diQ2) ); + stmt( 'V', mce, IRStmt_Dirty(diQ3) ); + + } + else if (UNLIKELY(ty == Ity_V128)) { /* V128-bit case */ /* See comment in next clause re 64-bit regparms */ @@ -5449,6 +5540,9 @@ static IRAtom* gen_load_b ( MCEnv* mce, Int szB, case 16: hFun = (void*)&MC_(helperc_b_load16); hName = "MC_(helperc_b_load16)"; break; + case 32: hFun = (void*)&MC_(helperc_b_load32); + hName = "MC_(helperc_b_load32)"; + break; default: VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB); tl_assert(0); @@ -5511,6 +5605,9 @@ static void gen_store_b ( MCEnv* mce, Int szB, case 16: hFun = (void*)&MC_(helperc_b_store16); hName = "MC_(helperc_b_store16)"; break; + case 32: hFun = (void*)&MC_(helperc_b_store32); + hName = "MC_(helperc_b_store32)"; + break; default: tl_assert(0); } diff --git a/none/tests/amd64/avx-1.c b/none/tests/amd64/avx-1.c new file mode 100644 index 0000000000..861bba861c --- /dev/null +++ b/none/tests/amd64/avx-1.c @@ -0,0 +1,344 @@ + /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */ + /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */ + /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */ + /* VMOVSS xmm1, m64 = VEX.LIG.F3.0F.WIG 11 /r */ + /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */ + /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */ + /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */ + /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */ + /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */ + /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */ + +/* . VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */ +/* . VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */ +/* . VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */ +/* . VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */ +/* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */ +/* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */ +/* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */ +/* . 
VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
+/* VANDPD r/m, rV, r ::: r = rV & r/m (MVR format) */
+/* VANDNPD r/m, rV, r ::: r = (not rV) & r/m (MVR format) */
+/* VORPD r/m, rV, r ::: r = rV | r/m (MVR format) */
+/* VXORPD r/m, rV, r ::: r = rV ^ r/m (MVR format) */
+/* VXORPS r/m, rV, r ::: r = rV ^ r/m (MVR format) */
+/* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
+/* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
+/* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
+/* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
+/* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
+/* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
+/* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
+
+ /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E */
+ /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
+ /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
+ /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
+
+/* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
+/* VPSLLD imm8, xmm2, xmm1 = VEX.128.66.0F.WIG 72 /6 ib */
+/* VPSRLDQ VEX.NDD.128.66.0F.WIG 73 /3 ib */
+/* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m (MVR format) */
+
+ /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
+ /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
+ /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
+
+/* . VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
+/* . VPOR = VEX.NDS.128.66.0F.WIG EB /r */
+/* . VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
+/* . VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
+/* . VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
+/* . VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
+/* . VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) (MVR format) */
+/* . VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
+/* . VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
+/* . VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
+/* . VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
+ /* VPEXTRD imm8, r32/m32, xmm2 */
+ /* VINSERTF128 r/m, rV, rD */
+ /* VEXTRACTF128 rS, r/m */
+
+/* . VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4 */
+ /* VEX.128.F2.0F.WIG /12 r = MOVDDUP xmm2/m64, xmm1 */
+ /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
+/* . VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
+/* . VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
+/* . VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
+/* . VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
+/* . VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
+/* . VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
+/* . VANDPS = VEX.NDS.128.0F.WIG 54 /r */
+/* . VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
+/* . VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
+
+/* really needs testing -- Intel docs don't make sense */
+/* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 */
+
+/* really needs testing -- Intel docs don't make sense */
+/* of the form vmovq %xmm0,-0x8(%rsp) */
+
+/* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
+/* . VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
+/* . VORPS = VEX.NDS.128.0F.WIG 56 /r */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <malloc.h>
+
+typedef unsigned char UChar;
+typedef unsigned int UInt;
+typedef unsigned long int UWord;
+typedef unsigned long long int ULong;
+
+#define IS_32_ALIGNED(_ptr) (0 == (0x1F & (UWord)(_ptr)))
+
+typedef union { UChar u8[32]; UInt u32[8]; } YMM;
+
+typedef struct { YMM a1; YMM a2; YMM a3; YMM a4; ULong u64; } Block;
+
+void showYMM ( YMM* vec )
+{
+   int i;
+   assert(IS_32_ALIGNED(vec));
+   for (i = 31; i >= 0; i--) {
+      printf("%02x", (UInt)vec->u8[i]);
+      if (i > 0 && 0 == ((i+0) & 7)) printf(".");
+   }
+}
+
+void showBlock ( char* msg, Block* block )
+{
+   printf(" %s\n", msg);
+   printf(" "); showYMM(&block->a1); printf("\n");
+   printf(" "); showYMM(&block->a2); printf("\n");
+   printf(" "); showYMM(&block->a3); printf("\n");
+   printf(" "); showYMM(&block->a4); printf("\n");
+   printf(" %016llx\n", block->u64);
+}
+
+UChar randUChar ( void )
+{
+   static UInt seed = 80021;
+   seed = 1103515245 * seed + 12345;
+   return (seed >> 17) & 0xFF;
+}
+
+void randBlock ( Block* b )
+{
+   int i;
+   UChar* p = (UChar*)b;
+   for (i = 0; i < sizeof(Block); i++)
+      p[i] = randUChar();
+}
+
+
+/* Generate a function test_NAME, that tests the given insn, in both
+   its mem and reg forms. The reg form of the insn may mention, as
+   operands only %ymm6, %ymm7, %ymm8, %ymm9 and %r14. The mem form of
+   the insn may mention as operands only (%rax), %ymm7, %ymm8, %ymm9
+   and %r14. */
+
+#define GEN_test_RandM(_name, _reg_form, _mem_form) \
+ \
+   static void test_##_name ( void ) \
+   { \
+      Block* b = memalign(32, sizeof(Block)); \
+      randBlock(b); \
+      printf("%s(reg)\n", #_name); \
+      showBlock("before", b); \
+      __asm__ __volatile__( \
+         "vmovdqa 0(%0),%%ymm7" "\n\t" \
+         "vmovdqa 32(%0),%%ymm8" "\n\t" \
+         "vmovdqa 64(%0),%%ymm6" "\n\t" \
+         "vmovdqa 96(%0),%%ymm9" "\n\t" \
+         "movq 128(%0),%%r14" "\n\t" \
+         _reg_form "\n\t" \
+         "vmovdqa %%ymm7, 0(%0)" "\n\t" \
+         "vmovdqa %%ymm8, 32(%0)" "\n\t" \
+         "vmovdqa %%ymm6, 64(%0)" "\n\t" \
+         "vmovdqa %%ymm9, 96(%0)" "\n\t" \
+         "movq %%r14, 128(%0)" "\n\t" \
+         : /*OUT*/ \
+         : /*IN*/"r"(b) \
+         : /*TRASH*/"xmm7","xmm8","xmm6","xmm9","r14","memory","cc" \
+      ); \
+      showBlock("after", b); \
+      randBlock(b); \
+      printf("%s(mem)\n", #_name); \
+      showBlock("before", b); \
+      __asm__ __volatile__( \
+         "leaq 0(%0),%%rax" "\n\t" \
+         "vmovdqa 32(%0),%%ymm8" "\n\t" \
+         "vmovdqa 64(%0),%%ymm7" "\n\t" \
+         "vmovdqa 96(%0),%%ymm9" "\n\t" \
+         "movq 128(%0),%%r14" "\n\t" \
+         _mem_form "\n\t" \
+         "vmovdqa %%ymm8, 32(%0)" "\n\t" \
+         "vmovdqa %%ymm7, 64(%0)" "\n\t" \
+         "vmovdqa %%ymm9, 96(%0)" "\n\t" \
+         "movq %%r14, 128(%0)" "\n\t" \
+         : /*OUT*/ \
+         : /*IN*/"r"(b) \
+         : /*TRASH*/"xmm8","xmm7","xmm9","r14","rax","memory","cc" \
+      ); \
+      showBlock("after", b); \
+      printf("\n"); \
+      free(b); \
+   }
+
+GEN_test_RandM(VPOR_128,
+               "vpor %%xmm6, %%xmm8, %%xmm7",
+               "vpor (%%rax), %%xmm8, %%xmm7")
+
+GEN_test_RandM(VPXOR_128,
+               "vpxor %%xmm6, %%xmm8, %%xmm7",
+               "vpxor (%%rax), %%xmm8, %%xmm7")
+
+GEN_test_RandM(VPSUBB_128,
+               "vpsubb %%xmm6, %%xmm8, %%xmm7",
+               "vpsubb (%%rax), %%xmm8, %%xmm7")
+
+GEN_test_RandM(VPSUBD_128,
+               "vpsubd %%xmm6, %%xmm8, %%xmm7",
+               "vpsubd (%%rax), %%xmm8, %%xmm7")
+
+GEN_test_RandM(VPADDD_128,
+               "vpaddd %%xmm6, %%xmm8, %%xmm7",
+               "vpaddd (%%rax), %%xmm8, %%xmm7")
+
+GEN_test_RandM(VPMOVZXWD_128,
+               "vpmovzxwd %%xmm6, %%xmm8",
+               "vpmovzxwd (%%rax), %%xmm8")
+
+GEN_test_RandM(VPMOVZXBW_128,
+               "vpmovzxbw %%xmm6, %%xmm8",
+               "vpmovzxbw (%%rax), %%xmm8")
+
+GEN_test_RandM(VPBLENDVB_128,
+               "vpblendvb %%xmm9, %%xmm6, %%xmm8,
%%xmm7", + "vpblendvb %%xmm9, (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VPMINSD_128, + "vpminsd %%xmm6, %%xmm8, %%xmm7", + "vpminsd (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VPMAXSD_128, + "vpmaxsd %%xmm6, %%xmm8, %%xmm7", + "vpmaxsd (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VANDPD_128, + "vandpd %%xmm6, %%xmm8, %%xmm7", + "vandpd (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VCVTSI2SD_32, + "vcvtsi2sdl %%r14d, %%xmm8, %%xmm7", + "vcvtsi2sdl (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VCVTSI2SD_64, + "vcvtsi2sdq %%r14, %%xmm8, %%xmm7", + "vcvtsi2sdq (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VCVTSI2SS_64, + "vcvtsi2ssq %%r14, %%xmm8, %%xmm7", + "vcvtsi2ssq (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VCVTTSD2SI_32, + "vcvttsd2si %%xmm8, %%r14d", + "vcvttsd2si (%%rax), %%r14d") + +GEN_test_RandM(VPSHUFB_128, + "vpshufb %%xmm6, %%xmm8, %%xmm7", + "vpshufb (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VCMPSD_128_0x0, + "vcmpsd $0, %%xmm6, %%xmm8, %%xmm7", + "vcmpsd $0, (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VCMPSD_128_0xD, + "vcmpsd $0xd, %%xmm6, %%xmm8, %%xmm7", + "vcmpsd $0xd, (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VSQRTSD_128, + "vsqrtsd %%xmm6, %%xmm8, %%xmm7", + "vsqrtsd (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VORPS_128, + "vorps %%xmm6, %%xmm8, %%xmm7", + "vorps (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VANDNPS_128, + "vandnps %%xmm6, %%xmm8, %%xmm7", + "vandnps (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VMAXSS_128, + "vmaxss %%xmm6, %%xmm8, %%xmm7", + "vmaxss (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VMINSS_128, + "vminss %%xmm6, %%xmm8, %%xmm7", + "vminss (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VANDPS_128, + "vandps %%xmm6, %%xmm8, %%xmm7", + "vandps (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VCVTSI2SS_128, + "vcvtsi2ssl %%r14d, %%xmm8, %%xmm7", + "vcvtsi2ssl (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VUNPCKLPS_128, + "vunpcklps %%xmm6, %%xmm8, %%xmm7", + "vunpcklps (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VDIVSS_128, + "vdivss %%xmm6, %%xmm8, %%xmm7", + "vdivss (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VADDSS_128, + "vaddss %%xmm6, %%xmm8, %%xmm7", + "vaddss (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VSUBSS_128, + "vsubss %%xmm6, %%xmm8, %%xmm7", + "vsubss (%%rax), %%xmm8, %%xmm7") + +GEN_test_RandM(VMULSS_128, + "vmulss %%xmm6, %%xmm8, %%xmm7", + "vmulss (%%rax), %%xmm8, %%xmm7") + +int main ( void ) +{ + test_VMULSS_128(); + test_VSUBSS_128(); + test_VADDSS_128(); + test_VDIVSS_128(); + test_VUNPCKLPS_128(); + test_VCVTSI2SS_128(); + test_VANDPS_128(); + test_VMINSS_128(); + test_VMAXSS_128(); + test_VANDNPS_128(); + test_VORPS_128(); + test_VSQRTSD_128(); + // test_VCMPSD_128_0xD(); BORKED + test_VCMPSD_128_0x0(); + test_VPSHUFB_128(); + test_VCVTTSD2SI_32(); + test_VCVTSI2SS_64(); + test_VCVTSI2SD_64(); + test_VCVTSI2SD_32(); + test_VPOR_128(); + test_VPXOR_128(); + test_VPSUBB_128(); + test_VPSUBD_128(); + test_VPADDD_128(); + test_VPMOVZXBW_128(); + test_VPMOVZXWD_128(); + test_VPBLENDVB_128(); + test_VPMINSD_128(); + test_VPMAXSD_128(); + test_VANDPD_128(); + return 0; +}