Implement XSAVE/XRSTOR for AVX (state components 0, 1 and 2)

author Julian Seward <jseward@acm.org>

Wed, 12 Aug 2015 11:15:53 +0000 (11:15 +0000)

committer Julian Seward <jseward@acm.org>

Wed, 12 Aug 2015 11:15:53 +0000 (11:15 +0000)
author Julian Seward <jseward@acm.org>
Wed, 12 Aug 2015 11:15:53 +0000 (11:15 +0000)
committer Julian Seward <jseward@acm.org>
Wed, 12 Aug 2015 11:15:53 +0000 (11:15 +0000)
diff --git a/VEX/priv/guest_amd64_defs.h b/VEX/priv/guest_amd64_defs.h

index 3b9e7ffeac47852f08bb3525a64eb18143cb9351..72cd7f7652254d2c0e6ea513f833b9e5d9bb11e1 100644 (file)
--- a/VEX/priv/guest_amd64_defs.h
+++ b/VEX/priv/guest_amd64_defs.h
@@ -168,13 +168,19 @@ extern void  amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st );
  extern void  amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st );
  extern void  amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st );
  extern void  amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st );
+extern void  amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st );
  
  extern void  amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* );
  
-extern void      amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM
-                    ( VexGuestAMD64State*, HWord );
-extern VexEmNote amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM
-                    ( VexGuestAMD64State*, HWord );
+extern void amd64g_dirtyhelper_XSAVE_COMPONENT_0
+               ( VexGuestAMD64State* gst, HWord addr );
+extern void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS 
+               ( VexGuestAMD64State* gst, HWord addr );
+
+extern VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
+                    ( VexGuestAMD64State* gst, HWord addr );
+extern VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS 
+                    ( VexGuestAMD64State* gst, HWord addr );
  
  extern ULong amd64g_dirtyhelper_RDTSC ( void );
  extern void  amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st );
diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c

index e3c3b73a4d09935e727bcadea1abacd3f01b98f3..3eeb8aeb4cd05146ddac65ff97e317c9bceed093 100644 (file)
--- a/VEX/priv/guest_amd64_helpers.c
+++ b/VEX/priv/guest_amd64_helpers.c
@@ -1943,8 +1943,15 @@ void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
  }
  
  
-static
-void do_fxsave ( VexGuestAMD64State* gst, HWord addr, Bool save_xmm_regs )
+/*---------------------------------------------------------------*/
+/*--- Supporting functions for XSAVE/FXSAVE.                  ---*/
+/*---------------------------------------------------------------*/
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+/* XSAVE component 0 is the x87 FPU state. */
+void amd64g_dirtyhelper_XSAVE_COMPONENT_0
+        ( VexGuestAMD64State* gst, HWord addr )
  {
     /* Derived from values obtained from
        vendor_id       : AuthenticAMD
@@ -1959,17 +1966,15 @@ void do_fxsave ( VexGuestAMD64State* gst, HWord addr, Bool save_xmm_regs )
     Fpu_State tmp;
     UShort*   addrS = (UShort*)addr;
     UChar*    addrC = (UChar*)addr;
-   UInt      mxcsr;
     UShort    fp_tags;
     UInt      summary_tags;
     Int       r, stno;
     UShort    *srcS, *dstS;
  
     do_get_x87( gst, (UChar*)&tmp );
-   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
  
-   /* Now build the proper fxsave image from the x87 image we just
-      made. */
+   /* Now build the proper fxsave x87 image from the fsave x87 image
+      we just made. */
  
     addrS[0]  = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
     addrS[1]  = tmp.env[FP_ENV_STAT]; /* FCW: fpu status word */
@@ -2002,11 +2007,8 @@ void do_fxsave ( VexGuestAMD64State* gst, HWord addr, Bool save_xmm_regs )
     addrS[10] = 0; /* BOGUS */
     addrS[11] = 0; /* BOGUS */
  
-   addrS[12] = toUShort(mxcsr);  /* MXCSR */
-   addrS[13] = toUShort(mxcsr >> 16);
-
-   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
-   addrS[15] = 0x0000; /* MXCSR mask (hi16) */
+   /* addrS[13,12] are MXCSR -- not written */
+   /* addrS[15,14] are MXCSR_MASK -- not written */
  
     /* Copy in the FP registers, in ST order. */
     for (stno = 0; stno < 8; stno++) {
@@ -2021,94 +2023,95 @@ void do_fxsave ( VexGuestAMD64State* gst, HWord addr, Bool save_xmm_regs )
        dstS[6] = 0;
        dstS[7] = 0;
     }
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+/* XSAVE component 1 is the SSE state. */
+void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS 
+        ( VexGuestAMD64State* gst, HWord addr )
+{
+   UShort* addrS = (UShort*)addr;
+   UInt    mxcsr;
+
+   /* The only non-register parts of the SSE state are MXCSR and
+      MXCSR_MASK. */
+   mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
+
+   addrS[12] = toUShort(mxcsr);  /* MXCSR */
+   addrS[13] = toUShort(mxcsr >> 16);
+
+   addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
+   addrS[15] = 0x0000; /* MXCSR mask (hi16) */
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+/* Do FXSAVE from the supplied VexGuestAMD64State structure and store
+   the result at the given address which represents a buffer of at
+   least 416 bytes.
+
+   This function is not called from generated code.  FXSAVE is dealt
+   with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
+   functions above plus some in-line IR.  This function is merely a
+   convenience function for VEX's users.
+*/
+void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
+                                /*OUT*/HWord fp_state )
+{
+   /* Do the x87 part */
+   amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
+
+   /* And now the SSE part, except for the registers themselves. */
+   amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
  
     /* That's the first 160 bytes of the image done. */
-   if (save_xmm_regs == True) {
-      /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
-         big-endian, these need to be byte-swapped. */
-      U128 *xmm = (U128 *)(addr + 160);
-
-      vassert(host_is_little_endian());
-
-#     define COPY_U128(_dst,_src)                       \
-         do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
-              _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
-         while (0)
-
-      COPY_U128( xmm[0],  gst->guest_YMM0 );
-      COPY_U128( xmm[1],  gst->guest_YMM1 );
-      COPY_U128( xmm[2],  gst->guest_YMM2 );
-      COPY_U128( xmm[3],  gst->guest_YMM3 );
-      COPY_U128( xmm[4],  gst->guest_YMM4 );
-      COPY_U128( xmm[5],  gst->guest_YMM5 );
-      COPY_U128( xmm[6],  gst->guest_YMM6 );
-      COPY_U128( xmm[7],  gst->guest_YMM7 );
-      COPY_U128( xmm[8],  gst->guest_YMM8 );
-      COPY_U128( xmm[9],  gst->guest_YMM9 );
-      COPY_U128( xmm[10], gst->guest_YMM10 );
-      COPY_U128( xmm[11], gst->guest_YMM11 );
-      COPY_U128( xmm[12], gst->guest_YMM12 );
-      COPY_U128( xmm[13], gst->guest_YMM13 );
-      COPY_U128( xmm[14], gst->guest_YMM14 );
-      COPY_U128( xmm[15], gst->guest_YMM15 );
+   /* Now only %xmm0 .. %xmm15 remain to be copied.  If the host is
+      big-endian, these need to be byte-swapped. */
+   U128 *xmm = (U128 *)(fp_state + 160);
+   vassert(host_is_little_endian());
+
+#  define COPY_U128(_dst,_src)                       \
+      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
+           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
+      while (0)
+
+   COPY_U128( xmm[0],  gst->guest_YMM0 );
+   COPY_U128( xmm[1],  gst->guest_YMM1 );
+   COPY_U128( xmm[2],  gst->guest_YMM2 );
+   COPY_U128( xmm[3],  gst->guest_YMM3 );
+   COPY_U128( xmm[4],  gst->guest_YMM4 );
+   COPY_U128( xmm[5],  gst->guest_YMM5 );
+   COPY_U128( xmm[6],  gst->guest_YMM6 );
+   COPY_U128( xmm[7],  gst->guest_YMM7 );
+   COPY_U128( xmm[8],  gst->guest_YMM8 );
+   COPY_U128( xmm[9],  gst->guest_YMM9 );
+   COPY_U128( xmm[10], gst->guest_YMM10 );
+   COPY_U128( xmm[11], gst->guest_YMM11 );
+   COPY_U128( xmm[12], gst->guest_YMM12 );
+   COPY_U128( xmm[13], gst->guest_YMM13 );
+   COPY_U128( xmm[14], gst->guest_YMM14 );
+   COPY_U128( xmm[15], gst->guest_YMM15 );
  #  undef COPY_U128
-   } else {
-      /* We let the generated IR to copy remaining %xmm0 .. %xmm15, so as to
-       make Memcheck's definedness flow for the non-XMM parts independent from
-       that of the all the other control and status words in the structure.
-       This avoids the false positives shown in #291310. */
-   }
  }
  
  
-static
-VexEmNote do_fxrstor ( VexGuestAMD64State* gst, HWord addr,
-                       Bool rstor_xmm_regs )
+/*---------------------------------------------------------------*/
+/*--- Supporting functions for XRSTOR/FXRSTOR.                ---*/
+/*---------------------------------------------------------------*/
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state, reads guest mem) */
+VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
+             ( VexGuestAMD64State* gst, HWord addr )
  {
     Fpu_State tmp;
-   VexEmNote warnX87 = EmNote_NONE;
-   VexEmNote warnXMM = EmNote_NONE;
     UShort*   addrS   = (UShort*)addr;
     UChar*    addrC   = (UChar*)addr;
     UShort    fp_tags;
     Int       r, stno, i;
  
-   if (rstor_xmm_regs == True) {
-      /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
-         to be byte-swapped. */
-      U128 *xmm = (U128 *)(addr + 160);
-
-      vassert(host_is_little_endian());
-
-#     define COPY_U128(_dst,_src)                       \
-         do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
-              _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
-         while (0)
-
-      COPY_U128( gst->guest_YMM0, xmm[0] );
-      COPY_U128( gst->guest_YMM1, xmm[1] );
-      COPY_U128( gst->guest_YMM2, xmm[2] );
-      COPY_U128( gst->guest_YMM3, xmm[3] );
-      COPY_U128( gst->guest_YMM4, xmm[4] );
-      COPY_U128( gst->guest_YMM5, xmm[5] );
-      COPY_U128( gst->guest_YMM6, xmm[6] );
-      COPY_U128( gst->guest_YMM7, xmm[7] );
-      COPY_U128( gst->guest_YMM8, xmm[8] );
-      COPY_U128( gst->guest_YMM9, xmm[9] );
-      COPY_U128( gst->guest_YMM10, xmm[10] );
-      COPY_U128( gst->guest_YMM11, xmm[11] );
-      COPY_U128( gst->guest_YMM12, xmm[12] );
-      COPY_U128( gst->guest_YMM13, xmm[13] );
-      COPY_U128( gst->guest_YMM14, xmm[14] );
-      COPY_U128( gst->guest_YMM15, xmm[15] );
-
-#  undef COPY_U128
-   } else {
-      /* Don't restore %xmm0 .. %xmm15, for the same reasons that
-         do_fxsave(save_xmm_regs = False) doesn't save them.  See
-         comment in that function for details. */
-   }
-
     /* Copy the x87 registers out of the image, into a temporary
        Fpu_State struct. */
     for (i = 0; i < 14; i++) tmp.env[i] = 0;
@@ -2137,16 +2140,75 @@ VexEmNote do_fxrstor ( VexGuestAMD64State* gst, HWord addr,
     tmp.env[FP_ENV_TAG] = fp_tags;
  
     /* Now write 'tmp' into the guest state. */
-   warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
+   VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
  
-   { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
-                | ((((UInt)addrS[13]) & 0xFFFF) << 16);
-     ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
+   return warnX87;
+}
  
-     warnXMM = (VexEmNote)(w64 >> 32);
  
-     gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
-   }
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state, reads guest mem) */
+VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
+             ( VexGuestAMD64State* gst, HWord addr )
+{
+   UShort* addrS = (UShort*)addr;
+   UInt    w32   = (((UInt)addrS[12]) & 0xFFFF)
+                   | ((((UInt)addrS[13]) & 0xFFFF) << 16);
+   ULong   w64   = amd64g_check_ldmxcsr( (ULong)w32 );
+
+   VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
+
+   gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
+   return warnXMM;
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+/* Do FXRSTOR from the supplied address and store the values read into
+   the given VexGuestAMD64State structure.
+
+   This function is not called from generated code.  FXRSTOR is dealt
+   with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
+   functions above plus some in-line IR.  This function is merely a
+   convenience function for VEX's users.
+*/
+VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
+                                      /*MOD*/VexGuestAMD64State* gst )
+{
+   /* Restore %xmm0 .. %xmm15.  If the host is big-endian, these need
+      to be byte-swapped. */
+   U128 *xmm = (U128 *)(fp_state + 160);
+
+   vassert(host_is_little_endian());
+
+#  define COPY_U128(_dst,_src)                       \
+      do { _dst[0] = _src[0]; _dst[1] = _src[1];     \
+           _dst[2] = _src[2]; _dst[3] = _src[3]; }   \
+      while (0)
+
+   COPY_U128( gst->guest_YMM0, xmm[0] );
+   COPY_U128( gst->guest_YMM1, xmm[1] );
+   COPY_U128( gst->guest_YMM2, xmm[2] );
+   COPY_U128( gst->guest_YMM3, xmm[3] );
+   COPY_U128( gst->guest_YMM4, xmm[4] );
+   COPY_U128( gst->guest_YMM5, xmm[5] );
+   COPY_U128( gst->guest_YMM6, xmm[6] );
+   COPY_U128( gst->guest_YMM7, xmm[7] );
+   COPY_U128( gst->guest_YMM8, xmm[8] );
+   COPY_U128( gst->guest_YMM9, xmm[9] );
+   COPY_U128( gst->guest_YMM10, xmm[10] );
+   COPY_U128( gst->guest_YMM11, xmm[11] );
+   COPY_U128( gst->guest_YMM12, xmm[12] );
+   COPY_U128( gst->guest_YMM13, xmm[13] );
+   COPY_U128( gst->guest_YMM14, xmm[14] );
+   COPY_U128( gst->guest_YMM15, xmm[15] );
+
+#  undef COPY_U128
+
+   VexEmNote warnXMM
+      = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
+   VexEmNote warnX87
+      = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
  
     /* Prefer an X87 emwarn over an XMM one, if both exist. */
     if (warnX87 != EmNote_NONE)
@@ -2156,24 +2218,9 @@ VexEmNote do_fxrstor ( VexGuestAMD64State* gst, HWord addr,
  }
  
  
-/* CALLED FROM GENERATED CODE */
-/* DIRTY HELPER (reads guest state, writes guest mem) */
-/* NOTE: only handles 32-bit format (no REX.W on the insn) */
-/* NOTE: does not save XMM registers - see do_fxsave() for details */
-void amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM ( VexGuestAMD64State* gst,
-                                                HWord addr )
-{
-   do_fxsave( gst, addr, False );
-}
-
-/* CALLED FROM GENERATED CODE */
-/* DIRTY HELPER (writes guest state, reads guest mem) */
-VexEmNote amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM ( VexGuestAMD64State* gst,
-                                                      HWord addr )
-{
-   return do_fxrstor( gst, addr, False );
-}
-
+/*---------------------------------------------------------------*/
+/*--- Supporting functions for FSAVE/FRSTOR                   ---*/
+/*---------------------------------------------------------------*/
  
  /* DIRTY HELPER (writes guest state) */
  /* Initialise the x87 FPU state as per 'finit'. */
@@ -2465,28 +2512,9 @@ VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
     return ew;
  }
  
-/* VISIBLE TO LIBVEX CLIENT */
-/* Do FXSAVE from the supplied VexGuestAMD64tate structure and store the
-   result at the given address which represents a buffer of at least 416
-   bytes. Saves also XMM registers. */
-void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
-                                /*OUT*/HWord fp_state )
-{
-   do_fxsave( gst, fp_state, True );
-}
-
-/* VISIBLE TO LIBVEX CLIENT */
-/* Do FXRSTOR from the supplied address and store read values to the given
-   VexGuestAMD64State structure. Restores also XMM registers. */
-VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
-                                      /*MOD*/VexGuestAMD64State* gst )
-{
-   return do_fxrstor( gst, fp_state, True );
-}
-
  
  /*---------------------------------------------------------------*/
-/*--- Misc integer helpers, including rotates and CPUID.      ---*/
+/*--- CPUID helpers.                                          ---*/
  /*---------------------------------------------------------------*/
  
  /* Claim to be the following CPU, which is probably representative of
@@ -2845,6 +2873,14 @@ void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
  /* Claim to be the following CPU (4 x ...), which is AVX and cx16
     capable.  Plus (kludge!) it "supports" HTM.
  
+   Also with the following change: claim that XSaveOpt is not
+   available, by making cpuid(eax=0xD,ecx=1).eax[0] return 0, compared
+   to 1 on the real CPU.  Consequently, programs that correctly observe
+   these CPUID values should only try to use 3 of the 8 XSave-family
+   instructions: XGETBV, XSAVE and XRSTOR.  In particular this avoids
+   having to implement the compacted or optimised save/restore
+   variants.
+
     vendor_id       : GenuineIntel
     cpu family      : 6
     model           : 42
@@ -2955,7 +2991,7 @@ void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
           switch (old_ecx) {
              case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
                                        0x00000340, 0x00000000); break;
-            case 0x00000001: SET_ABCD(0x00000001, 0x00000000,
+            case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
                                        0x00000000, 0x00000000); break;
              case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
                                        0x00000000, 0x00000000); break;
@@ -3004,6 +3040,176 @@ void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st )
  }
  
  
+/* Claim to be the following CPU (4 x ...), which is AVX2 capable.
+
+   With the following change: claim that XSaveOpt is not available, by
+   making cpuid(eax=0xD,ecx=1).eax[0] return 0, compared to 1 on the
+   real CPU.  Consequently, programs that correctly observe these CPUID
+   values should only try to use 3 of the 8 XSave-family instructions:
+   XGETBV, XSAVE and XRSTOR.  In particular this avoids having to
+   implement the compacted or optimised save/restore variants.
+
+   vendor_id       : GenuineIntel
+   cpu family      : 6
+   model           : 60
+   model name      : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
+   stepping        : 3
+   microcode       : 0x1c
+   cpu MHz         : 919.957
+   cache size      : 8192 KB
+   physical id     : 0
+   siblings        : 4
+   core id         : 3
+   cpu cores       : 4
+   apicid          : 6
+   initial apicid  : 6
+   fpu             : yes
+   fpu_exception   : yes
+   cpuid level     : 13
+   wp              : yes
+   flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
+                     cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
+                     tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
+                     arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
+                     aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
+                     vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
+                     sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
+                     avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
+                     tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
+                     bmi1 avx2 smep bmi2 erms invpcid xsaveopt
+   bugs            :
+   bogomips        : 5786.68
+   clflush size    : 64
+   cache_alignment : 64
+   address sizes   : 39 bits physical, 48 bits virtual
+   power management:
+*/
+void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st )
+{
+#  define SET_ABCD(_a,_b,_c,_d)                \
+      do { st->guest_RAX = (ULong)(_a);        \
+           st->guest_RBX = (ULong)(_b);        \
+           st->guest_RCX = (ULong)(_c);        \
+           st->guest_RDX = (ULong)(_d);        \
+      } while (0)
+
+   UInt old_eax = (UInt)st->guest_RAX;
+   UInt old_ecx = (UInt)st->guest_RCX;
+
+   switch (old_eax) {
+      case 0x00000000:
+         SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
+         break;
+      case 0x00000001:
+         SET_ABCD(0x000306c3, 0x02100800, 0x7ffafbff, 0xbfebfbff);
+         break;
+      case 0x00000002:
+         SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
+         break;
+      case 0x00000003:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x00000004:
+         switch (old_ecx) {
+            case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
+                                      0x0000003f, 0x00000000); break;
+            case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
+                                      0x0000003f, 0x00000000); break;
+            case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
+                                      0x000001ff, 0x00000000); break;
+            case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
+                                      0x00001fff, 0x00000006); break;
+            default:         SET_ABCD(0x00000000, 0x00000000,
+                                      0x00000000, 0x00000000); break;
+         }
+         break;
+      case 0x00000005:
+         SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
+         break;
+      case 0x00000006:
+         SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
+         break;
+      case 0x00000007:
+         switch (old_ecx) {
+            case 0x00000000: SET_ABCD(0x00000000, 0x000027ab,
+                                      0x00000000, 0x00000000); break;
+            default:         SET_ABCD(0x00000000, 0x00000000,
+                                      0x00000000, 0x00000000); break;
+         }
+         break;
+      case 0x00000008:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x00000009:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x0000000a:
+         SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
+         break;
+      case 0x0000000b:
+         switch (old_ecx) {
+            case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
+                                      0x00000100, 0x00000002); break;
+            case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
+                                      0x00000201, 0x00000002); break;
+            default:         SET_ABCD(0x00000000, 0x00000000,
+                                      old_ecx,    0x00000002); break;
+         }
+         break;
+      case 0x0000000c:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x0000000d:
+         switch (old_ecx) {
+            case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
+                                      0x00000340, 0x00000000); break;
+            case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
+                                      0x00000000, 0x00000000); break;
+            case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
+                                      0x00000000, 0x00000000); break;
+            default:         SET_ABCD(0x00000000, 0x00000000,
+                                      0x00000000, 0x00000000); break;
+         }
+         break;
+      case 0x80000000:
+         SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x80000001:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
+         break;
+      case 0x80000002:
+         SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
+         break;
+      case 0x80000003:
+         SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
+         break;
+      case 0x80000004:
+         SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
+         break;
+      case 0x80000005:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      case 0x80000006:
+         SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
+         break;
+      case 0x80000007:
+         SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
+         break;
+      case 0x80000008:
+         SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
+         break;
+      default:
+         SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
+         break;
+   }
+#  undef SET_ABCD
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Misc integer helpers, including rotates and crypto.     ---*/
+/*---------------------------------------------------------------*/
+
  ULong amd64g_calculate_RCR ( ULong arg, 
                               ULong rot_amt, 
                               ULong rflags_in, 
diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c

index 908b84ecaa8c4e9c3170bc92627a65ff97f68b99..a594832543d6eb9528ea20d7005558fd3581bd2a 100644 (file)
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -349,6 +349,13 @@ IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
     vpanic("doScalarWidening(amd64)");
  }
  
+static
+void putGuarded ( Int gstOffB, IRExpr* guard, IRExpr* value )
+{
+   IRType ty = typeOfIRExpr(irsb->tyenv, value);
+   stmt( IRStmt_Put(gstOffB,
+                    IRExpr_ITE(guard, value, IRExpr_Get(gstOffB, ty))) );
+}
  
  
  /*------------------------------------------------------------*/
@@ -5195,6 +5202,52 @@ static IRExpr* get_FPU_sw ( void )
  }
  
  
+/* Generate a dirty helper call that initialises the x87 state a la
+   FINIT.  If |guard| is NULL, it is done unconditionally.  Otherwise
+   |guard| is used as a guarding condition.
+*/
+static void gen_FINIT_SEQUENCE ( IRExpr* guard )
+{
+   /* Uses dirty helper:
+         void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
+   IRDirty* d  = unsafeIRDirty_0_N ( 
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_FINIT", 
+                    &amd64g_dirtyhelper_FINIT,
+                    mkIRExprVec_1( IRExpr_BBPTR() )
+                 );
+
+   /* declare we're writing guest state */
+   d->nFxState = 5;
+   vex_bzero(&d->fxState, sizeof(d->fxState));
+
+   d->fxState[0].fx     = Ifx_Write;
+   d->fxState[0].offset = OFFB_FTOP;
+   d->fxState[0].size   = sizeof(UInt);
+
+   d->fxState[1].fx     = Ifx_Write;
+   d->fxState[1].offset = OFFB_FPREGS;
+   d->fxState[1].size   = 8 * sizeof(ULong);
+
+   d->fxState[2].fx     = Ifx_Write;
+   d->fxState[2].offset = OFFB_FPTAGS;
+   d->fxState[2].size   = 8 * sizeof(UChar);
+
+   d->fxState[3].fx     = Ifx_Write;
+   d->fxState[3].offset = OFFB_FPROUND;
+   d->fxState[3].size   = sizeof(ULong);
+
+   d->fxState[4].fx     = Ifx_Write;
+   d->fxState[4].offset = OFFB_FC3210;
+   d->fxState[4].size   = sizeof(ULong);
+
+   if (guard)
+      d->guard = guard;
+
+   stmt( IRStmt_Dirty(d) );
+}
+
+
  /* ------------------------------------------------------- */
  /* Given all that stack-mangling junk, we can now go ahead
     and describe FP instructions. 
@@ -6309,41 +6362,7 @@ ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                 break;
  
              case 0xE3: {
-               /* Uses dirty helper: 
-                     void amd64g_do_FINIT ( VexGuestAMD64State* ) */
-               IRDirty* d  = unsafeIRDirty_0_N ( 
-                                0/*regparms*/, 
-                                "amd64g_dirtyhelper_FINIT", 
-                                &amd64g_dirtyhelper_FINIT,
-                                mkIRExprVec_1( IRExpr_BBPTR() )
-                             );
-
-               /* declare we're writing guest state */
-               d->nFxState = 5;
-               vex_bzero(&d->fxState, sizeof(d->fxState));
-
-               d->fxState[0].fx     = Ifx_Write;
-               d->fxState[0].offset = OFFB_FTOP;
-               d->fxState[0].size   = sizeof(UInt);
-
-               d->fxState[1].fx     = Ifx_Write;
-               d->fxState[1].offset = OFFB_FPREGS;
-               d->fxState[1].size   = 8 * sizeof(ULong);
-
-               d->fxState[2].fx     = Ifx_Write;
-               d->fxState[2].offset = OFFB_FPTAGS;
-               d->fxState[2].size   = 8 * sizeof(UChar);
-
-               d->fxState[3].fx     = Ifx_Write;
-               d->fxState[3].offset = OFFB_FPROUND;
-               d->fxState[3].size   = sizeof(ULong);
-
-               d->fxState[4].fx     = Ifx_Write;
-               d->fxState[4].offset = OFFB_FC3210;
-               d->fxState[4].size   = sizeof(ULong);
-
-               stmt( IRStmt_Dirty(d) );
-
+               gen_FINIT_SEQUENCE(NULL/*no guarding condition*/);
                 DIP("fninit\n");
                 break;
              }
@@ -9875,6 +9894,10 @@ static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
     gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
  }
  
+static void gen_SEGV_if_not_64_aligned ( IRTemp effective_addr ) {
+   gen_SEGV_if_not_XX_aligned(effective_addr, 64-1);
+}
+
  /* Helper for deciding whether a given insn (starting at the opcode
     byte) may validly be used with a LOCK prefix.  The following insns
     may be used with LOCK when their destination operand is in memory.
@@ -11550,6 +11573,495 @@ static Long dis_LDMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
  }
  
  
+static void gen_XSAVE_SEQUENCE ( IRTemp addr, IRTemp rfbm )
+{
+   /* ------ rfbm[0] gates the x87 state ------ */
+
+   /* Uses dirty helper:
+         void amd64g_dirtyhelper_XSAVE_COMPONENT_0
+                 ( VexGuestAMD64State*, HWord ) */
+   IRDirty* d0 = unsafeIRDirty_0_N (
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_XSAVE_COMPONENT_0",
+                    &amd64g_dirtyhelper_XSAVE_COMPONENT_0,
+                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
+                 );
+   d0->guard = binop(Iop_CmpEQ64, binop(Iop_And64, mkexpr(rfbm), mkU64(1)),
+                     mkU64(1));
+
+   /* Declare we're writing memory.  Really, bytes 24 through 31
+      (MXCSR and MXCSR_MASK) aren't written, but we can't express more
+      than 1 memory area here, so just mark the whole thing as
+      written. */
+   d0->mFx   = Ifx_Write;
+   d0->mAddr = mkexpr(addr);
+   d0->mSize = 160;
+
+   /* declare we're reading guest state */
+   d0->nFxState = 5;
+   vex_bzero(&d0->fxState, sizeof(d0->fxState));
+
+   d0->fxState[0].fx     = Ifx_Read;
+   d0->fxState[0].offset = OFFB_FTOP;
+   d0->fxState[0].size   = sizeof(UInt);
+
+   d0->fxState[1].fx     = Ifx_Read;
+   d0->fxState[1].offset = OFFB_FPREGS;
+   d0->fxState[1].size   = 8 * sizeof(ULong);
+
+   d0->fxState[2].fx     = Ifx_Read;
+   d0->fxState[2].offset = OFFB_FPTAGS;
+   d0->fxState[2].size   = 8 * sizeof(UChar);
+
+   d0->fxState[3].fx     = Ifx_Read;
+   d0->fxState[3].offset = OFFB_FPROUND;
+   d0->fxState[3].size   = sizeof(ULong);
+
+   d0->fxState[4].fx     = Ifx_Read;
+   d0->fxState[4].offset = OFFB_FC3210;
+   d0->fxState[4].size   = sizeof(ULong);
+
+   stmt( IRStmt_Dirty(d0) );
+
+   /* ------ rfbm[1] gates the SSE state ------ */
+
+   IRTemp rfbm_1    = newTemp(Ity_I64);
+   IRTemp rfbm_1or2 = newTemp(Ity_I64);
+   assign(rfbm_1,    binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
+   assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
+
+   IRExpr* guard_1    = binop(Iop_CmpEQ64, mkexpr(rfbm_1),    mkU64(2));
+   IRExpr* guard_1or2 = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));
+
+   /* Uses dirty helper:
+         void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
+                 ( VexGuestAMD64State*, HWord )
+      This creates only MXCSR and MXCSR_MASK.  We need to do this if
+      either components 1 (SSE) or 2 (AVX) are requested.  Hence the
+      guard condition is a bit more complex.
+   */
+   IRDirty* d1 = unsafeIRDirty_0_N (
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS",
+                    &amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS,
+                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
+                 );
+   d1->guard = guard_1or2;
+
+   /* Declare we're writing memory: MXCSR and MXCSR_MASK.  Note that
+      the code for rfbm[0] just above claims a write of 0 .. 159, so
+      this duplicates it.  But at least correctly connects 24 .. 31 to
+      the MXCSR guest state representation (SSEROUND field). */
+   d1->mFx   = Ifx_Write;
+   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
+   d1->mSize = 8;
+
+   /* declare we're reading guest state */
+   d1->nFxState = 1;
+   vex_bzero(&d1->fxState, sizeof(d1->fxState));
+
+   d1->fxState[0].fx     = Ifx_Read;
+   d1->fxState[0].offset = OFFB_SSEROUND;
+   d1->fxState[0].size   = sizeof(ULong);
+
+   /* Call the helper.  This creates MXCSR and MXCSR_MASK but nothing
+      else.  We do the actual register array, XMM[0..15], separately,
+      in order that any undefinedness in the XMM registers is tracked
+      separately by Memcheck and does not "infect" the in-memory
+      shadow for the other parts of the image. */
+   stmt( IRStmt_Dirty(d1) );
+
+   /* And now the XMMs themselves, 16 bytes each from offset 160. */
+   UInt reg;
+   for (reg = 0; reg < 16; reg++) {
+      stmt( IRStmt_StoreG(
+               Iend_LE,
+               binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16)),
+               getXMMReg(reg),
+               guard_1
+      ));
+   }
+
+   /* ------ rfbm[2] gates the AVX state ------ */
+   /* Component 2 is just register saves (128-bit lane 1 of each YMM, from
+      offset 576), so do it inline: simple and Memcheck friendly. */
+
+   IRTemp rfbm_2 = newTemp(Ity_I64);
+   assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
+
+   IRExpr* guard_2 = binop(Iop_CmpEQ64, mkexpr(rfbm_2), mkU64(4));
+
+   for (reg = 0; reg < 16; reg++) {
+      stmt( IRStmt_StoreG(
+               Iend_LE,
+               binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16)),
+               getYMMRegLane128(reg,1),
+               guard_2
+      ));
+   }
+}
+
+
+static Long dis_XSAVE ( const VexAbiInfo* vbi,
+                        Prefix pfx, Long delta, Int sz )
+{
+   /* Note that the presence or absence of REX.W (indicated here by
+      |sz|) slightly affects the written format: whether the saved FPU
+      IP and DP pointers are 64 or 32 bits.  But the helper function
+      we call simply writes zero bits in the relevant fields, which
+      are 64 bits regardless of what REX.W is, and so it's good enough
+      (iow, equally broken) in both cases. */
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   UChar  modrm = getUChar(delta);
+   vassert(!epartIsReg(modrm)); /* ensured by caller */
+   vassert(sz == 4 || sz == 8); /* ditto */
+
+   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+   delta += alen;
+   gen_SEGV_if_not_64_aligned(addr);
+
+   DIP("%sxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+   /* VEX's caller is assumed to have checked this. */
+   const ULong aSSUMED_XCR0_VALUE = 7;
+
+   IRTemp rfbm = newTemp(Ity_I64);
+   assign(rfbm,
+          binop(Iop_And64,
+                binop(Iop_Or64,
+                      binop(Iop_Shl64,
+                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
+                      unop(Iop_32Uto64, getIRegRAX(4))),
+                mkU64(aSSUMED_XCR0_VALUE)));
+
+   gen_XSAVE_SEQUENCE(addr, rfbm);
+
+   /* Finally, update XSTATE_BV in the XSAVE header (offset 512) by OR-ing
+      RFBM into it.  A byte-wide access suffices since rfbm is <= 7. */
+   IRTemp addr_plus_512 = newTemp(Ity_I64);
+   assign(addr_plus_512, binop(Iop_Add64, mkexpr(addr), mkU64(512)));
+   storeLE( mkexpr(addr_plus_512),
+            binop(Iop_Or8,
+                  unop(Iop_64to8, mkexpr(rfbm)),
+                  loadLE(Ity_I8, mkexpr(addr_plus_512))) );
+
+   return delta;
+}
+
+
+static Long dis_FXSAVE ( const VexAbiInfo* vbi,
+                         Prefix pfx, Long delta, Int sz )
+{
+   /* See comment in dis_XSAVE about the significance of REX.W. */
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   UChar  modrm = getUChar(delta);
+   vassert(!epartIsReg(modrm)); /* ensured by caller */
+   vassert(sz == 4 || sz == 8); /* ditto */
+
+   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+   delta += alen;
+   gen_SEGV_if_not_16_aligned(addr);
+
+   DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+   /* FXSAVE is just XSAVE with components 0 and 1 selected.  Set rfbm
+      to 0b011, generate the XSAVE sequence, and let iropt fold out the
+      unused (AVX) parts.  No XSAVE header is written on this path. */
+   IRTemp rfbm = newTemp(Ity_I64);
+   assign(rfbm, mkU64(3));
+   gen_XSAVE_SEQUENCE(addr, rfbm);
+
+   return delta;
+}
+
+
+static void gen_XRSTOR_SEQUENCE ( IRTemp addr, IRTemp xstate_bv, IRTemp rfbm )
+{
+   /* ------ rfbm[0] gates the x87 state ------ */
+
+   /* If rfbm[0] == 1, we have to write the x87 state.  If
+      xstate_bv[0] == 1, we will read it from the memory image, else
+      we'll set it to initial values.  Doing this with a helper
+      function and getting the definedness flow annotations correct is
+      too difficult, so generate stupid but simple code: first set the
+      registers to initial values, regardless of xstate_bv[0].  Then,
+      conditionally restore from the memory image. */
+
+   IRTemp rfbm_0       = newTemp(Ity_I64);
+   IRTemp xstate_bv_0  = newTemp(Ity_I64);
+   IRTemp restore_0    = newTemp(Ity_I64);
+   assign(rfbm_0,      binop(Iop_And64, mkexpr(rfbm), mkU64(1)));
+   assign(xstate_bv_0, binop(Iop_And64, mkexpr(xstate_bv), mkU64(1)));
+   assign(restore_0,   binop(Iop_And64, mkexpr(rfbm_0), mkexpr(xstate_bv_0)));
+
+   gen_FINIT_SEQUENCE( binop(Iop_CmpNE64, mkexpr(rfbm_0), mkU64(0)) );
+
+   /* Uses dirty helper:
+         VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
+                      ( VexGuestAMD64State*, HWord ); result is not used */
+   IRDirty* d0 = unsafeIRDirty_0_N (
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_0",
+                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_0,
+                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
+                 );
+   d0->guard = binop(Iop_CmpNE64, mkexpr(restore_0), mkU64(0));
+
+   /* Declare we're reading memory.  Really, bytes 24 through 31
+      (MXCSR and MXCSR_MASK) aren't read, but we can't express more
+      than 1 memory area here, so just mark the whole thing as
+      read. */
+   d0->mFx   = Ifx_Read;
+   d0->mAddr = mkexpr(addr);
+   d0->mSize = 160;
+
+   /* declare we're writing guest state */
+   d0->nFxState = 5;
+   vex_bzero(&d0->fxState, sizeof(d0->fxState));
+
+   d0->fxState[0].fx     = Ifx_Write;
+   d0->fxState[0].offset = OFFB_FTOP;
+   d0->fxState[0].size   = sizeof(UInt);
+
+   d0->fxState[1].fx     = Ifx_Write;
+   d0->fxState[1].offset = OFFB_FPREGS;
+   d0->fxState[1].size   = 8 * sizeof(ULong);
+
+   d0->fxState[2].fx     = Ifx_Write;
+   d0->fxState[2].offset = OFFB_FPTAGS;
+   d0->fxState[2].size   = 8 * sizeof(UChar);
+
+   d0->fxState[3].fx     = Ifx_Write;
+   d0->fxState[3].offset = OFFB_FPROUND;
+   d0->fxState[3].size   = sizeof(ULong);
+
+   d0->fxState[4].fx     = Ifx_Write;
+   d0->fxState[4].offset = OFFB_FC3210;
+   d0->fxState[4].size   = sizeof(ULong);
+
+   stmt( IRStmt_Dirty(d0) );
+
+   /* ------ rfbm[1] gates the SSE state ------ */
+
+   /* Same scheme as component 0: first zero it out, and then possibly
+      restore from the memory area. */
+   IRTemp rfbm_1       = newTemp(Ity_I64);
+   IRTemp xstate_bv_1  = newTemp(Ity_I64);
+   IRTemp restore_1    = newTemp(Ity_I64);
+   assign(rfbm_1,      binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
+   assign(xstate_bv_1, binop(Iop_And64, mkexpr(xstate_bv), mkU64(2)));
+   assign(restore_1,   binop(Iop_And64, mkexpr(rfbm_1), mkexpr(xstate_bv_1)));
+   IRExpr* rfbm_1e     = binop(Iop_CmpNE64, mkexpr(rfbm_1),    mkU64(0));
+   IRExpr* restore_1e  = binop(Iop_CmpNE64, mkexpr(restore_1), mkU64(0));
+
+   IRTemp rfbm_1or2       = newTemp(Ity_I64);
+   IRTemp xstate_bv_1or2  = newTemp(Ity_I64);
+   IRTemp restore_1or2    = newTemp(Ity_I64);
+   assign(rfbm_1or2,      binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
+   assign(xstate_bv_1or2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(6)));
+   assign(restore_1or2,   binop(Iop_And64, mkexpr(rfbm_1or2),
+                                           mkexpr(xstate_bv_1or2)));
+   IRExpr* rfbm_1or2e     = binop(Iop_CmpNE64, mkexpr(rfbm_1or2),    mkU64(0));
+   IRExpr* restore_1or2e  = binop(Iop_CmpNE64, mkexpr(restore_1or2), mkU64(0));
+
+   /* The areas in question are: SSEROUND, and the XMM register array. */
+   putGuarded(OFFB_SSEROUND, rfbm_1or2e, mkU64(Irrm_NEAREST));
+
+   UInt reg;
+   for (reg = 0; reg < 16; reg++) {
+      putGuarded(xmmGuestRegOffset(reg), rfbm_1e, mkV128(0));
+   }
+
+   /* And now possibly restore from MXCSR/MXCSR_MASK */
+   /* Uses dirty helper:
+         VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
+                 ( VexGuestAMD64State*, HWord )
+      This restores from only MXCSR and MXCSR_MASK.  We need to do
+      this if either components 1 (SSE) or 2 (AVX) are requested.
+      Hence the guard condition is a bit more complex.
+   */
+   IRDirty* d1 = unsafeIRDirty_0_N (
+                    0/*regparms*/, 
+                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS",
+                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS,
+                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
+                ) ;
+   d1->guard = restore_1or2e;
+
+   /* Declare we're reading memory: MXCSR and MXCSR_MASK.  Note that
+      the code for rfbm[0] just above claims a read of 0 .. 159, so
+      this duplicates it.  But at least correctly connects 24 .. 31 to
+      the MXCSR guest state representation (SSEROUND field). */
+   d1->mFx   = Ifx_Read;
+   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
+   d1->mSize = 8;
+
+   /* declare we're writing guest state */
+   d1->nFxState = 1;
+   vex_bzero(&d1->fxState, sizeof(d1->fxState));
+
+   d1->fxState[0].fx     = Ifx_Write;
+   d1->fxState[0].offset = OFFB_SSEROUND;
+   d1->fxState[0].size   = sizeof(ULong);
+
+   /* Call the helper.  This creates SSEROUND but nothing
+      else.  We do the actual register array, XMM[0..15], separately,
+      in order that any undefinedness in the XMM registers is tracked
+      separately by Memcheck and is not "infected" by the in-memory
+      shadow for the other parts of the image. */
+   stmt( IRStmt_Dirty(d1) );
+
+   /* And now the XMMs themselves.  For each register, we PUT either
+      its old value, or the value loaded from memory.  One convenient
+      way to do that is with a conditional load that has, as its
+      default value, the old value of the register. */
+   for (reg = 0; reg < 16; reg++) {
+      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16));
+      IRExpr* alt = getXMMReg(reg);
+      IRTemp  loadedValue = newTemp(Ity_V128);
+      stmt( IRStmt_LoadG(Iend_LE,
+                         ILGop_IdentV128, 
+                         loadedValue, ea, alt, restore_1e) );
+      putXMMReg(reg, mkexpr(loadedValue));
+   }
+
+   /* ------ rfbm[2] gates the AVX state ------ */
+   /* Component 2 is just a bunch of register loads, so we'll do it
+      inline, just to be simple and to be Memcheck friendly. */
+
+   /* Same scheme as component 0: first zero it out, and then possibly
+      restore from the memory area. */
+   IRTemp rfbm_2      = newTemp(Ity_I64);
+   IRTemp xstate_bv_2 = newTemp(Ity_I64);
+   IRTemp restore_2   = newTemp(Ity_I64);
+   assign(rfbm_2,      binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
+   assign(xstate_bv_2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(4)));
+   assign(restore_2,   binop(Iop_And64, mkexpr(rfbm_2), mkexpr(xstate_bv_2)));
+
+   IRExpr* rfbm_2e    = binop(Iop_CmpNE64, mkexpr(rfbm_2),    mkU64(0));
+   IRExpr* restore_2e = binop(Iop_CmpNE64, mkexpr(restore_2), mkU64(0));
+
+   for (reg = 0; reg < 16; reg++) {
+      putGuarded(ymmGuestRegLane128offset(reg, 1), rfbm_2e, mkV128(0));
+   }
+
+   for (reg = 0; reg < 16; reg++) {
+      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16));
+      IRExpr* alt = getYMMRegLane128(reg, 1);
+      IRTemp  loadedValue = newTemp(Ity_V128);
+      stmt( IRStmt_LoadG(Iend_LE,
+                         ILGop_IdentV128, 
+                         loadedValue, ea, alt, restore_2e) );
+      putYMMRegLane128(reg, 1, mkexpr(loadedValue));
+   }
+}
+
+
+static Long dis_XRSTOR ( const VexAbiInfo* vbi,
+                         Prefix pfx, Long delta, Int sz )
+{
+   /* As with XSAVE above we ignore the value of REX.W since we're
+      not bothering with the FPU DP and IP fields. */
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   UChar  modrm = getUChar(delta);
+   vassert(!epartIsReg(modrm)); /* ensured by caller */
+   vassert(sz == 4 || sz == 8); /* ditto */
+
+   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+   delta += alen;
+   gen_SEGV_if_not_64_aligned(addr);
+
+   DIP("%sxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+   /* VEX's caller is assumed to have checked this. */
+   const ULong aSSUMED_XCR0_VALUE = 7;
+
+   IRTemp rfbm = newTemp(Ity_I64);
+   assign(rfbm,
+          binop(Iop_And64,
+                binop(Iop_Or64,
+                      binop(Iop_Shl64,
+                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
+                      unop(Iop_32Uto64, getIRegRAX(4))),
+                mkU64(aSSUMED_XCR0_VALUE)));
+
+   IRTemp xstate_bv = newTemp(Ity_I64);
+   assign(xstate_bv, loadLE(Ity_I64,
+                            binop(Iop_Add64, mkexpr(addr), mkU64(512+0))));
+
+   IRTemp xcomp_bv = newTemp(Ity_I64);
+   assign(xcomp_bv, loadLE(Ity_I64,
+                           binop(Iop_Add64, mkexpr(addr), mkU64(512+8))));
+
+   IRTemp xsavehdr_23_16 = newTemp(Ity_I64);
+   assign( xsavehdr_23_16, 
+           loadLE(Ity_I64,
+                  binop(Iop_Add64, mkexpr(addr), mkU64(512+16))));
+
+   /* We must fault if 
+      * xcomp_bv[63] == 1, since this simulated CPU does not support
+        the compaction extension.
+      * xstate_bv sets a bit outside of XCR0 (which we assume to be 7).
+      * any of the xsave header bytes 23 .. 8 are nonzero.  This seems to
+        imply that xcomp_bv must be zero.
+      xcomp_bv is header bytes 15 .. 8 and xstate_bv is header bytes 7 .. 0
+   */
+   IRTemp fault_if_nonzero = newTemp(Ity_I64);
+   assign(fault_if_nonzero,
+          binop(Iop_Or64,
+                binop(Iop_And64, mkexpr(xstate_bv), mkU64(~aSSUMED_XCR0_VALUE)),
+                binop(Iop_Or64, mkexpr(xcomp_bv), mkexpr(xsavehdr_23_16))));
+   stmt( IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(fault_if_nonzero), mkU64(0)),
+                     Ijk_SigSEGV,
+                     IRConst_U64(guest_RIP_curr_instr),
+                     OFFB_RIP
+   ));
+
+   /* We are guaranteed now that both xstate_bv and rfbm are in the
+      range 0 .. 7.  Generate the restore sequence proper. */
+   gen_XRSTOR_SEQUENCE(addr, xstate_bv, rfbm);
+
+   return delta;
+}
+
+
+static Long dis_FXRSTOR ( const VexAbiInfo* vbi,
+                          Prefix pfx, Long delta, Int sz )
+{
+   /* As with FXSAVE above we ignore the value of REX.W since we're
+      not bothering with the FPU DP and IP fields. */
+   IRTemp addr  = IRTemp_INVALID;
+   Int    alen  = 0;
+   HChar  dis_buf[50];
+   UChar  modrm = getUChar(delta);
+   vassert(!epartIsReg(modrm)); /* ensured by caller */
+   vassert(sz == 4 || sz == 8); /* ditto */
+
+   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+   delta += alen;
+   gen_SEGV_if_not_16_aligned(addr);
+
+   DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
+
+   /* FXRSTOR is just XRSTOR with components 0 and 1 selected and also
+      as if components 0 and 1 are set as present in XSTATE_BV in the
+      XSAVE header.  Set both rfbm and xstate_bv to 0b011 therefore,
+      generate the XRSTOR sequence, and let iropt fold out the unused
+      (AVX) parts.  The XSAVE header in memory is never read here. */
+   IRTemp three = newTemp(Ity_I64);
+   assign(three, mkU64(3));
+   gen_XRSTOR_SEQUENCE(addr, three/*xstate_bv*/, three/*rfbm*/);
+
+   return delta;
+}
+
+
  static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
  {
     vassert(imm8 >= 0 && imm8 <= 7);
@@ -11794,6 +12306,7 @@ static Long dis_MOVMSKPD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
  __attribute__((noinline))
  static
  Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
+                        const VexArchInfo* archinfo,
                          const VexAbiInfo* vbi,
                          Prefix pfx, Int sz, Long deltaIN,
                          DisResult* dres )
@@ -13620,166 +14133,34 @@ Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
           delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
           goto decode_success;
        }
-      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
-         Note that the presence or absence of REX.W slightly affects the
-         written format: whether the saved FPU IP and DP pointers are 64
-         or 32 bits.  But the helper function we call simply writes zero
-         bits in the relevant fields (which are 64 bits regardless of
-         what REX.W is) and so it's good enough (iow, equally broken) in
-         both cases. */
+      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
        if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
            && !epartIsReg(getUChar(delta))
            && gregOfRexRM(pfx,getUChar(delta)) == 0) {
-          IRDirty* d;
-         modrm = getUChar(delta);
-         vassert(!epartIsReg(modrm));
-
-         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
-         delta += alen;
-         gen_SEGV_if_not_16_aligned(addr);
-
-         DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
-
-         /* Uses dirty helper: 
-              void amd64g_do_FXSAVE_ALL_EXCEPT_XMM ( VexGuestAMD64State*,
-                                                     ULong ) */
-         d = unsafeIRDirty_0_N ( 
-                0/*regparms*/, 
-                "amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM",
-                &amd64g_dirtyhelper_FXSAVE_ALL_EXCEPT_XMM,
-                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
-             );
-
-         /* declare we're writing memory */
-         d->mFx   = Ifx_Write;
-         d->mAddr = mkexpr(addr);
-         d->mSize = 464; /* according to recent Intel docs */
-
-         /* declare we're reading guest state */
-         d->nFxState = 6;
-         vex_bzero(&d->fxState, sizeof(d->fxState));
-
-         d->fxState[0].fx     = Ifx_Read;
-         d->fxState[0].offset = OFFB_FTOP;
-         d->fxState[0].size   = sizeof(UInt);
-
-         d->fxState[1].fx     = Ifx_Read;
-         d->fxState[1].offset = OFFB_FPREGS;
-         d->fxState[1].size   = 8 * sizeof(ULong);
-
-         d->fxState[2].fx     = Ifx_Read;
-         d->fxState[2].offset = OFFB_FPTAGS;
-         d->fxState[2].size   = 8 * sizeof(UChar);
-
-         d->fxState[3].fx     = Ifx_Read;
-         d->fxState[3].offset = OFFB_FPROUND;
-         d->fxState[3].size   = sizeof(ULong);
-
-         d->fxState[4].fx     = Ifx_Read;
-         d->fxState[4].offset = OFFB_FC3210;
-         d->fxState[4].size   = sizeof(ULong);
-
-         d->fxState[5].fx     = Ifx_Read;
-         d->fxState[5].offset = OFFB_SSEROUND;
-         d->fxState[5].size   = sizeof(ULong);
-
-         /* Call the helper.  This creates all parts of the in-memory
-            image except for the XMM[0..15] array, which we do
-            separately, in order that any undefinedness in the XMM
-            registers is tracked separately by Memcheck and does not
-            "infect" the in-memory shadow for the other parts of the
-            image (FPTOP, FPREGS, FPTAGS, FPROUND, FC3210,
-            SSEROUND). */
-         stmt( IRStmt_Dirty(d) );
-
-         /* And now the XMMs themselves. */
-         UInt xmm;
-         for (xmm = 0; xmm < 16; xmm++) {
-            storeLE( binop(Iop_Add64, mkexpr(addr), mkU64(160 + xmm * 16)),
-                     getXMMReg(xmm) );
-         }
-
+         delta = dis_FXSAVE(vbi, pfx, delta, sz);
           goto decode_success;
        }
-      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
-         As with FXSAVE above we ignore the value of REX.W since we're
-         not bothering with the FPU DP and IP fields. */
+      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
        if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
            && !epartIsReg(getUChar(delta))
            && gregOfRexRM(pfx,getUChar(delta)) == 1) {
-         IRDirty* d;
-         modrm = getUChar(delta);
-         vassert(!epartIsReg(modrm));
-
-         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
-         delta += alen;
-         gen_SEGV_if_not_16_aligned(addr);
-
-         DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
-
-         /* Uses dirty helper: 
-              VexEmNote amd64g_do_FXRSTOR_ALL_EXCEPT_XMM ( VexGuestAMD64State*,
-                                                           ULong )
-            NOTE:
-              the VexEmNote value is simply ignored
-         */
-         d = unsafeIRDirty_0_N ( 
-                0/*regparms*/, 
-                "amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM", 
-                &amd64g_dirtyhelper_FXRSTOR_ALL_EXCEPT_XMM,
-                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
-             );
-
-         /* declare we're reading memory */
-         d->mFx   = Ifx_Read;
-         d->mAddr = mkexpr(addr);
-         d->mSize = 464; /* according to recent Intel docs */
-
-         /* declare we're writing guest state */
-         d->nFxState = 6;
-         vex_bzero(&d->fxState, sizeof(d->fxState));
-
-         d->fxState[0].fx     = Ifx_Write;
-         d->fxState[0].offset = OFFB_FTOP;
-         d->fxState[0].size   = sizeof(UInt);
-
-         d->fxState[1].fx     = Ifx_Write;
-         d->fxState[1].offset = OFFB_FPREGS;
-         d->fxState[1].size   = 8 * sizeof(ULong);
-
-         d->fxState[2].fx     = Ifx_Write;
-         d->fxState[2].offset = OFFB_FPTAGS;
-         d->fxState[2].size   = 8 * sizeof(UChar);
-
-         d->fxState[3].fx     = Ifx_Write;
-         d->fxState[3].offset = OFFB_FPROUND;
-         d->fxState[3].size   = sizeof(ULong);
-
-         d->fxState[4].fx     = Ifx_Write;
-         d->fxState[4].offset = OFFB_FC3210;
-         d->fxState[4].size   = sizeof(ULong);
-
-         d->fxState[5].fx     = Ifx_Write;
-         d->fxState[5].offset = OFFB_SSEROUND;
-         d->fxState[5].size   = sizeof(ULong);
-
-         /* Call the helper.  This reads all parts of the in-memory
-            image except for the XMM[0..15] array, which we do
-            separately, in order that any undefinedness in the XMM
-            registers is tracked separately by Memcheck and does not
-            "infect" the in-guest-state shadow for the other parts of the
-            image (FPTOP, FPREGS, FPTAGS, FPROUND, FC3210,
-            SSEROUND). */
-         stmt( IRStmt_Dirty(d) );
-
-         /* And now the XMMs themselves. */
-         UInt xmm;
-         for (xmm = 0; xmm < 16; xmm++) {
-            putXMMReg(xmm, loadLE(Ity_V128,
-                                  binop(Iop_Add64, mkexpr(addr),
-                                                   mkU64(160 + xmm * 16))));
-         }
-
+         delta = dis_FXRSTOR(vbi, pfx, delta, sz);
+         goto decode_success;
+      }
+      /* 0F AE /4 = XSAVE mem -- write x87, SSE, AVX state to memory */
+      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
+          && !epartIsReg(getUChar(delta))
+          && gregOfRexRM(pfx,getUChar(delta)) == 4
+          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
+         delta = dis_XSAVE(vbi, pfx, delta, sz);
+         goto decode_success;
+      }
+      /* 0F AE /5 = XRSTOR mem -- read x87, SSE, AVX state from memory */
+      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
+          && !epartIsReg(getUChar(delta))
+          && gregOfRexRM(pfx,getUChar(delta)) == 5
+          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
+         delta = dis_XRSTOR(vbi, pfx, delta, sz);
           goto decode_success;
        }
        break;
@@ -21524,31 +21905,20 @@ Long dis_ESC_0F (
        const HChar* fName = NULL;
        void*        fAddr = NULL;
  
-      /* JRS 2014-11-11: this a really horrible temp kludge to work
-         around the fact that the Yosemite (OSX 10.10)
-         /usr/lib/system/libdyld.dylib expects XSAVE/XRSTOR to be
-         implemented, because amd64g_dirtyhelper_CPUID_avx_and_cx16
-         claims they are supported, but so far they aren't.  So cause
-         it to fall back to a simpler CPU.  The cleaner approach of
-         setting CPUID(eax=1).OSXSAVE=0 and .XSAVE=0 isn't desirable
-         since it will (per the official Intel guidelines) lead to
-         software concluding that AVX isn't supported.
-
-         This is also a kludge in that putting these ifdefs here checks
-         the build (host) architecture, when really we're checking the
-         guest architecture. */
-      Bool this_is_yosemite = False;
-#     if defined(VGP_amd64_darwin) && DARWIN_VERS == DARWIN_10_10
-      this_is_yosemite = True;
-#     endif
-
        if (haveF2orF3(pfx)) goto decode_failure;
+
        /* This isn't entirely correct, CPUID should depend on the VEX
           capabilities, not on the underlying CPU. See bug #324882. */
-      if (!this_is_yosemite &&
-          (archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
+      if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
            (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
-          (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
+          (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX2)) {
+         fName = "amd64g_dirtyhelper_CPUID_avx2";
+         fAddr = &amd64g_dirtyhelper_CPUID_avx2;
+         /* This is a Core-i7-4910-like machine */
+      }
+      else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
+               (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
+               (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
           fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
           fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
           /* This is a Core-i5-2300-like machine */
@@ -22050,7 +22420,8 @@ Long dis_ESC_0F (
        facility in 64 bit mode. */
     {
        Bool decode_OK = False;
-      delta = dis_ESC_0F__SSE2 ( &decode_OK, vbi, pfx, sz, deltaIN, dres );
+      delta = dis_ESC_0F__SSE2 ( &decode_OK,
+                                 archinfo, vbi, pfx, sz, deltaIN, dres );
        if (decode_OK)
           return delta;
     }
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c

index 5e139e3750c9c326fd5cceeb8e6f040cffabb46a..76c74c30cc51f1f9283bfbd4dfd3cdb5c518b8eb 100644 (file)
--- a/VEX/priv/host_amd64_defs.c
+++ b/VEX/priv/host_amd64_defs.c
@@ -910,6 +910,28 @@ AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
     vassert(sz == 4 || sz == 8 || sz == 16);
     return i;
  }
+AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
+                                   HReg src, AMD64AMode* addr )
+{
+   AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag                = Ain_SseCStore;
+   i->Ain.SseCStore.cond = cond;
+   i->Ain.SseCStore.src  = src;
+   i->Ain.SseCStore.addr = addr;
+   vassert(cond != Acc_ALWAYS); /* unconditional form not allowed here */
+   return i;
+}
+AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
+                                  AMD64AMode* addr, HReg dst )
+{
+   AMD64Instr* i        = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+   i->tag               = Ain_SseCLoad;
+   i->Ain.SseCLoad.cond = cond;
+   i->Ain.SseCLoad.addr = addr;
+   i->Ain.SseCLoad.dst  = dst;
+   vassert(cond != Acc_ALWAYS); /* unconditional form not allowed here */
+   return i;
+}
  AMD64Instr* AMD64Instr_SseLdzLO  ( Int sz, HReg reg, AMD64AMode* addr )
  {
     AMD64Instr* i         = LibVEX_Alloc_inline(sizeof(AMD64Instr));
@@ -1268,6 +1290,24 @@ void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
              ppAMD64AMode(i->Ain.SseLdSt.addr);
           }
           return;
+      case Ain_SseCStore:
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.SseCStore.cond));
+         vex_printf("movups ");
+         ppHRegAMD64(i->Ain.SseCStore.src);
+         vex_printf(", ");
+         ppAMD64AMode(i->Ain.SseCStore.addr);
+         vex_printf(" }");
+         return;
+      case Ain_SseCLoad:
+         vex_printf("if (%%rflags.%s) { ",
+                    showAMD64CondCode(i->Ain.SseCLoad.cond));
+         vex_printf("movups ");
+         ppAMD64AMode(i->Ain.SseCLoad.addr);
+         vex_printf(", ");
+         ppHRegAMD64(i->Ain.SseCLoad.dst);
+         vex_printf(" }");
+         return;
        case Ain_SseLdzLO:
           vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
           ppAMD64AMode(i->Ain.SseLdzLO.addr);
@@ -1566,6 +1606,14 @@ void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
           addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
                         i->Ain.SseLdSt.reg);
           return;
+      case Ain_SseCStore:
+         addRegUsage_AMD64AMode(u, i->Ain.SseCStore.addr);
+         addHRegUse(u, HRmRead, i->Ain.SseCStore.src);
+         return;
+      case Ain_SseCLoad:
+         addRegUsage_AMD64AMode(u, i->Ain.SseCLoad.addr);
+         addHRegUse(u, HRmModify, i->Ain.SseCLoad.dst);
+         return;
        case Ain_SseLdzLO:
           addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
           addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
@@ -1799,6 +1847,14 @@ void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
           mapReg(m, &i->Ain.SseLdSt.reg);
           mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
           break;
+      case Ain_SseCStore:
+         mapRegs_AMD64AMode(m, i->Ain.SseCStore.addr);
+         mapReg(m, &i->Ain.SseCStore.src);
+         return;
+      case Ain_SseCLoad:
+         mapRegs_AMD64AMode(m, i->Ain.SseCLoad.addr);
+         mapReg(m, &i->Ain.SseCLoad.dst);
+         return;
        case Ain_SseLdzLO:
           mapReg(m, &i->Ain.SseLdzLO.reg);
           mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
@@ -2366,7 +2422,7 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
     UChar* p = &buf[0];
     UChar* ptmp;
     Int    j;
-   vassert(nbuf >= 32);
+   vassert(nbuf >= 64);
     vassert(mode64 == True);
  
     /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
@@ -2823,13 +2879,33 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
                 *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
                 break;
              case RLPri_2Int:
-               vassert(0); //ATC
+               goto bad; //ATC
                 // movabsq $0x5555555555555555, %rax
                 *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
                 // movq %rax, %rdx
                 *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
+               break;
+            case RLPri_V128SpRel:
+               if (i->Ain.Call.rloc.spOff == 0) {
+                  // We could accept any |spOff| here, but that's more
+                  // hassle and the only value we're ever going to get
+                  // is zero (I believe.)  Hence take the easy path :)
+                  // We need a scratch register -- r11 can be it.
+                  // movabsq $0x5555555555555555, %r11
+                  *p++ = 0x49; *p++ = 0xBB;
+                  p = emit64(p, 0x5555555555555555ULL);
+                  // movq %r11, 0(%rsp)
+                  *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
+                  // movq %r11, 8(%rsp)
+                  *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
+                  *p++ = 0x08;
+                  break;
+               }
+               goto bad; //ATC for all other spOff values
+            case RLPri_V256SpRel:
+               goto bad; //ATC
              case RLPri_None: case RLPri_INVALID: default:
-               vassert(0);
+               vassert(0); // should never get here
           }
  
           // after:
@@ -3081,7 +3157,7 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
     }
  
     case Ain_CStore: {
-      /* AFAICS this is identical to Ain_CStore except that the opcode
+      /* AFAICS this is identical to Ain_CLoad except that the opcode
           is 0x89 instead of 0x8B. */
        vassert(i->Ain.CStore.cond != Acc_ALWAYS);
  
@@ -3418,6 +3494,60 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
                             i->Ain.SseLdSt.addr);
        goto done;
  
+   case Ain_SseCStore: {
+      vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
+
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* jmp fwds if !condition */
+      *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
+      ptmp = p; /* fill in this bit later */
+      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+
+      /* Now the store. */
+      *p++ = clearWBit(
+             rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
+                            i->Ain.SseCStore.addr));
+      *p++ = 0x0F; 
+      *p++ = toUChar(0x11);
+      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
+                           i->Ain.SseCStore.addr);
+
+      /* Fix up the conditional branch */
+      Int delta = p - ptmp;
+      vassert(delta > 0 && delta < 40);
+      *ptmp = toUChar(delta-1);
+      goto done;
+   }
+
+   case Ain_SseCLoad: {
+      vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
+
+      /* Use ptmp for backpatching conditional jumps. */
+      ptmp = NULL;
+
+      /* jmp fwds if !condition */
+      *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
+      ptmp = p; /* fill in this bit later */
+      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+
+      /* Now the load. */
+      *p++ = clearWBit(
+             rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
+                            i->Ain.SseCLoad.addr));
+      *p++ = 0x0F; 
+      *p++ = toUChar(0x10);
+      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
+                           i->Ain.SseCLoad.addr);
+
+      /* Fix up the conditional branch */
+      Int delta = p - ptmp;
+      vassert(delta > 0 && delta < 40);
+      *ptmp = toUChar(delta-1);
+      goto done;
+   }
+
     case Ain_SseLdzLO:
        vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
        /* movs[sd] amode, %xmm-dst */
@@ -3726,7 +3856,7 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
     /*NOTREACHED*/
     
    done:
-   vassert(p - &buf[0] <= 32);
+   vassert(p - &buf[0] <= 64);
     return p - &buf[0];
  }
  
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h

index f76cd8392aedcb44f8c0fe777ec54c9fd6f9d282..523d18d772a266daa61db429c937abb864da234a 100644 (file)
--- a/VEX/priv/host_amd64_defs.h
+++ b/VEX/priv/host_amd64_defs.h
@@ -390,6 +390,8 @@ typedef
        Ain_SseSDSS,     /* scalar float32 to/from float64 */
        Ain_SseLdSt,     /* SSE load/store 32/64/128 bits, no alignment
                            constraints, upper 96/64/0 bits arbitrary */
+      Ain_SseCStore,   /* SSE conditional store, 128 bit only, any alignment */
+      Ain_SseCLoad,    /* SSE conditional load, 128 bit only, any alignment */
        Ain_SseLdzLO,    /* SSE load low 32/64 bits, zero remainder of reg */
        Ain_Sse32Fx4,    /* SSE binary, 32Fx4 */
        Ain_Sse32FLo,    /* SSE binary, 32F in lowest lane only */
@@ -641,6 +643,16 @@ typedef
              HReg        reg;
              AMD64AMode* addr;
           } SseLdSt;
+         struct {
+            AMD64CondCode cond; /* may not be Acc_ALWAYS */
+            HReg          src;
+            AMD64AMode*   addr;
+         } SseCStore;
+         struct {
+            AMD64CondCode cond; /* may not be Acc_ALWAYS */
+            AMD64AMode*   addr;
+            HReg          dst;
+         } SseCLoad;
           struct {
              Int         sz; /* 4 or 8 only */
              HReg        reg;
@@ -751,6 +763,8 @@ extern AMD64Instr* AMD64Instr_SseSI2SF   ( Int szS, Int szD, HReg src, HReg dst
  extern AMD64Instr* AMD64Instr_SseSF2SI   ( Int szS, Int szD, HReg src, HReg dst );
  extern AMD64Instr* AMD64Instr_SseSDSS    ( Bool from64, HReg src, HReg dst );
  extern AMD64Instr* AMD64Instr_SseLdSt    ( Bool isLoad, Int sz, HReg, AMD64AMode* );
+extern AMD64Instr* AMD64Instr_SseCStore  ( AMD64CondCode, HReg, AMD64AMode* );
+extern AMD64Instr* AMD64Instr_SseCLoad   ( AMD64CondCode, AMD64AMode*, HReg );
  extern AMD64Instr* AMD64Instr_SseLdzLO   ( Int sz, HReg, AMD64AMode* );
  extern AMD64Instr* AMD64Instr_Sse32Fx4   ( AMD64SseOp, HReg, HReg );
  extern AMD64Instr* AMD64Instr_Sse32FLo   ( AMD64SseOp, HReg, HReg );
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c

index 9d9d78e35149969fe1ca65ad6c8420c58d5b02fd..8be498b218a27ec6394a6db71d3afdbba59ff166 100644 (file)
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -4298,21 +4298,35 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
  
        UChar szB = 0; /* invalid */
        switch (lg->cvt) {
-         case ILGop_Ident32: szB = 4; break;
-         case ILGop_Ident64: szB = 8; break;
+         case ILGop_Ident32:   szB = 4;  break;
+         case ILGop_Ident64:   szB = 8;  break;
+         case ILGop_IdentV128: szB = 16; break;
           default: break;
        }
        if (szB == 0)
           goto stmt_fail;
  
-      AMD64AMode* amAddr = iselIntExpr_AMode(env, lg->addr);
-      HReg rAlt  = iselIntExpr_R(env, lg->alt);
-      HReg rDst  = lookupIRTemp(env, lg->dst);
+      AMD64AMode* amAddr
+         = iselIntExpr_AMode(env, lg->addr);
+      HReg rAlt
+         = szB == 16 ? iselVecExpr(env, lg->alt)
+                     : iselIntExpr_R(env, lg->alt);
+      HReg rDst
+         = lookupIRTemp(env, lg->dst);
+
        /* Get the alt value into the dst.  We'll do a conditional load
           which overwrites it -- or not -- with loaded data. */
-      addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
+      if (szB == 16) {
+         addInstr(env, mk_vMOVsd_RR(rAlt, rDst));
+      } else {
+         addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
+      }
        AMD64CondCode cc = iselCondCode(env, lg->guard);
-      addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
+      if (szB == 16) {
+         addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst));
+      } else {
+         addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
+      }
        return;
     }
  
@@ -4324,17 +4338,26 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
  
        UChar szB = 0; /* invalid */
        switch (typeOfIRExpr(env->type_env, sg->data)) {
-         case Ity_I32: szB = 4; break;
-         case Ity_I64: szB = 8; break;
+         case Ity_I32:  szB = 4; break;
+         case Ity_I64:  szB = 8; break;
+         case Ity_V128: szB = 16; break;
           default: break;
        }
        if (szB == 0)
           goto stmt_fail;
  
-      AMD64AMode*   amAddr = iselIntExpr_AMode(env, sg->addr);
-      HReg          rSrc   = iselIntExpr_R(env, sg->data);
-      AMD64CondCode cc     = iselCondCode(env, sg->guard);
-      addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
+      AMD64AMode* amAddr
+         = iselIntExpr_AMode(env, sg->addr);
+      HReg rSrc
+         = szB == 16 ? iselVecExpr(env, sg->data)
+                     : iselIntExpr_R(env, sg->data);
+      AMD64CondCode cc
+         = iselCondCode(env, sg->guard);
+      if (szB == 16) {
+         addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr));
+      } else {
+         addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
+      }
        return;
     }
  
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c

index 305e915bb82bca31d043b591bd692e12f9226167..dd9d7ba948667d5c7234219f6f5f7655af416b05 100644 (file)
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -1470,13 +1470,14 @@ void ppIRStoreG ( const IRStoreG* sg )
  void ppIRLoadGOp ( IRLoadGOp cvt )
  {
     switch (cvt) {
-      case ILGop_INVALID: vex_printf("ILGop_INVALID"); break;      
-      case ILGop_Ident64: vex_printf("Ident64"); break;      
-      case ILGop_Ident32: vex_printf("Ident32"); break;      
-      case ILGop_16Uto32: vex_printf("16Uto32"); break;      
-      case ILGop_16Sto32: vex_printf("16Sto32"); break;      
-      case ILGop_8Uto32:  vex_printf("8Uto32"); break;      
-      case ILGop_8Sto32:  vex_printf("8Sto32"); break;      
+      case ILGop_INVALID:   vex_printf("ILGop_INVALID"); break;      
+      case ILGop_IdentV128: vex_printf("IdentV128"); break;      
+      case ILGop_Ident64:   vex_printf("Ident64"); break;      
+      case ILGop_Ident32:   vex_printf("Ident32"); break;      
+      case ILGop_16Uto32:   vex_printf("16Uto32"); break;      
+      case ILGop_16Sto32:   vex_printf("16Sto32"); break;      
+      case ILGop_8Uto32:    vex_printf("8Uto32"); break;      
+      case ILGop_8Sto32:    vex_printf("8Sto32"); break;      
        default: vpanic("ppIRLoadGOp");
     }
  }
@@ -3525,6 +3526,8 @@ void typeOfIRLoadGOp ( IRLoadGOp cvt,
                         /*OUT*/IRType* t_res, /*OUT*/IRType* t_arg )
  {
     switch (cvt) {
+      case ILGop_IdentV128:
+         *t_res = Ity_V128; *t_arg = Ity_V128; break;
        case ILGop_Ident64:
           *t_res = Ity_I64; *t_arg = Ity_I64; break;
        case ILGop_Ident32:
diff --git a/VEX/priv/ir_opt.c b/VEX/priv/ir_opt.c

index 52cef9bad23a555636b1872f28122cc69ab88233..c9b2c3ba6de958ae39b3ac85b9095e93129ca6f9 100644 (file)
--- a/VEX/priv/ir_opt.c
+++ b/VEX/priv/ir_opt.c
@@ -1264,6 +1264,7 @@ static IRExpr* mkZeroOfPrimopResultType ( IROp op )
        case Iop_Xor64: return IRExpr_Const(IRConst_U64(0));
        case Iop_XorV128:
        case Iop_AndV128: return IRExpr_Const(IRConst_V128(0));
+      case Iop_XorV256:
        case Iop_AndV256: return IRExpr_Const(IRConst_V256(0));
        default: vpanic("mkZeroOfPrimopResultType: bad primop");
     }
@@ -2285,6 +2286,7 @@ static IRExpr* fold_Expr ( IRExpr** env, IRExpr* e )
              case Iop_Xor32:
              case Iop_Xor64:
              case Iop_XorV128:
+            case Iop_XorV256:
                 /* Xor8/16/32/64/V128(t,t) ==> 0, for some IRTemp t */
                 if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                    e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
@@ -2887,6 +2889,8 @@ IRSB* cprop_BB ( IRSB* in )
        typeOfIRLoadGOp(lg->cvt, &cvtRes, &cvtArg);
        IROp cvtOp = Iop_INVALID;
        switch (lg->cvt) {
+         case ILGop_IdentV128:
+         case ILGop_Ident64:
           case ILGop_Ident32: break;
           case ILGop_8Uto32:  cvtOp = Iop_8Uto32;  break;
           case ILGop_8Sto32:  cvtOp = Iop_8Sto32;  break;
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h

index 67edc8a7dd743a3d263b4d86ff26d8f0d89f2e25..e61291cc6486c798e3574878b296f77ae3354948 100644 (file)
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -2592,6 +2592,7 @@ typedef
  typedef
     enum {
        ILGop_INVALID=0x1D00,
+      ILGop_IdentV128, /* 128 bit vector, no conversion */
        ILGop_Ident64,   /* 64 bit, no conversion */
        ILGop_Ident32,   /* 32 bit, no conversion */
        ILGop_16Uto32,   /* 16 bit load, Z-widen to 32 */
author	Julian Seward <jseward@acm.org>
	Wed, 12 Aug 2015 11:15:53 +0000 (11:15 +0000)
committer	Julian Seward <jseward@acm.org>
	Wed, 12 Aug 2015 11:15:53 +0000 (11:15 +0000)
VEX/priv/guest_amd64_defs.h		patch \| blob \| blame \| history
VEX/priv/guest_amd64_helpers.c		patch \| blob \| blame \| history
VEX/priv/guest_amd64_toIR.c		patch \| blob \| blame \| history
VEX/priv/host_amd64_defs.c		patch \| blob \| blame \| history
VEX/priv/host_amd64_defs.h		patch \| blob \| blame \| history
VEX/priv/host_amd64_isel.c		patch \| blob \| blame \| history
VEX/priv/ir_defs.c		patch \| blob \| blame \| history
VEX/priv/ir_opt.c		patch \| blob \| blame \| history
VEX/pub/libvex_ir.h		patch \| blob \| blame \| history