From: Julian Seward
Date: Fri, 16 Aug 2013 08:31:29 +0000 (+0000)
Subject: Add support for direct V256 shadow helper returns -- memcheck side.
X-Git-Tag: svn/VALGRIND_3_9_0~191
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cd24a631d94bc185bb245606e3128ed645b77d7d;p=thirdparty%2Fvalgrind.git

Add support for direct V256 shadow helper returns -- memcheck side.
(Patrick J. LoPresti, lopresti@gmail.com).  Bug 294285.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13500
---

diff --git a/memcheck/mc_include.h b/memcheck/mc_include.h
index 27815368dd..24d53fd947 100644
--- a/memcheck/mc_include.h
+++ b/memcheck/mc_include.h
@@ -580,6 +580,8 @@ VG_REGPARM(2) void MC_(helperc_STOREV16be) ( Addr, UWord );
 VG_REGPARM(2) void MC_(helperc_STOREV16le) ( Addr, UWord );
 VG_REGPARM(2) void MC_(helperc_STOREV8)   ( Addr, UWord );
 
+VG_REGPARM(2) void MC_(helperc_LOADV256be) ( /*OUT*/V256*, Addr );
+VG_REGPARM(2) void MC_(helperc_LOADV256le) ( /*OUT*/V256*, Addr );
 VG_REGPARM(2) void MC_(helperc_LOADV128be) ( /*OUT*/V128*, Addr );
 VG_REGPARM(2) void MC_(helperc_LOADV128le) ( /*OUT*/V128*, Addr );
 VG_REGPARM(1) ULong MC_(helperc_LOADV64be) ( Addr );
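All of the LOADV helpers declared above traffic in shadow ("V") bits, where a
0 bit means "defined" and a 1 bit means "undefined"; the mc_main.c change
below relies on this when it asserts V_BIT_UNDEFINED == 1 && V_BIT_DEFINED == 0
so that bitwise OR implements UifU.  A minimal standalone sketch of that
property (local stand-in constants, not memcheck's real declarations):

    #include <assert.h>
    #include <stdio.h>

    typedef unsigned long long ULong;

    /* Stand-ins mirroring memcheck's encoding: V bit 0 = defined,
       1 = undefined. */
    #define V_BITS64_DEFINED   0x0000000000000000ULL
    #define V_BITS64_UNDEFINED 0xFFFFFFFFFFFFFFFFULL

    /* UifU: a result bit is undefined if either operand bit is;
       under this encoding that is plain bitwise OR. */
    static ULong uifu64 ( ULong va, ULong vb ) { return va | vb; }

    int main ( void )
    {
       ULong half = 0x00000000FFFFFFFFULL;  /* low 32 bits undefined */
       assert(uifu64(V_BITS64_DEFINED,   half) == half);
       assert(uifu64(V_BITS64_UNDEFINED, half) == V_BITS64_UNDEFINED);
       printf("OR implements UifU under the 0-is-defined encoding\n");
       return 0;
    }

OR is exactly the lattice join here: combining any V word with an
all-undefined word yields all-undefined, and with an all-defined word is the
identity.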
diff --git a/memcheck/mc_main.c b/memcheck/mc_main.c
index 4608e98412..dbc347f2f7 100644
--- a/memcheck/mc_main.c
+++ b/memcheck/mc_main.c
@@ -1130,43 +1130,45 @@ static Bool parse_ignore_ranges ( const HChar* str0 )
 
 static
 __attribute__((noinline))
-void mc_LOADV128_slow ( /*OUT*/V128* res, Addr a, Bool bigendian )
-{
-   SizeT nBits = 128;
-   V128 vbits128;    /* result */
-   V128 pessim128;   /* only used when p-l-ok=yes */
-   SSizeT bytes_per_long = 64 / 8;
-   SSizeT szL = nBits / 64;  /* Size in longs */
-   SSizeT szB = bytes_per_long * szL;
+void mc_LOADVx_slow ( /*OUT*/ULong* res, Addr a, SizeT nBits, Bool bigendian )
+{
+   ULong  pessim[4];      /* only used when p-l-ok=yes */
+   SSizeT szB = nBits / 8;
+   SSizeT szL = szB / 8;  /* Size in longs */
    SSizeT i, j;           /* Must be signed. */
    SizeT  n_addrs_bad = 0;
    Addr   ai;
    UChar  vbits8;
    Bool   ok;
 
-   vbits128.w64[0] = V_BITS64_UNDEFINED;
-   vbits128.w64[1] = V_BITS64_UNDEFINED;
-   pessim128.w64[0] = V_BITS64_DEFINED;
-   pessim128.w64[1] = V_BITS64_DEFINED;
+   /* Code below assumes load size is a power of two and at least 64
+      bits. */
+   tl_assert((szB & (szB-1)) == 0 && szL > 0);
 
-   tl_assert(nBits == 128);
+   /* If this triggers, you probably just need to increase the size of
+      the pessim array. */
+   tl_assert(szL <= sizeof(pessim) / sizeof(pessim[0]));
 
-   /* Make up a 128-bit result V word, which contains the loaded data
-      for valid addresses and Defined for invalid addresses.  Iterate
-      over the bytes in the word, from the most significant down to
-      the least.  The vbits to return are calculated into vbits128.
-      Also compute the pessimising value to be used when
+   for (j=0 ; j < szL ; j++) {
+      pessim[j] = V_BITS64_DEFINED;
+      res[j] = V_BITS64_UNDEFINED;
+   }
+
+   /* Make up a result V word, which contains the loaded data for
+      valid addresses and Defined for invalid addresses.  Iterate over
+      the bytes in the word, from the most significant down to the
+      least.  The vbits to return are calculated into vbits128.  Also
+      compute the pessimising value to be used when
       --partial-loads-ok=yes.  n_addrs_bad is redundant (the relevant
-      info can be gleaned from pessim128) but is used as a
+      info can be gleaned from the pessim array) but is used as a
       cross-check.
    */
    for (j = szL-1 ; j >= 0 ; j--) {
       ULong vbits64    = V_BITS64_UNDEFINED;
       ULong pessim64   = V_BITS64_DEFINED;
       UWord long_index = byte_offset_w(szL, bigendian, j);
-      for (i = bytes_per_long-1; i >= 0; i--) {
+      for (i = 8-1; i >= 0; i--) {
         PROF_EVENT(31, "mc_LOADV128_slow(loop)");
-         ai = a + long_index*bytes_per_long + byte_offset_w(bytes_per_long,
-                                                            bigendian, i);
+         ai = a + 8*long_index + byte_offset_w(8, bigendian, i);
         ok = get_vbits8(ai, &vbits8);
         vbits64 <<= 8;
         vbits64 |= vbits8;
@@ -1174,22 +1176,19 @@ void mc_LOADV128_slow ( /*OUT*/V128* res, Addr a, Bool bigendian )
         pessim64 <<= 8;
         pessim64 |= (ok ? V_BITS8_DEFINED : V_BITS8_UNDEFINED);
      }
-      vbits128.w64[long_index] = vbits64;
-      pessim128.w64[long_index] = pessim64;
+      res[long_index] = vbits64;
+      pessim[long_index] = pessim64;
   }
 
   /* In the common case, all the addresses involved are valid, so we
      just return the computed V bits and have done. */
-   if (LIKELY(n_addrs_bad == 0)) {
-      *res = vbits128;
+   if (LIKELY(n_addrs_bad == 0))
      return;
-   }
 
   /* If there's no possibility of getting a partial-loads-ok
      exemption, report the error and quit. */
   if (!MC_(clo_partial_loads_ok)) {
      MC_(record_address_error)( VG_(get_running_tid)(), a, szB, False );
-      *res = vbits128;
      return;
   }
 
@@ -1199,7 +1198,7 @@ void mc_LOADV128_slow ( /*OUT*/V128* res, Addr a, Bool bigendian )
      false negatives.  If it doesn't apply, just report an addressing
      error in the usual way. */
 
-   /* Some code steps along byte strings in aligned word-sized chunks
+   /* Some code steps along byte strings in aligned chunks
      even when there is only a partially defined word at the end (eg,
      optimised strlen).  This is allowed by the memory model of modern
      machines, since an aligned load cannot span two pages and
@@ -1217,29 +1216,28 @@ void mc_LOADV128_slow ( /*OUT*/V128* res, Addr a, Bool bigendian )
   */
 
   /* "at least one of the addresses is invalid" */
-   tl_assert(pessim128.w64[0] != V_BITS64_DEFINED
-             || pessim128.w64[1] != V_BITS64_DEFINED);
+   ok = False;
+   for (j=0 ; j < szL ; j++)
+      ok |= pessim[j] != V_BITS8_DEFINED;
+   tl_assert(ok);
 
   if (0 == (a & (szB - 1)) && n_addrs_bad < szB) {
      /* Exemption applies.  Use the previously computed pessimising
-         value for vbits128 and return the combined result, but don't
-         flag an addressing error.  The pessimising value is Defined
-         for valid addresses and Undefined for invalid addresses. */
+         value and return the combined result, but don't flag an
+         addressing error.  The pessimising value is Defined for valid
+         addresses and Undefined for invalid addresses. */
      /* for assumption that doing bitwise or implements UifU */
      tl_assert(V_BIT_UNDEFINED == 1 && V_BIT_DEFINED == 0);
      /* (really need "UifU" here...)
-         vbits128 UifU= pessim128  (is pessimised by it, iow) */
+         vbits[j] UifU= pessim[j]  (is pessimised by it, iow) */
      for (j = szL-1 ; j >= 0 ; j--)
-         vbits128.w64[j] |= pessim128.w64[j];
-      *res = vbits128;
+         res[j] |= pessim[j];
      return;
   }
 
   /* Exemption doesn't apply.  Flag an addressing error in the normal
      way.
   */
   MC_(record_address_error)( VG_(get_running_tid)(), a, szB, False );
-
-   *res = vbits128;
 }
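The partial-loads-ok exemption above admits a szB-aligned load with at least
one addressible byte, and pessimises the result rather than flagging an
address error.  A standalone sketch of just that test and the pessimising OR,
with illustrative values (not memcheck code; assumes szB is a power of two,
as the function itself asserts):

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    typedef unsigned long long ULong;

    /* The exemption test from mc_LOADVx_slow: aligned, and not every
       byte was unaddressible. */
    static bool exemption_applies ( size_t a, size_t szB,
                                    size_t n_addrs_bad )
    {
       return (a & (szB - 1)) == 0 && n_addrs_bad < szB;
    }

    int main ( void )
    {
       /* A 32-byte (V256) load: aligned with one bad byte passes;
          misaligned or entirely-bad loads do not. */
       assert( exemption_applies(0x1000, 32, 1));
       assert(!exemption_applies(0x1001, 32, 1));
       assert(!exemption_applies(0x1000, 32, 32));

       /* Pessimising, as in the res[j] |= pessim[j] loop: OR the
          Defined/Undefined mask into the computed V bits. */
       ULong res    = 0x00000000000000FFULL;  /* low byte undefined */
       ULong pessim = 0xFF00000000000000ULL;  /* top byte unaddressible */
       res |= pessim;
       assert(res == 0xFF000000000000FFULL);
       return 0;
    }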
@@ -4207,28 +4205,29 @@ static void mc_pre_reg_read ( CorePart part, ThreadId tid, const HChar* s,
 
 /* ------------------------ Size = 16 ------------------------ */
 
 static INLINE
-void mc_LOADV128 ( /*OUT*/V128* res, Addr a, Bool isBigEndian )
+void mc_LOADVx ( /*OUT*/ULong* res, Addr a, SizeT nBits, Bool isBigEndian )
 {
-   PROF_EVENT(200, "mc_LOADV128");
+   PROF_EVENT(200, "mc_LOADVx");
 
 #ifndef PERF_FAST_LOADV
-   mc_LOADV128_slow( res, a, isBigEndian );
+   mc_LOADVx_slow( res, a, nBits, isBigEndian );
   return;
 #else
   {
      UWord   sm_off16, vabits16;
      SecMap* sm;
      int     j;
+      int     nBytes = nBits / 8;
 
-      if (UNLIKELY( UNALIGNED_OR_HIGH(a,128) )) {
-         PROF_EVENT(201, "mc_LOADV128-slow1");
-         mc_LOADV128_slow( res, a, isBigEndian );
+      if (UNLIKELY( UNALIGNED_OR_HIGH(a,nBits) )) {
+         PROF_EVENT(201, "mc_LOADVx-slow1");
+         mc_LOADVx_slow( res, a, nBits, isBigEndian );
         return;
      }
 
-      // Handle common cases quickly: a (and a+8) is suitably aligned,
-      // is mapped, and addressible.
-      for (j=0 ; j<2 ; ++j) {
+      /* Handle common cases quickly: a (and a+8 and a+16 etc.) is
+         suitably aligned, is mapped, and addressible. */
+      for (j=0 ; j<nBytes/8 ; ++j) {
         sm       = get_secmap_for_reading_low(a + 8*j);
         sm_off16 = SM_OFF_16(a + 8*j);
         vabits16 = ((UShort*)(sm->vabits8))[sm_off16];
@@ -4236,14 +4235,14 @@ void mc_LOADV128 ( /*OUT*/V128* res, Addr a, Bool isBigEndian )
         // Convert V bits from compact memory form to expanded
         // register form.
         if (LIKELY(vabits16 == VA_BITS16_DEFINED)) {
-            res->w64[j] = V_BITS64_DEFINED;
+            res[j] = V_BITS64_DEFINED;
         } else if (LIKELY(vabits16 == VA_BITS16_UNDEFINED)) {
-            res->w64[j] = V_BITS64_UNDEFINED;
+            res[j] = V_BITS64_UNDEFINED;
         } else {
            /* Slow case: some block of 8 bytes are not all-defined or
              all-undefined. */
-            PROF_EVENT(202, "mc_LOADV128-slow2");
-            mc_LOADV128_slow( res, a, isBigEndian );
+            PROF_EVENT(202, "mc_LOADVx-slow2");
+            mc_LOADVx_slow( res, a, nBits, isBigEndian );
            return;
         }
      }
@@ -4252,16 +4251,24 @@ void mc_LOADV128 ( /*OUT*/V128* res, Addr a, Bool isBigEndian )
 #endif
 }
 
+VG_REGPARM(2) void MC_(helperc_LOADV256be) ( /*OUT*/V256* res, Addr a )
+{
+   mc_LOADVx(&res->w64[0], a, 256, True);
+}
+VG_REGPARM(2) void MC_(helperc_LOADV256le) ( /*OUT*/V256* res, Addr a )
+{
+   mc_LOADVx(&res->w64[0], a, 256, False);
+}
+
 VG_REGPARM(2) void MC_(helperc_LOADV128be) ( /*OUT*/V128* res, Addr a )
 {
-   mc_LOADV128(res, a, True);
+   mc_LOADVx(&res->w64[0], a, 128, True);
 }
 VG_REGPARM(2) void MC_(helperc_LOADV128le) ( /*OUT*/V128* res, Addr a )
 {
-   mc_LOADV128(res, a, False);
+   mc_LOADVx(&res->w64[0], a, 128, False);
 }
 
-
 /* ------------------------ Size = 8 ------------------------ */
 
 static INLINE
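The fast path above inspects one 16-bit compact VA word per 8-byte chunk and
expands only the all-defined and all-undefined cases into register-form V
bits, deferring anything mixed to mc_LOADVx_slow.  A rough standalone sketch
of that shape; the VA_BITS16_* values follow memcheck's two-bits-per-byte
encoding (binary 10 = defined, 01 = undefined), but the authoritative
definitions live in mc_main.c:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    typedef unsigned long long ULong;
    typedef unsigned short     UShort;

    #define V_BITS64_DEFINED    0x0000000000000000ULL
    #define V_BITS64_UNDEFINED  0xFFFFFFFFFFFFFFFFULL
    #define VA_BITS16_DEFINED   0xAAAA  /* 8 bytes x binary 10 */
    #define VA_BITS16_UNDEFINED 0x5555  /* 8 bytes x binary 01 */

    /* Expand one compact VA word per 64-bit chunk; returns false when
       any chunk is mixed and the slow path would be needed. */
    static bool fast_expand ( ULong* res, const UShort* vabits16s,
                              size_t nBits )
    {
       size_t j;
       for (j = 0; j < nBits/64; j++) {
          if (vabits16s[j] == VA_BITS16_DEFINED)
             res[j] = V_BITS64_DEFINED;
          else if (vabits16s[j] == VA_BITS16_UNDEFINED)
             res[j] = V_BITS64_UNDEFINED;
          else
             return false;
       }
       return true;
    }

    int main ( void )
    {
       UShort va[4] = { 0xAAAA, 0xAAAA, 0x5555, 0xAAAA };
       ULong  res[4];
       assert(fast_expand(res, va, 256));   /* a V256-sized load */
       assert(res[2] == V_BITS64_UNDEFINED && res[3] == V_BITS64_DEFINED);
       va[1] = 0xAAA6;                      /* mixed chunk */
       assert(!fast_expand(res, va, 256));
       return 0;
    }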
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index cebbec27c0..d04e6484d1 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -4183,8 +4183,8 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
   The definedness of |guard| itself is not checked.  That is assumed
   to have been done before this point, by the caller.
 */
 static
-IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, 
-                              IREndness end, IRType ty, 
+IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
+                              IREndness end, IRType ty,
                               IRAtom* addr, UInt bias, IRAtom* guard )
 {
   tl_assert(isOriginalAtom(mce,addr));
@@ -4202,8 +4202,12 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
   const HChar* hname = NULL;
   Bool ret_via_outparam = False;
 
-   if (end == Iend_LE) {
+   if (end == Iend_LE) {
      switch (ty) {
+         case Ity_V256: helper = &MC_(helperc_LOADV256le);
+                        hname = "MC_(helperc_LOADV256le)";
+                        ret_via_outparam = True;
+                        break;
         case Ity_V128: helper = &MC_(helperc_LOADV128le);
                        hname = "MC_(helperc_LOADV128le)";
                        ret_via_outparam = True;
@@ -4225,6 +4229,10 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
      }
   } else {
      switch (ty) {
+         case Ity_V256: helper = &MC_(helperc_LOADV256be);
+                        hname = "MC_(helperc_LOADV256be)";
+                        ret_via_outparam = True;
+                        break;
         case Ity_V128: helper = &MC_(helperc_LOADV128be);
                        hname = "MC_(helperc_LOADV128be)";
                        ret_via_outparam = True;
@@ -4309,37 +4317,20 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
   definedness of |guard| before this point. */
 
 static
-IRAtom* expr2vbits_Load ( MCEnv* mce, 
-                          IREndness end, IRType ty, 
+IRAtom* expr2vbits_Load ( MCEnv* mce,
+                          IREndness end, IRType ty,
                           IRAtom* addr, UInt bias, IRAtom* guard )
 {
   tl_assert(end == Iend_LE || end == Iend_BE);
   switch (shadowTypeV(ty)) {
-      case Ity_I8: 
-      case Ity_I16: 
-      case Ity_I32: 
+      case Ity_I8:
+      case Ity_I16:
+      case Ity_I32:
      case Ity_I64:
      case Ity_V128:
+      case Ity_V256:
        return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
-      case Ity_V256: {
-         /* V256-bit case -- phrased in terms of 64 bit units (Qs),
-            with Q3 being the most significant lane. */
-         if (end == Iend_BE) goto unhandled;
-         IRAtom* v64Q0
-            = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+0,  guard);
-         IRAtom* v64Q1
-            = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+8,  guard);
-         IRAtom* v64Q2
-            = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+16, guard);
-         IRAtom* v64Q3
-            = expr2vbits_Load_WRK(mce, end, Ity_I64, addr, bias+24, guard);
-         return assignNew( 'V', mce,
-                           Ity_V256,
-                           IRExpr_Qop(Iop_64x4toV256,
-                                      v64Q3, v64Q2, v64Q1, v64Q0));
-      }
-      unhandled:
      default:
        VG_(tool_panic)("expr2vbits_Load");
   }
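With this change, a V256 load's shadow is fetched by a single out-parameter
helper call for either endianness, instead of four 64-bit shadow loads
recombined with Iop_64x4toV256 (a scheme that also left the big-endian case
unhandled).  A rough standalone sketch of the resulting call shape; the names
are illustrative, and only V256's layout (four ULongs, as used by
res->w64[0] above) is taken from the patch:

    #include <assert.h>
    #include <stddef.h>

    typedef unsigned long long ULong;
    typedef struct { ULong w64[4]; } V256;   /* mirrors VEX's V256 */

    #define V_BITS64_DEFINED 0x0000000000000000ULL

    /* Illustrative stand-in for MC_(helperc_LOADV256le): the real
       helper consults the shadow memory maps; this one just reports
       "all defined". */
    static void load_v256_shadow_sketch ( /*OUT*/V256* res, size_t a )
    {
       int j;
       (void)a;
       for (j = 0; j < 4; j++)
          res->w64[j] = V_BITS64_DEFINED;
    }

    int main ( void )
    {
       V256 shadow;
       /* One call fills all 256 shadow bits at once. */
       load_v256_shadow_sketch(&shadow, 0x1000);
       assert(shadow.w64[0] == V_BITS64_DEFINED);
       return 0;
    }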