From: Julian Seward <jseward@acm.org>
Date: Wed, 20 Oct 2010 21:38:42 +0000 (+0000)
Subject: Merge from trunk, r2067 (Add support for SMSAD{X}, SMLSD{X}, USAD{A}8.)
X-Git-Tag: svn/VALGRIND_3_6_1^2~15
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=886e7307581475f4d878eb9886dde426f4afe345;p=thirdparty%2Fvalgrind.git

Merge from trunk, r2067 (Add support for SMSAD{X}, SMLSD{X}, USAD{A}8.)


git-svn-id: svn://svn.valgrind.org/vex/branches/VEX_3_6_BRANCH@2068
---

diff --git a/VEX/priv/guest_arm_toIR.c b/VEX/priv/guest_arm_toIR.c
index 277a90d5f4..c1f92111f6 100644
--- a/VEX/priv/guest_arm_toIR.c
+++ b/VEX/priv/guest_arm_toIR.c
@@ -1667,6 +1667,24 @@ static void armSignedSatQ( IRTemp regT,    /* value to clamp - Ity_I32 */
 }
 
 
+/* Build an expression that is 1 :: I32 if the 32-bit signed addition
+   argL + argR overflowed, else 0 :: I32.  resE must be the sum; the
+   test is bit 31 of (res ^ argL) & (res ^ argR).  HD p27. */
+static
+IRExpr* signed_overflow_after_Add32 ( IRExpr* resE,
+                                      IRTemp argL, IRTemp argR )
+{
+   IRTemp res = newTemp(Ity_I32);
+   assign(res, resE);   /* bind resE to a temp; it is read twice below */
+   return
+      binop( Iop_Shr32, 
+             binop( Iop_And32,
+                    binop( Iop_Xor32, mkexpr(res), mkexpr(argL) ),
+                    binop( Iop_Xor32, mkexpr(res), mkexpr(argR) )), 
+             mkU8(31) );
+}
+
+
 /*------------------------------------------------------------*/
 /*--- Larger helpers                                       ---*/
 /*------------------------------------------------------------*/
@@ -9651,27 +9669,31 @@ static Bool decode_V6MEDIA_instruction (
    }
 
    /* --------------- smuad, smuadx<c><Rd>,<Rn>,<Rm> --------------- */
+   /* --------------- smusd, smusdx<c><Rd>,<Rn>,<Rm> --------------- */
    {
      UInt regD = 99, regN = 99, regM = 99, bitM = 99;
-     Bool gate = False;
+     Bool gate = False, isAD = False;
 
      if (isT) {
-        if (INSNT0(15,4) == 0xFB2 && (INSNT1(15,0) & 0xF0E0) == 0xF000) {
+        if ((INSNT0(15,4) == 0xFB2 || INSNT0(15,4) == 0xFB4)
+            && (INSNT1(15,0) & 0xF0E0) == 0xF000) {
            regN = INSNT0(3,0);
            regD = INSNT1(11,8);
            regM = INSNT1(3,0);
            bitM = INSNT1(4,4);
+           isAD = INSNT0(15,4) == 0xFB2;
            if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
               gate = True;
         }
      } else {
         if (INSNA(27,20) == BITS8(0,1,1,1,0,0,0,0) &&
             INSNA(15,12) == BITS4(1,1,1,1)         &&
-            (INSNA(7,4) & BITS4(1,1,0,1)) == BITS4(0,0,0,1) ) {
+            (INSNA(7,4) & BITS4(1,0,0,1)) == BITS4(0,0,0,1) ) {
            regD = INSNA(19,16);
            regN = INSNA(3,0);
            regM = INSNA(11,8);
            bitM = INSNA(5,5);
+           isAD = INSNA(6,6) == 0;
            if (regD != 15 && regN != 15 && regM != 15)
               gate = True;
         }
@@ -9701,22 +9723,24 @@ static Bool decode_V6MEDIA_instruction (
                                    binop(Iop_Sar32, mkexpr(irt_regN), mkU8(16)), 
                                    binop(Iop_Sar32, mkexpr(irt_regM), mkU8(16))) );
         IRExpr* ire_result 
-           = binop( Iop_Add32, mkexpr(irt_prod_lo), mkexpr(irt_prod_hi) );
+           = binop( isAD ? Iop_Add32 : Iop_Sub32,
+                    mkexpr(irt_prod_lo), mkexpr(irt_prod_hi) );
 
         if (isT)
            putIRegT( regD, ire_result, condT );
         else
            putIRegA( regD, ire_result, condT, Ijk_Boring );
 
-        or_into_QFLAG32( binop( Iop_Shr32, 
-                                binop( Iop_And32,
-                                       binop( Iop_Xor32, ire_result, 
-                                              mkexpr(irt_prod_hi) ),
-                                       binop( Iop_Xor32, ire_result, 
-                                              mkexpr(irt_prod_lo) ) ), 
-                                mkU8(31)), condT );
+        if (isAD) {
+           or_into_QFLAG32(
+              signed_overflow_after_Add32( ire_result,
+                                           irt_prod_lo, irt_prod_hi ),
+              condT
+           );
+        }
 
-        DIP("smuad%s%s r%u, r%u, r%u\n",
+        DIP("smu%cd%s%s r%u, r%u, r%u\n",
+            isAD ? 'a' : 's',
             bitM ? "x" : "", nCC(conq), regD, regN, regM);
         return True;
      }
@@ -9724,29 +9748,33 @@ static Bool decode_V6MEDIA_instruction (
    }
 
    /* --------------- smlad{X}<c> <Rd>,<Rn>,<Rm>,<Ra> -------------- */
+   /* --------------- smlsd{X}<c> <Rd>,<Rn>,<Rm>,<Ra> -------------- */
    {
      UInt regD = 99, regN = 99, regM = 99, regA = 99, bitM = 99;
-     Bool gate = False;
+     Bool gate = False, isAD = False;
 
      if (isT) {
-        if (INSNT0(15,4) == 0xFB2 && INSNT1(7,5) == BITS3(0,0,0)) {
+       if ((INSNT0(15,4) == 0xFB2 || INSNT0(15,4) == 0xFB4)
+           && INSNT1(7,5) == BITS3(0,0,0)) {
            regN = INSNT0(3,0);
            regD = INSNT1(11,8);
            regM = INSNT1(3,0);
            regA = INSNT1(15,12);
            bitM = INSNT1(4,4);
+           isAD = INSNT0(15,4) == 0xFB2;
            if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM)
                && !isBadRegT(regA))
               gate = True;
         }
      } else {
         if (INSNA(27,20) == BITS8(0,1,1,1,0,0,0,0) &&
-            (INSNA(7,4) & BITS4(1,1,0,1)) == BITS4(0,0,0,1)) {
+            (INSNA(7,4) & BITS4(1,0,0,1)) == BITS4(0,0,0,1)) {
            regD = INSNA(19,16);
            regA = INSNA(15,12);
            regN = INSNA(3,0);
            regM = INSNA(11,8);
            bitM = INSNA(5,5);
+           isAD = INSNA(6,6) == 0;
            if (regD != 15 && regN != 15 && regM != 15 && regA != 15)
               gate = True;
         }
@@ -9779,7 +9807,7 @@ static Bool decode_V6MEDIA_instruction (
                 binop( Iop_Mul32, 
                        binop( Iop_Sar32, mkexpr(irt_regN), mkU8(16) ), 
                        binop( Iop_Sar32, mkexpr(irt_regM), mkU8(16) ) ) );
-        assign( irt_sum, binop( Iop_Add32, 
+        assign( irt_sum, binop( isAD ? Iop_Add32 : Iop_Sub32, 
                                 mkexpr(irt_prod_lo), mkexpr(irt_prod_hi) ) );
 
         IRExpr* ire_result = binop(Iop_Add32, mkexpr(irt_sum), mkexpr(irt_regA));
@@ -9789,22 +9817,21 @@ static Bool decode_V6MEDIA_instruction (
         else
            putIRegA( regD, ire_result, condT, Ijk_Boring );
 
-        or_into_QFLAG32( binop( Iop_Shr32, 
-                                binop( Iop_And32,
-                                       binop( Iop_Xor32, mkexpr(irt_sum), 
-                                              mkexpr(irt_prod_lo) ),
-                                       binop( Iop_Xor32, mkexpr(irt_sum), 
-                                              mkexpr(irt_prod_hi) ) ), 
-                                mkU8(31)), condT );
-        or_into_QFLAG32( binop( Iop_Shr32, 
-                                binop( Iop_And32,
-                                       binop( Iop_Xor32, ire_result, 
-                                              mkexpr(irt_sum) ),
-                                       binop( Iop_Xor32, ire_result, 
-                                              mkexpr(irt_regA) ) ), 
-                                mkU8(31)), condT );
-
-        DIP("smlad%s%s r%u, r%u, r%u, r%u\n",
+        if (isAD) {
+           or_into_QFLAG32(
+              signed_overflow_after_Add32( mkexpr(irt_sum),
+                                           irt_prod_lo, irt_prod_hi ),
+              condT
+           );
+        }
+
+        or_into_QFLAG32(
+           signed_overflow_after_Add32( ire_result, irt_sum, irt_regA ),
+           condT
+        );
+
+        DIP("sml%cd%s%s r%u, r%u, r%u, r%u\n",
+            isAD ? 'a' : 's',
             bitM ? "x" : "", nCC(conq), regD, regN, regM, regA);
         return True;
      }
@@ -9868,14 +9895,10 @@ static Bool decode_V6MEDIA_instruction (
         else
            putIRegA( regD, ire_result, condT, Ijk_Boring );
 
-        or_into_QFLAG32( binop( Iop_Shr32, 
-                                binop( Iop_And32,
-                                       binop(Iop_Xor32, 
-                                             ire_result, mkexpr(irt_prod)),
-                                       binop(Iop_Xor32, 
-                                             ire_result, mkexpr(irt_regA)) ), 
-                                mkU8(31)), 
-                         condT );
+        or_into_QFLAG32(
+           signed_overflow_after_Add32( ire_result, irt_prod, irt_regA ),
+           condT
+        );
 
         DIP( "smla%c%c%s r%u, r%u, r%u, r%u\n", 
              bitN ? 't' : 'b', bitM ? 't' : 'b', 
@@ -9943,14 +9966,10 @@ static Bool decode_V6MEDIA_instruction (
         else
            putIRegA( regD, ire_result, condT, Ijk_Boring );
 
-        or_into_QFLAG32( binop( Iop_Shr32, 
-                                binop( Iop_And32,
-                                       binop(Iop_Xor32, 
-                                             ire_result, mkexpr(prod32)),
-                                       binop(Iop_Xor32, 
-                                             ire_result, mkexpr(irt_regA)) ), 
-                                mkU8(31)), 
-                         condT );
+        or_into_QFLAG32(
+           signed_overflow_after_Add32( ire_result, prod32, irt_regA ),
+           condT
+        );
 
         DIP( "smlaw%c%s r%u, r%u, r%u, r%u\n", 
              bitM ? 't' : 'b', 
@@ -10111,6 +10130,59 @@ static Bool decode_V6MEDIA_instruction (
      /* fall through */
    }
 
+   /* --------------- usad8  Rd,Rn,Rm    ---------------- */
+   /* --------------- usada8 Rd,Rn,Rm,Ra ---------------- */
+   {
+     UInt rD = 99, rN = 99, rM = 99, rA = 99;
+     Bool gate = False;
+
+     if (isT) {
+       if (INSNT0(15,4) == 0xFB7 && INSNT1(7,4) == BITS4(0,0,0,0)) {
+           rN = INSNT0(3,0);
+           rA = INSNT1(15,12);
+           rD = INSNT1(11,8);
+           rM = INSNT1(3,0);
+           if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM) && rA != 13)
+              gate = True;
+        }
+     } else {
+        if (INSNA(27,20) == BITS8(0,1,1,1,1,0,0,0) &&
+            INSNA(7,4)   == BITS4(0,0,0,1) ) {
+           rD = INSNA(19,16);
+           rA = INSNA(15,12);
+           rM = INSNA(11,8);
+           rN = INSNA(3,0);
+           if (rD != 15 && rN != 15 && rM != 15 /* but rA can be 15 */)
+              gate = True;
+        }
+     }
+     /* We allow rA == 15, to denote the usad8 (no accumulator) case. */
+
+     if (gate) {
+        IRExpr* rNe = isT ? getIRegT(rN) : getIRegA(rN);
+        IRExpr* rMe = isT ? getIRegT(rM) : getIRegA(rM);
+        IRExpr* rAe = rA == 15 ? mkU32(0)
+                               : (isT ? getIRegT(rA) : getIRegA(rA)); 
+        IRExpr* res = binop(Iop_Add32,
+                            binop(Iop_Sad8Ux4, rNe, rMe),
+                            rAe);
+        if (isT)
+           putIRegT( rD, res, condT );
+        else
+           putIRegA( rD, res, condT, Ijk_Boring );
+
+        if (rA == 15) {
+           DIP( "usad8%s r%u, r%u, r%u\n", 
+                nCC(conq), rD, rN, rM );
+        } else {
+           DIP( "usada8%s r%u, r%u, r%u, r%u\n", 
+                nCC(conq), rD, rN, rM, rA );
+        }
+        return True;
+     }
+     /* fall through */
+   }
+
    /* ---------- Doesn't match anything. ---------- */
    return False;
 
diff --git a/VEX/priv/host_arm_isel.c b/VEX/priv/host_arm_isel.c
index 747293830a..4bba9a35de 100644
--- a/VEX/priv/host_arm_isel.c
+++ b/VEX/priv/host_arm_isel.c
@@ -1347,6 +1347,8 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
             fn = &h_generic_calc_QSub8Sx4; break;
          case Iop_QSub8Ux4:
             fn = &h_generic_calc_QSub8Ux4; break;
+         case Iop_Sad8Ux4:
+            fn = &h_generic_calc_Sad8Ux4; break;
          default:
             break;
       }
diff --git a/VEX/priv/host_generic_simd64.c b/VEX/priv/host_generic_simd64.c
index e685ad6a3d..03d6d2ff17 100644
--- a/VEX/priv/host_generic_simd64.c
+++ b/VEX/priv/host_generic_simd64.c
@@ -439,6 +439,12 @@ static inline Char hsub8S ( Char xx, Char yy )
    return (Char)r;
 }
 
+static inline UInt absdiff8U ( UChar xx, UChar yy )  /* returns |xx - yy| */
+{
+   UInt xxu = (UChar)xx;   /* widen so the subtraction is done in UInt */
+   UInt yyu = (UChar)yy;
+   return xxu >= yyu  ? xxu - yyu  : yyu - xxu;   /* larger minus smaller */
+}
 
 /* ----------------------------------------------------- */
 /* Start of the externally visible functions.  These simply
@@ -1317,6 +1323,15 @@ UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
           );
 }
 
+UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
+{  /* Sum absdiff8U over the four sel8x4_N pairs (presumably byte lanes). */
+   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
+          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
+          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
+          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
+}
+
+
 /*---------------------------------------------------------------*/
 /*--- end                               host_generic_simd64.c ---*/
 /*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_simd64.h b/VEX/priv/host_generic_simd64.h
index c29fbf6b2f..e854fc726e 100644
--- a/VEX/priv/host_generic_simd64.h
+++ b/VEX/priv/host_generic_simd64.h
@@ -149,6 +149,8 @@ extern UInt h_generic_calc_QAdd8Sx4 ( UInt, UInt );
 extern UInt h_generic_calc_QSub8Ux4 ( UInt, UInt );
 extern UInt h_generic_calc_QSub8Sx4 ( UInt, UInt );
 
+extern UInt h_generic_calc_Sad8Ux4  ( UInt, UInt );
+
 extern UInt h_generic_calc_CmpNEZ16x2 ( UInt );
 extern UInt h_generic_calc_CmpNEZ8x4  ( UInt );
 
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index eabf831da5..f78db106e2 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -305,6 +305,7 @@ void ppIROp ( IROp op )
       case Iop_HAdd8Sx4: vex_printf("HAdd8Sx4"); return;
       case Iop_HSub8Ux4: vex_printf("HSub8Ux4"); return;
       case Iop_HSub8Sx4: vex_printf("HSub8Sx4"); return;
+      case Iop_Sad8Ux4:  vex_printf("Sad8Ux4"); return;
 
       case Iop_CmpNEZ16x2: vex_printf("CmpNEZ16x2"); return;
       case Iop_CmpNEZ8x4:  vex_printf("CmpNEZ8x4"); return;
@@ -1944,6 +1945,7 @@ void typeOfPrimop ( IROp op,
       case Iop_QSub8Sx4: case Iop_QSub8Ux4:
       case Iop_HAdd8Ux4: case Iop_HAdd8Sx4:
       case Iop_HSub8Ux4: case Iop_HSub8Sx4:
+      case Iop_Sad8Ux4:
          BINARY(Ity_I32,Ity_I32, Ity_I32);
 
       case Iop_Add64: case Iop_Sub64: case Iop_Mul64:
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index b3d78b6b06..95042aab27 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -699,6 +699,9 @@ typedef
       Iop_HAdd8Ux4, Iop_HAdd8Sx4,
       Iop_HSub8Ux4, Iop_HSub8Sx4,
 
+      /* 8x4 sum of absolute unsigned differences. */
+      Iop_Sad8Ux4,
+
       /* MISC (vector integer cmp != 0) */
       Iop_CmpNEZ16x2, Iop_CmpNEZ8x4,