From: Julian Seward <jseward@acm.org>
Date: Sun, 24 Jul 2016 18:58:21 +0000 (+0000)
Subject: Implement PMULL 1q,1d,1d and PMULL2 1q,2d,2d.  n-i-bz.
X-Git-Tag: svn/VALGRIND_3_12_0^2~34
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9f109ab25f3ae66e6420c2ff23c75f622afa1c15;p=thirdparty%2Fvalgrind.git

Implement PMULL 1q,1d,1d and PMULL2 1q,2d,2d.  n-i-bz.


git-svn-id: svn://svn.valgrind.org/vex/trunk@3232
---

diff --git a/VEX/priv/guest_arm64_defs.h b/VEX/priv/guest_arm64_defs.h
index dcaf5a062b..7ec6439d13 100644
--- a/VEX/priv/guest_arm64_defs.h
+++ b/VEX/priv/guest_arm64_defs.h
@@ -115,6 +115,9 @@ ULong arm64g_calculate_condition ( /* ARM64Condcode << 4 | cc_op */
 
 extern ULong arm64g_dirtyhelper_MRS_CNTVCT_EL0 ( void );
 
+extern void  arm64g_dirtyhelper_PMULLQ ( /*OUT*/V128* res,
+                                         ULong arg1, ULong arg2 );
+
 extern void  arm64g_dirtyhelper_AESE ( /*OUT*/V128* res,
                                        ULong argHi, ULong argLo );
 extern void  arm64g_dirtyhelper_AESD ( /*OUT*/V128* res,
diff --git a/VEX/priv/guest_arm64_helpers.c b/VEX/priv/guest_arm64_helpers.c
index e31ff2d352..5346accfa7 100644
--- a/VEX/priv/guest_arm64_helpers.c
+++ b/VEX/priv/guest_arm64_helpers.c
@@ -692,6 +692,33 @@ ULong arm64g_dirtyhelper_MRS_CNTVCT_EL0 ( void )
 }
 
 
+void arm64g_dirtyhelper_PMULLQ ( /*OUT*/V128* res, ULong arg1, ULong arg2 )
+{
+   /* Carry-less (polynomial, XOR-accumulating) 64x64 -> 128 bit multiply
+      of |arg1| by |arg2|.  Bits 127:64 of the product are written to
+      res->w64[1] and bits 63:0 to res->w64[0].  This is a dirty helper
+      only because a clean helper can't return a 128 bit value.  The loop
+      is a plain shift-and-xor with no data dependent branches and lots
+      of ILP; it could be fully unrolled if more performance is needed. */
+   UInt i;
+   ULong accHi = 0, accLo = 0;
+   ULong op2Hi = 0, op2Lo = arg2;
+   for (i = 0; i < 64; i++) {
+      /* Make |mask| be all 0s or all 1s, a copy of arg1[i].  Negating in
+         unsigned arithmetic avoids implementation-defined signed shifts. */
+      ULong mask = (ULong)0 - ((arg1 >> i) & 1);
+      accHi ^= (op2Hi & mask);
+      accLo ^= (op2Lo & mask);
+      /* do: op2Hi:op2Lo <<=u 1 */
+      op2Hi <<= 1;
+      op2Hi |= ((op2Lo >> 63) & 1);
+      op2Lo <<= 1;
+   }
+   res->w64[1] = accHi;
+   res->w64[0] = accLo;
+}
+
+
 /*---------------------------------------------------------------*/
 /*--- Crypto instruction helpers                              ---*/
 /*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
index 95907a8b78..a3a9078b07 100644
--- a/VEX/priv/guest_arm64_toIR.c
+++ b/VEX/priv/guest_arm64_toIR.c
@@ -11153,16 +11153,41 @@ Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
    if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
       /* -------- 0,1110  PMULL{2} -------- */
       /* Widens, and size refers to the narrow lanes. */
-      if (size != X00) return False;
-      IRTemp res
-         = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
-                                     getQReg128(nn), getQReg128(mm));
-      putQReg128(dd, mkexpr(res));
-      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
-      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
+      if (size != X00 && size != X11) return False;
+      IRTemp  res  = IRTemp_INVALID;
+      IRExpr* srcN = getQReg128(nn);
+      IRExpr* srcM = getQReg128(mm);
+      const HChar* arrNarrow = NULL;
+      const HChar* arrWide   = NULL;
+      if (size == X00) {
+         res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
+                                         srcN, srcM);
+         arrNarrow = nameArr_Q_SZ(bitQ, size);
+         arrWide   = nameArr_Q_SZ(1,    size+1);
+      } else {
+         /* The same thing as the X00 case, except we have to call
+            a helper to do it. */
+         vassert(size == X11);
+         res = newTemp(Ity_V128);
+         IROp slice
+            = is2 ? Iop_V128HIto64 : Iop_V128to64;
+         IRExpr** args
+            = mkIRExprVec_3( IRExpr_VECRET(),
+                             unop(slice, srcN), unop(slice, srcM));
+         IRDirty* di
+            = unsafeIRDirty_1_N( res, 0/*regparms*/,
+                                      "arm64g_dirtyhelper_PMULLQ",
+                                      &arm64g_dirtyhelper_PMULLQ, args);
+         stmt(IRStmt_Dirty(di));
+         /* We can't use nameArr_Q_SZ for this because it can't deal with
+            Q-sized (128 bit) results.  Hence do it by hand. */
+         arrNarrow = bitQ == 0 ? "1d" : "2d";
+         arrWide   = "1q";
+      }
+      putQReg128(dd, mkexpr(res));    
       DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
-          nameQReg128(dd), arrNarrow,
-          nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
+          nameQReg128(dd), arrWide,
+          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
       return True;
    }