From: Julian Seward Date: Sun, 24 Jul 2016 18:58:21 +0000 (+0000) Subject: Implement PMULL 1q,1d,1d and PMULL2 1q,2d,2d. n-i-bz. X-Git-Tag: svn/VALGRIND_3_12_0^2~34 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9f109ab25f3ae66e6420c2ff23c75f622afa1c15;p=thirdparty%2Fvalgrind.git Implement PMULL 1q,1d,1d and PMULL2 1q,2d,2d. n-i-bz. git-svn-id: svn://svn.valgrind.org/vex/trunk@3232 --- diff --git a/VEX/priv/guest_arm64_defs.h b/VEX/priv/guest_arm64_defs.h index dcaf5a062b..7ec6439d13 100644 --- a/VEX/priv/guest_arm64_defs.h +++ b/VEX/priv/guest_arm64_defs.h @@ -115,6 +115,9 @@ ULong arm64g_calculate_condition ( /* ARM64Condcode << 4 | cc_op */ extern ULong arm64g_dirtyhelper_MRS_CNTVCT_EL0 ( void ); +extern void arm64g_dirtyhelper_PMULLQ ( /*OUT*/V128* res, + ULong arg1, ULong arg2 ); + extern void arm64g_dirtyhelper_AESE ( /*OUT*/V128* res, ULong argHi, ULong argLo ); extern void arm64g_dirtyhelper_AESD ( /*OUT*/V128* res, diff --git a/VEX/priv/guest_arm64_helpers.c b/VEX/priv/guest_arm64_helpers.c index e31ff2d352..5346accfa7 100644 --- a/VEX/priv/guest_arm64_helpers.c +++ b/VEX/priv/guest_arm64_helpers.c @@ -692,6 +692,33 @@ ULong arm64g_dirtyhelper_MRS_CNTVCT_EL0 ( void ) } +void arm64g_dirtyhelper_PMULLQ ( /*OUT*/V128* res, ULong arg1, ULong arg2 ) +{ + /* This doesn't need to be a dirty helper, except for the fact that + a clean helper can't return a 128 bit value. This is a pretty + lame implementation of PMULLQ, but at least it doesn't contain any + data dependent branches, and has lots of ILP. I guess we could unroll + the loop completely and offer extensive prayers to the gods of ILP + if more performance is needed. */ + UInt i; + ULong accHi = 0, accLo = 0; + ULong op2Hi = 0, op2Lo = arg2; + for (i = 0; i < 64; i++) { + /* Make |mask| be all 0s or all 1s, a copy of arg1[i] */ + Long mask = arg1 << (63-i); + mask >>= 63; + accHi ^= (op2Hi & mask); + accLo ^= (op2Lo & mask); + /* do: op2Hi:op2Lo <<=u 1 */ + op2Hi <<= 1; + op2Hi |= ((op2Lo >> 63) & 1); + op2Lo <<= 1; + } + res->w64[1] = accHi; + res->w64[0] = accLo; +} + + /*---------------------------------------------------------------*/ /*--- Crypto instruction helpers ---*/ /*---------------------------------------------------------------*/ diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c index 95907a8b78..a3a9078b07 100644 --- a/VEX/priv/guest_arm64_toIR.c +++ b/VEX/priv/guest_arm64_toIR.c @@ -11153,16 +11153,41 @@ Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn) if (bitU == 0 && opcode == BITS4(1,1,1,0)) { /* -------- 0,1110 PMULL{2} -------- */ /* Widens, and size refers to the narrow lanes. */ - if (size != X00) return False; - IRTemp res - = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8, - getQReg128(nn), getQReg128(mm)); - putQReg128(dd, mkexpr(res)); - const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size); - const HChar* arrWide = nameArr_Q_SZ(1, size+1); + if (size != X00 && size != X11) return False; + IRTemp res = IRTemp_INVALID; + IRExpr* srcN = getQReg128(nn); + IRExpr* srcM = getQReg128(mm); + const HChar* arrNarrow = NULL; + const HChar* arrWide = NULL; + if (size == X00) { + res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8, + srcN, srcM); + arrNarrow = nameArr_Q_SZ(bitQ, size); + arrWide = nameArr_Q_SZ(1, size+1); + } else { + /* The same thing as the X00 case, except we have to call + a helper to do it. */ + vassert(size == X11); + res = newTemp(Ity_V128); + IROp slice + = is2 ? Iop_V128HIto64 : Iop_V128to64; + IRExpr** args + = mkIRExprVec_3( IRExpr_VECRET(), + unop(slice, srcN), unop(slice, srcM)); + IRDirty* di + = unsafeIRDirty_1_N( res, 0/*regparms*/, + "arm64g_dirtyhelper_PMULLQ", + &arm64g_dirtyhelper_PMULLQ, args); + stmt(IRStmt_Dirty(di)); + /* We can't use nameArr_Q_SZ for this because it can't deal with + Q-sized (128 bit) results. Hence do it by hand. */ + arrNarrow = bitQ == 0 ? "1d" : "2d"; + arrWide = "1q"; + } + putQReg128(dd, mkexpr(res)); DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "", - nameQReg128(dd), arrNarrow, - nameQReg128(nn), arrWide, nameQReg128(mm), arrWide); + nameQReg128(dd), arrWide, + nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow); return True; }