From: Julian Seward Date: Sat, 26 Jan 2019 17:00:41 +0000 (+0100) Subject: amd64 pipeline: generate a much better translation for PMADDUBSW. X-Git-Tag: VALGRIND_3_15_0~91 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2656009e6f88d0c60751baa9c6a9abd7b0e2cac6;p=thirdparty%2Fvalgrind.git amd64 pipeline: generate a much better translation for PMADDUBSW. This seems pretty common in some codecs, and the existing translation was somewhat longwinded. --- diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index fea0ecadf8..86bf970448 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -15689,26 +15689,8 @@ static Long dis_PHADD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta, static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV ) { - IRTemp sVoddsSX = newTemp(Ity_V128); - IRTemp sVevensSX = newTemp(Ity_V128); - IRTemp dVoddsZX = newTemp(Ity_V128); - IRTemp dVevensZX = newTemp(Ity_V128); - /* compute dV unsigned x sV signed */ - assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) ); - assign( sVevensSX, binop(Iop_SarN16x8, - binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)), - mkU8(8)) ); - assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) ); - assign( dVevensZX, binop(Iop_ShrN16x8, - binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)), - mkU8(8)) ); - IRTemp res = newTemp(Ity_V128); - assign( res, binop(Iop_QAdd16Sx8, - binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)), - binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX)) - ) - ); + assign(res, binop(Iop_PwExtUSMulQAdd8x16, mkexpr(dV), mkexpr(sV))); return res; } diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c index 8e55197444..9a2e2bd16b 100644 --- a/VEX/priv/host_amd64_defs.c +++ b/VEX/priv/host_amd64_defs.c @@ -589,6 +589,7 @@ const HChar* showAMD64SseOp ( AMD64SseOp op ) { case Asse_UNPCKLD: return "punpckld"; case Asse_UNPCKLQ: return "punpcklq"; case Asse_PSHUFB: return "pshufb"; + case Asse_PMADDUBSW: return 
"pmaddubsw"; default: vpanic("showAMD64SseOp"); } } @@ -3871,6 +3872,8 @@ Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc, case Asse_UNPCKLQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break; case Asse_PSHUFB: XX(0x66); XX(rex); XX(0x0F); XX(0x38); XX(0x00); break; + case Asse_PMADDUBSW:XX(0x66); XX(rex); + XX(0x0F); XX(0x38); XX(0x04); break; default: goto bad; } p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst), diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h index 64bd810247..0a665fec09 100644 --- a/VEX/priv/host_amd64_defs.h +++ b/VEX/priv/host_amd64_defs.h @@ -343,7 +343,9 @@ typedef Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW, Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ, Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ, - Asse_PSHUFB // Only for SSSE3 capable hosts + // Only for SSSE3 capable hosts: + Asse_PSHUFB, + Asse_PMADDUBSW } AMD64SseOp; diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index faddc685a4..1f226d9807 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -3529,6 +3529,13 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) // IROp to enter the compilation pipeline in the first place. 
break; + case Iop_PwExtUSMulQAdd8x16: + if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) { + op = Asse_PMADDUBSW; + goto do_SseReRg; + } + break; + case Iop_QNarrowBin32Sto16Sx8: op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg; case Iop_QNarrowBin16Sto8Sx16: diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c index 61b4a6dcff..6c12249636 100644 --- a/VEX/priv/ir_defs.c +++ b/VEX/priv/ir_defs.c @@ -817,6 +817,7 @@ void ppIROp ( IROp op ) case Iop_PwAddL8Sx16: vex_printf("PwAddL8Sx16"); return; case Iop_PwAddL16Sx8: vex_printf("PwAddL16Sx8"); return; case Iop_PwAddL32Sx4: vex_printf("PwAddL32Sx4"); return; + case Iop_PwExtUSMulQAdd8x16: vex_printf("PwExtUSMulQAdd8x16"); return; case Iop_Sub8x16: vex_printf("Sub8x16"); return; case Iop_Sub16x8: vex_printf("Sub16x8"); return; @@ -3159,6 +3160,7 @@ void typeOfPrimop ( IROp op, case Iop_Rsh32Ux4: case Iop_Rsh64Ux2: case Iop_MulI128by10E: case Iop_MulI128by10ECarry: + case Iop_PwExtUSMulQAdd8x16: BINARY(Ity_V128,Ity_V128, Ity_V128); case Iop_Perm8x16x2: diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index 849b12455d..61d22016d8 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -1626,6 +1626,15 @@ typedef Iop_PwAddL8Ux16, Iop_PwAddL16Ux8, Iop_PwAddL32Ux4, Iop_PwAddL64Ux2, Iop_PwAddL8Sx16, Iop_PwAddL16Sx8, Iop_PwAddL32Sx4, + /* This is amd64 PMADDUBSW, (V128, V128) -> V128. For each adjacent pair + of bytes [a,b] in the first arg and [c,d] in the second, computes: + signed/signed sat to 16 bits ( zxTo16(a) * sxTo16(c) + + zxTo16(b) * sxTo16(d) ) + This exists because it's frequently used and there's no reasonably + concise way to express it using other IROps. + */ + Iop_PwExtUSMulQAdd8x16, + /* Other unary pairwise ops */ /* Vector bit matrix transpose. 
(V128) -> V128 */ diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c index 4706f277d9..e3086b613a 100644 --- a/memcheck/mc_translate.c +++ b/memcheck/mc_translate.c @@ -3843,6 +3843,11 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce, case Iop_QDMulHi16Sx8: case Iop_QRDMulHi16Sx8: case Iop_PolynomialMulAdd16x8: + /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each + 16-bit chunk of the output is formed from corresponding 16-bit chunks + of the input args, so we can treat it like an other binary 16x8 + operation. That's despite it having '8x16' in its name. */ + case Iop_PwExtUSMulQAdd8x16: return binary16Ix8(mce, vatom1, vatom2); case Iop_Sub32x4: diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c index 97d953fb55..3005599bf3 100644 --- a/memcheck/tests/vbit-test/irops.c +++ b/memcheck/tests/vbit-test/irops.c @@ -805,6 +805,7 @@ static irop_t irops[] = { { DEFOP(Iop_PwAddL8Sx16, UNDEF_UNKNOWN), }, { DEFOP(Iop_PwAddL16Sx8, UNDEF_UNKNOWN), }, { DEFOP(Iop_PwAddL32Sx4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_PwExtUSMulQAdd8x16, UNDEF_UNKNOWN), }, { DEFOP(Iop_Abs8x16, UNDEF_UNKNOWN), }, { DEFOP(Iop_Abs16x8, UNDEF_UNKNOWN), }, { DEFOP(Iop_Abs32x4, UNDEF_UNKNOWN), },