static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
{
- IRTemp sVoddsSX = newTemp(Ity_V128);
- IRTemp sVevensSX = newTemp(Ity_V128);
- IRTemp dVoddsZX = newTemp(Ity_V128);
- IRTemp dVevensZX = newTemp(Ity_V128);
- /* compute dV unsigned x sV signed */
- assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
- assign( sVevensSX, binop(Iop_SarN16x8,
- binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
- mkU8(8)) );
- assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
- assign( dVevensZX, binop(Iop_ShrN16x8,
- binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
- mkU8(8)) );
-
IRTemp res = newTemp(Ity_V128);
- assign( res, binop(Iop_QAdd16Sx8,
- binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
- binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
- )
- );
+ assign(res, binop(Iop_PwExtUSMulQAdd8x16, mkexpr(dV), mkexpr(sV)));
return res;
}
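For reference, the removed expansion produced the same value by hand: the odd bytes of sV were extracted sign-extended with an arithmetic right shift by 8 within each 16-bit lane, the even bytes with a left shift by 8 followed by the same arithmetic right shift, and the dV bytes were extracted zero-extended with the logical-shift counterparts; the lane-wise products were then combined with a signed saturating add. A minimal scalar sketch of those per-lane extraction tricks, with illustrative helper names that are not part of the patch:

    #include <stdint.h>

    /* x is one 16-bit lane holding bytes [even, odd]. */
    static int16_t  odd_sx (uint16_t x) { return (int16_t)x >> 8;     }  /* SarN16x8 by 8               */
    static int16_t  even_sx(uint16_t x) { return (int8_t)(x & 0xFF);  }  /* ShlN16x8 then SarN16x8 by 8 */
    static uint16_t odd_zx (uint16_t x) { return x >> 8;              }  /* ShrN16x8 by 8               */
    static uint16_t even_zx(uint16_t x) { return (uint8_t)(x & 0xFF); }  /* ShlN16x8 then ShrN16x8 by 8 */

The new Iop_PwExtUSMulQAdd8x16 expresses the whole multiply-and-saturating-add in one node, so the backend can emit PMADDUBSW directly on SSSE3-capable hosts instead of having to re-recognise this shift pattern.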
case Asse_UNPCKLD: return "punpckld";
case Asse_UNPCKLQ: return "punpcklq";
case Asse_PSHUFB: return "pshufb";
+ case Asse_PMADDUBSW: return "pmaddubsw";
default: vpanic("showAMD64SseOp");
}
}
case Asse_UNPCKLQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
case Asse_PSHUFB: XX(0x66); XX(rex);
XX(0x0F); XX(0x38); XX(0x00); break;
+ case Asse_PMADDUBSW:XX(0x66); XX(rex);
+ XX(0x0F); XX(0x38); XX(0x04); break;
default: goto bad;
}
p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ,
- Asse_PSHUFB // Only for SSSE3 capable hosts
+ // Only for SSSE3 capable hosts:
+ Asse_PSHUFB,
+ Asse_PMADDUBSW
}
AMD64SseOp;
// IROp to enter the compilation pipeline in the first place.
break;
+ case Iop_PwExtUSMulQAdd8x16:
+ if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
+ op = Asse_PMADDUBSW;
+ goto do_SseReRg;
+ }
+ break;
+
case Iop_QNarrowBin32Sto16Sx8:
op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
case Iop_QNarrowBin16Sto8Sx16:
case Iop_PwAddL8Sx16: vex_printf("PwAddL8Sx16"); return;
case Iop_PwAddL16Sx8: vex_printf("PwAddL16Sx8"); return;
case Iop_PwAddL32Sx4: vex_printf("PwAddL32Sx4"); return;
+ case Iop_PwExtUSMulQAdd8x16: vex_printf("PwExtUSMulQAdd8x16"); return;
case Iop_Sub8x16: vex_printf("Sub8x16"); return;
case Iop_Sub16x8: vex_printf("Sub16x8"); return;
case Iop_Rsh32Ux4: case Iop_Rsh64Ux2:
case Iop_MulI128by10E:
case Iop_MulI128by10ECarry:
+ case Iop_PwExtUSMulQAdd8x16:
BINARY(Ity_V128,Ity_V128, Ity_V128);
case Iop_Perm8x16x2:
Iop_PwAddL8Ux16, Iop_PwAddL16Ux8, Iop_PwAddL32Ux4, Iop_PwAddL64Ux2,
Iop_PwAddL8Sx16, Iop_PwAddL16Sx8, Iop_PwAddL32Sx4,
+ /* This is amd64 PMADDUBSW, (V128, V128) -> V128.  For each adjacent pair
+    of bytes [a,b] in the first arg and the corresponding pair [c,d] in the
+    second, computes:
+       signed saturate to 16 bits ( zxTo16(a) * sxTo16(c)
+                                    + zxTo16(b) * sxTo16(d) )
+    This exists because it's frequently used and there's no reasonably
+    concise way to express it using other IROps.
+ */
+ Iop_PwExtUSMulQAdd8x16,
+
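As a sanity check on the formula above, here is a scalar model of one 16-bit output lane, assuming arg1 holds the bytes [a,b] of the first operand and arg2 the corresponding [c,d] of the second; the helper name is made up for illustration:

    #include <stdint.h>

    /* One 16-bit lane of Iop_PwExtUSMulQAdd8x16 / PMADDUBSW: first-operand
       bytes are zero-extended, second-operand bytes sign-extended, and the
       sum of products saturates to a signed 16-bit value. */
    static int16_t pmaddubsw_lane(uint16_t arg1, uint16_t arg2)
    {
       int32_t za = arg1 & 0xFF,           zb = arg1 >> 8;            /* zxTo16(a), zxTo16(b) */
       int32_t sc = (int8_t)(arg2 & 0xFF), sd = (int8_t)(arg2 >> 8);  /* sxTo16(c), sxTo16(d) */
       int32_t sum = za * sc + zb * sd;
       if (sum >  32767) return  32767;   /* signed saturation */
       if (sum < -32768) return -32768;
       return (int16_t)sum;
    }

Saturation matters in practice: bytes 0xFF,0xFF against 0x7F,0x7F give 255*127 + 255*127 = 64770, which clips to 0x7FFF.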
/* Other unary pairwise ops */
/* Vector bit matrix transpose. (V128) -> V128 */
case Iop_QDMulHi16Sx8:
case Iop_QRDMulHi16Sx8:
case Iop_PolynomialMulAdd16x8:
+ /* PwExtUSMulQAdd8x16 is a bit subtle.  Each 16-bit chunk of the output is
+    formed only from the corresponding 16-bit chunks of the input args, so we
+    can treat it like any other binary 16x8 operation, despite the '8x16' in
+    its name. */
+ case Iop_PwExtUSMulQAdd8x16:
return binary16Ix8(mce, vatom1, vatom2);
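On the V-bit side this means the generic binary 16x8 rule is enough; if I read binary16Ix8 correctly, it unions the two operands' shadow vectors and then pessimistically widens within each 16-bit lane, roughly as in this scalar sketch (the helper name is illustrative, not memcheck's API):

    #include <stdint.h>

    /* Assumed per-lane V-bit rule: a set bit means "undefined".  Any
       undefined bit in either 16-bit input lane makes the whole
       corresponding output lane undefined. */
    static uint16_t vbits_for_lane(uint16_t vbits1, uint16_t vbits2)
    {
       uint16_t u = vbits1 | vbits2;       /* union of undefinedness */
       return u != 0 ? 0xFFFFu : 0x0000u;  /* pessimistic cast within the lane */
    }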
case Iop_Sub32x4:
{ DEFOP(Iop_PwAddL8Sx16, UNDEF_UNKNOWN), },
{ DEFOP(Iop_PwAddL16Sx8, UNDEF_UNKNOWN), },
{ DEFOP(Iop_PwAddL32Sx4, UNDEF_UNKNOWN), },
+ { DEFOP(Iop_PwExtUSMulQAdd8x16, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Abs8x16, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Abs16x8, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Abs32x4, UNDEF_UNKNOWN), },