static
IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
{
- IRTemp sHi = newTemp(Ity_I64);
- IRTemp sLo = newTemp(Ity_I64);
- IRTemp dHi = newTemp(Ity_I64);
- IRTemp dLo = newTemp(Ity_I64);
- IRTemp rHi = newTemp(Ity_I64);
- IRTemp rLo = newTemp(Ity_I64);
- IRTemp sevens = newTemp(Ity_I64);
- IRTemp mask0x80hi = newTemp(Ity_I64);
- IRTemp mask0x80lo = newTemp(Ity_I64);
- IRTemp maskBit3hi = newTemp(Ity_I64);
- IRTemp maskBit3lo = newTemp(Ity_I64);
- IRTemp sAnd7hi = newTemp(Ity_I64);
- IRTemp sAnd7lo = newTemp(Ity_I64);
- IRTemp permdHi = newTemp(Ity_I64);
- IRTemp permdLo = newTemp(Ity_I64);
- IRTemp res = newTemp(Ity_V128);
-
- assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
- assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
- assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
- assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
-
- assign( sevens, mkU64(0x0707070707070707ULL) );
-
- /* mask0x80hi = Not(SarN8x8(sHi,7))
- maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
- sAnd7hi = And(sHi,sevens)
- permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
- And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
- rHi = And(permdHi,mask0x80hi)
- */
- assign(
- mask0x80hi,
- unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
-
- assign(
- maskBit3hi,
- binop(Iop_SarN8x8,
- binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
- mkU8(7)));
-
- assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
-
- assign(
- permdHi,
- binop(
- Iop_Or64,
- binop(Iop_And64,
- binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
- mkexpr(maskBit3hi)),
- binop(Iop_And64,
- binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
- unop(Iop_Not64,mkexpr(maskBit3hi))) ));
-
- assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
-
- /* And the same for the lower half of the result. What fun. */
-
- assign(
- mask0x80lo,
- unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
-
- assign(
- maskBit3lo,
- binop(Iop_SarN8x8,
- binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
- mkU8(7)));
-
- assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
-
- assign(
- permdLo,
- binop(
- Iop_Or64,
- binop(Iop_And64,
- binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
- mkexpr(maskBit3lo)),
- binop(Iop_And64,
- binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
- unop(Iop_Not64,mkexpr(maskBit3lo))) ));
-
- assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
-
- assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
+ IRTemp halfMask = newTemp(Ity_I64);
+ assign(halfMask, mkU64(0x8F8F8F8F8F8F8F8FULL));
+ IRExpr* mask = binop(Iop_64HLtoV128, mkexpr(halfMask), mkexpr(halfMask));
+ IRTemp res = newTemp(Ity_V128);
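+ // For reference, a scalar sketch (not IR, and an illustration only) of what
+ // the assignment below computes per byte lane i, following the
+ // Iop_PermOrZero8x16 definition:
+ //
+ //    UChar ctl = sV[i];
+ //    res[i]    = (ctl & 0x80) ? 0 : dV[ctl & 15];
+ //
+ // The AND with 0x8F keeps only the zeroing bit (7) and the index bits
+ // (3:0), satisfying the IROp's requirement that bits [6:4] be zero.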
+ assign(res,
+ binop(Iop_PermOrZero8x16,
+ mkexpr(dV),
+ // Mask off bits [6:4] of each source operand lane
+ binop(Iop_AndV128, mkexpr(sV), mask)
+ ));
return res;
}
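+ // The 64-bit (MMX) variant below is analogous: per lane, a control byte
+ // with bit 7 set produces zero, otherwise the result is dV[ctl & 7], so
+ // the 0x87 mask keeps just the zeroing bit and the three index bits.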
putMMXReg(
gregLO3ofRM(modrm),
binop(
- Iop_And64,
- /* permute the lanes */
- binop(
- Iop_Perm8x8,
- mkexpr(dV),
- binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
- ),
- /* mask off lanes which have (index & 0x80) == 0x80 */
- unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
+ Iop_PermOrZero8x8,
+ mkexpr(dV),
+ // Mask off bits [6:3] of each source operand lane
+ binop(Iop_And64, mkexpr(sV), mkU64(0x8787878787878787ULL))
)
);
goto decode_success;
case Asse_UNPCKLW: return "punpcklw";
case Asse_UNPCKLD: return "punpckld";
case Asse_UNPCKLQ: return "punpcklq";
+ case Asse_PSHUFB: return "pshufb";
default: vpanic("showAMD64SseOp");
}
}
case Asse_UNPCKLW: XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
case Asse_UNPCKLD: XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
case Asse_UNPCKLQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
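+ // PSHUFB xmm,xmm/m128 is an SSSE3 insn in the three-byte opcode map:
+ // 66 [REX] 0F 38 00 /r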
+ case Asse_PSHUFB: XX(0x66); XX(rex);
+ XX(0x0F); XX(0x38); XX(0x00); break;
default: goto bad;
}
p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
Asse_SAR16, Asse_SAR32,
Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
- Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ
+ Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ,
+ Asse_PSHUFB // Only for SSSE3 capable hosts
}
AMD64SseOp;
fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
case Iop_CatEvenLanes16x4:
fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
- case Iop_Perm8x8:
- fn = (HWord)h_generic_calc_Perm8x8; break;
+ case Iop_PermOrZero8x8:
+ fn = (HWord)h_generic_calc_PermOrZero8x8; break;
case Iop_Max8Ux8:
fn = (HWord)h_generic_calc_Max8Ux8; break;
return dst;
}
+ case Iop_PermOrZero8x16:
+ if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) {
+ op = Asse_PSHUFB;
+ goto do_SseReRg;
+ }
+ // Otherwise we'll have to generate a call to
+ // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a
+ // host which doesn't have SSSE3, in which case we don't expect this
+ // IROp to enter the compilation pipeline in the first place.
+ break;
+
case Iop_QNarrowBin32Sto16Sx8:
op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
case Iop_QNarrowBin16Sto8Sx16:
res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
}
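+// Reference implementation of PermOrZero8x16, kept only as documentation:
+// it is unused because the amd64 backend generates PSHUFB directly (see
+// the comment at the declaration in the header).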
+//void VEX_REGPARM(3)
+// h_generic_calc_PermOrZero8x16 ( /*OUT*/V128* res,
+// V128* argL, V128* argR )
+//{
+// for (UInt i = 0; i < 16; i++) {
+// UChar ix = argR->w8[i];
+// Char zeroingMask = (Char)ix;
+// zeroingMask ^= 0x80;
+// zeroingMask >>= 7;
+// ix &= 15;
+// res->w8[i] = (argL->w8[ix] & zeroingMask) & 0xFF;
+// }
+//}
+
UInt /*not-regparm*/
h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
{
extern VEX_REGPARM(3)
void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* );
+// This is correct and tested, but isn't used because we just generate
+// PSHUFB on amd64 instead.
+//extern VEX_REGPARM(3)
+// void h_generic_calc_PermOrZero8x16 ( /*OUT*/V128*, V128*, V128* );
+
extern /*not-regparm*/
UInt h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo );
return toUChar((w64 >> (8*ix)) & 0xFF);
}
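+// As the plain index helper above, but with PSHUFB-style zeroing: if bit 7
+// of 'ix' is set the result byte is zero. The xor-with-0x80 followed by an
+// arithmetic shift right by 7 turns bit 7 into a lane mask that is 0x00
+// when the bit was set and 0xFF when it was clear.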
+static inline UChar indexOrZero8x8 ( ULong w64, UChar ix ) {
+ Char zeroingMask = (Char)ix;
+ zeroingMask ^= 0x80;
+ zeroingMask >>= 7;
+ ix &= 7;
+ return toUChar( ((w64 >> (8*ix)) & zeroingMask) & 0xFF );
+}
+
/* Scalar helpers. */
);
}
-/* misc hack looking for a proper home */
+/* ------------ Permutation ------------ */
+
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
return mk8x8(
);
}
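+// PermOrZero8x8: as Perm8x8 above, except that a control byte with bit 7
+// set zeroes the corresponding result lane. Worked example (illustrative):
+// with aa = 0x0807060504030201 and bb = 0x8000000000000003, lane 0 selects
+// aa's byte 3 (0x04), lanes 1..6 select byte 0 (0x01), and lane 7 is
+// zeroed, giving 0x0001010101010104.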
+ULong h_generic_calc_PermOrZero8x8 ( ULong aa, ULong bb )
+{
+ return mk8x8(
+ indexOrZero8x8(aa, sel8x8_7(bb)),
+ indexOrZero8x8(aa, sel8x8_6(bb)),
+ indexOrZero8x8(aa, sel8x8_5(bb)),
+ indexOrZero8x8(aa, sel8x8_4(bb)),
+ indexOrZero8x8(aa, sel8x8_3(bb)),
+ indexOrZero8x8(aa, sel8x8_2(bb)),
+ indexOrZero8x8(aa, sel8x8_1(bb)),
+ indexOrZero8x8(aa, sel8x8_0(bb))
+ );
+}
+
/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
equals or exceeds the lane width, the shift amount is masked so
extern ULong h_generic_calc_CatOddLanes16x4 ( ULong, ULong );
extern ULong h_generic_calc_CatEvenLanes16x4 ( ULong, ULong );
-extern ULong h_generic_calc_Perm8x8 ( ULong, ULong );
+extern ULong h_generic_calc_Perm8x8 ( ULong, ULong );
+extern ULong h_generic_calc_PermOrZero8x8 ( ULong, ULong );
extern ULong h_generic_calc_ShlN8x8 ( ULong, UInt );
extern ULong h_generic_calc_ShlN16x4 ( ULong, UInt );
case Iop_Sal32x2: vex_printf("Sal32x2"); return;
case Iop_Sal64x1: vex_printf("Sal64x1"); return;
case Iop_Perm8x8: vex_printf("Perm8x8"); return;
+ case Iop_PermOrZero8x8: vex_printf("PermOrZero8x8"); return;
case Iop_Reverse8sIn16_x4: vex_printf("Reverse8sIn16_x4"); return;
case Iop_Reverse8sIn32_x2: vex_printf("Reverse8sIn32_x2"); return;
case Iop_Reverse16sIn32_x2: vex_printf("Reverse16sIn32_x2"); return;
case Iop_SliceV128: vex_printf("SliceV128"); return;
case Iop_Perm8x16: vex_printf("Perm8x16"); return;
+ case Iop_PermOrZero8x16: vex_printf("PermOrZero8x16"); return;
case Iop_Perm32x4: vex_printf("Perm32x4"); return;
case Iop_Perm8x16x2: vex_printf("Perm8x16x2"); return;
case Iop_Reverse8sIn16_x8: vex_printf("Reverse8sIn16_x8"); return;
case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4:
case Iop_InterleaveOddLanes8x8: case Iop_InterleaveEvenLanes8x8:
case Iop_InterleaveOddLanes16x4: case Iop_InterleaveEvenLanes16x4:
- case Iop_Perm8x8:
+ case Iop_Perm8x8: case Iop_PermOrZero8x8:
case Iop_Max8Ux8: case Iop_Max16Ux4: case Iop_Max32Ux2:
case Iop_Max8Sx8: case Iop_Max16Sx4: case Iop_Max32Sx2:
case Iop_Max32Fx2: case Iop_Min32Fx2:
case Iop_PackOddLanes8x16: case Iop_PackEvenLanes8x16:
case Iop_PackOddLanes16x8: case Iop_PackEvenLanes16x8:
case Iop_PackOddLanes32x4: case Iop_PackEvenLanes32x4:
- case Iop_Perm8x16: case Iop_Perm32x4:
+ case Iop_Perm8x16: case Iop_PermOrZero8x16:
+ case Iop_Perm32x4:
case Iop_RecipStep32Fx4: case Iop_RecipStep64Fx2:
case Iop_RSqrtStep32Fx4: case Iop_RSqrtStep64Fx2:
case Iop_CipherV128:
as indexed by control vector bytes:
for i in 0 .. 7 . result[i] = argL[ argR[i] ]
argR[i] values may only be in the range 0 .. 7, else behaviour
- is undefined. */
+ is undefined. That is, argR[i][7:3] must be zero. */
Iop_Perm8x8,
+ /* PERMUTING with optional zeroing:
+ for i in 0 .. 7 . result[i] = if argR[i] bit 7 is set
+ then zero else argL[ argR[i] ]
+ argR[i][6:3] must be zero, else behaviour is undefined.
+ */
+ Iop_PermOrZero8x8,
+
/* MISC CONVERSION -- get high bits of each byte lane, a la
x86/amd64 pmovmskb */
Iop_GetMSBs8x8, /* I64 -> I8 */
as indexed by control vector bytes:
for i in 0 .. 15 . result[i] = argL[ argR[i] ]
argR[i] values may only be in the range 0 .. 15, else behaviour
- is undefined. */
+ is undefined. That is, argR[i][7:4] must be zero. */
Iop_Perm8x16,
Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */
+ /* PERMUTING with optional zeroing:
+ for i in 0 .. 15 . result[i] = if argR[i] bit 7 is set
+ then zero else argL[ argR[i] ]
+ argR[i][6:4] must be zero, else behaviour is undefined.
+ */
+ Iop_PermOrZero8x16,
+
/* same, but Triop (argL consists of two 128-bit parts) */
/* correct range for argR values is 0..31 */
/* (V128, V128, V128) -> V128 */
complainIfUndefined(mce, atom2, NULL);
return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
- /* Perm8x8: rearrange values in left arg using steering values
- from right arg. So rearrange the vbits in the same way but
- pessimise wrt steering values. */
+ /* Perm8x8: rearrange values in left arg using steering values from
+ right arg. So rearrange the vbits in the same way but pessimise wrt
+ steering values. We assume that unused bits in the steering value
+ are defined zeros, so we can safely PCast within each lane of the
+ steering value without having to take precautions to avoid a
+ dependency on those unused bits.
+
+ This is also correct for PermOrZero8x8, but it is a bit subtle. For
+ each lane, if bit 7 of the steering value is zero, then we'll steer
+ the shadow value exactly as per Perm8x8. If that bit is one, then
+ the operation will set the resulting (concrete) value to zero. That
+ means it is defined, and should have a shadow value of zero. Hence
+ in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
+ as Perm8x8) and then pessimise against the steering values. */
case Iop_Perm8x8:
+ case Iop_PermOrZero8x8:
return mkUifU64(
mce,
assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
complainIfUndefined(mce, atom2, NULL);
return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
- /* Perm8x16: rearrange values in left arg using steering values
- from right arg. So rearrange the vbits in the same way but
- pessimise wrt steering values. Perm32x4 ditto. */
+ /* Perm8x16: rearrange values in left arg using steering values
+ from right arg. So rearrange the vbits in the same way but
+ pessimise wrt steering values. Perm32x4 ditto. */
+ /* PermOrZero8x16: see comments above for PermOrZero8x8. */
case Iop_Perm8x16:
+ case Iop_PermOrZero8x16:
return mkUifUV128(
mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
{ DEFOP(Iop_Reverse16sIn64_x1, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Reverse32sIn64_x1, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Perm8x8, UNDEF_UNKNOWN), },
+ { DEFOP(Iop_PermOrZero8x8, UNDEF_UNKNOWN), },
{ DEFOP(Iop_GetMSBs8x8, UNDEF_UNKNOWN), },
{ DEFOP(Iop_RecipEst32Ux2, UNDEF_UNKNOWN), },
{ DEFOP(Iop_RSqrtEst32Ux2, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Reverse32sIn64_x2, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Reverse1sIn8_x16, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Perm8x16, UNDEF_UNKNOWN), },
+ { DEFOP(Iop_PermOrZero8x16, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Perm32x4, UNDEF_UNKNOWN), },
{ DEFOP(Iop_Perm8x16x2, UNDEF_UNKNOWN), },
{ DEFOP(Iop_GetMSBs8x16, UNDEF_UNKNOWN), },