Add a 128-bit variant of Iop_GetMSBs8x8 (Iop_GetMSBs8x16) and use it to
translate 128-bit PMOVMSKB as a single IROp,
rather than chopping it up into two 64-bit pieces in the front end.
git-svn-id: svn://svn.valgrind.org/vex/trunk@2590
static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx,
Long delta, Bool isAvx )
{
- /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
UChar modrm = getUChar(delta);
vassert(epartIsReg(modrm)); /* ensured by caller */
UInt rE = eregOfRexRM(pfx,modrm);
UInt rG = gregOfRexRM(pfx,modrm);
- IRTemp t0 = newTemp(Ity_I64);
- IRTemp t1 = newTemp(Ity_I64);
- IRTemp t5 = newTemp(Ity_I32);
- assign(t0, getXMMRegLane64(rE, 0));
- assign(t1, getXMMRegLane64(rE, 1));
- assign(t5,
- unop(Iop_16Uto32,
- binop(Iop_8HLto16,
- unop(Iop_GetMSBs8x8, mkexpr(t1)),
- unop(Iop_GetMSBs8x8, mkexpr(t0)))));
- putIReg32(rG, mkexpr(t5));
+ IRTemp t0 = newTemp(Ity_V128);
+ IRTemp t1 = newTemp(Ity_I32);
+ assign(t0, getXMMReg(rE));
+ assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
+ putIReg32(rG, mkexpr(t1));
DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
nameIReg32(rG));
delta += 1;
return delta;
}
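For reference, a minimal C model of what the IR emitted above computes, assuming the
16 byte lanes of the XMM register are viewed as a little-endian byte array; the name
pmovmskb_ref is hypothetical and used only for this sketch, not a VEX function.

static unsigned int pmovmskb_ref ( const unsigned char xmm[16] )
{
   unsigned int mask = 0;
   int i;
   for (i = 0; i < 16; i++)
      if (xmm[i] & 0x80)      /* MSB of byte lane i ...        */
         mask |= (1u << i);   /* ... becomes bit i of the mask */
   return mask;               /* bits 31..16 are always zero   */
}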
+ case Iop_GetMSBs8x16: {
+ /* Note: the following assumes the helper is of signature
+ UInt fn ( ULong w64hi, ULong w64Lo ),
+ and is not a regparm fn. */
+ HReg dst = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg rsp = hregAMD64_RSP();
+ fn = (HWord)h_generic_calc_GetMSBs8x16;
+ AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp);
+ AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
+ 16, vec, m16_rsp));
+ /* hi 64 bits into RDI -- the first arg */
+ addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
+ AMD64RMI_Mem(m8_rsp),
+ hregAMD64_RDI() )); /* 1st arg */
+ /* lo 64 bits into RSI -- the 2nd arg */
+ addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
+ AMD64RMI_Mem(m16_rsp),
+ hregAMD64_RSI() )); /* 2nd arg */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
+ /* MovxLQ is not exactly the right thing here. We just
+ need to get the bottom 16 bits of RAX into dst, and zero
+ out everything else. Assuming that the helper returns
+ a UInt with the top 16 bits zeroed out, it'll do,
+ though. */
+ addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
+ return dst;
+ }
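The argument marshalling above relies on the little-endian layout of the 16-byte
spill slot: the low 64 bits of the vector land at rsp-16 (m16_rsp) and the high 64
bits at rsp-8 (m8_rsp), which is why m8_rsp feeds the w64hi argument in RDI and
m16_rsp feeds the w64lo argument in RSI. A sketch of that split, with split_v128_le
being a hypothetical helper used only for illustration:

static void split_v128_le ( const unsigned char slot[16],
                            unsigned long long* w64hi,   /* -> RDI, 1st arg */
                            unsigned long long* w64lo )  /* -> RSI, 2nd arg */
{
   unsigned long long lo = 0, hi = 0;
   int i;
   for (i = 0; i < 8; i++) {
      lo |= (unsigned long long)slot[i]   << (8*i);  /* bytes 0..7  */
      hi |= (unsigned long long)slot[i+8] << (8*i);  /* bytes 8..15 */
   }
   *w64lo = lo;
   *w64hi = hi;
}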
+
default:
break;
}
res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
}
+UInt /*not-regparm*/
+ h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
+{
+ UInt r = 0;
+ if (w64hi & (1ULL << (64-1))) r |= (1<<15);
+ if (w64hi & (1ULL << (56-1))) r |= (1<<14);
+ if (w64hi & (1ULL << (48-1))) r |= (1<<13);
+ if (w64hi & (1ULL << (40-1))) r |= (1<<12);
+ if (w64hi & (1ULL << (32-1))) r |= (1<<11);
+ if (w64hi & (1ULL << (24-1))) r |= (1<<10);
+ if (w64hi & (1ULL << (16-1))) r |= (1<<9);
+ if (w64hi & (1ULL << ( 8-1))) r |= (1<<8);
+ if (w64lo & (1ULL << (64-1))) r |= (1<<7);
+ if (w64lo & (1ULL << (56-1))) r |= (1<<6);
+ if (w64lo & (1ULL << (48-1))) r |= (1<<5);
+ if (w64lo & (1ULL << (40-1))) r |= (1<<4);
+ if (w64lo & (1ULL << (32-1))) r |= (1<<3);
+ if (w64lo & (1ULL << (24-1))) r |= (1<<2);
+ if (w64lo & (1ULL << (16-1))) r |= (1<<1);
+ if (w64lo & (1ULL << ( 8-1))) r |= (1<<0);
+ return r;
+}
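A quick sanity check of the bit placement, given as an illustrative example rather
than part of the patch: the MSB of the lowest byte lane should land in result bit 0,
and the MSB of the highest lane in bit 15.

#include <assert.h>

static void test_GetMSBs8x16 ( void )
{
   /* lowest lane's MSB (bit 7 of w64lo) -> result bit 0 */
   assert(h_generic_calc_GetMSBs8x16(0ULL, 0x80ULL) == 0x0001);
   /* highest lane's MSB (bit 63 of w64hi) -> result bit 15 */
   assert(h_generic_calc_GetMSBs8x16(0x8000000000000000ULL, 0ULL) == 0x8000);
   /* all lanes' MSBs set -> full 16-bit mask */
   assert(h_generic_calc_GetMSBs8x16(~0ULL, ~0ULL) == 0xFFFF);
}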
/*---------------------------------------------------------------*/
/*--- end host_generic_simd128.c ---*/
extern VEX_REGPARM(3)
void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* );
+extern /*not-regparm*/
+ UInt h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo );
+
#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
/*---------------------------------------------------------------*/
case Iop_Reverse64_32x2: vex_printf("Reverse64_32x2"); return;
case Iop_Abs32Fx2: vex_printf("Abs32Fx2"); return;
case Iop_GetMSBs8x8: vex_printf("GetMSBs8x8"); return;
+ case Iop_GetMSBs8x16: vex_printf("GetMSBs8x16"); return;
case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return;
case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return;
case Iop_Left16: UNARY(Ity_I16,Ity_I16);
case Iop_CmpwNEZ32: case Iop_Left32: UNARY(Ity_I32,Ity_I32);
case Iop_CmpwNEZ64: case Iop_Left64: UNARY(Ity_I64,Ity_I64);
- case Iop_GetMSBs8x8: UNARY(Ity_I64, Ity_I8);
+
+ case Iop_GetMSBs8x8: UNARY(Ity_I64, Ity_I8);
+ case Iop_GetMSBs8x16: UNARY(Ity_V128, Ity_I16);
case Iop_MullU8: case Iop_MullS8:
BINARY(Ity_I8,Ity_I8, Ity_I16);
of arbitrary sign the result of the operation is 1.5. */
Iop_Rsqrts32Fx4,
-
/* --- Int to/from FP conversion --- */
/* Unlike the standard fp conversions, these irops take no
rounding mode argument. Instead the irop trailers _R{M,P,N,Z}
Iop_Perm8x16,
Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */
+ /* MISC CONVERSION -- get high bits of each byte lane, a la
+ x86/amd64 pmovmskb */
+ Iop_GetMSBs8x16, /* V128 -> I16 */
+
/* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate
See floating-point equivalents for details. */
Iop_Recip32x4, Iop_Rsqrte32x4,