From ec735c8e8177baefec436052c468516a6c7cbe49 Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Thu, 13 Dec 2012 18:29:56 +0000
Subject: [PATCH] Implement 128-bit PMOVMSKB using a single new primop
 (Iop_GetMSBs8x16) rather than chopping it up into two 64-bit pieces in the
 front end.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2590
---
 VEX/priv/guest_amd64_toIR.c     | 17 +++++------------
 VEX/priv/host_amd64_isel.c      | 31 +++++++++++++++++++++++++++++++
 VEX/priv/host_generic_simd128.c | 22 ++++++++++++++++++++++
 VEX/priv/host_generic_simd128.h |  3 +++
 VEX/priv/ir_defs.c              |  5 ++++-
 VEX/pub/libvex_ir.h             |  5 ++++-
 6 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 9e8df2694f..b667c328d5 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -10274,22 +10274,15 @@ static Long dis_CVTDQ2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
 static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx )
 {
-   /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
    UChar modrm = getUChar(delta);
    vassert(epartIsReg(modrm)); /* ensured by caller */
    UInt rE = eregOfRexRM(pfx,modrm);
    UInt rG = gregOfRexRM(pfx,modrm);
-   IRTemp t0 = newTemp(Ity_I64);
-   IRTemp t1 = newTemp(Ity_I64);
-   IRTemp t5 = newTemp(Ity_I32);
-   assign(t0, getXMMRegLane64(rE, 0));
-   assign(t1, getXMMRegLane64(rE, 1));
-   assign(t5,
-          unop(Iop_16Uto32,
-             binop(Iop_8HLto16,
-                unop(Iop_GetMSBs8x8, mkexpr(t1)),
-                unop(Iop_GetMSBs8x8, mkexpr(t0)))));
-   putIReg32(rG, mkexpr(t5));
+   IRTemp t0 = newTemp(Ity_V128);
+   IRTemp t1 = newTemp(Ity_I32);
+   assign(t0, getXMMReg(rE));
+   assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
+   putIReg32(rG, mkexpr(t1));
    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "",
        nameXMMReg(rE), nameIReg32(rG));
    delta += 1;
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 98e90f7ba4..d6f507e998 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -1605,6 +1605,37 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
          return dst;
       }
 
+      case Iop_GetMSBs8x16: {
+         /* Note: the following assumes the helper is of signature
+               UInt fn ( ULong w64hi, ULong w64Lo ),
+            and is not a regparm fn. */
+         HReg dst = newVRegI(env);
+         HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+         HReg rsp = hregAMD64_RSP();
+         fn = (HWord)h_generic_calc_GetMSBs8x16;
+         AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
+         AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
+         addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
+                                          16, vec, m16_rsp));
+         /* hi 64 bits into RDI -- the first arg */
+         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
+                                          AMD64RMI_Mem(m8_rsp),
+                                          hregAMD64_RDI() )); /* 1st arg */
+         /* lo 64 bits into RSI -- the 2nd arg */
+         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
+                                          AMD64RMI_Mem(m16_rsp),
+                                          hregAMD64_RSI() )); /* 2nd arg */
+         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
+         /* MovxLQ is not exactly the right thing here.  We just
+            need to get the bottom 8 bits of RAX into dst, and zero
+            out everything else.  Assuming that the helper returns
+            a UInt with the top 24 bits zeroed out, it'll do,
+            though. */
+         addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
+         return dst;
+      }
+
       default: break;
    }
 
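Aside, not part of the patch: the Iop_GetMSBs8x16 case above stores the V128 at [rsp-16 .. rsp-1] with SseLdSt and then reloads the two 64-bit halves as the helper's arguments, so on a little-endian host the load through m16_rsp (rsp-16) picks up byte lanes 0..7 (w64lo) and the load through m8_rsp (rsp-8) picks up lanes 8..15 (w64hi). The standalone C99 sketch below only models that split; the array name and test pattern are made up for illustration and it assumes a little-endian host such as amd64.

   #include <assert.h>
   #include <stdio.h>
   #include <string.h>

   typedef unsigned long long ULong;
   typedef unsigned char      UChar;

   int main ( void )
   {
      /* Stand-in for the 16 bytes the isel stores at [rsp-16 .. rsp-1]. */
      UChar spill[16];
      for (int i = 0; i < 16; i++)
         spill[i] = (UChar)(0x80 | i);          /* arbitrary test pattern */

      ULong w64lo, w64hi;
      memcpy(&w64lo, &spill[0], 8);   /* the load through m16_rsp (rsp-16) */
      memcpy(&w64hi, &spill[8], 8);   /* the load through m8_rsp  (rsp-8)  */

      /* w64hi carries byte lanes 8..15 and w64lo carries lanes 0..7,
         matching the (w64hi, w64lo) argument order of the helper. */
      printf("w64hi=%016llx w64lo=%016llx\n", w64hi, w64lo);
      assert((UChar)w64lo == spill[0]);
      assert((UChar)(w64hi >> 56) == spill[15]);
      return 0;
   }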
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
index 908f250323..a45f5fb622 100644
--- a/VEX/priv/host_generic_simd128.c
+++ b/VEX/priv/host_generic_simd128.c
@@ -368,6 +368,28 @@ void VEX_REGPARM(3)
    res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
 }
 
+UInt /*not-regparm*/
+   h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
+{
+   UInt r = 0;
+   if (w64hi & (1ULL << (64-1))) r |= (1<<15);
+   if (w64hi & (1ULL << (56-1))) r |= (1<<14);
+   if (w64hi & (1ULL << (48-1))) r |= (1<<13);
+   if (w64hi & (1ULL << (40-1))) r |= (1<<12);
+   if (w64hi & (1ULL << (32-1))) r |= (1<<11);
+   if (w64hi & (1ULL << (24-1))) r |= (1<<10);
+   if (w64hi & (1ULL << (16-1))) r |= (1<<9);
+   if (w64hi & (1ULL << ( 8-1))) r |= (1<<8);
+   if (w64lo & (1ULL << (64-1))) r |= (1<<7);
+   if (w64lo & (1ULL << (56-1))) r |= (1<<6);
+   if (w64lo & (1ULL << (48-1))) r |= (1<<5);
+   if (w64lo & (1ULL << (40-1))) r |= (1<<4);
+   if (w64lo & (1ULL << (32-1))) r |= (1<<3);
+   if (w64lo & (1ULL << (24-1))) r |= (1<<2);
+   if (w64lo & (1ULL << (16-1))) r |= (1<<1);
+   if (w64lo & (1ULL << ( 8-1))) r |= (1<<0);
+   return r;
+}
 
 /*---------------------------------------------------------------*/
 /*--- end                              host_generic_simd128.c ---*/
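A further aside, not part of the patch: in the helper above, each (1ULL << (8*n - 1)) test reads the most significant bit of byte lane n-1 of the corresponding 64-bit half, and the lanes are packed so that lane 0 of w64lo lands in bit 0 of the result. The self-contained sketch below restates that mapping as a per-byte loop with a couple of hand-checked cases; ref_GetMSBs8x16 and the test values are illustrative and not part of VEX.

   #include <assert.h>
   #include <stdio.h>

   typedef unsigned int       UInt;
   typedef unsigned long long ULong;

   /* Per-byte reference: bit k of the result is the MSB of byte lane k,
      lanes 0..7 coming from w64lo and lanes 8..15 from w64hi.  The
      unrolled (1ULL << (8*n - 1)) tests in the helper are this loop
      written out. */
   static UInt ref_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
   {
      UInt r = 0;
      for (int k = 0; k < 8; k++) {
         if ((w64lo >> (8*k + 7)) & 1) r |= (1u << k);         /* lanes 0..7  */
         if ((w64hi >> (8*k + 7)) & 1) r |= (1u << (k + 8));   /* lanes 8..15 */
      }
      return r;
   }

   int main ( void )
   {
      /* Lane 0 (0x80) and lane 15 (0x90) have their MSBs set. */
      assert(ref_GetMSBs8x16(0x9000000000000000ULL, 0x0000000000000080ULL)
             == 0x8001);
      /* All lanes negative -> all 16 mask bits set. */
      assert(ref_GetMSBs8x16(~0ULL, ~0ULL) == 0xFFFF);
      printf("reference checks passed\n");
      return 0;
   }

If this is linked against host_generic_simd128.c, the same loop can be used to cross-check h_generic_calc_GetMSBs8x16 itself over random (w64hi, w64lo) pairs.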
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
index 7956b80000..ba8555ddd3 100644
--- a/VEX/priv/host_generic_simd128.h
+++ b/VEX/priv/host_generic_simd128.h
@@ -86,6 +86,9 @@ extern VEX_REGPARM(3)
 extern VEX_REGPARM(3)
        void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* );
 
+extern /*not-regparm*/
+       UInt h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo );
+
 #endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
 
 /*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index b356f60d1f..e4cdd829b3 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -584,6 +584,7 @@ void ppIROp ( IROp op )
       case Iop_Reverse64_32x2: vex_printf("Reverse64_32x2"); return;
       case Iop_Abs32Fx2: vex_printf("Abs32Fx2"); return;
       case Iop_GetMSBs8x8: vex_printf("GetMSBs8x8"); return;
+      case Iop_GetMSBs8x16: vex_printf("GetMSBs8x16"); return;
 
       case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return;
       case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return;
@@ -2299,7 +2300,9 @@ void typeOfPrimop ( IROp op,
       case Iop_Left16: UNARY(Ity_I16,Ity_I16);
       case Iop_CmpwNEZ32: case Iop_Left32: UNARY(Ity_I32,Ity_I32);
       case Iop_CmpwNEZ64: case Iop_Left64: UNARY(Ity_I64,Ity_I64);
-      case Iop_GetMSBs8x8: UNARY(Ity_I64, Ity_I8);
+
+      case Iop_GetMSBs8x8: UNARY(Ity_I64, Ity_I8);
+      case Iop_GetMSBs8x16: UNARY(Ity_V128, Ity_I16);
 
       case Iop_MullU8: case Iop_MullS8:
          BINARY(Ity_I8,Ity_I8, Ity_I16);
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index 99eaaaf6a0..bc85c3f23d 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -1179,7 +1179,6 @@ typedef
          of arbitrary sign the result of the operation is 1.5. */
       Iop_Rsqrts32Fx4,
 
-
       /* --- Int to/from FP conversion --- */
       /* Unlike the standard fp conversions, these irops take no
          rounding mode argument. Instead the irop trailers _R{M,P,N,Z}
@@ -1433,6 +1432,10 @@ typedef
       Iop_Perm8x16,
       Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */
 
+      /* MISC CONVERSION -- get high bits of each byte lane, a la
+         x86/amd64 pmovmskb */
+      Iop_GetMSBs8x16, /* V128 -> I16 */
+
       /* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate
          See floating-point equivalents for details. */
       Iop_Recip32x4, Iop_Rsqrte32x4,
-- 
2.47.2
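One last aside, not part of the patch: on an SSE2-capable amd64 host the same 16-bit mask is exactly what the pmovmskb instruction produces, so the semantics modelled above can be sanity-checked against the _mm_movemask_epi8 intrinsic. The sketch below does that; ref_GetMSBs8x16 is the illustrative loop from the previous snippet, repeated so this compiles on its own, and the byte pattern is arbitrary. It assumes a little-endian amd64 host and a C99 compiler with SSE2 enabled.

   #include <assert.h>
   #include <emmintrin.h>   /* SSE2 intrinsics */
   #include <stdio.h>
   #include <string.h>

   typedef unsigned int       UInt;
   typedef unsigned long long ULong;

   static UInt ref_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
   {
      UInt r = 0;
      for (int k = 0; k < 8; k++) {
         if ((w64lo >> (8*k + 7)) & 1) r |= (1u << k);
         if ((w64hi >> (8*k + 7)) & 1) r |= (1u << (k + 8));
      }
      return r;
   }

   int main ( void )
   {
      unsigned char bytes[16];
      for (int i = 0; i < 16; i++)
         bytes[i] = (unsigned char)(i * 37 + 5);   /* arbitrary pattern */

      /* Split into the (hi, lo) halves the helper would receive. */
      ULong lo, hi;
      memcpy(&lo, &bytes[0], 8);
      memcpy(&hi, &bytes[8], 8);

      __m128i v  = _mm_loadu_si128((const __m128i*)bytes);
      int     hw = _mm_movemask_epi8(v);   /* what guest pmovmskb produces */

      assert((UInt)hw == ref_GetMSBs8x16(hi, lo));
      printf("pmovmskb mask = 0x%04x\n", (unsigned)hw);
      return 0;
   }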