From: Julian Seward
Date: Wed, 20 Jun 2012 11:46:19 +0000 (+0000)
Subject: Implement
X-Git-Tag: svn/VALGRIND_3_8_1^2~83
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0bc5bac261b91c00fe495292b1582e954789f7b7;p=thirdparty%2Fvalgrind.git

Implement
   VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r
   VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r
   VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r
   VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r

git-svn-id: svn://svn.valgrind.org/vex/trunk@2395
---

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 0288ed09da..364bb79487 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -8986,6 +8986,20 @@ static void breakupV256toV128s ( IRTemp t256,
    assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
 }
 
+/* Break a V128-bit value up into two 64-bit ints. */
+
+static void breakupV128to64s ( IRTemp t128,
+                               /*OUTs*/
+                               IRTemp* t1, IRTemp* t0 )
+{
+   vassert(t0 && *t0 == IRTemp_INVALID);
+   vassert(t1 && *t1 == IRTemp_INVALID);
+   *t0 = newTemp(Ity_I64);
+   *t1 = newTemp(Ity_I64);
+   assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
+   assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
+}
+
 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
    values (aa,bb), computes, for each of the 4 16-bit lanes:
 
@@ -23015,6 +23029,66 @@ Long dis_ESC_0F__VEX (
 /*--- ---*/
 /*------------------------------------------------------------*/
 
+static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
+{
+   /* In the control vector, zero out all but the bottom two bits of
+      each 32-bit lane. */
+   IRExpr* cv1 = binop(Iop_ShrN32x4,
+                       binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
+                       mkU8(30));
+   /* And use the resulting cleaned-up control vector as steering
+      in a Perm operation. */
+   IRTemp res = newTemp(Ity_V128);
+   assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
+   return res;
+}
+
+static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
+{
+   IRTemp dHi, dLo, cHi, cLo;
+   dHi = dLo = cHi = cLo = IRTemp_INVALID;
+   breakupV256toV128s( dataV, &dHi, &dLo );
+   breakupV256toV128s( ctrlV, &cHi, &cLo );
+   IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
+   IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
+   IRTemp res = newTemp(Ity_V256);
+   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
+   return res;
+}
+
+static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
+{
+   /* No cleverness here .. */
+   IRTemp dHi, dLo, cHi, cLo;
+   dHi = dLo = cHi = cLo = IRTemp_INVALID;
+   breakupV128to64s( dataV, &dHi, &dLo );
+   breakupV128to64s( ctrlV, &cHi, &cLo );
+   IRExpr* rHi
+      = IRExpr_Mux0X( unop(Iop_64to8,
+                           binop(Iop_And64, mkexpr(cHi), mkU64(2))),
+                      mkexpr(dLo), mkexpr(dHi) );
+   IRExpr* rLo
+      = IRExpr_Mux0X( unop(Iop_64to8,
+                           binop(Iop_And64, mkexpr(cLo), mkU64(2))),
+                      mkexpr(dLo), mkexpr(dHi) );
+   IRTemp res = newTemp(Ity_V128);
+   assign(res, binop(Iop_64HLtoV128, rHi, rLo));
+   return res;
+}
+
+static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
+{
+   IRTemp dHi, dLo, cHi, cLo;
+   dHi = dLo = cHi = cLo = IRTemp_INVALID;
+   breakupV256toV128s( dataV, &dHi, &dLo );
+   breakupV256toV128s( ctrlV, &cHi, &cLo );
+   IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
+   IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
+   IRTemp res = newTemp(Ity_V256);
+   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
+   return res;
+}
+
 __attribute__((noinline))
 static Long dis_ESC_0F38__VEX (
 
@@ -23048,6 +23122,120 @@ Long dis_ESC_0F38__VEX (
       }
       break;
 
+   case 0x0C:
+      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
+      if (have66noF2noF3(pfx)
+          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
+         UChar  modrm = getUChar(delta);
+         UInt   rG    = gregOfRexRM(pfx, modrm);
+         UInt   rV    = getVexNvvvv(pfx);
+         IRTemp ctrlV = newTemp(Ity_V128);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx, modrm);
+            delta += 1;
+            DIP("vpermilps %s,%s,%s\n",
+                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
+            assign(ctrlV, getXMMReg(rE));
+         } else {
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+            delta += alen;
+            DIP("vpermilps %s,%s,%s\n",
+                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
+            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
+         }
+         IRTemp dataV = newTemp(Ity_V128);
+         assign(dataV, getXMMReg(rV));
+         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
+         putYMMRegLoAndZU(rG, mkexpr(resV));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
+      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
+      if (have66noF2noF3(pfx)
+          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
+         UChar  modrm = getUChar(delta);
+         UInt   rG    = gregOfRexRM(pfx, modrm);
+         UInt   rV    = getVexNvvvv(pfx);
+         IRTemp ctrlV = newTemp(Ity_V256);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx, modrm);
+            delta += 1;
+            DIP("vpermilps %s,%s,%s\n",
+                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
+            assign(ctrlV, getYMMReg(rE));
+         } else {
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+            delta += alen;
+            DIP("vpermilps %s,%s,%s\n",
+                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
+            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
+         }
+         IRTemp dataV = newTemp(Ity_V256);
+         assign(dataV, getYMMReg(rV));
+         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
+         putYMMReg(rG, mkexpr(resV));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
+      break;
+
+   case 0x0D:
+      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
+      if (have66noF2noF3(pfx)
+          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
+         UChar  modrm = getUChar(delta);
+         UInt   rG    = gregOfRexRM(pfx, modrm);
+         UInt   rV    = getVexNvvvv(pfx);
+         IRTemp ctrlV = newTemp(Ity_V128);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx, modrm);
+            delta += 1;
+            DIP("vpermilpd %s,%s,%s\n",
+                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
+            assign(ctrlV, getXMMReg(rE));
+         } else {
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+            delta += alen;
+            DIP("vpermilpd %s,%s,%s\n",
+                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
+            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
+         }
+         IRTemp dataV = newTemp(Ity_V128);
+         assign(dataV, getXMMReg(rV));
+         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
+         putYMMRegLoAndZU(rG, mkexpr(resV));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
+      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
+      if (have66noF2noF3(pfx)
+          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
+         UChar  modrm = getUChar(delta);
+         UInt   rG    = gregOfRexRM(pfx, modrm);
+         UInt   rV    = getVexNvvvv(pfx);
+         IRTemp ctrlV = newTemp(Ity_V256);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx, modrm);
+            delta += 1;
+            DIP("vpermilpd %s,%s,%s\n",
+                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
+            assign(ctrlV, getYMMReg(rE));
+         } else {
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+            delta += alen;
+            DIP("vpermilpd %s,%s,%s\n",
+                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
+            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
+         }
+         IRTemp dataV = newTemp(Ity_V256);
+         assign(dataV, getYMMReg(rV));
+         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
+         putYMMReg(rG, mkexpr(resV));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
+      break;
+
    case 0x18:
       /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
       if (have66noF2noF3(pfx)
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 11d7d9b92e..c8625f5588 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3243,6 +3243,8 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
                            goto do_SseAssistedBinary;
       case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
                            goto do_SseAssistedBinary;
+      case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
+                           goto do_SseAssistedBinary;
       case Iop_QNarrowBin32Sto16Ux8:
                            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
                            goto do_SseAssistedBinary;
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
index 6e1100c303..14d454666c 100644
--- a/VEX/priv/host_generic_simd128.c
+++ b/VEX/priv/host_generic_simd128.c
@@ -358,6 +358,16 @@ void VEX_REGPARM(3)
    res->w16[7] = narrow32to16(argL->w32[3]);
 }
 
+void VEX_REGPARM(3)
+     h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
+   res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
+   res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
+   res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
+}
+
 /*---------------------------------------------------------------*/
 /*--- end host_generic_simd128.c ---*/
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
index 6f9cc97bf5..c5a7635784 100644
--- a/VEX/priv/host_generic_simd128.h
+++ b/VEX/priv/host_generic_simd128.h
@@ -83,6 +83,9 @@ extern VEX_REGPARM(3)
 extern VEX_REGPARM(3)
        void h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128*, V128*, V128* );
 
+extern VEX_REGPARM(3)
+       void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* );
+
 #endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
 
 /*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index efbe3c210a..445b7bf879 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -925,6 +925,7 @@ void ppIROp ( IROp op )
       case Iop_ExtractV128: vex_printf("ExtractV128"); return;
 
       case Iop_Perm8x16: vex_printf("Perm8x16"); return;
+      case Iop_Perm32x4: vex_printf("Perm32x4"); return;
       case Iop_Reverse16_8x16: vex_printf("Reverse16_8x16"); return;
       case Iop_Reverse32_8x16: vex_printf("Reverse32_8x16"); return;
       case Iop_Reverse32_16x8: vex_printf("Reverse32_16x8"); return;
@@ -2579,7 +2580,7 @@ void typeOfPrimop ( IROp op,
       case Iop_InterleaveOddLanes8x16: case Iop_InterleaveEvenLanes8x16:
       case Iop_InterleaveOddLanes16x8: case Iop_InterleaveEvenLanes16x8:
       case Iop_InterleaveOddLanes32x4: case Iop_InterleaveEvenLanes32x4:
-      case Iop_Perm8x16:
+      case Iop_Perm8x16: case Iop_Perm32x4:
       case Iop_Recps32Fx4: case Iop_Rsqrts32Fx4:
          BINARY(Ity_V128,Ity_V128, Ity_V128);
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index cda42181b7..06dc82e69a 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -1415,6 +1415,7 @@ typedef
          argR[i] values may only be in the range 0 .. 15, else behaviour
          is undefined. */
       Iop_Perm8x16,
+      Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */
 
       /* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate
          See floating-point equivalents for details. */
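
For reference, the selection rule the patch implements is: within each 128-bit
half, VPERMILPS picks every 32-bit result lane from the data operand using
bits [1:0] of the corresponding control lane (which is what Iop_Perm32x4 /
h_generic_calc_Perm32x4 compute), and VPERMILPD picks every 64-bit result lane
using bit 1 of its control lane (the Mux0X on (ctrl & 2) in
math_PERMILPD_VAR_128).  The standalone C sketch below models just that
per-half selection; it is not part of the patch, and the names
ref_vpermilps_128, ref_vpermilpd_128 and the test values in main are invented
purely for illustration.

/* Minimal reference model of the per-128-bit-lane selection done by
   VPERMILPS/VPERMILPD with a variable (register/memory) control operand.
   Not part of the commit; names and test values are made up. */

#include <stdint.h>
#include <stdio.h>

/* One 128-bit half: each 32-bit result lane i takes the data lane named
   by bits [1:0] of control lane i (same rule as h_generic_calc_Perm32x4). */
static void ref_vpermilps_128 ( uint32_t res[4],
                                const uint32_t data[4],
                                const uint32_t ctrl[4] )
{
   for (int i = 0; i < 4; i++)
      res[i] = data[ ctrl[i] & 3 ];
}

/* One 128-bit half: each 64-bit result lane i takes the data lane named
   by bit 1 of control lane i; bit 0 is ignored (matches the Mux0X on
   (ctrl & 2) in math_PERMILPD_VAR_128). */
static void ref_vpermilpd_128 ( uint64_t res[2],
                                const uint64_t data[2],
                                const uint64_t ctrl[2] )
{
   for (int i = 0; i < 2; i++)
      res[i] = data[ (ctrl[i] >> 1) & 1 ];
}

int main ( void )
{
   uint32_t d32[4] = { 100, 101, 102, 103 };
   uint32_t c32[4] = { 3, 2, 1, 0 };   /* reverse the four dword lanes */
   uint32_t r32[4];
   ref_vpermilps_128(r32, d32, c32);
   printf("ps: %u %u %u %u\n",
          (unsigned)r32[0], (unsigned)r32[1],
          (unsigned)r32[2], (unsigned)r32[3]);   /* 103 102 101 100 */

   uint64_t d64[2] = { 555, 666 };
   uint64_t c64[2] = { 2, 0 };   /* lane 0 <- high qword, lane 1 <- low qword */
   uint64_t r64[2];
   ref_vpermilpd_128(r64, d64, c64);
   printf("pd: %llu %llu\n",
          (unsigned long long)r64[0],
          (unsigned long long)r64[1]);           /* 666 555 */
   return 0;
}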