From: Julian Seward
Date: Wed, 20 Jun 2012 11:46:19 +0000 (+0000)
Subject: Implement
X-Git-Tag: svn/VALGRIND_3_8_1^2~83
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0bc5bac261b91c00fe495292b1582e954789f7b7;p=thirdparty%2Fvalgrind.git

Implement
   VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r
   VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r
   VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r
   VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r

git-svn-id: svn://svn.valgrind.org/vex/trunk@2395
---

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 0288ed09da..364bb79487 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -8986,6 +8986,20 @@ static void breakupV256toV128s ( IRTemp t256,
    assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
 }
 
+/* Break a V128-bit value up into two 64-bit ints. */
+
+static void breakupV128to64s ( IRTemp t128,
+                               /*OUTs*/
+                               IRTemp* t1, IRTemp* t0 )
+{
+   vassert(t0 && *t0 == IRTemp_INVALID);
+   vassert(t1 && *t1 == IRTemp_INVALID);
+   *t0 = newTemp(Ity_I64);
+   *t1 = newTemp(Ity_I64);
+   assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
+   assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
+}
+
 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
    values (aa,bb), computes, for each of the 4 16-bit lanes:
 
@@ -23015,6 +23029,66 @@ Long dis_ESC_0F__VEX (
 /*--- ---*/
 /*------------------------------------------------------------*/
 
+static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
+{
+   /* In the control vector, zero out all but the bottom two bits of
+      each 32-bit lane. */
+   IRExpr* cv1 = binop(Iop_ShrN32x4,
+                       binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
+                       mkU8(30));
+   /* And use the resulting cleaned-up control vector as steering
+      in a Perm operation. */
+   IRTemp res = newTemp(Ity_V128);
+   assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
+   return res;
+}
+
+static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
+{
+   IRTemp dHi, dLo, cHi, cLo;
+   dHi = dLo = cHi = cLo = IRTemp_INVALID;
+   breakupV256toV128s( dataV, &dHi, &dLo );
+   breakupV256toV128s( ctrlV, &cHi, &cLo );
+   IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
+   IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
+   IRTemp res = newTemp(Ity_V256);
+   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
+   return res;
+}
+
+static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
+{
+   /* No cleverness here .. */
+   IRTemp dHi, dLo, cHi, cLo;
+   dHi = dLo = cHi = cLo = IRTemp_INVALID;
+   breakupV128to64s( dataV, &dHi, &dLo );
+   breakupV128to64s( ctrlV, &cHi, &cLo );
+   IRExpr* rHi
+      = IRExpr_Mux0X( unop(Iop_64to8,
+                           binop(Iop_And64, mkexpr(cHi), mkU64(2))),
+                      mkexpr(dLo), mkexpr(dHi) );
+   IRExpr* rLo
+      = IRExpr_Mux0X( unop(Iop_64to8,
+                           binop(Iop_And64, mkexpr(cLo), mkU64(2))),
+                      mkexpr(dLo), mkexpr(dHi) );
+   IRTemp res = newTemp(Ity_V128);
+   assign(res, binop(Iop_64HLtoV128, rHi, rLo));
+   return res;
+}
+
+static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
+{
+   IRTemp dHi, dLo, cHi, cLo;
+   dHi = dLo = cHi = cLo = IRTemp_INVALID;
+   breakupV256toV128s( dataV, &dHi, &dLo );
+   breakupV256toV128s( ctrlV, &cHi, &cLo );
+   IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
+   IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
+   IRTemp res = newTemp(Ity_V256);
+   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
+   return res;
+}
+
 __attribute__((noinline))
 static Long dis_ESC_0F38__VEX (
 
@@ -23048,6 +23122,120 @@ Long dis_ESC_0F38__VEX (
       }
       break;
 
+   case 0x0C:
+      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
+      if (have66noF2noF3(pfx)
+          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
+         UChar  modrm = getUChar(delta);
+         UInt   rG    = gregOfRexRM(pfx, modrm);
+         UInt   rV    = getVexNvvvv(pfx);
+         IRTemp ctrlV = newTemp(Ity_V128);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx, modrm);
+            delta += 1;
+            DIP("vpermilps %s,%s,%s\n",
+                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
+            assign(ctrlV, getXMMReg(rE));
+         } else {
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+            delta += alen;
+            DIP("vpermilps %s,%s,%s\n",
+                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
+            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
+         }
+         IRTemp dataV = newTemp(Ity_V128);
+         assign(dataV, getXMMReg(rV));
+         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
+         putYMMRegLoAndZU(rG, mkexpr(resV));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
+      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
+      if (have66noF2noF3(pfx)
+          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
+         UChar  modrm = getUChar(delta);
+         UInt   rG    = gregOfRexRM(pfx, modrm);
+         UInt   rV    = getVexNvvvv(pfx);
+         IRTemp ctrlV = newTemp(Ity_V256);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx, modrm);
+            delta += 1;
+            DIP("vpermilps %s,%s,%s\n",
+                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
+            assign(ctrlV, getYMMReg(rE));
+         } else {
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+            delta += alen;
+            DIP("vpermilps %s,%s,%s\n",
+                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
+            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
+         }
+         IRTemp dataV = newTemp(Ity_V256);
+         assign(dataV, getYMMReg(rV));
+         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
+         putYMMReg(rG, mkexpr(resV));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
+      break;
+
+   case 0x0D:
+      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
+      if (have66noF2noF3(pfx)
+          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
+         UChar  modrm = getUChar(delta);
+         UInt   rG    = gregOfRexRM(pfx, modrm);
+         UInt   rV    = getVexNvvvv(pfx);
+         IRTemp ctrlV = newTemp(Ity_V128);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx, modrm);
+            delta += 1;
+            DIP("vpermilpd %s,%s,%s\n",
+                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
+            assign(ctrlV, getXMMReg(rE));
+         } else {
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+            delta += alen;
+            DIP("vpermilpd %s,%s,%s\n",
+                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
+            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
+         }
+         IRTemp dataV = newTemp(Ity_V128);
+         assign(dataV, getXMMReg(rV));
+         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
+         putYMMRegLoAndZU(rG, mkexpr(resV));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
+      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
+      if (have66noF2noF3(pfx)
+          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
+         UChar  modrm = getUChar(delta);
+         UInt   rG    = gregOfRexRM(pfx, modrm);
+         UInt   rV    = getVexNvvvv(pfx);
+         IRTemp ctrlV = newTemp(Ity_V256);
+         if (epartIsReg(modrm)) {
+            UInt rE = eregOfRexRM(pfx, modrm);
+            delta += 1;
+            DIP("vpermilpd %s,%s,%s\n",
+                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
+            assign(ctrlV, getYMMReg(rE));
+         } else {
+            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
+            delta += alen;
+            DIP("vpermilpd %s,%s,%s\n",
+                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
+            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
+         }
+         IRTemp dataV = newTemp(Ity_V256);
+         assign(dataV, getYMMReg(rV));
+         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
+         putYMMReg(rG, mkexpr(resV));
+         *uses_vvvv = True;
+         goto decode_success;
+      }
+      break;
+
    case 0x18:
       /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
       if (have66noF2noF3(pfx)
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 11d7d9b92e..c8625f5588 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3243,6 +3243,8 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
                            goto do_SseAssistedBinary;
       case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
                            goto do_SseAssistedBinary;
+      case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
+                           goto do_SseAssistedBinary;
       case Iop_QNarrowBin32Sto16Ux8:
                            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
                            goto do_SseAssistedBinary;
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
index 6e1100c303..14d454666c 100644
--- a/VEX/priv/host_generic_simd128.c
+++ b/VEX/priv/host_generic_simd128.c
@@ -358,6 +358,16 @@ void VEX_REGPARM(3)
    res->w16[7] = narrow32to16(argL->w32[3]);
 }
 
+void VEX_REGPARM(3)
+     h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
+   res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
+   res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
+   res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
+}
+
 /*---------------------------------------------------------------*/
 /*--- end host_generic_simd128.c ---*/
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
index 6f9cc97bf5..c5a7635784 100644
--- a/VEX/priv/host_generic_simd128.h
+++ b/VEX/priv/host_generic_simd128.h
@@ -83,6 +83,9 @@ extern VEX_REGPARM(3)
 extern VEX_REGPARM(3)
        void h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128*, V128*, V128* );
 
+extern VEX_REGPARM(3)
+       void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* );
+
 #endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
 
 /*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index efbe3c210a..445b7bf879 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -925,6 +925,7 @@ void ppIROp ( IROp op )
       case Iop_ExtractV128: vex_printf("ExtractV128"); return;
 
       case Iop_Perm8x16: vex_printf("Perm8x16"); return;
+      case Iop_Perm32x4: vex_printf("Perm32x4"); return;
       case Iop_Reverse16_8x16: vex_printf("Reverse16_8x16"); return;
       case Iop_Reverse32_8x16: vex_printf("Reverse32_8x16"); return;
       case Iop_Reverse32_16x8: vex_printf("Reverse32_16x8"); return;
@@ -2579,7 +2580,7 @@ void typeOfPrimop ( IROp op,
       case Iop_InterleaveOddLanes8x16: case Iop_InterleaveEvenLanes8x16:
       case Iop_InterleaveOddLanes16x8: case Iop_InterleaveEvenLanes16x8:
       case Iop_InterleaveOddLanes32x4: case Iop_InterleaveEvenLanes32x4:
-      case Iop_Perm8x16:
+      case Iop_Perm8x16: case Iop_Perm32x4:
       case Iop_Recps32Fx4: case Iop_Rsqrts32Fx4:
          BINARY(Ity_V128,Ity_V128, Ity_V128);
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index cda42181b7..06dc82e69a 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -1415,6 +1415,7 @@ typedef
          argR[i] values may only be in the range 0 .. 15, else behaviour
          is undefined. */
       Iop_Perm8x16,
+      Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */
 
       /* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate
          See floating-point equivalents for details. */
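
For reference, the selection rule the patch implements is: within each 128-bit
half, VPERMILPS picks every 32-bit result lane from the data operand using
bits [1:0] of the corresponding control lane (which is what Iop_Perm32x4 /
h_generic_calc_Perm32x4 compute), and VPERMILPD picks every 64-bit result lane
using bit 1 of its control lane (the Mux0X on (ctrl & 2) in
math_PERMILPD_VAR_128).  The standalone C sketch below models just that
per-half selection; it is not part of the patch, and the names
ref_vpermilps_128, ref_vpermilpd_128 and the test values in main are invented
purely for illustration.

/* Minimal reference model of the per-128-bit-lane selection done by
   VPERMILPS/VPERMILPD with a variable (register/memory) control operand.
   Not part of the commit; names and test values are made up. */

#include <stdint.h>
#include <stdio.h>

/* One 128-bit half: each 32-bit result lane i takes the data lane named
   by bits [1:0] of control lane i (same rule as h_generic_calc_Perm32x4). */
static void ref_vpermilps_128 ( uint32_t res[4],
                                const uint32_t data[4],
                                const uint32_t ctrl[4] )
{
   for (int i = 0; i < 4; i++)
      res[i] = data[ ctrl[i] & 3 ];
}

/* One 128-bit half: each 64-bit result lane i takes the data lane named
   by bit 1 of control lane i; bit 0 is ignored (matches the Mux0X on
   (ctrl & 2) in math_PERMILPD_VAR_128). */
static void ref_vpermilpd_128 ( uint64_t res[2],
                                const uint64_t data[2],
                                const uint64_t ctrl[2] )
{
   for (int i = 0; i < 2; i++)
      res[i] = data[ (ctrl[i] >> 1) & 1 ];
}

int main ( void )
{
   uint32_t d32[4] = { 100, 101, 102, 103 };
   uint32_t c32[4] = { 3, 2, 1, 0 };   /* reverse the four dword lanes */
   uint32_t r32[4];
   ref_vpermilps_128(r32, d32, c32);
   printf("ps: %u %u %u %u\n",
          (unsigned)r32[0], (unsigned)r32[1],
          (unsigned)r32[2], (unsigned)r32[3]);   /* 103 102 101 100 */

   uint64_t d64[2] = { 555, 666 };
   uint64_t c64[2] = { 2, 0 };   /* lane 0 <- high qword, lane 1 <- low qword */
   uint64_t r64[2];
   ref_vpermilpd_128(r64, d64, c64);
   printf("pd: %llu %llu\n",
          (unsigned long long)r64[0],
          (unsigned long long)r64[1]);           /* 666 555 */
   return 0;
}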