From ec735c8e8177baefec436052c468516a6c7cbe49 Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Thu, 13 Dec 2012 18:29:56 +0000
Subject: [PATCH] Implement 128-bit PMOVMSKB using a single new primop
 (Iop_GetMSBs8x16) rather than chopping it up into two 64-bit pieces in the
 front end.

git-svn-id: svn://svn.valgrind.org/vex/trunk@2590
---
 VEX/priv/guest_amd64_toIR.c     | 17 +++++------------
 VEX/priv/host_amd64_isel.c      | 31 +++++++++++++++++++++++++++++++
 VEX/priv/host_generic_simd128.c | 22 ++++++++++++++++++++++
 VEX/priv/host_generic_simd128.h |  3 +++
 VEX/priv/ir_defs.c              |  5 ++++-
 VEX/pub/libvex_ir.h             |  5 ++++-
 6 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 9e8df2694f..b667c328d5 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -10274,22 +10274,15 @@ static Long dis_CVTDQ2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
 static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx )
 {
-   /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
    UChar modrm = getUChar(delta);
    vassert(epartIsReg(modrm)); /* ensured by caller */
    UInt rE = eregOfRexRM(pfx,modrm);
    UInt rG = gregOfRexRM(pfx,modrm);
-   IRTemp t0 = newTemp(Ity_I64);
-   IRTemp t1 = newTemp(Ity_I64);
-   IRTemp t5 = newTemp(Ity_I32);
-   assign(t0, getXMMRegLane64(rE, 0));
-   assign(t1, getXMMRegLane64(rE, 1));
-   assign(t5,
-          unop(Iop_16Uto32,
-             binop(Iop_8HLto16,
-                unop(Iop_GetMSBs8x8, mkexpr(t1)),
-                unop(Iop_GetMSBs8x8, mkexpr(t0)))));
-   putIReg32(rG, mkexpr(t5));
+   IRTemp t0 = newTemp(Ity_V128);
+   IRTemp t1 = newTemp(Ity_I32);
+   assign(t0, getXMMReg(rE));
+   assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
+   putIReg32(rG, mkexpr(t1));
    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "",
        nameXMMReg(rE), nameIReg32(rG));
    delta += 1;
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index 98e90f7ba4..d6f507e998 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -1605,6 +1605,37 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
          return dst;
       }
 
+      case Iop_GetMSBs8x16: {
+         /* Note: the following assumes the helper is of signature
+               UInt fn ( ULong w64hi, ULong w64Lo ),
+            and is not a regparm fn. */
+         HReg dst = newVRegI(env);
+         HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+         HReg rsp = hregAMD64_RSP();
+         fn = (HWord)h_generic_calc_GetMSBs8x16;
+         AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
+         AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
+         addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
+                                          16, vec, m16_rsp));
+         /* hi 64 bits into RDI -- the first arg */
+         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
+                                          AMD64RMI_Mem(m8_rsp),
+                                          hregAMD64_RDI() )); /* 1st arg */
+         /* lo 64 bits into RSI -- the 2nd arg */
+         addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
+                                          AMD64RMI_Mem(m16_rsp),
+                                          hregAMD64_RSI() )); /* 2nd arg */
+         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
+         /* MovxLQ is not exactly the right thing here.  We just
+            need to get the bottom 8 bits of RAX into dst, and zero
+            out everything else.  Assuming that the helper returns
+            a UInt with the top 24 bits zeroed out, it'll do,
+            though. */
+         addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
+         return dst;
+      }
+
       default: break;
    }
 
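Aside, not part of the patch: the Iop_GetMSBs8x16 case above stores the V128 at [rsp-16 .. rsp-1] with SseLdSt and then reloads the two 64-bit halves as the helper's arguments, so on a little-endian host the load through m16_rsp (rsp-16) picks up byte lanes 0..7 (w64lo) and the load through m8_rsp (rsp-8) picks up lanes 8..15 (w64hi). The standalone C99 sketch below only models that split; the array name and test pattern are made up for illustration and it assumes a little-endian host such as amd64.

   #include <assert.h>
   #include <stdio.h>
   #include <string.h>

   typedef unsigned long long ULong;
   typedef unsigned char      UChar;

   int main ( void )
   {
      /* Stand-in for the 16 bytes the isel stores at [rsp-16 .. rsp-1]. */
      UChar spill[16];
      for (int i = 0; i < 16; i++)
         spill[i] = (UChar)(0x80 | i);          /* arbitrary test pattern */

      ULong w64lo, w64hi;
      memcpy(&w64lo, &spill[0], 8);   /* the load through m16_rsp (rsp-16) */
      memcpy(&w64hi, &spill[8], 8);   /* the load through m8_rsp  (rsp-8)  */

      /* w64hi carries byte lanes 8..15 and w64lo carries lanes 0..7,
         matching the (w64hi, w64lo) argument order of the helper. */
      printf("w64hi=%016llx w64lo=%016llx\n", w64hi, w64lo);
      assert((UChar)w64lo == spill[0]);
      assert((UChar)(w64hi >> 56) == spill[15]);
      return 0;
   }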
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
index 908f250323..a45f5fb622 100644
--- a/VEX/priv/host_generic_simd128.c
+++ b/VEX/priv/host_generic_simd128.c
@@ -368,6 +368,28 @@ void VEX_REGPARM(3)
    res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
 }
 
+UInt /*not-regparm*/
+   h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
+{
+   UInt r = 0;
+   if (w64hi & (1ULL << (64-1))) r |= (1<<15);
+   if (w64hi & (1ULL << (56-1))) r |= (1<<14);
+   if (w64hi & (1ULL << (48-1))) r |= (1<<13);
+   if (w64hi & (1ULL << (40-1))) r |= (1<<12);
+   if (w64hi & (1ULL << (32-1))) r |= (1<<11);
+   if (w64hi & (1ULL << (24-1))) r |= (1<<10);
+   if (w64hi & (1ULL << (16-1))) r |= (1<<9);
+   if (w64hi & (1ULL << ( 8-1))) r |= (1<<8);
+   if (w64lo & (1ULL << (64-1))) r |= (1<<7);
+   if (w64lo & (1ULL << (56-1))) r |= (1<<6);
+   if (w64lo & (1ULL << (48-1))) r |= (1<<5);
+   if (w64lo & (1ULL << (40-1))) r |= (1<<4);
+   if (w64lo & (1ULL << (32-1))) r |= (1<<3);
+   if (w64lo & (1ULL << (24-1))) r |= (1<<2);
+   if (w64lo & (1ULL << (16-1))) r |= (1<<1);
+   if (w64lo & (1ULL << ( 8-1))) r |= (1<<0);
+   return r;
+}
 
 /*---------------------------------------------------------------*/
 /*--- end                              host_generic_simd128.c ---*/
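A further aside, not part of the patch: in the helper above, each (1ULL << (8*n - 1)) test reads the most significant bit of byte lane n-1 of the corresponding 64-bit half, and the lanes are packed so that lane 0 of w64lo lands in bit 0 of the result. The self-contained sketch below restates that mapping as a per-byte loop with a couple of hand-checked cases; ref_GetMSBs8x16 and the test values are illustrative and not part of VEX.

   #include <assert.h>
   #include <stdio.h>

   typedef unsigned int       UInt;
   typedef unsigned long long ULong;

   /* Per-byte reference: bit k of the result is the MSB of byte lane k,
      lanes 0..7 coming from w64lo and lanes 8..15 from w64hi.  The
      unrolled (1ULL << (8*n - 1)) tests in the helper are this loop
      written out. */
   static UInt ref_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
   {
      UInt r = 0;
      for (int k = 0; k < 8; k++) {
         if ((w64lo >> (8*k + 7)) & 1) r |= (1u << k);         /* lanes 0..7  */
         if ((w64hi >> (8*k + 7)) & 1) r |= (1u << (k + 8));   /* lanes 8..15 */
      }
      return r;
   }

   int main ( void )
   {
      /* Lane 0 (0x80) and lane 15 (0x90) have their MSBs set. */
      assert(ref_GetMSBs8x16(0x9000000000000000ULL, 0x0000000000000080ULL)
             == 0x8001);
      /* All lanes negative -> all 16 mask bits set. */
      assert(ref_GetMSBs8x16(~0ULL, ~0ULL) == 0xFFFF);
      printf("reference checks passed\n");
      return 0;
   }

If this is linked against host_generic_simd128.c, the same loop can be used to cross-check h_generic_calc_GetMSBs8x16 itself over random (w64hi, w64lo) pairs.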
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
index 7956b80000..ba8555ddd3 100644
--- a/VEX/priv/host_generic_simd128.h
+++ b/VEX/priv/host_generic_simd128.h
@@ -86,6 +86,9 @@ extern VEX_REGPARM(3)
 extern VEX_REGPARM(3)
        void h_generic_calc_Perm32x4 ( /*OUT*/V128*, V128*, V128* );
 
+extern /*not-regparm*/
+       UInt h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo );
+
 #endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
 
 /*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index b356f60d1f..e4cdd829b3 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -584,6 +584,7 @@ void ppIROp ( IROp op )
       case Iop_Reverse64_32x2: vex_printf("Reverse64_32x2"); return;
       case Iop_Abs32Fx2: vex_printf("Abs32Fx2"); return;
       case Iop_GetMSBs8x8: vex_printf("GetMSBs8x8"); return;
+      case Iop_GetMSBs8x16: vex_printf("GetMSBs8x16"); return;
 
       case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return;
       case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return;
@@ -2299,7 +2300,9 @@ void typeOfPrimop ( IROp op,
       case Iop_Left16: UNARY(Ity_I16,Ity_I16);
       case Iop_CmpwNEZ32: case Iop_Left32: UNARY(Ity_I32,Ity_I32);
       case Iop_CmpwNEZ64: case Iop_Left64: UNARY(Ity_I64,Ity_I64);
-      case Iop_GetMSBs8x8: UNARY(Ity_I64, Ity_I8);
+
+      case Iop_GetMSBs8x8: UNARY(Ity_I64, Ity_I8);
+      case Iop_GetMSBs8x16: UNARY(Ity_V128, Ity_I16);
 
       case Iop_MullU8: case Iop_MullS8:
          BINARY(Ity_I8,Ity_I8, Ity_I16);
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
index 99eaaaf6a0..bc85c3f23d 100644
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -1179,7 +1179,6 @@ typedef
          of arbitrary sign the result of the operation is 1.5. */
       Iop_Rsqrts32Fx4,
 
-
       /* --- Int to/from FP conversion --- */
       /* Unlike the standard fp conversions, these irops take no
          rounding mode argument. Instead the irop trailers _R{M,P,N,Z}
@@ -1433,6 +1432,10 @@ typedef
       Iop_Perm8x16,
       Iop_Perm32x4, /* ditto, except argR values are restricted to 0 .. 3 */
 
+      /* MISC CONVERSION -- get high bits of each byte lane, a la
+         x86/amd64 pmovmskb */
+      Iop_GetMSBs8x16, /* V128 -> I16 */
+
       /* Vector Reciprocal Estimate and Vector Reciprocal Square Root Estimate
          See floating-point equivalents for details. */
       Iop_Recip32x4, Iop_Rsqrte32x4,
-- 
2.47.2
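One last aside, not part of the patch: on an SSE2-capable amd64 host the same 16-bit mask is exactly what the pmovmskb instruction produces, so the semantics modelled above can be sanity-checked against the _mm_movemask_epi8 intrinsic. The sketch below does that; ref_GetMSBs8x16 is the illustrative loop from the previous snippet, repeated so this compiles on its own, and the byte pattern is arbitrary. It assumes a little-endian amd64 host and a C99 compiler with SSE2 enabled.

   #include <assert.h>
   #include <emmintrin.h>   /* SSE2 intrinsics */
   #include <stdio.h>
   #include <string.h>

   typedef unsigned int       UInt;
   typedef unsigned long long ULong;

   static UInt ref_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
   {
      UInt r = 0;
      for (int k = 0; k < 8; k++) {
         if ((w64lo >> (8*k + 7)) & 1) r |= (1u << k);
         if ((w64hi >> (8*k + 7)) & 1) r |= (1u << (k + 8));
      }
      return r;
   }

   int main ( void )
   {
      unsigned char bytes[16];
      for (int i = 0; i < 16; i++)
         bytes[i] = (unsigned char)(i * 37 + 5);   /* arbitrary pattern */

      /* Split into the (hi, lo) halves the helper would receive. */
      ULong lo, hi;
      memcpy(&lo, &bytes[0], 8);
      memcpy(&hi, &bytes[8], 8);

      __m128i v  = _mm_loadu_si128((const __m128i*)bytes);
      int     hw = _mm_movemask_epi8(v);   /* what guest pmovmskb produces */

      assert((UInt)hw == ref_GetMSBs8x16(hi, lo));
      printf("pmovmskb mask = 0x%04x\n", (unsigned)hw);
      return 0;
   }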