From: Julian Seward
Date: Mon, 14 Feb 2011 13:30:26 +0000 (+0000)
Subject: Merge from trunk, r2076 (Implement SSE4.x EXTRACTPS, BLENDVPD,
X-Git-Tag: svn/VALGRIND_3_6_1^2~10
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f3a6734679332a9b1c671b01924ce798cf166df6;p=thirdparty%2Fvalgrind.git

Merge from trunk, r2076 (Implement SSE4.x EXTRACTPS, BLENDVPD,
BLENDVPS, PBLENDVB.)


git-svn-id: svn://svn.valgrind.org/vex/branches/VEX_3_6_BRANCH@2093
---

diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
index 3d723b416a..da85c4961a 100644
--- a/VEX/priv/guest_amd64_toIR.c
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -14643,6 +14643,55 @@ DisResult disInstr_AMD64_WRK (
       goto decode_success;
    }
 
+
+   /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8    Extract
+      float from xmm reg and store in gen.reg or mem.  This is
+      identical to PEXTRD, except that REX.W appears to be ignored.
+   */
+   if ( have66noF2noF3( pfx )
+        && sz == 2 /* REX.W == 0; perhaps too strict? */
+        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x17 ) {
+
+      Int imm8_10;
+      IRTemp xmm_vec   = newTemp(Ity_V128);
+      IRTemp src_dword = newTemp(Ity_I32);
+
+      modrm = insn[3];
+      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
+      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+
+      if ( epartIsReg( modrm ) ) {
+         imm8_10 = (Int)(insn[3+1] & 3);
+      } else {
+         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+         imm8_10 = (Int)(insn[3+alen] & 3);
+      }
+
+      switch ( imm8_10 ) {
+         case 0:  assign( src_dword, mkexpr(t0) ); break;
+         case 1:  assign( src_dword, mkexpr(t1) ); break;
+         case 2:  assign( src_dword, mkexpr(t2) ); break;
+         case 3:  assign( src_dword, mkexpr(t3) ); break;
+         default: vassert(0);
+      }
+
+      if ( epartIsReg( modrm ) ) {
+         putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
+         delta += 3+1+1;
+         DIP( "extractps $%d, %s,%s\n", imm8_10,
+              nameXMMReg( gregOfRexRM(pfx, modrm) ),
+              nameIReg32( eregOfRexRM(pfx, modrm) ) );
+      } else {
+         storeLE( mkexpr(addr), mkexpr(src_dword) );
+         delta += 3+alen+1;
+         DIP( "extractps $%d, %s,%s\n",
+              imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
+      }
+
+      goto decode_success;
+   }
+
+
    /* 66 0F 38 37 = PCMPGTQ
       64x2 comparison (signed, presumably; the Intel docs don't say :-)
    */
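For reference, the EXTRACTPS case added in the hunk above simply selects one of the four 32-bit lanes of the XMM source, as chosen by imm8[1:0], and writes it to a 32-bit register or to memory. A minimal C model of that lane selection (illustrative only; extractps_model and its parameters are not names from this patch):

#include <stdint.h>
#include <string.h>

/* Pick dword (imm8 & 3) out of a 128-bit value, mirroring the
   t0..t3 selection done by breakup128to32s in the hunk above. */
static uint32_t extractps_model ( const uint8_t xmm[16], int imm8 )
{
   uint32_t lanes[4];
   memcpy(lanes, xmm, sizeof lanes);   /* little-endian lane order */
   return lanes[imm8 & 3];
}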
@@ -15731,6 +15780,74 @@ DisResult disInstr_AMD64_WRK (
       goto decode_success;
    }
 
+   /* 66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
+      66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
+      66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
+      Blend at various granularities, with XMM0 (implicit operand)
+      providing the controlling mask.
+   */
+   if (have66noF2noF3(pfx) && sz == 2
+       && insn[0] == 0x0F && insn[1] == 0x38
+       && (insn[2] == 0x15 || insn[2] == 0x14 || insn[2] == 0x10)) {
+      modrm = insn[3];
+
+      HChar* nm    = NULL;
+      UInt   gran  = 0;
+      IROp   opSAR = Iop_INVALID;
+      switch (insn[2]) {
+         case 0x15:
+            nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
+            break;
+         case 0x14:
+            nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
+            break;
+         case 0x10:
+            nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
+            break;
+      }
+      vassert(nm);
+
+      IRTemp vecE = newTemp(Ity_V128);
+      IRTemp vecG = newTemp(Ity_V128);
+      IRTemp vec0 = newTemp(Ity_V128);
+
+      if ( epartIsReg(modrm) ) {
+         assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
+         delta += 3+1;
+         DIP( "%s %s,%s\n", nm,
+              nameXMMReg( eregOfRexRM(pfx, modrm) ),
+              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+      } else {
+         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+         gen_SEGV_if_not_16_aligned( addr );
+         assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
+         delta += 3+alen;
+         DIP( "%s %s,%s\n", nm,
+              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+      }
+
+      assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
+      assign(vec0, getXMMReg(0));
+
+      /* Now the tricky bit is to convert vec0 into a suitable mask,
+         by copying the most significant bit of each lane into all
+         positions in the lane. */
+      IRTemp sh = newTemp(Ity_I8);
+      assign(sh, mkU8(8 * gran - 1));
+
+      IRTemp mask = newTemp(Ity_V128);
+      assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
+
+      IRTemp notmask = newTemp(Ity_V128);
+      assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
+
+      IRExpr* res = binop(Iop_OrV128,
+                          binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
+                          binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask)));
+      putXMMReg(gregOfRexRM(pfx, modrm), res);
+
+      goto decode_success;
+   }
 
    /* ---------------------------------------------------- */
    /* --- end of the SSE4 decoder                      --- */
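The blend added above is expressed in IR as mask = SAR(xmm0, 8*gran-1) followed by res = (E & mask) | (G & ~mask), so each lane of the result comes from the E operand when the top bit of the corresponding XMM0 lane is set, and from the G operand otherwise. A byte-level C sketch of those semantics (blendv_model is an illustrative name, not code from this patch):

#include <stdint.h>
#include <stddef.h>

/* Byte-level model of the blend: for each gran-byte lane, take the
   lane from src (the E operand) when the top bit of the matching
   xmm0 lane is set, otherwise keep the lane already in dst (the G
   operand).  gran is 8 for BLENDVPD, 4 for BLENDVPS, 1 for PBLENDVB. */
static void blendv_model ( uint8_t dst[16], const uint8_t src[16],
                           const uint8_t xmm0[16], size_t gran )
{
   for (size_t lane = 0; lane < 16; lane += gran) {
      /* The MSB of a little-endian lane lives in its last byte. */
      if (xmm0[lane + gran - 1] & 0x80) {
         for (size_t i = 0; i < gran; i++)
            dst[lane + i] = src[lane + i];
      }
   }
}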
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
index a54444a860..8f8e4a3179 100644
--- a/VEX/priv/host_amd64_isel.c
+++ b/VEX/priv/host_amd64_isel.c
@@ -3659,6 +3659,54 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
       return dst;
    }
 
+      case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
+                         goto do_SseAssistedVectorAndScalar;
+      case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
+                         goto do_SseAssistedVectorAndScalar;
+      do_SseAssistedVectorAndScalar: {
+         /* RRRufff! RRRufff code is what we're generating here.  Oh
+            well. */
+         vassert(fn != 0);
+         HReg dst = newVRegV(env);
+         HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+         HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+         HReg argp = newVRegI(env);
+         /* subq $112, %rsp         -- make a space*/
+         sub_from_rsp(env, 112);
+         /* leaq 48(%rsp), %r_argp  -- point into it */
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+                                        argp));
+         /* andq $-16, %r_argp      -- 16-align the pointer */
+         addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+                                         AMD64RMI_Imm( ~(UInt)15 ),
+                                         argp));
+         /* Prepare 2 vector arg regs:
+              leaq 0(%r_argp), %rdi
+              leaq 16(%r_argp), %rsi
+         */
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+                                        hregAMD64_RDI()));
+         addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
+                                        hregAMD64_RSI()));
+         /* Store the vector arg, at (%rsi):
+              movupd %argL, 0(%rsi)
+         */
+         addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
+                                          AMD64AMode_IR(0, hregAMD64_RSI())));
+         /* And get the scalar value into rdx */
+         addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
+
+         /* call the helper */
+         addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
+         /* fetch the result from memory, using %r_argp, which the
+            register allocator will keep alive across the call. */
+         addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
+                                          AMD64AMode_IR(0, argp)));
+         /* and finally, clear the space */
+         add_to_rsp(env, 112);
+         return dst;
+      }
+
       default:
          break;
    } /* switch (e->Iex.Binop.op) */
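In C terms, the sequence generated above boils down to a SysV AMD64 call: %rdi points at a 16-aligned result slot, %rsi at a staged copy of the vector operand, and %rdx carries the scalar shift amount. A rough equivalent, assuming V128 and the helper declarations from host_generic_simd128.h are visible here (do_SarN8x16 is an illustrative wrapper, not part of the patch):

#include "host_generic_simd128.h"   /* assumed to provide V128, UInt, helper decls */

/* Roughly what the emitted code does: stage the vector operand in a
   16-aligned scratch area, call the helper with (res, argL, nn), and
   reload the 128-bit result from the scratch area afterwards. */
static V128 do_SarN8x16 ( V128 argL, UInt nn )
{
   V128 area[2];               /* stands in for the stack scratch space */
   area[1] = argL;             /* vector arg stored at 16(%r_argp)      */
   h_generic_calc_SarN8x16(&area[0], &area[1], nn);  /* rdi, rsi, rdx   */
   return area[0];             /* result reloaded from 0(%r_argp)       */
}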
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
index 8ed516609d..2430e67bbe 100644
--- a/VEX/priv/host_generic_simd128.c
+++ b/VEX/priv/host_generic_simd128.c
@@ -94,6 +94,16 @@ static inline ULong cmpGT64S ( Long xx, Long yy )
             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
 }
 
+static inline ULong sar64 ( ULong v, UInt n )
+{
+   return ((Long)v) >> n;
+}
+
+static inline UChar sar8 ( UChar v, UInt n )
+{
+   return toUChar(((Char)v) >> n);
+}
+
 void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
 {
@@ -214,6 +224,44 @@ void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
    res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
 }
 
+/* ------------ Shifting ------------ */
+/* Note that because these primops are undefined if the shift amount
+   equals or exceeds the lane width, the shift amount is masked so
+   that the scalar shifts are always in range.  In fact, given the
+   semantics of these primops (Sar64x2, etc), it is an error if we
+   are ever given an out-of-range shift amount.
+*/
+void h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
+                               V128* argL, UInt nn)
+{
+   /* vassert(nn < 64); */
+   nn &= 63;
+   res->w64[0] = sar64(argL->w64[0], nn);
+   res->w64[1] = sar64(argL->w64[1], nn);
+}
+
+void h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
+                               V128* argL, UInt nn)
+{
+   /* vassert(nn < 8); */
+   nn &= 7;
+   res->w8[ 0] = sar8(argL->w8[ 0], nn);
+   res->w8[ 1] = sar8(argL->w8[ 1], nn);
+   res->w8[ 2] = sar8(argL->w8[ 2], nn);
+   res->w8[ 3] = sar8(argL->w8[ 3], nn);
+   res->w8[ 4] = sar8(argL->w8[ 4], nn);
+   res->w8[ 5] = sar8(argL->w8[ 5], nn);
+   res->w8[ 6] = sar8(argL->w8[ 6], nn);
+   res->w8[ 7] = sar8(argL->w8[ 7], nn);
+   res->w8[ 8] = sar8(argL->w8[ 8], nn);
+   res->w8[ 9] = sar8(argL->w8[ 9], nn);
+   res->w8[10] = sar8(argL->w8[10], nn);
+   res->w8[11] = sar8(argL->w8[11], nn);
+   res->w8[12] = sar8(argL->w8[12], nn);
+   res->w8[13] = sar8(argL->w8[13], nn);
+   res->w8[14] = sar8(argL->w8[14], nn);
+   res->w8[15] = sar8(argL->w8[15], nn);
+}
 
 /*---------------------------------------------------------------*/
 /*--- end                              host_generic_simd128.c ---*/
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
index 53850cbdd2..d764439e16 100644
--- a/VEX/priv/host_generic_simd128.h
+++ b/VEX/priv/host_generic_simd128.h
@@ -58,6 +58,8 @@ extern void h_generic_calc_Min16Ux8 ( /*OUT*/V128*, V128*, V128* );
 extern void h_generic_calc_Max8Sx16   ( /*OUT*/V128*, V128*, V128* );
 extern void h_generic_calc_Min8Sx16   ( /*OUT*/V128*, V128*, V128* );
 extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_SarN64x2   ( /*OUT*/V128*, V128*, UInt );
+extern void h_generic_calc_SarN8x16   ( /*OUT*/V128*, V128*, UInt );
 
 #endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
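As a quick illustration of why the guest-side translation can build its blend masks with these helpers: shifting each lane right arithmetically by lane-width minus one smears the lane's sign bit across the whole lane, giving all-zeroes or all-ones. A small standalone check (the test values and main are illustrative, not part of the patch; like the helpers themselves, it assumes the compiler implements >> on signed values as an arithmetic shift):

#include <stdio.h>
#include "host_generic_simd128.h"   /* assumed to provide V128 and the helpers */

int main ( void )
{
   V128 v, m;
   int  i;
   /* Bytes with the top bit set should become 0xFF, others 0x00. */
   for (i = 0; i < 16; i++)
      v.w8[i] = (i & 1) ? 0x80 : 0x7F;
   h_generic_calc_SarN8x16(&m, &v, 7);
   for (i = 0; i < 16; i++)
      printf("%02x ", m.w8[i]);      /* expect: 00 ff 00 ff ... */
   printf("\n");
   return 0;
}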