fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
case Iop_QNarrowBin16Sto8Ux8:
fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
+ case Iop_NarrowBin16to8x8:
+ fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
+ case Iop_NarrowBin32to16x4:
+ fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
case Iop_QSub8Sx8:
fn = (HWord)h_generic_calc_QSub8Sx8; break;
case Iop_QNarrowBin32Sto16Ux8:
fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
goto do_SseAssistedBinary;
+ case Iop_NarrowBin16to8x16:
+ fn = (HWord)h_generic_calc_NarrowBin16to8x16;
+ goto do_SseAssistedBinary;
+ case Iop_NarrowBin32to16x8:
+ fn = (HWord)h_generic_calc_NarrowBin32to16x8;
+ goto do_SseAssistedBinary;
do_SseAssistedBinary: {
/* RRRufff! RRRufff code is what we're generating here. Oh
well. */
return (UShort)xx;
}
-void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
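+/* Helpers for the non-saturating (truncating) NarrowBin cases below:
+   simply drop the top half of each lane. */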
+static inline UShort narrow32to16 ( UInt xx )
+{
+ return (UShort)xx;
+}
+
+static inline UChar narrow16to8 ( UShort xx )
+{
+ return (UChar)xx;
+}
+
+
+void VEX_REGPARM(3)
+ h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
}
-void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
}
-void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
}
-void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
}
-void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
}
-void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
}
-void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
}
-void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
}
-void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
}
-void h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w64[0] = cmpEQ64(argL->w64[0], argR->w64[0]);
res->w64[1] = cmpEQ64(argL->w64[1], argR->w64[1]);
}
-void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
semantics of these primops (Sar64x2, etc) it is an error if in
fact we are ever given an out-of-range shift amount.
*/
-void h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
+void /*not-regparm*/
+ h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
V128* argL, UInt nn)
{
/* vassert(nn < 64); */
res->w64[1] = sar64(argL->w64[1], nn);
}
-void h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
+void /*not-regparm*/
+ h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
V128* argL, UInt nn)
{
/* vassert(nn < 8); */
res->w8[15] = sar8(argL->w8[15], nn);
}
-void h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
+void VEX_REGPARM(3)
+ h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
V128* argL, V128* argR )
{
res->w16[0] = qnarrow32Sto16U(argR->w32[0]);
res->w16[7] = qnarrow32Sto16U(argL->w32[3]);
}
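+
+/* Truncating (non-saturating) narrows.  As with the QNarrowBin
+   helpers above, argR supplies the low half of the result and argL
+   the high half. */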
+void VEX_REGPARM(3)
+ h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w8[ 0] = narrow16to8(argR->w16[0]);
+ res->w8[ 1] = narrow16to8(argR->w16[1]);
+ res->w8[ 2] = narrow16to8(argR->w16[2]);
+ res->w8[ 3] = narrow16to8(argR->w16[3]);
+ res->w8[ 4] = narrow16to8(argR->w16[4]);
+ res->w8[ 5] = narrow16to8(argR->w16[5]);
+ res->w8[ 6] = narrow16to8(argR->w16[6]);
+ res->w8[ 7] = narrow16to8(argR->w16[7]);
+ res->w8[ 8] = narrow16to8(argL->w16[0]);
+ res->w8[ 9] = narrow16to8(argL->w16[1]);
+ res->w8[10] = narrow16to8(argL->w16[2]);
+ res->w8[11] = narrow16to8(argL->w16[3]);
+ res->w8[12] = narrow16to8(argL->w16[4]);
+ res->w8[13] = narrow16to8(argL->w16[5]);
+ res->w8[14] = narrow16to8(argL->w16[6]);
+ res->w8[15] = narrow16to8(argL->w16[7]);
+}
+
+void VEX_REGPARM(3)
+ h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w16[0] = narrow32to16(argR->w32[0]);
+ res->w16[1] = narrow32to16(argR->w32[1]);
+ res->w16[2] = narrow32to16(argR->w32[2]);
+ res->w16[3] = narrow32to16(argR->w32[3]);
+ res->w16[4] = narrow32to16(argL->w32[0]);
+ res->w16[5] = narrow32to16(argL->w32[1]);
+ res->w16[6] = narrow32to16(argL->w32[2]);
+ res->w16[7] = narrow32to16(argL->w32[3]);
+}
+
/*---------------------------------------------------------------*/
/*--- end host_generic_simd128.c ---*/
#include "libvex_basictypes.h"
-/* DO NOT MAKE THESE INTO REGPARM FNS! THIS WILL BREAK CALLING
- SEQUENCES GENERATED BY host-x86/isel.c. */
-
-extern void h_generic_calc_Mul32x4 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_Max32Sx4 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_Min32Sx4 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_Max32Ux4 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_Min32Ux4 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_Max16Ux8 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_Min16Ux8 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_Max8Sx16 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_Min8Sx16 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_CmpEQ64x2 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
-extern void h_generic_calc_SarN64x2 ( /*OUT*/V128*, V128*, UInt );
-extern void h_generic_calc_SarN8x16 ( /*OUT*/V128*, V128*, UInt );
-
-extern void h_generic_calc_QNarrowBin32Sto16Ux8
+extern VEX_REGPARM(3)
+ void h_generic_calc_Mul32x4 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_Max32Sx4 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_Min32Sx4 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_Max32Ux4 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_Min32Ux4 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_Max16Ux8 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_Min16Ux8 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_Max8Sx16 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_Min8Sx16 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_CmpEQ64x2 ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+
+extern /*not-regparm*/
+ void h_generic_calc_SarN64x2 ( /*OUT*/V128*, V128*, UInt );
+extern /*not-regparm*/
+ void h_generic_calc_SarN8x16 ( /*OUT*/V128*, V128*, UInt );
+
+extern VEX_REGPARM(3)
+ void h_generic_calc_QNarrowBin32Sto16Ux8
+ ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_NarrowBin16to8x16
+ ( /*OUT*/V128*, V128*, V128* );
+extern VEX_REGPARM(3)
+ void h_generic_calc_NarrowBin32to16x8
( /*OUT*/V128*, V128*, V128* );
-
#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
return (UChar)xx;
}
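+
+/* Truncation helpers for the non-saturating narrow cases below. */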
+static inline UShort narrow32to16 ( UInt xx )
+{
+ return (UShort)xx;
+}
+
+static inline UChar narrow16to8 ( UShort xx )
+{
+ return (UChar)xx;
+}
+
/* shifts: we don't care about out-of-range ones, since
that is dealt with at a higher level. */
);
}
+/* ------------ Truncating narrowing ------------ */
+
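+/* 64-bit counterparts of the V128 NarrowBin helpers in
+   host_generic_simd128.c: bb supplies the low half of the result and
+   aa the high half, matching the existing QNarrowBin helpers in this
+   file. */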
+ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
+{
+ UInt d = sel32x2_1(aa);
+ UInt c = sel32x2_0(aa);
+ UInt b = sel32x2_1(bb);
+ UInt a = sel32x2_0(bb);
+ return mk16x4(
+ narrow32to16(d),
+ narrow32to16(c),
+ narrow32to16(b),
+ narrow32to16(a)
+ );
+}
+
+ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
+{
+ UShort h = sel16x4_3(aa);
+ UShort g = sel16x4_2(aa);
+ UShort f = sel16x4_1(aa);
+ UShort e = sel16x4_0(aa);
+ UShort d = sel16x4_3(bb);
+ UShort c = sel16x4_2(bb);
+ UShort b = sel16x4_1(bb);
+ UShort a = sel16x4_0(bb);
+ return mk8x8(
+ narrow16to8(h),
+ narrow16to8(g),
+ narrow16to8(f),
+ narrow16to8(e),
+ narrow16to8(d),
+ narrow16to8(c),
+ narrow16to8(b),
+ narrow16to8(a)
+ );
+}
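+
+/* Worked example, assuming the usual mk16x4/sel32x2 lane conventions
+   used in this file: NarrowBin32to16x4(0xAAAABBBBCCCCDDDDULL,
+   0x1111222233334444ULL) == 0xBBBBDDDD22224444ULL, i.e. the low 16
+   bits of each 32-bit lane, with aa in the high half of the result
+   and bb in the low half. */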
+
/* ------------ Interleaving ------------ */
ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
extern ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong, ULong );
extern ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong, ULong );
extern ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong, ULong );
+extern ULong h_generic_calc_NarrowBin32to16x4 ( ULong, ULong );
+extern ULong h_generic_calc_NarrowBin16to8x8 ( ULong, ULong );
extern ULong h_generic_calc_InterleaveHI8x8 ( ULong, ULong );
extern ULong h_generic_calc_InterleaveLO8x8 ( ULong, ULong );
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
+#include "host_generic_simd128.h"
#include "host_x86_defs.h"
/* TODO 21 Apr 2005:
fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; goto binnish;
case Iop_QNarrowBin16Sto8Ux8:
fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; goto binnish;
+ case Iop_NarrowBin16to8x8:
+ fn = (HWord)h_generic_calc_NarrowBin16to8x8; goto binnish;
+ case Iop_NarrowBin32to16x4:
+ fn = (HWord)h_generic_calc_NarrowBin32to16x4; goto binnish;
case Iop_QSub8Sx8:
fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
# define SSE2_OR_ABOVE \
(env->hwcaps & VEX_HWCAPS_X86_SSE2)
+ HWord fn = 0; /* address of helper fn, if required */
MatchInfo mi;
Bool arg1isEReg = False;
X86SseOp op = Xsse_INVALID;
return dst;
}
+ case Iop_NarrowBin32to16x8:
+ fn = (HWord)h_generic_calc_NarrowBin32to16x8;
+ goto do_SseAssistedBinary;
+ case Iop_NarrowBin16to8x16:
+ fn = (HWord)h_generic_calc_NarrowBin16to8x16;
+ goto do_SseAssistedBinary;
+ do_SseAssistedBinary: {
+ /* As with the amd64 case (from which this is copied), we
+ generate pretty bad code. */
+ vassert(fn != 0);
+ HReg dst = newVRegV(env);
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg argp = newVRegI(env);
+ /* subl $112, %esp -- make a space */
+ sub_from_esp(env, 112);
+ /* leal 48(%esp), %r_argp -- point into it */
+ addInstr(env, X86Instr_Lea32(X86AMode_IR(48, hregX86_ESP()),
+ argp));
+ /* andl $-16, %r_argp -- 16-align the pointer */
+ addInstr(env, X86Instr_Alu32R(Xalu_AND,
+ X86RMI_Imm( ~(UInt)15 ),
+ argp));
+ /* Prepare 3 arg regs:
+ leal 0(%r_argp), %eax
+ leal 16(%r_argp), %edx
+ leal 32(%r_argp), %ecx
+ */
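+ /* (The helpers are declared VEX_REGPARM(3), which on x86 passes
+ the three args in EAX, EDX and ECX, hence the choice of
+ registers here.) */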
+ addInstr(env, X86Instr_Lea32(X86AMode_IR(0, argp),
+ hregX86_EAX()));
+ addInstr(env, X86Instr_Lea32(X86AMode_IR(16, argp),
+ hregX86_EDX()));
+ addInstr(env, X86Instr_Lea32(X86AMode_IR(32, argp),
+ hregX86_ECX()));
+ /* Store the two args, at (%edx) and (%ecx):
+ movupd %argL, 0(%edx)
+ movupd %argR, 0(%ecx)
+ */
+ addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argL,
+ X86AMode_IR(0, hregX86_EDX())));
+ addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argR,
+ X86AMode_IR(0, hregX86_ECX())));
+ /* call the helper */
+ addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn, 3 ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, X86Instr_SseLdSt(True/*isLoad*/, dst,
+ X86AMode_IR(0, argp)));
+ /* and finally, clear the space */
+ add_to_esp(env, 112);
+ return dst;
+ }
+
default:
break;
} /* switch (e->Iex.Binop.op) */
case Iop_QNarrowBin16Sto8Ux8: vex_printf("QNarrowBin16Sto8Ux8"); return;
case Iop_QNarrowBin16Sto8Sx8: vex_printf("QNarrowBin16Sto8Sx8"); return;
case Iop_QNarrowBin32Sto16Sx4: vex_printf("QNarrowBin32Sto16Sx4"); return;
+ case Iop_NarrowBin16to8x8: vex_printf("NarrowBin16to8x8"); return;
+ case Iop_NarrowBin32to16x4: vex_printf("NarrowBin32to16x4"); return;
case Iop_InterleaveHI8x8: vex_printf("InterleaveHI8x8"); return;
case Iop_InterleaveHI16x4: vex_printf("InterleaveHI16x4"); return;
case Iop_InterleaveHI32x2: vex_printf("InterleaveHI32x2"); return;
case Iop_PwAdd32Fx2:
case Iop_QNarrowBin32Sto16Sx4:
case Iop_QNarrowBin16Sto8Sx8: case Iop_QNarrowBin16Sto8Ux8:
+ case Iop_NarrowBin16to8x8: case Iop_NarrowBin32to16x4:
case Iop_Sub8x8: case Iop_Sub16x4: case Iop_Sub32x2:
case Iop_QSub8Sx8: case Iop_QSub16Sx4:
case Iop_QSub32Sx2: case Iop_QSub64Sx1:
*/
Iop_QNarrowBin16Sto8Ux8,
Iop_QNarrowBin16Sto8Sx8, Iop_QNarrowBin32Sto16Sx4,
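+   /* As above, but plain truncation of each lane: no saturation. */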
+ Iop_NarrowBin16to8x8, Iop_NarrowBin32to16x4,
/* INTERLEAVING */
/* Interleave lanes from low or high halves of