priv/host_ppc_defs.h \
priv/host_generic_regs.h \
priv/host_generic_simd64.h \
+ priv/host_generic_simd128.h \
priv/main_globals.h \
priv/main_util.h \
priv/guest_generic_x87.h \
priv/host_ppc_isel.o \
priv/host_generic_regs.o \
priv/host_generic_simd64.o \
+ priv/host_generic_simd128.o \
priv/host_generic_reg_alloc2.o \
priv/guest_generic_x87.o \
priv/guest_generic_bb_to_IR.o \
$(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_simd64.o \
-c priv/host_generic_simd64.c
+priv/host_generic_simd128.o: $(ALL_HEADERS) priv/host_generic_simd128.c
+ $(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_simd128.o \
+ -c priv/host_generic_simd128.c
+
priv/host_generic_reg_alloc2.o: $(ALL_HEADERS) priv/host_generic_reg_alloc2.c
$(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_reg_alloc2.o \
-c priv/host_generic_reg_alloc2.c
goto decode_success;
}
+ /* 66 0F 38 37 = PCMPGTQ
+    64x2 signed comparison (SSE4.2) */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x37) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+3,
+ "pcmpgtq", Iop_CmpGT64Sx2, False );
+ goto decode_success;
+ }
/* 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
Maximum of Packed Signed Double Word Integers (XMM)
- --
66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
Minimum of Packed Signed Double Word Integers (XMM) */
- if ( have66noF2noF3( pfx )
- && sz == 2
+ if ( have66noF2noF3( pfx ) && sz == 2
&& insn[0] == 0x0F && insn[1] == 0x38
- && ( (insn[2] == 0x3D) || (insn[2] == 0x39) ) ) {
-
- IRTemp reg_vec = newTemp(Ity_V128);
- IRTemp rom_vec = newTemp(Ity_V128);
- IRTemp mask_vec = newTemp(Ity_V128);
-
- Bool isPMAX = (insn[2] == 0x3D) ? True : False;
-
- HChar* str = isPMAX ? "pmaxsd" : "pminsd";
-
- modrm = insn[3];
- assign( reg_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
-
- if ( epartIsReg( modrm ) ) {
- assign( rom_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
- delta += 3+1;
- DIP( "%s %s,%s\n", str,
- nameXMMReg( eregOfRexRM(pfx, modrm) ),
- nameXMMReg( gregOfRexRM(pfx, modrm) ) );
- } else {
- addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
- assign( rom_vec, loadLE( Ity_V128, mkexpr(addr) ) );
- delta += 3+alen;
- DIP( "%s %s,%s\n", str, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
- }
-
- assign( mask_vec, binop( Iop_CmpGT32Sx4, mkexpr(reg_vec), mkexpr(rom_vec) ) );
-
- IRTemp max_min_vec = newTemp(Ity_V128);
- if ( isPMAX ) {
- assign( max_min_vec,
- binop( Iop_OrV128,
- binop( Iop_AndV128, mkexpr(rom_vec),
- unop( Iop_NotV128, mkexpr(mask_vec) ) ),
- binop( Iop_AndV128, mkexpr(reg_vec), mkexpr(mask_vec) ) ) );
- } else {
- assign( max_min_vec,
- binop( Iop_OrV128,
- binop( Iop_AndV128, mkexpr(reg_vec),
- unop( Iop_NotV128, mkexpr(mask_vec) ) ),
- binop( Iop_AndV128, mkexpr(rom_vec), mkexpr(mask_vec) ) ) );
- }
-
- putXMMReg( gregOfRexRM(pfx, modrm), mkexpr(max_min_vec) );
-
+ && (insn[2] == 0x3D || insn[2] == 0x39)) {
+ Bool isMAX = insn[2] == 0x3D;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxsd" : "pminsd",
+ isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
+ False
+ );
goto decode_success;
}
-
/* 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
Maximum of Packed Unsigned Doubleword Integers (XMM)
66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
Minimum of Packed Unsigned Doubleword Integers (XMM) */
- if ( have66noF2noF3( pfx )
- && sz == 2
+ if ( have66noF2noF3( pfx ) && sz == 2
&& insn[0] == 0x0F && insn[1] == 0x38
&& (insn[2] == 0x3F || insn[2] == 0x3B)) {
+ Bool isMAX = insn[2] == 0x3F;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxud" : "pminud",
+ isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
+ False
+ );
+ goto decode_success;
+ }
- Bool is_max = insn[2] == 0x3F;
- IRTemp reg_vec = newTemp(Ity_V128);
- IRTemp rom_vec = newTemp(Ity_V128);
- IRTemp mask_vec = newTemp(Ity_V128);
- IRTemp and_vec = newTemp(Ity_V128);
- IRTemp not_vec = newTemp(Ity_V128);
-
- modrm = insn[3];
- assign( reg_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
-
- if ( epartIsReg( modrm ) ) {
- assign( rom_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
- delta += 3+1;
- DIP( "p%sud %s,%s\n",
- is_max ? "max" : "min",
- nameXMMReg( eregOfRexRM(pfx, modrm) ),
- nameXMMReg( gregOfRexRM(pfx, modrm) ) );
- } else {
- addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
- assign( rom_vec, loadLE( Ity_V128, mkexpr(addr) ) );
- delta += 3+alen;
- DIP( "p%sd %s,%s\n",
- is_max ? "max" : "min",
- dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
- }
-
- /* the foll. simulates Iop_CmpGT32Ux4 (not implemented)
- c.f. Hacker's Delight, S2-11, p.23 */
- assign( mask_vec,
- binop( Iop_XorV128,
- binop( Iop_XorV128,
- binop( Iop_CmpGT32Sx4, mkexpr(reg_vec), mkexpr(rom_vec) ),
- binop( Iop_SarN32x4, mkexpr(reg_vec), mkU8(31) ) ),
- binop( Iop_SarN32x4, mkexpr(rom_vec), mkU8(31) ) ) );
-
- assign( and_vec,
- binop( Iop_AndV128, mkexpr(is_max ? reg_vec : rom_vec),
- mkexpr(mask_vec) ) );
- assign( not_vec,
- binop( Iop_AndV128, mkexpr(is_max ? rom_vec : reg_vec),
- unop( Iop_NotV128, mkexpr(mask_vec) ) ) );
-
- putXMMReg( gregOfRexRM(pfx, modrm),
- binop( Iop_OrV128, mkexpr(not_vec), mkexpr(and_vec) ) );
-
+ /* 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
+ Maximum of Packed Unsigned Word Integers (XMM)
+ 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
+ Minimum of Packed Unsigned Word Integers (XMM)
+ */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x3E || insn[2] == 0x3A)) {
+ Bool isMAX = insn[2] == 0x3E;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxuw" : "pminuw",
+ isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
+ False
+ );
goto decode_success;
}
+ /* 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128
+ 8Sx16 (signed) max
+ 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128
+ 8Sx16 (signed) min
+ */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x3C || insn[2] == 0x38)) {
+ Bool isMAX = insn[2] == 0x3C;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxsb" : "pminsb",
+ isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
+ False
+ );
+ goto decode_success;
+ }
/* 66 0f 38 20 /r = PMOVSXBW xmm1, xmm2/m64
Packed Move with Sign Extend from Byte to Word (XMM) */
}
+ /* 66 0f 38 40 /r = PMULLD xmm1, xmm2/m128
+ 32x4 integer multiply from xmm2/m128 to xmm1 */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x40 ) {
+
+ modrm = insn[3];
+
+ IRTemp argL = newTemp(Ity_V128);
+ IRTemp argR = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmulld %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
+ delta += 3+alen;
+ DIP( "pmulld %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
+
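+ /* Keep the low 32 bits of each 32x32 product, as PMULLD requires. */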
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
+
+ goto decode_success;
+ }
+
+
/* F3 0F B8 = POPCNT{W,L,Q}
Count the number of 1 bits in a register
*/
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
+#include "host_generic_simd128.h"
#include "host_amd64_defs.h"
/* DO NOT CALL THIS DIRECTLY */
static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
{
- Bool arg1isEReg = False;
+ HWord fn = 0; /* address of helper fn, if required */
+ Bool arg1isEReg = False;
AMD64SseOp op = Asse_INVALID;
IRType ty = typeOfIRExpr(env->type_env,e);
vassert(e);
return dst;
}
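+ /* None of the following ops have a direct SSE2 instruction, so
+    route them to out-of-line helpers in host_generic_simd128.c. */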
+ case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
+ goto do_SseAssistedBinary;
+ case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
+ goto do_SseAssistedBinary;
+ case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
+ goto do_SseAssistedBinary;
+ case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
+ goto do_SseAssistedBinary;
+ case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
+ goto do_SseAssistedBinary;
+ case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
+ goto do_SseAssistedBinary;
+ case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
+ goto do_SseAssistedBinary;
+ case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
+ goto do_SseAssistedBinary;
+ case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
+ goto do_SseAssistedBinary;
+ case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
+ goto do_SseAssistedBinary;
+ do_SseAssistedBinary: {
+ /* Not pretty: rather than generating the operation inline, we
+    punt to an out-of-line helper, passing args via memory.  Oh
+    well. */
+ vassert(fn != 0);
+ HReg dst = newVRegV(env);
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg argp = newVRegI(env);
+ /* subq $112, %rsp -- make some space */
+ sub_from_rsp(env, 112);
+ /* leaq 48(%rsp), %r_argp -- point into it */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+ argp));
+ /* andq $-16, %r_argp -- 16-align the pointer */
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm( ~(UInt)15 ),
+ argp));
+ /* Prepare 3 arg regs:
+ leaq 0(%r_argp), %rdi
+ leaq 16(%r_argp), %rsi
+ leaq 32(%r_argp), %rdx
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
+ hregAMD64_RSI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
+ hregAMD64_RDX()));
+ /* Store the two args, at (%rsi) and (%rdx):
+ movupd %argL, 0(%rsi)
+ movupd %argR, 0(%rdx)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
+ AMD64AMode_IR(0, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
+ AMD64AMode_IR(0, hregAMD64_RDX())));
+ /* call the helper */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
+ AMD64AMode_IR(0, argp)));
+ /* and finally, clear the space */
+ add_to_rsp(env, 112);
+ return dst;
+ }
+
default:
break;
} /* switch (e->Iex.Binop.op) */
--- /dev/null
+++ priv/host_generic_simd128.c
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2010-2010 OpenWorks GbR
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ from IR. */
+
+#include "libvex_basictypes.h"
+#include "host_generic_simd128.h"
+
+
+/* Primitive helpers always take args of the real type (signed vs
+ unsigned) but return an unsigned result, so there's no conversion
+ weirdness when stuffing results back in the V128 union fields,
+ which are all unsigned. */
+
+static inline UInt mul32 ( Int xx, Int yy )
+{
+   /* Do the multiply at 64 bits to avoid signed-int overflow; only
+      the low 32 bits of the product are kept. */
+   Long t = ((Long)xx) * ((Long)yy);
+   return toUInt(t);
+}
+
+static inline UInt max32S ( Int xx, Int yy )
+{
+ return toUInt((xx > yy) ? xx : yy);
+}
+
+static inline UInt min32S ( Int xx, Int yy )
+{
+ return toUInt((xx < yy) ? xx : yy);
+}
+
+static inline UInt max32U ( UInt xx, UInt yy )
+{
+ return toUInt((xx > yy) ? xx : yy);
+}
+
+static inline UInt min32U ( UInt xx, UInt yy )
+{
+ return toUInt((xx < yy) ? xx : yy);
+}
+
+static inline UShort max16U ( UShort xx, UShort yy )
+{
+ return toUShort((xx > yy) ? xx : yy);
+}
+
+static inline UShort min16U ( UShort xx, UShort yy )
+{
+ return toUShort((xx < yy) ? xx : yy);
+}
+
+static inline UChar max8S ( Char xx, Char yy )
+{
+ return toUChar((xx > yy) ? xx : yy);
+}
+
+static inline UChar min8S ( Char xx, Char yy )
+{
+ return toUChar((xx < yy) ? xx : yy);
+}
+
+static inline ULong cmpGT64S ( Long xx, Long yy )
+{
+ return (((Long)xx) > ((Long)yy))
+ ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
+}
+
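+/* Lane-by-lane implementations of the correspondingly-named IR
+   primops.  Each computes a V128 result from two V128 operands. */
+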
+void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
+ res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
+ res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
+ res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
+ res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
+ res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
+ res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
+ res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
+ res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
+ res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
+ res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
+ res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
+ res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
+ res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
+ res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
+ res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
+ res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
+ res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
+ res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
+ res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
+ res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
+ res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
+ res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
+}
+
+void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
+ res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
+ res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
+ res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
+ res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
+ res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
+ res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
+ res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
+}
+
+void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
+ res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
+ res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
+ res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
+ res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
+ res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
+ res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
+ res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
+ res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
+ res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
+ res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
+ res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
+ res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
+ res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
+ res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
+ res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
+}
+
+void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
+ res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
+ res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
+ res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
+ res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
+ res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
+ res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
+ res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
+ res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
+ res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
+ res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
+ res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
+ res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
+ res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
+ res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
+ res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
+}
+
+void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
+ res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
--- /dev/null
+++ priv/host_generic_simd128.h
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2010-2010 OpenWorks GbR
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ as clean helper functions from IR.
+
+ These will get called from generated code and therefore should be
+ well behaved -- no floating point or mmx insns, just straight
+ integer code.
+
+ Each function implements the correspondingly-named IR primop.
+*/
+
+#ifndef __VEX_HOST_GENERIC_SIMD128_H
+#define __VEX_HOST_GENERIC_SIMD128_H
+
+#include "libvex_basictypes.h"
+
+/* A union for doing 128-bit primitives conveniently.  It is not
+   public and so not placed in pub/. */
+typedef
+   union {
+      UChar  w8[16];
+      UShort w16[8];
+      UInt   w32[4];
+      ULong  w64[2];
+   }
+   V128;
+
+/* DO NOT MAKE THESE INTO REGPARM FNS!  THIS WILL BREAK CALLING
+   SEQUENCES GENERATED BY host_amd64_isel.c. */
+
+extern void h_generic_calc_Mul32x4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Sx4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Sx4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Ux4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Ux4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max16Ux8 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min16Ux8 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max8Sx16 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min8Sx16 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+
+
+#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return;
case Iop_Mul16x4: vex_printf("Mul16x4"); return;
case Iop_Mul32x2: vex_printf("Mul32x2"); return;
+ case Iop_Mul32x4: vex_printf("Mul32x4"); return;
case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return;
case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return;
case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return;
case Iop_CmpGT8Sx16: vex_printf("CmpGT8Sx16"); return;
case Iop_CmpGT16Sx8: vex_printf("CmpGT16Sx8"); return;
case Iop_CmpGT32Sx4: vex_printf("CmpGT32Sx4"); return;
+ case Iop_CmpGT64Sx2: vex_printf("CmpGT64Sx2"); return;
case Iop_CmpGT8Ux16: vex_printf("CmpGT8Ux16"); return;
case Iop_CmpGT16Ux8: vex_printf("CmpGT16Ux8"); return;
case Iop_CmpGT32Ux4: vex_printf("CmpGT32Ux4"); return;
case Iop_Sub32x4: case Iop_Sub64x2:
case Iop_QSub8Ux16: case Iop_QSub16Ux8: case Iop_QSub32Ux4:
case Iop_QSub8Sx16: case Iop_QSub16Sx8: case Iop_QSub32Sx4:
- case Iop_Mul16x8:
+ case Iop_Mul16x8: case Iop_Mul32x4:
case Iop_MulHi16Ux8: case Iop_MulHi32Ux4:
case Iop_MulHi16Sx8: case Iop_MulHi32Sx4:
case Iop_MullEven8Ux16: case Iop_MullEven16Ux8:
case Iop_Min8Ux16: case Iop_Min16Ux8: case Iop_Min32Ux4:
case Iop_CmpEQ8x16: case Iop_CmpEQ16x8: case Iop_CmpEQ32x4:
case Iop_CmpGT8Sx16: case Iop_CmpGT16Sx8: case Iop_CmpGT32Sx4:
+ case Iop_CmpGT64Sx2:
case Iop_CmpGT8Ux16: case Iop_CmpGT16Ux8: case Iop_CmpGT32Ux4:
case Iop_Shl8x16: case Iop_Shl16x8: case Iop_Shl32x4:
case Iop_Shr8x16: case Iop_Shr16x8: case Iop_Shr32x4:
#include "guest_arm_defs.h"
#include "guest_ppc_defs.h"
+#include "host_generic_simd128.h"
+
/* This file contains the top level interface to the library. */
vassert(4 == sizeof(Addr32));
vassert(8 == sizeof(Addr64));
vassert(16 == sizeof(U128));
+ vassert(16 == sizeof(V128));
vassert(sizeof(void*) == 4 || sizeof(void*) == 8);
vassert(sizeof(void*) == sizeof(int*));
Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4,
/* MULTIPLICATION (normal / high half of signed/unsigned) */
- Iop_Mul16x8,
+ Iop_Mul16x8, Iop_Mul32x4,
Iop_MulHi16Ux8, Iop_MulHi32Ux4,
Iop_MulHi16Sx8, Iop_MulHi32Sx4,
/* (widening signed/unsigned of even lanes, with lowest lane=zero) */
/* COMPARISON */
Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4,
- Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4,
+ Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2,
Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4,
/* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */