From: Julian Seward Date: Fri, 18 Jun 2010 08:17:41 +0000 (+0000) Subject: Implement SSE4 instructions: PCMPGTQ PMAXUD PMINUD PMAXSB PMINSB PMULLD X-Git-Tag: svn/VALGRIND_3_6_1^2~86 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f3517dfd35be26de45197201e30951845b26ac3e;p=thirdparty%2Fvalgrind.git Implement SSE4 instructions: PCMPGTQ PMAXUD PMINUD PMAXSB PMINSB PMULLD I believe this covers everything that gcc-4.4 and gcc-4.5 will generate with "-O3 -msse4.2". Note, this commit changes the set of IR ops and so requires a from-scratch rebuild of the tree. git-svn-id: svn://svn.valgrind.org/vex/trunk@1984 --- diff --git a/VEX/Makefile-gcc b/VEX/Makefile-gcc index c7d42a3c50..d1e1df4837 100644 --- a/VEX/Makefile-gcc +++ b/VEX/Makefile-gcc @@ -17,6 +17,7 @@ PRIV_HEADERS = priv/host_x86_defs.h \ priv/host_ppc_defs.h \ priv/host_generic_regs.h \ priv/host_generic_simd64.h \ + priv/host_generic_simd128.h \ priv/main_globals.h \ priv/main_util.h \ priv/guest_generic_x87.h \ @@ -44,6 +45,7 @@ LIB_OBJS = priv/ir_defs.o \ priv/host_ppc_isel.o \ priv/host_generic_regs.o \ priv/host_generic_simd64.o \ + priv/host_generic_simd128.o \ priv/host_generic_reg_alloc2.o \ priv/guest_generic_x87.o \ priv/guest_generic_bb_to_IR.o \ @@ -262,6 +264,10 @@ priv/host_generic_simd64.o: $(ALL_HEADERS) priv/host_generic_simd64.c $(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_simd64.o \ -c priv/host_generic_simd64.c +priv/host_generic_simd128.o: $(ALL_HEADERS) priv/host_generic_simd128.c + $(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_simd128.o \ + -c priv/host_generic_simd128.c + priv/host_generic_reg_alloc2.o: $(ALL_HEADERS) priv/host_generic_reg_alloc2.c $(CC) $(CCFLAGS) $(ALL_INCLUDES) -o priv/host_generic_reg_alloc2.o \ -c priv/host_generic_reg_alloc2.c diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c index 0b4748fe98..0647b8d0f7 100644 --- a/VEX/priv/guest_amd64_toIR.c +++ b/VEX/priv/guest_amd64_toIR.c @@ -14406,121 +14406,85 @@ 
DisResult disInstr_AMD64_WRK ( goto decode_success; } + /* 66 0F 38 37 = PCMPGTQ + 64x2 comparison (signed, presumably; the Intel docs don't say :-) + */ + if ( have66noF2noF3( pfx ) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x37) { + delta = dis_SSEint_E_to_G( vbi, pfx, delta+3, + "pcmpgtq", Iop_CmpGT64Sx2, False ); + goto decode_success; + } /* 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128 Maximum of Packed Signed Double Word Integers (XMM) - -- 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128 Minimum of Packed Signed Double Word Integers (XMM) */ - if ( have66noF2noF3( pfx ) - && sz == 2 + if ( have66noF2noF3( pfx ) && sz == 2 && insn[0] == 0x0F && insn[1] == 0x38 - && ( (insn[2] == 0x3D) || (insn[2] == 0x39) ) ) { - - IRTemp reg_vec = newTemp(Ity_V128); - IRTemp rom_vec = newTemp(Ity_V128); - IRTemp mask_vec = newTemp(Ity_V128); - - Bool isPMAX = (insn[2] == 0x3D) ? True : False; - - HChar* str = isPMAX ? "pmaxsd" : "pminsd"; - - modrm = insn[3]; - assign( reg_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) ); - - if ( epartIsReg( modrm ) ) { - assign( rom_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) ); - delta += 3+1; - DIP( "%s %s,%s\n", str, - nameXMMReg( eregOfRexRM(pfx, modrm) ), - nameXMMReg( gregOfRexRM(pfx, modrm) ) ); - } else { - addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 ); - assign( rom_vec, loadLE( Ity_V128, mkexpr(addr) ) ); - delta += 3+alen; - DIP( "%s %s,%s\n", str, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); - } - - assign( mask_vec, binop( Iop_CmpGT32Sx4, mkexpr(reg_vec), mkexpr(rom_vec) ) ); - - IRTemp max_min_vec = newTemp(Ity_V128); - if ( isPMAX ) { - assign( max_min_vec, - binop( Iop_OrV128, - binop( Iop_AndV128, mkexpr(rom_vec), - unop( Iop_NotV128, mkexpr(mask_vec) ) ), - binop( Iop_AndV128, mkexpr(reg_vec), mkexpr(mask_vec) ) ) ); - } else { - assign( max_min_vec, - binop( Iop_OrV128, - binop( Iop_AndV128, mkexpr(reg_vec), - unop( Iop_NotV128, mkexpr(mask_vec) ) ), - binop( Iop_AndV128, mkexpr(rom_vec), 
mkexpr(mask_vec) ) ) ); - } - - putXMMReg( gregOfRexRM(pfx, modrm), mkexpr(max_min_vec) ); - + && (insn[2] == 0x3D || insn[2] == 0x39)) { + Bool isMAX = insn[2] == 0x3D; + delta = dis_SSEint_E_to_G( + vbi, pfx, delta+3, + isMAX ? "pmaxsd" : "pminsd", + isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4, + False + ); goto decode_success; } - /* 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128 Maximum of Packed Unsigned Doubleword Integers (XMM) 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128 Minimum of Packed Unsigned Doubleword Integers (XMM) */ - if ( have66noF2noF3( pfx ) - && sz == 2 + if ( have66noF2noF3( pfx ) && sz == 2 && insn[0] == 0x0F && insn[1] == 0x38 && (insn[2] == 0x3F || insn[2] == 0x3B)) { + Bool isMAX = insn[2] == 0x3F; + delta = dis_SSEint_E_to_G( + vbi, pfx, delta+3, + isMAX ? "pmaxud" : "pminud", + isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4, + False + ); + goto decode_success; + } - Bool is_max = insn[2] == 0x3F; - IRTemp reg_vec = newTemp(Ity_V128); - IRTemp rom_vec = newTemp(Ity_V128); - IRTemp mask_vec = newTemp(Ity_V128); - IRTemp and_vec = newTemp(Ity_V128); - IRTemp not_vec = newTemp(Ity_V128); - - modrm = insn[3]; - assign( reg_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) ); - - if ( epartIsReg( modrm ) ) { - assign( rom_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) ); - delta += 3+1; - DIP( "p%sud %s,%s\n", - is_max ? "max" : "min", - nameXMMReg( eregOfRexRM(pfx, modrm) ), - nameXMMReg( gregOfRexRM(pfx, modrm) ) ); - } else { - addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 ); - assign( rom_vec, loadLE( Ity_V128, mkexpr(addr) ) ); - delta += 3+alen; - DIP( "p%sd %s,%s\n", - is_max ? "max" : "min", - dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); - } - - /* the foll. simulates Iop_CmpGT32Ux4 (not implemented) - c.f. 
Hacker's Delight, S2-11, p.23 */ - assign( mask_vec, - binop( Iop_XorV128, - binop( Iop_XorV128, - binop( Iop_CmpGT32Sx4, mkexpr(reg_vec), mkexpr(rom_vec) ), - binop( Iop_SarN32x4, mkexpr(reg_vec), mkU8(31) ) ), - binop( Iop_SarN32x4, mkexpr(rom_vec), mkU8(31) ) ) ); - - assign( and_vec, - binop( Iop_AndV128, mkexpr(is_max ? reg_vec : rom_vec), - mkexpr(mask_vec) ) ); - assign( not_vec, - binop( Iop_AndV128, mkexpr(is_max ? rom_vec : reg_vec), - unop( Iop_NotV128, mkexpr(mask_vec) ) ) ); - - putXMMReg( gregOfRexRM(pfx, modrm), - binop( Iop_OrV128, mkexpr(not_vec), mkexpr(and_vec) ) ); - + /* 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128 + Maximum of Packed Unsigned Word Integers (XMM) + 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128 + Minimum of Packed Unsigned Word Integers (XMM) + */ + if ( have66noF2noF3( pfx ) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x3E || insn[2] == 0x3A)) { + Bool isMAX = insn[2] == 0x3E; + delta = dis_SSEint_E_to_G( + vbi, pfx, delta+3, + isMAX ? "pmaxuw" : "pminuw", + isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8, + False + ); goto decode_success; } + /* 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128 + 8Sx16 (signed) max + 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128 + 8Sx16 (signed) min + */ + if ( have66noF2noF3( pfx ) && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 + && (insn[2] == 0x3C || insn[2] == 0x38)) { + Bool isMAX = insn[2] == 0x3C; + delta = dis_SSEint_E_to_G( + vbi, pfx, delta+3, + isMAX ? "pmaxsb" : "pminsb", + isMAX ? 
Iop_Max8Sx16 : Iop_Min8Sx16, + False + ); + goto decode_success; + } /* 66 0f 38 20 /r = PMOVSXBW xmm1, xmm2/m64 Packed Move with Sign Extend from Byte to Word (XMM) */ @@ -14964,6 +14928,40 @@ DisResult disInstr_AMD64_WRK ( } + /* 66 0f 38 40 /r = PMULLD xmm1, xmm2/m128 + 32x4 integer multiply from xmm2/m128 to xmm1 */ + if ( have66noF2noF3( pfx ) + && sz == 2 + && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x40 ) { + + modrm = insn[3]; + + IRTemp argL = newTemp(Ity_V128); + IRTemp argR = newTemp(Ity_V128); + + if ( epartIsReg(modrm) ) { + assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) ); + delta += 3+1; + DIP( "pmulld %s,%s\n", + nameXMMReg( eregOfRexRM(pfx, modrm) ), + nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } else { + addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 ); + assign( argL, loadLE( Ity_V128, mkexpr(addr) )); + delta += 3+alen; + DIP( "pmulld %s,%s\n", + dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) ); + } + + assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) )); + + putXMMReg( gregOfRexRM(pfx, modrm), + binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) ); + + goto decode_success; + } + + /* F3 0F B8 = POPCNT{W,L,Q} Count the number of 1 bits in a register */ diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c index 0a0a3a621b..ddf7503e96 100644 --- a/VEX/priv/host_amd64_isel.c +++ b/VEX/priv/host_amd64_isel.c @@ -42,6 +42,7 @@ #include "main_globals.h" #include "host_generic_regs.h" #include "host_generic_simd64.h" +#include "host_generic_simd128.h" #include "host_amd64_defs.h" @@ -3158,7 +3159,8 @@ static HReg iselVecExpr ( ISelEnv* env, IRExpr* e ) /* DO NOT CALL THIS DIRECTLY */ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) { - Bool arg1isEReg = False; + HWord fn = 0; /* address of helper fn, if required */ + Bool arg1isEReg = False; AMD64SseOp op = Asse_INVALID; IRType ty = typeOfIRExpr(env->type_env,e); vassert(e); @@ -3614,6 +3616,73 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e ) return dst; 
} + case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4; + goto do_SseAssistedBinary; + case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4; + goto do_SseAssistedBinary; + case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4; + goto do_SseAssistedBinary; + case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4; + goto do_SseAssistedBinary; + case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4; + goto do_SseAssistedBinary; + case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8; + goto do_SseAssistedBinary; + case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8; + goto do_SseAssistedBinary; + case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16; + goto do_SseAssistedBinary; + case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16; + goto do_SseAssistedBinary; + case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2; + goto do_SseAssistedBinary; + do_SseAssistedBinary: { + /* RRRufff! RRRufff code is what we're generating here. Oh + well. */ + vassert(fn != 0); + HReg dst = newVRegV(env); + HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); + HReg argR = iselVecExpr(env, e->Iex.Binop.arg2); + HReg argp = newVRegI(env); + /* subq $112, %rsp -- make a space*/ + sub_from_rsp(env, 112); + /* leaq 48(%rsp), %r_argp -- point into it */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), + argp)); + /* andq $-16, %r_argp -- 16-align the pointer */ + addInstr(env, AMD64Instr_Alu64R(Aalu_AND, + AMD64RMI_Imm( ~(UInt)15 ), + argp)); + /* Prepare 3 arg regs: + leaq 0(%r_argp), %rdi + leaq 16(%r_argp), %rsi + leaq 32(%r_argp), %rdx + */ + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), + hregAMD64_RDI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp), + hregAMD64_RSI())); + addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp), + hregAMD64_RDX())); + /* Store the two args, at (%rsi) and (%rdx): + movupd %argL, 0(%rsi) + movupd %argR, 0(%rdx) + */ + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL, + AMD64AMode_IR(0, 
hregAMD64_RSI()))); + addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR, + AMD64AMode_IR(0, hregAMD64_RDX()))); + /* call the helper */ + addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 )); + /* fetch the result from memory, using %r_argp, which the + register allocator will keep alive across the call. */ + addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst, + AMD64AMode_IR(0, argp))); + /* and finally, clear the space */ + add_to_rsp(env, 112); + return dst; + } + default: break; } /* switch (e->Iex.Binop.op) */ diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c new file mode 100644 index 0000000000..8ed516609d --- /dev/null +++ b/VEX/priv/host_generic_simd128.c @@ -0,0 +1,220 @@ + +/*---------------------------------------------------------------*/ +/*--- begin host_generic_simd128.c ---*/ +/*---------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2010-2010 OpenWorks GbR + info@open-works.net + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + The GNU General Public License is contained in the file COPYING. +*/ + +/* Generic helper functions for doing 128-bit SIMD arithmetic in cases + where the instruction selectors cannot generate code in-line. 
+   These are purely back-end entities and cannot be seen/referenced
+   from IR. */
+
+#include "libvex_basictypes.h"
+#include "host_generic_simd128.h"
+
+
+/* Primitive helpers always take args of the real type (signed vs
+   unsigned) but return an unsigned result, so there's no conversion
+   weirdness when stuffing results back in the V128 union fields,
+   which are all unsigned. */
+
+/* One lane of a 32x4 multiply.  The product is formed on unsigned
+   operands: a signed Int * Int that overflows is undefined behaviour
+   in C, whereas the unsigned product wraps and so yields exactly the
+   low 32 bits that the lane must hold. */
+static inline UInt mul32 ( Int xx, Int yy )
+{
+   UInt t = ((UInt)xx) * ((UInt)yy);
+   return toUInt(t);
+}
+
+/* Signed 32-bit maximum / minimum of one lane. */
+static inline UInt max32S ( Int xx, Int yy )
+{
+   return toUInt((xx > yy) ? xx : yy);
+}
+
+static inline UInt min32S ( Int xx, Int yy )
+{
+   return toUInt((xx < yy) ? xx : yy);
+}
+
+/* Unsigned 32-bit maximum / minimum of one lane. */
+static inline UInt max32U ( UInt xx, UInt yy )
+{
+   return toUInt((xx > yy) ? xx : yy);
+}
+
+static inline UInt min32U ( UInt xx, UInt yy )
+{
+   return toUInt((xx < yy) ? xx : yy);
+}
+
+/* Unsigned 16-bit maximum / minimum of one lane. */
+static inline UShort max16U ( UShort xx, UShort yy )
+{
+   return toUShort((xx > yy) ? xx : yy);
+}
+
+static inline UShort min16U ( UShort xx, UShort yy )
+{
+   return toUShort((xx < yy) ? xx : yy);
+}
+
+/* Signed 8-bit maximum / minimum of one lane. */
+static inline UChar max8S ( Char xx, Char yy )
+{
+   return toUChar((xx > yy) ? xx : yy);
+}
+
+static inline UChar min8S ( Char xx, Char yy )
+{
+   return toUChar((xx < yy) ? xx : yy);
+}
+
+/* Signed 64-bit greater-than for one lane: all ones if xx > yy,
+   all zeroes otherwise. */
+static inline ULong cmpGT64S ( Long xx, Long yy )
+{
+   return (((Long)xx) > ((Long)yy))
+             ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
+}
+
+/* Each h_generic_calc_* function below applies the matching
+   primitive helper to every lane of argL and argR and writes the
+   lane results to res. */
+
+/* Four 32-bit lanes: low 32 bits of argL * argR. */
+void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
+                              V128* argL, V128* argR )
+{
+   res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
+   res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
+   res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
+   res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
+}
+
+/* Four 32-bit lanes: signed maximum. */
+void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
+   res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
+   res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
+   res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
+}
+
+/* Four 32-bit lanes: signed minimum. */
+void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
+   res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
+   res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
+   res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
+}
+
+/* Four 32-bit lanes: unsigned maximum. */
+void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
+   res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
+   res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
+   res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
+}
+
+/* Four 32-bit lanes: unsigned minimum. */
+void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
+   res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
+   res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
+   res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
+}
+
+/* Eight 16-bit lanes: unsigned maximum. */
+void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
+   res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
+   res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
+   res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
+   res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
+   res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
+   res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
+   res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
+}
+
+/* Eight 16-bit lanes: res gets the unsigned minimum of argL and
+   argR, lane by lane. */
+void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   UInt lane;
+   for (lane = 0; lane < 8; lane++) {
+      res->w16[lane] = min16U(argL->w16[lane], argR->w16[lane]);
+   }
+}
+
+/* Sixteen 8-bit lanes: res gets the signed maximum of argL and
+   argR, lane by lane. */
+void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   UInt lane;
+   for (lane = 0; lane < 16; lane++) {
+      res->w8[lane] = max8S(argL->w8[lane], argR->w8[lane]);
+   }
+}
+
+/* Sixteen 8-bit lanes: res gets the signed minimum of argL and
+   argR, lane by lane. */
+void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
+                               V128* argL, V128* argR )
+{
+   UInt lane;
+   for (lane = 0; lane < 16; lane++) {
+      res->w8[lane] = min8S(argL->w8[lane], argR->w8[lane]);
+   }
+}
+
+/* Two 64-bit lanes: each lane of res becomes all ones when the
+   argL lane is greater (signed) than the argR lane, else zero. */
+void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
+                                 V128* argL, V128* argR )
+{
+   UInt lane;
+   for (lane = 0; lane < 2; lane++) {
+      res->w64[lane] = cmpGT64S(argL->w64[lane], argR->w64[lane]);
+   }
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
new file mode 100644
index 0000000000..125514a737
--- /dev/null
+++ b/VEX/priv/host_generic_simd128.h
@@ -0,0 +1,79 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2010-2010 OpenWorks GbR
+      info@open-works.net
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+   02110-1301, USA.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+   where the instruction selectors cannot generate code in-line.
+   These are purely back-end entities and cannot be seen/referenced
+   as clean helper functions from IR.
+
+   These will get called from generated code and therefore should be
+   well behaved -- no floating point or mmx insns, just straight
+   integer code.
+
+   Each function implements the correspondingly-named IR primop.
+*/
+
+#ifndef __VEX_HOST_GENERIC_SIMD128_H
+#define __VEX_HOST_GENERIC_SIMD128_H
+
+/* Must come before V128: the union below is built from the basic
+   types UChar, UShort, UInt and ULong, so this header has to bring
+   them into scope itself rather than rely on the includer. */
+#include "libvex_basictypes.h"
+
+/* A union for doing 128-bit primitives conveniently.  It is not
+   public and so not placed in pub/.  The four members are
+   alternative views of the same storage: 16 x 8-bit, 8 x 16-bit,
+   4 x 32-bit or 2 x 64-bit lanes. */
+typedef
+   union {
+      UChar  w8[16];
+      UShort w16[8];
+      UInt   w32[4];
+      ULong  w64[2];
+   }
+   V128;
+
+
+/* DO NOT MAKE THESE INTO REGPARM FNS!  THIS WILL BREAK CALLING
+   SEQUENCES GENERATED BY THE INSTRUCTION SELECTORS (e.g.
+   host_amd64_isel.c).
+
+   In every function the first argument receives the result and the
+   second and third are the left and right operands. */
+
+extern void h_generic_calc_Mul32x4    ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Sx4   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Sx4   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Ux4   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Ux4   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max16Ux8   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min16Ux8   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max8Sx16   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min8Sx16   ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+
+
+#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
index 4e0a9773c4..0e6f2edadf 100644
--- a/VEX/priv/ir_defs.c
+++ b/VEX/priv/ir_defs.c
@@ -336,6 +336,7 @@ void ppIROp ( IROp op )
case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return; case Iop_Mul16x4: vex_printf("Mul16x4"); return; case Iop_Mul32x2: vex_printf("Mul32x2"); return; + case Iop_Mul32x4: vex_printf("Mul32x4"); return; case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return; case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return; case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return; @@ -525,6 +526,7 @@ void ppIROp ( IROp op ) case Iop_CmpGT8Sx16: vex_printf("CmpGT8Sx16"); return; case Iop_CmpGT16Sx8: vex_printf("CmpGT16Sx8"); return; case Iop_CmpGT32Sx4: vex_printf("CmpGT32Sx4"); return; + case Iop_CmpGT64Sx2: vex_printf("CmpGT64Sx2"); return; case Iop_CmpGT8Ux16: vex_printf("CmpGT8Ux16"); return; case Iop_CmpGT16Ux8: vex_printf("CmpGT16Ux8"); return; case Iop_CmpGT32Ux4: vex_printf("CmpGT32Ux4"); return; @@ -1899,7 +1901,7 @@ void typeOfPrimop ( IROp op, case Iop_Sub32x4: case Iop_Sub64x2: case Iop_QSub8Ux16: case Iop_QSub16Ux8: case Iop_QSub32Ux4: case Iop_QSub8Sx16: case Iop_QSub16Sx8: case Iop_QSub32Sx4: - case Iop_Mul16x8: + case Iop_Mul16x8: case Iop_Mul32x4: case Iop_MulHi16Ux8: case Iop_MulHi32Ux4: case Iop_MulHi16Sx8: case Iop_MulHi32Sx4: case Iop_MullEven8Ux16: case Iop_MullEven16Ux8: @@ -1912,6 +1914,7 @@ void typeOfPrimop ( IROp op, case Iop_Min8Ux16: case Iop_Min16Ux8: case Iop_Min32Ux4: case Iop_CmpEQ8x16: case Iop_CmpEQ16x8: case Iop_CmpEQ32x4: case Iop_CmpGT8Sx16: case Iop_CmpGT16Sx8: case Iop_CmpGT32Sx4: + case Iop_CmpGT64Sx2: case Iop_CmpGT8Ux16: case Iop_CmpGT16Ux8: case Iop_CmpGT32Ux4: case Iop_Shl8x16: case Iop_Shl16x8: case Iop_Shl32x4: case Iop_Shr8x16: case Iop_Shr16x8: case Iop_Shr32x4: diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c index 8eec2aff39..962b952e7d 100644 --- a/VEX/priv/main_main.c +++ b/VEX/priv/main_main.c @@ -57,6 +57,8 @@ #include "guest_arm_defs.h" #include "guest_ppc_defs.h" +#include "host_generic_simd128.h" + /* This file contains the top level interface to the library. 
*/ @@ -141,6 +143,7 @@ void LibVEX_Init ( vassert(4 == sizeof(Addr32)); vassert(8 == sizeof(Addr64)); vassert(16 == sizeof(U128)); + vassert(16 == sizeof(V128)); vassert(sizeof(void*) == 4 || sizeof(void*) == 8); vassert(sizeof(void*) == sizeof(int*)); diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h index bb39becc2c..4b7d628fb4 100644 --- a/VEX/pub/libvex_ir.h +++ b/VEX/pub/libvex_ir.h @@ -834,7 +834,7 @@ typedef Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, /* MULTIPLICATION (normal / high half of signed/unsigned) */ - Iop_Mul16x8, + Iop_Mul16x8, Iop_Mul32x4, Iop_MulHi16Ux8, Iop_MulHi32Ux4, Iop_MulHi16Sx8, Iop_MulHi32Sx4, /* (widening signed/unsigned of even lanes, with lowest lane=zero) */ @@ -853,7 +853,7 @@ typedef /* COMPARISON */ Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, - Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, + Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2, Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, /* VECTOR x SCALAR SHIFT (shift amt :: Ity_I8) */