priv/host_generic_simd128.h \
priv/host_generic_simd256.h \
priv/host_generic_maddf.h \
+ priv/host_amd64_maddf.h \
priv/host_x86_defs.h \
priv/host_amd64_defs.h \
priv/host_ppc_defs.h \
priv/host_generic_simd128.c \
priv/host_generic_simd256.c \
priv/host_generic_maddf.c \
+ priv/host_amd64_maddf.c \
priv/host_generic_reg_alloc2.c \
priv/host_generic_reg_alloc3.c \
priv/host_x86_defs.c \
437790 valgrind reports "Conditional jump or move depends on uninitialised
value" in memchr of macOS 10.12-10.15
460616 disInstr(arm64): unhandled instruction 0x4E819402 (dotprod/ASIMDDP)
+463458 memcheck/tests/vcpu_fnfns fails when glibc is built for x86-64-v3
+463463 none/tests/amd64/fma fails when executed on a x86-64-v3 system
466762 Add redirs for C23 free_sized() and free_aligned_sized()
466884 Missing writev uninit padding suppression for _XSend
471036 disInstr_AMD64: disInstr miscalculated next %rip on RORX imm8, m32/64, r32/6
Assertion '!sr_isError(sr)' failed."
480488 Add support for FreeBSD 13.3
480706 Unhandled syscall 325 (mlock2)
+481127 amd64: Implement VFMADD213 for Iop_MAddF32
481131 [PATCH] x86 regtest: fix clobber lists in generated asm statements
483786 Incorrect parameter indexing in FreeBSD clock_nanosleep syscall wrapper
484002 Add suppression for invalid read in glibc's __wcpncpy_avx2() via wcsxfrm()
case Asse_PMADDUBSW: return "pmaddubsw";
case Asse_F32toF16: return "vcvtps2ph(rm_field=$0x4).";
case Asse_F16toF32: return "vcvtph2ps.";
+ case Asse_VFMADD213: return "vfmadd213";
default: vpanic("showAMD64SseOp");
}
}
//uu i->Ain.AvxReRg.dst = rg;
//uu return i;
//uu }
+AMD64Instr* AMD64Instr_Avx32FLo ( AMD64SseOp op, HReg src1, HReg src2, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+ i->tag = Ain_Avx32FLo;
+ i->Ain.Avx32FLo.op = op;
+ i->Ain.Avx32FLo.src1 = src1;
+ i->Ain.Avx32FLo.src2 = src2;
+ i->Ain.Avx32FLo.dst = dst;
+ vassert(op != Asse_MOV);
+ return i;
+}
+
+AMD64Instr* AMD64Instr_Avx64FLo ( AMD64SseOp op, HReg src1, HReg src2, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
+ i->tag = Ain_Avx64FLo;
+ i->Ain.Avx64FLo.op = op;
+ i->Ain.Avx64FLo.src1 = src1;
+ i->Ain.Avx64FLo.src2 = src2;
+ i->Ain.Avx64FLo.dst = dst;
+ vassert(op != Asse_MOV);
+ return i;
+}
+
AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
AMD64AMode* amFailAddr ) {
AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
//uu vex_printf(",");
//uu ppHRegAMD64(i->Ain.AvxReRg.dst);
//uu return;
+ case Ain_Avx32FLo:
+ vex_printf("%sss ", showAMD64SseOp(i->Ain.Avx32FLo.op));
+ ppHRegAMD64(i->Ain.Avx32FLo.src2);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Avx32FLo.src1);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Avx32FLo.dst);
+ return;
+ case Ain_Avx64FLo:
+ vex_printf("%ssd ", showAMD64SseOp(i->Ain.Avx64FLo.op));
+ ppHRegAMD64(i->Ain.Avx64FLo.src2);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Avx64FLo.src1);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Avx64FLo.dst);
+ return;
case Ain_EvCheck:
vex_printf("(evCheck) decl ");
ppAMD64AMode(i->Ain.EvCheck.amCounter);
//uu }
//uu }
//uu return;
+ case Ain_Avx32FLo:
+ vassert(i->Ain.Avx32FLo.op != Asse_MOV);
+ addHRegUse(u, HRmRead, i->Ain.Avx32FLo.src1);
+ addHRegUse(u, HRmRead, i->Ain.Avx32FLo.src2);
+ addHRegUse(u, HRmModify, i->Ain.Avx32FLo.dst);
+ return;
+ case Ain_Avx64FLo:
+ vassert(i->Ain.Avx64FLo.op != Asse_MOV);
+ addHRegUse(u, HRmRead, i->Ain.Avx64FLo.src1);
+ addHRegUse(u, HRmRead, i->Ain.Avx64FLo.src2);
+ addHRegUse(u, HRmModify, i->Ain.Avx64FLo.dst);
+ return;
case Ain_EvCheck:
/* We expect both amodes only to mention %rbp, so this is in
fact pointless, since %rbp isn't allocatable, but anyway.. */
//uu mapReg(m, &i->Ain.AvxReRg.src);
//uu mapReg(m, &i->Ain.AvxReRg.dst);
//uu return;
+ case Ain_Avx32FLo:
+ mapReg(m, &i->Ain.Avx32FLo.src1);
+ mapReg(m, &i->Ain.Avx32FLo.src2);
+ mapReg(m, &i->Ain.Avx32FLo.dst);
+ return;
+ case Ain_Avx64FLo:
+ mapReg(m, &i->Ain.Avx64FLo.src1);
+ mapReg(m, &i->Ain.Avx64FLo.src2);
+ mapReg(m, &i->Ain.Avx64FLo.dst);
+ return;
case Ain_EvCheck:
/* We expect both amodes only to mention %rbp, so this is in
fact pointless, since %rbp isn't allocatable, but anyway.. */
//uu goto done;
//uu }
+ case Ain_Avx32FLo: {
+ UInt d = vregEnc3210(i->Ain.Avx32FLo.dst);
+ UInt v = vregEnc3210(i->Ain.Avx32FLo.src1);
+ UInt s = vregEnc3210(i->Ain.Avx32FLo.src2);
+ UInt m = 2, pp = 1;
+ UInt opcode;
+ switch (i->Ain.Avx32FLo.op) {
+ case Asse_VFMADD213:
+ // VFMADD213SS %xmmS2, %xmmS1, %xmmD (xmm regs range 0 .. 15)
+ opcode = 0xa9;
+ break;
+ default:
+ goto bad;
+ }
+ // 0xC4 : ~d3 1 ~s3 o4 o3 o2 o1 o0 : 0 ~v3 ~v2 ~v1 ~v0 0 p1 p0 : opcode_byte
+ // : 1 1 d2 d1 d0 s2 s1 s0
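+      // E.g. with dst=%xmm0 (d=0), src1=%xmm1 (v=1), src2=%xmm2 (s=2),
+      // this emits C4 E2 71 A9 C2, i.e. vfmadd213ss %xmm2,%xmm1,%xmm0,
+      // which computes %xmm0 = %xmm1 * %xmm0 + %xmm2 in the lowest lane.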
+ *p++ = 0xC4; // 3-byte VEX
+ *p++ = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
+ *p++ = ((~v&0x0f) << 3) | pp;
+ *p++ = opcode;
+ *p++ = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
+ goto done;
+ }
+ case Ain_Avx64FLo: {
+ UInt d = vregEnc3210(i->Ain.Avx64FLo.dst);
+ UInt v = vregEnc3210(i->Ain.Avx64FLo.src1);
+ UInt s = vregEnc3210(i->Ain.Avx64FLo.src2);
+ UInt m = 2, pp = 1;
+ UInt opcode;
+ switch (i->Ain.Avx64FLo.op) {
+ case Asse_VFMADD213:
+ // VFMADD213SD %xmmS2, %xmmS1, %xmmD (xmm regs range 0 .. 15)
+ opcode = 0xa9;
+ break;
+ default:
+ goto bad;
+ }
+ // 0xC4 : ~d3 1 ~s3 o4 o3 o2 o1 o0 : 1 ~v3 ~v2 ~v1 ~v0 0 p1 p0 : opcode_byte
+ // : 1 1 d2 d1 d0 s2 s1 s0
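+      // E.g. with dst=%xmm0 (d=0), src1=%xmm1 (v=1), src2=%xmm2 (s=2),
+      // this emits C4 E2 F1 A9 C2, i.e. vfmadd213sd %xmm2,%xmm1,%xmm0
+      // (W=1 in the third byte selects the 64-bit form).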
+ *p++ = 0xC4; // 3-byte VEX
+ *p++ = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
+ *p++ = (1<<7)|((~v&0x0f) << 3) | pp;
+ *p++ = opcode;
+ *p++ = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
+ goto done;
+ }
+
case Ain_EvCheck: {
/* We generate:
(3 bytes) decl 8(%rbp) 8 == offsetof(host_EvC_COUNTER)
// Only for F16C capable hosts:
Asse_F32toF16, // F32 to F16 conversion, aka vcvtps2ph
Asse_F16toF32, // F16 to F32 conversion, aka vcvtph2ps
+ // Only for FMA (FMA3) capable hosts:
+      Asse_VFMADD213,  // Fused Multiply-Add, aka vfmadd213ss/sd
}
AMD64SseOp;
//uu Ain_AvxLdSt, /* AVX load/store 256 bits,
//uu no alignment constraints */
//uu Ain_AvxReRg, /* AVX binary general reg-reg, Re, Rg */
+      Ain_Avx32FLo,        /* AVX 3-operand op, 32F in lowest lane only */
+      Ain_Avx64FLo,        /* AVX 3-operand op, 64F in lowest lane only */
Ain_EvCheck, /* Event check */
Ain_ProfInc /* 64-bit profile counter increment */
}
//uu HReg src;
//uu HReg dst;
//uu } AvxReRg;
+ struct {
+ AMD64SseOp op;
+ HReg src1;
+ HReg src2;
+ HReg dst;
+ } Avx32FLo;
+ struct {
+ AMD64SseOp op;
+ HReg src1;
+ HReg src2;
+ HReg dst;
+ } Avx64FLo;
struct {
AMD64AMode* amCounter;
AMD64AMode* amFailAddr;
extern AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM );
//uu extern AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad, HReg, AMD64AMode* );
//uu extern AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Avx32FLo ( AMD64SseOp, HReg, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Avx64FLo ( AMD64SseOp, HReg, HReg, HReg );
extern AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
AMD64AMode* amFailAddr );
extern AMD64Instr* AMD64Instr_ProfInc ( void );
#include "host_generic_simd64.h"
#include "host_generic_simd128.h"
#include "host_generic_simd256.h"
+#include "host_amd64_maddf.h"
#include "host_generic_maddf.h"
#include "host_amd64_defs.h"
HReg argX = iselFltExpr(env, qop->arg2);
HReg argY = iselFltExpr(env, qop->arg3);
HReg argZ = iselFltExpr(env, qop->arg4);
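+   /* VFMADD213 computes dst := src1 * dst + src2, so dst must first
+      hold x (argX); argY and argZ then supply y and z, giving
+      x*y + z directly, with no helper call. */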
+ if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
+ vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
+ if (dst.u32 != argX.u32)
+ addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
+ addInstr(env, AMD64Instr_Avx32FLo(Asse_VFMADD213, argY, argZ, dst));
+ return dst;
+ }
/* XXXROUNDINGFIXME */
/* set roundingmode here */
/* subq $16, %rsp -- make a space*/
AMD64AMode_IR(0, hregAMD64_RDX())));
addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
AMD64AMode_IR(0, hregAMD64_RCX())));
- /* call the helper */
- addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
- (ULong)(HWord)h_generic_calc_MAddF32,
- 4, mk_RetLoc_simple(RLPri_None) ));
+
+   /* Call the helper, preferring the FMA4 variant and falling back to
+      the generic one.  (The FMA3 case was handled above without a
+      helper call.) */
+#if defined(VGA_amd64)
+ if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+ (ULong)(HWord)h_amd64_calc_MAddF32_fma4,
+ 4, mk_RetLoc_simple(RLPri_None) ));
+   } else
+#endif
+ {
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+ (ULong)(HWord)h_generic_calc_MAddF32,
+ 4, mk_RetLoc_simple(RLPri_None) ));
+ }
+
/* fetch the result from memory, using %r_argp, which the
register allocator will keep alive across the call. */
addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
HReg argX = iselDblExpr(env, qop->arg2);
HReg argY = iselDblExpr(env, qop->arg3);
HReg argZ = iselDblExpr(env, qop->arg4);
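+   /* Same scheme as the F32 case: dst first gets x (argX), then
+      VFMADD213 computes dst := argY * dst + argZ = x*y + z. */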
+ if (env->hwcaps & VEX_HWCAPS_AMD64_FMA3) {
+ vassert(dst.u32 != argY.u32 && dst.u32 != argZ.u32);
+ if (dst.u32 != argX.u32)
+ addInstr(env, AMD64Instr_SseReRg(Asse_MOV, argX, dst));
+ addInstr(env, AMD64Instr_Avx64FLo(Asse_VFMADD213, argY, argZ, dst));
+ return dst;
+ }
+
/* XXXROUNDINGFIXME */
/* set roundingmode here */
/* subq $32, %rsp -- make a space*/
AMD64AMode_IR(0, hregAMD64_RDX())));
addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
AMD64AMode_IR(0, hregAMD64_RCX())));
- /* call the helper */
- addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
- (ULong)(HWord)h_generic_calc_MAddF64,
- 4, mk_RetLoc_simple(RLPri_None) ));
+
+   /* Call the helper, preferring the FMA4 variant and falling back to
+      the generic one.  (The FMA3 case was handled above without a
+      helper call.) */
+#if defined(VGA_amd64)
+ if (env->hwcaps & VEX_HWCAPS_AMD64_FMA4) {
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+ (ULong)(HWord)h_amd64_calc_MAddF64_fma4,
+ 4, mk_RetLoc_simple(RLPri_None) ));
+   } else
+#endif
+ {
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
+ (ULong)(HWord)h_generic_calc_MAddF64,
+ 4, mk_RetLoc_simple(RLPri_None) ));
+ }
+
/* fetch the result from memory, using %r_argp, which the
register allocator will keep alive across the call. */
addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
| VEX_HWCAPS_AMD64_AVX2
| VEX_HWCAPS_AMD64_F16C
| VEX_HWCAPS_AMD64_RDRAND
- | VEX_HWCAPS_AMD64_RDSEED)));
+ | VEX_HWCAPS_AMD64_RDSEED
+ | VEX_HWCAPS_AMD64_FMA3
+ | VEX_HWCAPS_AMD64_FMA4)));
/* Check that the host's endianness is as expected. */
vassert(archinfo_host->endness == VexEndnessLE);
--- /dev/null
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_maddf.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   Compute x * y + z as a single ternary (fused multiply-add) operation,
+   using the host's FMA4 instructions via inline assembly.
+*/
+
+
+#include "libvex_basictypes.h"
+#include "host_amd64_maddf.h"
+
+#if defined(VGA_amd64)
+void VEX_REGPARM(3)
+ h_amd64_calc_MAddF32_fma4 ( /*OUT*/Float* res,
+ Float* argX, Float* argY, Float* argZ )
+{
+   /* FMA4 vfmaddss: *res = *argX * *argY + *argZ, fused in one insn. */
+   __asm__ ("vfmaddss %3,%2,%1,%0;"
+            : "=x"(*res) : "x"(*argX), "x"(*argY), "x"(*argZ));
+   return;
+}
+
+void VEX_REGPARM(3)
+ h_amd64_calc_MAddF64_fma4 ( /*OUT*/Double* res,
+ Double* argX, Double* argY, Double* argZ )
+{
+   /* FMA4 vfmaddsd: *res = *argX * *argY + *argZ, fused in one insn. */
+   __asm__ ("vfmaddsd %3,%2,%1,%0;"
+            : "=x"(*res) : "x"(*argX), "x"(*argY), "x"(*argZ));
+   return;
+}
+#endif
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_maddf.c --*/
+/*---------------------------------------------------------------*/
--- /dev/null
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_maddf.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+   Compute x * y + z as a single ternary (fused multiply-add) operation.
+*/
+
+/* AMD64-specific helper functions for doing FMA with the FMA4
+   instructions, i.e. compute x * y + z as a ternary operation.
+   These are purely back-end entities and cannot be seen/referenced
+   from IR. */
+
+#ifndef __VEX_HOST_AMD64_MADDF_H
+#define __VEX_HOST_AMD64_MADDF_H
+
+#include "libvex_basictypes.h"
+
+#if defined(VGA_amd64)
+extern VEX_REGPARM(3)
+ void h_amd64_calc_MAddF32_fma4 ( /*OUT*/Float*, Float*, Float*, Float* );
+
+extern VEX_REGPARM(3)
+ void h_amd64_calc_MAddF64_fma4 ( /*OUT*/Double*, Double*, Double*,
+ Double* );
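+
+/* Example (illustrative): the generated code passes the operands by
+   reference and receives the result through the first pointer, e.g.
+
+      Float x = 2.0f, y = 3.0f, z = 1.0f, r;
+      h_amd64_calc_MAddF32_fma4(&r, &x, &y, &z);   // r == x*y + z == 7.0f
+
+   Only use these helpers when the host reports VEX_HWCAPS_AMD64_FMA4. */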
+#endif
+#endif /* ndef __VEX_HOST_AMD64_MADDF_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_maddf.h --*/
+/*---------------------------------------------------------------*/
{ VEX_HWCAPS_AMD64_F16C, "f16c" },
{ VEX_HWCAPS_AMD64_RDRAND, "rdrand" },
{ VEX_HWCAPS_AMD64_RDSEED, "rdseed" },
+      { VEX_HWCAPS_AMD64_FMA3,   "fma" },  /* "fma" matches /proc/cpuinfo */
+ { VEX_HWCAPS_AMD64_FMA4, "fma4" },
};
/* Allocate a large enough buffer */
static HChar buf[sizeof prefix +
#define VEX_HWCAPS_AMD64_RDRAND (1<<13) /* RDRAND instructions */
#define VEX_HWCAPS_AMD64_F16C (1<<14) /* F16C instructions */
#define VEX_HWCAPS_AMD64_RDSEED (1<<15) /* RDSEED instructions */
+#define VEX_HWCAPS_AMD64_FMA3 (1<<16) /* FMA3 instructions */
+#define VEX_HWCAPS_AMD64_FMA4 (1<<17) /* FMA4 instructions */
/* ppc32: baseline capability is integer only */
#define VEX_HWCAPS_PPC32_F (1<<8) /* basic (non-optional) FP */
#elif defined(VGA_amd64)
{ Bool have_sse3, have_ssse3, have_cx8, have_cx16;
Bool have_lzcnt, have_avx, have_bmi, have_avx2;
+ Bool have_fma3, have_fma4;
Bool have_rdtscp, have_rdrand, have_f16c, have_rdseed;
UInt eax, ebx, ecx, edx, max_basic, max_extended;
ULong xgetbv_0 = 0;
have_sse3 = have_ssse3 = have_cx8 = have_cx16
= have_lzcnt = have_avx = have_bmi = have_avx2
- = have_rdtscp = have_rdrand = have_f16c = have_rdseed = False;
+ = have_rdtscp = have_rdrand = have_f16c = have_rdseed
+ = have_fma3 = have_fma4 = False;
eax = ebx = ecx = edx = max_basic = max_extended = 0;
// we assume that SSE1 and SSE2 are available by default
have_sse3 = (ecx & (1<<0)) != 0; /* True => have sse3 insns */
have_ssse3 = (ecx & (1<<9)) != 0; /* True => have Sup SSE3 insns */
- // fma is ecx:12
+   have_fma3  = (ecx & (1<<12)) != 0; /* True => have fma3 insns */
// sse41 is ecx:19
// sse42 is ecx:20
// xsave is ecx:26
have_rdrand = (ecx & (1<<30)) != 0; /* True => have RDRAND insns */
have_avx = False;
- /* have_fma = False; */
+
if ( (ecx & ((1<<28)|(1<<27)|(1<<26))) == ((1<<28)|(1<<27)|(1<<26)) ) {
/* Processor supports AVX instructions and XGETBV is enabled
by OS and AVX instructions are enabled by the OS. */
if (ebx2 == 576 && eax2 == 256) {
have_avx = True;
}
- /* have_fma = (ecx & (1<<12)) != 0; */
- /* have_fma: Probably correct, but gcc complains due to
- unusedness. */
}
}
have_rdtscp = (edx & (1<<27)) != 0; /* True => have RDTSVCP */
}
+ if (max_extended >= 0x80000001) {
+ VG_(cpuid)(0x80000001, 0, &eax, &ebx, &ecx, &edx);
+      have_fma4 = (ecx & (1<<16)) != 0; /* True => have fma4 insns */
+ }
+
/* Check for BMI1 and AVX2. If we have AVX1 (plus OS support). */
have_bmi = False;
have_avx2 = False;
| (have_rdtscp ? VEX_HWCAPS_AMD64_RDTSCP : 0)
| (have_f16c ? VEX_HWCAPS_AMD64_F16C : 0)
| (have_rdrand ? VEX_HWCAPS_AMD64_RDRAND : 0)
- | (have_rdseed ? VEX_HWCAPS_AMD64_RDSEED : 0);
+ | (have_rdseed ? VEX_HWCAPS_AMD64_RDSEED : 0)
+ | (have_fma3 ? VEX_HWCAPS_AMD64_FMA3 : 0)
+ | (have_fma4 ? VEX_HWCAPS_AMD64_FMA4 : 0);
VG_(machine_get_cache_info)(&vai);