({ \
__m512 dst; \
__asm__ volatile \
- ("{tcvtrowd2ps\t%1, %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowd2ps\t%1, %%tmm%c[_src], %0 \
+ |tcvtrowd2ps\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
({ \
__m512 dst; \
__asm__ volatile \
- ("{tcvtrowd2ps\t$"#imm", %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowd2ps\t%[_imm], %%tmm%c[_src], %0 \
+ |tcvtrowd2ps\t%0, tmm%c[_src], %[_imm]}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
({ \
__m512bh dst; \
__asm__ volatile \
- ("{tcvtrowps2bf16h\t%1, %%tmm"#src", %0|tcvtrowps2bf16h\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowps2bf16h\t%1, %%tmm%c[_src], %0 \
+ |tcvtrowps2bf16h\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
({ \
__m512bh dst; \
__asm__ volatile \
- ("{tcvtrowps2bf16h\t$"#imm", %%tmm"#src", %0|tcvtrowps2bf16h\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowps2bf16h\t%[_imm], %%tmm%c[_src], %0 \
+ |tcvtrowps2bf16h\t%0, tmm%c[_src], %[_imm]}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
({ \
__m512bh dst; \
__asm__ volatile \
- ("{tcvtrowps2bf16l\t%1, %%tmm"#src", %0|tcvtrowps2bf16l\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowps2bf16l\t%1, %%tmm%c[_src], %0 \
+ |tcvtrowps2bf16l\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
({ \
__m512bh dst; \
__asm__ volatile \
- ("{tcvtrowps2bf16l\t$"#imm", %%tmm"#src", %0|tcvtrowps2bf16l\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowps2bf16l\t%[_imm], %%tmm%c[_src], %0 \
+ |tcvtrowps2bf16l\t%0, tmm%c[_src], "#imm"}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
({ \
__m512h dst; \
__asm__ volatile \
- ("{tcvtrowps2phh\t%1, %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowps2phh\t%1, %%tmm%c[_src], %0|tcvtrowps2phh\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
({ \
__m512h dst; \
__asm__ volatile \
- ("{tcvtrowps2phh\t$"#imm", %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowps2phh\t%[_imm], %%tmm%c[_src], %0 \
+ |tcvtrowps2phh\t%0, tmm%c[_src], "#imm"}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
({ \
__m512h dst; \
__asm__ volatile \
- ("{tcvtrowps2phl\t%1, %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tcvtrowps2phl\t%1, %%tmm%c[_src], %0|tcvtrowps2phl\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
({ \
__m512h dst; \
__asm__ volatile \
- ("{tcvtrowps2phl\t$"#imm", %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tcvtrowps2phl\t%[_imm], %%tmm%c[_src], %0 \
+ |tcvtrowps2phl\t%0, tmm%c[_src], "#imm"}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
({ \
__m512 dst; \
__asm__ volatile \
- ("{tilemovrow\t%1, %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", %1}" \
- : "=v" (dst) : "r" ((unsigned) (A))); \
+ ("{tilemovrow\t%1, %%tmm%c[_src], %0|tilemovrow\t%0, tmm%c[_src], %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A)), [_src]"i"(src)); \
dst; \
})
({ \
__m512 dst; \
__asm__ volatile \
- ("{tilemovrow\t$"#imm", %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", "#imm"}" \
- : "=v" (dst) :); \
+ ("{tilemovrow\t%[_imm], %%tmm%c[_src], %0 \
+ |tilemovrow\t%0, tmm%c[_src], "#imm"}" \
+ : "=v" (dst) : [_src]"i"(src), [_imm]"i"(imm)); \
dst; \
})
#if defined(__x86_64__)
#define _tile_dpbf16ps_internal(dst,src1,src2) \
- __asm__ volatile\
- ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+ __asm__ volatile \
+ ("{tdpbf16ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdpbf16ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
#define _tile_dpbf16ps(dst,src1,src2) \
_tile_dpbf16ps_internal (dst, src1, src2)
#endif /* __AMX_COMPLEX__ */
#if defined(__x86_64__)
-#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \
- __asm__ volatile\
- ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
-
-#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \
- __asm__ volatile\
- ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile \
+ ("{tcmmimfp16ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \
+ |tcmmimfp16ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \
+ :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3))
+
+#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile \
+ ("{tcmmrlfp16ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \
+ |tcmmrlfp16ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \
+ :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3))
#define _tile_cmmimfp16ps(src1_dst,src2,src3) \
_tile_cmmimfp16ps_internal (src1_dst, src2, src3)
#define _AMXFP16INTRIN_H_INCLUDED
#if defined(__x86_64__)
-#define _tile_dpfp16ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+#define _tile_dpfp16ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpfp16ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdpfp16ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
#define _tile_dpfp16ps(dst,src1,src2) \
_tile_dpfp16ps_internal (dst,src1,src2)
#define _AMXFP8INTRIN_H_INCLUDED
#if defined(__x86_64__)
-#define _tile_dpbf8ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdpbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
-
-#define _tile_dpbhf8ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdpbhf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbhf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
-
-#define _tile_dphbf8ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdphbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
-
-#define _tile_dphf8ps_internal(dst,src1,src2) \
- __asm__ volatile \
- ("{tdphf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+#define _tile_dpbf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpbf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdpbf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
+
+#define _tile_dpbhf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpbhf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdpbhf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
+
+#define _tile_dphbf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdphbf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdphbf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
+
+#define _tile_dphf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdphf8ps\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |tdphf8ps\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ :: [_dst]"i"(dst), [_src1]"i"(src1), [_src2]"i"(src2))
#define _tile_dpbf8ps(dst,src1,src2) \
_tile_dpbf8ps_internal (dst,src1,src2)
#if defined(__x86_64__)
#define _tile_int8_dp_internal(name,dst,src1,src2) \
__asm__ volatile \
- ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+ ("{"#name"\t%%tmm%c[_src2], %%tmm%c[_src1], %%tmm%c[_dst] \
+ |"#name"\ttmm%c[_dst], tmm%c[_src1], tmm%c[_src2]}" \
+ ::[_dst]"i"(dst),[_src1]"i"(src1),[_src2]"i"(src2))
#define _tile_dpbssd(dst,src1,src2) \
_tile_int8_dp_internal (tdpbssd, dst, src1, src2)
#define __DISABLE_AMX_MOVRS__
#endif /* __AMX_MOVRS__ */
-#define _tile_loaddrs_internal(tdst, base, stride) \
-__asm__ volatile \
- ("{tileloaddrs\t(%0,%1,1), %%tmm"#tdst \
- "|tileloaddrs\t%%tmm"#tdst", [%0+%1*1]}" \
- :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+#define _tile_loaddrs_internal(tdst, base, stride) \
+__asm__ volatile \
+ ("{tileloaddrs\t(%0,%1,1), %%tmm%c[_tdst] \
+ |tileloaddrs\ttmm%c[_tdst], [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_tdst]"i"(tdst))
-#define _tile_loaddrst1_internal(tdst, base, stride) \
-__asm__ volatile \
- ("{tileloaddrst1\t(%0,%1,1), %%tmm"#tdst \
- "|tileloaddrst1\t%%tmm"#tdst", [%0+%1*1]}" \
- :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+#define _tile_loaddrst1_internal(tdst, base, stride) \
+__asm__ volatile \
+ ("{tileloaddrst1\t(%0,%1,1), %%tmm%c[_tdst] \
+ |tileloaddrst1\ttmm%c[_tdst], [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_tdst]"i"(tdst))
#define _tile_loaddrs(tdst, base, stride) \
_tile_loaddrs_internal(tdst, base, stride)
#if defined(__x86_64__)
#define _tile_mmultf32ps_internal(src1_dst,src2,src3) \
- __asm__ volatile\
- ("{tmmultf32ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tmmultf32ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+ __asm__ volatile \
+ ("{tmmultf32ps\t%%tmm%c[_src3], %%tmm%c[_src2], %%tmm%c[_src1_dst] \
+ |tmmultf32ps\ttmm%c[_src1_dst], tmm%c[_src2], tmm%c[_src3]}" \
+ :: [_src1_dst]"i"(src1_dst), [_src2]"i"(src2), [_src3]"i"(src3))
#define _tile_mmultf32ps(src1_dst,src2,src3) \
_tile_mmultf32ps_internal (src1_dst, src2, src3)
#define _tile_loadd_internal(dst,base,stride) \
__asm__ volatile \
- ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \
- :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+ ("{tileloadd\t(%0,%1,1), %%tmm%c[_dst]|tileloadd\ttmm%c[_dst], [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_dst]"i"(dst))
#define _tile_stream_loadd(dst,base,stride) \
_tile_stream_loadd_internal (dst, base, stride)
#define _tile_stream_loadd_internal(dst,base,stride) \
__asm__ volatile \
- ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" \
- :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+ ("{tileloaddt1\t(%0,%1,1), %%tmm%c[_dst]|tileloaddt1\ttmm%c[_dst], [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_dst]"i"(dst))
#define _tile_stored(dst,base,stride) \
_tile_stored_internal (dst, base, stride)
#define _tile_stored_internal(src,base,stride) \
__asm__ volatile \
- ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \
- :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) \
- : "memory")
+ ("{tilestored\t%%tmm%c[_src], (%0,%1,1)|tilestored\t[%0+%1*1], tmm%c[_src]}" \
+ :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)), [_src]"i"(src) \
+ : "memory")
#define _tile_zero(dst) \
_tile_zero_internal (dst)
-#define _tile_zero_internal(dst) \
- __asm__ volatile \
- ("tilezero\t%%tmm"#dst ::)
+#define _tile_zero_internal(dst) \
+ __asm__ volatile \
+ ("{tilezero\t%%tmm%c[_dst]|tilezero\ttmm%c[_dst]}" :: [_dst]"i"(dst))
#endif
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-int8 -O0" } */
+/* { dg-final { scan-assembler "tdpbssd\[ \\t]+\[^\n\]*%tmm2+\[^\n\]*%tmm1+\[^\n\]*%tmm0" } } */
+
+#include <immintrin.h>
+
+template <int hello, int crazy, int gcc>
+struct dpbssd
+{
+ void operator()() { _tile_dpbssd(hello, crazy, gcc); }
+};
+
+void f()
+{
+ dpbssd<0, 1, 2>()();
+}
+
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-avx512 -O0" } */
+/* { dg-final { scan-assembler "tcvtrowd2ps\[ \\t]+%e.x,\[ \\t\]*%tmm1,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowd2ps\[ \\t]+\\\$5,\[ \\t\]*%tmm2,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowps2bf16h\[ \\t]+%e.x,\[ \\t\]*%tmm1,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowps2bf16h\[ \\t]+\\\$7,\[ \\t\]*%tmm3,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowps2bf16l\[ \\t]+%e.x,\[ \\t\]*%tmm2,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowps2bf16l\[ \\t]+\\\$3,\[ \\t\]*%tmm4,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowps2phh\[ \\t]+%e.x,\[ \\t\]*%tmm1,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowps2phh\[ \\t]+\\\$6,\[ \\t\]*%tmm2,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowps2phl\[ \\t]+%e.x,\[ \\t\]*%tmm3,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tcvtrowps2phl\[ \\t]+\\\$2,\[ \\t\]*%tmm4,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tilemovrow\[ \\t]+%e.x,\[ \\t\]*%tmm5,\[ \\t\]*%zmm\[0-9\]+" } } */
+/* { dg-final { scan-assembler "tilemovrow\[ \\t]+\\\$4,\[ \\t\]*%tmm6,\[ \\t\]*%zmm\[0-9\]+" } } */
+
+#include <immintrin.h>
+
+template <int tmm_num>
+struct tile_cvtrowd2ps_test
+{
+ __m512 operator()() { return _tile_cvtrowd2ps(tmm_num, 0); }
+};
+
+template <int tmm_num, int imm>
+struct tile_cvtrowd2psi_test
+{
+ __m512 operator()() { return _tile_cvtrowd2psi(tmm_num, imm); }
+};
+
+template <int tmm_num>
+struct tile_cvtrowps2bf16h_test
+{
+ __m512bh operator()() { return _tile_cvtrowps2bf16h(tmm_num, 0); }
+};
+
+template <int tmm_num, int imm>
+struct tile_cvtrowps2bf16hi_test
+{
+ __m512bh operator()() { return _tile_cvtrowps2bf16hi(tmm_num, imm); }
+};
+
+template <int tmm_num>
+struct tile_cvtrowps2bf16l_test
+{
+ __m512bh operator()() { return _tile_cvtrowps2bf16l(tmm_num, 0); }
+};
+
+template <int tmm_num, int imm>
+struct tile_cvtrowps2bf16li_test
+{
+ __m512bh operator()() { return _tile_cvtrowps2bf16li(tmm_num, imm); }
+};
+
+template <int tmm_num>
+struct tile_cvtrowps2phh_test
+{
+ __m512h operator()() { return _tile_cvtrowps2phh(tmm_num, 0); }
+};
+
+template <int tmm_num, int imm>
+struct tile_cvtrowps2phhi_test
+{
+ __m512h operator()() { return _tile_cvtrowps2phhi(tmm_num, imm); }
+};
+
+template <int tmm_num>
+struct tile_cvtrowps2phl_test
+{
+ __m512h operator()() { return _tile_cvtrowps2phl(tmm_num, 0); }
+};
+
+template <int tmm_num, int imm>
+struct tile_cvtrowps2phli_test
+{
+ __m512h operator()() { return _tile_cvtrowps2phli(tmm_num, imm); }
+};
+
+template <int tmm_num>
+struct tile_movrow_test
+{
+ __m512 operator()() { return _tile_movrow(tmm_num, 0); }
+};
+
+template <int tmm_num, int imm>
+struct tile_movrowi_test
+{
+ __m512 operator()() { return _tile_movrowi(tmm_num, imm); }
+};
+
+void test_amx_avx512()
+{
+ __m512 r1 = tile_cvtrowd2ps_test<1>()();
+ __m512 r2 = tile_cvtrowd2psi_test<2, 5>()();
+ __m512bh r3 = tile_cvtrowps2bf16h_test<1>()();
+ __m512bh r4 = tile_cvtrowps2bf16hi_test<3, 7>()();
+ __m512bh r5 = tile_cvtrowps2bf16l_test<2>()();
+ __m512bh r6 = tile_cvtrowps2bf16li_test<4, 3>()();
+ __m512h r7 = tile_cvtrowps2phh_test<1>()();
+ __m512h r8 = tile_cvtrowps2phhi_test<2, 6>()();
+ __m512h r9 = tile_cvtrowps2phl_test<3>()();
+ __m512h r10 = tile_cvtrowps2phli_test<4, 2>()();
+ __m512 r11 = tile_movrow_test<5>()();
+ __m512 r12 = tile_movrowi_test<6, 4>()();
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-bf16 -O0" } */
+/* { dg-final { scan-assembler "tdpbf16ps\[ \\t]+%tmm2,\[ \\t\]*%tmm1,\[ \\t\]*%tmm0" } } */
+
+#include <immintrin.h>
+
+template <int dst, int src1, int src2>
+struct dpbf16ps
+{
+ void operator()() { _tile_dpbf16ps(dst, src1, src2); }
+};
+
+void test_amx_bf16()
+{
+ dpbf16ps<0, 1, 2>()();
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-complex -O0" } */
+/* { dg-final { scan-assembler "tcmmimfp16ps\[ \\t]+%tmm2,\[ \\t\]*%tmm1,\[ \\t\]*%tmm0" } } */
+/* { dg-final { scan-assembler "tcmmrlfp16ps\[ \\t]+%tmm5,\[ \\t\]*%tmm4,\[ \\t\]*%tmm3" } } */
+
+#include <immintrin.h>
+
+template <int dst, int src1, int src2>
+struct cmmimfp16ps
+{
+ void operator()() { _tile_cmmimfp16ps(dst, src1, src2); }
+};
+
+template <int dst, int src1, int src2>
+struct cmmrlfp16ps
+{
+ void operator()() { _tile_cmmrlfp16ps(dst, src1, src2); }
+};
+
+void test_amx_complex()
+{
+ cmmimfp16ps<0, 1, 2>()();
+ cmmrlfp16ps<3, 4, 5>()();
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-fp16 -O0" } */
+/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+%tmm2,\[ \\t\]*%tmm1,\[ \\t\]*%tmm0" } } */
+
+#include <immintrin.h>
+
+template <int dst, int src1, int src2>
+struct dpfp16ps
+{
+ void operator()() { _tile_dpfp16ps(dst, src1, src2); }
+};
+
+void test_amx_fp16()
+{
+ dpfp16ps<0, 1, 2>()();
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-fp8 -O0" } */
+/* { dg-final { scan-assembler "tdpbf8ps\[ \\t]+%tmm2,\[ \\t\]*%tmm1,\[ \\t\]*%tmm0" } } */
+/* { dg-final { scan-assembler "tdpbhf8ps\[ \\t]+%tmm5,\[ \\t\]*%tmm4,\[ \\t\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdphbf8ps\[ \\t]+%tmm2,\[ \\t\]*%tmm1,\[ \\t\]*%tmm6" } } */
+/* { dg-final { scan-assembler "tdphf8ps\[ \\t]+%tmm1,\[ \\t\]*%tmm0,\[ \\t\]*%tmm7" } } */
+
+#include <immintrin.h>
+
+template <int dst, int src1, int src2>
+struct dpbf8ps
+{
+ void operator()() { _tile_dpbf8ps(dst, src1, src2); }
+};
+
+template <int dst, int src1, int src2>
+struct dpbhf8ps
+{
+ void operator()() { _tile_dpbhf8ps(dst, src1, src2); }
+};
+
+template <int dst, int src1, int src2>
+struct dphbf8ps
+{
+ void operator()() { _tile_dphbf8ps(dst, src1, src2); }
+};
+
+template <int dst, int src1, int src2>
+struct dphf8ps
+{
+ void operator()() { _tile_dphf8ps(dst, src1, src2); }
+};
+
+void test_amx_fp8()
+{
+ dpbf8ps<0, 1, 2>()();
+ dpbhf8ps<3, 4, 5>()();
+ dphbf8ps<6, 1, 2>()();
+ dphf8ps<7, 0, 1>()();
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-int8 -O0" } */
+/* { dg-final { scan-assembler "tdpbssd\[ \\t]+%tmm2,\[ \\t\]*%tmm1,\[ \\t\]*%tmm0" } } */
+/* { dg-final { scan-assembler "tdpbsud\[ \\t]+%tmm5,\[ \\t\]*%tmm4,\[ \\t\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdpbusd\[ \\t]+%tmm2,\[ \\t\]*%tmm1,\[ \\t\]*%tmm6" } } */
+/* { dg-final { scan-assembler "tdpbuud\[ \\t]+%tmm1,\[ \\t\]*%tmm0,\[ \\t\]*%tmm7" } } */
+
+#include <immintrin.h>
+
+template <int dst, int src1, int src2>
+struct dpbssd
+{
+ void operator()() { _tile_dpbssd(dst, src1, src2); }
+};
+
+template <int dst, int src1, int src2>
+struct dpbsud
+{
+ void operator()() { _tile_dpbsud(dst, src1, src2); }
+};
+
+template <int dst, int src1, int src2>
+struct dpbusd
+{
+ void operator()() { _tile_dpbusd(dst, src1, src2); }
+};
+
+template <int dst, int src1, int src2>
+struct dpbuud
+{
+ void operator()() { _tile_dpbuud(dst, src1, src2); }
+};
+
+void test_amx_int8()
+{
+ dpbssd<0, 1, 2>()();
+ dpbsud<3, 4, 5>()();
+ dpbusd<6, 1, 2>()();
+ dpbuud<7, 0, 1>()();
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-movrs -O0" } */
+/* { dg-final { scan-assembler "tileloaddrs\[ \\t]+\[^\n\]*,\[ \\t\]*%tmm0" } } */
+/* { dg-final { scan-assembler "tileloaddrst1\[ \\t]+\[^\n\]*,\[ \\t\]*%tmm1" } } */
+
+#include <immintrin.h>
+
+template <int tmm_num>
+struct tile_loaddrs_test
+{
+ void operator()(const void* base, int stride)
+ {
+ _tile_loaddrs(tmm_num, base, stride);
+ }
+};
+
+template <int tmm_num>
+struct tile_loaddrst1_test
+{
+ void operator()(const void* base, int stride)
+ {
+ _tile_loaddrst1(tmm_num, base, stride);
+ }
+};
+
+void test_amx_movrs()
+{
+ char buf[1024];
+ tile_loaddrs_test<0>()(buf, 64);
+ tile_loaddrst1_test<1>()(buf, 64);
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -mamx-tf32 -O0" } */
+/* { dg-final { scan-assembler "tmmultf32ps\[ \\t]+%tmm2,\[ \\t\]*%tmm1,\[ \\t\]*%tmm0" } } */
+
+#include <immintrin.h>
+
+template <int dst, int src1, int src2>
+struct mmultf32ps
+{
+ void operator()() { _tile_mmultf32ps(dst, src1, src2); }
+};
+
+void test_amx_tf32()
+{
+ mmultf32ps<0, 1, 2>()();
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mamx-tile -O0" } */
+/* { dg-final { scan-assembler "tileloadd\[ \\t]+\[^\n\]*,\[ \\t\]*%tmm0" } } */
+/* { dg-final { scan-assembler "tilestored\[ \\t]+%tmm1," } } */
+/* { dg-final { scan-assembler "tilezero\[ \\t]+%tmm3" } } */
+/* { dg-final { scan-assembler "tileloaddt1\[ \\t]+\[^\n\]*,\[ \\t\]*%tmm2" } } */
+
+#include <immintrin.h>
+
+template <int tmm_num>
+struct tile_loadd_test
+{
+ void operator()(const void* base, int stride)
+ {
+ _tile_loadd(tmm_num, base, stride);
+ }
+};
+
+template <int tmm_num>
+struct tile_stored_test
+{
+ void operator()(void* base, int stride)
+ {
+ _tile_stored(tmm_num, base, stride);
+ }
+};
+
+template <int tmm_num>
+struct tile_zero_test
+{
+ void operator()() { _tile_zero(tmm_num); }
+};
+
+template <int tmm_num>
+struct tile_stream_loadd_test
+{
+ void operator()(const void* base, int stride)
+ {
+ _tile_stream_loadd(tmm_num, base, stride);
+ }
+};
+
+void test_amx_tile()
+{
+ char buf[1024];
+ tile_loadd_test<0>()(buf, 64);
+ tile_stored_test<1>()(buf, 64);
+ tile_stream_loadd_test<2>()(buf, 64);
+ tile_zero_test<3>()();
+}
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-require-effective-target masm_intel } */
/* { dg-options "-O2 -mamx-bf16 -masm=intel" } */
-/* { dg-final { scan-assembler "tdpbf16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdpbf16ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
#include <immintrin.h>
void TEST ()
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-require-effective-target masm_intel } */
/* { dg-options "-O2 -mamx-complex -masm=intel" } */
-/* { dg-final { scan-assembler "tcmmimfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
-/* { dg-final { scan-assembler "tcmmrlfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tcmmimfp16ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
+/* { dg-final { scan-assembler "tcmmrlfp16ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
#include <immintrin.h>
void TEST()
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-require-effective-target masm_intel } */
/* { dg-options "-O2 -mamx-fp16 -masm=intel" } */
-/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
#include <immintrin.h>
void TEST ()
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-require-effective-target masm_intel } */
/* { dg-options "-O2 -mamx-fp16 -masm=intel" } */
-/* { dg-final { scan-assembler "tdpbf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
-/* { dg-final { scan-assembler "tdpbhf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
-/* { dg-final { scan-assembler "tdphbf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
-/* { dg-final { scan-assembler "tdphf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdpbf8ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
+/* { dg-final { scan-assembler "tdpbhf8ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
+/* { dg-final { scan-assembler "tdphbf8ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
+/* { dg-final { scan-assembler "tdphf8ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
#include <immintrin.h>
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-require-effective-target masm_intel } */
/* { dg-options "-O2 -mamx-int8 -masm=intel" } */
-/* { dg-final { scan-assembler "tdpbssd\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
-/* { dg-final { scan-assembler "tdpbsud\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
-/* { dg-final { scan-assembler "tdpbusd\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
-/* { dg-final { scan-assembler "tdpbuud\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdpbssd\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
+/* { dg-final { scan-assembler "tdpbsud\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
+/* { dg-final { scan-assembler "tdpbusd\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
+/* { dg-final { scan-assembler "tdpbuud\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
#include <immintrin.h>
void TEST ()
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-require-effective-target masm_intel } */
/* { dg-options "-O2 -mamx-movrs -masm=intel" } */
-/* { dg-final { scan-assembler-times "tileloaddrs\[ \\t]%tmm\[0-9\]" 1 } } */
-/* { dg-final { scan-assembler-times "tileloaddrst1\[ \\t]%tmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "tileloaddrs\[ \\t]tmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "tileloaddrst1\[ \\t]tmm\[0-9\]" 1 } } */
#include <immintrin.h>
extern const void* base;
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-require-effective-target masm_intel } */
/* { dg-options "-O2 -mamx-tf32 -masm=intel" } */
-/* { dg-final { scan-assembler "tmmultf32ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tmmultf32ps\[ \\t]+\[^\n%\]*tmm1+\[^\n%\]*tmm2+\[^\n%\]*tmm3" } } */
#include <immintrin.h>
void TEST()
/* { dg-final { scan-assembler "ldtilecfg\[ \\t]" } } */
/* { dg-final { scan-assembler "sttilecfg\[ \\t]" } } */
/* { dg-final { scan-assembler "tilerelease" } } */
-/* { dg-final { scan-assembler "tileloadd\[ \\t]%tmm\[0-9\]" } } */
-/* { dg-final { scan-assembler "tileloaddt1\[ \\t]%tmm\[0-9\]" } } */
-/* { dg-final { scan-assembler "tilestored\[ \\t]\[^\n\]+\[^\n\]*%tmm\[0-9\]" } } */
-/* { dg-final { scan-assembler "tilezero\[ \\t]+\[^\n\]*%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "tileloadd\[ \\t]tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "tileloaddt1\[ \\t]tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "tilestored\[ \\t]\[^\n\]+\[^\n\]*tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "tilezero\[ \\t]+\[^\n%\]*tmm\[0-9\]" } } */
#include <immintrin.h>
extern int a[];