set_feature (FEATURE_AMX_AVX512);
if (eax & bit_AMX_TF32)
set_feature (FEATURE_AMX_TF32);
+ if (eax & bit_AMX_TRANSPOSE)
+ set_feature (FEATURE_AMX_TRANSPOSE);
}
}
| OPTION_MASK_ISA2_AMX_AVX512)
#define OPTION_MASK_ISA2_AMX_TF32_SET \
(OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_TF32)
+#define OPTION_MASK_ISA2_AMX_TRANSPOSE_SET \
+ (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_TRANSPOSE)
/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2. */
(OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_INT8_UNSET \
| OPTION_MASK_ISA2_AMX_BF16_UNSET | OPTION_MASK_ISA2_AMX_FP16_UNSET \
| OPTION_MASK_ISA2_AMX_COMPLEX_UNSET | OPTION_MASK_ISA2_AMX_AVX512_UNSET \
- | OPTION_MASK_ISA2_AMX_TF32_UNSET)
+ | OPTION_MASK_ISA2_AMX_TF32_UNSET | OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET)
#define OPTION_MASK_ISA2_AMX_INT8_UNSET OPTION_MASK_ISA2_AMX_INT8
#define OPTION_MASK_ISA2_AMX_BF16_UNSET OPTION_MASK_ISA2_AMX_BF16
#define OPTION_MASK_ISA2_UINTR_UNSET OPTION_MASK_ISA2_UINTR
(OPTION_MASK_ISA2_AVX10_2_512 | OPTION_MASK_ISA2_AMX_AVX512_UNSET)
#define OPTION_MASK_ISA2_AMX_AVX512_UNSET OPTION_MASK_ISA2_AMX_AVX512
#define OPTION_MASK_ISA2_AMX_TF32_UNSET OPTION_MASK_ISA2_AMX_TF32
+#define OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET OPTION_MASK_ISA2_AMX_TRANSPOSE
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
as -mno-sse4.1. */
}
return true;
+ case OPT_mamx_transpose:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_TRANSPOSE_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_TRANSPOSE_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET;
+ opts->x_ix86_isa_flags2_explicit |=
+ OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET;
+ }
+ return true;
+
case OPT_mfma:
if (value)
{
FEATURE_AVX10_2_512,
FEATURE_AMX_AVX512,
FEATURE_AMX_TF32,
+ FEATURE_AMX_TRANSPOSE,
CPU_FEATURE_MAX
};
ISA_NAMES_TABLE_ENTRY("amx-avx512", FEATURE_AMX_AVX512, P_NONE,
"-mamx-avx512")
ISA_NAMES_TABLE_ENTRY("amx-tf32", FEATURE_AMX_TF32, P_NONE, "-mamx-tf32")
+ ISA_NAMES_TABLE_ENTRY("amx-transpose", FEATURE_AMX_TRANSPOSE,
+ P_NONE, "-mamx-transpose")
ISA_NAMES_TABLE_END
avx10_2bf16intrin.h avx10_2-512bf16intrin.h
avx10_2satcvtintrin.h avx10_2-512satcvtintrin.h
avx10_2minmaxintrin.h avx10_2-512minmaxintrin.h
- avx10_2copyintrin.h amxavx512intrin.h amxtf32intrin.h"
+ avx10_2copyintrin.h amxavx512intrin.h amxtf32intrin.h
+ amxtransposeintrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
--- /dev/null
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxtransposeintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXTRANSPOSEINTRIN_H_INCLUDED
+#define _AMXTRANSPOSEINTRIN_H_INCLUDED
+
+#if !defined(__AMX_TRANSPOSE__)
+#pragma GCC push_options
+#pragma GCC target("amx-transpose")
+#define __DISABLE_AMX_TRANSPOSE__
+#endif /* __AMX_TRANSPOSE__ */
+
+#if defined(__x86_64__)
+#define _tile_transposed_internal(dst,src) \
+ __asm__ volatile\
+ ("{ttransposed\t%%tmm"#src", %%tmm"#dst"|ttransposed\t%%tmm"#dst", %%tmm"#src"}" ::)
+
+#define _tile_2rpntlvwz0_internal(dst,base,stride) \
+ __asm__ volatile\
+ ("{t2rpntlvwz0\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz0\t%%tmm"#dst", [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((long) (stride)))
+
+#define _tile_2rpntlvwz0t1_internal(dst,base,stride) \
+ __asm__ volatile\
+ ("{t2rpntlvwz0t1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz0t1\t%%tmm"#dst", [%0+%1*1]}" \
+ :: "r" ((const void*)(base)), "r" ((long)(stride)))
+
+#define _tile_2rpntlvwz1_internal(dst,base,stride) \
+ __asm__ volatile\
+ ("{t2rpntlvwz1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz1\t%%tmm"#dst", [%0+%1*1]}" \
+ :: "r" ((const void*)(base)), "r" ((long)(stride)))
+
+#define _tile_2rpntlvwz1t1_internal(dst,base,stride) \
+ __asm__ volatile\
+ ("{t2rpntlvwz1t1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz1t1\t%%tmm"#dst", [%0+%1*1]}" \
+ :: "r" ((const void*)(base)), "r" ((long)(stride)))
+
+#define _tile_transposed(dst,src) \
+ _tile_transposed_internal (dst, src)
+
+#define _tile_2rpntlvwz0(dst,base,stride) \
+ _tile_2rpntlvwz0_internal (dst, base, stride)
+
+#define _tile_2rpntlvwz0t1(dst,base,stride) \
+ _tile_2rpntlvwz0t1_internal (dst, base, stride)
+
+#define _tile_2rpntlvwz1(dst,base,stride) \
+ _tile_2rpntlvwz1_internal (dst, base, stride)
+
+#define _tile_2rpntlvwz1t1(dst,base,stride) \
+ _tile_2rpntlvwz1t1_internal (dst, base, stride)
+
+#if !defined(__AMX_BF16__)
+#pragma GCC push_options
+#pragma GCC target("amx-bf16")
+#define __DISABLE_AMX_BF16__
+#endif /* __AMX_BF16__ */
+
+#define _tile_tdpbf16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{ttdpbf16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttdpbf16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_tdpbf16ps(src1_dst,src2,src3) \
+ _tile_tdpbf16ps_internal (src1_dst, src2, src3)
+
+#ifdef __DISABLE_AMX_BF16__
+#undef __DISABLE_AMX_BF16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_BF16__ */
+
+#if !defined(__AMX_FP16__)
+#pragma GCC push_options
+#pragma GCC target("amx-fp16")
+#define __DISABLE_AMX_FP16__
+#endif /* __AMX_FP16__ */
+
+#define _tile_tdpfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{ttdpfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttdpfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_tdpfp16ps(src1_dst,src2,src3) \
+ _tile_tdpfp16ps_internal (src1_dst, src2, src3)
+
+#ifdef __DISABLE_AMX_FP16__
+#undef __DISABLE_AMX_FP16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_FP16__ */
+
+#if !defined(__AMX_COMPLEX__)
+#pragma GCC push_options
+#pragma GCC target("amx-complex")
+#define __DISABLE_AMX_COMPLEX__
+#endif /* __AMX_COMPLEX__ */
+
+#define _tile_conjtcmmimfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{tconjtcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tconjtcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_conjtfp16_internal(dst,src) \
+ __asm__ volatile\
+ ("{tconjtfp16\t%%tmm"#src", %%tmm"#dst"|tconjtfp16\t%%tmm"#dst", %%tmm"#src"}" ::)
+
+#define _tile_tcmmimfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{ttcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_tcmmrlfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{ttcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_conjtcmmimfp16ps(src1_dst,src2,src3) \
+ _tile_conjtcmmimfp16ps_internal (src1_dst, src2, src3)
+
+#define _tile_conjtfp16(dst,src) \
+ _tile_conjtfp16_internal (dst, src)
+
+#define _tile_tcmmimfp16ps(src1_dst,src2,src3) \
+ _tile_tcmmimfp16ps_internal (src1_dst, src2, src3)
+
+#define _tile_tcmmrlfp16ps(src1_dst,src2,src3) \
+ _tile_tcmmrlfp16ps_internal (src1_dst, src2, src3)
+
+#ifdef __DISABLE_AMX_COMPLEX__
+#undef __DISABLE_AMX_COMPLEX__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_COMPLEX__ */
+
+#if !defined(__AMX_TF32__)
+#pragma GCC push_options
+#pragma GCC target("amx-tf32")
+#define __DISABLE_AMX_TF32__
+#endif /* __AMX_TF32__ */
+
+#define _tile_tmmultf32ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{ttmmultf32ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttmmultf32ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_tmmultf32ps(src1_dst,src2,src3) \
+ _tile_tmmultf32ps_internal (src1_dst, src2, src3)
+
+#ifdef __DISABLE_AMX_TF32__
+#undef __DISABLE_AMX_TF32__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_TF32__ */
+
+#endif /* __x86_64__ */
+
+#ifdef __DISABLE_AMX_TRANSPOSE__
+#undef __DISABLE_AMX_TRANSPOSE__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_TRANSPOSE__ */
+
+#endif /* _AMXTRANSPOSEINTRIN_H_INCLUDED */
/* AMX sub leaf (%eax == 0x1e, %ecx == 1) */
/* %eax */
+#define bit_AMX_TRANSPOSE (1 << 5)
#define bit_AMX_TF32 (1 << 6)
#define bit_AMX_AVX512 (1 << 7)
def_or_undef (parse_in, "__AMX_AVX512__");
if (isa_flag2 & OPTION_MASK_ISA2_AMX_TF32)
def_or_undef (parse_in, "__AMX_TF32__");
+ if (isa_flag2 & OPTION_MASK_ISA2_AMX_TRANSPOSE)
+ def_or_undef (parse_in, "__AMX_TRANSPOSE__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
DEF_PTA(AVX10_2_512)
DEF_PTA(AMX_AVX512)
DEF_PTA(AMX_TF32)
+DEF_PTA(AMX_TRANSPOSE)
{ "-mavx10.2-256", OPTION_MASK_ISA2_AVX10_2_256 },
{ "-mavx10.2-512", OPTION_MASK_ISA2_AVX10_2_512 },
{ "-mamx-avx512", OPTION_MASK_ISA2_AMX_AVX512 },
- { "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 }
+ { "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 },
+ { "-mamx-transpose", OPTION_MASK_ISA2_AMX_TRANSPOSE }
};
static struct ix86_target_opts isa_opts[] =
{
IX86_ATTR_ISA ("avx10.2-512", OPT_mavx10_2_512),
IX86_ATTR_ISA ("amx-avx512", OPT_mamx_avx512),
IX86_ATTR_ISA ("amx-tf32", OPT_mamx_tf32),
+ IX86_ATTR_ISA ("amx-transpose", OPT_mamx_transpose),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
mamx-tf32
Target Mask(ISA2_AMX_TF32) Var(ix86_isa_flags2) Save
Support AMX-TF32 built-in functions and code generation.
+
+mamx-transpose
+Target Mask(ISA2_AMX_TRANSPOSE) Var(ix86_isa_flags2) Save
+Support AMX-TRANSPOSE built-in functions and code generation.
mamx-tf32
UrlSuffix(gcc/x86-Options.html#index-mamx-tf32)
+mamx-transpose
+UrlSuffix(gcc/x86-Options.html#index-mamx-transpose)
+
#include <amxtf32intrin.h>
+#include <amxtransposeintrin.h>
+
#include <prfchwintrin.h>
#include <keylockerintrin.h>
@itemx no-amx-tf32
Enable/disable the generation of the AMX-TF32 instructions.
+@cindex @code{target("amx-transpose")} function attribute, x86
+@item amx-transpose
+@itemx no-amx-transpose
+Enable/disable the generation of the AMX-TRANSPOSE instructions.
+
@cindex @code{target("cld")} function attribute, x86
@item cld
@itemx no-cld
-mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16
-mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mapxf
-musermsr -mavx10.1 -mavx10.1-256 -mavx10.1-512 -mevex512 -mavx10.2 -mavx10.2-256
--mavx10.2-512 -mamx-avx512 -mamx-tf32
+-mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose
-mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops
-minline-stringops-dynamically -mstringop-strategy=@var{alg}
-mkl -mwidekl
@need 200
@opindex mamx-tf32
@itemx -mamx-tf32
+@need 200
+@opindex mamx-transpose
+@itemx -mamx-transpose
These switches enable the use of instructions in the MMX, SSE,
AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA, AES,
PCLMUL, CLFLUSHOPT, CLWB, FSGSBASE, PTWRITE, RDRND, F16C, FMA, PCONFIG,
AVX512VPOPCNTDQ, AVX512VNNI, SERIALIZE, UINTR, HRESET, AMXTILE, AMXINT8,
AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16, AVXIFMA, AVXVNNIINT8, AVXNECONVERT,
CMPCCXADD, AMX-FP16, PREFETCHI, RAOINT, AMX-COMPLEX, AVXVNNIINT16, SM3, SHA512,
-SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32 or CLDEMOTE
-extended instruction sets. Each has a corresponding @option{-mno-} option to
-disable use of these instructions.
+SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32, AMX-TRANSPOSE or
+CLDEMOTE extended instruction sets. Each has a corresponding @option{-mno-}
+option to disable use of these instructions.
These extensions are also available as built-in functions: see
@ref{x86 Built-in Functions}, for details of the functions enabled and
@item amx_tf32
Target supports the execution of @code{amx-tf32} instructions.
+@item amx_transpose
+Target supports the execution of @code{amx-transpose} instructions.
+
@item cell_hw
Test system can execute AltiVec and Cell PPU instructions.
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
/* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
/* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
int colsb;
} __tile;
+typedef struct __tilepair
+{
+ /* Max size of tile register */
+ uint8_t buf[2048];
+ int rows;
+ int colsb;
+} __tilepair;
+
/* Maxium col/row size in bytes */
#define MAX_ROWS 16
#define MAX_COLS 64
src->buf[i * src->colsb + j] = 0;
}
+/* Zero __tilepair src. It should be init first. */
+void zero_pair_tile_src (__tilepair *src)
+{
+ memset(src->buf, 0, 2048);
+}
+
/* Compare tile config value with __tilecfg_u dst */
int check_tile_config (__tilecfg_u *src, __tilecfg_u *dst)
{
return 1;
}
+/* Compare pair_tile register value with __tile variable */
+int check_pair_tile_register (__tile* ref_0, __tile* ref_1, __tilepair* target)
+{
+ /* Tile register should be stored from tmm to
+ memory and compare with emulation results. */
+ int rows = target->rows;
+ int colsb = target->colsb;
+ int i, j;
+
+ for (i = 0; i < rows; i++)
+ for (j = 0; j < colsb; j++)
+ {
+ if (ref_0->buf[i * colsb + j] != target->buf[i * colsb + j])
+ return 0;
+ if (ref_1->buf[i * colsb + j] != target->buf[rows * colsb + i * colsb + j])
+ return 0;
+ }
+
+ return 1;
+}
+
#ifndef DO_TEST
#define DO_TEST do_test
static void test_amx (void);
#ifdef AMX_TF32
&& __builtin_cpu_supports ("amx-tf32")
#endif
+#ifdef AMX_TRANSPOSE
+ && __builtin_cpu_supports ("amx-transpose")
+#endif
#ifdef __linux__
&& request_perm_xtile_data ()
#endif
}
#endif
-#if defined (AMX_AVX512)
+#if defined (AMX_AVX512) || defined (AMX_BF16)
/* Transformation functions between bf16/float */
static uint16_t make_f32_bf16 (float f)
{
return tmp.f;
}
+void init_pair_tile_src (int tmm_num, __tilepair *src, uint8_t *_buffer, int z)
+{
+ int rows, colsb, start, i, j, t, elements[2];
+ uint16_t *buffer = (uint16_t *) _buffer;
+ uint16_t *ptr = (uint16_t *) src->buf;
+ __tilecfg_u tmp;
+
+ _tile_storeconfig (tmp.a);
+
+ tmm_num &= ~1;
+
+ rows = tmp.s.rows[tmm_num];
+ colsb = tmp.s.colsb[tmm_num];
+ start = tmp.s.start_row;
+
+ zero_pair_tile_src (src);
+
+ for (t = 0; t < 2; t++)
+ elements[t] = tmp.s.colsb[tmm_num + t] / 4;
+
+ src->colsb = (tmp.s.colsb[tmm_num] + tmp.s.colsb[tmm_num + 1]) / 2;
+ src->rows = rows;
+
+ while (start < 2 * rows)
+ {
+ int r = start / 2;
+ int w = start % 2;
+
+ if (start < 2 * rows - z)
+ for (t = 0; t < 2; t++)
+ if (tmp.s.colsb[tmm_num + t] > 0)
+ for (i = 0; i < elements[t]; i++)
+ ptr[t * rows * colsb / 2 + r * elements[t] * 2 + 2 * i + w] =
+ buffer[start * colsb / 2 + t * elements[0] + i];
+ start++;
+ }
+}
+
#endif
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-options "-O2 -mamx-transpose" } */
+#define AMX_TRANSPOSE
+#define DO_TEST test_amx_transpose_t2rpntlvw
+void test_amx_transpose_t2rpntlvw ();
+#include "amx-helper.h"
+#define init_pair_tile_reg_and_src_z(tmm_num, src, buffer, ztype) \
+{ \
+ init_pair_tile_src (tmm_num, &src, buffer, ztype); \
+ _tile_2rpntlvwz##ztype (tmm_num, buffer, _STRIDE); \
+}
+
+void test_amx_transpose_t2rpntlvw ()
+{
+ __tilecfg_u cfg;
+ __tilepair src;
+ __tile ref_0, ref_1;
+ uint8_t buffer[2048];
+ int i;
+
+ init_tile_config (&cfg);
+
+ for (i = 0; i < 2048; i++)
+ buffer[i] = i % 256;
+
+ /* Check t2rpntlvwz0. */
+ init_pair_tile_reg_and_src_z (0, src, buffer, 0);
+ _tile_stored (0, ref_0.buf, _STRIDE);
+ _tile_stored (1, ref_1.buf, _STRIDE);
+ if (!check_pair_tile_register (&ref_0, &ref_1, &src))
+ abort ();
+
+ /* Check t2rpntlvwz1. */
+ init_pair_tile_reg_and_src_z (1, src, buffer, 1);
+ _tile_stored (0, ref_0.buf, _STRIDE);
+ _tile_stored (1, ref_1.buf, _STRIDE);
+ if (!check_pair_tile_register (&ref_0, &ref_1, &src))
+ abort ();
+
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mamx-transpose -mamx-bf16 -mamx-complex -mamx-fp16 -mamx-tf32" } */
+/* { dg-final { scan-assembler "ttdpbf16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "ttdpfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "ttransposed\[ \\t]+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "t2rpntlvwz0\[ \\t]+\[^\n\]*\\(%\[a-z0-9]*\,%\[a-z0-9\]*\,\[124\]\\)+\[^\n\]*%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "t2rpntlvwz0t1\[ \\t]+\[^\n\]*\\(%\[a-z0-9]*\,%\[a-z0-9\]*\,\[124\]\\)+\[^\n\]*%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "t2rpntlvwz1\[ \\t]+\[^\n\]*\\(%\[a-z0-9]*\,%\[a-z0-9\]*\,\[124\]\\)+\[^\n\]*%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "t2rpntlvwz1t1\[ \\t]+\[^\n\]*\\(%\[a-z0-9]*\,%\[a-z0-9\]*\,\[124\]\\)+\[^\n\]*%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "tconjtcmmimfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "tconjtfp16\[ \\t]+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "ttcmmimfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "ttcmmrlfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "ttmmultf32ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+#include <immintrin.h>
+
+extern const void* base;
+extern const int stride;
+
+#define TMM0 0
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+
+void TEST()
+{
+ _tile_tdpbf16ps (TMM1, TMM2, TMM3);
+ _tile_tdpfp16ps (TMM1, TMM2, TMM3);
+ _tile_transposed (TMM1, TMM2);
+ _tile_2rpntlvwz0 (TMM0, base, stride);
+ _tile_2rpntlvwz0t1 (TMM1, base, stride);
+ _tile_2rpntlvwz1 (TMM2, base, stride);
+ _tile_2rpntlvwz1t1 (TMM3, base, stride);
+ _tile_conjtcmmimfp16ps (TMM1, TMM2, TMM3);
+ _tile_conjtfp16 (TMM1, TMM2);
+ _tile_tcmmimfp16ps (TMM1, TMM2, TMM3);
+ _tile_tcmmrlfp16ps (TMM1, TMM2, TMM3);
+ _tile_tmmultf32ps (TMM1, TMM2, TMM3);
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target masm_intel } */
+/* { dg-options "-O2 -mamx-transpose -mamx-bf16 -mamx-complex -mamx-fp16 -mamx-tf32 -masm=intel" } */
+/* { dg-final { scan-assembler "ttdpbf16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "ttdpfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "ttransposed\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2" } } */
+/* { dg-final { scan-assembler "t2rpntlvwz0\[ \\t]%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "t2rpntlvwz0t1\[ \\t]%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "t2rpntlvwz1\[ \\t]%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "t2rpntlvwz1t1\[ \\t]%tmm\[0-9\]" } } */
+/* { dg-final { scan-assembler "tconjtcmmimfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tconjtfp16\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2" } } */
+/* { dg-final { scan-assembler "ttcmmimfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "ttcmmrlfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "ttmmultf32ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+#include <immintrin.h>
+
+extern const void* base;
+extern const int stride;
+
+void TEST()
+{
+ _tile_tdpbf16ps (1, 2, 3);
+ _tile_tdpfp16ps (1, 2, 3);
+ _tile_transposed (1, 2);
+ _tile_2rpntlvwz0 (5, base, stride);
+ _tile_2rpntlvwz0t1 (4, base, stride);
+ _tile_2rpntlvwz1 (3, base, stride);
+ _tile_2rpntlvwz1t1 (2, base, stride);
+ _tile_conjtcmmimfp16ps (1, 2, 3);
+ _tile_conjtfp16 (1, 2);
+ _tile_tcmmimfp16ps (1, 2, 3);
+ _tile_tcmmrlfp16ps (1, 2, 3);
+ _tile_tmmultf32ps (1, 2, 3);
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-require-effective-target amx_complex } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-transpose -mamx-complex -mavx512fp16" } */
+#define AMX_TRANSPOSE
+#define AMX_COMPLEX
+#define DO_TEST test_amx_transpose_conjtcmmimfp16ps
+void test_amx_transpose_conjtcmmimfp16ps ();
+#include "amx-helper.h"
+
+void calc_matrix_conjtcmmimfp16ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *) src1->buf;
+ uint16_t *src2_buf = (uint16_t *) src2->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int K = src1->rows;
+ int M = src1->colsb / 4;
+ int N = src2->colsb / 4;
+ int m, k, n, t;
+
+ for (m = 0; m < M; m++)
+ for (k = 0; k < K; k++)
+ for (n = 0; n < N; n++)
+ for (t = 0; t < 2; t+=2)
+ dst_buf[m * N + n] +=
+ (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t]) *
+ make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t + 1])) -
+ (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t + 1]) *
+ make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t]));
+}
+
+void test_amx_transpose_conjtcmmimfp16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_conjtcmmimfp16ps (&dst, &src1, &src2);
+
+ _tile_conjtcmmimfp16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-require-effective-target amx_complex } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-transpose -mamx-complex -mavx512fp16" } */
+#define AMX_TRANSPOSE
+#define AMX_COMPLEX
+#define DO_TEST test_amx_transpose_conjtfp16
+void test_amx_transpose_conjtfp16 ();
+#include "amx-helper.h"
+
+void calc_matrix_conjtfp16 (__tile *dst, __tile *src)
+{
+ uint16_t *src_buf = (uint16_t *) src->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int M = dst->rows;
+ int N = dst->colsb / 4;
+ int i, j, t;
+
+ for (i = 0; i < M; i++)
+ for (j = 0; j < N; j++)
+ for (t = 0; t < 2; t+=2)
+ {
+ dst_buf[i * 2 * N + 2 * j + t] = src_buf[j * 2 * M + 2 * i + t];
+ dst_buf[i * 2 * N + 2 * j + t + 1] = -src_buf[j * 2 * M + 2 * i + t + 1];
+ }
+}
+
+void test_amx_transpose_conjtfp16 ()
+{
+ __tilecfg_u cfg;
+ __tile src, dst, ref;
+ uint8_t tmp_dst_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (2, src, tmp_dst_buf);
+
+ /* Check tconjtfp16. */
+ calc_matrix_conjtfp16 (&dst, &src);
+ _tile_conjtfp16 (1, 2);
+ _tile_stored (1, ref.buf, _STRIDE);
+
+ if (!check_tile_register (&ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-require-effective-target amx_complex } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-transpose -mamx-complex -mavx512fp16" } */
+#define AMX_TRANSPOSE
+#define AMX_COMPLEX
+#define DO_TEST test_amx_transpose_tcmmimfp16ps
+void test_amx_transpose_tcmmimfp16ps ();
+#include "amx-helper.h"
+
+void calc_matrix_tcmmimfp16ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *) src1->buf;
+ uint16_t *src2_buf = (uint16_t *) src2->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int K = src1->rows;
+ int M = src1->colsb / 4;
+ int N = src2->colsb / 4;
+ int m, k, n, t;
+
+ for (m = 0; m < M; m++)
+ for (k = 0; k < K; k++)
+ for (n = 0; n < N; n++)
+ for (t = 0; t < 2; t+=2)
+ dst_buf[m * N + n] +=
+ (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t]) *
+ make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t + 1])) +
+ (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t + 1]) *
+ make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t]));
+}
+
+void test_amx_transpose_tcmmimfp16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_tcmmimfp16ps (&dst, &src1, &src2);
+
+ _tile_tcmmimfp16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-require-effective-target amx_complex } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-transpose -mamx-complex -mavx512fp16" } */
+#define AMX_TRANSPOSE
+#define AMX_COMPLEX
+#define DO_TEST test_amx_transpose_tcmmrlfp16ps
+void test_amx_transpose_tcmmrlfp16ps ();
+#include "amx-helper.h"
+
+void calc_matrix_tcmmrlfp16ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *) src1->buf;
+ uint16_t *src2_buf = (uint16_t *) src2->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int K = src1->rows;
+ int M = src1->colsb / 4;
+ int N = src2->colsb / 4;
+ int m, k, n, t;
+
+ for (m = 0; m < M; m++)
+ for (k = 0; k < K; k++)
+ for (n = 0; n < N; n++)
+ for (t = 0; t < 2; t+=2)
+ dst_buf[m * N + n] +=
+ (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t]) *
+ make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t])) -
+ (make_fp16_f32(src1_buf[k * 2 * M + 2 * m + t + 1]) *
+ make_fp16_f32(src2_buf[k * 2 * N + 2 * n + t + 1]));
+}
+
+void test_amx_transpose_tcmmrlfp16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_tcmmrlfp16ps (&dst, &src1, &src2);
+
+ _tile_tcmmrlfp16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-require-effective-target amx_bf16 } */
+/* { dg-options "-O2 -mamx-transpose -mamx-bf16 -mavx512bf16" } */
+#define AMX_TRANSPOSE
+#define AMX_BF16
+#define DO_TEST test_amx_transpose_tdpbf16ps
+void test_amx_transpose_tdpbf16ps ();
+#include "amx-helper.h"
+
+void calc_matrix_tdpbf16ps(__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *) src1->buf;
+ uint16_t *src2_buf = (uint16_t *) src2->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int K = src1->rows;
+ int M = src1->colsb / 4;
+ int N = src2->colsb / 4;
+ int m, k, n, t;
+
+ for (m = 0; m < M; m++)
+ for (k = 0; k < K; k++)
+ for (n = 0; n < N; n++)
+ for (t = 0; t < 2; t+=2)
+ dst_buf[m * N + n] +=
+ (make_bf16_f32 (src1_buf[k * 2 * M + 2 * m + t]) *
+ make_bf16_f32 (src2_buf[k * 2 * N + 2 * n + t])) +
+ (make_bf16_f32 (src1_buf[k * 2 * M + 2 * m + t + 1]) *
+ make_bf16_f32 (src2_buf[k * 2 * N + 2 * n + t + 1]));
+}
+
+void test_amx_transpose_tdpbf16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024];
+
+ init_bf16_max_tile_buffer (tmp_dst_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_tdpbf16ps (&dst, &src1, &src2);
+
+ _tile_tdpbf16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_float_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-require-effective-target amx_fp16 } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-transpose -mamx-fp16 -mavx512fp16" } */
+#define AMX_TRANSPOSE
+#define AMX_FP16
+#define DO_TEST test_amx_transpose_tdpfp16ps
+void test_amx_transpose_tdpfp16ps ();
+#include "amx-helper.h"
+
+void calc_matrix_tdpfp16ps(__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *) src1->buf;
+ uint16_t *src2_buf = (uint16_t *) src2->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int K = src1->rows;
+ int M = src1->colsb / 4;
+ int N = src2->colsb / 4;
+ int m, k, n, t;
+
+ for (m = 0; m < M; m++)
+ for (k = 0; k < K; k++)
+ for (n = 0; n < N; n++)
+ for (t = 0; t < 2; t+=2)
+ dst_buf[m * N + n] +=
+ (make_fp16_f32 (src1_buf[k * 2 * M + 2 * m + t]) *
+ make_fp16_f32 (src2_buf[k * 2 * N + 2 * n + t])) +
+ (make_fp16_f32 (src1_buf[k * 2 * M + 2 * m + t + 1]) *
+ make_fp16_f32 (src2_buf[k * 2 * N + 2 * n + t + 1]));
+}
+
+void test_amx_transpose_tdpfp16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_fp16_max_tile_zero_buffer(tmp_dst_zero_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_tdpfp16ps (&dst, &src1, &src2);
+
+ _tile_tdpfp16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_float_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-require-effective-target amx_tf32 } */
+/* { dg-options "-O2 -mamx-transpose -mamx-tf32" } */
+#define AMX_TRANSPOSE
+#define AMX_TF32
+#define DO_TEST test_amx_transpose_tmmultf32ps
+void test_amx_transpose_tmmultf32ps();
+#include "amx-helper.h"
+
+void calc_matrix_tmmultf32ps(__tile *dst, __tile *src1, __tile *src2)
+{
+ float *src1_buf = (float *) src1->buf;
+ float *src2_buf = (float *) src2->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int K = src1->rows;
+ int M = src1->colsb / 4;
+ int N = src2->colsb / 4;
+ int m, n, k;
+
+ for (m = 0; m < M; m++)
+ for (k = 0; k < K; k++)
+ for (n = 0; n < N; n++)
+ dst_buf[m * N + n] +=
+ zero_lower_mantissa_bits_fp32 (silence_snan_fp32 (src1_buf[k * M + m])) *
+ zero_lower_mantissa_bits_fp32 (silence_snan_fp32 (src2_buf[k * N + n]));
+
+}
+
+void test_amx_transpose_tmmultf32ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024];
+
+ init_fp32_max_tile_buffer (tmp_dst_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_tmmultf32ps (&dst, &src1, &src2);
+
+ _tile_tmmultf32ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_transpose } */
+/* { dg-options "-O2 -mamx-transpose" } */
+#define AMX_TRANSPOSE
+#define DO_TEST test_amx_transpose_transposed
+void test_amx_transpose_transposed ();
+#include "amx-helper.h"
+
+void calc_matrix_ttransposed (__tile *dst, __tile *src)
+{
+ uint32_t *src_buf = (uint32_t *) src->buf;
+ uint32_t *dst_buf = (uint32_t *) dst->buf;
+
+ int M = src->rows;
+ int N = src->colsb / 4;
+ int i, j;
+
+ for (i = 0; i < M; i++)
+ for (j = 0; j < N; j++)
+ dst_buf[j * M + i] = (uint32_t) src_buf[i * N + j];
+}
+
+void test_amx_transpose_transposed ()
+{
+ __tilecfg_u cfg;
+ __tile src, dst, ref;
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src (1, dst);
+ init_tile_reg_and_src (2, src);
+
+ /* Check ttransposed. */
+ calc_matrix_ttransposed (&dst, &src);
+ _tile_transposed (1, 2);
+ _tile_stored (1, ref.buf, _STRIDE);
+
+ if (!check_tile_register (&ref, &dst))
+ abort ();
+}
extern void test_avx10_2_512 (void) __attribute__((__target__("avx10.2-512")));
extern void test_amx_avx512 (void) __attribute__((__target__("amx-avx512")));
extern void test_amx_tf32 (void) __attribute__((__target__("amx-tf32")));
+extern void test_amx_transpose (void) __attribute__((__target__("amx-transpose")));
extern void test_no_sgx (void) __attribute__((__target__("no-sgx")));
extern void test_no_avx512vpopcntdq(void) __attribute__((__target__("no-avx512vpopcntdq")));
extern void test_no_avx10_2_512 (void) __attribute__((__target__("no-avx10.2-512")));
extern void test_no_amx_avx512 (void) __attribute__((__target__("no-amx-avx512")));
extern void test_no_amx_tf32 (void) __attribute__((__target__("no-amx-tf32")));
+extern void test_no_amx_transpose (void) __attribute__((__target__("no-amx-transpose")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
extern void test_arch_core2 (void) __attribute__((__target__("arch=core2")));
popcntintrin.h gfniintrin.h and mm_malloc.h are usable
with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
#include <x86intrin.h>
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
/* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
#ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose")
#endif
/* Following intrinsics require immediate arguments. They
/* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
#ifdef DIFFERENT_PRAGMAS
-#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32")
+#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose")
#endif
#include <immintrin.h>
test_1 (_cvtss_sh, unsigned short, float, 1)
#define __builtin_ia32_minmaxps128_mask(A, B, C, D, E) __builtin_ia32_minmaxps128_mask (A, B, 100, D, E)
#define __builtin_ia32_minmaxps256_mask_round(A, B, C, D, E, F) __builtin_ia32_minmaxps256_mask_round (A, B, 100, D, E, 4)
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose")
#include <x86intrin.h>
} "-mamx-tf32" ]
}
+# Return 1 if amx-transpose instructions can be compiled.
+proc check_effective_target_amx_transpose { } {
+ return [check_no_compiler_messages amx_transpose object {
+ void
+ foo ()
+ {
+ __asm__ volatile ("ttransposed\t%%tmm1, %%tmm2" ::);
+ }
+ } "-mamx-transpose" ]
+}
+
# Return 1 if sse instructions can be compiled.
proc check_effective_target_sse { } {
return [check_no_compiler_messages sse object {