}
}
+ /* Get Advanced Features at level 0x1e (eax = 0x1e, ecx = 1). */
+ if (max_cpuid_level >= 0x1e)
+ {
+ __cpuid_count (0x1e, 1, eax, ebx, ecx, edx);
+ if (amx_usable)
+ {
+ if (eax & bit_AMX_AVX512)
+ set_feature (FEATURE_AMX_AVX512);
+ }
+ }
+
/* Get Advanced Features at level 0x24 (eax = 0x24, ecx = 0). */
if (avx10_set && max_cpuid_level >= 0x24)
{
#define OPTION_MASK_ISA2_AVX10_2_512_SET \
(OPTION_MASK_ISA2_AVX10_1_512_SET | OPTION_MASK_ISA2_AVX10_2_256_SET \
| OPTION_MASK_ISA2_AVX10_2_512)
+#define OPTION_MASK_ISA2_AMX_AVX512_SET \
+ (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AVX10_2_512_SET \
+ | OPTION_MASK_ISA2_AMX_AVX512)
/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2. */
#define OPTION_MASK_ISA2_AMX_TILE_UNSET \
(OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_INT8_UNSET \
| OPTION_MASK_ISA2_AMX_BF16_UNSET | OPTION_MASK_ISA2_AMX_FP16_UNSET \
- | OPTION_MASK_ISA2_AMX_COMPLEX_UNSET)
+ | OPTION_MASK_ISA2_AMX_COMPLEX_UNSET | OPTION_MASK_ISA2_AMX_AVX512_UNSET)
#define OPTION_MASK_ISA2_AMX_INT8_UNSET OPTION_MASK_ISA2_AMX_INT8
#define OPTION_MASK_ISA2_AMX_BF16_UNSET OPTION_MASK_ISA2_AMX_BF16
#define OPTION_MASK_ISA2_UINTR_UNSET OPTION_MASK_ISA2_UINTR
#define OPTION_MASK_ISA2_AVX10_1_512_UNSET \
(OPTION_MASK_ISA2_AVX10_1_512 | OPTION_MASK_ISA2_AVX10_2_512_UNSET)
#define OPTION_MASK_ISA2_AVX10_2_256_UNSET OPTION_MASK_ISA2_AVX10_2_256
-#define OPTION_MASK_ISA2_AVX10_2_512_UNSET OPTION_MASK_ISA2_AVX10_2_512
+#define OPTION_MASK_ISA2_AVX10_2_512_UNSET \
+ (OPTION_MASK_ISA2_AVX10_2_512 | OPTION_MASK_ISA2_AMX_AVX512_UNSET)
+#define OPTION_MASK_ISA2_AMX_AVX512_UNSET OPTION_MASK_ISA2_AMX_AVX512
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
as -mno-sse4.1. */
}
return true;
+ case OPT_mamx_avx512:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_AVX512_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_AVX512_SET;
+ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2_SET;
+ opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX2_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_AVX512_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_AVX512_UNSET;
+ }
+ return true;
+
case OPT_mfma:
if (value)
{
FEATURE_AVX10_1_512,
FEATURE_AVX10_2_256,
FEATURE_AVX10_2_512,
+ FEATURE_AMX_AVX512,
CPU_FEATURE_MAX
};
ISA_NAMES_TABLE_ENTRY("avx10.2", FEATURE_AVX10_2_256, P_NONE, "-mavx10.2")
ISA_NAMES_TABLE_ENTRY("avx10.2-256", FEATURE_AVX10_2_256, P_NONE, "-mavx10.2-256")
ISA_NAMES_TABLE_ENTRY("avx10.2-512", FEATURE_AVX10_2_512, P_NONE, "-mavx10.2-512")
+ ISA_NAMES_TABLE_ENTRY("amx-avx512", FEATURE_AMX_AVX512, P_NONE,
+ "-mamx-avx512")
ISA_NAMES_TABLE_END
avx10_2bf16intrin.h avx10_2-512bf16intrin.h
avx10_2satcvtintrin.h avx10_2-512satcvtintrin.h
avx10_2minmaxintrin.h avx10_2-512minmaxintrin.h
- avx10_2copyintrin.h"
+ avx10_2copyintrin.h amxavx512intrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
--- /dev/null
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXAVX512INTRIN_H_INCLUDED
+#define _AMXAVX512INTRIN_H_INCLUDED
+
+#if !defined(__AMX_AVX512__)
+#pragma GCC push_options
+#pragma GCC target("amx-avx512")
+#define __DISABLE_AMX_AVX512__
+#endif /* __AMX_AVX512__ */
+
+#if defined(__x86_64__)
+#define _tile_cvtrowd2ps_internal(src,A) \
+({ \
+ __m512 dst; \
+ __asm__ volatile \
+ ("{tcvtrowd2ps\t%1, %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A))); \
+ dst; \
+})
+
+#define _tile_cvtrowd2psi_internal(src,imm) \
+({ \
+ __m512 dst; \
+ __asm__ volatile \
+ ("{tcvtrowd2ps\t$"#imm", %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", "#imm"}" \
+ : "=v" (dst) :); \
+ dst; \
+})
+
+#define _tile_cvtrowps2pbf16h_internal(src,A) \
+({ \
+ __m512bh dst; \
+ __asm__ volatile \
+ ("{tcvtrowps2pbf16h\t%1, %%tmm"#src", %0|tcvtrowps2pbf16h\t%0, %%tmm"#src", %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A))); \
+ dst; \
+})
+
+#define _tile_cvtrowps2pbf16hi_internal(src,imm) \
+({ \
+ __m512bh dst; \
+ __asm__ volatile \
+ ("{tcvtrowps2pbf16h\t$"#imm", %%tmm"#src", %0|tcvtrowps2pbf16h\t%0, %%tmm"#src", "#imm"}" \
+ : "=v" (dst) :); \
+ dst; \
+})
+
+#define _tile_cvtrowps2pbf16l_internal(src,A) \
+({ \
+ __m512bh dst; \
+ __asm__ volatile \
+ ("{tcvtrowps2pbf16l\t%1, %%tmm"#src", %0|tcvtrowps2pbf16l\t%0, %%tmm"#src", %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A))); \
+ dst; \
+})
+
+#define _tile_cvtrowps2pbf16li_internal(src,imm) \
+({ \
+ __m512bh dst; \
+ __asm__ volatile \
+ ("{tcvtrowps2pbf16l\t$"#imm", %%tmm"#src", %0|tcvtrowps2pbf16l\t%0, %%tmm"#src", "#imm"}" \
+ : "=v" (dst) :); \
+ dst; \
+})
+
+#define _tile_cvtrowps2phh_internal(src,A) \
+({ \
+ __m512h dst; \
+ __asm__ volatile \
+ ("{tcvtrowps2phh\t%1, %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A))); \
+ dst; \
+})
+
+#define _tile_cvtrowps2phhi_internal(src,imm) \
+({ \
+ __m512h dst; \
+ __asm__ volatile \
+ ("{tcvtrowps2phh\t$"#imm", %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", "#imm"}" \
+ : "=v" (dst) :); \
+ dst; \
+})
+
+#define _tile_cvtrowps2phl_internal(src,A) \
+({ \
+ __m512h dst; \
+ __asm__ volatile \
+ ("{tcvtrowps2phl\t%1, %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A))); \
+ dst; \
+})
+
+#define _tile_cvtrowps2phli_internal(src,imm) \
+({ \
+ __m512h dst; \
+ __asm__ volatile \
+ ("{tcvtrowps2phl\t$"#imm", %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", "#imm"}" \
+ : "=v" (dst) :); \
+ dst; \
+})
+
+#define _tile_movrow_internal(src,A) \
+({ \
+ __m512 dst; \
+ __asm__ volatile \
+ ("{tilemovrow\t%1, %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", %1}" \
+ : "=v" (dst) : "r" ((unsigned) (A))); \
+ dst; \
+})
+
+#define _tile_movrowi_internal(src,imm) \
+({ \
+ __m512 dst; \
+ __asm__ volatile \
+ ("{tilemovrow\t$"#imm", %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", "#imm"}" \
+ : "=v" (dst) :); \
+ dst; \
+})
+
+#define _tile_cvtrowd2ps(src,A) \
+ _tile_cvtrowd2ps_internal (src,A)
+
+#define _tile_cvtrowd2psi(src,imm) \
+ _tile_cvtrowd2psi_internal (src,imm)
+
+#define _tile_cvtrowps2pbf16h(src,A) \
+ _tile_cvtrowps2pbf16h_internal (src,A)
+
+#define _tile_cvtrowps2pbf16hi(src,imm) \
+ _tile_cvtrowps2pbf16hi_internal (src,imm)
+
+#define _tile_cvtrowps2pbf16l(src,A) \
+ _tile_cvtrowps2pbf16l_internal (src,A)
+
+#define _tile_cvtrowps2pbf16li(src,imm) \
+ _tile_cvtrowps2pbf16li_internal (src,imm)
+
+#define _tile_cvtrowps2phh(src,A) \
+ _tile_cvtrowps2phh_internal (src,A)
+
+#define _tile_cvtrowps2phhi(src,imm) \
+ _tile_cvtrowps2phhi_internal (src,imm)
+
+#define _tile_cvtrowps2phl(src,A) \
+ _tile_cvtrowps2phl_internal (src,A)
+
+#define _tile_cvtrowps2phli(src,imm) \
+ _tile_cvtrowps2phli_internal (src,imm)
+
+#define _tile_movrow(src,A) \
+ _tile_movrow_internal (src,A)
+
+#define _tile_movrowi(src,imm) \
+ _tile_movrowi_internal (src,imm)
+
+#endif
+
+#ifdef __DISABLE_AMX_AVX512__
+#undef __DISABLE_AMX_AVX512__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_AVX512__ */
+
+#endif /* _AMXAVX512INTRIN_H_INCLUDED */
#define bit_AESKLE ( 1<<0 )
#define bit_WIDEKL ( 1<<2 )
+/* AMX sub leaf (%eax == 0x1e, %ecx == 1) */
+/* %eax */
+#define bit_AMX_AVX512 (1 << 7)
+
/* AVX10 sub leaf (%eax == 0x24) */
/* %ebx */
#define bit_AVX10_256 (1 << 17)
def_or_undef (parse_in, "__AVX10_2_256__");
if (isa_flag2 & OPTION_MASK_ISA2_AVX10_2_512)
def_or_undef (parse_in, "__AVX10_2_512__");
+ if (isa_flag2 & OPTION_MASK_ISA2_AMX_AVX512)
+ def_or_undef (parse_in, "__AMX_AVX512__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
DEF_PTA(AVX10_1_512)
DEF_PTA(AVX10_2_256)
DEF_PTA(AVX10_2_512)
+DEF_PTA(AMX_AVX512)
{ "-mavx10.1-256", OPTION_MASK_ISA2_AVX10_1_256 },
{ "-mavx10.1-512", OPTION_MASK_ISA2_AVX10_1_512 },
{ "-mavx10.2-256", OPTION_MASK_ISA2_AVX10_2_256 },
- { "-mavx10.2-512", OPTION_MASK_ISA2_AVX10_2_512 }
+ { "-mavx10.2-512", OPTION_MASK_ISA2_AVX10_2_512 },
+ { "-mamx-avx512", OPTION_MASK_ISA2_AMX_AVX512 }
};
static struct ix86_target_opts isa_opts[] =
{
IX86_ATTR_ISA ("avx10.2", OPT_mavx10_2_256),
IX86_ATTR_ISA ("avx10.2-256", OPT_mavx10_2_256),
IX86_ATTR_ISA ("avx10.2-512", OPT_mavx10_2_512),
+ IX86_ATTR_ISA ("amx-avx512", OPT_mamx_avx512),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
Target Alias(mavx10.2-256)
Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2,
AVX10.1 and AVX10.2 built-in functions and code generation.
+
+mamx-avx512
+Target Mask(ISA2_AMX_AVX512) Var(ix86_isa_flags2) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX10.1-512,
+AVX10.2-512 and AMX-AVX512 built-in functions and code generation.
mavx10.2
UrlSuffix(gcc/x86-Options.html#index-mavx10_002e2)
+mamx-avx512
+UrlSuffix(gcc/x86-Options.html#index-mamx-avx512)
+
#include <amxcomplexintrin.h>
+#include <amxavx512intrin.h>
+
#include <prfchwintrin.h>
#include <keylockerintrin.h>
@itemx no-avx10.2-512
Enable/disable the generation of the AVX10.2 512 bit instructions.
+@cindex @code{target("amx-avx512")} function attribute, x86
+@item amx-avx512
+@itemx no-amx-avx512
+Enable/disable the generation of the AMX-AVX512 instructions.
+
@cindex @code{target("cld")} function attribute, x86
@item cld
@itemx no-cld
-mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16
-mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mapxf
-musermsr -mavx10.1 -mavx10.1-256 -mavx10.1-512 -mevex512 -mavx10.2 -mavx10.2-256
--mavx10.2-512
+-mavx10.2-512 -mamx-avx512
-mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops
-minline-stringops-dynamically -mstringop-strategy=@var{alg}
-mkl -mwidekl
@need 200
@opindex mavx10.2-512
@itemx -mavx10.2-512
+@need 200
+@opindex mamx-avx512
+@itemx -mamx-avx512
These switches enable the use of instructions in the MMX, SSE,
AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA, AES,
PCLMUL, CLFLUSHOPT, CLWB, FSGSBASE, PTWRITE, RDRND, F16C, FMA, PCONFIG,
AVX512VPOPCNTDQ, AVX512VNNI, SERIALIZE, UINTR, HRESET, AMXTILE, AMXINT8,
AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16, AVXIFMA, AVXVNNIINT8, AVXNECONVERT,
CMPCCXADD, AMX-FP16, PREFETCHI, RAOINT, AMX-COMPLEX, AVXVNNIINT16, SM3, SHA512,
-SM4, APX_F, USER_MSR, AVX10.1, AVX10.2 or CLDEMOTE extended instruction sets.
-Each has a corresponding @option{-mno-} option to disable use of these
-instructions.
+SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512 or CLDEMOTE extended
+instruction sets. Each has a corresponding @option{-mno-} option to disable
+use of these instructions.
These extensions are also available as built-in functions: see
@ref{x86 Built-in Functions}, for details of the functions enabled and
@item amx_bf16
Target supports the execution of @code{amx-bf16} instructions.
+@item amx_avx512
+Target supports the execution of @code{amx-avx512} instructions.
+
@item amx_complex
Target supports the execution of @code{amx-complex} instructions.
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512" } */
/* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512" } */
/* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
#ifdef AMX_COMPLEX
&& __builtin_cpu_supports ("amx-complex")
#endif
+#ifdef AMX_AVX512
+ && __builtin_cpu_supports ("amx-avx512")
+#endif
#ifdef __linux__
&& request_perm_xtile_data ()
#endif
#ifndef AMX_HELPER_H_INCLUDED
#define AMX_HELPER_H_INCLUDED
-#if defined(AMX_FP16) || defined(AMX_COMPLEX)
#include <immintrin.h>
#include <xmmintrin.h>
-#endif
#include "amx-check.h"
typedef union
uint16_t u;
} union16f_uw;
-#if defined(AMX_FP16) || defined(AMX_COMPLEX)
+typedef union
+{
+ __bf16 bf16;
+ uint16_t u;
+} union16bh_uw;
+
+typedef union
+{
+ float f;
+ uint32_t u;
+} union32f_ud;
+
+typedef union
+{
+ __m512 m;
+ uint8_t u[64];
+} union512_ub;
+
+#if defined(AMX_FP16) || defined(AMX_COMPLEX) || defined (AMX_AVX512)
/* Transformation functions between fp16/float */
static uint16_t make_f32_fp16 (float f)
{
}
#endif
+#if defined (AMX_AVX512)
+/* Transformation functions between bf16/float */
+static uint16_t make_f32_bf16 (float f)
+{
+ union16bh_uw tmp;
+ tmp.bf16 = (__bf16) f;
+ return tmp.u;
+}
+
+static float make_bf16_f32 (uint16_t bf)
+{
+ union16bh_uw tmp;
+ tmp.u = bf;
+ return _mm_cvtsbh_ss (tmp.bf16);
+}
+
+/* Init tile buffer with bf16 pairs */
+void init_bf16_max_tile_buffer (uint8_t *buf)
+{
+ int i, j;
+ uint16_t* ptr = (uint16_t *) buf;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 32; j++)
+ {
+ float f = 2.5f * i + 1.25f * j;
+ ptr[i * 32 + j] = make_f32_bf16 (f);
+ }
+}
+#endif
+
+/* Init tile buffer with fp32 */
+void init_fp32_max_tile_buffer (uint8_t *buf)
+{
+ int i, j;
+ float* ptr = (float *) buf;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 16; j++)
+ ptr[i * 16 + j] = 2.5f * i + 1.25f * j;
+}
+
+/* Init tile buffer with int32 */
+void init_int32_max_tile_buffer (uint8_t *buf)
+{
+ int i, j;
+ uint32_t *ptr = (uint32_t *)buf;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 16; j++)
+ ptr[i * 16 + j] = (uint32_t) (3 * j - 16 * i);
+}
+
+#define COMPARE_ZMM(A, B) \
+for (int j = 0; j < 16; j++) \
+{ \
+ union32f_ud fu1, fu2; \
+ fu1.f = A[j]; \
+ fu2.f = B[j]; \
+ if (fu1.u != fu2.u) \
+ abort (); \
+}
+
+#define COMPARE_ZMM_BF16(A, B) \
+for (int j = 0; j < 32; j++) \
+{ \
+ union16bh_uw fu1, fu2; \
+ fu1.bf16 = A[j]; \
+ fu2.bf16 = B[j]; \
+ if (fu1.u != fu2.u) \
+ abort(); \
+}
+
+#define COMPARE_ZMM_FP16(A, B) \
+for (int j = 0; j < 32; j++) \
+{ \
+ union16f_uw fu1, fu2; \
+ fu1.f16 = A[j]; \
+ fu2.f16 = B[j]; \
+ if (fu1.u != fu2.u) \
+ abort(); \
+}
+
#endif
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=x86-64-v3 -mamx-avx512" } */
+/* { dg-final { scan-assembler-times "tcvtrowd2ps\[ \\t]" 2 } } */
+/* { dg-final { scan-assembler-times "tcvtrowps2pbf16h\[ \\t]" 2 } } */
+/* { dg-final { scan-assembler-times "tcvtrowps2pbf16l\[ \\t]" 2 } } */
+/* { dg-final { scan-assembler-times "tcvtrowps2phh\[ \\t]" 2 } } */
+/* { dg-final { scan-assembler-times "tcvtrowps2phl\[ \\t]" 2 } } */
+/* { dg-final { scan-assembler-times "tilemovrow\[ \\t]" 2 } } */
+#include <immintrin.h>
+
+#define TMM1 1
+
+__m512 a;
+__m512bh b;
+__m512h c;
+
+void TEST ()
+{
+ a = _tile_cvtrowd2ps (TMM1, 1);
+ a = _tile_cvtrowd2psi (TMM1, 2);
+ b = _tile_cvtrowps2pbf16h (TMM1, 3);
+ b = _tile_cvtrowps2pbf16hi (TMM1, 4);
+ b = _tile_cvtrowps2pbf16l (TMM1, 5);
+ b = _tile_cvtrowps2pbf16li (TMM1, 6);
+ c = _tile_cvtrowps2phh (TMM1, 7);
+ c = _tile_cvtrowps2phhi (TMM1, 8);
+ c = _tile_cvtrowps2phl (TMM1, 9);
+ c = _tile_cvtrowps2phli (TMM1, 10);
+ a = _tile_movrow (TMM1, 11);
+ a = _tile_movrowi (TMM1, 12);
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target masm_intel } */
+/* { dg-options "-O2 -march=x86-64-v3 -mamx-avx512 -masm=intel" } */
+/* { dg-final { scan-assembler-times "tcvtrowd2ps\[ \\t]+\[^\n\]*zmm\[0-9\]+\[^\n\]*tmm1+\[^\n\]*" 2 } } */
+/* { dg-final { scan-assembler-times "tcvtrowps2pbf16h\[ \\t]+\[^\n\]*zmm\[0-9\]+\[^\n\]*tmm1+\[^\n\]*" 2 } } */
+/* { dg-final { scan-assembler-times "tcvtrowps2pbf16l\[ \\t]+\[^\n\]*zmm\[0-9\]+\[^\n\]*tmm1+\[^\n\]*" 2 } } */
+/* { dg-final { scan-assembler-times "tcvtrowps2phh\[ \\t]+\[^\n\]*zmm\[0-9\]+\[^\n\]*tmm1+\[^\n\]*" 2 } } */
+/* { dg-final { scan-assembler-times "tcvtrowps2phl\[ \\t]+\[^\n\]*zmm\[0-9\]+\[^\n\]*tmm1+\[^\n\]*" 2 } } */
+/* { dg-final { scan-assembler-times "tilemovrow\[ \\t]+\[^\n\]*zmm\[0-9\]+\[^\n\]*tmm1+\[^\n\]*" 2 } } */
+#include <immintrin.h>
+
+__m512 a;
+__m512bh b;
+__m512h c;
+
+void TEST ()
+{
+ a = _tile_cvtrowd2ps (1, 1);
+ a = _tile_cvtrowd2psi (1, 2);
+ b = _tile_cvtrowps2pbf16h (1, 3);
+ b = _tile_cvtrowps2pbf16hi (1, 4);
+ b = _tile_cvtrowps2pbf16l (1, 5);
+ b = _tile_cvtrowps2pbf16li (1, 6);
+ c = _tile_cvtrowps2phh (1, 7);
+ c = _tile_cvtrowps2phhi (1, 8);
+ c = _tile_cvtrowps2phl (1, 9);
+ c = _tile_cvtrowps2phli (1, 10);
+ a = _tile_movrow (1, 11);
+ a = _tile_movrowi (1, 12);
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_avx512 } */
+/* { dg-options "-O2 -march=x86-64-v3 -mamx-avx512" } */
+#define AMX_AVX512
+#define DO_TEST test_amx_avx512_cvtrowd2ps
+void test_amx_avx512_cvtrowd2ps();
+#include "amx-helper.h"
+
+volatile __m512 cal_dst, cmp_dst;
+
+#define DEFINE_TEST_CVTROWD2PS(EI, T) \
+__m512 \
+__attribute__((noinline, noclone, __target__("no-amx-avx512"))) \
+calc_cvtrowd2ps##EI (__tile *src, T __A) \
+{ \
+ uint32_t *src_buf = (uint32_t *)src->buf; \
+ int N = src->colsb / 4; \
+ int vl = 512; \
+ int vl_bytes = vl >> 3; \
+ int row_index, row_chunk, j; \
+ __m512 res; \
+ if ((#EI) == "e") \
+ { \
+ row_index = (__A) & 0xffff; \
+ row_chunk = (((__A) >> 16) & 0xffff) * vl_bytes; \
+ } \
+ else \
+ { \
+ row_index = (__A) & 0x3f; \
+ row_chunk = ((__A) >> 6) * vl_bytes; \
+ } \
+ for (j = 0; j < vl_bytes / 4; j++) \
+ if (j + row_chunk / 4 >= N) \
+ res[j] = 0; \
+ else \
+ res[j] = (float) (int) src_buf[row_index * N + j + row_chunk / 4]; \
+ return res; \
+}
+
+DEFINE_TEST_CVTROWD2PS(e, unsigned)
+DEFINE_TEST_CVTROWD2PS(i, const unsigned)
+
+#define TEST_CVTROWD2PS(X, Y, EI, T, INTRIN) \
+cal_dst = calc_cvtrowd2ps##EI (X, Y); \
+cmp_dst = _tile_##INTRIN (1, Y); \
+COMPARE_ZMM(cal_dst, cmp_dst);
+
+void test_amx_avx512_cvtrowd2ps()
+{
+ __tilecfg_u cfg;
+ __tile src;
+ uint8_t tmp_dst_buf[1024];
+ unsigned a = 2;
+
+ init_int32_max_tile_buffer (tmp_dst_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, src, tmp_dst_buf);
+
+ TEST_CVTROWD2PS (&src, a, e, unsigned, cvtrowd2ps);
+ TEST_CVTROWD2PS (&src, 1, i, const unsigned, cvtrowd2psi);
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_avx512 } */
+/* { dg-options "-O2 -march=x86-64-v3 -mamx-avx512" } */
+#define AMX_AVX512
+#define DO_TEST test_amx_avx512_cvtrowps2pbf16
+void test_amx_avx512_cvtrowps2pbf16();
+#include "amx-helper.h"
+
+volatile __m512bh cal_dst, cmp_dst;
+
+#define DEFINE_TEST_CVTROWPS2PBF16(HL, EI, T) \
+__m512bh \
+__attribute__((noinline, noclone, __target__("no-amx-avx512"))) \
+calc_cvtrowps2pbf16##HL##EI (__tile *src, T __A) \
+{ \
+ float *src_buf = (float *) src->buf; \
+ int N = src->colsb / 4; \
+ int vl = 512; \
+ int vl_bytes = vl >> 3; \
+ int row_index, row_chunk, zeropos, pos, j, k; \
+ __m512bh res; \
+ if ((#EI) == "e") \
+ { \
+ row_index = (__A) & 0xffff; \
+ row_chunk = (((__A) >> 16) & 0xffff) * vl_bytes; \
+ } \
+ else \
+ { \
+ row_index = (__A) & 0x3f; \
+ row_chunk = ((__A) >> 6) * vl_bytes; \
+ } \
+ if ((#HL) == "h") \
+ { \
+ zeropos = 0; \
+ pos = 1; \
+ } \
+ else \
+ { \
+ zeropos = 1; \
+ pos = 0; \
+ } \
+ for (j = 0; j < vl_bytes / 4; j++) \
+ if (j + row_chunk / 4 >= N) \
+ for (k = 0; k < 2; k++) \
+ res[2 * j + k] = 0; \
+ else \
+ { \
+ union16bh_uw tmp; \
+ tmp.u = make_f32_bf16 (src_buf[row_index * N + j + row_chunk / 4]); \
+ res[2 * j + pos] = tmp.bf16; \
+ res[2 * j + zeropos] = (__bf16) 0; \
+ } \
+ return res; \
+}
+
+DEFINE_TEST_CVTROWPS2PBF16(h, e, unsigned)
+DEFINE_TEST_CVTROWPS2PBF16(l, e, unsigned)
+DEFINE_TEST_CVTROWPS2PBF16(h, i, const unsigned)
+DEFINE_TEST_CVTROWPS2PBF16(l, i, const unsigned)
+
+#define TEST_CVTROWPS2PBF16(X, Y, HL, EI, T, INTRIN) \
+cal_dst = calc_cvtrowps2pbf16##HL##EI (X, Y); \
+cmp_dst = _tile_##INTRIN (1, Y); \
+COMPARE_ZMM_BF16(cal_dst, cmp_dst);
+
+void test_amx_avx512_cvtrowps2pbf16 ()
+{
+ __tilecfg_u cfg;
+ __tile src;
+ uint8_t tmp_dst_buf[1024];
+ unsigned a = 2;
+
+ init_fp32_max_tile_buffer (tmp_dst_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, src, tmp_dst_buf);
+
+ TEST_CVTROWPS2PBF16 (&src, a, h, e, unsigned, cvtrowps2pbf16h);
+ TEST_CVTROWPS2PBF16 (&src, a, l, e, unsigned, cvtrowps2pbf16l);
+ TEST_CVTROWPS2PBF16 (&src, 1, h, i, const unsigned, cvtrowps2pbf16hi);
+ TEST_CVTROWPS2PBF16 (&src, 1, l, i, const unsigned, cvtrowps2pbf16li);
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_avx512 } */
+/* { dg-options "-O2 -march=x86-64-v3 -mamx-avx512" } */
+#define AMX_AVX512
+#define DO_TEST test_amx_avx512_cvtrowps2ph
+void test_amx_avx512_cvtrowps2ph();
+#include "amx-helper.h"
+
+volatile __m512h cal_dst, cmp_dst;
+
+#define DEFINE_TEST_CVTROWPS2PH(HL, EI, T) \
+__m512h \
+__attribute__((noinline, noclone, __target__("no-amx-avx512"))) \
+calc_cvtrowps2ph##HL##EI (__tile *src, T __A) \
+{ \
+ float *src_buf = (float *) src->buf; \
+ int N = src->colsb / 4; \
+ int vl = 512; \
+ int vl_bytes = vl >> 3; \
+ int row_index, row_chunk, zeropos, pos, j, k; \
+ __m512h res; \
+ if ((#EI) == "e") \
+ { \
+ row_index = (__A) & 0xffff; \
+ row_chunk = (((__A) >> 16) & 0xffff) * vl_bytes; \
+ } \
+ else \
+ { \
+ row_index = (__A) & 0x3f; \
+ row_chunk = ((__A) >> 6) * vl_bytes; \
+ } \
+ if ((#HL) == "h") \
+ { \
+ zeropos = 0; \
+ pos = 1; \
+ } \
+ else \
+ { \
+ zeropos = 1; \
+ pos = 0; \
+ } \
+ for (j = 0; j < vl_bytes / 4; j++) \
+ if (j + row_chunk / 4 >= N) \
+ for (k = 0; k < 2; k++) \
+ res[2 * j + k] = 0; \
+ else \
+ { \
+ union16f_uw tmp; \
+ tmp.u = make_f32_fp16 (src_buf[row_index * N + j + row_chunk / 4]); \
+ res[2 * j + zeropos] = 0; \
+ res[2 * j + pos] = tmp.f16; \
+ } \
+ return res; \
+}
+
+DEFINE_TEST_CVTROWPS2PH(h, e, unsigned)
+DEFINE_TEST_CVTROWPS2PH(l, e, unsigned)
+DEFINE_TEST_CVTROWPS2PH(h, i, const unsigned)
+DEFINE_TEST_CVTROWPS2PH(l, i, const unsigned)
+
+#define TEST_CVTROWPS2PH(X, Y, HL, EI, T, INTRIN) \
+cal_dst = calc_cvtrowps2ph##HL##EI (X, Y); \
+cmp_dst = _tile_##INTRIN (1, Y); \
+COMPARE_ZMM_FP16(cal_dst, cmp_dst);
+
+void test_amx_avx512_cvtrowps2ph ()
+{
+ __tilecfg_u cfg;
+ __tile src;
+ uint8_t tmp_dst_buf[1024];
+ unsigned a = 2;
+
+ init_fp32_max_tile_buffer (tmp_dst_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, src, tmp_dst_buf);
+
+ TEST_CVTROWPS2PH (&src, a, h, e, unsigned, cvtrowps2phh);
+ TEST_CVTROWPS2PH (&src, a, l, e, unsigned, cvtrowps2phl);
+ TEST_CVTROWPS2PH (&src, 1, h, i, const unsigned, cvtrowps2phhi);
+ TEST_CVTROWPS2PH (&src, 1, l, i, const unsigned, cvtrowps2phli);
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_avx512 } */
+/* { dg-options "-O2 -march=x86-64-v3 -mamx-avx512" } */
+#define AMX_AVX512
+#define DO_TEST test_amx_avx512_movrow
+void test_amx_avx512_movrow();
+#include "amx-helper.h"
+
+int j, k;
+volatile __m512 cal_dst, cmp_dst;
+
+#define TEST_MOVROW(X, Y, EI, T, INTRIN) \
+__m512 \
+__attribute__((noinline, noclone, __target__("no-amx-avx512"))) \
+calc_movrow##EI (__tile *src, T __A) \
+{ \
+ uint8_t *src_buf = (uint8_t *)src->buf; \
+ int N = src->colsb; \
+ int vl = 512; \
+ int vl_bytes = vl >> 3; \
+ int row_index, row_chunk; \
+ __m512 res; \
+ if ((EI) == 'e') \
+ { \
+ row_index = (__A) & 0xffff; \
+ row_chunk = (((__A) >> 16) & 0xffff) * vl_bytes; \
+ } \
+ else \
+ { \
+ row_index = (__A) & 0x3f; \
+ row_chunk = ((__A) >> 6) * vl_bytes; \
+ } \
+ union512_ub tmp; \
+ for (j = 0; j < vl_bytes; j++) \
+ if (j + row_chunk >= N) \
+ tmp.u[j] = 0; \
+ else \
+ tmp.u[j] = src_buf[row_index * N + j + row_chunk]; \
+ res = tmp.m; \
+ return res; \
+} \
+cal_dst = calc_movrow##EI (X, Y); \
+cmp_dst = _tile_##INTRIN (1, Y); \
+COMPARE_ZMM(cal_dst, cmp_dst);
+
+void test_amx_avx512_movrow()
+{
+ __tilecfg_u cfg;
+ __tile src;
+ unsigned a = 2;
+ char e = 'e', i = 'i';
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src (1, src);
+
+ TEST_MOVROW (&src, a, e, unsigned, movrow);
+ TEST_MOVROW (&src, 1, i, const unsigned, movrowi);
+
+}
extern void test_user_msr (void) __attribute__((__target__("usermsr")));
extern void test_avx10_2 (void) __attribute__((__target__("avx10.2")));
extern void test_avx10_2_512 (void) __attribute__((__target__("avx10.2-512")));
+extern void test_amx_avx512 (void) __attribute__((__target__("amx-avx512")));
extern void test_no_sgx (void) __attribute__((__target__("no-sgx")));
extern void test_no_avx512vpopcntdq(void) __attribute__((__target__("no-avx512vpopcntdq")));
extern void test_no_user_msr (void) __attribute__((__target__("no-usermsr")));
extern void test_no_avx10_2 (void) __attribute__((__target__("no-avx10.2")));
extern void test_no_avx10_2_512 (void) __attribute__((__target__("no-avx10.2-512")));
+extern void test_no_amx_avx512 (void) __attribute__((__target__("no-amx-avx512")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
extern void test_arch_core2 (void) __attribute__((__target__("arch=core2")));
popcntintrin.h gfniintrin.h and mm_malloc.h are usable
with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512" } */
#include <x86intrin.h>
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
/* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
#ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512")
#endif
/* Following intrinsics require immediate arguments. They
/* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
#ifdef DIFFERENT_PRAGMAS
-#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512")
+#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512")
#endif
#include <immintrin.h>
test_1 (_cvtss_sh, unsigned short, float, 1)
#define __builtin_ia32_minmaxps128_mask(A, B, C, D, E) __builtin_ia32_minmaxps128_mask (A, B, 100, D, E)
#define __builtin_ia32_minmaxps256_mask_round(A, B, C, D, E, F) __builtin_ia32_minmaxps256_mask_round (A, B, 100, D, E, 4)
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512")
#include <x86intrin.h>
} "-mavx10.2-512" ]
}
+# Return 1 if amx-avx512 instructions can be compiled.
+proc check_effective_target_amx_avx512 { } {
+ return [check_no_compiler_messages amx_avx512 object {
+ void
+ foo ()
+ {
+ __asm__ volatile ("tilemovrow\t%%edx, %%tmm2, %%zmm1" ::);
+ }
+ } "-mamx-avx512" ]
+}
+
# Return 1 if sse instructions can be compiled.
proc check_effective_target_sse { } {
return [check_no_compiler_messages sse object {