set_feature (FEATURE_AMX_TF32);
if (eax & bit_AMX_TRANSPOSE)
set_feature (FEATURE_AMX_TRANSPOSE);
+ if (eax & bit_AMX_FP8)
+ set_feature (FEATURE_AMX_FP8);
}
}
(OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_TF32)
#define OPTION_MASK_ISA2_AMX_TRANSPOSE_SET \
(OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_TRANSPOSE)
+#define OPTION_MASK_ISA2_AMX_FP8_SET \
+ (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_FP8)
/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2. */
(OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_INT8_UNSET \
| OPTION_MASK_ISA2_AMX_BF16_UNSET | OPTION_MASK_ISA2_AMX_FP16_UNSET \
| OPTION_MASK_ISA2_AMX_COMPLEX_UNSET | OPTION_MASK_ISA2_AMX_AVX512_UNSET \
- | OPTION_MASK_ISA2_AMX_TF32_UNSET | OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET)
+ | OPTION_MASK_ISA2_AMX_TF32_UNSET | OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET \
+ | OPTION_MASK_ISA2_AMX_FP8_UNSET)
#define OPTION_MASK_ISA2_AMX_INT8_UNSET OPTION_MASK_ISA2_AMX_INT8
#define OPTION_MASK_ISA2_AMX_BF16_UNSET OPTION_MASK_ISA2_AMX_BF16
#define OPTION_MASK_ISA2_UINTR_UNSET OPTION_MASK_ISA2_UINTR
#define OPTION_MASK_ISA2_AMX_AVX512_UNSET OPTION_MASK_ISA2_AMX_AVX512
#define OPTION_MASK_ISA2_AMX_TF32_UNSET OPTION_MASK_ISA2_AMX_TF32
#define OPTION_MASK_ISA2_AMX_TRANSPOSE_UNSET OPTION_MASK_ISA2_AMX_TRANSPOSE
+#define OPTION_MASK_ISA2_AMX_FP8_UNSET OPTION_MASK_ISA2_AMX_FP8
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should be the same
as -mno-sse4.1. */
}
return true;
+ case OPT_mamx_fp8:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_FP8_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP8_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_FP8_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP8_UNSET;
+ }
+ return true;
+
case OPT_mfma:
if (value)
{
FEATURE_AMX_AVX512,
FEATURE_AMX_TF32,
FEATURE_AMX_TRANSPOSE,
+ FEATURE_AMX_FP8,
CPU_FEATURE_MAX
};
ISA_NAMES_TABLE_ENTRY("amx-tf32", FEATURE_AMX_TF32, P_NONE, "-mamx-tf32")
ISA_NAMES_TABLE_ENTRY("amx-transpose", FEATURE_AMX_TRANSPOSE,
P_NONE, "-mamx-transpose")
+ ISA_NAMES_TABLE_ENTRY("amx-fp8", FEATURE_AMX_FP8, P_NONE, "-mamx-fp8")
ISA_NAMES_TABLE_END
avx10_2satcvtintrin.h avx10_2-512satcvtintrin.h
avx10_2minmaxintrin.h avx10_2-512minmaxintrin.h
avx10_2copyintrin.h amxavx512intrin.h amxtf32intrin.h
- amxtransposeintrin.h"
+ amxtransposeintrin.h amxfp8intrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
--- /dev/null
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXFP8INTRIN_H_INCLUDED
+#define _AMXFP8INTRIN_H_INCLUDED
+
+#if !defined(__AMX_FP8__)
+#pragma GCC push_options
+#pragma GCC target("amx-fp8")
+#define __DISABLE_AMX_FP8__
+#endif /* __AMX_FP8__ */
+
+#if defined(__x86_64__)
+#define _tile_dpbf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpbhf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpbhf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbhf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dphbf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdphbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dphf8ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdphf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpbf8ps(dst,src1,src2) \
+ _tile_dpbf8ps_internal (dst,src1,src2)
+
+#define _tile_dpbhf8ps(dst,src1,src2) \
+ _tile_dpbhf8ps_internal (dst,src1,src2)
+
+#define _tile_dphbf8ps(dst,src1,src2) \
+ _tile_dphbf8ps_internal (dst,src1,src2)
+
+#define _tile_dphf8ps(dst,src1,src2) \
+ _tile_dphf8ps_internal (dst,src1,src2)
+
+#endif
+
+#ifdef __DISABLE_AMX_FP8__
+#undef __DISABLE_AMX_FP8__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_FP8__ */
+
+#endif /* _AMXFP8INTRIN_H_INCLUDED */
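For reference, a minimal usage sketch of the new intrinsics (illustrative only, not part of the patch; DST/SRC1/SRC2 and accumulate_fp8_tiles are hypothetical names). Because each macro stringizes its arguments into the asm template, dst/src1/src2 must be integer literals or macros expanding to literals, as the tests below do with TMM1/TMM2/TMM3. Build with -mamx-fp8 and an assembler that knows the TDP*F8PS mnemonics.

#include <immintrin.h>

#define DST  0
#define SRC1 1
#define SRC2 2

void
accumulate_fp8_tiles (void)
{
  /* tmm0 += tmm1 * tmm2 with bf8 (E5M2) inputs, accumulating into fp32.  */
  _tile_dpbf8ps (DST, SRC1, SRC2);

  /* The mixed and hf8 (E4M3) variants take the same operand order.  */
  _tile_dpbhf8ps (DST, SRC1, SRC2);
  _tile_dphbf8ps (DST, SRC1, SRC2);
  _tile_dphf8ps (DST, SRC1, SRC2);
}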
/* AMX sub leaf (%eax == 0x1e, %ecx == 1) */
/* %eax */
+#define bit_AMX_FP8 (1 << 4)
#define bit_AMX_TRANSPOSE (1 << 5)
#define bit_AMX_TF32 (1 << 6)
#define bit_AMX_AVX512 (1 << 7)
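A standalone runtime probe mirroring the detection hunk at the top of this patch might look like the sketch below (illustrative; has_amx_fp8 is a hypothetical helper). It assumes GCC's <cpuid.h> and checks only EAX bit 4 of leaf 0x1e, sub-leaf 1; a complete check would first confirm the AMX-TILE prerequisites as the driver code does.

#include <cpuid.h>

#ifndef bit_AMX_FP8
#define bit_AMX_FP8 (1 << 4)   /* EAX bit 4 of CPUID leaf 0x1e, sub-leaf 1.  */
#endif

static int
has_amx_fp8 (void)
{
  unsigned int eax, ebx, ecx, edx;

  if (!__get_cpuid_count (0x1e, 1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (eax & bit_AMX_FP8) != 0;
}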
def_or_undef (parse_in, "__AMX_TF32__");
if (isa_flag2 & OPTION_MASK_ISA2_AMX_TRANSPOSE)
def_or_undef (parse_in, "__AMX_TRANSPOSE__");
+ if (isa_flag2 & OPTION_MASK_ISA2_AMX_FP8)
+ def_or_undef (parse_in, "__AMX_FP8__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
DEF_PTA(AMX_AVX512)
DEF_PTA(AMX_TF32)
DEF_PTA(AMX_TRANSPOSE)
+DEF_PTA(AMX_FP8)
{ "-mavx10.2-512", OPTION_MASK_ISA2_AVX10_2_512 },
{ "-mamx-avx512", OPTION_MASK_ISA2_AMX_AVX512 },
{ "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 },
- { "-mamx-transpose", OPTION_MASK_ISA2_AMX_TRANSPOSE }
+ { "-mamx-transpose", OPTION_MASK_ISA2_AMX_TRANSPOSE },
+ { "-mamx-fp8", OPTION_MASK_ISA2_AMX_FP8 }
};
static struct ix86_target_opts isa_opts[] =
{
IX86_ATTR_ISA ("amx-avx512", OPT_mamx_avx512),
IX86_ATTR_ISA ("amx-tf32", OPT_mamx_tf32),
IX86_ATTR_ISA ("amx-transpose", OPT_mamx_transpose),
+ IX86_ATTR_ISA ("amx-fp8", OPT_mamx_fp8),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
mamx-transpose
Target Mask(ISA2_AMX_TRANSPOSE) Var(ix86_isa_flags2) Save
Support AMX-TRANSPOSE built-in functions and code generation.
+
+mamx-fp8
+Target Mask(ISA2_AMX_FP8) Var(ix86_isa_flags2) Save
+Support AMX-FP8 built-in functions and code generation.
mamx-transpose
UrlSuffix(gcc/x86-Options.html#index-mamx-transpose)
+mamx-fp8
+UrlSuffix(gcc/x86-Options.html#index-mamx-fp8)
+
#include <amxtransposeintrin.h>
+#include <amxfp8intrin.h>
+
#include <prfchwintrin.h>
#include <keylockerintrin.h>
@itemx no-amx-transpose
Enable/disable the generation of the AMX-TRANSPOSE instructions.
+@cindex @code{target("amx-fp8")} function attribute, x86
+@item amx-fp8
+@itemx no-amx-fp8
+Enable/disable the generation of the AMX-FP8 instructions.
+
@cindex @code{target("cld")} function attribute, x86
@item cld
@itemx no-cld
-mvpclmulqdq -mavx512bitalg -mmovdiri -mmovdir64b -mavx512vpopcntdq
-mavx512vnni -mprfchw -mrdpid
-mrdseed -msgx -mavx512vp2intersect -mserialize -mtsxldtrk
--mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni
+-mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni -mamx-fp8
-mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16
-mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mapxf
-musermsr -mavx10.1 -mavx10.1-256 -mavx10.1-512 -mevex512 -mavx10.2 -mavx10.2-256
@need 200
@opindex mamx-transpose
@itemx -mamx-transpose
+@need 200
+@opindex mamx-fp8
+@itemx -mamx-fp8
These switches enable the use of instructions in the MMX, SSE,
AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA, AES,
PCLMUL, CLFLUSHOPT, CLWB, FSGSBASE, PTWRITE, RDRND, F16C, FMA, PCONFIG,
AVX512VPOPCNTDQ, AVX512VNNI, SERIALIZE, UINTR, HRESET, AMXTILE, AMXINT8,
AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16, AVXIFMA, AVXVNNIINT8, AVXNECONVERT,
CMPCCXADD, AMX-FP16, PREFETCHI, RAOINT, AMX-COMPLEX, AVXVNNIINT16, SM3, SHA512,
-SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32, AMX-TRANSPOSE or
-CLDEMOTE extended instruction sets. Each has a corresponding @option{-mno-}
-option to disable use of these instructions.
+SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32, AMX-TRANSPOSE,
+AMX-FP8 or CLDEMOTE extended instruction sets. Each has a corresponding
+@option{-mno-} option to disable use of these instructions.
These extensions are also available as built-in functions: see
@ref{x86 Built-in Functions}, for details of the functions enabled and
@item amx_transpose
Target supports the execution of @code{amx-transpose} instructions.
+@item amx_fp8
+Target supports the execution of @code{amx-fp8} instructions.
+
@item cell_hw
Test system can execute AltiVec and Cell PPU instructions.
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */
/* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */
/* { dg-skip-if "requires hosted libstdc++ for cstdlib malloc" { ! hostedlib } } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
#ifdef AMX_TRANSPOSE
&& __builtin_cpu_supports ("amx-transpose")
#endif
+#ifdef AMX_FP8
+ && __builtin_cpu_supports ("amx-fp8")
+#endif
#ifdef __linux__
&& request_perm_xtile_data ()
#endif
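Outside the test harness, the facilities added by this patch can gate an FP8 kernel at run time. A minimal sketch (kernel_fp8 and maybe_run_fp8 are hypothetical names), assuming the tiles have already been configured and loaded, and that on Linux the XTILE data permission has been requested as request_perm_xtile_data does above:

#include <immintrin.h>

__attribute__((target ("amx-fp8")))
static void
kernel_fp8 (void)
{
  /* tmm1 += tmm2 * tmm3 with bf8 inputs.  */
  _tile_dpbf8ps (1, 2, 3);
}

void
maybe_run_fp8 (void)
{
  if (__builtin_cpu_supports ("amx-fp8"))
    kernel_fp8 ();
}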
ptr[i * 16 + j] = 2.5f * i + 1.25f * j;
}
+/* Init fp32 tile buffer with zeros.  */
+void init_fp32_max_tile_zero_buffer (uint8_t *buf)
+{
+ int i, j;
+ float* ptr = (float *) buf;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 16; j++)
+ ptr[i * 16 + j] = 0.0f;
+}
+
/* Init tile buffer with int32 */
void init_int32_max_tile_buffer (uint8_t *buf)
{
ptr[i * 16 + j] = (uint32_t) (3 * j - 16 * i);
}
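+
+/* Init tile buffer with fp8 data, including Inf and NaN encodings.  */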
+void
+init_fp8_max_tile_buffer (uint8_t *buf)
+{
+ int i, j;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 64; j++)
+ {
+ int idx = i * 64 + j;
+
+ /* Positive Infinity (S11111.00) */
+ if (idx % 128 == 0)
+ buf[idx] = 0x7C;
+
+ /* Negative Infinity (S11111.00 with sign bit set) */
+ else if (idx % 128 == 1)
+ buf[idx] = 0xFC;
+
+ /* Positive NaN (S11111.01) */
+ else if (idx % 128 == 2)
+ buf[idx] = 0x7D;
+
+ /* Negative NaN (S11111.01 with sign bit set) */
+ else if (idx % 128 == 3)
+ buf[idx] = 0xFD;
+
+ /* Positive NaN (S11111.10) */
+ else if (idx % 128 == 4)
+ buf[idx] = 0x7E;
+
+ /* Negative NaN (S11111.10 with sign bit set) */
+ else if (idx % 128 == 5)
+ buf[idx] = 0xFE;
+
+ /* Positive NaN (S11111.11) */
+ else if (idx % 128 == 6)
+ buf[idx] = 0x7F;
+
+ /* Negative NaN (S11111.11 with sign bit set) */
+ else if (idx % 128 == 7)
+ buf[idx] = 0xFF;
+
+ else
+ buf[idx] = (uint8_t) ((idx * 251) & 0xFF);
+ }
+}
+
#define COMPARE_ZMM(A, B) \
for (int j = 0; j < 16; j++) \
{ \
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mamx-fp8" } */
+/* { dg-final { scan-assembler "tdpbf8ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "tdpbhf8ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "tdphbf8ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "tdphf8ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+
+#include <immintrin.h>
+
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+
+void TEST ()
+{
+ _tile_dpbf8ps (TMM1, TMM2, TMM3);
+ _tile_dpbhf8ps (TMM1, TMM2, TMM3);
+ _tile_dphbf8ps (TMM1, TMM2, TMM3);
+ _tile_dphf8ps (TMM1, TMM2, TMM3);
+}
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target masm_intel } */
+/* { dg-options "-O2 -mamx-fp16 -masm=intel" } */
+/* { dg-final { scan-assembler "tdpbf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdpbhf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdphbf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tdphf8ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+
+#include <immintrin.h>
+
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+
+void TEST ()
+{
+ _tile_dpbf8ps (TMM1, TMM2, TMM3);
+ _tile_dpbhf8ps (TMM1, TMM2, TMM3);
+ _tile_dphbf8ps (TMM1, TMM2, TMM3);
+ _tile_dphf8ps (TMM1, TMM2, TMM3);
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_fp8 } */
+/* { dg-options "-O2 -mamx-fp8" } */
+
+#define AMX_FP8
+#define DO_TEST test_amx_fp8_dpbf8ps
+void test_amx_fp8_dpbf8ps ();
+
+#include "amx-helper.h"
+#include "fp8-helper.h"
+#include "fp-emulation.h"
+
+void
+calc_matrix_dpbf8ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ unsigned char *src1_buf = (unsigned char *)src1->buf;
+ unsigned char *src2_buf = (unsigned char *)src2->buf;
+ float *dst_buf = (float *)dst->buf;
+
+ int M = src1->rows;
+ int N = src1->colsb / 4;
+ int K = src2->colsb / 4;
+
+ int* valueState_a = (int*)malloc(sizeof(int));
+ int* valueState_b = (int*)malloc(sizeof(int));
+ /* Zero-initialize: both arrays are read before their first update below.  */
+ int* valueState = (int *)calloc(M * K, sizeof(int));
+ __int128_t *temp = (__int128_t *)calloc(M * K, sizeof(__int128_t));
+
+ for (int i = 0; i < M; i++)
+ {
+ for (int j = 0; j < N; j++)
+ for (int k = 0; k < K; k++)
+ {
+ int64_t s1e0 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 0], 1, valueState_a);
+ int64_t s2e0 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 0], 1, valueState_b);
+ int valueState_e0 = state_handler(*valueState_a, *valueState_b, s1e0, s2e0, '*');
+
+ int64_t s1e1 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 1], 1, valueState_a);
+ int64_t s2e1 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 1], 1, valueState_b);
+ int valueState_e1 = state_handler(*valueState_a, *valueState_b, s1e1, s2e1, '*');
+
+ int64_t s1e2 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 2], 1, valueState_a);
+ int64_t s2e2 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 2], 1, valueState_b);
+ int valueState_e2 = state_handler(*valueState_a, *valueState_b, s1e2, s2e2, '*');
+
+ int64_t s1e3 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 3], 1, valueState_a);
+ int64_t s2e3 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 3], 1, valueState_b);
+ int valueState_e3 = state_handler(*valueState_a, *valueState_b, s1e3, s2e3, '*');
+
+ valueState[i * K + k] = state_handler(
+ state_handler(
+ state_handler(
+ state_handler(
+ valueState_e0, valueState_e1, (__int128_t)s1e0 * s2e0, s1e1 * s2e1, '+'),
+ valueState_e2, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1, s1e2 * s2e2, '+'),
+ valueState_e3, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2, s1e3 * s2e3, '+'),
+ valueState[i * K + k], (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2 + s1e3 * s2e3, temp[i * K + k], '+');
+
+ temp[i * K + k] += (__int128_t)s1e0 * (__int128_t)s2e0
+ + (__int128_t)s1e1 * (__int128_t)s2e1
+ + (__int128_t)s1e2 * (__int128_t)s2e2
+ + (__int128_t)s1e3 * (__int128_t)s2e3;
+ }
+ }
+
+ for (int i = 0; i < M; i++)
+ for (int k = 0; k < K; k++)
+ {
+ float tmp_float = shift_int128_to_fp32(temp[i * K + k], 1, 1);
+ if (isnan(dst_buf[i * K + k]) || isnan(tmp_float))
+ dst_buf[i * K + k] = -nanf("");
+ else switch(valueState[i * K + k])
+ {
+ case -3:
+ dst_buf[i * K + k] = -INFINITY;
+ break;
+ case 3:
+ dst_buf[i * K + k] = 1.0 / 0.0;
+ break;
+ case 2:
+ case -2:
+ dst_buf[i * K + k] = -nanf("");
+ break;
+ case -1:
+ case 0:
+ case 1:
+ default:
+ dst_buf[i * K + k] += tmp_float;
+ break;
+ }
+ }
+
+ free(valueState_a);
+ free(valueState_b);
+ free(valueState);
+ free(temp);
+}
+
+void test_amx_fp8_dpbf8ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_zero_buf[1024], tmp_src_buf[1024];
+
+ init_fp32_max_tile_zero_buffer (tmp_dst_zero_buf);
+ init_fp8_max_tile_buffer(tmp_src_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_src_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_src_buf);
+
+ calc_matrix_dpbf8ps (&dst, &src1, &src2);
+
+ _tile_dpbf8ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_float_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_fp8 } */
+/* { dg-options "-O2 -mamx-fp8" } */
+
+#define AMX_FP8
+#define DO_TEST test_amx_fp8_dpbhf8ps
+void test_amx_fp8_dpbhf8ps ();
+
+#include "amx-helper.h"
+#include "fp8-helper.h"
+#include "fp-emulation.h"
+
+void
+calc_matrix_dpbhf8ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ unsigned char *src1_buf = (unsigned char *)src1->buf;
+ unsigned char *src2_buf = (unsigned char *)src2->buf;
+ float *dst_buf = (float *)dst->buf;
+
+ int M = src1->rows;
+ int N = src1->colsb / 4;
+ int K = src2->colsb / 4;
+
+ int* valueState_a = (int*)malloc(sizeof(int));
+ int* valueState_b = (int*)malloc(sizeof(int));
+ /* Zero-initialize: both arrays are read before their first update below.  */
+ int* valueState = (int *)calloc(M * K, sizeof(int));
+ __int128_t *temp = (__int128_t *)calloc(M * K, sizeof(__int128_t));
+
+ for (int i = 0; i < M; i++)
+ {
+ for (int j = 0; j < N; j++)
+ for (int k = 0; k < K; k++)
+ {
+ int64_t s1e0 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 0], 1, valueState_a);
+ int64_t s2e0 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 0], 0, valueState_b);
+ int valueState_e0 = state_handler(*valueState_a, *valueState_b, s1e0, s2e0, '*');
+
+ int64_t s1e1 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 1], 1, valueState_a);
+ int64_t s2e1 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 1], 0, valueState_b);
+ int valueState_e1 = state_handler(*valueState_a, *valueState_b, s1e1, s2e1, '*');
+
+ int64_t s1e2 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 2], 1, valueState_a);
+ int64_t s2e2 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 2], 0, valueState_b);
+ int valueState_e2 = state_handler(*valueState_a, *valueState_b, s1e2, s2e2, '*');
+
+ int64_t s1e3 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 3], 1, valueState_a);
+ int64_t s2e3 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 3], 0, valueState_b);
+ int valueState_e3 = state_handler(*valueState_a, *valueState_b, s1e3, s2e3, '*');
+
+ valueState[i * K + k] = state_handler(
+ state_handler(
+ state_handler(
+ state_handler(
+ valueState_e0, valueState_e1, (__int128_t)s1e0 * s2e0, s1e1 * s2e1, '+'),
+ valueState_e2, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1, s1e2 * s2e2, '+'),
+ valueState_e3, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2, s1e3 * s2e3, '+'),
+ valueState[i * K + k], (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2 + s1e3 * s2e3, temp[i * K + k], '+');
+
+ temp[i * K + k] += (__int128_t)s1e0 * (__int128_t)s2e0
+ + (__int128_t)s1e1 * (__int128_t)s2e1
+ + (__int128_t)s1e2 * (__int128_t)s2e2
+ + (__int128_t)s1e3 * (__int128_t)s2e3;
+ }
+ }
+
+ for (int i = 0; i < M; i++)
+ for (int k = 0; k < K; k++)
+ {
+ float tmp_float = shift_int128_to_fp32(temp[i * K + k], 1, 0);
+ if (isnan(dst_buf[i * K + k]) || isnan(tmp_float))
+ dst_buf[i * K + k] = -nanf("");
+ else switch(valueState[i * K + k])
+ {
+ case -3:
+ dst_buf[i * K + k] = -INFINITY;
+ break;
+ case 3:
+ dst_buf[i * K + k] = 1.0 / 0.0;
+ break;
+ case 2:
+ case -2:
+ dst_buf[i * K + k] = -nanf("");
+ break;
+ case -1:
+ case 0:
+ case 1:
+ default:
+ dst_buf[i * K + k] += tmp_float;
+ break;
+ }
+ }
+
+ free(valueState_a);
+ free(valueState_b);
+ free(valueState);
+ free(temp);
+}
+
+void test_amx_fp8_dpbhf8ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_zero_buf[1024], tmp_src_buf[1024];
+
+ init_fp32_max_tile_zero_buffer (tmp_dst_zero_buf);
+ init_fp8_max_tile_buffer(tmp_src_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_src_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_src_buf);
+
+ calc_matrix_dpbhf8ps (&dst, &src1, &src2);
+
+ _tile_dpbhf8ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_float_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_fp8 } */
+/* { dg-options "-O2 -mamx-fp8" } */
+
+#define AMX_FP8
+#define DO_TEST test_amx_fp8_dphbf8ps
+void test_amx_fp8_dphbf8ps ();
+
+#include "amx-helper.h"
+#include "fp8-helper.h"
+#include "fp-emulation.h"
+
+void
+calc_matrix_dphbf8ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ unsigned char *src1_buf = (unsigned char *)src1->buf;
+ unsigned char *src2_buf = (unsigned char *)src2->buf;
+ float *dst_buf = (float *)dst->buf;
+
+ int M = src1->rows;
+ int N = src1->colsb / 4;
+ int K = src2->colsb / 4;
+
+ int* valueState_a = (int*)malloc(sizeof(int));
+ int* valueState_b = (int*)malloc(sizeof(int));
+ /* Zero-initialize: both arrays are read before their first update below.  */
+ int* valueState = (int *)calloc(M * K, sizeof(int));
+ __int128_t *temp = (__int128_t *)calloc(M * K, sizeof(__int128_t));
+
+ for (int i = 0; i < M; i++)
+ {
+ for (int j = 0; j < N; j++)
+ for (int k = 0; k < K; k++)
+ {
+ int64_t s1e0 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 0], 0, valueState_a);
+ int64_t s2e0 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 0], 1, valueState_b);
+ int valueState_e0 = state_handler(*valueState_a, *valueState_b, s1e0, s2e0, '*');
+
+ int64_t s1e1 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 1], 0, valueState_a);
+ int64_t s2e1 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 1], 1, valueState_b);
+ int valueState_e1 = state_handler(*valueState_a, *valueState_b, s1e1, s2e1, '*');
+
+ int64_t s1e2 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 2], 0, valueState_a);
+ int64_t s2e2 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 2], 1, valueState_b);
+ int valueState_e2 = state_handler(*valueState_a, *valueState_b, s1e2, s2e2, '*');
+
+ int64_t s1e3 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 3], 0, valueState_a);
+ int64_t s2e3 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 3], 1, valueState_b);
+ int valueState_e3 = state_handler(*valueState_a, *valueState_b, s1e3, s2e3, '*');
+
+ valueState[i * K + k] = state_handler(
+ state_handler(
+ state_handler(
+ state_handler(
+ valueState_e0, valueState_e1, (__int128_t)s1e0 * s2e0, s1e1 * s2e1, '+'),
+ valueState_e2, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1, s1e2 * s2e2, '+'),
+ valueState_e3, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2, s1e3 * s2e3, '+'),
+ valueState[i * K + k], (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2 + s1e3 * s2e3, temp[i * K + k], '+');
+
+ temp[i * K + k] += (__int128_t)s1e0 * (__int128_t)s2e0
+ + (__int128_t)s1e1 * (__int128_t)s2e1
+ + (__int128_t)s1e2 * (__int128_t)s2e2
+ + (__int128_t)s1e3 * (__int128_t)s2e3;
+ }
+ }
+
+ for (int i = 0; i < M; i++)
+ for (int k = 0; k < K; k++)
+ {
+ float tmp_float = shift_int128_to_fp32(temp[i * K + k], 0, 1);
+ if (isnan(dst_buf[i * K + k]) || isnan(tmp_float))
+ dst_buf[i * K + k] = -nanf("");
+ else switch(valueState[i * K + k])
+ {
+ case -3:
+ dst_buf[i * K + k] = -INFINITY;
+ break;
+ case 3:
+ dst_buf[i * K + k] = 1.0 / 0.0;
+ break;
+ case 2:
+ case -2:
+ dst_buf[i * K + k] = -nanf("");
+ break;
+ case -1:
+ case 0:
+ case 1:
+ default:
+ dst_buf[i * K + k] += tmp_float;
+ break;
+ }
+ }
+
+ free(valueState_a);
+ free(valueState_b);
+ free(valueState);
+ free(temp);
+}
+
+void test_amx_fp8_dphbf8ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_zero_buf[1024], tmp_src_buf[1024];
+
+ init_fp32_max_tile_zero_buffer (tmp_dst_zero_buf);
+ init_fp8_max_tile_buffer(tmp_src_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_src_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_src_buf);
+
+ calc_matrix_dphbf8ps (&dst, &src1, &src2);
+
+ _tile_dphbf8ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_float_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_fp8 } */
+/* { dg-options "-O2 -mamx-fp8" } */
+
+#define AMX_FP8
+#define DO_TEST test_amx_fp8_dphf8ps
+void test_amx_fp8_dphf8ps ();
+
+#include "amx-helper.h"
+#include "fp8-helper.h"
+#include "fp-emulation.h"
+
+void
+calc_matrix_dphf8ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ unsigned char *src1_buf = (unsigned char *)src1->buf;
+ unsigned char *src2_buf = (unsigned char *)src2->buf;
+ float *dst_buf = (float *)dst->buf;
+
+ int M = src1->rows;
+ int N = src1->colsb / 4;
+ int K = src2->colsb / 4;
+
+ int* valueState_a = (int*)malloc(sizeof(int));
+ int* valueState_b = (int*)malloc(sizeof(int));
+ /* Zero-initialize: both arrays are read before their first update below.  */
+ int* valueState = (int *)calloc(M * K, sizeof(int));
+ __int128_t *temp = (__int128_t *)calloc(M * K, sizeof(__int128_t));
+
+ for (int i = 0; i < M; i++)
+ {
+ for (int j = 0; j < N; j++)
+ for (int k = 0; k < K; k++)
+ {
+ int64_t s1e0 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 0], 0, valueState_a);
+ int64_t s2e0 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 0], 0, valueState_b);
+ int valueState_e0 = state_handler(*valueState_a, *valueState_b, s1e0, s2e0, '*');
+
+ int64_t s1e1 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 1], 0, valueState_a);
+ int64_t s2e1 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 1], 0, valueState_b);
+ int valueState_e1 = state_handler(*valueState_a, *valueState_b, s1e1, s2e1, '*');
+
+ int64_t s1e2 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 2], 0, valueState_a);
+ int64_t s2e2 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 2], 0, valueState_b);
+ int valueState_e2 = state_handler(*valueState_a, *valueState_b, s1e2, s2e2, '*');
+
+ int64_t s1e3 = shift_fp8_to_int64(src1_buf[4 * i * N + 4 * j + 3], 0, valueState_a);
+ int64_t s2e3 = shift_fp8_to_int64(src2_buf[4 * j * K + 4 * k + 3], 0, valueState_b);
+ int valueState_e3 = state_handler(*valueState_a, *valueState_b, s1e3, s2e3, '*');
+
+ valueState[i * K + k] = state_handler(
+ state_handler(
+ state_handler(
+ state_handler(
+ valueState_e0, valueState_e1, (__int128_t)s1e0 * s2e0, s1e1 * s2e1, '+'),
+ valueState_e2, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1, s1e2 * s2e2, '+'),
+ valueState_e3, (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2, s1e3 * s2e3, '+'),
+ valueState[i * K + k], (__int128_t)s1e0 * s2e0 + s1e1 * s2e1 + s1e2 * s2e2 + s1e3 * s2e3, temp[i * K + k], '+');
+
+ temp[i * K + k] += (__int128_t)s1e0 * (__int128_t)s2e0
+ + (__int128_t)s1e1 * (__int128_t)s2e1
+ + (__int128_t)s1e2 * (__int128_t)s2e2
+ + (__int128_t)s1e3 * (__int128_t)s2e3;
+ }
+ }
+
+ for (int i = 0; i < M; i++)
+ for (int k = 0; k < K; k++)
+ {
+ float tmp_float = shift_int128_to_fp32(temp[i * K + k], 0, 0);
+ if (isnan(dst_buf[i * K + k]) || isnan(tmp_float))
+ dst_buf[i * K + k] = -nanf("");
+ else switch(valueState[i * K + k])
+ {
+ case -3:
+ dst_buf[i * K + k] = -INFINITY;
+ break;
+ case 3:
+ dst_buf[i * K + k] = 1.0 / 0.0;
+ break;
+ case 2:
+ case -2:
+ dst_buf[i * K + k] = -nanf("");
+ break;
+ case -1:
+ case 0:
+ case 1:
+ default:
+ dst_buf[i * K + k] += tmp_float;
+ break;
+ }
+ }
+
+ free(valueState_a);
+ free(valueState_b);
+ free(valueState);
+ free(temp);
+}
+
+void test_amx_fp8_dphf8ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_zero_buf[1024], tmp_src_buf[1024];
+
+ init_fp32_max_tile_zero_buffer (tmp_dst_zero_buf);
+ init_fp8_max_tile_buffer(tmp_src_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_src_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_src_buf);
+
+ calc_matrix_dphf8ps (&dst, &src1, &src2);
+
+ _tile_dphf8ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_float_tile_register (&dst_ref, &dst))
+ abort ();
+}
--- /dev/null
+#ifndef FP_EMULATION_H_INCLUDED
+#define FP_EMULATION_H_INCLUDED
+
+#include <math.h>
+#include <stdlib.h>	/* For abs.  */
+
+int is_snan(float x)
+{
+ union32f_ud fb;
+ fb.f = x;
+ return __builtin_isnan(x) && (fb.u & 0x00400000) == 0;
+}
+
+int is_qnan(float x)
+{
+ return __builtin_isnan(x) && !is_snan(x);
+}
+
+#define INTEL_SSE_MATH_OP(op, a, b) \
+({ \
+ union32f_ud tmp; \
+ float result; \
+ if (is_qnan(a)) \
+ result = a; \
+ else if (is_snan(a)) \
+ { \
+ tmp.f = a; tmp.u |= 0x400000; result = tmp.f; \
+ } else if (is_snan(b)) \
+ { \
+ tmp.f = b; tmp.u |= 0x400000; result = tmp.f; \
+ } else if (is_qnan(b)) \
+ result = b; \
+ else \
+ result = a op b; \
+ result; \
+})
+
+float
+intel_sse_math_add(float a, float b)
+{
+ return INTEL_SSE_MATH_OP(+, a, b);
+}
+
+float
+intel_sse_math_sub(float a, float b)
+{
+ return INTEL_SSE_MATH_OP(-, a, b);
+}
+
+float
+intel_sse_math_mul(float a, float b)
+{
+ return INTEL_SSE_MATH_OP(*, a, b);
+}
+
+float
+intel_sse_math_div(float a, float b)
+{
+ return INTEL_SSE_MATH_OP(/, a, b);
+}
+
+/* +-3 == +-inf,
+ +-2 == +-NaN, use -2 only,
+ +-1 == +- normal number,
+ 0 == undefined/not initialized */
+
+int
+state_handler(int src0, int src1, __int128 a, __int128 b, char op)
+{
+ /* Nan */
+ if (src0 == -2 || src1 == -2)
+ return -2;
+
+ if (abs(src0) == 3 || abs(src1) == 3)
+ {
+ /* +INF + +INF = +INF, -INF + -INF = -INF */
+ if (src0 == src1)
+ return src0;
+
+ /* Positive result */
+ if ((op == '*' && ((src0 < 0 && b < 0) || (src1 < 0 && a < 0))) ||
+ (op == '*' && ((src0 > 0 && b > 0) || (src1 > 0 && a > 0))))
+ return 3;
+
+ /* -INF * (positive normal) = -INF */
+ if (op == '*' && ((src0 < 0 || src1 < 0) && (a >= 0 && b >= 0)))
+ return -3;
+
+ /* INF * 0 = NaN */
+ if (a == 0 || b == 0)
+ return -2;
+
+ /* -INF + +INF = NaN, else +INF or -INF */
+ return (op == '+' && src0 != src1) ? -2 : 3 * ((src0 / 3) + (src1 / 3));
+ }
+
+ /* Normal number case */
+ __int128 result = op == '+' ? a + b : a * b;
+ return result < 0 ? -1 : 1;
+}
+
+int
+state_handler_float (float src)
+{
+ if (isnan (src))
+ return -2;
+ else if (isinf (src))
+ return src > 0 ? 3 : -3;
+ else
+ return src >= 0 ? 1 : -1;
+}
+
+#endif
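A quick illustration of the state encoding above (a sketch assuming the fp-emulation.h helpers are in scope; check_state_encoding is a hypothetical helper):

#include <assert.h>
#include <math.h>

static void
check_state_encoding (void)
{
  assert (state_handler_float (INFINITY) == 3);    /* +inf   -> +3 */
  assert (state_handler_float (-INFINITY) == -3);  /* -inf   -> -3 */
  assert (state_handler_float (nanf ("")) == -2);  /* NaN    -> -2 */
  assert (state_handler_float (1.5f) == 1);        /* +normal -> +1 */
  assert (state_handler_float (-2.0f) == -1);      /* -normal -> -1 */
}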
#ifndef FP8_HELPER_UNCLUDED
#define FP8_HELPER_UNCLUDED
+#include <stdint.h>
+#include <inttypes.h>
+
typedef union
{
_Float16 f16;
unsigned short u16;
} Float16Union;
+typedef union
+{
+ float f;
+ uint32_t u;
+} Floatuint32Union;
+
static unsigned char
convert_fp16_to_hf8 (_Float16 x, unsigned char b, int s)
{
return res.f16;
}
+static float
+convert_bf8_to_fp32 (unsigned char x)
+{
+ Float16Union u = {.u16 = (x << 8) & 0xff00};
+ return (float)(u.f16);
+}
+
+static float
+convert_hf8_to_fp32 (unsigned char x)
+{
+ Floatuint32Union res = {.f = 0.0f};
+ unsigned int s = (x & 0x80) << 24, e = (x & 0x78) >> 3, m = x & 0x07;
+ unsigned int e_norm = e + 120;
+
+ if (!e && m)
+ {
+ unsigned int lz_cnt = 2 - (m > 1) - (m > 3);
+ e_norm -= lz_cnt;
+ m = (m << (lz_cnt + 1) & 0x7);
+ }
+ else if (!e)
+ e_norm = 0;
+ else if (e == 0xf && m == 0x7)
+ e_norm = 255, m = 4;
+
+ res.u |= (e_norm << 23) | (m << 20) | s;
+
+ return res.f;
+}
+
+static float
+convert_fp8_to_fp32 (unsigned char x, int y)
+{
+ return y ? convert_bf8_to_fp32 (x)
+ : convert_hf8_to_fp32 (x);
+}
+
+static int
+fp8_isNan(unsigned char value, int bf8)
+{
+ unsigned char e = bf8 ? (value >> 2) & 0b11111 : (value >> 3) & 0b1111;
+ unsigned char m = bf8 ? value & 0b11 : value & 0b111;
+
+ return bf8 ? (e == 0b11111 && m != 0) : (e == 0b1111 && m == 0b111);
+}
+
+static int
+bf8_isInf(unsigned char value)
+{
+ unsigned char sign = value >> 7;
+ unsigned char e = (value >> 2) & 0b11111;
+ unsigned char m = value & 0b11;
+
+ if (e == 0b11111 && !m)
+ return sign ? -1 : 1;
+ return 0;
+}
+
+#ifdef __x86_64__
+/* type = 0 for hf8 and type = 1 for bf8 */
+/* value is 2^16*x for bf8 and 2^9*x for hf8 */
+/* +-3 means +-inf,
+ +-2 means +-NaN, and we will use -2 only,
+ +-1 means positive and negative normal numbers,
+ 0 means undefined/not initialised */
+static int64_t
+shift_fp8_to_int64 (unsigned char x, int bf8, int *valueState)
+{
+ *valueState = (x >> 7) ? -1 : 1;
+ if (fp8_isNan (x, bf8))
+ *valueState = -2;
+ if (bf8)
+ {
+ if (bf8_isInf(x) == 1)
+ *valueState = 3;
+ else if (bf8_isInf(x) == -1)
+ *valueState = -3;
+ }
+
+ unsigned short sign = (x & 0x80) >> 7;
+ unsigned short exp = bf8 ? (x & 0x7c) >> 2 : (x & 0x78) >> 3;
+ unsigned short frac = bf8 ? (x & 0x03) : (x & 0x07);
+ unsigned short mant = (exp == 0) ? frac : (bf8 ? (frac | 0x4) : (frac | 0x8));
+ unsigned short e_count = (exp == 0) ? 0 : exp - 1;
+ int64_t magnitude = (int64_t)mant << (int64_t)e_count;
+
+ return sign ? -magnitude : magnitude;
+}
+
+/* type = 0 for hf8 and type = 1 for bf8 */
+static float
+shift_int128_to_fp32 (__int128_t in, int type1, int type2)
+{
+ if (in == 0)
+ return 0;
+
+ unsigned short sign = (in >> 127) & 1;
+ unsigned short Jbit_position = 126;
+ unsigned short fac = (type1 == type2) ? ( (type1) ? 32 : 18) : 25;
+
+ __int128_t magnitude = sign ? -in : in;
+
+ while (((magnitude >> 126) & 1) == 0)
+ {
+ Jbit_position --;
+ magnitude <<= 1;
+ }
+
+ __int128_t sticky = (magnitude & (((__int128_t)1 << 102) - 1)) != 0;
+ __int128_t Gbit = (magnitude >> 102) & 1;
+ __int128_t Lbit = (magnitude >> 103) & 1;
+ __int128_t RndAddl = Gbit & ( Lbit | sticky);
+ __int128_t mant = (magnitude >> 103) & (((__int128_t)1 << 25) - 1);
+
+ __int128_t Rndmant = mant + RndAddl;
+ __int128_t Ovf = Rndmant >> 24;
+ __int128_t exp = 127 + Jbit_position - fac + Ovf;
+ __int128_t frac = Rndmant & 0x7FFFFF;
+
+ Floatuint32Union res;
+ res.u = sign << 31;
+ res.u |= exp << 23;
+ res.u |= frac;
+
+ return res.f;
+}
+#endif
+
#endif
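As a worked check of the scaling noted above ("2^16*x for bf8 and 2^9*x for hf8"): bf8 (E5M2, bias 15) encodes 1.0 as 0x3C and hf8 (E4M3, bias 7) encodes 1.0 as 0x38, so shift_fp8_to_int64 should return 65536 and 512 for them. A sketch, assuming fp8-helper.h is included on x86_64 (check_fp8_scaling is a hypothetical helper):

#include <assert.h>

static void
check_fp8_scaling (void)
{
  int state;

  /* bf8 1.0: mant (0 | 0x4) << (15 - 1) == 1 << 16.  */
  assert (shift_fp8_to_int64 (0x3C, 1, &state) == 65536);

  /* hf8 1.0: mant (0 | 0x8) << (7 - 1) == 1 << 9.  */
  assert (shift_fp8_to_int64 (0x38, 0, &state) == 512);
}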
extern void test_amx_avx512 (void) __attribute__((__target__("amx-avx512")));
extern void test_amx_tf32 (void) __attribute__((__target__("amx-tf32")));
extern void test_amx_transpose (void) __attribute__((__target__("amx-transpose")));
+extern void test_amx_fp8 (void) __attribute__((__target__("amx-fp8")));
extern void test_no_sgx (void) __attribute__((__target__("no-sgx")));
extern void test_no_avx512vpopcntdq(void) __attribute__((__target__("no-avx512vpopcntdq")));
extern void test_no_amx_avx512 (void) __attribute__((__target__("no-amx-avx512")));
extern void test_no_amx_tf32 (void) __attribute__((__target__("no-amx-tf32")));
extern void test_no_amx_transpose (void) __attribute__((__target__("no-amx-transpose")));
+extern void test_no_amx_fp8 (void) __attribute__((__target__("no-amx-fp8")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
extern void test_arch_core2 (void) __attribute__((__target__("arch=core2")));
popcntintrin.h gfniintrin.h and mm_malloc.h are usable
with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */
#include <x86intrin.h>
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16 -mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
/* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -msha -mxsavec -mxsaves -mclflushopt -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavxifma -mavxvnniint8 -mavxneconvert -mamx-fp16 -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mavx10.2-512 -mamx-avx512 -mamx-tf32 -mamx-transpose -mamx-fp8" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
#ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose,amx-fp8")
#endif
/* Following intrinsics require immediate arguments. They
/* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
#ifdef DIFFERENT_PRAGMAS
-#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose")
+#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,sha,gfni,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose,amx-fp8")
#endif
#include <immintrin.h>
test_1 (_cvtss_sh, unsigned short, float, 1)
#define __builtin_ia32_minmaxps128_mask(A, B, C, D, E) __builtin_ia32_minmaxps128_mask (A, B, 100, D, E)
#define __builtin_ia32_minmaxps256_mask_round(A, B, C, D, E, F) __builtin_ia32_minmaxps256_mask_round (A, B, 100, D, E, 4)
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,sha,xsavec,xsaves,clflushopt,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,vpclmulqdq,pconfig,wbnoinvd,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avxifma,avxvnniint8,avxneconvert,cmpccxadd,amx-fp16,prefetchi,raoint,amx-complex,avxvnniint16,sm3,sha512,sm4,avx10.2-512,amx-avx512,amx-tf32,amx-transpose,amx-fp8")
#include <x86intrin.h>
} "-mamx-transpose" ]
}
+# Return 1 if amx-fp8 instructions can be compiled.
+proc check_effective_target_amx_fp8 { } {
+ return [check_no_compiler_messages amx_fp8 object {
+ void
+ foo ()
+ {
+ __asm__ volatile ("tdpbf8ps\t%%tmm1, %%tmm2, %%tmm3" ::);
+ }
+ } "-mamx-fp8" ]
+}
+
# Return 1 if sse instructions can be compiled.
proc check_effective_target_sse { } {
return [check_no_compiler_messages sse object {