This patch adds intrinsics support for the AVX512BMM instructions, along with the new znver6 processor target that enables them. Code-generation and runtime tests will be added in future patches.
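For reference, below is a minimal usage sketch of the new intrinsics (compile with -mavx512bmm). It only exercises the API added by this patch; the comments describe the intended behaviour as implied by the intrinsic names and the zero-masking convention used in the header, not a published specification.

#include <immintrin.h>

/* Bit-matrix multiply-accumulate with OR reduction; the first
   argument is the accumulator (tied to the destination).  */
__m512i
do_bmacor (__m512i acc, __m512i a, __m512i b)
{
  return _mm512_bmacor16x16x16 (acc, a, b);
}

/* Reverse the bit order within each byte, zeroing the byte lanes
   not selected by the mask.  */
__m512i
do_bitrev_maskz (__mmask64 m, __m512i v)
{
  return _mm512_maskz_bitrev_epi8 (m, v);
}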
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_amd_cpu): Add znver6 model
numbers for family 1Ah.
(get_available_features): Set feature AVX512BMM.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AVX512BMM_SET):
New macro.
(OPTION_MASK_ISA2_AVX512BMM_UNSET): New macro.
(OPTION_MASK_ISA2_AVX512BW_UNSET): Unset AVX512BMM.
(ix86_handle_option): Handle -mavx512bmm.
* common/config/i386/i386-cpuinfo.h (enum processor_subtypes):
Add AMDFAM1AH_ZNVER6.
(enum processor_features): Add FEATURE_AVX512BMM.
* common/config/i386/i386-isas.h: Likewise.
* config.gcc: Add avx512bmmintrin.h, avx512bmmvlintrin.h,
znver6.
* config/i386/cpuid.h (bit_AVX512BMM): New macro.
* config/i386/driver-i386.cc (host_detect_local_cpu): Detect znver6.
* config/i386/i386-builtin.def (BDESC): Add AVX512BMM builtins.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__AVX512BMM__ and the znver6 macros.
* config/i386/i386-isa.def (AVX512BMM): Likewise.
* config/i386/i386-options.cc (m_ZNVER6): New macro.
(m_ZNVER): Add m_ZNVER6.
(processor_cost_table): Use znver5_cost for PROCESSOR_ZNVER6
for now.
(ix86_valid_target_attribute_inner_p): Likewise.
* config/i386/i386.cc (ix86_reassociation_width): Likewise.
* config/i386/i386.h (enum processor_type): Likewise.
* config/i386/i386.md: Likewise.
* config/i386/i386.opt: Likewise.
* config/i386/i386.opt.urls: Likewise.
* config/i386/immintrin.h: Likewise.
* config/i386/sse.md (avx512bmm_vbmacor16x16x16_<mode>): New
define_insn.
(avx512bmm_vbmacxor16x16x16_<mode>): Likewise.
(avx512bmm_vbitrevb_<mode>_mask): Likewise.
(avx512bmm_vbitrevb_<mode>): Likewise.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Likewise.
(ix86_adjust_cost): Likewise.
* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add
m_ZNVER6.
(X86_TUNE_FUSE_MOV_AND_ALU): Likewise.
(X86_TUNE_USE_SCATTER_2PARTS): Likewise.
(X86_TUNE_USE_SCATTER_4PARTS): Likewise.
(X86_TUNE_USE_SCATTER_8PARTS): Likewise.
(X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
(X86_TUNE_AVOID_512FMA_CHAINS): Likewise.
(X86_TUNE_AVX512_MOVE_BY_PIECES): Likewise.
* doc/extend.texi: Document amdfam1ah and znver6 for
__builtin_cpu_is.
* doc/invoke.texi: Document -march=znver6 and -mavx512bmm.
* config/i386/avx512bmmintrin.h: New file.
* config/i386/avx512bmmvlintrin.h: New file.
gcc/testsuite/ChangeLog:
* g++.target/i386/mv29.C: Add znver6 function version and
runtime check.
* gcc.target/i386/funcspec-56.inc: Add arch=znver6 and
tune=znver6.
* gcc.target/i386/avx512bmm-1.c: New test.
* gcc.target/i386/avx512bmmvl-1.c: New test.
CHECK___builtin_cpu_is ("znver5");
cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
}
+ else if ((model >= 0x50 && model <= 0x5f)
+ || (model >= 0x80 && model <= 0xcf)
+ || (model >= 0xd8 && model <= 0xe7))
+ {
+ cpu = "znver6";
+ CHECK___builtin_cpu_is ("znver6");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER6;
+ }
+ else if (has_cpu_feature (cpu_model, cpu_features2,
+ FEATURE_AVX512BMM))
+ {
+ cpu = "znver6";
+ CHECK___builtin_cpu_is ("znver6");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER6;
+ }
else if (has_cpu_feature (cpu_model, cpu_features2,
FEATURE_AVX512VP2INTERSECT))
{
CHECK___builtin_cpu_is ("znver5");
cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
}
break;
default:
break;
}
}
+ /* Get Advanced Features at level 0x21 (eax = 0x21). */
+ if (max_cpuid_level >= 0x21)
+ {
+ __cpuid (0x21, eax, ebx, ecx, edx);
+ if (eax & bit_AVX512BMM)
+ set_feature (FEATURE_AVX512BMM);
+ }
+
/* Get Advanced Features at level 0x24 (eax = 0x24, ecx = 0). */
if (avx10_set && max_cpuid_level >= 0x24)
{
#define OPTION_MASK_ISA2_MOVRS_SET OPTION_MASK_ISA2_MOVRS
#define OPTION_MASK_ISA2_AMX_MOVRS_SET \
(OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_MOVRS)
+#define OPTION_MASK_ISA2_AVX512BMM_SET OPTION_MASK_ISA2_AVX512BMM
/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2. */
#define OPTION_MASK_ISA2_AMX_FP8_UNSET OPTION_MASK_ISA2_AMX_FP8
#define OPTION_MASK_ISA2_MOVRS_UNSET OPTION_MASK_ISA2_MOVRS
#define OPTION_MASK_ISA2_AMX_MOVRS_UNSET OPTION_MASK_ISA2_AMX_MOVRS
+#define OPTION_MASK_ISA2_AVX512BMM_UNSET OPTION_MASK_ISA2_AVX512BMM
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
as -mno-sse4.1. */
#define OPTION_MASK_ISA2_AVX512BW_UNSET \
(OPTION_MASK_ISA2_AVX512BF16_UNSET \
- | OPTION_MASK_ISA2_AVX512FP16_UNSET)
+ | OPTION_MASK_ISA2_AVX512FP16_UNSET \
+ | OPTION_MASK_ISA2_AVX512BMM_UNSET)
/* Set 1 << value as value of -malign-FLAG option. */
}
return true;
+ case OPT_mavx512bmm:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512BMM_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BMM_SET;
+ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW_SET;
+ opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AVX512BMM_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BMM_UNSET;
+ }
+ return true;
+
case OPT_mavxvnni:
if (value)
{
"znver2",
"znver3",
"znver4",
- "znver5"
+ "znver5",
+ "znver6"
};
/* Guarantee that the array is aligned with enum processor_type. */
{"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
PTA_ZNVER5,
M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
+ {"znver6", PROCESSOR_ZNVER6, CPU_ZNVER5,
+ PTA_ZNVER6,
+ M_CPU_SUBTYPE (AMDFAM1AH_ZNVER6), P_PROC_AVX512F},
{"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
PTA_BTVER1,
M_CPU_TYPE (AMD_BTVER1), P_PROC_SSE4_A},
ZHAOXIN_FAM7H_SHIJIDADAO,
INTEL_COREI7_DIAMONDRAPIDS,
INTEL_COREI7_NOVALAKE,
+ AMDFAM1AH_ZNVER6,
CPU_SUBTYPE_MAX
};
FEATURE_AMX_FP8 = 120,
FEATURE_MOVRS,
FEATURE_AMX_MOVRS,
+ FEATURE_AVX512BMM,
CPU_FEATURE_MAX
};
ISA_NAMES_TABLE_ENTRY("amx-fp8", FEATURE_AMX_FP8, P_NONE, "-mamx-fp8")
ISA_NAMES_TABLE_ENTRY("movrs", FEATURE_MOVRS, P_NONE, "-mmovrs")
ISA_NAMES_TABLE_ENTRY("amx-movrs", FEATURE_AMX_MOVRS, P_NONE, "-mamx-movrs")
+ ISA_NAMES_TABLE_ENTRY("avx512bmm", FEATURE_AVX512BMM, P_NONE, "-mavx512bmm")
ISA_NAMES_TABLE_END
avx10_2bf16intrin.h avx10_2satcvtintrin.h
avx10_2minmaxintrin.h avx10_2copyintrin.h
amxavx512intrin.h amxtf32intrin.h amxfp8intrin.h
- movrsintrin.h amxmovrsintrin.h"
+ movrsintrin.h amxmovrsintrin.h avx512bmmintrin.h
+ avx512bmmvlintrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
# 64-bit x86 processors supported by --with-arch=. Each processor
# MUST be separated by exactly one space.
x86_64_archs="amdfam10 athlon64 athlon64-sse3 barcelona bdver1 bdver2 \
-bdver3 bdver4 znver1 znver2 znver3 znver4 znver5 btver1 btver2 k8 k8-sse3 \
+bdver3 bdver4 znver1 znver2 znver3 znver4 znver5 znver6 btver1 btver2 k8 k8-sse3 \
opteron opteron-sse3 nocona core2 corei7 corei7-avx core-avx-i core-avx2 \
atom slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
silvermont skylake-avx512 cannonlake icelake-client icelake-server \
arch=znver5
cpu=znver5
;;
+ znver6-*)
+ arch=znver6
+ cpu=znver6
+ ;;
bdver4-*)
arch=bdver4
cpu=bdver4
arch=znver5
cpu=znver5
;;
+ znver6-*)
+ arch=znver6
+ cpu=znver6
+ ;;
bdver4-*)
arch=bdver4
cpu=bdver4
--- /dev/null
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512bmmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BMMINTRIN_H_INCLUDED
+#define _AVX512BMMINTRIN_H_INCLUDED
+
+#ifndef __AVX512BMM__
+#pragma GCC push_options
+#pragma GCC target("avx512bmm")
+#define __DISABLE_AVX512BMM__
+#endif /* __AVX512BMM__ */
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bmacor16x16x16 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vbmacor16x16x16_v32hi ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __C);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bmacxor16x16x16 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vbmacxor16x16x16_v32hi ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __C);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_bitrev_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_bitrev_epi8 (__mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A,
+ (__v64qi)(__m512i)
+ _mm512_setzero_epi32 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bitrev_epi8 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A,
+ (__v64qi)(__m512i)
+ _mm512_undefined_epi32 (),
+ (__mmask64) -1);
+}
+
+#ifdef __DISABLE_AVX512BMM__
+#undef __DISABLE_AVX512BMM__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BMM__ */
+
+#endif /* _AVX512BMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512bmmvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BMMVLINTRIN_H_INCLUDED
+#define _AVX512BMMVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512BMM__)
+#pragma GCC push_options
+#pragma GCC target("avx512bmm,avx512vl")
+#define __DISABLE_AVX512BMMVL__
+#endif /* __AVX512BMMVL__ */
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bmacor16x16x16 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vbmacor16x16x16_v16hi ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __C);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bmacxor16x16x16 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vbmacxor16x16x16_v16hi ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __C);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_mask_bitrev_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_maskz_bitrev_epi8 (__mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A,
+ (__v16qi)(__m128i)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_bitrev_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A,
+ (__v16qi)(__m128i)
+ _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_bitrev_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_bitrev_epi8 (__mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A,
+ (__v32qi)(__m256i)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bitrev_epi8 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A,
+ (__v32qi)(__m256i)
+ _mm256_undefined_si256 (),
+ (__mmask32) -1);
+}
+
+#ifdef __DISABLE_AVX512BMMVL__
+#undef __DISABLE_AVX512BMMVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BMMVL__ */
+
+#endif /* _AVX512BMMVLINTRIN_H_INCLUDED */
#define bit_AESKLE ( 1<<0 )
#define bit_WIDEKL ( 1<<2 )
+/* AVX512BMM leaf (%eax == 0x21) */
+#define bit_AVX512BMM ( 1<<23 )
+
/* AMX sub leaf (%eax == 0x1e, %ecx == 1) */
/* %eax */
#define bit_AMX_FP8 (1 << 4)
processor = PROCESSOR_GEODE;
else if (has_feature (FEATURE_MOVBE) && family == 22)
processor = PROCESSOR_BTVER2;
+ else if (has_feature (FEATURE_AVX512BMM))
+ processor = PROCESSOR_ZNVER6;
else if (has_feature (FEATURE_AVX512VP2INTERSECT))
processor = PROCESSOR_ZNVER5;
else if (has_feature (FEATURE_AVX512F))
case PROCESSOR_ZNVER5:
cpu = "znver5";
break;
+ case PROCESSOR_ZNVER6:
+ cpu = "znver6";
+ break;
case PROCESSOR_BTVER1:
cpu = "btver1";
break;
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, "__builtin_ia32_cvtbf2sf", IX86_BUILTIN_CVTBF2SF, UNKNOWN, (int) FLOAT_FTYPE_BFLOAT16)
+/* AVX512BMM. */
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacor16x16x16_v16hi, "__builtin_ia32_vbmacor16x16x16_v16hi", IX86_BUILTIN_VBMACORV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacor16x16x16_v32hi, "__builtin_ia32_vbmacor16x16x16_v32hi", IX86_BUILTIN_VBMACORV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacxor16x16x16_v16hi, "__builtin_ia32_vbmacxor16x16x16_v16hi", IX86_BUILTIN_VBMACXORV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacxor16x16x16_v32hi, "__builtin_ia32_vbmacxor16x16x16_v32hi", IX86_BUILTIN_VBMACXORV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v16qi_mask, "__builtin_ia32_vbitrevb128_mask", IX86_BUILTIN_VBITREV16_MASK, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v32qi_mask, "__builtin_ia32_vbitrevb256_mask", IX86_BUILTIN_VBITREV32_MASK, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v64qi_mask, "__builtin_ia32_vbitrevb512_mask", IX86_BUILTIN_VBITREV64_MASK, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
/* AVX512FP16. */
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
def_or_undef (parse_in, "__znver5");
def_or_undef (parse_in, "__znver5__");
break;
+ case PROCESSOR_ZNVER6:
+ def_or_undef (parse_in, "__znver6");
+ def_or_undef (parse_in, "__znver6__");
+ break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__btver1");
def_or_undef (parse_in, "__btver1__");
case PROCESSOR_ZNVER5:
def_or_undef (parse_in, "__tune_znver5__");
break;
+ case PROCESSOR_ZNVER6:
+ def_or_undef (parse_in, "__tune_znver6__");
+ break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__tune_btver1__");
break;
def_or_undef (parse_in, "__MOVRS__");
if (isa_flag2 & OPTION_MASK_ISA2_AMX_MOVRS)
def_or_undef (parse_in, "__AMX_MOVRS__");
+ if (isa_flag2 & OPTION_MASK_ISA2_AVX512BMM)
+ def_or_undef (parse_in, "__AVX512BMM__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
DEF_PTA(AMX_FP8)
DEF_PTA(MOVRS)
DEF_PTA(AMX_MOVRS)
+DEF_PTA(AVX512BMM)
#define m_ZNVER3 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER3)
#define m_ZNVER4 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER4)
#define m_ZNVER5 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER5)
+#define m_ZNVER6 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER6)
#define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
#define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
#define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
#define m_BTVER (m_BTVER1 | m_BTVER2)
-#define m_ZNVER (m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5)
+#define m_ZNVER (m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_ZNVER6)
#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
| m_ZNVER)
{ "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 },
{ "-mamx-fp8", OPTION_MASK_ISA2_AMX_FP8 },
{ "-mmovrs", OPTION_MASK_ISA2_MOVRS },
- { "-mamx-movrs", OPTION_MASK_ISA2_AMX_MOVRS }
+ { "-mamx-movrs", OPTION_MASK_ISA2_AMX_MOVRS },
+ { "-mavx512bmm", OPTION_MASK_ISA2_AVX512BMM }
};
static struct ix86_target_opts isa_opts[] =
{
&znver2_cost, /* PROCESSOR_ZNVER2. */
&znver3_cost, /* PROCESSOR_ZNVER3. */
&znver4_cost, /* PROCESSOR_ZNVER4. */
- &znver5_cost /* PROCESSOR_ZNVER5. */
+ &znver5_cost, /* PROCESSOR_ZNVER5. */
+ &znver5_cost /* PROCESSOR_ZNVER6. */
};
/* Guarantee that the array is aligned with enum processor_type. */
IX86_ATTR_ISA ("amx-fp8", OPT_mamx_fp8),
IX86_ATTR_ISA ("movrs", OPT_mmovrs),
IX86_ATTR_ISA ("amx-movrs", OPT_mamx_movrs),
+ IX86_ATTR_ISA ("avx512bmm", OPT_mavx512bmm),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
return 1;
/* Znver5 can do 2 integer multiplications per cycle with latency
of 3. */
- if (ix86_tune == PROCESSOR_ZNVER5
+ if ((ix86_tune == PROCESSOR_ZNVER5 || ix86_tune == PROCESSOR_ZNVER6)
&& INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
width = 6;
PROCESSOR_ZNVER3,
PROCESSOR_ZNVER4,
PROCESSOR_ZNVER5,
+ PROCESSOR_ZNVER6,
PROCESSOR_max
};
| PTA_AVX512VNNI | PTA_AVX512BITALG | PTA_AVX512VPOPCNTDQ;
constexpr wide_int_bitmask PTA_ZNVER5 = PTA_ZNVER4 | PTA_AVXVNNI
| PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_PREFETCHI;
+constexpr wide_int_bitmask PTA_ZNVER6 = PTA_ZNVER5 | PTA_AVXVNNIINT8
+ | PTA_AVXNECONVERT | PTA_AVX512BMM | PTA_AVXIFMA | PTA_AVX512FP16;
constexpr wide_int_bitmask PTA_BTVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
| PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_LZCNT | PTA_POPCNT
(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
atom,slm,glm,haswell,generic,lujiazui,yongfeng,amdfam10,bdver1,
bdver2,bdver3,bdver4,btver2,znver1,znver2,znver3,znver4,
- znver5"
+ znver5,znver6"
(const (symbol_ref "ix86_schedule")))
;; A basic instruction type. Refinements due to arguments to be
mamx-movrs
Target Mask(ISA2_AMX_MOVRS) Var(ix86_isa_flags2) Save
Support AMX-MOVRS built-in functions and code generation.
+
+mavx512bmm
+Target Mask(ISA2_AVX512BMM) Var(ix86_isa_flags2) Save
+Support AVX512BMM built-in functions and code generation.
mamx-movrs
UrlSuffix(gcc/x86-Options.html#index-mamx-movrs)
+mavx512bmm
+UrlSuffix(gcc/x86-Options.html#index-mavx512bmm)
+
#include <movrsintrin.h>
#include <amxmovrsintrin.h>
+
+#include <avx512bmmintrin.h>
+
+#include <avx512bmmvlintrin.h>
#endif /* _IMMINTRIN_H_INCLUDED */
UNSPEC_MINMAXBF16
UNSPEC_MINMAX
+ ;; For AVX512BMM support
+ UNSPEC_VBMACOR
+ UNSPEC_VBMACXOR
+ UNSPEC_VBITREV
+
;; For MOVRS suppport
UNSPEC_VMOVRS
])
(set_attr "prefix" "evex")
(set_attr "memory" "load")
(set_attr "mode" "<sseinsnmode>")])
+
+(define_mode_iterator VI2_256_512_AVX512VL
+ [V32HI (V16HI "TARGET_AVX512VL")])
+
+(define_insn "avx512bmm_vbmacor16x16x16_<mode>"
+ [(set (match_operand:VI2_256_512_AVX512VL 0 "register_operand" "=v")
+ (unspec:VI2_256_512_AVX512VL
+ [(match_operand:VI2_256_512_AVX512VL 1 "register_operand" "0")
+ (match_operand:VI2_256_512_AVX512VL 2 "register_operand" "v")
+ (match_operand:VI2_256_512_AVX512VL 3 "nonimmediate_operand" "vm")]
+ UNSPEC_VBMACOR))]
+ "TARGET_AVX512BMM"
+ "vbmacor16x16x16\t{%3, %2, %0|%0, %2, %3}"
+ [(set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512bmm_vbmacxor16x16x16_<mode>"
+ [(set (match_operand:VI2_256_512_AVX512VL 0 "register_operand" "=v")
+ (unspec:VI2_256_512_AVX512VL
+ [(match_operand:VI2_256_512_AVX512VL 1 "register_operand" "0")
+ (match_operand:VI2_256_512_AVX512VL 2 "register_operand" "v")
+ (match_operand:VI2_256_512_AVX512VL 3 "nonimmediate_operand" "vm")]
+ UNSPEC_VBMACXOR))]
+ "TARGET_AVX512BMM"
+ "vbmacxor16x16x16\t{%3, %2, %0|%0, %2, %3}"
+ [(set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512bmm_vbitrevb_<mode>_mask"
+ [(set (match_operand:VI1_AVX512VL 0 "register_operand" "=v")
+ (vec_merge:VI1_AVX512VL
+ (unspec:VI1_AVX512VL
+ [(match_operand:VI1_AVX512VL 1 "nonimmediate_operand" "vm")]
+ UNSPEC_VBITREV)
+ (match_operand:VI1_AVX512VL 2 "reg_or_0_operand" "0C")
+ (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
+ "TARGET_AVX512BMM"
+ "vbitrevb\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+ [(set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512bmm_vbitrevb_<mode>"
+ [(set (match_operand:VI1_AVX512VL 0 "register_operand" "=v")
+ (unspec:VI1_AVX512VL
+ [(match_operand:VI1_AVX512VL 1 "nonimmediate_operand" "vm")]
+ UNSPEC_VBITREV))]
+ "TARGET_AVX512BMM"
+ "vbitrevb\t{%1, %0|%0, %1}"
+ [(set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
is not going to be able to use more than 4 instructions since that
is limits of the decoders. */
case PROCESSOR_ZNVER5:
+ case PROCESSOR_ZNVER6:
return 4;
case PROCESSOR_ICELAKE_CLIENT:
case PROCESSOR_ZNVER3:
case PROCESSOR_ZNVER4:
case PROCESSOR_ZNVER5:
+ case PROCESSOR_ZNVER6:
/* Stack engine allows to execute push&pop instructions in parall. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND,
There is also limitation for immediate and displacement supported. */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
- m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER3 | m_ZNVER4 | m_ZNVER5)
+ m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER3
+ | m_ZNVER4 | m_ZNVER5 | m_ZNVER6)
/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov
and the destination is used by alu. alu must be one of
ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */
DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu",
- m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
+ m_ZNVER5 | m_ZNVER6 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
/* X86_TUNE_FUSE_AND_BRANCH_MEM: Fuse alu with a subsequent conditional
jump instruction when alu contains memory operand.
/* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements. */
DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
- ~(m_ZNVER4 | m_ZNVER5))
+ ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6))
/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements. */
/* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements. */
DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
- ~(m_ZNVER4 | m_ZNVER5))
+ ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6))
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements. */
/* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements. */
DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
- ~(m_ZNVER4 | m_ZNVER5))
+ ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6))
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain. */
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain. */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
- m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
+ m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_ZNVER6 | m_CORE_HYBRID
| m_SAPPHIRERAPIDS | m_GRANITERAPIDS | m_GRANITERAPIDS_D
| m_DIAMONDRAPIDS | m_CORE_ATOM | m_GENERIC)
/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain. */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5
+ | m_ZNVER6)
/* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction. */
/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
- m_ZNVER4 | m_ZNVER5)
+ m_ZNVER4 | m_ZNVER5 | m_ZNVER6)
/* X86_TUNE_AVX512_TWO_EPILOGUES: Use two vector epilogues for 512-bit
vectorized loops. */
@item znver4
AMD Family 19h Zen version 4.
+@item amdfam1ah
+AMD Family 1ah CPU.
+
@item znver5
AMD Family 1ah Zen version 5.
+
+@item znver6
+AMD Family 1ah Zen version 6.
@end table
Here is an example:
-mavx512fp16 -mavxifma -mavxvnniint8 -mavxneconvert -mcmpccxadd -mamx-fp16
-mprefetchi -mraoint -mamx-complex -mavxvnniint16 -msm3 -msha512 -msm4 -mapxf
-musermsr -mavx10.1 -mavx10.2 -mamx-avx512 -mamx-tf32 -mmovrs -mamx-movrs
--mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops
+-mavx512bmm -mcldemote -mms-bitfields -mno-align-stringops
+-minline-all-stringops
-minline-stringops-dynamically -mstringop-strategy=@var{alg}
-mkl -mwidekl
-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
AVX512BITALG, AVX512VPOPCNTDQ, GFNI, AVXVNNI, MOVDIRI, MOVDIR64B,
AVX512VP2INTERSECT, PREFETCHI and 64-bit instruction set extensions.)
+@item znver6
+AMD Family 1ah core based CPUs with x86-64 instruction set support. (This
+supersets BMI, BMI2, CLWB, F16C, FMA, FSGSBASE, AVX, AVX2, ADCX, RDSEED,
+MWAITX, SHA, CLZERO, AES, PCLMUL, CX16, MOVBE, MMX, SSE, SSE2, SSE3, SSE4A,
+SSSE3, SSE4.1, SSE4.2, ABM, XSAVEC, XSAVES, CLFLUSHOPT, POPCNT, RDPID,
+WBNOINVD, PKU, VPCLMULQDQ, VAES, AVX512F, AVX512DQ, AVX512IFMA, AVX512CD,
+AVX512BW, AVX512VL, AVX512BF16, AVX512VBMI, AVX512VBMI2, AVX512VNNI,
+AVX512BITALG, AVX512VPOPCNTDQ, GFNI, AVXVNNI, MOVDIRI, MOVDIR64B,
+AVX512VP2INTERSECT, PREFETCHI, AVXVNNIINT8, AVXIFMA, AVX512FP16, AVXNECONVERT,
+AVX512BMM and 64-bit instruction set extensions.)
+
@item btver1
CPUs based on AMD Family 14h cores with x86-64 instruction set support. (This
supersets MMX, SSE, SSE2, SSE3, SSSE3, SSE4A, CX16, ABM and 64-bit
@need 200
@opindex mamx-movrs
@itemx -mamx-movrs
+@need 200
+@opindex mavx512bmm
+@itemx -mavx512bmm
These switches enable the use of instructions in the MMX, SSE,
AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA, AES,
PCLMUL, CLFLUSHOPT, CLWB, FSGSBASE, PTWRITE, RDRND, F16C, FMA, PCONFIG,
AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16, AVXIFMA, AVXVNNIINT8, AVXNECONVERT,
CMPCCXADD, AMX-FP16, PREFETCHI, RAOINT, AMX-COMPLEX, AVXVNNIINT16, SM3, SHA512,
SM4, APX_F, USER_MSR, AVX10.1, AVX10.2, AMX-AVX512, AMX-TF32, AMX-FP8, MOVRS,
-AMX-MOVRS or CLDEMOTE extended instruction sets. Each has a corresponding
-@option{-mno-} option to disable use of these instructions.
+AMX-MOVRS, AVX512BMM or CLDEMOTE extended instruction sets. Each has a
+corresponding @option{-mno-} option to disable use of these instructions.
These extensions are also available as built-in functions: see
@ref{x86 Built-in Functions}, for details of the functions enabled and
return 11;
}
+int __attribute__ ((target("arch=znver6"))) foo () {
+ return 12;
+}
+
int main ()
{
int val = foo ();
assert (val == 10);
else if (__builtin_cpu_is ("znver5"))
assert (val == 11);
+ else if (__builtin_cpu_is ("znver6"))
+ assert (val == 12);
else
assert (val == 0);
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512bmm -O2" } */
+/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m512i x,y,z;
+volatile __mmask64 m;
+
+void extern
+avx512bmm_test (void)
+{
+ x = _mm512_bmacor16x16x16 (x, y, z);
+
+ x = _mm512_bmacxor16x16x16 (x, y, z);
+
+ x = _mm512_bitrev_epi8 (x);
+
+ x = _mm512_mask_bitrev_epi8 (m, x, y);
+
+ x = _mm512_maskz_bitrev_epi8 (m, x);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-mavx512bmm -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m256i x,y,z;
+volatile __m128i x_,y_,z_;
+volatile __mmask32 m;
+volatile __mmask16 m_;
+
+void extern
+avx512bmm_test (void)
+{
+ x = _mm256_bmacor16x16x16 (x, y, z);
+
+ x = _mm256_bmacxor16x16x16 (x, y, z);
+
+ x = _mm256_mask_bitrev_epi8 (m, x, y);
+ x_ = _mm128_mask_bitrev_epi8 (m_, x_, y_);
+
+ x = _mm256_maskz_bitrev_epi8 (m, y);
+ x_ = _mm128_maskz_bitrev_epi8 (m_, y_);
+
+ x = _mm256_bitrev_epi8 (x);
+ x_ = _mm128_bitrev_epi8 (x_);
+}
extern void test_arch_znver3 (void) __attribute__((__target__("arch=znver3")));
extern void test_arch_znver4 (void) __attribute__((__target__("arch=znver4")));
extern void test_arch_znver5 (void) __attribute__((__target__("arch=znver5")));
+extern void test_arch_znver6 (void) __attribute__((__target__("arch=znver6")));
extern void test_tune_nocona (void) __attribute__((__target__("tune=nocona")));
extern void test_tune_core2 (void) __attribute__((__target__("tune=core2")));
extern void test_tune_znver3 (void) __attribute__((__target__("tune=znver3")));
extern void test_tune_znver4 (void) __attribute__((__target__("tune=znver4")));
extern void test_tune_znver5 (void) __attribute__((__target__("tune=znver5")));
+extern void test_tune_znver6 (void) __attribute__((__target__("tune=znver6")));
extern void test_fpmath_sse (void) __attribute__((__target__("sse2,fpmath=sse")));
extern void test_fpmath_387 (void) __attribute__((__target__("sse2,fpmath=387")));