From: wwwhhhyyy Date: Mon, 30 Aug 2021 08:41:41 +0000 (+0800) Subject: [i386] GLC tuning: Break false dependency for dest register. X-Git-Tag: basepoints/gcc-13~1705 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1c2575586c47f56a2e75f734af42371579516f0c;p=thirdparty%2Fgcc.git [i386] GLC tuning: Break false dependency for dest register. For GoldenCove micro-architecture, force insert zero-idiom in asm template to break false dependency of dest register for several insns. The related insns are: VPERM/D/Q/PS/PD VRANGEPD/PS/SD/SS VGETMANTSS/SD/SH VGETMANDPS/PD - mem version only VPMULLQ VFMULCSH/PH VFCMULCSH/PH gcc/ChangeLog: * config/i386/i386.h (TARGET_DEST_FALSE_DEP_FOR_GLC): New macro. * config/i386/sse.md (__): Insert zero-idiom in output template when attr enabled, set new attribute to true for non-mask/maskz insn. (avx512fp16_sh_v8hf): Likewise. (avx512dq_mul3): Likewise. (_permvar): Likewise. (avx2_perm_1): Likewise. (avx512f_perm_1): Likewise. (avx512dq_rangep): Likewise. (avx512dq_ranges): Likewise. (_getmant): Likewise. (avx512f_vgetmant): Likewise. * config/i386/subst.md (mask3_dest_false_dep_for_glc_cond): New subst_attr. (mask4_dest_false_dep_for_glc_cond): Likewise. (mask6_dest_false_dep_for_glc_cond): Likewise. (mask10_dest_false_dep_for_glc_cond): Likewise. (maskc_dest_false_dep_for_glc_cond): Likewise. (mask_scalar4_dest_false_dep_for_glc_cond): Likewise. (mask_scalarc_dest_false_dep_for_glc_cond): Likewise. * config/i386/x86-tune.def (X86_TUNE_DEST_FALSE_DEP_FOR_GLC): New DEF_TUNE enabled for m_SAPPHIRERAPIDS and m_ALDERLAKE gcc/testsuite/ChangeLog: * gcc.target/i386/avx2-dest-false-dep-for-glc.c: New test. * gcc.target/i386/avx512dq-dest-false-dep-for-glc.c: Ditto. * gcc.target/i386/avx512f-dest-false-dep-for-glc.c: Ditto. * gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c: Ditto. * gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c: Ditto. * gcc.target/i386/avx512vl-dest-false-dep-for-glc.c: Ditto. --- diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 3ac0f698ae2b..f1bb8a868f17 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -429,6 +429,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_EXPAND_ABS] #define TARGET_V2DF_REDUCTION_PREFER_HADDPD \ ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD] +#define TARGET_DEST_FALSE_DEP_FOR_GLC \ + ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 50dc5da9a383..ea72aa5d9ac4 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -6536,7 +6536,12 @@ (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "")] UNSPEC_COMPLEX_F_C_MUL))] "TARGET_AVX512FP16 && " - "v\t{%2, %1, %0|%0, %1, %2}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && ) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "v\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "ssemul") (set_attr "mode" "")]) @@ -6742,7 +6747,12 @@ (match_dup 1) (const_int 3)))] "TARGET_AVX512FP16" - "vsh\t{%2, %1, %0|%0, %1, %2}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && ) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vsh\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "ssemul") (set_attr "mode" "V8HF")]) @@ -15207,7 +15217,14 @@ (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))] "TARGET_AVX512DQ && && ix86_binary_operator_ok (MULT, mode, operands)" - "vpmullq\t{%2, %1, %0|%0, %1, %2}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vpmullq\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "sseimul") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -24636,7 +24653,14 @@ (match_operand: 2 "register_operand" "v")] UNSPEC_VPERMVAR))] "TARGET_AVX2 && " - "vperm\t{%1, %2, %0|%0, %2, %1}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vperm\t{%1, %2, %0|%0, %2, %1}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "") (set_attr "mode" "")]) @@ -24873,6 +24897,10 @@ mask |= INTVAL (operands[4]) << 4; mask |= INTVAL (operands[5]) << 6; operands[2] = GEN_INT (mask); + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); return "vperm\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog") @@ -24944,6 +24972,10 @@ mask |= INTVAL (operands[4]) << 4; mask |= INTVAL (operands[5]) << 6; operands[2] = GEN_INT (mask); + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); return "vperm\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog") @@ -26843,7 +26875,14 @@ (match_operand:SI 3 "const_0_to_15_operand")] UNSPEC_RANGE))] "TARGET_AVX512DQ && " - "vrange\t{%3, %2, %1, %0|%0, %1, %2, %3}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vrange\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -26859,7 +26898,14 @@ (match_dup 1) (const_int 1)))] "TARGET_AVX512DQ" - "vrange\t{%3, %2, %1, %0|%0, %1, %2, %3}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vrange\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -26899,7 +26945,13 @@ (match_operand:SI 2 "const_0_to_15_operand")] UNSPEC_GETMANT))] "TARGET_AVX512F" - "vgetmant\t{%2, %1, %0|%0, %1, %2}"; +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && MEM_P (operands[1])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vgetmant\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -26914,7 +26966,14 @@ (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - "vgetmant\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vgetmant\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} [(set_attr "prefix" "evex") (set_attr "mode" "")]) diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 21d445cc46c8..bb86f82905f8 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -71,6 +71,11 @@ (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex") (define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex") (define_subst_attr "mask_expand_op3" "mask" "3" "5") +(define_subst_attr "mask3_dest_false_dep_for_glc_cond" "mask" "1" "operands[3] == CONST0_RTX(mode)") +(define_subst_attr "mask4_dest_false_dep_for_glc_cond" "mask" "1" "operands[4] == CONST0_RTX(mode)") +(define_subst_attr "mask6_dest_false_dep_for_glc_cond" "mask" "1" "operands[6] == CONST0_RTX(mode)") +(define_subst_attr "mask10_dest_false_dep_for_glc_cond" "mask" "1" "operands[10] == CONST0_RTX(mode)") +(define_subst_attr "maskc_dest_false_dep_for_glc_cond" "maskc" "1" "operands[3] == CONST0_RTX(mode)") (define_subst "mask" [(set (match_operand:SUBST_V 0) @@ -337,6 +342,8 @@ (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3") (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4") (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4") +(define_subst_attr "mask_scalar4_dest_false_dep_for_glc_cond" "mask_scalar" "1" "operands[4] == CONST0_RTX(mode)") +(define_subst_attr "mask_scalarc_dest_false_dep_for_glc_cond" "mask_scalarc" "1" "operands[3] == CONST0_RTX(V8HFmode)") (define_subst "mask_scalar" [(set (match_operand:SUBST_V 0) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 0d3fd0780681..f9eb3c227c11 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -79,6 +79,12 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY, m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC) +/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before + several insns to break false dependency on the dest register for GLC + micro-architecture. */ +DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC, + "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_ALDERLAKE) + /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies are resolved on SSE register parts instead of whole registers, so we may maintain just lower part of scalar values in proper format leaving the diff --git a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..787b1d08f804 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx2 -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + + +#include + +extern __m256i i1, i2, i3, i4; +extern __m256d d1, d2; +extern __m256 f1, f2; + +void vperm_test (void) +{ + i3 = _mm256_permutevar8x32_epi32 (i1, i2); + i4 = _mm256_permute4x64_epi64 (i1, 12); + d2 = _mm256_permute4x64_pd (d1, 12); + f2 = _mm256_permutevar8x32_ps (f1, i2); +} + +/* { dg-final { scan-assembler-times "vxorps" 4 } } */ +/* { dg-final { scan-assembler-times "vpermd" 1 } } */ +/* { dg-final { scan-assembler-times "vpermq" 1 } } */ +/* { dg-final { scan-assembler-times "vpermpd" 1 } } */ +/* { dg-final { scan-assembler-times "vpermps" 1 } } */ + diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..b334b88194be --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c @@ -0,0 +1,73 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -mavx512vl -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + +#include + +extern __m512i i1; +extern __m256i i2; +extern __m128i i3; +extern __m512d d1, d11; +extern __m256d d2; +extern __m128d d3, d33; +extern __m512 f1, f11; +extern __m256 f2; +extern __m128 f3, f33; + +__mmask32 m32; +__mmask16 m16; +__mmask8 m8; + +void mullo_test (void) +{ + i1 = _mm512_mullo_epi64 (i1, i1); + i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1); + i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1); + i2 = _mm256_mullo_epi64 (i2, i2); + i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2); + i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2); + i3 = _mm_mullo_epi64 (i3, i3); + i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3); + i3 = _mm_maskz_mullo_epi64 (m8, i3, i3); +} + +void range_test (void) +{ + d1 = _mm512_range_pd (d1, d11, 15); + d11 = _mm512_range_round_pd (d11, d1, 15, 8); + d1 = _mm512_mask_range_pd (d1, m8, d11, d11, 15); + d11 = _mm512_mask_range_round_pd (d11, m8, d1, d1, 15, 8); + d1 = _mm512_maskz_range_pd (m8, d11, d11, 15); + d11 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8); + d2 = _mm256_range_pd (d2, d2, 15); + d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15); + d2 = _mm256_maskz_range_pd (m8, d2, d2, 15); + d3 = _mm_range_pd (d3, d3, 15); + d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15); + d3 = _mm_maskz_range_pd (m8, d3, d3, 15); + d33 = _mm_range_sd (d33, d33, 15); + d33 = _mm_mask_range_sd (d33, m8, d33, d33, 15); + d33 = _mm_maskz_range_sd (m8, d33, d33, 15); + + f1 = _mm512_range_ps (f1, f11, 15); + f11 = _mm512_range_round_ps (f11, f1, 15, 8); + f1 = _mm512_mask_range_ps (f1, m16, f11, f11, 15); + f11 = _mm512_mask_range_round_ps (f11, m16, f1, f1, 15, 8); + f1 = _mm512_maskz_range_ps (m16, f11, f11, 15); + f11 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8); + f2 = _mm256_range_ps (f2, f2, 15); + f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15); + f2 = _mm256_maskz_range_ps (m8, f2, f2, 15); + f3 = _mm_range_ps (f3, f3, 15); + f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15); + f3 = _mm_maskz_range_ps (m8, f3, f3, 15); + f33 = _mm_range_ss (f33, f33, 15); + f33 = _mm_mask_range_ss (f33, m8, f33, f33, 15); + f33 = _mm_maskz_range_ss (m8, f33, f33, 15); +} + +/* { dg-final { scan-assembler-times "vxorps" 26 } } */ +/* { dg-final { scan-assembler-times "vpmullq" 9 } } */ +/* { dg-final { scan-assembler-times "vrangepd" 12 } } */ +/* { dg-final { scan-assembler-times "vrangesd" 3 } } */ +/* { dg-final { scan-assembler-times "vrangeps" 12 } } */ +/* { dg-final { scan-assembler-times "vrangess" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..26e4ba7e9696 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dep-for-glc.c @@ -0,0 +1,103 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + +#include + +extern __m512i i1, i2, i3; +extern __m512d d1, d11, *pd1; +extern __m128d d2; +extern __m512 f1, *pf1; +extern __m128 f2; +volatile __m512d *pd11; + +__mmask16 m16; +__mmask8 m8; + +void vperm_test (void) +{ + d1 = _mm512_permutex_pd (d1, 12); + d1 = _mm512_mask_permutex_pd (d1, m8, d1, 13); + d1 = _mm512_maskz_permutex_pd (m8, d1, 14); + d11 = _mm512_permutexvar_pd (i1, d11); + d11 = _mm512_mask_permutexvar_pd (d11, m8, i2, d11); + d11 = _mm512_maskz_permutexvar_pd (m8, i3, d11); + + f1 = _mm512_permutexvar_ps (i1, f1); + f1 = _mm512_mask_permutexvar_ps (f1, m16, i1, f1); + f1 = _mm512_maskz_permutexvar_ps (m16, i1, f1); + + i3 = _mm512_permutexvar_epi64 (i3, i3); + i3 = _mm512_mask_permutexvar_epi64 (i3, m8, i1, i1); + i3 = _mm512_maskz_permutexvar_epi64 (m8, i3, i1); + i1 = _mm512_permutex_epi64 (i3, 12); + i1 = _mm512_mask_permutex_epi64 (i1, m8, i1, 12); + i1 = _mm512_maskz_permutex_epi64 (m8, i1, 12); + + i2 = _mm512_permutexvar_epi32 (i2, i2); + i2 = _mm512_mask_permutexvar_epi32 (i2, m16, i2, i2); + i3 = _mm512_maskz_permutexvar_epi32 (m16, i3, i3); +} + +void getmant_test (void) +{ + d1 = _mm512_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm512_getmant_round_pd (*pd11, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + d1 = _mm512_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm512_mask_getmant_round_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + d1 = _mm512_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm512_maskz_getmant_round_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f1 = _mm512_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm512_getmant_round_ps (*pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f1 = _mm512_mask_getmant_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm512_mask_getmant_round_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f1 = _mm512_maskz_getmant_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm512_maskz_getmant_round_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + + d2 = _mm_getmant_sd (d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_getmant_round_sd (d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + d2 = _mm_mask_getmant_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_mask_getmant_round_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + d2 = _mm_maskz_getmant_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_maskz_getmant_round_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f2 = _mm_getmant_ss (f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_getmant_round_ss (f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f2 = _mm_mask_getmant_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_mask_getmant_round_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f2 = _mm_maskz_getmant_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_maskz_getmant_round_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + +} + +/* { dg-final { scan-assembler-times "vxorps" 22 } } */ +/* { dg-final { scan-assembler-times "vpermd" 3 } } */ +/* { dg-final { scan-assembler-times "vpermq" 6 } } */ +/* { dg-final { scan-assembler-times "vpermps" 3 } } */ +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantsd" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantss" 6 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..990d65b09044 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + +#include + +extern __m512h h1; +extern __m256h h2; +extern __m128h h3; + +__mmask32 m32; +__mmask16 m16; +__mmask8 m8; + +void complex_mul_test (void) +{ + h1 = _mm512_fmul_pch (h1, h1); + h1 = _mm512_fmul_round_pch (h1, h1, 8); + h1 = _mm512_mask_fmul_pch (h1, m32, h1, h1); + h1 = _mm512_mask_fmul_round_pch (h1, m32, h1, h1, 8); + h1 = _mm512_maskz_fmul_pch (m32, h1, h1); + h1 = _mm512_maskz_fmul_round_pch (m32, h1, h1, 11); + + h3 = _mm_fmul_sch (h3, h3); + h3 = _mm_fmul_round_sch (h3, h3, 8); + h3 = _mm_mask_fmul_sch (h3, m8, h3, h3); + h3 = _mm_mask_fmul_round_sch (h3, m8, h3, h3, 8); + h3 = _mm_maskz_fmul_sch (m8, h3, h3); + h3 = _mm_maskz_fmul_round_sch (m8, h3, h3, 11); +} + +void vgetmant_test (void) +{ + h3 = _mm_getmant_sh (h3, h3, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + h3 = _mm_mask_getmant_sh (h3, m8, h3, h3, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + h3 = _mm_maskz_getmant_sh (m8, h3, h3, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); +} + +/* { dg-final { scan-assembler-times "vxorps" 10 } } */ +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */ +/* { dg-final { scan-assembler-times "vfmulcsh" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantsh" 3 } } */ + diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..775d88abe046 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + +#include + +extern __m256h h1; +extern __m128h h2; + +__mmask16 m16; +__mmask8 m8; + +void complex_mul_test (void) +{ + h1 = _mm256_fmul_pch (h1, h1); + h1 = _mm256_mask_fmul_pch (h1, m16, h1, h1); + h1 = _mm256_maskz_fmul_pch (m16, h1, h1); + h2 = _mm_fmul_pch (h2, h2); + h2 = _mm_mask_fmul_pch (h2, m16, h2, h2); + h2 = _mm_maskz_fmul_pch (m16, h2, h2); +} + +/* { dg-final { scan-assembler-times "vxorps" 4 } } */ +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */ + diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..37d3ba51452f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dep-for-glc.c @@ -0,0 +1,76 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mtune=generic -mavx512vl -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + + +#include + +extern __m256i i1, i2, i3; +extern __m256d d1, d11, *pd1; +extern __m128d d2, *pd2; +extern __m256 f1, *pf1; +extern __m128 f2, *pf2; + +__mmask16 m16; +__mmask8 m8; + +void vperm_test (void) +{ + d1 = _mm256_permutex_pd (d1, 12); + d1 = _mm256_mask_permutex_pd (d1, m8, d1, 12); + d1 = _mm256_maskz_permutex_pd (m8, d1, 12); + d11 = _mm256_permutexvar_pd (i1, d11); + d11 = _mm256_mask_permutexvar_pd (d11, m8, i1, d11); + d11 = _mm256_maskz_permutexvar_pd (m8, i1, d11); + + f1 = _mm256_permutexvar_ps (i1, f1); + f1 = _mm256_mask_permutexvar_ps (f1, m8, i1, f1); + f1 = _mm256_maskz_permutexvar_ps (m8, i1, f1); + + i1 = _mm256_permutexvar_epi64 (i1, i1); + i1 = _mm256_mask_permutexvar_epi64 (i1, m8, i1, i1); + i1 = _mm256_maskz_permutexvar_epi64 (m8, i1, i1); + i1 = _mm256_permutex_epi64 (i1, 12); + i1 = _mm256_mask_permutex_epi64 (i1, m8, i1, 12); + i1 = _mm256_maskz_permutex_epi64 (m8, i1, 12); + + i2 = _mm256_permutexvar_epi32 (i2, i2); + i2 = _mm256_mask_permutexvar_epi32 (i2, m8, i2, i2); + i3 = _mm256_maskz_permutexvar_epi32 (m8, i3, i3); +} + +void getmant_test (void) +{ + d1 = _mm256_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm256_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm256_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_getmant_pd (*pd2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_mask_getmant_pd (d2, m8, *pd2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_maskz_getmant_pd (m8, *pd2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm256_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm256_mask_getmant_ps (f1, m8, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm256_maskz_getmant_ps (m8, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_getmant_ps (*pf2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_mask_getmant_ps (f2, m8, *pf2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_maskz_getmant_ps (m8, *pf2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); +} + +/* { dg-final { scan-assembler-times "vxorps" 19 } } */ +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */ +/* { dg-final { scan-assembler-times "vpermps" 3 } } */ +/* { dg-final { scan-assembler-times "vpermq" 6 } } */ +/* { dg-final { scan-assembler-times "vpermd" 3 } } */ +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */ +