From 1c2575586c47f56a2e75f734af42371579516f0c Mon Sep 17 00:00:00 2001 From: wwwhhhyyy Date: Mon, 30 Aug 2021 16:41:41 +0800 Subject: [PATCH] [i386] GLC tuning: Break false dependency for dest register. For GoldenCove micro-architecture, force insert zero-idiom in asm template to break false dependency of dest register for several insns. The related insns are: VPERM/D/Q/PS/PD VRANGEPD/PS/SD/SS VGETMANTSS/SD/SH VGETMANDPS/PD - mem version only VPMULLQ VFMULCSH/PH VFCMULCSH/PH gcc/ChangeLog: * config/i386/i386.h (TARGET_DEST_FALSE_DEP_FOR_GLC): New macro. * config/i386/sse.md (__): Insert zero-idiom in output template when attr enabled, set new attribute to true for non-mask/maskz insn. (avx512fp16_sh_v8hf): Likewise. (avx512dq_mul3): Likewise. (_permvar): Likewise. (avx2_perm_1): Likewise. (avx512f_perm_1): Likewise. (avx512dq_rangep): Likewise. (avx512dq_ranges): Likewise. (_getmant): Likewise. (avx512f_vgetmant): Likewise. * config/i386/subst.md (mask3_dest_false_dep_for_glc_cond): New subst_attr. (mask4_dest_false_dep_for_glc_cond): Likewise. (mask6_dest_false_dep_for_glc_cond): Likewise. (mask10_dest_false_dep_for_glc_cond): Likewise. (maskc_dest_false_dep_for_glc_cond): Likewise. (mask_scalar4_dest_false_dep_for_glc_cond): Likewise. (mask_scalarc_dest_false_dep_for_glc_cond): Likewise. * config/i386/x86-tune.def (X86_TUNE_DEST_FALSE_DEP_FOR_GLC): New DEF_TUNE enabled for m_SAPPHIRERAPIDS and m_ALDERLAKE gcc/testsuite/ChangeLog: * gcc.target/i386/avx2-dest-false-dep-for-glc.c: New test. * gcc.target/i386/avx512dq-dest-false-dep-for-glc.c: Ditto. * gcc.target/i386/avx512f-dest-false-dep-for-glc.c: Ditto. * gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c: Ditto. * gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c: Ditto. * gcc.target/i386/avx512vl-dest-false-dep-for-glc.c: Ditto. --- gcc/config/i386/i386.h | 2 + gcc/config/i386/sse.md | 75 +++++++++++-- gcc/config/i386/subst.md | 7 ++ gcc/config/i386/x86-tune.def | 6 + .../i386/avx2-dest-false-dep-for-glc.c | 24 ++++ .../i386/avx512dq-dest-false-dep-for-glc.c | 73 +++++++++++++ .../i386/avx512f-dest-false-dep-for-glc.c | 103 ++++++++++++++++++ .../i386/avx512fp16-dest-false-dep-for-glc.c | 45 ++++++++ .../avx512fp16vl-dest-false-dep-for-glc.c | 24 ++++ .../i386/avx512vl-dest-false-dep-for-glc.c | 76 +++++++++++++ 10 files changed, 427 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-dest-false-dep-for-glc.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dep-for-glc.c diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 3ac0f698ae2b..f1bb8a868f17 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -429,6 +429,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_EXPAND_ABS] #define TARGET_V2DF_REDUCTION_PREFER_HADDPD \ ix86_tune_features[X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD] +#define TARGET_DEST_FALSE_DEP_FOR_GLC \ + ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 50dc5da9a383..ea72aa5d9ac4 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -6536,7 +6536,12 @@ (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "")] UNSPEC_COMPLEX_F_C_MUL))] "TARGET_AVX512FP16 && " - "v\t{%2, %1, %0|%0, %1, %2}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && ) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "v\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "ssemul") (set_attr "mode" "")]) @@ -6742,7 +6747,12 @@ (match_dup 1) (const_int 3)))] "TARGET_AVX512FP16" - "vsh\t{%2, %1, %0|%0, %1, %2}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && ) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vsh\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "ssemul") (set_attr "mode" "V8HF")]) @@ -15207,7 +15217,14 @@ (match_operand:VI8_AVX512VL 2 "bcst_vector_operand" "vmBr")))] "TARGET_AVX512DQ && && ix86_binary_operator_ok (MULT, mode, operands)" - "vpmullq\t{%2, %1, %0|%0, %1, %2}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vpmullq\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "sseimul") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -24636,7 +24653,14 @@ (match_operand: 2 "register_operand" "v")] UNSPEC_VPERMVAR))] "TARGET_AVX2 && " - "vperm\t{%1, %2, %0|%0, %2, %1}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vperm\t{%1, %2, %0|%0, %2, %1}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "") (set_attr "mode" "")]) @@ -24873,6 +24897,10 @@ mask |= INTVAL (operands[4]) << 4; mask |= INTVAL (operands[5]) << 6; operands[2] = GEN_INT (mask); + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); return "vperm\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog") @@ -24944,6 +24972,10 @@ mask |= INTVAL (operands[4]) << 4; mask |= INTVAL (operands[5]) << 6; operands[2] = GEN_INT (mask); + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); return "vperm\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog") @@ -26843,7 +26875,14 @@ (match_operand:SI 3 "const_0_to_15_operand")] UNSPEC_RANGE))] "TARGET_AVX512DQ && " - "vrange\t{%3, %2, %1, %0|%0, %1, %2, %3}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vrange\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -26859,7 +26898,14 @@ (match_dup 1) (const_int 1)))] "TARGET_AVX512DQ" - "vrange\t{%3, %2, %1, %0|%0, %1, %2, %3}" +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vrange\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -26899,7 +26945,13 @@ (match_operand:SI 2 "const_0_to_15_operand")] UNSPEC_GETMANT))] "TARGET_AVX512F" - "vgetmant\t{%2, %1, %0|%0, %1, %2}"; +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && MEM_P (operands[1])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vgetmant\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -26914,7 +26966,14 @@ (match_dup 1) (const_int 1)))] "TARGET_AVX512F" - "vgetmant\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +{ + if (TARGET_DEST_FALSE_DEP_FOR_GLC + && + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])) + output_asm_insn ("vxorps\t{%x0, %x0, %x0}", operands); + return "vgetmant\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} [(set_attr "prefix" "evex") (set_attr "mode" "")]) diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 21d445cc46c8..bb86f82905f8 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -71,6 +71,11 @@ (define_subst_attr "mask_prefix4" "mask" "orig,orig,vex" "evex,evex,evex") (define_subst_attr "bcst_mask_prefix4" "mask" "orig,orig,maybe_evex" "evex,evex,evex") (define_subst_attr "mask_expand_op3" "mask" "3" "5") +(define_subst_attr "mask3_dest_false_dep_for_glc_cond" "mask" "1" "operands[3] == CONST0_RTX(mode)") +(define_subst_attr "mask4_dest_false_dep_for_glc_cond" "mask" "1" "operands[4] == CONST0_RTX(mode)") +(define_subst_attr "mask6_dest_false_dep_for_glc_cond" "mask" "1" "operands[6] == CONST0_RTX(mode)") +(define_subst_attr "mask10_dest_false_dep_for_glc_cond" "mask" "1" "operands[10] == CONST0_RTX(mode)") +(define_subst_attr "maskc_dest_false_dep_for_glc_cond" "maskc" "1" "operands[3] == CONST0_RTX(mode)") (define_subst "mask" [(set (match_operand:SUBST_V 0) @@ -337,6 +342,8 @@ (define_subst_attr "mask_scalar_operand3" "mask_scalar" "" "%{%4%}%N3") (define_subst_attr "mask_scalar_operand4" "mask_scalar" "" "%{%5%}%N4") (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4") +(define_subst_attr "mask_scalar4_dest_false_dep_for_glc_cond" "mask_scalar" "1" "operands[4] == CONST0_RTX(mode)") +(define_subst_attr "mask_scalarc_dest_false_dep_for_glc_cond" "mask_scalarc" "1" "operands[3] == CONST0_RTX(V8HFmode)") (define_subst "mask_scalar" [(set (match_operand:SUBST_V 0) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 0d3fd0780681..f9eb3c227c11 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -79,6 +79,12 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY, m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC) +/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before + several insns to break false dependency on the dest register for GLC + micro-architecture. */ +DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC, + "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_ALDERLAKE) + /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies are resolved on SSE register parts instead of whole registers, so we may maintain just lower part of scalar values in proper format leaving the diff --git a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..787b1d08f804 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx2 -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + + +#include + +extern __m256i i1, i2, i3, i4; +extern __m256d d1, d2; +extern __m256 f1, f2; + +void vperm_test (void) +{ + i3 = _mm256_permutevar8x32_epi32 (i1, i2); + i4 = _mm256_permute4x64_epi64 (i1, 12); + d2 = _mm256_permute4x64_pd (d1, 12); + f2 = _mm256_permutevar8x32_ps (f1, i2); +} + +/* { dg-final { scan-assembler-times "vxorps" 4 } } */ +/* { dg-final { scan-assembler-times "vpermd" 1 } } */ +/* { dg-final { scan-assembler-times "vpermq" 1 } } */ +/* { dg-final { scan-assembler-times "vpermpd" 1 } } */ +/* { dg-final { scan-assembler-times "vpermps" 1 } } */ + diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..b334b88194be --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c @@ -0,0 +1,73 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -mavx512vl -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + +#include + +extern __m512i i1; +extern __m256i i2; +extern __m128i i3; +extern __m512d d1, d11; +extern __m256d d2; +extern __m128d d3, d33; +extern __m512 f1, f11; +extern __m256 f2; +extern __m128 f3, f33; + +__mmask32 m32; +__mmask16 m16; +__mmask8 m8; + +void mullo_test (void) +{ + i1 = _mm512_mullo_epi64 (i1, i1); + i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1); + i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1); + i2 = _mm256_mullo_epi64 (i2, i2); + i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2); + i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2); + i3 = _mm_mullo_epi64 (i3, i3); + i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3); + i3 = _mm_maskz_mullo_epi64 (m8, i3, i3); +} + +void range_test (void) +{ + d1 = _mm512_range_pd (d1, d11, 15); + d11 = _mm512_range_round_pd (d11, d1, 15, 8); + d1 = _mm512_mask_range_pd (d1, m8, d11, d11, 15); + d11 = _mm512_mask_range_round_pd (d11, m8, d1, d1, 15, 8); + d1 = _mm512_maskz_range_pd (m8, d11, d11, 15); + d11 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8); + d2 = _mm256_range_pd (d2, d2, 15); + d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15); + d2 = _mm256_maskz_range_pd (m8, d2, d2, 15); + d3 = _mm_range_pd (d3, d3, 15); + d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15); + d3 = _mm_maskz_range_pd (m8, d3, d3, 15); + d33 = _mm_range_sd (d33, d33, 15); + d33 = _mm_mask_range_sd (d33, m8, d33, d33, 15); + d33 = _mm_maskz_range_sd (m8, d33, d33, 15); + + f1 = _mm512_range_ps (f1, f11, 15); + f11 = _mm512_range_round_ps (f11, f1, 15, 8); + f1 = _mm512_mask_range_ps (f1, m16, f11, f11, 15); + f11 = _mm512_mask_range_round_ps (f11, m16, f1, f1, 15, 8); + f1 = _mm512_maskz_range_ps (m16, f11, f11, 15); + f11 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8); + f2 = _mm256_range_ps (f2, f2, 15); + f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15); + f2 = _mm256_maskz_range_ps (m8, f2, f2, 15); + f3 = _mm_range_ps (f3, f3, 15); + f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15); + f3 = _mm_maskz_range_ps (m8, f3, f3, 15); + f33 = _mm_range_ss (f33, f33, 15); + f33 = _mm_mask_range_ss (f33, m8, f33, f33, 15); + f33 = _mm_maskz_range_ss (m8, f33, f33, 15); +} + +/* { dg-final { scan-assembler-times "vxorps" 26 } } */ +/* { dg-final { scan-assembler-times "vpmullq" 9 } } */ +/* { dg-final { scan-assembler-times "vrangepd" 12 } } */ +/* { dg-final { scan-assembler-times "vrangesd" 3 } } */ +/* { dg-final { scan-assembler-times "vrangeps" 12 } } */ +/* { dg-final { scan-assembler-times "vrangess" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..26e4ba7e9696 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-dest-false-dep-for-glc.c @@ -0,0 +1,103 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + +#include + +extern __m512i i1, i2, i3; +extern __m512d d1, d11, *pd1; +extern __m128d d2; +extern __m512 f1, *pf1; +extern __m128 f2; +volatile __m512d *pd11; + +__mmask16 m16; +__mmask8 m8; + +void vperm_test (void) +{ + d1 = _mm512_permutex_pd (d1, 12); + d1 = _mm512_mask_permutex_pd (d1, m8, d1, 13); + d1 = _mm512_maskz_permutex_pd (m8, d1, 14); + d11 = _mm512_permutexvar_pd (i1, d11); + d11 = _mm512_mask_permutexvar_pd (d11, m8, i2, d11); + d11 = _mm512_maskz_permutexvar_pd (m8, i3, d11); + + f1 = _mm512_permutexvar_ps (i1, f1); + f1 = _mm512_mask_permutexvar_ps (f1, m16, i1, f1); + f1 = _mm512_maskz_permutexvar_ps (m16, i1, f1); + + i3 = _mm512_permutexvar_epi64 (i3, i3); + i3 = _mm512_mask_permutexvar_epi64 (i3, m8, i1, i1); + i3 = _mm512_maskz_permutexvar_epi64 (m8, i3, i1); + i1 = _mm512_permutex_epi64 (i3, 12); + i1 = _mm512_mask_permutex_epi64 (i1, m8, i1, 12); + i1 = _mm512_maskz_permutex_epi64 (m8, i1, 12); + + i2 = _mm512_permutexvar_epi32 (i2, i2); + i2 = _mm512_mask_permutexvar_epi32 (i2, m16, i2, i2); + i3 = _mm512_maskz_permutexvar_epi32 (m16, i3, i3); +} + +void getmant_test (void) +{ + d1 = _mm512_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm512_getmant_round_pd (*pd11, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + d1 = _mm512_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm512_mask_getmant_round_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + d1 = _mm512_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm512_maskz_getmant_round_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f1 = _mm512_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm512_getmant_round_ps (*pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f1 = _mm512_mask_getmant_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm512_mask_getmant_round_ps (f1, m16, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f1 = _mm512_maskz_getmant_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm512_maskz_getmant_round_ps (m16, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + + d2 = _mm_getmant_sd (d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_getmant_round_sd (d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + d2 = _mm_mask_getmant_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_mask_getmant_round_sd (d2, m8, d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + d2 = _mm_maskz_getmant_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_maskz_getmant_round_sd (m8, d2, d2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f2 = _mm_getmant_ss (f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_getmant_round_ss (f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f2 = _mm_mask_getmant_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_mask_getmant_round_ss (f2, m8, f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + f2 = _mm_maskz_getmant_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_maskz_getmant_round_ss (m8, f2, f2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src, 8); + +} + +/* { dg-final { scan-assembler-times "vxorps" 22 } } */ +/* { dg-final { scan-assembler-times "vpermd" 3 } } */ +/* { dg-final { scan-assembler-times "vpermq" 6 } } */ +/* { dg-final { scan-assembler-times "vpermps" 3 } } */ +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantsd" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantss" 6 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..990d65b09044 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mtune=generic -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + +#include + +extern __m512h h1; +extern __m256h h2; +extern __m128h h3; + +__mmask32 m32; +__mmask16 m16; +__mmask8 m8; + +void complex_mul_test (void) +{ + h1 = _mm512_fmul_pch (h1, h1); + h1 = _mm512_fmul_round_pch (h1, h1, 8); + h1 = _mm512_mask_fmul_pch (h1, m32, h1, h1); + h1 = _mm512_mask_fmul_round_pch (h1, m32, h1, h1, 8); + h1 = _mm512_maskz_fmul_pch (m32, h1, h1); + h1 = _mm512_maskz_fmul_round_pch (m32, h1, h1, 11); + + h3 = _mm_fmul_sch (h3, h3); + h3 = _mm_fmul_round_sch (h3, h3, 8); + h3 = _mm_mask_fmul_sch (h3, m8, h3, h3); + h3 = _mm_mask_fmul_round_sch (h3, m8, h3, h3, 8); + h3 = _mm_maskz_fmul_sch (m8, h3, h3); + h3 = _mm_maskz_fmul_round_sch (m8, h3, h3, 11); +} + +void vgetmant_test (void) +{ + h3 = _mm_getmant_sh (h3, h3, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + h3 = _mm_mask_getmant_sh (h3, m8, h3, h3, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + h3 = _mm_maskz_getmant_sh (m8, h3, h3, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); +} + +/* { dg-final { scan-assembler-times "vxorps" 10 } } */ +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */ +/* { dg-final { scan-assembler-times "vfmulcsh" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantsh" 3 } } */ + diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..775d88abe046 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-dest-false-dep-for-glc.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512fp16 -mavx512vl -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + +#include + +extern __m256h h1; +extern __m128h h2; + +__mmask16 m16; +__mmask8 m8; + +void complex_mul_test (void) +{ + h1 = _mm256_fmul_pch (h1, h1); + h1 = _mm256_mask_fmul_pch (h1, m16, h1, h1); + h1 = _mm256_maskz_fmul_pch (m16, h1, h1); + h2 = _mm_fmul_pch (h2, h2); + h2 = _mm_mask_fmul_pch (h2, m16, h2, h2); + h2 = _mm_maskz_fmul_pch (m16, h2, h2); +} + +/* { dg-final { scan-assembler-times "vxorps" 4 } } */ +/* { dg-final { scan-assembler-times "vfmulcph" 6 } } */ + diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dep-for-glc.c b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dep-for-glc.c new file mode 100644 index 000000000000..37d3ba51452f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512vl-dest-false-dep-for-glc.c @@ -0,0 +1,76 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mtune=generic -mavx512vl -mtune-ctrl=dest_false_dep_for_glc -O2" } */ + + +#include + +extern __m256i i1, i2, i3; +extern __m256d d1, d11, *pd1; +extern __m128d d2, *pd2; +extern __m256 f1, *pf1; +extern __m128 f2, *pf2; + +__mmask16 m16; +__mmask8 m8; + +void vperm_test (void) +{ + d1 = _mm256_permutex_pd (d1, 12); + d1 = _mm256_mask_permutex_pd (d1, m8, d1, 12); + d1 = _mm256_maskz_permutex_pd (m8, d1, 12); + d11 = _mm256_permutexvar_pd (i1, d11); + d11 = _mm256_mask_permutexvar_pd (d11, m8, i1, d11); + d11 = _mm256_maskz_permutexvar_pd (m8, i1, d11); + + f1 = _mm256_permutexvar_ps (i1, f1); + f1 = _mm256_mask_permutexvar_ps (f1, m8, i1, f1); + f1 = _mm256_maskz_permutexvar_ps (m8, i1, f1); + + i1 = _mm256_permutexvar_epi64 (i1, i1); + i1 = _mm256_mask_permutexvar_epi64 (i1, m8, i1, i1); + i1 = _mm256_maskz_permutexvar_epi64 (m8, i1, i1); + i1 = _mm256_permutex_epi64 (i1, 12); + i1 = _mm256_mask_permutex_epi64 (i1, m8, i1, 12); + i1 = _mm256_maskz_permutex_epi64 (m8, i1, 12); + + i2 = _mm256_permutexvar_epi32 (i2, i2); + i2 = _mm256_mask_permutexvar_epi32 (i2, m8, i2, i2); + i3 = _mm256_maskz_permutexvar_epi32 (m8, i3, i3); +} + +void getmant_test (void) +{ + d1 = _mm256_getmant_pd (*pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm256_mask_getmant_pd (d1, m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d1 = _mm256_maskz_getmant_pd (m8, *pd1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_getmant_pd (*pd2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_mask_getmant_pd (d2, m8, *pd2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + d2 = _mm_maskz_getmant_pd (m8, *pd2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm256_getmant_ps (*pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm256_mask_getmant_ps (f1, m8, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f1 = _mm256_maskz_getmant_ps (m8, *pf1, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_getmant_ps (*pf2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_mask_getmant_ps (f2, m8, *pf2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); + f2 = _mm_maskz_getmant_ps (m8, *pf2, _MM_MANT_NORM_p75_1p5, + _MM_MANT_SIGN_src); +} + +/* { dg-final { scan-assembler-times "vxorps" 19 } } */ +/* { dg-final { scan-assembler-times "vpermpd" 6 } } */ +/* { dg-final { scan-assembler-times "vpermps" 3 } } */ +/* { dg-final { scan-assembler-times "vpermq" 6 } } */ +/* { dg-final { scan-assembler-times "vpermd" 3 } } */ +/* { dg-final { scan-assembler-times "vgetmantpd" 6 } } */ +/* { dg-final { scan-assembler-times "vgetmantps" 6 } } */ + -- 2.47.2