From 13c556d6ae84be3ee2bc245a56eafa58221de86a Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 29 Jun 2023 14:25:28 +0800 Subject: [PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0' False dependency happens when destination is only updated by pternlog. There is no false dependency when destination is also used in source. So either a pxor should be inserted, or input operand should be set with constraint '0'. gcc/ChangeLog: PR target/110438 PR target/110202 * config/i386/predicates.md (int_float_vector_all_ones_operand): New predicate. * config/i386/sse.md (*vmov_constm1_pternlog_false_dep): New define_insn. (*_cvtmask2_pternlog_false_dep): Ditto. (*_cvtmask2_pternlog_false_dep): Ditto. (*_cvtmask2): Adjust to define_insn_and_split to avoid false dependence. (*_cvtmask2): Ditto. (one_cmpl2): Adjust constraint of operands 1 to '0' to avoid false dependence. (*andnot3): Ditto. (iornot3): Ditto. (*3): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110438.c: New test. * gcc.target/i386/pr100711-6.c: Adjust testcase. --- gcc/config/i386/predicates.md | 8 +- gcc/config/i386/sse.md | 145 ++++++++++++++++++--- gcc/testsuite/gcc.target/i386/pr100711-6.c | 2 +- gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++++ 4 files changed, 168 insertions(+), 17 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 7ddbe01a6f94..37d20c6303ab 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1192,12 +1192,18 @@ return false; }) -/* Return true if operand is a vector constant that is all ones. */ +/* Return true if operand is an integral vector constant that is all ones. */ (define_predicate "vector_all_ones_operand" (and (match_code "const_vector") (match_test "INTEGRAL_MODE_P (GET_MODE (op))") (match_test "op == CONSTM1_RTX (GET_MODE (op))"))) +/* Return true if operand is a vector constant that is all ones. */ +(define_predicate "int_float_vector_all_ones_operand" + (ior (match_operand 0 "vector_all_ones_operand") + (match_operand 0 "float_vector_all_ones_operand") + (match_test "op == constm1_rtx"))) + /* Return true if operand is an 128/256bit all ones vector that zero-extends to 256/512bit. */ (define_predicate "vector_all_ones_zero_extend_half_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a2099373123d..24359cd189c8 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1382,6 +1382,29 @@ ] (symbol_ref "true")))]) +; False dependency happens on destination register which is not really +; used when moving all ones to vector register +(define_split + [(set (match_operand:VMOVE 0 "register_operand") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))] + "TARGET_AVX512F && reload_completed + && ( == 64 || EXT_REX_SSE_REG_P (operands[0])) + && optimize_insn_for_speed_p ()" + [(set (match_dup 0) (match_dup 2)) + (parallel + [(set (match_dup 0) (match_dup 1)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[2] = CONST0_RTX (mode);") + +(define_insn "*vmov_constm1_pternlog_false_dep" + [(set (match_operand:VMOVE 0 "register_operand" "=v") + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "")) + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512VL || == 64" + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "evex")]) + ;; If mem_addr points to a memory region with less than whole vector size bytes ;; of accessible memory and k is a mask that would prevent reading the inaccessible ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd @@ -9336,7 +9359,7 @@ operands[3] = CONST0_RTX (mode); }") -(define_insn "*_cvtmask2" +(define_insn_and_split "*_cvtmask2" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") @@ -9346,11 +9369,35 @@ "@ vpmovm2\t{%1, %0|%0, %1} vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + "&& !TARGET_AVX512DQ && reload_completed + && optimize_function_for_speed_p (cfun)" + [(set (match_dup 0) (match_dup 4)) + (parallel + [(set (match_dup 0) + (vec_merge:VI48_AVX512VL + (match_dup 2) + (match_dup 3) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "operands[4] = CONST0_RTX (mode);" [(set_attr "isa" "avx512dq,*") (set_attr "length_immediate" "0,1") (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*_cvtmask2_pternlog_false_dep" + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + (vec_merge:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") + (match_operand:VI48_AVX512VL 3 "const0_operand") + (match_operand: 1 "register_operand" "Yk"))) + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F && !TARGET_AVX512DQ" + "vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + (define_expand "extendv2sfv2df2" [(set (match_operand:V2DF 0 "register_operand") (float_extend:V2DF @@ -17164,14 +17211,82 @@ operands[2] = force_reg (mode, operands[2]); }) -(define_insn "one_cmpl2" - [(set (match_operand:VI 0 "register_operand" "=v,v") - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] +(define_insn_and_split "one_cmpl2" + [(set (match_operand:VI 0 "register_operand" "=v,v,v") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))] "TARGET_AVX512F && (! || mode == SImode || mode == DImode)" +{ + if (! && which_alternative + && optimize_insn_for_speed_p ()) + return "#"; + + if (TARGET_AVX512VL) + return "vpternlog\t{$0x55, %1, %0, %0|%0, %0, %1, 0x55}"; + else + return "vpternlog\t{$0x55, %g1, %g0, %g0|%g0, %g0, %g1, 0x55}"; +} + "&& reload_completed && !REG_P (operands[1]) && ! + && optimize_insn_for_speed_p ()" + [(set (match_dup 0) (match_dup 3)) + (parallel + [(set (match_dup 0) + (xor:VI (match_dup 1) (match_dup 2))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] +{ + if (MEM_P (operands[1])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + { + if (GET_MODE_SIZE (mode) < 4) + { + if ( == 64 ? TARGET_AVX512BW + : (TARGET_AVX512BW && TARGET_AVX512VL) + || !EXT_REX_SSE_REG_P (operands[0])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + operands[3] = CONST0_RTX (mode); + } + else + { + if ( == 64 || TARGET_AVX512VL + || !EXT_REX_SSE_REG_P (operands[0])) + { + operands[3] = operands[1]; + operands[1] = operands[0]; + } + else + operands[3] = CONST0_RTX (mode); + } + } +} + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set (attr "mode") + (if_then_else (match_test "TARGET_AVX512VL") + (const_string "") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref " == 64 || TARGET_AVX512VL") + (const_int 1)))]) + +(define_insn "*one_cmpl2_pternlog_false_dep" + [(set (match_operand:VI 0 "register_operand" "=v,v,v") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br") + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC"))) + (unspec [(match_operand:VI 3 "register_operand" "0,0,0")] + UNSPEC_INSN_FALSE_DEP)] + "TARGET_AVX512F" { if (TARGET_AVX512VL) return "vpternlog\t{$0x55, %1, %0, %0|%0, %0, %1, 0x55}"; @@ -17224,7 +17339,7 @@ [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v") (and:VI (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br")) - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))] + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))] "TARGET_SSE && (register_operand (operands[1], mode) || register_operand (operands[2], mode))" @@ -17683,8 +17798,8 @@ [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") (ior:VI (not:VI - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr")) + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))] "( == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], mode) @@ -17708,7 +17823,7 @@ (const_string "") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "2,3") + (if_then_else (eq_attr "alternative" "0,1") (symbol_ref " == 64 || TARGET_AVX512VL") (const_string "*")))]) @@ -17716,8 +17831,8 @@ [(set (match_operand:VI 0 "register_operand" "=v,v") (not:VI (xor:VI - (match_operand:VI 1 "bcst_vector_operand" "%v,v") - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + (match_operand:VI 1 "bcst_vector_operand" "%0, 0") + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))] "( == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], mode) @@ -17736,7 +17851,7 @@ (const_string "") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") + (if_then_else (eq_attr "alternative" "0") (symbol_ref " == 64 || TARGET_AVX512VL") (const_string "*")))]) @@ -17747,8 +17862,8 @@ (define_insn "*3" [(set (match_operand:VI 0 "register_operand" "=v,v") (andor:VI - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))] + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0")) + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))] "( == 64 || TARGET_AVX512VL || (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && (register_operand (operands[1], mode) @@ -17767,7 +17882,7 @@ (const_string "") (const_string "XI"))) (set (attr "enabled") - (if_then_else (eq_attr "alternative" "1") + (if_then_else (eq_attr "alternative" "0") (symbol_ref " == 64 || TARGET_AVX512VL") (const_string "*")))]) diff --git a/gcc/testsuite/gcc.target/i386/pr100711-6.c b/gcc/testsuite/gcc.target/i386/pr100711-6.c index 7142a98f5371..808507471c96 100644 --- a/gcc/testsuite/gcc.target/i386/pr100711-6.c +++ b/gcc/testsuite/gcc.target/i386/pr100711-6.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -Os" } */ typedef int v16si __attribute__ ((vector_size (64))); typedef long long v8di __attribute__((vector_size (64))); diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c new file mode 100644 index 000000000000..11b8cc59fd24 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr110438.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */ +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */ +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */ +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */ + + +#include + +__m512i g(void) +{ + return (__m512i){ 0 } - 1; +} + +__m512i g1(__m512i* a) +{ + return ~(*a); +} + +void +foo (int* a, int* __restrict b) +{ + for (int i = 0; i != 16; i++) + { + if (b[i]) + a[i] = -1; + else + a[i] = 0; + } +} -- 2.47.2