]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Break false dependency chain on Zen5
authorJan Hubicka <hubicka@ucw.cz>
Tue, 4 Mar 2025 15:22:01 +0000 (16:22 +0100)
committerJan Hubicka <hubicka@ucw.cz>
Tue, 4 Mar 2025 15:22:51 +0000 (16:22 +0100)
Zen5 on some variants has false dependency on tzcnt, blsi, blsr and blsmsk
instructions.  Those can be tested by the following benchmark

jh@shroud:~> cat ee.c
int
main()
{
       int a = 10;
       int b = 0;
       for (int i = 0; i < 1000000000; i++)
       {
               asm volatile ("xor %0, %0": "=r" (b));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
               asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
       }
       return 0;
}
jh@shroud:~> cat bmk.sh
gcc ee.c -DBREAK -DINST=\"$1\" -O2 ; time ./a.out ; gcc ee.c -DINST=\"$1\" -O2 ; time ./a.out
jh@shroud:~> sh bmk.sh tzcnt

real    0m0.886s
user    0m0.886s
sys     0m0.000s

real    0m0.886s
user    0m0.886s
sys     0m0.000s

jh@shroud:~> sh bmk.sh blsi

real    0m0.979s
user    0m0.979s
sys     0m0.000s

real    0m2.418s
user    0m2.418s
sys     0m0.000s

jh@shroud:~> sh bmk.sh blsr

real    0m0.986s
user    0m0.986s
sys     0m0.000s

real    0m2.422s
user    0m2.421s
sys     0m0.000s
jh@shroud:~> sh bmk.sh blsmsk

real    0m0.973s
user    0m0.973s
sys     0m0.000s

real    0m2.422s
user    0m2.422s
sys     0m0.000s

We already have runable that controls tzcnt together with lzcnt and popcnt.
Since it seems that only tzcnt is affected I added new tunable to control tzcnt
only.  I also added splitters for blsi/blsr/blsmsk implemented analogously to
existing splitter for lzcnt.

The patch is neutral on SPEC. We produce blsi and blsr in some internal loops, but
they usually have same destination as source. However it is good to break the
dependency chain to avoid patogolical cases and it is quite cheap overall, so I
think we want to enable this for generic.  I will send followup patch for this.

Bootstrapped/regtested x86_64-linux, will commit it shortly.

gcc/ChangeLog:

* config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_TZCNT): New macro.
(TARGET_AVOID_FALSE_DEP_FOR_BLS): New macro.
* config/i386/i386.md (*bmi_blsi_<mode>): Add splitter for false
dependency.
(*bmi_blsi_<mode>_ccno): Add splitter for false dependency.
(*bmi_blsi_<mode>_falsedep): New pattern.
(*bmi_blsmsk_<mode>): Add splitter for false dependency.
(*bmi_blsmsk_<mode>_falsedep): New pattern.
(*bmi_blsr_<mode>): Add splitter for false dependency.
(*bmi_blsr_<mode>_cmp): Add splitter for false dependency
(*bmi_blsr_<mode>_cmp_falsedep): New pattern.
* config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT): New tune.
(X86_TUNE_AVOID_FALSE_DEP_FOR_BLS): New tune.

gcc/testsuite/ChangeLog:

* gcc.target/i386/blsi.c: New test.
* gcc.target/i386/blsmsk.c: New test.
* gcc.target/i386/blsr.c: New test.

gcc/config/i386/i386.h
gcc/config/i386/i386.md
gcc/config/i386/x86-tune.def
gcc/testsuite/gcc.target/i386/blsi.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/blsmsk.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/blsr.c [new file with mode: 0644]

index 2696bfb3a81ece702fe91eed239f55a44da7b7e8..ce29c272bc0bd5125a1a90b05704e3a27e4a1009 100644 (file)
@@ -461,6 +461,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
     ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
 #define TARGET_AVOID_FALSE_DEP_FOR_BMI \
        ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI]
+#define TARGET_AVOID_FALSE_DEP_FOR_TZCNT \
+       ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT]
+#define TARGET_AVOID_FALSE_DEP_FOR_BLS \
+       ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BLS]
 #define TARGET_ONE_IF_CONV_INSN \
        ix86_tune_features[X86_TUNE_ONE_IF_CONV_INSN]
 #define TARGET_AVOID_MFENCE ix86_tune_features[X86_TUNE_AVOID_MFENCE]
index 8575fbf40fedc33b9d4f01cb3af79e6d30c9955a..b1cd52382a80f487f1faaac49f822c133c923521 100644 (file)
        (ctz:SWI48 (match_dup 1)))]
   "TARGET_BMI"
   "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
-  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+  "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
   return "bsf{<imodesuffix>}\t{%1, %0|%0, %1}";
 }
   "(TARGET_BMI || TARGET_CPU_P (GENERIC))
-   && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI && TARGET_64BIT"
   "tzcnt{l}\t{%1, %k0|%k0, %1}"
-  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+  "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
   return "bsf{l}\t{%1, %k0|%k0, %1}";
 }
   "(TARGET_BMI || TARGET_CPU_P (GENERIC))
-   && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
    (set_attr "btver2_decode" "direct, double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsi_<mode>"
+(define_insn_and_split "*bmi_blsi_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (and:SWI48
           (neg:SWI48
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI"
   "blsi\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCNO FLAGS_REG)
+      (compare:CCNO
+        (and:SWI48
+          (neg:SWI48 (match_dup 1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
+(define_split
+  [(set (match_operand 3 "flags_reg_operand")
+       (match_operator 4 "compare_operator"
+         [(and:SWI48
+           (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand"))
+           (match_dup 1))
+          (const_int 0)]))
+   (set (match_operand:SWI48 0 "register_operand")
+       (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))]
+  "TARGET_BMI
+   && TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (match_dup 3)
+      (match_op_dup 4
+        [(and:SWI48
+           (neg:SWI48 (match_dup 1))
+           (match_dup 1))
+         (const_int 0)]))
+     (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);")
+
+; False dependency happens when destination is only updated by blsi.
+(define_insn "*bmi_blsi_<mode>_falsedep"
+  [(set (reg FLAGS_REG)
+       (compare
+         (and:SWI48
+           (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm"))
+           (match_dup 1))
+         (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+       (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_BMI && ix86_match_ccmode (insn, CCNOmode)"
+   "blsi\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+;; No need for splitter of the false dependency, since the output is unused
+;; so it will not extend dependency chain.
 (define_insn "*bmi_blsi_<mode>_ccno"
   [(set (reg FLAGS_REG)
        (compare
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsmsk_<mode>"
+(define_insn_and_split "*bmi_blsmsk_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (xor:SWI48
           (plus:SWI48
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI"
   "blsmsk\t{%1, %0|%0, %1}"
+  "TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (match_dup 0)
+          (xor:SWI48
+            (plus:SWI48 (match_dup 1) (const_int -1))
+            (match_dup 1)))
+     (clobber (reg:CC FLAGS_REG))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+; False dependency happens when destination is only updated by blmsk.
+(define_insn "*bmi_blsmsk_<mode>_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (xor:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int -1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_BMI"
+  "blsmsk\t{%1, %0|%0, %1}"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsr_<mode>"
+(define_insn_and_split "*bmi_blsr_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (and:SWI48
           (plus:SWI48
             (const_int -1))
           (match_dup 1)))
    (clobber (reg:CC FLAGS_REG))]
-   "TARGET_BMI"
-   "blsr\t{%1, %0|%0, %1}"
+  "TARGET_BMI"
+  "blsr\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCZ FLAGS_REG)
+      (compare:CCZ
+        (and:SWI48
+          (plus:SWI48 (match_dup 1) (const_int -1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1))
+                                   (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsr_<mode>_cmp"
+(define_insn_and_split "*bmi_blsr_<mode>_cmp"
   [(set (reg:CCZ FLAGS_REG)
        (compare:CCZ
          (and:SWI48
            (match_dup 1)
            (const_int -1))
          (match_dup 1)))]
+  "TARGET_BMI"
+  "blsr\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCZ FLAGS_REG)
+      (compare:CCZ
+        (and:SWI48
+          (plus:SWI48 (match_dup 1) (const_int -1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1))
+                                   (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+; False dependency happens when destination is only updated by bslr.
+(define_insn "*bmi_blsr_<mode>_cmp_falsedep"
+  [(set (reg:CCZ FLAGS_REG)
+       (compare:CCZ
+         (and:SWI48
+           (plus:SWI48
+             (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+             (const_int -1))
+           (match_dup 1))
+         (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+       (and:SWI48
+         (plus:SWI48
+           (match_dup 1)
+           (const_int -1))
+         (match_dup 1)))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
    "TARGET_BMI"
    "blsr\t{%1, %0|%0, %1}"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
+;; No need for splitter of the false dependency, since the output is unused
+;; so it will not extend dependency chain.
 (define_insn "*bmi_blsr_<mode>_ccz"
   [(set (reg:CCZ FLAGS_REG)
        (compare:CCZ
index b6e39f642e88df9f4f42859036d1fb57a4de3521..c857e769b60ec3b7580289b9a49e713bc2e88f62 100644 (file)
@@ -366,6 +366,16 @@ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
          | m_ZHAOXIN | m_GENERIC)
 
+/* X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT: Avoid false dependency
+   for tzcnt instruction (also included in X86_TUNE_AVOID_FALSE_DEP_FOR_BMI).  */
+DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT, "avoid_false_dep_for_tzcnt",
+         m_ZNVER5)
+
+/* X86_TUNE_AVOID_FALSE_DEP_FOR_BLS: Avoid false dependency
+   for blsi, blsr and blsmsk instructions.  */
+DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS, "avoid_false_dep_for_bls",
+         m_ZNVER5)
+
 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
    on hardware capabilities. Bdver3 hardware has a loop buffer which makes
    unrolling small loop less important. For, such architectures we adjust
diff --git a/gcc/testsuite/gcc.target/i386/blsi.c b/gcc/testsuite/gcc.target/i386/blsi.c
new file mode 100644 (file)
index 0000000..ee68a9d
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+void bar ();
+int
+test(int a)
+{
+       return a & -a;
+}
+int
+test2(int a)
+{
+       if (a & -a)
+               bar ();
+}
+int
+test3(int a)
+{
+       int ret = a & -a;
+       if (ret)
+               bar ();
+       return ret;
+}
+/* All three functions should produce bslr.
+   Only test and test3 needs xor to break false dependency.  */
+/* { dg-final { scan-assembler-times "blsi\[ \\t\]+" 3 } } */
+/* { dg-final { scan-assembler-times "xor" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/blsmsk.c b/gcc/testsuite/gcc.target/i386/blsmsk.c
new file mode 100644 (file)
index 0000000..9d15ac0
--- /dev/null
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+int
+test(int a)
+{
+       return (a - 1)^a;
+}
+/* { dg-final { scan-assembler-times "blsmsk\[ \\t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "xor" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/blsr.c b/gcc/testsuite/gcc.target/i386/blsr.c
new file mode 100644 (file)
index 0000000..340ebb4
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+void bar ();
+int
+test(int a)
+{
+       return a & (a - 1);
+}
+int
+test2(int a)
+{
+       if (a & (a - 1))
+               bar ();
+}
+int
+test3(int a)
+{
+       int ret = a & (a - 1);
+       if (ret)
+               bar ();
+       return ret;
+}
+/* All three functions should produce bslr.
+   Only test and test3 needs xor to break false dependency.  */
+/* { dg-final { scan-assembler-times "blsr\[ \\t\]+" 3 } } */
+/* { dg-final { scan-assembler-times "xor" 2 } } */