--- /dev/null
+/* PR tree-optimization/116139 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast --param fully-pipelined-fma=1 -mcpu=neoverse-n3" } */
+
+#define LOOP_COUNT 800000000
+typedef double data_e;
+
+data_e
+foo (data_e in)
+{
+ data_e a1, a2, a3, a4;
+ data_e tmp, result = 0;
+ a1 = in + 0.1;
+ a2 = in * 0.1;
+ a3 = in + 0.01;
+ a4 = in * 0.59;
+
+ data_e result2 = 0;
+
+ for (int ic = 0; ic < LOOP_COUNT; ic++)
+ {
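+ /* A complete FMA chain candidate; with --param fully-pipelined-fma=1
+    this exercises the reassociation width computation patched below.  */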
+ tmp = a1 + a2 * a2 + a3 * a3 + a4 * a4;
+ result += tmp - ic;
+ result2 = result2 / 2 - tmp;
+
+ a1 += 0.91;
+ a2 += 0.1;
+ a3 -= 0.01;
+ a4 -= 0.89;
+
+ }
+
+ return result + result2;
+}
+
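
(Commentary, not part of the patch.) The summation in the loop above is the FMA-chain shape that the width computation below partitions. As a minimal sketch, assuming a reassociation width of 2 and fully pipelined FMAs, the pass aims for something like the following; the function name and the particular chain split are illustrative only, not taken from the pass:

/* Split the sum into two independent chains so the multiplies can fold
   into FMAs: chain1 becomes FMUL + FMA, chain2 a single FMA, and one
   final FADD joins the chains, shortening the dependent-add path.  */
static double
width2_fma_sketch (double a1, double a2, double a3, double a4)
{
  double chain1 = a2 * a2 + a3 * a3;  /* FMUL, then FMA  */
  double chain2 = a1 + a4 * a4;       /* a single FMA    */
  return chain1 + chain2;             /* final FADD      */
}
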
, it is latency(MULT)*2 + latency(ADD)*2. Assuming latency(MULT) >=
latency(ADD), the first variant is preferred.
- Find out if we can get a smaller width considering FMA. */
- if (width > 1 && mult_num && param_fully_pipelined_fma)
+ Find out if we can get a smaller width considering FMA.
+ Assume FMUL and FMA use the same units that can also do FADD.
+ For other scenarios, such as when FMUL and FADD use separate units,
+ the following code may not apply. */
+
+ int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode);
+ if (width > 1 && mult_num && param_fully_pipelined_fma
+ && width_mult <= width)
{
- /* When param_fully_pipelined_fma is set, assume FMUL and FMA use the
- same units that can also do FADD. For other scenarios, such as when
- FMUL and FADD are using separated units, the following code may not
- appy. */
- int width_mult = targetm.sched.reassociation_width (MULT_EXPR, mode);
- gcc_checking_assert (width_mult <= width);
-
/* Latency of MULT_EXPRs. */
int lat_mul
= get_mult_latency_consider_fma (ops_num, mult_num, width_mult);
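
(Commentary, not part of the patch.) get_mult_latency_consider_fma estimates how much MULT latency stays exposed once the multiplies are folded into fully pipelined FMAs. The sketch below is an illustration of that idea under the same-units assumption stated in the comment above, not GCC's exact formula; the function name and parameters are assumptions:

/* Illustrative only -- not GCC's exact code.  With fully pipelined FMAs,
   a folded multiply starts as soon as its operands are ready, so roughly
   one multiply latency per chain stays on the critical path; when every
   operand is a multiply, the chain starts with a plain FMUL feeding the
   first FMA, exposing about two.  */
static int
mult_latency_sketch (int ops_num, int mult_num, int mult_latency)
{
  int exposed = (mult_num == ops_num) ? 2 : 1;
  return exposed * mult_latency;
}

The result presumably feeds the comparison that decides whether a width smaller than the FADD reassociation width already keeps the pipes busy.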