AArch64: extend cost model to cost outer loop vect where the inner loop is invariant...

author Tamar Christina <tamar.christina@arm.com>

Tue, 26 Aug 2025 12:10:10 +0000 (13:10 +0100)

committer Tamar Christina <tamar.christina@arm.com>

Tue, 26 Aug 2025 12:19:15 +0000 (13:19 +0100)
author Tamar Christina <tamar.christina@arm.com>
Tue, 26 Aug 2025 12:10:10 +0000 (13:10 +0100)
committer Tamar Christina <tamar.christina@arm.com>
Tue, 26 Aug 2025 12:19:15 +0000 (13:19 +0100)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index eb9e2cfaab098e54a5d25673b37c30291d1830c4..c5110566215374660a39f169bb5e5f0b8040ef13 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17057,6 +17057,14 @@ private:
       or vector loop.  There is one entry for each tuning option of
       interest.  */
    auto_vec<aarch64_vec_op_count, 2> m_ops;
+
+  /* When doing inner-loop vectorization the constraints on the data-refs in the
+     outer-loop could limit the inner loop references.  I.e. the outer-loop can
+     force the inner-loop to do a load and splat which will result in the loop
+     being entirely scalar as all lanes work on a duplicate.  Currently we don't
+     support unrolling the inner loop independently from the outer-loop during
+     outer-loop vectorization which tends to lead to pipeline bubbles.  */
+  bool m_loop_fully_scalar_dup = false;
  };
  
  aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
@@ -18079,6 +18087,28 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
         analyze_loop_vinfo (loop_vinfo);
  
        m_analyzed_vinfo = true;
+      if (in_inner_loop_p)
+       m_loop_fully_scalar_dup = true;
+    }
+
+  /* Detect whether the loop is working on fully duplicated lanes.  This would
+     only be possible with inner loop vectorization since otherwise we wouldn't
+     try to vectorize.  */
+  if (in_inner_loop_p
+      && node
+      && m_loop_fully_scalar_dup
+      && SLP_TREE_LANES (node) == 1
+      && !SLP_TREE_CHILDREN (node).exists ())
+    {
+      /* Check if load is a duplicate.  */
+      if (gimple_vuse (stmt_info->stmt)
+         && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_INVARIANT)
+       ;
+      else if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
+              || SLP_TREE_DEF_TYPE (node) == vect_external_def)
+       ;
+      else
+       m_loop_fully_scalar_dup = false;
      }
  
    /* Apply the heuristic described above m_stp_sequence_cost.  */
@@ -18445,8 +18475,19 @@ adjust_body_cost (loop_vec_info loop_vinfo,
    if (m_vec_flags & VEC_ANY_SVE)
      threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
  
-  if (m_num_vector_iterations >= 1
-      && m_num_vector_iterations < threshold)
+  /* Increase the cost of the vector code if it looks like the vector code has
+     limited throughput due to outer-loop vectorization.  */
+  if (m_loop_fully_scalar_dup)
+    {
+      body_cost *= estimated_vf;
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Increasing body cost to %d because vector code has"
+                        " low throughput of per iteration due to splats\n",
+                        body_cost);
+    }
+  else if (m_num_vector_iterations >= 1
+          && m_num_vector_iterations < threshold)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
diff --git a/gcc/testsuite/gcc.target/aarch64/pr121290.c b/gcc/testsuite/gcc.target/aarch64/pr121290.c

new file mode 100644 (file)

index 0000000..05aa4a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr121290.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mcpu=neoverse-v2 -fdump-tree-vect-all -std=c99" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)  /* n is unused.  */
+{
+  for (int i = 0; i < 4; ++i)  /* Outer loop: one reduction per x[i].  */
+    {
+      int res = 0;
+      for (int j = 0; j < 100; ++j)  /* Inner loop: y[j] varies with j.  */
+        res += y[j] * z[i];  /* z[i] does not depend on j.  */
+      x[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "OUTER LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "low throughput of per iteration due to splats" "vect" } } */
author	Tamar Christina <tamar.christina@arm.com>
	Tue, 26 Aug 2025 12:10:10 +0000 (13:10 +0100)
committer	Tamar Christina <tamar.christina@arm.com>
	Tue, 26 Aug 2025 12:19:15 +0000 (13:19 +0100)
gcc/config/aarch64/aarch64.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/pr121290.c	[new file with mode: 0644]	patch \| blob