support unrolling of the inner loop independently from the outer loop during
outer-loop vectorization, which tends to lead to pipeline bubbles. */
bool m_loop_fully_scalar_dup = false;
+
+ /* If m_loop_fully_scalar_dup is true then this variable contains the number
+ of statements we estimate to be duplicates. We only increase the cost of
+ the seeds because we don't want to overly pessimize the loops. */
+ uint64_t m_num_dup_stmts = 0;
+
+ /* If m_loop_fully_scalar_dup is true, this contains the total number of
+ leaf stmts found in the SLP tree. */
+ uint64_t m_num_total_stmts = 0;
};
aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
}
}
+/* Determine probabilistically whether STMT is one that could possibly be
+ made into a by-lane operation later on. We can't be sure, but certain
+ operations have a higher chance. */
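+/* For example, a multiply whose duplicated operand ends up last can often
+ use the indexed (by-element) form of the instruction, e.g.
+ fmul v0.4s, v1.4s, v2.s[0], instead of needing an explicit dup of the
+ invariant; similar by-element forms exist for mla/fmla and sdot/udot. */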
+
+static bool
+aarch64_possible_by_lane_insn_p (vec_info *m_vinfo, gimple *stmt)
+{
+ if (!gimple_has_lhs (stmt))
+ return false;
+
+ use_operand_p use_p;
+ imm_use_iterator iter;
+ tree var = gimple_get_lhs (stmt);
+ FOR_EACH_IMM_USE_FAST (use_p, iter, var)
+ {
+ gimple *new_stmt = USE_STMT (use_p);
+ stmt_vec_info use_info = m_vinfo->lookup_stmt (new_stmt);
+ /* Skip uses outside the region being vectorized, e.g. debug stmts,
+ which have no stmt_info. */
+ if (!use_info)
+ continue;
+ auto stmt_info = vect_stmt_to_vectorize (use_info);
+ auto rep_stmt = STMT_VINFO_STMT (stmt_info);
+ /* Re-association is a problem here, since the lane operand is only
+ supported as the last operand of these instructions; as such we put
+ duplicate operands last. We could check the other operand for
+ invariance, but it may not be an outer-loop defined invariant. For now
+ just checking the last operand catches all the cases and we can expand
+ if needed. */
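+ /* For instance, for t_2 = x_1 * inv_3 where inv_3 is the duplicated
+ value we match because inv_3 is rhs2; inv_3 * x_1 would not be
+ recognized. (Illustrative SSA names, not from a real dump.) */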
+ if (is_gimple_assign (rep_stmt))
+ switch (gimple_assign_rhs_code (rep_stmt))
+ {
+ case MULT_EXPR:
+ return operand_equal_p (gimple_assign_rhs2 (rep_stmt), var, 0);
+ case DOT_PROD_EXPR:
+ return operand_equal_p (gimple_assign_rhs3 (rep_stmt), var, 0);
+ default:
+ continue;
+ }
+ }
+ return false;
+}
+
unsigned
aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
analyze_loop_vinfo (loop_vinfo);
m_analyzed_vinfo = true;
- if (in_inner_loop_p)
+
+ /* We should only apply the heuristic for invariant values to the
+ innermost loop of a loop nest. */
+ if (in_inner_loop_p && loop_vinfo)
m_loop_fully_scalar_dup = true;
}
try to vectorize. */
if (in_inner_loop_p
&& node
- && m_loop_fully_scalar_dup
&& SLP_TREE_LANES (node) == 1
&& !SLP_TREE_CHILDREN (node).exists ())
{
- /* Check if load is a duplicate. */
- if (gimple_vuse (stmt_info->stmt)
- && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_INVARIANT)
- ;
- else if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
- || SLP_TREE_DEF_TYPE (node) == vect_external_def)
- ;
+ m_num_total_stmts++;
+ gimple *stmt = STMT_VINFO_STMT (stmt_info);
+ /* Check if the load is an invariant that will be duplicated inside the
+ current loop. */
+ if (gimple_vuse (stmt)
+ && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_INVARIANT
+ && !aarch64_possible_by_lane_insn_p (m_vinfo, stmt))
+ m_num_dup_stmts++;
+ else if ((SLP_TREE_DEF_TYPE (node) == vect_constant_def
+ || SLP_TREE_DEF_TYPE (node) == vect_external_def)
+ && !aarch64_possible_by_lane_insn_p (m_vinfo, stmt))
+ m_num_dup_stmts++;
else
m_loop_fully_scalar_dup = false;
}
threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
/* Increase the cost of the vector code if it looks like the vector code has
- limited throughput due to outer-loop vectorization. */
- if (m_loop_fully_scalar_dup)
+ limited throughput due to outer-loop vectorization. As a rough estimate we
+ require at least half of the operations to be duplicate expressions. This
+ is an attempt to strike a balance between scalar and vector costing with
+ respect to outer-loop vectorization. The vectorizer applies a rather large
+ penalty to scalar costing when doing outer-loop vectorization (see
+ LOOP_VINFO_INNER_LOOP_COST_FACTOR) and because of this accurate costing
+ becomes rather hard. The 50% threshold allows us to amortize the cost over
+ longer loop bodies where the majority of the inputs are not broadcasts. */
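+ /* For instance, a body with 10 leaf stmts of which 6 are duplicates
+ trips the penalty below (6 * 2 >= 10), while one with only 4
+ duplicates does not. (Illustrative counts only.) */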
+ if (m_loop_fully_scalar_dup
+ && (m_num_dup_stmts * 2 >= m_num_total_stmts))
{
body_cost *= estimated_vf;
if (dump_enabled_p ())
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mcpu=neoverse-v2 -fdump-tree-vect-all -w" } */
+
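+/* Check that outer-loop vectorization happens and that the duplicate-splat
+ throughput penalty is not applied: the invariant loads here feed
+ multiplies that may later use by-lane instruction forms, so they should
+ not be counted as costly duplicates. */
+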
+#include <cstddef>
+#include <cstdint>
+
+std::ptrdiff_t getRunReps ();
+
+double * get_rp ();
+std::ptrdiff_t get_it ();
+
+void
+runSeqVariant ()
+{
+ const std::ptrdiff_t run_reps = getRunReps ();
+
+ double *__restrict__ B = get_rp ();
+ double *__restrict__ D = get_rp ();
+ double *__restrict__ M = get_rp ();
+ std::ptrdiff_t NE = get_it ();
+
+ for (volatile int irep = 0; irep < run_reps; ++irep)
+ {
+ for (int e = 0; e < NE; ++e)
+ {
+ double s_B[5][4];
+
+ for (int d = 0; d < 4; d++)
+ for (int q = 0; q < 5; q++)
+ s_B[q][d] = B[q + 5 * d];
+
+ double s_D[5][5][5];
+
+ for (int k1 = 0; k1 < 5; k1++)
+ for (int k2 = 0; k2 < 5; k2++)
+ for (int k3 = 0; k3 < 5; k3++)
+ s_D[k1][k2][k3] = D[k1 + 5 * k2 + 5 * 5 * k3 + 5 * 5 * 5 * e];
+
+ for (int i1 = 0; i1 < 4; i1++)
+ for (int i2 = 0; i2 < 4; i2++)
+ for (int i3 = 0; i3 < 4; i3++)
+ for (int j1 = 0; j1 < 4; ++j1)
+ for (int j2 = 0; j2 < 4; ++j2)
+ for (int j3 = 0; j3 < 4; ++j3)
+ {
+ double val = 0.0;
+ for (int k1 = 0; k1 < 5; ++k1)
+ for (int k2 = 0; k2 < 5; ++k2)
+ for (int k3 = 0; k3 < 5; ++k3)
+ val += s_B[k1][i1] * s_B[k1][j1] * s_B[k2][i2]
+ * s_B[k2][j2] * s_B[k3][i3] * s_B[k3][j3]
+ * s_D[k1][k2][k3];
+ // clang-format off
+ M[i1 + 4 * (i2 + 4 * (i3 + 4 * (j1 + 4 * (j2 + 4 * (j3 + 4 * e)))))] = val;
+ // clang-format on
+ }
+ }
+ }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "low throughput of per iteration due to splats" "vect" } } */