AArch64: Cap suggested unroll factor for small known-niters loops

author Pengfei Li <Pengfei.Li2@arm.com>

Mon, 29 Jun 2026 08:30:45 +0000 (08:30 +0000)

committer Pengfei Li <Pengfei.Li2@arm.com>

Thu, 2 Jul 2026 18:43:44 +0000 (18:43 +0000)
author Pengfei Li <Pengfei.Li2@arm.com>
Mon, 29 Jun 2026 08:30:45 +0000 (08:30 +0000)
committer Pengfei Li <Pengfei.Li2@arm.com>
Thu, 2 Jul 2026 18:43:44 +0000 (18:43 +0000)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index 3785a8e722d00740e0282471c07a768380dfb7f8..78f1eae8336c9bd4f8fc529154065b11e8394b13 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17570,7 +17570,7 @@ private:
    unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
                                  unsigned int);
    bool prefer_unrolled_loop () const;
-  unsigned int determine_suggested_unroll_factor ();
+  unsigned int determine_suggested_unroll_factor (loop_vec_info loop_vinfo);
  
    /* True if we have performed one-time initialization based on the
       vec_info.  */
@@ -19132,7 +19132,8 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops,
  }
  
  unsigned int
-aarch64_vector_costs::determine_suggested_unroll_factor ()
+aarch64_vector_costs::
+determine_suggested_unroll_factor (loop_vec_info loop_vinfo)
  {
    bool sve = m_vec_flags & VEC_ANY_SVE;
    /* If we are trying to unroll an Advanced SIMD main loop that contains
@@ -19189,6 +19190,16 @@ aarch64_vector_costs::determine_suggested_unroll_factor ()
        max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
      }
  
+  /* For loops with a known iteration count, cap the suggested unroll factor
+     to avoid redundant unrolled chunks.  Use CEIL rather than truncating
+     division so the unrolled vector loop covers all scalar iterations.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      unsigned int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
+      max_unroll_factor = MIN (max_unroll_factor, CEIL (niters, estimated_vf));
+    }
+
    /* Make sure unroll factor is power of 2.  */
    return 1 << ceil_log2 (max_unroll_factor);
  }
@@ -19380,7 +19391,8 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
      {
        m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                              m_costs[vect_body]);
-      m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+      m_suggested_unroll_factor
+       = determine_suggested_unroll_factor (loop_vinfo);
  
        /* For gather and scatters there's an additional overhead for the first
          iteration.  For low count loops they're not beneficial so model the
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c

new file mode 100644 (file)

index 0000000..8548f50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c
@@ -0,0 +1,17 @@
+/* Check that the loop is not unrolled.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+int
+foo (uint8_t *p1, uint8_t *p2)
+{
+  int sum = 0;  /* Running sum of absolute differences.  */
+  for (int i = 0; i < 10; i++)  /* Compile-time-constant trip count.  */
+    sum += abs (p1[i] - p2[i]);
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {\tld1b\t[^\n]*, mul vl} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c

new file mode 100644 (file)

index 0000000..e5f8c45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c
@@ -0,0 +1,17 @@
+/* Check that the loop is unrolled by 2 rather than 4 for small niters.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+int
+foo (uint8_t *p1, uint8_t *p2)
+{
+  int sum = 0;  /* Running sum of absolute differences.  */
+  for (int i = 0; i < 20; i++)  /* Compile-time-constant trip count.  */
+    sum += abs (p1[i] - p2[i]);
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times {\tld1b\t[^\n]*, mul vl} 2 } } */
author	Pengfei Li <Pengfei.Li2@arm.com>
	Mon, 29 Jun 2026 08:30:45 +0000 (08:30 +0000)
committer	Pengfei Li <Pengfei.Li2@arm.com>
	Thu, 2 Jul 2026 18:43:44 +0000 (18:43 +0000)
gcc/config/aarch64/aarch64.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c	[new file with mode: 0644]	patch \| blob