git.ipfire.org Git - thirdparty/gcc.git/commitdiff
AArch64: Cap suggested unroll factor for small known-niters loops
author Pengfei Li <Pengfei.Li2@arm.com>
Mon, 29 Jun 2026 08:30:45 +0000 (08:30 +0000)
committer Pengfei Li <Pengfei.Li2@arm.com>
Thu, 2 Jul 2026 18:43:44 +0000 (18:43 +0000)
The AArch64 backend can suggest an unroll factor to the vectorizer in
order to expose more ILP. However, in some cases the suggested value is
larger than needed. For the test cases added by this patch, the AArch64
backend suggests an unroll factor of 4, but the loops only need 1 or 2
SVE vector iterations respectively to cover their 10 or 20 scalar
iterations.

This patch caps the suggested unroll factor at CEIL (niters, VF) for
loops with a small, known iteration count. CEIL is used rather than
truncating division so that the completely unrolled vector loop still
covers all scalar iterations. Reducing the unroll factor below the
number of required vector iterations could require a separate epilogue
loop and lead to worse code generation.

Bootstrapped and tested on aarch64-linux-gnu.

gcc/ChangeLog:

* config/aarch64/aarch64.cc
(aarch64_vector_costs::determine_suggested_unroll_factor): Add a
loop_vec_info parameter.
(determine_suggested_unroll_factor): Cap the suggested unroll for
small-niters loops.
(aarch64_vector_costs::finish_cost): Pass loop_vinfo to
determine_suggested_unroll_factor.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/vect-unroll-1.c: New test.
* gcc.target/aarch64/sve/vect-unroll-2.c: New test.

gcc/config/aarch64/aarch64.cc
gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c [new file with mode: 0644]

index 3785a8e722d00740e0282471c07a768380dfb7f8..78f1eae8336c9bd4f8fc529154065b11e8394b13 100644 (file)
@@ -17570,7 +17570,7 @@ private:
   unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
                                 unsigned int);
   bool prefer_unrolled_loop () const;
-  unsigned int determine_suggested_unroll_factor ();
+  unsigned int determine_suggested_unroll_factor (loop_vec_info loop_vinfo);
 
   /* True if we have performed one-time initialization based on the
      vec_info.  */
@@ -19132,7 +19132,8 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops,
 }
 
 unsigned int
-aarch64_vector_costs::determine_suggested_unroll_factor ()
+aarch64_vector_costs::
+determine_suggested_unroll_factor (loop_vec_info loop_vinfo)
 {
   bool sve = m_vec_flags & VEC_ANY_SVE;
   /* If we are trying to unroll an Advanced SIMD main loop that contains
@@ -19189,6 +19190,16 @@ aarch64_vector_costs::determine_suggested_unroll_factor ()
       max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
     }
 
+  /* For known iteration loops, cap suggested unroll factor to avoid redundant
+     unrolled chunks.  Use CEIL rather than truncating division to make sure
+     the completely unrolled vector loop covers all scalar iterations.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      unsigned int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
+      max_unroll_factor = MIN (max_unroll_factor, CEIL (niters, estimated_vf));
+    }
+
   /* Make sure unroll factor is power of 2.  */
   return 1 << ceil_log2 (max_unroll_factor);
 }
@@ -19380,7 +19391,8 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
     {
       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                             m_costs[vect_body]);
-      m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+      m_suggested_unroll_factor
+       = determine_suggested_unroll_factor (loop_vinfo);
 
       /* For gather and scatters there's an additional overhead for the first
         iteration.  For low count loops they're not beneficial so model the
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c
new file mode 100644 (file)
index 0000000..8548f50
--- /dev/null
@@ -0,0 +1,17 @@
+/* Check that the loop is not unrolled.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+int
+foo (uint8_t *p1, uint8_t *p2)
+{
+  int sum = 0;
+  for (int i = 0; i < 10; i++)
+    sum += abs (p1[i] - p2[i]);
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {\tld1b\t[^\n]*, mul vl} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c
new file mode 100644 (file)
index 0000000..e5f8c45
--- /dev/null
@@ -0,0 +1,17 @@
+/* Check that the loop is unrolled by 2 rather than 4 for small niters.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+int
+foo (uint8_t *p1, uint8_t *p2)
+{
+  int sum = 0;
+  for (int i = 0; i < 20; i++)
+    sum += abs (p1[i] - p2[i]);
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times {\tld1b\t[^\n]*, mul vl} 2 } } */