unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
unsigned int);
bool prefer_unrolled_loop () const;
- unsigned int determine_suggested_unroll_factor ();
+ unsigned int determine_suggested_unroll_factor (loop_vec_info loop_vinfo);
/* True if we have performed one-time initialization based on the
vec_info. */
}
unsigned int
-aarch64_vector_costs::determine_suggested_unroll_factor ()
+aarch64_vector_costs::
+determine_suggested_unroll_factor (loop_vec_info loop_vinfo)
{
bool sve = m_vec_flags & VEC_ANY_SVE;
/* If we are trying to unroll an Advanced SIMD main loop that contains
max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
}
+ /* For loops with a known iteration count, cap the suggested unroll
+ factor to avoid redundant unrolled copies. Use CEIL rather than
+ truncating division so that the fully unrolled vector loop covers all
+ scalar iterations. */
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ {
+ unsigned int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+ unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
+ max_unroll_factor = MIN (max_unroll_factor, CEIL (niters, estimated_vf));
+ }
+
/* Make sure unroll factor is power of 2. */
return 1 << ceil_log2 (max_unroll_factor);
}
{
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
m_costs[vect_body]);
- m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+ m_suggested_unroll_factor
+ = determine_suggested_unroll_factor (loop_vinfo);
/* For gather and scatters there's an additional overhead for the first
iteration. For low count loops they're not beneficial so model the
--- /dev/null
+/* Check that the loop is not unrolled. */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Return the sum of absolute differences between the first 10 elements
+   of P1 and P2.  The trip count is a compile-time constant.
+   NOTE(review): presumably 10 byte elements are few enough that no
+   unrolling is wanted under the tuning in dg-options; confirm against
+   the vector length the cost model assumes.  */
+int
+foo (uint8_t *p1, uint8_t *p2)
+{
+ int sum = 0;
+ for (int i = 0; i < 10; i++)
+ sum += abs (p1[i] - p2[i]);
+ return sum;
+}
+
+/* { dg-final { scan-assembler-not {\tld1b\t[^\n]*, mul vl} } } */
--- /dev/null
+/* Check that the loop is unrolled by 2 rather than 4 for small niters. */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Return the sum of absolute differences between the first 20 elements
+   of P1 and P2.  The trip count is a compile-time constant.
+   NOTE(review): presumably 20 byte elements need more than one vector
+   iteration but fewer than four under the tuning in dg-options, which is
+   why an unroll factor of 2 is expected; confirm against the vector
+   length the cost model assumes.  */
+int
+foo (uint8_t *p1, uint8_t *p2)
+{
+ int sum = 0;
+ for (int i = 0; i < 20; i++)
+ sum += abs (p1[i] - p2[i]);
+ return sum;
+}
+
+/* { dg-final { scan-assembler-times {\tld1b\t[^\n]*, mul vl} 2 } } */