From: Tamar Christina
Date: Sun, 22 Sep 2024 12:34:10 +0000 (+0100)
Subject: aarch64: Take into account when VF is higher than known scalar iters
X-Git-Tag: basepoints/gcc-16~5786
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e84e5d034124c6733d3b36d8623c56090d4d17f7;p=thirdparty%2Fgcc.git

aarch64: Take into account when VF is higher than known scalar iters

Consider low-overhead loops like:

void
foo (char *restrict a, int *restrict b, int *restrict c, int n)
{
  for (int i = 0; i < 9; i++)
    {
      int res = c[i];
      int t = b[i];
      if (a[i] != 0)
	res = t;
      c[i] = res;
    }
}

For such loops we use latency-only costing, since the loop bounds are
known and small.

The current costing, however, does not consider the case where
niters < VF.  When comparing the scalar and vector costs it therefore
charges the scalar loop for VF iterations that it can never execute.
This overestimates the cost of the scalar loop and we incorrectly
vectorize.

This patch takes the minimum of the VF and niters in such cases.
Before the patch we generate:

note: Original vector body cost = 46
note: Vector loop iterates at most 1 times
note: Scalar issue estimate:
note:   load operations = 2
note:   store operations = 1
note:   general operations = 1
note:   reduction latency = 0
note:   estimated min cycles per iteration = 1.000000
note:   estimated cycles per vector iteration (for VF 32) = 32.000000
note: SVE issue estimate:
note:   load operations = 5
note:   store operations = 4
note:   general operations = 11
note:   predicate operations = 12
note:   reduction latency = 0
note:   estimated min cycles per iteration without predication = 5.500000
note:   estimated min cycles per iteration for predication = 12.000000
note:   estimated min cycles per iteration = 12.000000
note: Low iteration count, so using pure latency costs
note: Cost model analysis:

and after:

note: Original vector body cost = 46
note: Known loop bounds, capping VF to 9 for analysis
note: Vector loop iterates at most 1 times
note: Scalar issue estimate:
note:   load operations = 2
note:   store operations = 1
note:   general operations = 1
note:   reduction latency = 0
note:   estimated min cycles per iteration = 1.000000
note:   estimated cycles per vector iteration (for VF 9) = 9.000000
note: SVE issue estimate:
note:   load operations = 5
note:   store operations = 4
note:   general operations = 11
note:   predicate operations = 12
note:   reduction latency = 0
note:   estimated min cycles per iteration without predication = 5.500000
note:   estimated min cycles per iteration for predication = 12.000000
note:   estimated min cycles per iteration = 12.000000
note: Increasing body cost to 1472 because the scalar code could issue within the limit imposed by predicate operations
note: Low iteration count, so using pure latency costs
note: Cost model analysis:

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (adjust_body_cost): Cap VF for low
	iteration loops.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/asrdiv_4.c: Update bounds.
	* gcc.target/aarch64/sve/cond_asrd_2.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_6.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_7.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_8.c: Likewise.
	* gcc.target/aarch64/sve/miniloop_1.c: Likewise.
	* gcc.target/aarch64/sve/spill_6.c: Likewise.
	* gcc.target/aarch64/sve/sve_iters_low_1.c: New test.
	* gcc.target/aarch64/sve/sve_iters_low_2.c: New test.
---
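To make the effect of the cap concrete, the comparison from the dumps
above can be replayed in a small standalone C sketch.  This is an
illustration only, not the GCC cost-model code: fractional_cost is
simplified to a double, and the VF and cycle estimates are the ones
printed in the dumps.

#include <stdio.h>

int
main (void)
{
  long niters = 9;	       /* Known scalar iteration count.  */
  int estimated_vf = 32;       /* Estimated VF from the first dump.  */
  double scalar_cycles = 1.0;  /* Scalar min cycles per iteration.  */
  double sve_cycles = 12.0;    /* SVE min cycles per iteration.  */

  /* Before the patch the scalar loop is charged for a full VF's worth
     of iterations: 1.0 * 32 = 32 cycles, which loses to SVE's 12.  */
  printf ("uncapped: scalar %.1f vs sve %.1f\n",
	  scalar_cycles * estimated_vf, sve_cycles);

  /* The scalar loop can only ever run niters iterations, so cap the
     VF used for costing, as the patch below does with MIN.  */
  if (niters < estimated_vf)
    estimated_vf = niters;

  /* Now the scalar estimate is 1.0 * 9 = 9 cycles: the scalar loop
     wins and the loop is no longer vectorized.  */
  printf ("capped:   scalar %.1f vs sve %.1f\n",
	  scalar_cycles * estimated_vf, sve_cycles);
  return 0;
}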
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 92763d403c7..68913beaee2 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17565,6 +17565,19 @@ adjust_body_cost (loop_vec_info loop_vinfo,
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "Original vector body cost = %d\n", body_cost);
 
+  /* If we know we have a single partial vector iteration, cap the VF
+     to the number of scalar iterations for costing purposes.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      if (niters < estimated_vf && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Scalar loop iterates at most %wd times.  Capping VF "
+			 "from %d to %wd\n", niters, estimated_vf, niters);
+
+      estimated_vf = MIN (estimated_vf, niters);
+    }
+
   fractional_cost scalar_cycles_per_iter
     = scalar_ops.min_cycles_per_iter () * estimated_vf;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
index 6684fe1c124..10a96a894af 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
@@ -15,12 +15,12 @@
 }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
index e4040ee3520..db1721efbc7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
@@ -14,12 +14,12 @@
 }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
index e47276a3a35..b8b3e862d0a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
@@ -14,11 +14,11 @@
 }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
index f49915c4ac1..2d02fb70f33 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
@@ -14,11 +14,11 @@
 }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
index 42eb4b2661b..8fe2455687b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
@@ -14,11 +14,11 @@
 }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
index 09eb4146816..cd1fd2b8a07 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
@@ -6,7 +6,7 @@ void loop (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c,
 	   int * __restrict__ g, int * __restrict__ h)
 {
   int i = 0;
-  for (i = 0; i < 3; i++)
+  for (i = 0; i < 30; i++)
     {
       a[i] += i;
       b[i] += i;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
index ae9c338f569..2ff969ced00 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
@@ -11,20 +11,20 @@ void consumer (void *);
   {						\
     if (which)					\
       {						\
-	for (int i = 0; i < 7; ++i)		\
+	for (int i = 0; i < 70; ++i)		\
 	  x1[i] += VAL;				\
 	consumer (x1);				\
-	for (int i = 0; i < 7; ++i)		\
+	for (int i = 0; i < 70; ++i)		\
 	  x2[i] -= VAL;				\
 	consumer (x2);				\
       }						\
     else					\
       {						\
-	for (int i = 0; i < 7; ++i)		\
+	for (int i = 0; i < 70; ++i)		\
 	  x3[i] &= VAL;				\
 	consumer (x3);				\
       }						\
-    for (int i = 0; i < 7; ++i)			\
+    for (int i = 0; i < 70; ++i)		\
       x4[i] |= VAL;				\
     consumer (x4);				\
   }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
new file mode 100644
index 00000000000..952a4b1cd58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n)
+{
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
new file mode 100644
index 00000000000..02d10de2a62
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
+{
+  if (stride <= 1)
+    return;
+
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i*stride];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
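
The two new tests check that the loop from the commit message, and a
variant whose loads depend on a runtime stride, are no longer
vectorized.  Assuming an aarch64 native or cross build, they can be run
on their own through the SVE test harness in the usual DejaGnu way,
e.g. from the build directory:

make -C gcc check-gcc RUNTESTFLAGS="aarch64-sve.exp=sve_iters_low_*.c"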