Fix a case in which the vector cost model was ignored

author rsandifo <rsandifo@138bc75d-0d04-0410-961f-82ee72b054a4>
Mon, 18 Mar 2019 12:25:32 +0000 (12:25 +0000)
committer rsandifo <rsandifo@138bc75d-0d04-0410-961f-82ee72b054a4>
Mon, 18 Mar 2019 12:25:32 +0000 (12:25 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index d3a9c25e2dd76ef5f900f0f2294eafbd16c8cf86..d5c8114fb741ecc3bbff7735ee9a2fa5ec59891f 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2019-03-18  Richard Sandiford  <richard.sandiford@arm.com>
+
+       * tree-vect-loop.c (vect_estimate_min_profitable_iters): Fix the
+       calculation of the minimum number of scalar iterations for
+       fully-predicated loops.
+
  2019-03-18  Martin Jambor  <mjambor@suse.cz>
  
         PR tree-optimization/89546
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index e9301aac6fb65940bcf8d1f49c559b8aed0c466b..012f2a059b884ffcec7739767ea978d12e0b116c 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2019-03-18  Richard Sandiford  <richard.sandiford@arm.com>
+
+       * gcc.target/aarch64/sve/cost_model_1.c: New test.
+
  2019-03-18  Martin Jambor  <mjambor@suse.cz>
  
         PR tree-optimization/89546
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_1.c

new file mode 100644 (file)

index 0000000..a6d6442
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_1.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */
+
+void
+f (unsigned int *restrict x, unsigned int *restrict y,
+   unsigned char *restrict z, unsigned int n)
+{
+  for (unsigned int i = 0; i < n % 4; ++i)
+    x[i] = x[i] + y[i] + z[i];
+}
+
+/* { dg-final { scan-tree-dump "not vectorized: estimated iteration count too small" vect } } */
+/* { dg-final { scan-tree-dump "vectorized 0 loops" vect } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index bd81193a23605e6e78e38f3e1413154d264d8d2f..0edcdc7ee5f35983e40ce69f1812686780fe95b0 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3600,14 +3600,89 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
    /* Calculate number of iterations required to make the vector version
       profitable, relative to the loop bodies only.  The following condition
       must hold true:
-     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
+     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
       where
       SIC = scalar iteration cost, VIC = vector iteration cost,
       VOC = vector outside cost, VF = vectorization factor,
-     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
+     NPEEL = prologue iterations + epilogue iterations,
       SOC = scalar outside cost for run time cost model check.  */
  
-  if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
+  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
+                         - vec_inside_cost);
+  if (saving_per_viter <= 0)
+    {
+      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
+       warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
+                   "vectorization did not happen for a simd loop");
+
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "cost model: the vector iteration cost = %d "
+                        "divided by the scalar iteration cost = %d "
+                        "is greater or equal to the vectorization factor = %d"
+                         ".\n",
+                        vec_inside_cost, scalar_single_iter_cost, assumed_vf);
+      *ret_min_profitable_niters = -1;
+      *ret_min_profitable_estimate = -1;
+      return;
+    }
+
+  /* ??? The "if" arm is written to handle all cases; see below for what
+     we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* Rewriting the condition above in terms of the number of
+        vector iterations (vniters) rather than the number of
+        scalar iterations (niters) gives:
+
+        SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
+
+        <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
+
+        For integer N, X and Y when X > 0:
+
+        N * X > Y <==> N >= (Y /[floor] X) + 1.  */
+      int outside_overhead = (vec_outside_cost
+                             - scalar_single_iter_cost * peel_iters_prologue
+                             - scalar_single_iter_cost * peel_iters_epilogue
+                             - scalar_outside_cost);
+      /* We're only interested in cases that require at least one
+        vector iteration.  */
+      int min_vec_niters = 1;
+      if (outside_overhead > 0)
+       min_vec_niters = outside_overhead / saving_per_viter + 1;
+
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
+                    min_vec_niters);
+
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         /* Now that we know the minimum number of vector iterations,
+            find the minimum niters for which the scalar cost is larger:
+
+            SIC * niters > VIC * vniters + VOC - SOC
+
+            We know that the minimum niters is no more than
+            vniters * VF + NPEEL, but it might be (and often is) less
+            than that if a partial vector iteration is cheaper than the
+            equivalent scalar code.  */
+         int threshold = (vec_inside_cost * min_vec_niters
+                          + vec_outside_cost
+                          - scalar_outside_cost);
+         if (threshold <= 0)
+           min_profitable_iters = 1;
+         else
+           min_profitable_iters = threshold / scalar_single_iter_cost + 1;
+       }
+      else
+       /* Convert the number of vector iterations into a number of
+          scalar iterations.  */
+       min_profitable_iters = (min_vec_niters * assumed_vf
+                               + peel_iters_prologue
+                               + peel_iters_epilogue);
+    }
+  else
      {
        min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
                               * assumed_vf
@@ -3617,8 +3692,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
          min_profitable_iters = 0;
        else
         {
-         min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
-                                  - vec_inside_cost);
+         min_profitable_iters /= saving_per_viter;
  
           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
               <= (((int) vec_inside_cost * min_profitable_iters)
@@ -3627,24 +3701,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
             min_profitable_iters++;
         }
      }
-  /* vector version will never be profitable.  */
-  else
-    {
-      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
-       warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
-                   "vectorization did not happen for a simd loop");
-
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "cost model: the vector iteration cost = %d "
-                        "divided by the scalar iteration cost = %d "
-                        "is greater or equal to the vectorization factor = %d"
-                         ".\n",
-                        vec_inside_cost, scalar_single_iter_cost, assumed_vf);
-      *ret_min_profitable_niters = -1;
-      *ret_min_profitable_estimate = -1;
-      return;
-    }
  
    if (dump_enabled_p ())
      dump_printf (MSG_NOTE,
@@ -3668,10 +3724,34 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
  
       Non-vectorized variant is SIC * niters and it must win over vector
       variant on the expected loop trip count.  The following condition must hold true:
-     SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
+     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
  
    if (vec_outside_cost <= 0)
      min_profitable_estimate = 0;
+  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* This is a repeat of the code above, but with + SOC rather
+        than - SOC.  */
+      int outside_overhead = (vec_outside_cost
+                             - scalar_single_iter_cost * peel_iters_prologue
+                             - scalar_single_iter_cost * peel_iters_epilogue
+                             + scalar_outside_cost);
+      int min_vec_niters = 1;
+      if (outside_overhead > 0)
+       min_vec_niters = outside_overhead / saving_per_viter + 1;
+
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         int threshold = (vec_inside_cost * min_vec_niters
+                          + vec_outside_cost
+                          + scalar_outside_cost);
+         min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
+       }
+      else
+       min_profitable_estimate = (min_vec_niters * assumed_vf
+                                  + peel_iters_prologue
+                                  + peel_iters_epilogue);
+    }
    else
      {
        min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
author	rsandifo <rsandifo@138bc75d-0d04-0410-961f-82ee72b054a4>
	Mon, 18 Mar 2019 12:25:32 +0000 (12:25 +0000)
committer	rsandifo <rsandifo@138bc75d-0d04-0410-961f-82ee72b054a4>
	Mon, 18 Mar 2019 12:25:32 +0000 (12:25 +0000)
gcc/ChangeLog		patch | blob | blame | history
gcc/testsuite/ChangeLog		patch | blob | blame | history
gcc/testsuite/gcc.target/aarch64/sve/cost_model_1.c	[new file with mode: 0644]	patch | blob
gcc/tree-vect-loop.c		patch | blob | blame | history