fractional_cost, fractional_cost,
bool, unsigned int, unsigned int *,
bool *);
- unsigned int adjust_body_cost (unsigned int);
+ unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
+ unsigned int);
/* True if we have performed one-time initialization based on the
vec_info. */
/* If we're vectorizing a loop that executes a constant number of times,
   this variable gives the number of times that the vector loop would
   iterate, otherwise it is zero. */
uint64_t m_num_vector_iterations = 0;
- /* Used only when vectorizing loops. Estimates the number and kind of scalar
- operations that would be needed to perform the same work as one iteration
- of the vector loop. */
- aarch64_vec_op_count m_scalar_ops;
+ /* Used only when vectorizing loops. Estimates the number and kind of
+ operations that would be needed by one iteration of the scalar
+ or vector loop. */
+ aarch64_vec_op_count m_ops;
- /* Used only when vectorizing loops. If M_VEC_FLAGS & VEC_ADVSIMD,
- this structure estimates the number and kind of operations that the
- vector loop would contain. If M_VEC_FLAGS & VEC_SVE, the structure
- estimates what the equivalent Advanced SIMD-only code would need in
- order to perform the same work as one iteration of the SVE loop. */
+ /* Used only when vectorizing loops for SVE. It estimates what the
+ equivalent Advanced SIMD-only code would need in order to perform
+ the same work as one iteration of the SVE loop. */
aarch64_vec_op_count m_advsimd_ops;
- /* Used only when vectorizing loops with SVE. It estimates the number and
- kind of operations that the SVE loop would contain. */
- aarch64_vec_op_count m_sve_ops;
-
/* Used to detect cases in which we end up costing the same load twice,
once to account for results that are actually used and once to account
for unused results. */
aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
bool costing_for_scalar)
: vector_costs (vinfo, costing_for_scalar),
- m_scalar_ops (aarch64_tune_params.vec_costs->issue_info, 0),
- m_advsimd_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ADVSIMD),
- m_sve_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ANY_SVE)
+ m_vec_flags (costing_for_scalar ? 0
+ : aarch64_classify_vector_mode (vinfo->vector_mode)),
+ m_ops (aarch64_tune_params.vec_costs->issue_info, m_vec_flags),
+ m_advsimd_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ADVSIMD)
{
}
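
The mem-initializer list above relies on the fact that C++ initializes non-static members in declaration order, not in initializer-list order, so m_vec_flags must be declared before m_ops for m_ops to see its final value. A minimal standalone sketch of the pattern, with toy names in place of the real classes:

#include <cassert>

struct costs
{
  costs (bool scalar)
    : m_flags (scalar ? 0 : 2),  /* Runs first: declared first.  */
      m_ops (m_flags)            /* Safe: m_flags is already set.  */
  {}

  unsigned int m_flags;  /* Must precede m_ops in the class.  */
  unsigned int m_ops;
};

int main ()
{
  assert (costs (false).m_ops == 2);
  assert (costs (true).m_ops == 0);
}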
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
if (rgm->type)
num_masks += num_vectors_m1 + 1;
- m_sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
+ m_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
}
}
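
Each mask vector in each used rgroup needs its own WHILE, so the predicate-op estimate grows linearly with the total mask count. A standalone sketch of the counting, using made-up rgroup data in place of LOOP_VINFO_MASKS:

#include <cstdio>
#include <vector>

struct rgroup { bool used; unsigned int num_vectors; };

int main ()
{
  /* Three rgroups; the middle one is unused (a null rgm->type above).  */
  std::vector<rgroup> rgroups = { { true, 2 }, { false, 1 }, { true, 1 } };
  unsigned int num_masks = 0;
  for (const rgroup &rgm : rgroups)
    if (rgm.used)
      num_masks += rgm.num_vectors;   /* num_vectors_m1 + 1 above.  */
  unsigned int while_pred_ops = 1;    /* Cost of one WHILE.  */
  printf ("predicate ops: %u\n", num_masks * while_pred_ops);  /* 3 */
}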
/* COUNT, KIND, STMT_INFO and VECTYPE are the same as for
vector_costs::add_stmt_cost and they describe an operation in the
body of a vector loop. Record issue information relating to the vector
- operation in OPS, where OPS is one of m_scalar_ops, m_advsimd_ops
- or m_sve_ops; see the comments above those variables for details.
+ operation in OPS, where OPS is either m_ops or m_advsimd_ops; see the
+ comments above those variables for details.
FACTOR says how many iterations of the loop described by VEC_FLAGS would be
needed to match one iteration of the vector loop in VINFO. */
/* Calculate the minimum cycles per iteration imposed by a reduction
operation. */
- if ((kind == vector_stmt || kind == vec_to_scalar)
+ if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
&& vect_is_reduction (stmt_info))
{
unsigned int base
= aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, vec_flags);
if (vect_reduc_type (m_vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
{
- if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
+ if (vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
{
/* When costing an SVE FADDA, the vectorizer treats vec_to_scalar
as a single operation, whereas for Advanced SIMD it is a
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
{
- /* If we're costing the vector code, record whether we're vectorizing
- for Advanced SIMD or SVE. */
- if (!m_costing_for_scalar)
- m_vec_flags = aarch64_classify_vector_mode (m_vinfo->vector_mode);
-
if (loop_vinfo)
analyze_loop_vinfo (loop_vinfo);
innermost loop, also estimate the operations that would need
to be issued by all relevant implementations of the loop. */
if (loop_vinfo
- && m_vec_flags
- && where == vect_body
+ && (m_costing_for_scalar || where == vect_body)
&& (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
- && vectype
&& stmt_cost != 0)
{
- /* Record estimates for the scalar code. */
- count_ops (count, kind, stmt_info, vectype, &m_scalar_ops,
- vect_nunits_for_cost (vectype));
-
- if (aarch64_sve_mode_p (m_vinfo->vector_mode)
- && m_sve_ops.base_issue_info ())
- {
- /* Record estimates for a possible Advanced SIMD version
- of the SVE code. */
- count_ops (count, kind, stmt_info, vectype, &m_advsimd_ops,
- aarch64_estimated_sve_vq ());
-
- /* Record estimates for the SVE code itself. */
- count_ops (count, kind, stmt_info, vectype, &m_sve_ops, 1);
- }
- else
- /* Record estimates for the Advanced SIMD code. Treat SVE like
- Advanced SIMD if the CPU has no specific SVE costs. */
- count_ops (count, kind, stmt_info, vectype, &m_advsimd_ops, 1);
+ count_ops (count, kind, stmt_info, vectype, &m_ops, 1);
+ if (aarch64_sve_mode_p (m_vinfo->vector_mode))
+ /* Record estimates for a possible Advanced SIMD version
+ of the SVE code. */
+ count_ops (count, kind, stmt_info, vectype,
+ &m_advsimd_ops, aarch64_estimated_sve_vq ());
}
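
For SVE loops this now records two sets of estimates: one for the SVE loop itself (factor 1) and one for a hypothetical Advanced SIMD replacement, scaled by aarch64_estimated_sve_vq (), the estimated number of 128-bit quadwords per SVE vector. A simplified standalone sketch of the factor scaling; the names here are stand-ins, not the real count_ops:

#include <cstdio>

struct op_count { unsigned int general_ops = 0; };

/* Record COUNT operations, scaled by how many iterations of the
   alternative loop are needed per iteration of the costed loop.  */
static void
count_ops (unsigned int count, op_count *ops, unsigned int factor)
{
  ops->general_ops += count * factor;
}

int main ()
{
  op_count sve_ops, advsimd_ops;
  unsigned int estimated_vq = 2;             /* E.g. a 256-bit SVE CPU.  */
  count_ops (1, &sve_ops, 1);                /* The SVE loop itself.  */
  count_ops (1, &advsimd_ops, estimated_vq); /* Advanced SIMD version.  */
  printf ("sve %u, advsimd %u\n", sve_ops.general_ops,
          advsimd_ops.general_ops);          /* Prints "sve 1, advsimd 2".  */
}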
/* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
/* Estimate the minimum number of cycles per iteration needed to issue
non-predicate operations. */
fractional_cost sve_nonpred_issue_cycles_per_iter
- = aarch64_estimate_min_cycles_per_iter (&m_sve_ops, issue_info->sve);
+ = aarch64_estimate_min_cycles_per_iter (&m_ops, issue_info->sve);
/* Estimate the minimum number of cycles per iteration needed to rename
SVE instructions.
??? This value is very much on the pessimistic side, but seems to work
pretty well in practice. */
sve_rename_cycles_per_iter
- = { m_sve_ops.general_ops
- + m_sve_ops.loads
- + m_sve_ops.pred_ops + 1, 5 };
+ = { m_ops.general_ops + m_ops.loads + m_ops.pred_ops + 1, 5 };
/* Combine the rename and non-predicate issue limits into a single value. */
fractional_cost sve_nonpred_cycles_per_iter
/* Separately estimate the minimum number of cycles per iteration needed
to issue the predicate operations. */
fractional_cost sve_pred_issue_cycles_per_iter
- = { m_sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
+ = { m_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
/* Calculate the overall limit on the number of cycles per iteration. */
fractional_cost sve_cycles_per_iter
if (dump_enabled_p ())
{
- m_sve_ops.dump ();
+ m_ops.dump ();
dump_printf_loc (MSG_NOTE, vect_location,
" estimated cycles per iteration = %f\n",
sve_cycles_per_iter.as_double ());
- if (m_sve_ops.pred_ops)
+ if (m_ops.pred_ops)
dump_printf_loc (MSG_NOTE, vect_location,
" predicate issue = %f\n",
sve_pred_issue_cycles_per_iter.as_double ());
- if (m_sve_ops.pred_ops || sve_rename_cycles_per_iter)
+ if (m_ops.pred_ops || sve_rename_cycles_per_iter)
dump_printf_loc (MSG_NOTE, vect_location,
" non-predicate issue = %f\n",
sve_nonpred_issue_cycles_per_iter.as_double ());
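
These estimates are exact rationals: a braced { N, D } initializer builds a fractional_cost of N/D, so { m_ops.pred_ops, pred_ops_per_cycle } is the predicate pipe's cycles per iteration and the rename estimate is (general_ops + loads + pred_ops + 1) / 5. A toy standalone sketch of combining two such per-pipe limits; it implements only what the example needs, not the real fractional_cost class:

#include <cstdio>
#include <cstdint>

struct fractional_cost
{
  unsigned int num, den;  /* Represents the exact rational num/den.  */
  double as_double () const { return double (num) / den; }
};

/* Return the larger of A and B, compared without rounding.  */
static fractional_cost
max_cost (fractional_cost a, fractional_cost b)
{
  return ((uint64_t) a.num * b.den >= (uint64_t) b.num * a.den) ? a : b;
}

int main ()
{
  fractional_cost nonpred = { 7, 4 };  /* 7 ops at 4 per cycle.  */
  fractional_cost pred = { 3, 2 };     /* 3 predicate ops at 2 per cycle.  */
  /* The loop issues no faster than its most contended pipe.  */
  printf ("cycles per iteration = %f\n",
          max_cost (nonpred, pred).as_double ());  /* 1.75 */
}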
/* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
and return the new cost. */
unsigned int
-aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
+aarch64_vector_costs::
+adjust_body_cost (loop_vec_info loop_vinfo,
+ const aarch64_vector_costs *scalar_costs,
+ unsigned int body_cost)
{
+ const auto &scalar_ops = scalar_costs->m_ops;
+ unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
unsigned int orig_body_cost = body_cost;
bool should_disparage = false;
return body_cost;
fractional_cost scalar_cycles_per_iter
- = aarch64_estimate_min_cycles_per_iter (&m_scalar_ops,
- issue_info->scalar);
-
- fractional_cost advsimd_cycles_per_iter
- = aarch64_estimate_min_cycles_per_iter (&m_advsimd_ops,
- issue_info->advsimd);
+ = aarch64_estimate_min_cycles_per_iter (&scalar_ops, issue_info->scalar);
+ scalar_cycles_per_iter *= estimated_vf;
- bool could_use_advsimd
- = ((m_vec_flags & VEC_ADVSIMD)
- || (aarch64_autovec_preference != 2
- && (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
- && !m_saw_sve_only_op));
+ fractional_cost vector_cycles_per_iter
+ = aarch64_estimate_min_cycles_per_iter (&m_ops, m_ops.base_issue_info ());
if (dump_enabled_p ())
{
"Vector loop iterates at most %wd times\n",
m_num_vector_iterations);
dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
- m_scalar_ops.dump ();
+ scalar_ops.dump ();
dump_printf_loc (MSG_NOTE, vect_location,
- " estimated cycles per iteration = %f\n",
- scalar_cycles_per_iter.as_double ());
- if (could_use_advsimd)
- {
- dump_printf_loc (MSG_NOTE, vect_location,
- "Advanced SIMD issue estimate:\n");
- m_advsimd_ops.dump ();
- dump_printf_loc (MSG_NOTE, vect_location,
- " estimated cycles per iteration = %f\n",
- advsimd_cycles_per_iter.as_double ());
- }
- else
- dump_printf_loc (MSG_NOTE, vect_location,
- "Loop could not use Advanced SIMD\n");
+ " estimated cycles per vector iteration"
+ " (for VF %d) = %f\n",
+ estimated_vf, scalar_cycles_per_iter.as_double ());
}
- fractional_cost vector_cycles_per_iter = advsimd_cycles_per_iter;
- unsigned int vector_reduction_latency = m_advsimd_ops.reduction_latency;
-
if ((m_vec_flags & VEC_ANY_SVE) && issue_info->sve)
{
+ bool could_use_advsimd
+ = (aarch64_autovec_preference != 2
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
+ && !m_saw_sve_only_op);
+
+ fractional_cost advsimd_cycles_per_iter
+ = aarch64_estimate_min_cycles_per_iter (&m_advsimd_ops,
+ issue_info->advsimd);
if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
- vector_reduction_latency = m_sve_ops.reduction_latency;
+ {
+ if (could_use_advsimd)
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Advanced SIMD issue estimate:\n");
+ m_advsimd_ops.dump ();
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " estimated cycles per iteration = %f\n",
+ advsimd_cycles_per_iter.as_double ());
+ }
+ else
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Loop could not use Advanced SIMD\n");
+ dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
+ }
vector_cycles_per_iter
= adjust_body_cost_sve (issue_info, scalar_cycles_per_iter,
advsimd_cycles_per_iter, could_use_advsimd,
&body_cost, &should_disparage);
}
}
+ else
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Vector issue estimate:\n");
+ m_ops.dump ();
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " estimated cycles per iteration = %f\n",
+ vector_cycles_per_iter.as_double ());
+ }
+ }
/* Decide whether to stick to latency-based costs or whether to try to
take issue rates into account. */
vector code is an improvement, even if adding the other (non-loop-carried)
latencies tends to hide this saving. We therefore reduce the cost of the
vector loop body in proportion to the saving. */
- else if (m_scalar_ops.reduction_latency > vector_reduction_latency
- && m_scalar_ops.reduction_latency == scalar_cycles_per_iter
+ else if (scalar_ops.reduction_latency > m_ops.reduction_latency
+ && scalar_ops.reduction_latency == scalar_cycles_per_iter
&& scalar_cycles_per_iter > vector_cycles_per_iter
&& !should_disparage)
{
}
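
The branch above fires when the scalar loop is bound by a loop-carried reduction (its reduction latency equals the whole scalar cycle estimate) and the vector loop issues in fewer cycles; the body cost is then cut in proportion to the saving. A hedged sketch of one plausible proportional scaling, using plain doubles rather than fractional_cost; the exact formula is an assumption for illustration, not lifted from the patch:

#include <cstdio>

static unsigned int
scale_body_cost (unsigned int body_cost, double vector_cycles,
                 double scalar_cycles)
{
  /* If the vector loop needs half the cycles per (scalar-equivalent)
     iteration, halve its body cost so that the loop-carried saving is
     not hidden by the other latencies.  */
  return (unsigned int) (body_cost * vector_cycles / scalar_cycles);
}

int main ()
{
  printf ("%u\n", scale_body_cost (100, 4.0, 8.0));  /* Prints 50.  */
}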
void
-aarch64_vector_costs::finish_cost (const vector_costs *scalar_costs)
+aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
{
+ auto *scalar_costs
+ = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
if (loop_vinfo
&& m_vec_flags
&& aarch64_use_new_vector_costs_p ())
- m_costs[vect_body] = adjust_body_cost (m_costs[vect_body]);
+ m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
+ m_costs[vect_body]);
vector_costs::finish_cost (scalar_costs);
}
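
The static_cast in finish_cost is safe without RTTI because the target created both cost structures, so the scalar costs are known to be an aarch64_vector_costs. A standalone sketch of the downcast pattern with toy types:

#include <cstdio>

struct vector_costs { virtual ~vector_costs () {} };
struct aarch64_vector_costs : vector_costs { unsigned int m_total = 42; };

/* Mirrors the override above: take the base-class pointer the
   vectorizer hands us and recover the target-specific type.  */
static void
finish_cost (const vector_costs *uncast_scalar_costs)
{
  auto *scalar_costs
    = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
  printf ("scalar total: %u\n", scalar_costs->m_total);
}

int main ()
{
  aarch64_vector_costs costs;
  finish_cost (&costs);
}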