]> git.ipfire.org Git - thirdparty/gcc.git/blobdiff - gcc/tree-vect-slp.c
Correct a function pre/postcondition [PR102403].
[thirdparty/gcc.git] / gcc / tree-vect-slp.c
index f08797c2bc0c4d74074e951faea9cc19504ae631..024a1c38a2342246d7891db1de5f1d6e6458d5dd 100644 (file)
@@ -2311,6 +2311,10 @@ out:
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
+         /* After swapping some operands we lost track whether an
+            operand has any pattern defs so be conservative here.  */
+         if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
+           oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
          /* And try again with scratch 'matches' ... */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
@@ -2552,6 +2556,44 @@ debug (slp_tree node)
                       node);
 }
 
+/* Recursive helper for the dot producer below.  */
+
+static void
+dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
+{
+  if (visited.add (node))
+    return;
+
+  fprintf (f, "\"%p\" [label=\"", (void *)node);
+  vect_print_slp_tree (MSG_NOTE,
+                      dump_location_t::from_location_t (UNKNOWN_LOCATION),
+                      node);
+  fprintf (f, "\"];\n");
+
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    dot_slp_tree (f, child, visited);
+}
+
+DEBUG_FUNCTION void
+dot_slp_tree (const char *fname, slp_tree node)
+{
+  FILE *f = fopen (fname, "w");
+  fprintf (f, "digraph {\n");
+  fflush (f);
+    {
+      debug_dump_context ctx (f);
+      hash_set<slp_tree> visited;
+      dot_slp_tree (f, node, visited);
+    }
+  fflush (f);
+  fprintf (f, "}\n");
+  fclose (f);
+}
+
 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
 
 static void
@@ -3316,7 +3358,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
   else if (kind == slp_inst_kind_reduc_group)
     {
       /* Collect reduction statements.  */
-      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
+      const vec<stmt_vec_info> &reductions
+       = as_a <loop_vec_info> (vinfo)->reductions;
       scalar_stmts.create (reductions.length ());
       for (i = 0; reductions.iterate (i, &next_info); i++)
        if (STMT_VINFO_RELEVANT_P (next_info)
@@ -3609,8 +3652,10 @@ vect_optimize_slp (vec_info *vinfo)
       slp_tree node = vertices[idx].node;
 
       /* Handle externals and constants optimistically throughout the
-        iteration.  */
-      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
+        iteration.  But treat existing vectors as fixed since we
+        do not handle permuting them below.  */
+      if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
+          && !SLP_TREE_VEC_DEFS (node).exists ())
          || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
        continue;
 
@@ -3674,6 +3719,18 @@ vect_optimize_slp (vec_info *vinfo)
       vertices[idx].perm_out = perms.length () - 1;
     }
 
+  /* In addition to the above we have to mark outgoing permutes facing
+     non-reduction graph entries that are not represented as to be
+     materialized.  */
+  for (slp_instance instance : vinfo->slp_instances)
+    if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
+      {
+       /* Just setting perm_out isn't enough for the propagation to
+          pick this up.  */
+       vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
+       vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
+      }
+
   /* Propagate permutes along the graph and compute materialization points.  */
   bool changed;
   bool do_materialization = false;
@@ -3728,6 +3785,8 @@ vect_optimize_slp (vec_info *vinfo)
                  case CFN_COMPLEX_MUL:
                  case CFN_COMPLEX_MUL_CONJ:
                  case CFN_VEC_ADDSUB:
+                 case CFN_VEC_FMADDSUB:
+                 case CFN_VEC_FMSUBADD:
                    vertices[idx].perm_in = 0;
                    vertices[idx].perm_out = 0;
                  default:;
@@ -4132,7 +4191,8 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
 {
   unsigned int i;
   poly_uint64 unrolling_factor = 1;
-  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+  const vec<slp_instance> &slp_instances
+    = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
   slp_instance instance;
   int decided_to_slp = 0;
 
@@ -4791,24 +4851,28 @@ static bool
 vectorizable_bb_reduc_epilogue (slp_instance instance,
                                stmt_vector_for_cost *cost_vec)
 {
-  enum tree_code reduc_code
-    = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
+  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
+  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
   if (reduc_code == MINUS_EXPR)
     reduc_code = PLUS_EXPR;
   internal_fn reduc_fn;
   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
   if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
       || reduc_fn == IFN_LAST
-      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH))
+      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
+      || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
+                                    TREE_TYPE (vectype)))
     return false;
 
   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
-     cost log2 vector operations plus shuffles.  */
+     cost log2 vector operations plus shuffles and one extraction.  */
   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
                    vectype, 0, vect_body);
   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
                    vectype, 0, vect_body);
+  record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
+                   vectype, 0, vect_body);
   return true;
 }
 
@@ -5040,6 +5104,42 @@ vect_bb_partition_graph (bb_vec_info bb_vinfo)
     }
 }
 
+/* Compute the set of scalar stmts participating in internal and external
+   nodes.  */
+
+static void
+vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
+                                        hash_set<slp_tree> &visited,
+                                        hash_set<stmt_vec_info> &vstmts,
+                                        hash_set<stmt_vec_info> &estmts)
+{
+  int i;
+  stmt_vec_info stmt_info;
+  slp_tree child;
+
+  if (visited.add (node))
+    return;
+
+  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
+    {
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+       vstmts.add (stmt_info);
+
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+       if (child)
+         vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
+                                                  vstmts, estmts);
+    }
+  else
+    for (tree def : SLP_TREE_SCALAR_OPS (node))
+      {
+       stmt_vec_info def_stmt = vinfo->lookup_def (def);
+       if (def_stmt)
+         estmts.add (def_stmt);
+      }
+}
+
+
 /* Compute the scalar cost of the SLP node NODE and its children
    and return it.  Do not account defs that are marked in LIFE and
    update LIFE according to uses of NODE.  */
@@ -5048,6 +5148,7 @@ static void
 vect_bb_slp_scalar_cost (vec_info *vinfo,
                         slp_tree node, vec<bool, va_heap> *life,
                         stmt_vector_for_cost *cost_vec,
+                        hash_set<stmt_vec_info> &vectorized_scalar_stmts,
                         hash_set<slp_tree> &visited)
 {
   unsigned i;
@@ -5084,8 +5185,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
                  {
                    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
                    if (!use_stmt_info
-                       || !PURE_SLP_STMT
-                             (vect_stmt_to_vectorize (use_stmt_info)))
+                       || !vectorized_scalar_stmts.contains (use_stmt_info))
                      {
                        (*life)[i] = true;
                        break;
@@ -5148,7 +5248,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
              subtree_life.safe_splice (*life);
            }
          vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
-                                  visited);
+                                  vectorized_scalar_stmts, visited);
          subtree_life.truncate (0);
        }
     }
@@ -5173,7 +5273,8 @@ li_cost_vec_cmp (const void *a_, const void *b_)
 
 static bool
 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
-                                   vec<slp_instance> slp_instances)
+                                   vec<slp_instance> slp_instances,
+                                   loop_p orig_loop)
 {
   slp_instance instance;
   int i;
@@ -5189,11 +5290,33 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                              SLP_INSTANCE_TREE (instance), visited);
     }
 
+  /* Compute the set of scalar stmts we know will go away 'locally' when
+     vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
+     not accurate for nodes promoted extern late or for scalar stmts that
+     are used both in extern defs and in vectorized defs.  */
+  hash_set<stmt_vec_info> vectorized_scalar_stmts;
+  hash_set<stmt_vec_info> scalar_stmts_in_externs;
+  hash_set<slp_tree> visited;
+  FOR_EACH_VEC_ELT (slp_instances, i, instance)
+    {
+      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
+                                              SLP_INSTANCE_TREE (instance),
+                                              visited,
+                                              vectorized_scalar_stmts,
+                                              scalar_stmts_in_externs);
+      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
+       vectorized_scalar_stmts.add (rstmt);
+    }
+  /* Scalar stmts used as defs in external nodes need to be preseved, so
+     remove them from vectorized_scalar_stmts.  */
+  for (stmt_vec_info stmt : scalar_stmts_in_externs)
+    vectorized_scalar_stmts.remove (stmt);
+
   /* Calculate scalar cost and sum the cost for the vector stmts
      previously collected.  */
   stmt_vector_for_cost scalar_costs = vNULL;
   stmt_vector_for_cost vector_costs = vNULL;
-  hash_set<slp_tree> visited;
+  visited.empty ();
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       auto_vec<bool, 20> life;
@@ -5206,14 +5329,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                          SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
       vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
-                              &life, &scalar_costs, visited);
+                              &life, &scalar_costs, vectorized_scalar_stmts,
+                              visited);
       vector_costs.safe_splice (instance->cost_vec);
       instance->cost_vec.release ();
     }
-  /* Unset visited flag.  */
-  stmt_info_for_cost *cost;
-  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
-    gimple_set_visited  (cost->stmt_info->stmt, false);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
@@ -5230,6 +5350,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
     li_scalar_costs (scalar_costs.length ());
   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
     li_vector_costs (vector_costs.length ());
+  stmt_info_for_cost *cost;
   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
     {
       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
@@ -5252,6 +5373,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
   /* Now cost the portions individually.  */
   unsigned vi = 0;
   unsigned si = 0;
+  bool profitable = true;
   while (si < li_scalar_costs.length ()
         && vi < li_vector_costs.length ())
     {
@@ -5318,25 +5440,29 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
         example).  */
       if (vec_outside_cost + vec_inside_cost > scalar_cost)
        {
-         scalar_costs.release ();
-         vector_costs.release ();
-         return false;
+         profitable = false;
+         break;
        }
     }
-  if (vi < li_vector_costs.length ())
+  if (profitable && vi < li_vector_costs.length ())
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Excess vector cost for part in loop %d:\n",
                         li_vector_costs[vi].first);
-      scalar_costs.release ();
-      vector_costs.release ();
-      return false;
+      profitable = false;
     }
 
+  /* Unset visited flag.  This is delayed when the subgraph is profitable
+     and we process the loop for remaining unvectorized if-converted code.  */
+  if (!orig_loop || !profitable)
+    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+      gimple_set_visited  (cost->stmt_info->stmt, false);
+
   scalar_costs.release ();
   vector_costs.release ();
-  return true;
+
+  return profitable;
 }
 
 /* qsort comparator for lane defs.  */
@@ -5750,7 +5876,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
 
 static bool
 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
-                vec<int> *dataref_groups, unsigned int n_stmts)
+                vec<int> *dataref_groups, unsigned int n_stmts,
+                loop_p orig_loop)
 {
   bb_vec_info bb_vinfo;
   auto_vector_modes vector_modes;
@@ -5789,9 +5916,8 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
 
          bb_vinfo->shared->check_datarefs ();
 
-         unsigned i;
-         slp_instance instance;
-         FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
+         auto_vec<slp_instance> profitable_subgraphs;
+         for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
            {
              if (instance->subgraph_entries.is_empty ())
                continue;
@@ -5799,7 +5925,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
              vect_location = instance->location ();
              if (!unlimited_cost_model (NULL)
                  && !vect_bb_vectorization_profitable_p
-                       (bb_vinfo, instance->subgraph_entries))
+                       (bb_vinfo, instance->subgraph_entries, orig_loop))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -5811,6 +5937,47 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
              if (!dbg_cnt (vect_slp))
                continue;
 
+             profitable_subgraphs.safe_push (instance);
+           }
+
+         /* When we're vectorizing an if-converted loop body with the
+            very-cheap cost model make sure we vectorized all if-converted
+            code.  */
+         if (!profitable_subgraphs.is_empty ()
+             && orig_loop)
+           {
+             gcc_assert (bb_vinfo->bbs.length () == 1);
+             for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
+                  !gsi_end_p (gsi); gsi_next (&gsi))
+               {
+                 /* The costing above left us with DCEable vectorized scalar
+                    stmts having the visited flag set on profitable
+                    subgraphs.  Do the delayed clearing of the flag here.  */
+                 if (gimple_visited_p (gsi_stmt (gsi)))
+                   {
+                     gimple_set_visited (gsi_stmt (gsi), false);
+                     continue;
+                   }
+                 if (flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
+                   continue;
+
+                 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
+                   if (gimple_assign_rhs_code (ass) == COND_EXPR)
+                     {
+                       if (!profitable_subgraphs.is_empty ()
+                           && dump_enabled_p ())
+                         dump_printf_loc (MSG_NOTE, vect_location,
+                                          "not profitable because of "
+                                          "unprofitable if-converted scalar "
+                                          "code\n");
+                       profitable_subgraphs.truncate (0);
+                     }
+               }
+           }
+
+         /* Finally schedule the profitable subgraphs.  */
+         for (slp_instance instance : profitable_subgraphs)
+           {
              if (!vectorized && dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Basic block will be vectorized "
@@ -5899,7 +6066,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
    true if anything in the basic-block was vectorized.  */
 
 static bool
-vect_slp_bbs (vec<basic_block> bbs)
+vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
 {
   vec<data_reference_p> datarefs = vNULL;
   auto_vec<int> dataref_groups;
@@ -5925,20 +6092,24 @@ vect_slp_bbs (vec<basic_block> bbs)
                                              &dataref_groups, current_group))
            ++current_group;
        }
+      /* New BBs always start a new DR group.  */
+      ++current_group;
     }
 
-  return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
+  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
 }
 
-/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
-   true if anything in the basic-block was vectorized.  */
+/* Special entry for the BB vectorizer.  Analyze and transform a single
+   if-converted BB with ORIG_LOOPs body being the not if-converted
+   representation.  Returns true if anything in the basic-block was
+   vectorized.  */
 
 bool
-vect_slp_bb (basic_block bb)
+vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
 {
   auto_vec<basic_block> bbs;
   bbs.safe_push (bb);
-  return vect_slp_bbs (bbs);
+  return vect_slp_bbs (bbs, orig_loop);
 }
 
 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
@@ -5989,7 +6160,7 @@ vect_slp_function (function *fun)
 
       if (split && !bbs.is_empty ())
        {
-         r |= vect_slp_bbs (bbs);
+         r |= vect_slp_bbs (bbs, NULL);
          bbs.truncate (0);
          bbs.quick_push (bb);
        }
@@ -6007,13 +6178,13 @@ vect_slp_function (function *fun)
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "splitting region at control altering "
                               "definition %G", last);
-           r |= vect_slp_bbs (bbs);
+           r |= vect_slp_bbs (bbs, NULL);
            bbs.truncate (0);
          }
     }
 
   if (!bbs.is_empty ())
-    r |= vect_slp_bbs (bbs);
+    r |= vect_slp_bbs (bbs, NULL);
 
   free (rpo);
 
@@ -6044,7 +6215,7 @@ vect_slp_function (function *fun)
 
 void
 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
-                         vec<tree> elts, unsigned int nresults,
+                         const vec<tree> &elts, unsigned int nresults,
                          vec<tree> &results)
 {
   unsigned int nelts = elts.length ();
@@ -6400,7 +6571,7 @@ vect_get_slp_defs (vec_info *,
 
 bool
 vect_transform_slp_perm_load (vec_info *vinfo,
-                             slp_tree node, vec<tree> dr_chain,
+                             slp_tree node, const vec<tree> &dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              bool analyze_only, unsigned *n_perms,
                              unsigned int *n_loads, bool dce_chain)
@@ -7429,7 +7600,7 @@ vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
 
 void
-vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
+vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
 {
   slp_instance instance;
   unsigned int i;