Correct a function pre/postcondition [PR102403].

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 6237a61ffd44ddbc034f7befa3be70c6b23d19b6..024a1c38a2342246d7891db1de5f1d6e6458d5dd 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1177,21 +1177,6 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
              continue;
            }
 
-         if (need_same_oprnds)
-           {
-             tree other_op1 = (call_stmt
-                               ? gimple_call_arg (call_stmt, 1)
-                               : gimple_assign_rhs2 (stmt));
-             if (!operand_equal_p (first_op1, other_op1, 0))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "Build SLP failed: different shift "
-                                    "arguments in %G", stmt);
-                 /* Mismatch.  */
-                 continue;
-               }
-           }
          if (!load_p
              && first_stmt_code == BIT_FIELD_REF
              && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
@@ -1231,6 +1216,22 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
              continue;
            }
 
+         if (need_same_oprnds)
+           {
+             tree other_op1 = (call_stmt
+                               ? gimple_call_arg (call_stmt, 1)
+                               : gimple_assign_rhs2 (stmt));
+             if (!operand_equal_p (first_op1, other_op1, 0))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Build SLP failed: different shift "
+                                    "arguments in %G", stmt);
+                 /* Mismatch.  */
+                 continue;
+               }
+           }
+
          if (!types_compatible_p (vectype, *node_vectype))
            {
              if (dump_enabled_p ())
@@ -1442,6 +1443,80 @@ dt_sort_cmp (const void *op1_, const void *op2_, void *)
   return (int)op1->code - (int)op2->code;
 }
 
+/* Linearize the associatable expression chain at START with the
+   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
+   filling CHAIN with the result and using WORKLIST as intermediate storage.
+   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
+   or MINUS_EXPR.  *CHAIN_STMTS, if not NULL, is filled with all computation
+   stmts, starting with START.
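+
+   For example (an illustration, not part of the interface): with
+   CODE == PLUS_EXPR and START being the stmt x = t2 - d in
+     t1 = a - b;  t2 = t1 + c;  x = t2 - d;
+   CHAIN becomes { (-,d), (+,c), (+,a), (-,b) }; descending into the
+   second operand of a MINUS_EXPR flips the accumulated sign.  */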
+
+static void
+vect_slp_linearize_chain (vec_info *vinfo,
+                         vec<std::pair<tree_code, gimple *> > &worklist,
+                         vec<chain_op_t> &chain,
+                         enum tree_code code, gimple *start,
+                         gimple *&code_stmt, gimple *&alt_code_stmt,
+                         vec<gimple *> *chain_stmts)
+{
+  /* Linearize the addition/subtraction (or other uniform associatable
+     operation) expression tree rooted at START.  */
+  worklist.safe_push (std::make_pair (code, start));
+  while (!worklist.is_empty ())
+    {
+      auto entry = worklist.pop ();
+      gassign *stmt = as_a <gassign *> (entry.second);
+      enum tree_code in_code = entry.first;
+      enum tree_code this_code = gimple_assign_rhs_code (stmt);
+      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
+      if (!code_stmt
+         && gimple_assign_rhs_code (stmt) == code)
+       code_stmt = stmt;
+      else if (!alt_code_stmt
+              && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
+       alt_code_stmt = stmt;
+      if (chain_stmts)
+       chain_stmts->safe_push (stmt);
+      for (unsigned opnum = 1; opnum <= 2; ++opnum)
+       {
+         tree op = gimple_op (stmt, opnum);
+         vect_def_type dt;
+         stmt_vec_info def_stmt_info;
+         bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
+         gcc_assert (res);
+         if (dt == vect_internal_def
+             && is_pattern_stmt_p (def_stmt_info))
+           op = gimple_get_lhs (def_stmt_info->stmt);
+         gimple *use_stmt;
+         use_operand_p use_p;
+         if (dt == vect_internal_def
+             && single_imm_use (op, &use_p, &use_stmt)
+             && is_gimple_assign (def_stmt_info->stmt)
+             && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
+                 || (code == PLUS_EXPR
+                     && (gimple_assign_rhs_code (def_stmt_info->stmt)
+                         == MINUS_EXPR))))
+           {
+             tree_code op_def_code = this_code;
+             if (op_def_code == MINUS_EXPR && opnum == 1)
+               op_def_code = PLUS_EXPR;
+             if (in_code == MINUS_EXPR)
+               op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
+             worklist.safe_push (std::make_pair (op_def_code,
+                                                 def_stmt_info->stmt));
+           }
+         else
+           {
+             tree_code op_def_code = this_code;
+             if (op_def_code == MINUS_EXPR && opnum == 1)
+               op_def_code = PLUS_EXPR;
+             if (in_code == MINUS_EXPR)
+               op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
+             chain.safe_push (chain_op_t (op_def_code, dt, op));
+           }
+       }
+    }
+}
+
 typedef hash_map <vec <stmt_vec_info>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
   scalar_stmts_to_slp_tree_map_t;
@@ -1517,6 +1592,14 @@ vect_build_slp_tree (vec_info *vinfo,
       SLP_TREE_SCALAR_STMTS (res) = vNULL;
       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
       res->failed = XNEWVEC (bool, group_size);
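+      /* Discovery postcondition (see PR102403): a failed build must
+        flag at least one lane in MATCHES so callers can act on the
+        mismatch, e.g. by splitting the group at a failed lane.  */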
+      if (flag_checking)
+       {
+         unsigned i;
+         for (i = 0; i < group_size; ++i)
+           if (!matches[i])
+             break;
+         gcc_assert (i < group_size);
+       }
       memcpy (res->failed, matches, sizeof (bool) * group_size);
     }
   else
@@ -1536,13 +1619,12 @@ vect_build_slp_tree (vec_info *vinfo,
 /* Helper for building an associated SLP node chain.  */
 
 static void
-vect_slp_build_two_operator_nodes (slp_tree perm,
+vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
                                   slp_tree op0, slp_tree op1,
                                   stmt_vec_info oper1, stmt_vec_info oper2,
                                   vec<std::pair<unsigned, unsigned> > lperm)
 {
   unsigned group_size = SLP_TREE_LANES (op1);
-  tree vectype = SLP_TREE_VECTYPE (op1);
 
   slp_tree child1 = new _slp_tree;
   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
@@ -1785,63 +1867,14 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
        {
          /* For each lane linearize the addition/subtraction (or other
             uniform associatable operation) expression tree.  */
-         worklist.safe_push (std::make_pair (code, stmts[lane]->stmt));
-         while (!worklist.is_empty ())
-           {
-             auto entry = worklist.pop ();
-             gassign *stmt = as_a <gassign *> (entry.second);
-             enum tree_code in_code = entry.first;
-             enum tree_code this_code = gimple_assign_rhs_code (stmt);
-             /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
-             if (!op_stmt_info
-                 && gimple_assign_rhs_code (stmt) == code)
-               op_stmt_info = vinfo->lookup_stmt (stmt);
-             else if (!other_op_stmt_info
-                      && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
-               other_op_stmt_info = vinfo->lookup_stmt (stmt);
-             for (unsigned opnum = 1; opnum <= 2; ++opnum)
-               {
-                 tree op = gimple_op (stmt, opnum);
-                 vect_def_type dt;
-                 stmt_vec_info def_stmt_info;
-                 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
-                 gcc_assert (res);
-                 if (dt == vect_internal_def)
-                   {
-                     def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
-                     op = gimple_get_lhs (def_stmt_info->stmt);
-                   }
-                 gimple *use_stmt;
-                 use_operand_p use_p;
-                 if (dt == vect_internal_def
-                     && single_imm_use (op, &use_p, &use_stmt)
-                     && is_gimple_assign (def_stmt_info->stmt)
-                     && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
-                         || (code == PLUS_EXPR
-                             && (gimple_assign_rhs_code (def_stmt_info->stmt)
-                                 == MINUS_EXPR))))
-                   {
-                     tree_code op_def_code = this_code;
-                     if (op_def_code == MINUS_EXPR && opnum == 1)
-                       op_def_code = PLUS_EXPR;
-                     if (in_code == MINUS_EXPR)
-                       op_def_code
-                         = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
-                     worklist.safe_push (std::make_pair (op_def_code,
-                                                         def_stmt_info->stmt));
-                   }
-                 else
-                   {
-                     tree_code op_def_code = this_code;
-                     if (op_def_code == MINUS_EXPR && opnum == 1)
-                       op_def_code = PLUS_EXPR;
-                     if (in_code == MINUS_EXPR)
-                       op_def_code
-                         = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
-                     chain.safe_push (chain_op_t (op_def_code, dt, op));
-                   }
-               }
-           }
+         gimple *op_stmt = NULL, *other_op_stmt = NULL;
+         vect_slp_linearize_chain (vinfo, worklist, chain, code,
+                                   stmts[lane]->stmt, op_stmt, other_op_stmt,
+                                   NULL);
+         if (!op_stmt_info && op_stmt)
+           op_stmt_info = vinfo->lookup_stmt (op_stmt);
+         if (!other_op_stmt_info && other_op_stmt)
+           other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
          if (chain.length () == 2)
            {
              /* In a chain of just two elements resort to the regular
@@ -1849,19 +1882,39 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                 mismatch still hard-FAIL.  */
              if (chain_len == 0)
                hard_fail = false;
+             else
+               {
+                 matches[lane] = false;
+                 /* ???  We might want to process the other lanes, but
+                    make sure to not give false matching hints to the
+                    caller for lanes we did not process.  */
+                 if (lane != group_size - 1)
+                   matches[0] = false;
+               }
              break;
            }
          else if (chain_len == 0)
            chain_len = chain.length ();
          else if (chain.length () != chain_len)
-           /* ???  Here we could slip in magic to compensate with
-              neutral operands.  */
-           break;
+           {
+             /* ???  Here we could slip in magic to compensate with
+                neutral operands.  */
+             matches[lane] = false;
+             if (lane != group_size - 1)
+               matches[0] = false;
+             break;
+           }
          chains.quick_push (chain.copy ());
          chain.truncate (0);
        }
       if (chains.length () == group_size)
        {
+         /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
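+         /* This happens when all lanes are pure MINUS_EXPR chains,
+            leaving no stmt with CODE to pick as representative.  */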
+         if (!op_stmt_info)
+           {
+             hard_fail = false;
+             goto out;
+           }
          /* Now we have a set of chains with the same length.  */
          /* 1. pre-sort according to def_type and operation.  */
          for (unsigned lane = 0; lane < group_size; ++lane)
@@ -1903,20 +1956,23 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "giving up on chain due to mismatched "
                                     "def types\n");
+                 matches[lane] = false;
+                 if (lane != group_size - 1)
+                   matches[0] = false;
                  goto out;
                }
              if (dt == vect_constant_def
                  || dt == vect_external_def)
                {
-               /* We can always build those.  Might want to sort last
-                  or defer building.  */
-                  vec<tree> ops;
-                  ops.create (group_size);
-                  for (lane = 0; lane < group_size; ++lane)
-                    ops.quick_push (chains[lane][n].op);
-                  slp_tree child = vect_create_new_slp_node (ops);
-                  SLP_TREE_DEF_TYPE (child) = dt;
-                  children.safe_push (child);
+                 /* We can always build those.  Might want to sort last
+                    or defer building.  */
+                 vec<tree> ops;
+                 ops.create (group_size);
+                 for (lane = 0; lane < group_size; ++lane)
+                   ops.quick_push (chains[lane][n].op);
+                 slp_tree child = vect_create_new_slp_node (ops);
+                 SLP_TREE_DEF_TYPE (child) = dt;
+                 children.safe_push (child);
                }
              else if (dt != vect_internal_def)
                {
@@ -1981,6 +2037,10 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "failed to match up op %d\n", n);
                      op_stmts.release ();
+                     if (lane != group_size - 1)
+                       matches[0] = false;
+                     else
+                       matches[lane] = false;
                      goto out;
                    }
                  if (dump_enabled_p ())
@@ -2065,7 +2125,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                  for (unsigned lane = 0; lane < group_size; ++lane)
                    lperm.quick_push (std::make_pair
                      (chains[lane][i].code != chains[0][i].code, lane));
-                 vect_slp_build_two_operator_nodes (child, op0, op1,
+                 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
                                                     (chains[0][i].code == code
                                                      ? op_stmt_info
                                                      : other_op_stmt_info),
@@ -2251,6 +2311,10 @@ out:
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
+         /* After swapping some operands we lost track whether an
+            operand has any pattern defs so be conservative here.  */
+         if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
+           oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
          /* And try again with scratch 'matches' ... */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
@@ -2492,6 +2556,44 @@ debug (slp_tree node)
                       node);
 }
 
+/* Recursive helper for the dot producer below.  */
+
+static void
+dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
+{
+  if (visited.add (node))
+    return;
+
+  fprintf (f, "\"%p\" [label=\"", (void *)node);
+  vect_print_slp_tree (MSG_NOTE,
+                      dump_location_t::from_location_t (UNKNOWN_LOCATION),
+                      node);
+  fprintf (f, "\"];\n");
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    dot_slp_tree (f, child, visited);
+}
+
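+/* Dump the SLP graph rooted at NODE to file FNAME in GraphViz dot
+   format, e.g. for rendering with dot -Tpdf; meant to be called from
+   the debugger.  */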
+DEBUG_FUNCTION void
+dot_slp_tree (const char *fname, slp_tree node)
+{
+  FILE *f = fopen (fname, "w");
+  fprintf (f, "digraph {\n");
+  fflush (f);
+    {
+      debug_dump_context ctx (f);
+      hash_set<slp_tree> visited;
+      dot_slp_tree (f, node, visited);
+    }
+  fflush (f);
+  fprintf (f, "}\n");
+  fclose (f);
+}
+
 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
 
 static void
@@ -3256,7 +3358,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
   else if (kind == slp_inst_kind_reduc_group)
     {
       /* Collect reduction statements.  */
-      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
+      const vec<stmt_vec_info> &reductions
+       = as_a <loop_vec_info> (vinfo)->reductions;
       scalar_stmts.create (reductions.length ());
       for (i = 0; reductions.iterate (i, &next_info); i++)
        if (STMT_VINFO_RELEVANT_P (next_info)
@@ -3407,11 +3510,29 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
   return opt_result::success ();
 }
 
+struct slpg_vertex
+{
+  slpg_vertex (slp_tree node_)
+    : node (node_), perm_in (-1), perm_out (-1) {}
+
+  int get_perm_materialized () const
+    { return perm_in != perm_out ? perm_in : 0; }
+
+  slp_tree node;
+  /* The common permutation on the incoming lanes (towards SLP children).  */
+  int perm_in;
+  /* The permutation on the outgoing lanes (towards SLP parents).  When
+     the node is a materialization point for a permute this differs
+     from perm_in (and is then usually zero).  Materialization happens
+     on the input side.  */
+  int perm_out;
+};
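+
+/* For example (illustrative): a permuted load may start out with
+   perm_in == perm_out == 1.  If propagation decides the permute has
+   to be materialized at that node, perm_out is reset to 0 while
+   perm_in stays 1, and get_perm_materialized () returns 1.  */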
+
 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
 
 static void
 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
-                        vec<slp_tree> &vertices, vec<int> &leafs)
+                        vec<slpg_vertex> &vertices, vec<int> &leafs)
 {
   unsigned i;
   slp_tree child;
@@ -3420,41 +3541,39 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
     return;
 
   node->vertex = vertices.length ();
-  vertices.safe_push (node);
+  vertices.safe_push (slpg_vertex (node));
 
   bool leaf = true;
+  bool force_leaf = false;
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     if (child)
       {
        leaf = false;
        vect_slp_build_vertices (visited, child, vertices, leafs);
       }
-  if (leaf)
+    else
+      force_leaf = true;
+  /* Since SLP discovery works along use-def edges all cycles have an
+     entry - but there's the exception of cycles where we do not handle
+     the entry explicitly (but with a NULL SLP node), like some reductions
+     and inductions.  Force those SLP PHIs to act as leafs to make them
+     backwards reachable.  */
+  if (leaf || force_leaf)
     leafs.safe_push (node->vertex);
 }
 
 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
 
 static void
-vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
+vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
                         vec<int> &leafs)
 {
   hash_set<slp_tree> visited;
   unsigned i;
   slp_instance instance;
   FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
-    {
-      unsigned n_v = vertices.length ();
-      unsigned n_l = leafs.length ();
-      vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
-                              leafs);
-      /* If we added vertices but no entries to the reverse graph we've
-        added a cycle that is not backwards-reachable.   Push the entry
-        to mimic as leaf then.  */
-      if (vertices.length () > n_v
-         && leafs.length () == n_l)
-       leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
-    }
+    vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
+                            leafs);
 }
 
 /* Apply (reverse) bijective PERM to VEC.  */
@@ -3493,7 +3612,8 @@ vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
                   int perm_a, int perm_b)
 {
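+  /* A permute index of -1 stands for "any permutation"; it compares
+     equal only to itself.  */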
   return (perm_a == perm_b
-         || (perms[perm_a].length () == perms[perm_b].length ()
+         || (perm_a != -1 && perm_b != -1
+             && perms[perm_a].length () == perms[perm_b].length ()
              && memcmp (&perms[perm_a][0], &perms[perm_b][0],
                         sizeof (unsigned) * perms[perm_a].length ()) == 0));
 }
@@ -3508,50 +3628,45 @@ vect_optimize_slp (vec_info *vinfo)
 
   slp_tree node;
   unsigned i;
-  auto_vec<slp_tree> vertices;
+  auto_vec<slpg_vertex> vertices;
   auto_vec<int> leafs;
   vect_slp_build_vertices (vinfo, vertices, leafs);
 
   struct graph *slpg = new_graph (vertices.length ());
-  FOR_EACH_VEC_ELT (vertices, i, node)
-    {
-      unsigned j;
-      slp_tree child;
-      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-       if (child)
-         add_edge (slpg, i, child->vertex);
-    }
+  for (slpg_vertex &v : vertices)
+    for (slp_tree child : SLP_TREE_CHILDREN (v.node))
+      if (child)
+       add_edge (slpg, v.node->vertex, child->vertex);
 
   /* Compute (reverse) postorder on the inverted graph.  */
   auto_vec<int> ipo;
   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
 
-  auto_sbitmap n_visited (vertices.length ());
-  auto_sbitmap n_materialize (vertices.length ());
-  auto_vec<int> n_perm (vertices.length ());
   auto_vec<vec<unsigned> > perms;
-
-  bitmap_clear (n_visited);
-  bitmap_clear (n_materialize);
-  n_perm.quick_grow_cleared (vertices.length ());
   perms.safe_push (vNULL); /* zero is no permute */
 
   /* Produce initial permutations.  */
   for (i = 0; i < leafs.length (); ++i)
     {
       int idx = leafs[i];
-      slp_tree node = vertices[idx];
+      slp_tree node = vertices[idx].node;
 
       /* Handle externals and constants optimistically throughout the
-        iteration.  */
-      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
+        iteration.  But treat existing vectors as fixed since we
+        do not handle permuting them below.  */
+      if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
+          && !SLP_TREE_VEC_DEFS (node).exists ())
          || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
        continue;
 
       /* Leafs do not change across iterations.  Note leafs also double
         as entries to the reverse graph.  */
       if (!slpg->vertices[idx].succ)
-       bitmap_set_bit (n_visited, idx);
+       {
+         vertices[idx].perm_in = 0;
+         vertices[idx].perm_out = 0;
+       }
+
       /* Loads are the only thing generating permutes.  */
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
        continue;
@@ -3600,104 +3715,185 @@ vect_optimize_slp (vec_info *vinfo)
       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
        perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
       perms.safe_push (perm);
-      n_perm[idx] = perms.length () - 1;
+      vertices[idx].perm_in = perms.length () - 1;
+      vertices[idx].perm_out = perms.length () - 1;
     }
 
+  /* In addition to the above we have to mark as to be materialized
+     the outgoing permutes facing non-reduction graph entries that
+     are not represented in the graph.  */
+  for (slp_instance instance : vinfo->slp_instances)
+    if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
+      {
+       /* Just setting perm_out isn't enough for the propagation to
+          pick this up.  */
+       vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
+       vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
+      }
+
   /* Propagate permutes along the graph and compute materialization points.  */
   bool changed;
+  bool do_materialization = false;
   unsigned iteration = 0;
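+  /* The first phase propagates permutes optimistically to a fixed
+     point; only then DO_MATERIALIZATION is switched on and propagation
+     re-run to decide and prune at materialization points.  */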
   do
     {
       changed = false;
       ++iteration;
 
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "SLP optimize iteration %d\n", iteration);
+
       for (i = vertices.length (); i > 0 ; --i)
        {
          int idx = ipo[i-1];
-         slp_tree node = vertices[idx];
-         /* For leafs there's nothing to do - we've seeded permutes
-            on those above.  */
-         if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
-           continue;
+         slp_tree node = vertices[idx].node;
 
-         bitmap_set_bit (n_visited, idx);
-
-         /* We cannot move a permute across a store.  */
-         if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
-             && DR_IS_WRITE
-                  (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
+         /* Handle externals and constants optimistically throughout the
+            iteration.  */
+         if (SLP_TREE_DEF_TYPE (node) == vect_external_def
+             || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
            continue;
 
-         int perm = -1;
-         for (graph_edge *succ = slpg->vertices[idx].succ;
-              succ; succ = succ->succ_next)
+         /* We still eventually have failed backedge SLP nodes in the
+            graph; those are only cancelled when analyzing operations.
+            Simply treat them as transparent ops, propagating permutes
+            through them.  */
+         if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
            {
-             int succ_idx = succ->dest;
-             /* Handle unvisited nodes optimistically.  */
-             /* ???  But for constants once we want to handle non-bijective
-                permutes we have to verify the permute, when unifying lanes,
-                will not unify different constants.  For example see
-                gcc.dg/vect/bb-slp-14.c for a case that would break.  */
-             if (!bitmap_bit_p (n_visited, succ_idx))
-               continue;
-             int succ_perm = n_perm[succ_idx];
-             /* Once we materialize succs permutation its output lanes
-                appear unpermuted to us.  */
-             if (bitmap_bit_p (n_materialize, succ_idx))
-               succ_perm = 0;
-             if (perm == -1)
-               perm = succ_perm;
-             else if (succ_perm == 0)
-               {
-                 perm = 0;
-                 break;
-               }
-             else if (!vect_slp_perms_eq (perms, perm, succ_perm))
+             /* We do not handle stores with a permutation, so all
+                incoming permutes must have been materialized.  */
+             stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
+             if (STMT_VINFO_DATA_REF (rep)
+                 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
                {
-                 perm = 0;
-                 break;
+                 /* ???  We're forcing materialization in place
+                    of the child here; we'd need special handling
+                    in materialization to leave perm_in -1 here.  */
+                 vertices[idx].perm_in = 0;
+                 vertices[idx].perm_out = 0;
                }
+             /* We cannot move a permute across an operation that is
+                not independent of the lanes.  Note this is an explicit
+                negative list since that's much shorter than the
+                respective positive one, but it's critical to keep
+                maintaining it.  */
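+             /* For example CFN_COMPLEX_MUL treats consecutive lane
+                pairs as real and imaginary parts; permuting lanes
+                across it would break that pairing.  */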
+             if (is_gimple_call (STMT_VINFO_STMT (rep)))
+               switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
+                 {
+                 case CFN_COMPLEX_ADD_ROT90:
+                 case CFN_COMPLEX_ADD_ROT270:
+                 case CFN_COMPLEX_MUL:
+                 case CFN_COMPLEX_MUL_CONJ:
+                 case CFN_VEC_ADDSUB:
+                 case CFN_VEC_FMADDSUB:
+                 case CFN_VEC_FMSUBADD:
+                   vertices[idx].perm_in = 0;
+                   vertices[idx].perm_out = 0;
+                 default:;
+                 }
            }
 
-         if (perm == -1)
+         if (!slpg->vertices[idx].succ)
            /* Pick up pre-computed leaf values.  */
-           perm = n_perm[idx];
-         else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
+           ;
+         else
            {
-             if (iteration > 1)
-               /* Make sure we eventually converge.  */
-               gcc_checking_assert (perm == 0);
-             n_perm[idx] = perm;
-             if (perm == 0)
-               bitmap_clear_bit (n_materialize, idx);
-             changed = true;
+             bool any_succ_perm_out_m1 = false;
+             int perm_in = vertices[idx].perm_in;
+             for (graph_edge *succ = slpg->vertices[idx].succ;
+                  succ; succ = succ->succ_next)
+               {
+                 int succ_idx = succ->dest;
+                 int succ_perm = vertices[succ_idx].perm_out;
+                 /* Handle unvisited (and constant) nodes optimistically.  */
+                 /* ???  But for constants once we want to handle
+                    non-bijective permutes we have to verify the permute,
+                    when unifying lanes, will not unify different constants.
+                    For example see gcc.dg/vect/bb-slp-14.c for a case
+                    that would break.  */
+                 if (succ_perm == -1)
+                   {
+                     /* When we handled a non-leaf optimistically, note
+                        that so we can adjust its outgoing permute below.  */
+                     slp_tree succ_node = vertices[succ_idx].node;
+                     if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
+                       any_succ_perm_out_m1 = true;
+                     continue;
+                   }
+                 if (perm_in == -1)
+                   perm_in = succ_perm;
+                 else if (succ_perm == 0
+                          || !vect_slp_perms_eq (perms, perm_in, succ_perm))
+                   {
+                     perm_in = 0;
+                     break;
+                   }
+               }
+
+             /* Adjust any incoming permutes we treated optimistically.  */
+             if (perm_in != -1 && any_succ_perm_out_m1)
+               {
+                 for (graph_edge *succ = slpg->vertices[idx].succ;
+                      succ; succ = succ->succ_next)
+                   {
+                     slp_tree succ_node = vertices[succ->dest].node;
+                     if (vertices[succ->dest].perm_out == -1
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
+                       {
+                         vertices[succ->dest].perm_out = perm_in;
+                         /* And ensure this propagates.  */
+                         if (vertices[succ->dest].perm_in == -1)
+                           vertices[succ->dest].perm_in = perm_in;
+                       }
+                   }
+                 changed = true;
+               }
+
+             if (!vect_slp_perms_eq (perms, perm_in,
+                                     vertices[idx].perm_in))
+               {
+                 /* Make sure we eventually converge.  */
+                 gcc_checking_assert (vertices[idx].perm_in == -1
+                                      || perm_in == 0);
+                 vertices[idx].perm_in = perm_in;
+
+                 /* While we can handle VEC_PERM nodes as transparent
+                    pass-through they can also be a cheap materialization
+                    point.  In addition they can act as the source of an
+                    arbitrary permutation.
+                    The following ensures that former materialization
+                    points that now have zero incoming permutes no
+                    longer appear as such and that former "any" permutes
+                    become pass-through.  We keep VEC_PERM nodes
+                    optimistic as "any" outgoing permute though.  */
+                 if (vertices[idx].perm_out != 0
+                     && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
+                   vertices[idx].perm_out = perm_in;
+                 changed = true;
+               }
            }
 
-         if (perm == 0)
+         /* Elide pruning at materialization points in the first
+            iteration phase.  */
+         if (!do_materialization)
            continue;
 
-         /* Elide pruning at materialization points in the first
-            iteration so every node was visited once at least.  */
-         if (iteration == 1)
+         int perm = vertices[idx].perm_out;
+         if (perm == 0 || perm == -1)
            continue;
 
          /* Decide on permute materialization.  Look whether there's
             a use (pred) edge that is permuted differently than us.
-            In that case mark ourselves so the permutation is applied.
-            For VEC_PERM_EXPRs the permutation doesn't carry along
-            from children to parents so force materialization at the
-            point of the VEC_PERM_EXPR.  In principle VEC_PERM_EXPRs
-            are a source of an arbitrary permutation again, similar
-            to constants/externals - that's something we do not yet
-            optimally handle.  */
-         bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
-                                    && slpg->vertices[idx].pred != NULL);
+            In that case mark ourselves so the permutation is applied.  */
+         bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
          if (all_preds_permuted)
            for (graph_edge *pred = slpg->vertices[idx].pred;
                 pred; pred = pred->pred_next)
              {
-               gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
-               int pred_perm = n_perm[pred->src];
+               int pred_perm = vertices[pred->src].perm_in;
+               gcc_checking_assert (pred_perm != -1);
                if (!vect_slp_perms_eq (perms, perm, pred_perm))
                  {
                    all_preds_permuted = false;
@@ -3706,55 +3902,62 @@ vect_optimize_slp (vec_info *vinfo)
              }
          if (!all_preds_permuted)
            {
-             if (!bitmap_bit_p (n_materialize, idx))
-               changed = true;
-             bitmap_set_bit (n_materialize, idx);
+             vertices[idx].perm_out = 0;
+             changed = true;
            }
        }
+
+      /* If the initial propagation converged, switch on materialization
+        and re-propagate.  */
+      if (!changed && !do_materialization)
+       {
+         do_materialization = true;
+         changed = true;
+       }
     }
-  while (changed || iteration == 1);
+  while (changed);
+  statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
 
   /* Materialize.  */
   for (i = 0; i < vertices.length (); ++i)
     {
-      int perm = n_perm[i];
-      if (perm <= 0)
-       continue;
-
-      slp_tree node = vertices[i];
+      int perm_in = vertices[i].perm_in;
+      slp_tree node = vertices[i].node;
 
-      /* First permute invariant/external original successors.  */
+      /* First permute invariant/external original successors; we handle
+        those optimistically during propagation and duplicate them if
+        they are used with different permutations.  */
       unsigned j;
       slp_tree child;
-      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-       {
-         if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
-           continue;
+      if (perm_in > 0)
+       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+         {
+           if (!child
+               || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
+                   && SLP_TREE_DEF_TYPE (child) != vect_external_def))
+             continue;
 
-         /* If the vector is uniform there's nothing to do.  */
-         if (vect_slp_tree_uniform_p (child))
-           continue;
+           /* If the vector is uniform there's nothing to do.  */
+           if (vect_slp_tree_uniform_p (child))
+             continue;
 
-         /* We can end up sharing some externals via two_operator
-            handling.  Be prepared to unshare those.  */
-         if (child->refcnt != 1)
-           {
-             gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
-             SLP_TREE_CHILDREN (node)[j] = child
-               = vect_create_new_slp_node
-                   (SLP_TREE_SCALAR_OPS (child).copy ());
-           }
-         vect_slp_permute (perms[perm],
-                           SLP_TREE_SCALAR_OPS (child), true);
-       }
+           /* We can end up sharing some externals via two_operator
+              handling.  Be prepared to unshare those.  */
+           if (child->refcnt != 1)
+             {
+               gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
+               SLP_TREE_CHILDREN (node)[j] = child
+                 = vect_create_new_slp_node
+                     (SLP_TREE_SCALAR_OPS (child).copy ());
+             }
+           vect_slp_permute (perms[perm_in],
+                             SLP_TREE_SCALAR_OPS (child), true);
+         }
 
-      if (bitmap_bit_p (n_materialize, i))
+      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
        {
-         if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
-           /* For loads simply drop the permutation, the load permutation
-              already performs the desired permutation.  */
-           ;
-         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+         /* Apply the common permutes to the input vectors.  */
+         if (perm_in > 0)
            {
              /* If the node is already a permute node we can apply
                 the permutation to the lane selection, effectively
@@ -3763,12 +3966,30 @@ vect_optimize_slp (vec_info *vinfo)
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "simplifying permute node %p\n",
                                 node);
-
              for (unsigned k = 0;
                   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
                SLP_TREE_LANE_PERMUTATION (node)[k].second
-                 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
+                 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
            }
+         /* Apply the anticipated output permute to the permute and
+            stmt vectors.  */
+         int perm_out = vertices[i].perm_out;
+         if (perm_out > 0)
+           {
+             vect_slp_permute (perms[perm_out],
+                               SLP_TREE_SCALAR_STMTS (node), true);
+             vect_slp_permute (perms[perm_out],
+                               SLP_TREE_LANE_PERMUTATION (node), true);
+           }
+       }
+      else if (vertices[i].get_perm_materialized () != 0)
+       {
+         if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+           /* For loads simply drop the permutation, the load permutation
+              already performs the desired permutation.  */
+           ;
+         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+           gcc_unreachable ();
          else
            {
              if (dump_enabled_p ())
@@ -3783,7 +4004,7 @@ vect_optimize_slp (vec_info *vinfo)
              SLP_TREE_CHILDREN (node) = vNULL;
              SLP_TREE_SCALAR_STMTS (copy)
                = SLP_TREE_SCALAR_STMTS (node).copy ();
-             vect_slp_permute (perms[perm],
+             vect_slp_permute (perms[perm_in],
                                SLP_TREE_SCALAR_STMTS (copy), true);
              gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
              SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
@@ -3803,28 +4024,77 @@ vect_optimize_slp (vec_info *vinfo)
              SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
              for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
                SLP_TREE_LANE_PERMUTATION (node)
-                 .quick_push (std::make_pair (0, perms[perm][j]));
+                 .quick_push (std::make_pair (0, perms[perm_in][j]));
              SLP_TREE_CODE (node) = VEC_PERM_EXPR;
            }
        }
-      else
+      else if (perm_in > 0) /* perm_in == perm_out */
        {
          /* Apply the reverse permutation to our stmts.  */
-         vect_slp_permute (perms[perm],
+         vect_slp_permute (perms[perm_in],
                            SLP_TREE_SCALAR_STMTS (node), true);
-         /* And to the load permutation, which we can simply
+         /* And to the lane/load permutation, which we can simply
             make regular by design.  */
          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
            {
+             gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
              /* ???  When we handle non-bijective permutes the idea
                 is that we can force the load-permutation to be
                 { min, min + 1, min + 2, ... max }.  But then the
                 scalar defs might no longer match the lane content
                 which means wrong-code with live lane vectorization.
                 So we possibly have to have NULL entries for those.  */
-             vect_slp_permute (perms[perm],
+             vect_slp_permute (perms[perm_in],
                                SLP_TREE_LOAD_PERMUTATION (node), true);
            }
+         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+           gcc_unreachable ();
+       }
+    }
+
+  /* Elide any permutations at BB reduction roots.  */
+  if (is_a <bb_vec_info> (vinfo))
+    {
+      for (slp_instance instance : vinfo->slp_instances)
+       {
+         if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
+           continue;
+         slp_tree old = SLP_INSTANCE_TREE (instance);
+         if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
+             && SLP_TREE_CHILDREN (old).length () == 1)
+           {
+             slp_tree child = SLP_TREE_CHILDREN (old)[0];
+             if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
+               {
+                 /* Preserve the special VEC_PERM we use to shield existing
+                    vector defs from the rest.  But make it a no-op.  */
+                 unsigned i = 0;
+                 for (std::pair<unsigned, unsigned> &p
+                      : SLP_TREE_LANE_PERMUTATION (old))
+                   p.second = i++;
+               }
+             else
+               {
+                 SLP_INSTANCE_TREE (instance) = child;
+                 SLP_TREE_REF_COUNT (child)++;
+                 vect_free_slp_tree (old);
+               }
+           }
+         else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
+                  && SLP_TREE_REF_COUNT (old) == 1
+                  && vertices[old->vertex].get_perm_materialized () != 0)
+           {
+             /* ???  For loads the situation is more complex since
+                we can't modify the permute in place in case the
+                node is used multiple times.  In fact for loads this
+                should be somehow handled in the propagation engine.  */
+             /* Apply the reverse permutation to our stmts.  */
+             int perm = vertices[old->vertex].get_perm_materialized ();
+             vect_slp_permute (perms[perm],
+                               SLP_TREE_SCALAR_STMTS (old), true);
+             vect_slp_permute (perms[perm],
+                               SLP_TREE_LOAD_PERMUTATION (old), true);
+           }
        }
     }
 
@@ -3837,7 +4107,7 @@ vect_optimize_slp (vec_info *vinfo)
   /* Now elide load permutations that are not necessary.  */
   for (i = 0; i < leafs.length (); ++i)
     {
-      node = vertices[leafs[i]];
+      node = vertices[leafs[i]].node;
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
        continue;
 
@@ -3921,7 +4191,8 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
 {
   unsigned int i;
   poly_uint64 unrolling_factor = 1;
-  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+  const vec<slp_instance> &slp_instances
+    = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
   slp_instance instance;
   int decided_to_slp = 0;
 
@@ -4471,7 +4742,6 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
   return res;
 }
 
-
 /* Mark lanes of NODE that are live outside of the basic-block vectorized
    region and that can be vectorized using vectorizable_live_operation
    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
@@ -4575,6 +4845,59 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
                                   cost_vec, svisited, visited);
 }
 
+/* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
+
+static bool
+vectorizable_bb_reduc_epilogue (slp_instance instance,
+                               stmt_vector_for_cost *cost_vec)
+{
+  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
+  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
+  if (reduc_code == MINUS_EXPR)
+    reduc_code = PLUS_EXPR;
+  internal_fn reduc_fn;
+  tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
+  if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
+      || reduc_fn == IFN_LAST
+      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
+      || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
+                                    TREE_TYPE (vectype)))
+    return false;
+
+  /* There's no way to cost a horizontal vector reduction via REDUC_FN so
+     cost log2 vector operations plus shuffles and one extraction.  */
+  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
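+  /* E.g. for a 4-lane vector: two vector ops, two shuffles and one
+     lane extraction.  */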
+  record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
+                   vectype, 0, vect_body);
+  record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
+                   vectype, 0, vect_body);
+  record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
+                   vectype, 0, vect_body);
+  return true;
+}
+
+/* Prune from ROOTS all stmts that are computed as part of lanes of NODE
+   and recurse to children.  */
+
+static void
+vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
+                             hash_set<slp_tree> &visited)
+{
+  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
+      || visited.add (node))
+    return;
+
+  stmt_vec_info stmt;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
+    roots.remove (vect_orig_stmt (stmt));
+
+  slp_tree child;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    if (child)
+      vect_slp_prune_covered_roots (child, roots, visited);
+}
+
 /* Analyze statements in SLP instances of VINFO.  Return true if the
    operations are supported. */
 
@@ -4598,15 +4921,20 @@ vect_slp_analyze_operations (vec_info *vinfo)
                                             SLP_INSTANCE_TREE (instance),
                                             instance, visited, visited_vec,
                                             &cost_vec)
-         /* Instances with a root stmt require vectorized defs for the
-            SLP tree root.  */
-         /* ???  Do inst->kind check instead.  */
-         || (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
+         /* CTOR instances require vectorized defs for the SLP tree root.  */
+         || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
              && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
-                 != vect_internal_def)))
+                 != vect_internal_def))
+         /* Check we can vectorize the reduction.  */
+         || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
+             && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
         {
          slp_tree node = SLP_INSTANCE_TREE (instance);
-         stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+         stmt_vec_info stmt_info;
+         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
+           stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
+         else
+           stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "removing SLP instance operations starting from: %G",
@@ -4633,6 +4961,34 @@ vect_slp_analyze_operations (vec_info *vinfo)
        }
     }
 
+  /* Now look for SLP instances with a root that are covered by other
+     instances and remove them.  */
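+  /* That is, instances whose root stmt is itself computed as a lane
+     of some other instance's SLP tree.  */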
+  hash_set<stmt_vec_info> roots;
+  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
+    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
+      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
+  if (!roots.is_empty ())
+    {
+      visited.empty ();
+      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
+       vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
+                                     visited);
+      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
+       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
+           && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
+         {
+           stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "removing SLP instance operations starting "
+                              "from: %G", root->stmt);
+           vect_free_slp_instance (instance);
+           vinfo->slp_instances.ordered_remove (i);
+         }
+       else
+         ++i;
+    }
+
   /* Compute vectorizable live stmts.  */
   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
     {
@@ -4748,6 +5104,42 @@ vect_bb_partition_graph (bb_vec_info bb_vinfo)
     }
 }
 
+/* Compute in VSTMTS the set of scalar stmts participating in internal
+   (vectorized) nodes reachable from NODE and in ESTMTS the set of
+   scalar stmts defining operands of external nodes.  */
+
+static void
+vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
+                                        hash_set<slp_tree> &visited,
+                                        hash_set<stmt_vec_info> &vstmts,
+                                        hash_set<stmt_vec_info> &estmts)
+{
+  int i;
+  stmt_vec_info stmt_info;
+  slp_tree child;
+
+  if (visited.add (node))
+    return;
+
+  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
+    {
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+       vstmts.add (stmt_info);
+
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+       if (child)
+         vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
+                                                  vstmts, estmts);
+    }
+  else
+    for (tree def : SLP_TREE_SCALAR_OPS (node))
+      {
+       stmt_vec_info def_stmt = vinfo->lookup_def (def);
+       if (def_stmt)
+         estmts.add (def_stmt);
+      }
+}
+
 /* Compute the scalar cost of the SLP node NODE and its children
    and return it.  Do not account defs that are marked in LIFE and
    update LIFE according to uses of NODE.  */
@@ -4756,6 +5148,7 @@ static void
 vect_bb_slp_scalar_cost (vec_info *vinfo,
                         slp_tree node, vec<bool, va_heap> *life,
                         stmt_vector_for_cost *cost_vec,
+                        hash_set<stmt_vec_info> &vectorized_scalar_stmts,
                         hash_set<slp_tree> &visited)
 {
   unsigned i;
@@ -4792,8 +5185,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
                  {
                    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
                    if (!use_stmt_info
-                       || !PURE_SLP_STMT
-                             (vect_stmt_to_vectorize (use_stmt_info)))
+                       || !vectorized_scalar_stmts.contains (use_stmt_info))
                      {
                        (*life)[i] = true;
                        break;
@@ -4856,7 +5248,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
              subtree_life.safe_splice (*life);
            }
          vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
-                                  visited);
+                                  vectorized_scalar_stmts, visited);
          subtree_life.truncate (0);
        }
     }
@@ -4881,7 +5273,8 @@ li_cost_vec_cmp (const void *a_, const void *b_)
 
 static bool
 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
-                                   vec<slp_instance> slp_instances)
+                                   vec<slp_instance> slp_instances,
+                                   loop_p orig_loop)
 {
   slp_instance instance;
   int i;
@@ -4897,11 +5290,33 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                              SLP_INSTANCE_TREE (instance), visited);
     }
 
+  /* Compute the set of scalar stmts we know will go away 'locally' when
+     vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
+     not accurate for nodes promoted extern late or for scalar stmts that
+     are used both in extern defs and in vectorized defs.  */
+  hash_set<stmt_vec_info> vectorized_scalar_stmts;
+  hash_set<stmt_vec_info> scalar_stmts_in_externs;
+  hash_set<slp_tree> visited;
+  FOR_EACH_VEC_ELT (slp_instances, i, instance)
+    {
+      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
+                                              SLP_INSTANCE_TREE (instance),
+                                              visited,
+                                              vectorized_scalar_stmts,
+                                              scalar_stmts_in_externs);
+      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
+       vectorized_scalar_stmts.add (rstmt);
+    }
+  /* Scalar stmts used as defs in external nodes need to be preserved, so
+     remove them from vectorized_scalar_stmts.  */
+  for (stmt_vec_info stmt : scalar_stmts_in_externs)
+    vectorized_scalar_stmts.remove (stmt);
+
   /* Calculate scalar cost and sum the cost for the vector stmts
      previously collected.  */
   stmt_vector_for_cost scalar_costs = vNULL;
   stmt_vector_for_cost vector_costs = vNULL;
-  hash_set<slp_tree> visited;
+  visited.empty ();
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       auto_vec<bool, 20> life;
@@ -4914,14 +5329,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                          SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
       vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
-                              &life, &scalar_costs, visited);
+                              &life, &scalar_costs, vectorized_scalar_stmts,
+                              visited);
       vector_costs.safe_splice (instance->cost_vec);
       instance->cost_vec.release ();
     }
-  /* Unset visited flag.  */
-  stmt_info_for_cost *cost;
-  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
-    gimple_set_visited  (cost->stmt_info->stmt, false);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
@@ -4938,6 +5350,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
     li_scalar_costs (scalar_costs.length ());
   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
     li_vector_costs (vector_costs.length ());
+  stmt_info_for_cost *cost;
   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
     {
       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
@@ -4960,6 +5373,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
   /* Now cost the portions individually.  */
   unsigned vi = 0;
   unsigned si = 0;
+  bool profitable = true;
   while (si < li_scalar_costs.length ()
         && vi < li_vector_costs.length ())
     {
@@ -5026,25 +5440,29 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
         example).  */
       if (vec_outside_cost + vec_inside_cost > scalar_cost)
        {
-         scalar_costs.release ();
-         vector_costs.release ();
-         return false;
+         profitable = false;
+         break;
        }
     }
-  if (vi < li_vector_costs.length ())
+  if (profitable && vi < li_vector_costs.length ())
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Excess vector cost for part in loop %d:\n",
                         li_vector_costs[vi].first);
-      scalar_costs.release ();
-      vector_costs.release ();
-      return false;
+      profitable = false;
     }
 
+  /* Unset the visited flag.  Clearing is delayed when the subgraph is
+     profitable and we still have to process the loop for remaining
+     unvectorized if-converted code.  */
+  if (!orig_loop || !profitable)
+    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+      gimple_set_visited (cost->stmt_info->stmt, false);
+
   scalar_costs.release ();
   vector_costs.release ();
-  return true;
+
+  return profitable;
 }
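(A hedged sketch of the per-part decision made above; the struct and the
predicate are invented for illustration, not GCC API.)

    struct part_cost
    {
      int scalar_cost;        /* cost of the scalar stmts in this part */
      int vec_inside_cost;    /* vector cost inside the part           */
      int vec_outside_cost;   /* prologue/epilogue vector cost         */
    };

    /* A part stays profitable only when vectorizing it is no more
       expensive than the scalar code it replaces.  */
    static bool
    part_profitable_p (const part_cost &p)
    {
      return p.vec_outside_cost + p.vec_inside_cost <= p.scalar_cost;
    }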
 
 /* qsort comparator for lane defs.  */
@@ -5094,7 +5512,10 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
        continue;
 
       tree rhs = gimple_assign_rhs1 (assign);
-      if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
+      enum tree_code code = gimple_assign_rhs_code (assign);
+      use_operand_p use_p;
+      gimple *use_stmt;
+      if (code == CONSTRUCTOR)
        {
          if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
              || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
@@ -5115,7 +5536,7 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
          stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
          BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
        }
-      else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
+      else if (code == BIT_INSERT_EXPR
               && VECTOR_TYPE_P (TREE_TYPE (rhs))
               && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
               && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
@@ -5209,6 +5630,69 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
          else
            roots.release ();
        }
+      else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
+              && (associative_tree_code (code) || code == MINUS_EXPR)
+              /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
+                 checks pessimize a two-element reduction.  PR54400.
+                 ???  In-order reduction could be handled if we only
+                 traverse one operand chain in vect_slp_linearize_chain.  */
+              && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
+                  || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
+                      && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
+              /* Ops with constants at the tail can be stripped here.  */
+              && TREE_CODE (rhs) == SSA_NAME
+              && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
+              /* Should be the chain end.  */
+              && (!single_imm_use (gimple_assign_lhs (assign),
+                                   &use_p, &use_stmt)
+                  || !is_gimple_assign (use_stmt)
+                  || (gimple_assign_rhs_code (use_stmt) != code
+                      && ((code != PLUS_EXPR && code != MINUS_EXPR)
+                          || (gimple_assign_rhs_code (use_stmt)
+                              != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
+       {
+         /* We start the match at the end of a possible association
+            chain.  */
+         auto_vec<chain_op_t> chain;
+         auto_vec<std::pair<tree_code, gimple *> > worklist;
+         auto_vec<gimple *> chain_stmts;
+         gimple *code_stmt = NULL, *alt_code_stmt = NULL;
+         if (code == MINUS_EXPR)
+           code = PLUS_EXPR;
+         internal_fn reduc_fn;
+         if (!reduction_fn_for_scalar_code (code, &reduc_fn)
+             || reduc_fn == IFN_LAST)
+           continue;
+         vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
+                                   /* ??? */
+                                   code_stmt, alt_code_stmt, &chain_stmts);
+         if (chain.length () > 1)
+           {
+             /* Sort the chain according to def_type and operation.  */
+             chain.sort (dt_sort_cmp, bb_vinfo);
+             /* ???  Now we'd want to strip externals and constants
+                but record those to be handled in the epilogue.  */
+             /* ???  For now do not allow mixing ops or externs/constants.  */
+             bool invalid = false;
+             for (unsigned i = 0; i < chain.length (); ++i)
+               if (chain[i].dt != vect_internal_def
+                   || chain[i].code != code)
+                 invalid = true;
+             if (!invalid)
+               {
+                 vec<stmt_vec_info> stmts;
+                 stmts.create (chain.length ());
+                 for (unsigned i = 0; i < chain.length (); ++i)
+                   stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
+                 vec<stmt_vec_info> roots;
+                 roots.create (chain_stmts.length ());
+                 for (unsigned i = 0; i < chain_stmts.length (); ++i)
+                   roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
+                 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
+                                                      stmts, roots));
+               }
+           }
+       }
     }
 }
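(As a concrete illustration, a source pattern the detection above can turn
into an slp_inst_kind_bb_reduc root; unsigned arithmetic is used so the
TYPE_OVERFLOW_WRAPS requirement holds.)

    unsigned
    sum4 (const unsigned *a)
    {
      /* Linearized as a PLUS_EXPR chain whose end is the final
         addition; the four loads become the lanes of the SLP node.  */
      return a[0] + a[1] + a[2] + a[3];
    }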
 
@@ -5392,7 +5876,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
 
 static bool
 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
-                vec<int> *dataref_groups, unsigned int n_stmts)
+                vec<int> *dataref_groups, unsigned int n_stmts,
+                loop_p orig_loop)
 {
   bb_vec_info bb_vinfo;
   auto_vector_modes vector_modes;
@@ -5431,9 +5916,8 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
 
          bb_vinfo->shared->check_datarefs ();
 
-         unsigned i;
-         slp_instance instance;
-         FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
+         auto_vec<slp_instance> profitable_subgraphs;
+         for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
            {
              if (instance->subgraph_entries.is_empty ())
                continue;
@@ -5441,7 +5925,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
              vect_location = instance->location ();
              if (!unlimited_cost_model (NULL)
                  && !vect_bb_vectorization_profitable_p
-                       (bb_vinfo, instance->subgraph_entries))
+                       (bb_vinfo, instance->subgraph_entries, orig_loop))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -5453,6 +5937,47 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
              if (!dbg_cnt (vect_slp))
                continue;
 
+             profitable_subgraphs.safe_push (instance);
+           }
+
+         /* When we're vectorizing an if-converted loop body with the
+            very-cheap cost model, make sure we vectorized all if-converted
+            code.  */
+         if (!profitable_subgraphs.is_empty ()
+             && orig_loop)
+           {
+             gcc_assert (bb_vinfo->bbs.length () == 1);
+             for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
+                  !gsi_end_p (gsi); gsi_next (&gsi))
+               {
+                 /* The costing above left us with DCEable vectorized scalar
+                    stmts having the visited flag set on profitable
+                    subgraphs.  Do the delayed clearing of the flag here.  */
+                 if (gimple_visited_p (gsi_stmt (gsi)))
+                   {
+                     gimple_set_visited (gsi_stmt (gsi), false);
+                     continue;
+                   }
+                 if (flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
+                   continue;
+
+                 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
+                   if (gimple_assign_rhs_code (ass) == COND_EXPR)
+                     {
+                       if (!profitable_subgraphs.is_empty ()
+                           && dump_enabled_p ())
+                         dump_printf_loc (MSG_NOTE, vect_location,
+                                          "not profitable because of "
+                                          "unprofitable if-converted scalar "
+                                          "code\n");
+                       profitable_subgraphs.truncate (0);
+                     }
+               }
+           }
+
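(Illustrative C, assuming the usual if-conversion pipeline: the guarded
store below is rewritten into a COND_EXPR in the single if-converted BB,
which is exactly the kind of stmt the scan above looks for.)

    void
    f (int *__restrict x, const int *__restrict a, int n)
    {
      for (int i = 0; i < n; ++i)
        if (a[i] > 0)    /* if-conversion turns this into a COND_EXPR */
          x[i] = a[i];
    }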
+         /* Finally schedule the profitable subgraphs.  */
+         for (slp_instance instance : profitable_subgraphs)
+           {
              if (!vectorized && dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Basic block will be vectorized "
@@ -5541,7 +6066,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
    true if anything in the basic-block was vectorized.  */
 
 static bool
-vect_slp_bbs (vec<basic_block> bbs)
+vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
 {
   vec<data_reference_p> datarefs = vNULL;
   auto_vec<int> dataref_groups;
@@ -5567,20 +6092,24 @@ vect_slp_bbs (vec<basic_block> bbs)
                                              &dataref_groups, current_group))
            ++current_group;
        }
+      /* New BBs always start a new DR group.  */
+      ++current_group;
     }
 
-  return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
+  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
 }
 
-/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
-   true if anything in the basic-block was vectorized.  */
+/* Special entry for the BB vectorizer.  Analyze and transform a single
+   if-converted BB with ORIG_LOOP's body being the not-if-converted
+   representation.  Returns true if anything in the basic-block was
+   vectorized.  */
 
 bool
-vect_slp_bb (basic_block bb)
+vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
 {
   auto_vec<basic_block> bbs;
   bbs.safe_push (bb);
-  return vect_slp_bbs (bbs);
+  return vect_slp_bbs (bbs, orig_loop);
 }
 
 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
@@ -5631,7 +6160,7 @@ vect_slp_function (function *fun)
 
       if (split && !bbs.is_empty ())
        {
-         r |= vect_slp_bbs (bbs);
+         r |= vect_slp_bbs (bbs, NULL);
          bbs.truncate (0);
          bbs.quick_push (bb);
        }
@@ -5649,13 +6178,13 @@ vect_slp_function (function *fun)
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "splitting region at control altering "
                               "definition %G", last);
-           r |= vect_slp_bbs (bbs);
+           r |= vect_slp_bbs (bbs, NULL);
            bbs.truncate (0);
          }
     }
 
   if (!bbs.is_empty ())
-    r |= vect_slp_bbs (bbs);
+    r |= vect_slp_bbs (bbs, NULL);
 
   free (rpo);
 
@@ -5686,7 +6215,7 @@ vect_slp_function (function *fun)
 
 void
 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
-                         vec<tree> elts, unsigned int nresults,
+                         const vec<tree> &elts, unsigned int nresults,
                          vec<tree> &results)
 {
   unsigned int nelts = elts.length ();
@@ -6037,14 +6566,15 @@ vect_get_slp_defs (vec_info *,
    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
    permute statements for the SLP node NODE.  Store the number of vector
    permute instructions in *N_PERMS and the number of vector load
-   instructions in *N_LOADS.  */
+   instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
+   that were not needed.  */
 
 bool
 vect_transform_slp_perm_load (vec_info *vinfo,
-                             slp_tree node, vec<tree> dr_chain,
+                             slp_tree node, const vec<tree> &dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              bool analyze_only, unsigned *n_perms,
-                             unsigned int *n_loads)
+                             unsigned int *n_loads, bool dce_chain)
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   int vec_index = 0;
@@ -6123,6 +6653,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
     }
   auto_sbitmap used_in_lanes (in_nlanes);
   bitmap_clear (used_in_lanes);
+  auto_bitmap used_defs;
 
   unsigned int count = mask.encoded_nelts ();
   mask.quick_grow (count);
@@ -6230,11 +6761,20 @@ vect_transform_slp_perm_load (vec_info *vinfo,
                                               mask_vec);
                      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
                                                   gsi);
+                     if (dce_chain)
+                       {
+                         bitmap_set_bit (used_defs, first_vec_index + ri);
+                         bitmap_set_bit (used_defs, second_vec_index + ri);
+                       }
                    }
                  else
-                   /* If mask was NULL_TREE generate the requested
-                      identity transform.  */
-                   perm_stmt = SSA_NAME_DEF_STMT (first_vec);
+                   {
+                     /* If mask was NULL_TREE generate the requested
+                        identity transform.  */
+                     perm_stmt = SSA_NAME_DEF_STMT (first_vec);
+                     if (dce_chain)
+                       bitmap_set_bit (used_defs, first_vec_index + ri);
+                   }
 
                  /* Store the vector statement in NODE.  */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
@@ -6274,6 +6814,16 @@ vect_transform_slp_perm_load (vec_info *vinfo,
        }
     }
 
+  if (dce_chain)
+    for (unsigned i = 0; i < dr_chain.length (); ++i)
+      if (!bitmap_bit_p (used_defs, i))
+       {
+         gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
+         gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
+         gsi_remove (&rgsi, true);
+         release_defs (stmt);
+       }
+
   return true;
 }
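(A minimal sketch of the dce_chain bookkeeping: remember which dr_chain
slots a permute actually consumed, then drop the rest.  Container types
here are stand-ins, not the GCC ones used above.)

    #include <vector>

    /* Remove every chain entry that was never marked used -- the
       analogue of the gsi_remove/release_defs loop above.  */
    template <typename T, typename RemoveFn>
    static void
    dce_unused_defs (const std::vector<T> &chain,
                     const std::vector<bool> &used, RemoveFn remove)
    {
      for (size_t i = 0; i < chain.size (); ++i)
        if (!used[i])
          remove (chain[i]);
    }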
 
@@ -6721,6 +7271,21 @@ vect_schedule_slp_node (vec_info *vinfo,
          gcc_assert (seen_vector_def);
          si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
        }
+      else if (is_a <bb_vec_info> (vinfo)
+              && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
+              && gimple_could_trap_p (stmt_info->stmt))
+       {
+         /* We've constrained possibly trapping operations to all come
+            from the same basic-block; even if vectorized defs would allow
+            earlier scheduling, still force the vectorized stmts into the
+            original block.  This is only necessary for BB vectorization,
+            since for loop vect all operations are in a single BB and
+            scalar-stmt-based placement doesn't play well with epilogue
+            vectorization.  */
+         gcc_assert (dominated_by_p (CDI_DOMINATORS,
+                                     gimple_bb (stmt_info->stmt),
+                                     gimple_bb (last_stmt)));
+         si = gsi_after_labels (gimple_bb (stmt_info->stmt));
+       }
       else if (is_a <gphi *> (last_stmt))
        si = gsi_after_labels (gimple_bb (last_stmt));
       else
@@ -6840,6 +7405,39 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
          rstmt = gimple_build_assign (lhs, r_constructor);
        }
     }
+  else if (instance->kind == slp_inst_kind_bb_reduc)
+    {
+      /* Largely inspired by reduction chain epilogue handling in
+        vect_create_epilog_for_reduction.  */
+      vec<tree> vec_defs = vNULL;
+      vect_get_slp_defs (node, &vec_defs);
+      enum tree_code reduc_code
+       = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
+      /* ???  We actually have to reflect signs somewhere.  */
+      if (reduc_code == MINUS_EXPR)
+       reduc_code = PLUS_EXPR;
+      gimple_seq epilogue = NULL;
+      /* We may end up with more than one vector result; reduce them
+        to one vector.  */
+      tree vec_def = vec_defs[0];
+      for (unsigned i = 1; i < vec_defs.length (); ++i)
+       vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
+                               vec_def, vec_defs[i]);
+      vec_defs.release ();
+      /* ???  Support other schemes than direct internal fn.  */
+      internal_fn reduc_fn;
+      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
+         || reduc_fn == IFN_LAST)
+       gcc_unreachable ();
+      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
+                                     TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
+
+      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
+      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
+      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
+      update_stmt (gsi_stmt (rgsi));
+      return;
+    }
   else
     gcc_unreachable ();
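(What the generated epilogue amounts to, sketched with GNU C vector
extensions: pairwise-add the vector defs down to a single vector, then a
horizontal reduction standing in for the .REDUC_PLUS internal fn.)

    typedef unsigned v4su __attribute__ ((vector_size (16)));

    unsigned
    bb_reduc_epilogue (v4su d0, v4su d1)
    {
      v4su v = d0 + d1;                  /* reduce vector defs to one   */
      return v[0] + v[1] + v[2] + v[3];  /* scalar result (.REDUC_PLUS) */
    }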
 
@@ -7002,7 +7600,7 @@ vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
 
 void
-vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
+vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
 {
   slp_instance instance;
   unsigned int i;