]> git.ipfire.org Git - thirdparty/gcc.git/blobdiff - gcc/tree-vect-slp.c
Correct a function pre/postcondition [PR102403].
[thirdparty/gcc.git] / gcc / tree-vect-slp.c
index f9c33c0bb141b2b69ec5b39796bd430aa7dcf12d..024a1c38a2342246d7891db1de5f1d6e6458d5dd 100644 (file)
@@ -1177,21 +1177,6 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
              continue;
            }
 
-         if (need_same_oprnds)
-           {
-             tree other_op1 = (call_stmt
-                               ? gimple_call_arg (call_stmt, 1)
-                               : gimple_assign_rhs2 (stmt));
-             if (!operand_equal_p (first_op1, other_op1, 0))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "Build SLP failed: different shift "
-                                    "arguments in %G", stmt);
-                 /* Mismatch.  */
-                 continue;
-               }
-           }
          if (!load_p
              && first_stmt_code == BIT_FIELD_REF
              && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
@@ -1231,6 +1216,22 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
              continue;
            }
 
+         if (need_same_oprnds)
+           {
+             tree other_op1 = (call_stmt
+                               ? gimple_call_arg (call_stmt, 1)
+                               : gimple_assign_rhs2 (stmt));
+             if (!operand_equal_p (first_op1, other_op1, 0))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Build SLP failed: different shift "
+                                    "arguments in %G", stmt);
+                 /* Mismatch.  */
+                 continue;
+               }
+           }
+
          if (!types_compatible_p (vectype, *node_vectype))
            {
              if (dump_enabled_p ())
@@ -1591,6 +1592,14 @@ vect_build_slp_tree (vec_info *vinfo,
       SLP_TREE_SCALAR_STMTS (res) = vNULL;
       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
       res->failed = XNEWVEC (bool, group_size);
+      if (flag_checking)
+       {
+         unsigned i;
+         for (i = 0; i < group_size; ++i)
+           if (!matches[i])
+             break;
+         gcc_assert (i < group_size);
+       }
       memcpy (res->failed, matches, sizeof (bool) * group_size);
     }
   else
@@ -1898,10 +1907,14 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
          chains.quick_push (chain.copy ());
          chain.truncate (0);
        }
-      if (chains.length () == group_size
-         /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
-         && op_stmt_info)
+      if (chains.length () == group_size)
        {
+         /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
+         if (!op_stmt_info)
+           {
+             hard_fail = false;
+             goto out;
+           }
          /* Now we have a set of chains with the same length.  */
          /* 1. pre-sort according to def_type and operation.  */
          for (unsigned lane = 0; lane < group_size; ++lane)
@@ -1951,15 +1964,15 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
              if (dt == vect_constant_def
                  || dt == vect_external_def)
                {
-               /* We can always build those.  Might want to sort last
-                  or defer building.  */
-                  vec<tree> ops;
-                  ops.create (group_size);
-                  for (lane = 0; lane < group_size; ++lane)
-                    ops.quick_push (chains[lane][n].op);
-                  slp_tree child = vect_create_new_slp_node (ops);
-                  SLP_TREE_DEF_TYPE (child) = dt;
-                  children.safe_push (child);
+                 /* We can always build those.  Might want to sort last
+                    or defer building.  */
+                 vec<tree> ops;
+                 ops.create (group_size);
+                 for (lane = 0; lane < group_size; ++lane)
+                   ops.quick_push (chains[lane][n].op);
+                 slp_tree child = vect_create_new_slp_node (ops);
+                 SLP_TREE_DEF_TYPE (child) = dt;
+                 children.safe_push (child);
                }
              else if (dt != vect_internal_def)
                {
@@ -2024,9 +2037,10 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "failed to match up op %d\n", n);
                      op_stmts.release ();
-                     matches[lane] = false;
                      if (lane != group_size - 1)
                        matches[0] = false;
+                     else
+                       matches[lane] = false;
                      goto out;
                    }
                  if (dump_enabled_p ())
@@ -2297,6 +2311,10 @@ out:
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
+         /* After swapping some operands we lost track whether an
+            operand has any pattern defs so be conservative here.  */
+         if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
+           oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
          /* And try again with scratch 'matches' ... */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
@@ -2538,6 +2556,44 @@ debug (slp_tree node)
                       node);
 }
 
+/* Recursive helper for the dot producer below.  */
+
+static void
+dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
+{
+  if (visited.add (node))
+    return;
+
+  fprintf (f, "\"%p\" [label=\"", (void *)node);
+  vect_print_slp_tree (MSG_NOTE,
+                      dump_location_t::from_location_t (UNKNOWN_LOCATION),
+                      node);
+  fprintf (f, "\"];\n");
+
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    dot_slp_tree (f, child, visited);
+}
+
+DEBUG_FUNCTION void
+dot_slp_tree (const char *fname, slp_tree node)
+{
+  FILE *f = fopen (fname, "w");
+  fprintf (f, "digraph {\n");
+  fflush (f);
+    {
+      debug_dump_context ctx (f);
+      hash_set<slp_tree> visited;
+      dot_slp_tree (f, node, visited);
+    }
+  fflush (f);
+  fprintf (f, "}\n");
+  fclose (f);
+}
+
 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
 
 static void
@@ -3302,7 +3358,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
   else if (kind == slp_inst_kind_reduc_group)
     {
       /* Collect reduction statements.  */
-      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
+      const vec<stmt_vec_info> &reductions
+       = as_a <loop_vec_info> (vinfo)->reductions;
       scalar_stmts.create (reductions.length ());
       for (i = 0; reductions.iterate (i, &next_info); i++)
        if (STMT_VINFO_RELEVANT_P (next_info)
@@ -3453,11 +3510,29 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
   return opt_result::success ();
 }
 
+struct slpg_vertex
+{
+  slpg_vertex (slp_tree node_)
+    : node (node_), perm_in (-1), perm_out (-1) {}
+
+  int get_perm_materialized () const
+    { return perm_in != perm_out ? perm_in : 0; }
+
+  slp_tree node;
+  /* The common permutation on the incoming lanes (towards SLP children).  */
+  int perm_in;
+  /* The permutation on the outgoing lanes (towards SLP parents).  When
+     the node is a materialization point for a permute this differs
+     from perm_in (and is then usually zero).  Materialization happens
+     on the input side.  */
+  int perm_out;
+};
+
 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
 
 static void
 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
-                        vec<slp_tree> &vertices, vec<int> &leafs)
+                        vec<slpg_vertex> &vertices, vec<int> &leafs)
 {
   unsigned i;
   slp_tree child;
@@ -3466,41 +3541,39 @@ vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
     return;
 
   node->vertex = vertices.length ();
-  vertices.safe_push (node);
+  vertices.safe_push (slpg_vertex (node));
 
   bool leaf = true;
+  bool force_leaf = false;
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     if (child)
       {
        leaf = false;
        vect_slp_build_vertices (visited, child, vertices, leafs);
       }
-  if (leaf)
+    else
+      force_leaf = true;
+  /* Since SLP discovery works along use-def edges all cycles have an
+     entry - but there's the exception of cycles where we do not handle
+     the entry explicitely (but with a NULL SLP node), like some reductions
+     and inductions.  Force those SLP PHIs to act as leafs to make them
+     backwards reachable.  */
+  if (leaf || force_leaf)
     leafs.safe_push (node->vertex);
 }
 
 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
 
 static void
-vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
+vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
                         vec<int> &leafs)
 {
   hash_set<slp_tree> visited;
   unsigned i;
   slp_instance instance;
   FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
-    {
-      unsigned n_v = vertices.length ();
-      unsigned n_l = leafs.length ();
-      vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
-                              leafs);
-      /* If we added vertices but no entries to the reverse graph we've
-        added a cycle that is not backwards-reachable.   Push the entry
-        to mimic as leaf then.  */
-      if (vertices.length () > n_v
-         && leafs.length () == n_l)
-       leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
-    }
+    vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
+                            leafs);
 }
 
 /* Apply (reverse) bijectite PERM to VEC.  */
@@ -3539,7 +3612,8 @@ vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
                   int perm_a, int perm_b)
 {
   return (perm_a == perm_b
-         || (perms[perm_a].length () == perms[perm_b].length ()
+         || (perm_a != -1 && perm_b != -1
+             && perms[perm_a].length () == perms[perm_b].length ()
              && memcmp (&perms[perm_a][0], &perms[perm_b][0],
                         sizeof (unsigned) * perms[perm_a].length ()) == 0));
 }
@@ -3554,50 +3628,45 @@ vect_optimize_slp (vec_info *vinfo)
 
   slp_tree node;
   unsigned i;
-  auto_vec<slp_tree> vertices;
+  auto_vec<slpg_vertex> vertices;
   auto_vec<int> leafs;
   vect_slp_build_vertices (vinfo, vertices, leafs);
 
   struct graph *slpg = new_graph (vertices.length ());
-  FOR_EACH_VEC_ELT (vertices, i, node)
-    {
-      unsigned j;
-      slp_tree child;
-      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-       if (child)
-         add_edge (slpg, i, child->vertex);
-    }
+  for (slpg_vertex &v : vertices)
+    for (slp_tree child : SLP_TREE_CHILDREN (v.node))
+      if (child)
+       add_edge (slpg, v.node->vertex, child->vertex);
 
   /* Compute (reverse) postorder on the inverted graph.  */
   auto_vec<int> ipo;
   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
 
-  auto_sbitmap n_visited (vertices.length ());
-  auto_sbitmap n_materialize (vertices.length ());
-  auto_vec<int> n_perm (vertices.length ());
   auto_vec<vec<unsigned> > perms;
-
-  bitmap_clear (n_visited);
-  bitmap_clear (n_materialize);
-  n_perm.quick_grow_cleared (vertices.length ());
   perms.safe_push (vNULL); /* zero is no permute */
 
   /* Produce initial permutations.  */
   for (i = 0; i < leafs.length (); ++i)
     {
       int idx = leafs[i];
-      slp_tree node = vertices[idx];
+      slp_tree node = vertices[idx].node;
 
       /* Handle externals and constants optimistically throughout the
-        iteration.  */
-      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
+        iteration.  But treat existing vectors as fixed since we
+        do not handle permuting them below.  */
+      if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
+          && !SLP_TREE_VEC_DEFS (node).exists ())
          || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
        continue;
 
       /* Leafs do not change across iterations.  Note leafs also double
         as entries to the reverse graph.  */
       if (!slpg->vertices[idx].succ)
-       bitmap_set_bit (n_visited, idx);
+       {
+         vertices[idx].perm_in = 0;
+         vertices[idx].perm_out = 0;
+       }
+
       /* Loads are the only thing generating permutes.  */
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
        continue;
@@ -3646,104 +3715,185 @@ vect_optimize_slp (vec_info *vinfo)
       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
        perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
       perms.safe_push (perm);
-      n_perm[idx] = perms.length () - 1;
+      vertices[idx].perm_in = perms.length () - 1;
+      vertices[idx].perm_out = perms.length () - 1;
     }
 
+  /* In addition to the above we have to mark outgoing permutes facing
+     non-reduction graph entries that are not represented as to be
+     materialized.  */
+  for (slp_instance instance : vinfo->slp_instances)
+    if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
+      {
+       /* Just setting perm_out isn't enough for the propagation to
+          pick this up.  */
+       vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
+       vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
+      }
+
   /* Propagate permutes along the graph and compute materialization points.  */
   bool changed;
+  bool do_materialization = false;
   unsigned iteration = 0;
   do
     {
       changed = false;
       ++iteration;
 
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "SLP optimize iteration %d\n", iteration);
+
       for (i = vertices.length (); i > 0 ; --i)
        {
          int idx = ipo[i-1];
-         slp_tree node = vertices[idx];
-         /* For leafs there's nothing to do - we've seeded permutes
-            on those above.  */
-         if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
-           continue;
-
-         bitmap_set_bit (n_visited, idx);
+         slp_tree node = vertices[idx].node;
 
-         /* We cannot move a permute across a store.  */
-         if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
-             && DR_IS_WRITE
-                  (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
+         /* Handle externals and constants optimistically throughout the
+            iteration.  */
+         if (SLP_TREE_DEF_TYPE (node) == vect_external_def
+             || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
            continue;
 
-         int perm = -1;
-         for (graph_edge *succ = slpg->vertices[idx].succ;
-              succ; succ = succ->succ_next)
+         /* We still eventually have failed backedge SLP nodes in the
+            graph, those are only cancelled when analyzing operations.
+            Simply treat them as transparent ops, propagating permutes
+            through them.  */
+         if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
            {
-             int succ_idx = succ->dest;
-             /* Handle unvisited nodes optimistically.  */
-             /* ???  But for constants once we want to handle non-bijective
-                permutes we have to verify the permute, when unifying lanes,
-                will not unify different constants.  For example see
-                gcc.dg/vect/bb-slp-14.c for a case that would break.  */
-             if (!bitmap_bit_p (n_visited, succ_idx))
-               continue;
-             int succ_perm = n_perm[succ_idx];
-             /* Once we materialize succs permutation its output lanes
-                appear unpermuted to us.  */
-             if (bitmap_bit_p (n_materialize, succ_idx))
-               succ_perm = 0;
-             if (perm == -1)
-               perm = succ_perm;
-             else if (succ_perm == 0)
+             /* We do not handle stores with a permutation, so all
+                incoming permutes must have been materialized.  */
+             stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
+             if (STMT_VINFO_DATA_REF (rep)
+                 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
                {
-                 perm = 0;
-                 break;
-               }
-             else if (!vect_slp_perms_eq (perms, perm, succ_perm))
-               {
-                 perm = 0;
-                 break;
+                 /* ???  We're forcing materialization in place
+                    of the child here, we'd need special handling
+                    in materialization to leave perm_in -1 here.  */
+                 vertices[idx].perm_in = 0;
+                 vertices[idx].perm_out = 0;
                }
+             /* We cannot move a permute across an operation that is
+                not independent on lanes.  Note this is an explicit
+                negative list since that's much shorter than the respective
+                positive one but it's critical to keep maintaining it.  */
+             if (is_gimple_call (STMT_VINFO_STMT (rep)))
+               switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
+                 {
+                 case CFN_COMPLEX_ADD_ROT90:
+                 case CFN_COMPLEX_ADD_ROT270:
+                 case CFN_COMPLEX_MUL:
+                 case CFN_COMPLEX_MUL_CONJ:
+                 case CFN_VEC_ADDSUB:
+                 case CFN_VEC_FMADDSUB:
+                 case CFN_VEC_FMSUBADD:
+                   vertices[idx].perm_in = 0;
+                   vertices[idx].perm_out = 0;
+                 default:;
+                 }
            }
 
-         if (perm == -1)
+         if (!slpg->vertices[idx].succ)
            /* Pick up pre-computed leaf values.  */
-           perm = n_perm[idx];
-         else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
+           ;
+         else
            {
-             if (iteration > 1)
-               /* Make sure we eventually converge.  */
-               gcc_checking_assert (perm == 0);
-             n_perm[idx] = perm;
-             if (perm == 0)
-               bitmap_clear_bit (n_materialize, idx);
-             changed = true;
+             bool any_succ_perm_out_m1 = false;
+             int perm_in = vertices[idx].perm_in;
+             for (graph_edge *succ = slpg->vertices[idx].succ;
+                  succ; succ = succ->succ_next)
+               {
+                 int succ_idx = succ->dest;
+                 int succ_perm = vertices[succ_idx].perm_out;
+                 /* Handle unvisited (and constant) nodes optimistically.  */
+                 /* ???  But for constants once we want to handle
+                    non-bijective permutes we have to verify the permute,
+                    when unifying lanes, will not unify different constants.
+                    For example see gcc.dg/vect/bb-slp-14.c for a case
+                    that would break.  */
+                 if (succ_perm == -1)
+                   {
+                     /* When we handled a non-leaf optimistically, note
+                        that so we can adjust its outgoing permute below.  */
+                     slp_tree succ_node = vertices[succ_idx].node;
+                     if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
+                       any_succ_perm_out_m1 = true;
+                     continue;
+                   }
+                 if (perm_in == -1)
+                   perm_in = succ_perm;
+                 else if (succ_perm == 0
+                          || !vect_slp_perms_eq (perms, perm_in, succ_perm))
+                   {
+                     perm_in = 0;
+                     break;
+                   }
+               }
+
+             /* Adjust any incoming permutes we treated optimistically.  */
+             if (perm_in != -1 && any_succ_perm_out_m1)
+               {
+                 for (graph_edge *succ = slpg->vertices[idx].succ;
+                      succ; succ = succ->succ_next)
+                   {
+                     slp_tree succ_node = vertices[succ->dest].node;
+                     if (vertices[succ->dest].perm_out == -1
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
+                       {
+                         vertices[succ->dest].perm_out = perm_in;
+                         /* And ensure this propagates.  */
+                         if (vertices[succ->dest].perm_in == -1)
+                           vertices[succ->dest].perm_in = perm_in;
+                       }
+                   }
+                 changed = true;
+               }
+
+             if (!vect_slp_perms_eq (perms, perm_in,
+                                     vertices[idx].perm_in))
+               {
+                 /* Make sure we eventually converge.  */
+                 gcc_checking_assert (vertices[idx].perm_in == -1
+                                      || perm_in == 0);
+                 vertices[idx].perm_in = perm_in;
+
+                 /* While we can handle VEC_PERM nodes as transparent
+                    pass-through they can be a cheap materialization
+                    point as well.  In addition they can act as source
+                    of a random permutation as well.
+                    The following ensures that former materialization
+                    points that now have zero incoming permutes no
+                    longer appear as such and that former "any" permutes
+                    get pass-through.  We keep VEC_PERM nodes optimistic
+                    as "any" outgoing permute though.  */
+                 if (vertices[idx].perm_out != 0
+                     && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
+                   vertices[idx].perm_out = perm_in;
+                 changed = true;
+               }
            }
 
-         if (perm == 0)
+         /* Elide pruning at materialization points in the first
+            iteration phase.  */
+         if (!do_materialization)
            continue;
 
-         /* Elide pruning at materialization points in the first
-            iteration so every node was visited once at least.  */
-         if (iteration == 1)
+         int perm = vertices[idx].perm_out;
+         if (perm == 0 || perm == -1)
            continue;
 
          /* Decide on permute materialization.  Look whether there's
             a use (pred) edge that is permuted differently than us.
-            In that case mark ourselves so the permutation is applied.
-            For VEC_PERM_EXPRs the permutation doesn't carry along
-            from children to parents so force materialization at the
-            point of the VEC_PERM_EXPR.  In principle VEC_PERM_EXPRs
-            are a source of an arbitrary permutation again, similar
-            to constants/externals - that's something we do not yet
-            optimally handle.  */
-         bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
-                                    && slpg->vertices[idx].pred != NULL);
+            In that case mark ourselves so the permutation is applied.  */
+         bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
          if (all_preds_permuted)
            for (graph_edge *pred = slpg->vertices[idx].pred;
                 pred; pred = pred->pred_next)
              {
-               gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
-               int pred_perm = n_perm[pred->src];
+               int pred_perm = vertices[pred->src].perm_in;
+               gcc_checking_assert (pred_perm != -1);
                if (!vect_slp_perms_eq (perms, perm, pred_perm))
                  {
                    all_preds_permuted = false;
@@ -3752,55 +3902,62 @@ vect_optimize_slp (vec_info *vinfo)
              }
          if (!all_preds_permuted)
            {
-             if (!bitmap_bit_p (n_materialize, idx))
-               changed = true;
-             bitmap_set_bit (n_materialize, idx);
+             vertices[idx].perm_out = 0;
+             changed = true;
            }
        }
+
+      /* If the initial propagation converged, switch on materialization
+        and re-propagate.  */
+      if (!changed && !do_materialization)
+       {
+         do_materialization = true;
+         changed = true;
+       }
     }
-  while (changed || iteration == 1);
+  while (changed);
+  statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
 
   /* Materialize.  */
   for (i = 0; i < vertices.length (); ++i)
     {
-      int perm = n_perm[i];
-      if (perm <= 0)
-       continue;
-
-      slp_tree node = vertices[i];
+      int perm_in = vertices[i].perm_in;
+      slp_tree node = vertices[i].node;
 
-      /* First permute invariant/external original successors.  */
+      /* First permute invariant/external original successors, we handle
+        those optimistically during propagation and duplicate them if
+        they are used with different permutations.  */
       unsigned j;
       slp_tree child;
-      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-       {
-         if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
-           continue;
+      if (perm_in > 0)
+       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+         {
+           if (!child
+               || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
+                   && SLP_TREE_DEF_TYPE (child) != vect_external_def))
+             continue;
 
-         /* If the vector is uniform there's nothing to do.  */
-         if (vect_slp_tree_uniform_p (child))
-           continue;
+           /* If the vector is uniform there's nothing to do.  */
+           if (vect_slp_tree_uniform_p (child))
+             continue;
 
-         /* We can end up sharing some externals via two_operator
-            handling.  Be prepared to unshare those.  */
-         if (child->refcnt != 1)
-           {
-             gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
-             SLP_TREE_CHILDREN (node)[j] = child
-               = vect_create_new_slp_node
-                   (SLP_TREE_SCALAR_OPS (child).copy ());
-           }
-         vect_slp_permute (perms[perm],
-                           SLP_TREE_SCALAR_OPS (child), true);
-       }
+           /* We can end up sharing some externals via two_operator
+              handling.  Be prepared to unshare those.  */
+           if (child->refcnt != 1)
+             {
+               gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
+               SLP_TREE_CHILDREN (node)[j] = child
+                 = vect_create_new_slp_node
+                     (SLP_TREE_SCALAR_OPS (child).copy ());
+             }
+           vect_slp_permute (perms[perm_in],
+                             SLP_TREE_SCALAR_OPS (child), true);
+         }
 
-      if (bitmap_bit_p (n_materialize, i))
+      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
        {
-         if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
-           /* For loads simply drop the permutation, the load permutation
-              already performs the desired permutation.  */
-           ;
-         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+         /* Apply the common permutes to the input vectors.  */
+         if (perm_in > 0)
            {
              /* If the node is already a permute node we can apply
                 the permutation to the lane selection, effectively
@@ -3809,12 +3966,30 @@ vect_optimize_slp (vec_info *vinfo)
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "simplifying permute node %p\n",
                                 node);
-
              for (unsigned k = 0;
                   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
                SLP_TREE_LANE_PERMUTATION (node)[k].second
-                 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
+                 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
+           }
+         /* Apply the anticipated output permute to the permute and
+            stmt vectors.  */
+         int perm_out = vertices[i].perm_out;
+         if (perm_out > 0)
+           {
+             vect_slp_permute (perms[perm_out],
+                               SLP_TREE_SCALAR_STMTS (node), true);
+             vect_slp_permute (perms[perm_out],
+                               SLP_TREE_LANE_PERMUTATION (node), true);
            }
+       }
+      else if (vertices[i].get_perm_materialized () != 0)
+       {
+         if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+           /* For loads simply drop the permutation, the load permutation
+              already performs the desired permutation.  */
+           ;
+         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+           gcc_unreachable ();
          else
            {
              if (dump_enabled_p ())
@@ -3829,7 +4004,7 @@ vect_optimize_slp (vec_info *vinfo)
              SLP_TREE_CHILDREN (node) = vNULL;
              SLP_TREE_SCALAR_STMTS (copy)
                = SLP_TREE_SCALAR_STMTS (node).copy ();
-             vect_slp_permute (perms[perm],
+             vect_slp_permute (perms[perm_in],
                                SLP_TREE_SCALAR_STMTS (copy), true);
              gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
              SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
@@ -3849,28 +4024,77 @@ vect_optimize_slp (vec_info *vinfo)
              SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
              for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
                SLP_TREE_LANE_PERMUTATION (node)
-                 .quick_push (std::make_pair (0, perms[perm][j]));
+                 .quick_push (std::make_pair (0, perms[perm_in][j]));
              SLP_TREE_CODE (node) = VEC_PERM_EXPR;
            }
        }
-      else
+      else if (perm_in > 0) /* perm_in == perm_out */
        {
          /* Apply the reverse permutation to our stmts.  */
-         vect_slp_permute (perms[perm],
+         vect_slp_permute (perms[perm_in],
                            SLP_TREE_SCALAR_STMTS (node), true);
-         /* And to the load permutation, which we can simply
+         /* And to the lane/load permutation, which we can simply
             make regular by design.  */
          if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
            {
+             gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
              /* ???  When we handle non-bijective permutes the idea
                 is that we can force the load-permutation to be
                 { min, min + 1, min + 2, ... max }.  But then the
                 scalar defs might no longer match the lane content
                 which means wrong-code with live lane vectorization.
                 So we possibly have to have NULL entries for those.  */
-             vect_slp_permute (perms[perm],
+             vect_slp_permute (perms[perm_in],
                                SLP_TREE_LOAD_PERMUTATION (node), true);
            }
+         else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+           gcc_unreachable ();
+       }
+    }
+
+  /* Elide any permutations at BB reduction roots.  */
+  if (is_a <bb_vec_info> (vinfo))
+    {
+      for (slp_instance instance : vinfo->slp_instances)
+       {
+         if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
+           continue;
+         slp_tree old = SLP_INSTANCE_TREE (instance);
+         if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
+             && SLP_TREE_CHILDREN (old).length () == 1)
+           {
+             slp_tree child = SLP_TREE_CHILDREN (old)[0];
+             if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
+               {
+                 /* Preserve the special VEC_PERM we use to shield existing
+                    vector defs from the rest.  But make it a no-op.  */
+                 unsigned i = 0;
+                 for (std::pair<unsigned, unsigned> &p
+                      : SLP_TREE_LANE_PERMUTATION (old))
+                   p.second = i++;
+               }
+             else
+               {
+                 SLP_INSTANCE_TREE (instance) = child;
+                 SLP_TREE_REF_COUNT (child)++;
+                 vect_free_slp_tree (old);
+               }
+           }
+         else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
+                  && SLP_TREE_REF_COUNT (old) == 1
+                  && vertices[old->vertex].get_perm_materialized () != 0)
+           {
+             /* ???  For loads the situation is more complex since
+                we can't modify the permute in place in case the
+                node is used multiple times.  In fact for loads this
+                should be somehow handled in the propagation engine.  */
+             /* Apply the reverse permutation to our stmts.  */
+             int perm = vertices[old->vertex].get_perm_materialized ();
+             vect_slp_permute (perms[perm],
+                               SLP_TREE_SCALAR_STMTS (old), true);
+             vect_slp_permute (perms[perm],
+                               SLP_TREE_LOAD_PERMUTATION (old), true);
+           }
        }
     }
 
@@ -3883,7 +4107,7 @@ vect_optimize_slp (vec_info *vinfo)
   /* Now elide load permutations that are not necessary.  */
   for (i = 0; i < leafs.length (); ++i)
     {
-      node = vertices[leafs[i]];
+      node = vertices[leafs[i]].node;
       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
        continue;
 
@@ -3940,48 +4164,6 @@ vect_optimize_slp (vec_info *vinfo)
            }
        }
     }
-
-  /* And any permutations of BB reductions.  */
-  if (is_a <bb_vec_info> (vinfo))
-    {
-      for (slp_instance instance : vinfo->slp_instances)
-       {
-         if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
-           continue;
-         slp_tree old = SLP_INSTANCE_TREE (instance);
-         if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
-             && SLP_TREE_CHILDREN (old).length () == 1)
-           {
-             slp_tree child = SLP_TREE_CHILDREN (old)[0];
-             if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
-               {
-                 /* Preserve the special VEC_PERM we use to shield existing
-                    vector defs from the rest.  But make it a no-op.  */
-                 unsigned i = 0;
-                 for (std::pair<unsigned, unsigned> &p
-                      : SLP_TREE_LANE_PERMUTATION (old))
-                   p.second = i++;
-               }
-             else
-               {
-                 SLP_INSTANCE_TREE (instance) = child;
-                 SLP_TREE_REF_COUNT (child)++;
-                 vect_free_slp_tree (old);
-               }
-           }
-         else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
-                  && SLP_TREE_REF_COUNT (old) == 1)
-           {
-             /* ???  For loads the situation is more complex since
-                we can't modify the permute in place in case the
-                node is used multiple times.  In fact for loads this
-                should be somehow handled in the propagation engine.  */
-             auto fn = [] (const void *a, const void *b)
-                             { return *(const int *)a - *(const int *)b; };
-             SLP_TREE_LOAD_PERMUTATION (old).qsort (fn);
-           }
-       }
-    }
 }
 
 /* Gather loads reachable from the individual SLP graph entries.  */
@@ -4009,7 +4191,8 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
 {
   unsigned int i;
   poly_uint64 unrolling_factor = 1;
-  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+  const vec<slp_instance> &slp_instances
+    = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
   slp_instance instance;
   int decided_to_slp = 0;
 
@@ -4668,24 +4851,28 @@ static bool
 vectorizable_bb_reduc_epilogue (slp_instance instance,
                                stmt_vector_for_cost *cost_vec)
 {
-  enum tree_code reduc_code
-    = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
+  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
+  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
   if (reduc_code == MINUS_EXPR)
     reduc_code = PLUS_EXPR;
   internal_fn reduc_fn;
   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
   if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
       || reduc_fn == IFN_LAST
-      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH))
+      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
+      || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
+                                    TREE_TYPE (vectype)))
     return false;
 
   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
-     cost log2 vector operations plus shuffles.  */
+     cost log2 vector operations plus shuffles and one extraction.  */
   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
                    vectype, 0, vect_body);
   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
                    vectype, 0, vect_body);
+  record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
+                   vectype, 0, vect_body);
   return true;
 }
 
@@ -4917,6 +5104,42 @@ vect_bb_partition_graph (bb_vec_info bb_vinfo)
     }
 }
 
+/* Compute the set of scalar stmts participating in internal and external
+   nodes.  */
+
+static void
+vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
+                                        hash_set<slp_tree> &visited,
+                                        hash_set<stmt_vec_info> &vstmts,
+                                        hash_set<stmt_vec_info> &estmts)
+{
+  int i;
+  stmt_vec_info stmt_info;
+  slp_tree child;
+
+  if (visited.add (node))
+    return;
+
+  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
+    {
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+       vstmts.add (stmt_info);
+
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+       if (child)
+         vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
+                                                  vstmts, estmts);
+    }
+  else
+    for (tree def : SLP_TREE_SCALAR_OPS (node))
+      {
+       stmt_vec_info def_stmt = vinfo->lookup_def (def);
+       if (def_stmt)
+         estmts.add (def_stmt);
+      }
+}
+
+
 /* Compute the scalar cost of the SLP node NODE and its children
    and return it.  Do not account defs that are marked in LIFE and
    update LIFE according to uses of NODE.  */
@@ -4925,6 +5148,7 @@ static void
 vect_bb_slp_scalar_cost (vec_info *vinfo,
                         slp_tree node, vec<bool, va_heap> *life,
                         stmt_vector_for_cost *cost_vec,
+                        hash_set<stmt_vec_info> &vectorized_scalar_stmts,
                         hash_set<slp_tree> &visited)
 {
   unsigned i;
@@ -4961,8 +5185,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
                  {
                    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
                    if (!use_stmt_info
-                       || !PURE_SLP_STMT
-                             (vect_stmt_to_vectorize (use_stmt_info)))
+                       || !vectorized_scalar_stmts.contains (use_stmt_info))
                      {
                        (*life)[i] = true;
                        break;
@@ -5025,7 +5248,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
              subtree_life.safe_splice (*life);
            }
          vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
-                                  visited);
+                                  vectorized_scalar_stmts, visited);
          subtree_life.truncate (0);
        }
     }
@@ -5050,7 +5273,8 @@ li_cost_vec_cmp (const void *a_, const void *b_)
 
 static bool
 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
-                                   vec<slp_instance> slp_instances)
+                                   vec<slp_instance> slp_instances,
+                                   loop_p orig_loop)
 {
   slp_instance instance;
   int i;
@@ -5066,11 +5290,33 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                              SLP_INSTANCE_TREE (instance), visited);
     }
 
+  /* Compute the set of scalar stmts we know will go away 'locally' when
+     vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
+     not accurate for nodes promoted extern late or for scalar stmts that
+     are used both in extern defs and in vectorized defs.  */
+  hash_set<stmt_vec_info> vectorized_scalar_stmts;
+  hash_set<stmt_vec_info> scalar_stmts_in_externs;
+  hash_set<slp_tree> visited;
+  FOR_EACH_VEC_ELT (slp_instances, i, instance)
+    {
+      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
+                                              SLP_INSTANCE_TREE (instance),
+                                              visited,
+                                              vectorized_scalar_stmts,
+                                              scalar_stmts_in_externs);
+      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
+       vectorized_scalar_stmts.add (rstmt);
+    }
+  /* Scalar stmts used as defs in external nodes need to be preseved, so
+     remove them from vectorized_scalar_stmts.  */
+  for (stmt_vec_info stmt : scalar_stmts_in_externs)
+    vectorized_scalar_stmts.remove (stmt);
+
   /* Calculate scalar cost and sum the cost for the vector stmts
      previously collected.  */
   stmt_vector_for_cost scalar_costs = vNULL;
   stmt_vector_for_cost vector_costs = vNULL;
-  hash_set<slp_tree> visited;
+  visited.empty ();
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       auto_vec<bool, 20> life;
@@ -5083,14 +5329,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
                          SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
       vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
-                              &life, &scalar_costs, visited);
+                              &life, &scalar_costs, vectorized_scalar_stmts,
+                              visited);
       vector_costs.safe_splice (instance->cost_vec);
       instance->cost_vec.release ();
     }
-  /* Unset visited flag.  */
-  stmt_info_for_cost *cost;
-  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
-    gimple_set_visited  (cost->stmt_info->stmt, false);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
@@ -5107,6 +5350,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
     li_scalar_costs (scalar_costs.length ());
   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
     li_vector_costs (vector_costs.length ());
+  stmt_info_for_cost *cost;
   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
     {
       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
@@ -5129,6 +5373,7 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
   /* Now cost the portions individually.  */
   unsigned vi = 0;
   unsigned si = 0;
+  bool profitable = true;
   while (si < li_scalar_costs.length ()
         && vi < li_vector_costs.length ())
     {
@@ -5195,25 +5440,29 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
         example).  */
       if (vec_outside_cost + vec_inside_cost > scalar_cost)
        {
-         scalar_costs.release ();
-         vector_costs.release ();
-         return false;
+         profitable = false;
+         break;
        }
     }
-  if (vi < li_vector_costs.length ())
+  if (profitable && vi < li_vector_costs.length ())
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Excess vector cost for part in loop %d:\n",
                         li_vector_costs[vi].first);
-      scalar_costs.release ();
-      vector_costs.release ();
-      return false;
+      profitable = false;
     }
 
+  /* Unset visited flag.  This is delayed when the subgraph is profitable
+     and we process the loop for remaining unvectorized if-converted code.  */
+  if (!orig_loop || !profitable)
+    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+      gimple_set_visited  (cost->stmt_info->stmt, false);
+
   scalar_costs.release ();
   vector_costs.release ();
-  return true;
+
+  return profitable;
 }
 
 /* qsort comparator for lane defs.  */
@@ -5627,7 +5876,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
 
 static bool
 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
-                vec<int> *dataref_groups, unsigned int n_stmts)
+                vec<int> *dataref_groups, unsigned int n_stmts,
+                loop_p orig_loop)
 {
   bb_vec_info bb_vinfo;
   auto_vector_modes vector_modes;
@@ -5666,9 +5916,8 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
 
          bb_vinfo->shared->check_datarefs ();
 
-         unsigned i;
-         slp_instance instance;
-         FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
+         auto_vec<slp_instance> profitable_subgraphs;
+         for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
            {
              if (instance->subgraph_entries.is_empty ())
                continue;
@@ -5676,7 +5925,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
              vect_location = instance->location ();
              if (!unlimited_cost_model (NULL)
                  && !vect_bb_vectorization_profitable_p
-                       (bb_vinfo, instance->subgraph_entries))
+                       (bb_vinfo, instance->subgraph_entries, orig_loop))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -5688,6 +5937,47 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
              if (!dbg_cnt (vect_slp))
                continue;
 
+             profitable_subgraphs.safe_push (instance);
+           }
+
+         /* When we're vectorizing an if-converted loop body with the
+            very-cheap cost model make sure we vectorized all if-converted
+            code.  */
+         if (!profitable_subgraphs.is_empty ()
+             && orig_loop)
+           {
+             gcc_assert (bb_vinfo->bbs.length () == 1);
+             for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
+                  !gsi_end_p (gsi); gsi_next (&gsi))
+               {
+                 /* The costing above left us with DCEable vectorized scalar
+                    stmts having the visited flag set on profitable
+                    subgraphs.  Do the delayed clearing of the flag here.  */
+                 if (gimple_visited_p (gsi_stmt (gsi)))
+                   {
+                     gimple_set_visited (gsi_stmt (gsi), false);
+                     continue;
+                   }
+                 if (flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
+                   continue;
+
+                 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
+                   if (gimple_assign_rhs_code (ass) == COND_EXPR)
+                     {
+                       if (!profitable_subgraphs.is_empty ()
+                           && dump_enabled_p ())
+                         dump_printf_loc (MSG_NOTE, vect_location,
+                                          "not profitable because of "
+                                          "unprofitable if-converted scalar "
+                                          "code\n");
+                       profitable_subgraphs.truncate (0);
+                     }
+               }
+           }
+
+         /* Finally schedule the profitable subgraphs.  */
+         for (slp_instance instance : profitable_subgraphs)
+           {
              if (!vectorized && dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Basic block will be vectorized "
@@ -5776,7 +6066,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
    true if anything in the basic-block was vectorized.  */
 
 static bool
-vect_slp_bbs (vec<basic_block> bbs)
+vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
 {
   vec<data_reference_p> datarefs = vNULL;
   auto_vec<int> dataref_groups;
@@ -5802,20 +6092,24 @@ vect_slp_bbs (vec<basic_block> bbs)
                                              &dataref_groups, current_group))
            ++current_group;
        }
+      /* New BBs always start a new DR group.  */
+      ++current_group;
     }
 
-  return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
+  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
 }
 
-/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
-   true if anything in the basic-block was vectorized.  */
+/* Special entry for the BB vectorizer.  Analyze and transform a single
+   if-converted BB with ORIG_LOOPs body being the not if-converted
+   representation.  Returns true if anything in the basic-block was
+   vectorized.  */
 
 bool
-vect_slp_bb (basic_block bb)
+vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
 {
   auto_vec<basic_block> bbs;
   bbs.safe_push (bb);
-  return vect_slp_bbs (bbs);
+  return vect_slp_bbs (bbs, orig_loop);
 }
 
 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
@@ -5866,7 +6160,7 @@ vect_slp_function (function *fun)
 
       if (split && !bbs.is_empty ())
        {
-         r |= vect_slp_bbs (bbs);
+         r |= vect_slp_bbs (bbs, NULL);
          bbs.truncate (0);
          bbs.quick_push (bb);
        }
@@ -5884,13 +6178,13 @@ vect_slp_function (function *fun)
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "splitting region at control altering "
                               "definition %G", last);
-           r |= vect_slp_bbs (bbs);
+           r |= vect_slp_bbs (bbs, NULL);
            bbs.truncate (0);
          }
     }
 
   if (!bbs.is_empty ())
-    r |= vect_slp_bbs (bbs);
+    r |= vect_slp_bbs (bbs, NULL);
 
   free (rpo);
 
@@ -5921,7 +6215,7 @@ vect_slp_function (function *fun)
 
 void
 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
-                         vec<tree> elts, unsigned int nresults,
+                         const vec<tree> &elts, unsigned int nresults,
                          vec<tree> &results)
 {
   unsigned int nelts = elts.length ();
@@ -6272,14 +6566,15 @@ vect_get_slp_defs (vec_info *,
    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
    permute statements for the SLP node NODE.  Store the number of vector
    permute instructions in *N_PERMS and the number of vector load
-   instructions in *N_LOADS.  */
+   instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
+   that were not needed.  */
 
 bool
 vect_transform_slp_perm_load (vec_info *vinfo,
-                             slp_tree node, vec<tree> dr_chain,
+                             slp_tree node, const vec<tree> &dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              bool analyze_only, unsigned *n_perms,
-                             unsigned int *n_loads)
+                             unsigned int *n_loads, bool dce_chain)
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   int vec_index = 0;
@@ -6358,6 +6653,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
     }
   auto_sbitmap used_in_lanes (in_nlanes);
   bitmap_clear (used_in_lanes);
+  auto_bitmap used_defs;
 
   unsigned int count = mask.encoded_nelts ();
   mask.quick_grow (count);
@@ -6465,11 +6761,20 @@ vect_transform_slp_perm_load (vec_info *vinfo,
                                               mask_vec);
                      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
                                                   gsi);
+                     if (dce_chain)
+                       {
+                         bitmap_set_bit (used_defs, first_vec_index + ri);
+                         bitmap_set_bit (used_defs, second_vec_index + ri);
+                       }
                    }
                  else
-                   /* If mask was NULL_TREE generate the requested
-                      identity transform.  */
-                   perm_stmt = SSA_NAME_DEF_STMT (first_vec);
+                   {
+                     /* If mask was NULL_TREE generate the requested
+                        identity transform.  */
+                     perm_stmt = SSA_NAME_DEF_STMT (first_vec);
+                     if (dce_chain)
+                       bitmap_set_bit (used_defs, first_vec_index + ri);
+                   }
 
                  /* Store the vector statement in NODE.  */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
@@ -6509,6 +6814,16 @@ vect_transform_slp_perm_load (vec_info *vinfo,
        }
     }
 
+  if (dce_chain)
+    for (unsigned i = 0; i < dr_chain.length (); ++i)
+      if (!bitmap_bit_p (used_defs, i))
+       {
+         gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
+         gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
+         gsi_remove (&rgsi, true);
+         release_defs (stmt);
+       }
+
   return true;
 }
 
@@ -6956,6 +7271,21 @@ vect_schedule_slp_node (vec_info *vinfo,
          gcc_assert (seen_vector_def);
          si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
        }
+      else if (is_a <bb_vec_info> (vinfo)
+              && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
+              && gimple_could_trap_p (stmt_info->stmt))
+       {
+         /* We've constrained possibly trapping operations to all come
+            from the same basic-block, if vectorized defs would allow earlier
+            scheduling still force vectorized stmts to the original block.
+            This is only necessary for BB vectorization since for loop vect
+            all operations are in a single BB and scalar stmt based
+            placement doesn't play well with epilogue vectorization.  */
+         gcc_assert (dominated_by_p (CDI_DOMINATORS,
+                                     gimple_bb (stmt_info->stmt),
+                                     gimple_bb (last_stmt)));
+         si = gsi_after_labels (gimple_bb (stmt_info->stmt));
+       }
       else if (is_a <gphi *> (last_stmt))
        si = gsi_after_labels (gimple_bb (last_stmt));
       else
@@ -7270,7 +7600,7 @@ vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
 
 void
-vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
+vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
 {
   slp_instance instance;
   unsigned int i;