git.ipfire.org Git - thirdparty/gcc.git/blobdiff - gcc/tree-vect-slp.c
[Ada] Use Standard.Natural on bit references to packed arrays
[thirdparty/gcc.git] / gcc / tree-vect-slp.c
index 367945b69cdf08e62fd6556a8548d60b2445fbfe..836defce9901683b000216f98c809451c7c82c5d 100644 (file)
@@ -1,5 +1,5 @@
 /* SLP - Basic Block Vectorization
-   Copyright (C) 2007-2018 Free Software Foundation, Inc.
+   Copyright (C) 2007-2020 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
    and Ira Rosen <irar@il.ibm.com>
 
@@ -32,7 +32,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "optabs-tree.h"
 #include "insn-config.h"
 #include "recog.h"             /* FIXME: for insn_data */
-#include "params.h"
 #include "fold-const.h"
 #include "stor-layout.h"
 #include "gimple-iterator.h"
@@ -45,8 +44,40 @@ along with GCC; see the file COPYING3.  If not see
 #include "vec-perm-indices.h"
 #include "gimple-fold.h"
 #include "internal-fn.h"
+#include "dump-context.h"
 
 
+/* Initialize a SLP node.  */
+
+_slp_tree::_slp_tree ()
+{
+  SLP_TREE_SCALAR_STMTS (this) = vNULL;
+  SLP_TREE_SCALAR_OPS (this) = vNULL;
+  SLP_TREE_VEC_STMTS (this) = vNULL;
+  SLP_TREE_VEC_DEFS (this) = vNULL;
+  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
+  SLP_TREE_CHILDREN (this) = vNULL;
+  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
+  SLP_TREE_TWO_OPERATORS (this) = false;
+  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
+  SLP_TREE_VECTYPE (this) = NULL_TREE;
+  SLP_TREE_REPRESENTATIVE (this) = NULL;
+  this->refcnt = 1;
+  this->max_nunits = 1;
+}
+
+/* Tear down a SLP node.  */
+
+_slp_tree::~_slp_tree ()
+{
+  SLP_TREE_CHILDREN (this).release ();
+  SLP_TREE_SCALAR_STMTS (this).release ();
+  SLP_TREE_SCALAR_OPS (this).release ();
+  SLP_TREE_VEC_STMTS (this).release ();
+  SLP_TREE_VEC_DEFS (this).release ();
+  SLP_TREE_LOAD_PERMUTATION (this).release ();
+}
+
 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
    FINAL_P is true if we have vectorized the instance or if we have
    made a final decision not to vectorize the statements in any way.  */
@@ -57,6 +88,9 @@ vect_free_slp_tree (slp_tree node, bool final_p)
   int i;
   slp_tree child;
 
+  if (--node->refcnt != 0)
+    return;
+
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     vect_free_slp_tree (child, final_p);
 
@@ -74,15 +108,9 @@ vect_free_slp_tree (slp_tree node, bool final_p)
        }
     }
 
-  SLP_TREE_CHILDREN (node).release ();
-  SLP_TREE_SCALAR_STMTS (node).release ();
-  SLP_TREE_VEC_STMTS (node).release ();
-  SLP_TREE_LOAD_PERMUTATION (node).release ();
-
-  free (node);
+  delete node;
 }
 
-
 /* Free the memory allocated for the SLP instance.  FINAL_P is true if we
    have vectorized the instance or if we have made a final decision not
    to vectorize the statements in any way.  */
@@ -99,41 +127,33 @@ vect_free_slp_instance (slp_instance instance, bool final_p)
 /* Create an SLP node for SCALAR_STMTS.  */
 
 static slp_tree
-vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
+vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
 {
-  slp_tree node;
-  stmt_vec_info stmt_info = scalar_stmts[0];
-  unsigned int nops;
-
-  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
-    nops = gimple_call_num_args (stmt);
-  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
-    {
-      nops = gimple_num_ops (stmt) - 1;
-      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
-       nops++;
-    }
-  else if (is_a <gphi *> (stmt_info->stmt))
-    nops = 0;
-  else
-    return NULL;
-
-  node = XNEW (struct _slp_tree);
+  slp_tree node = new _slp_tree;
   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
-  SLP_TREE_VEC_STMTS (node).create (0);
-  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
   SLP_TREE_CHILDREN (node).create (nops);
-  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
-  SLP_TREE_TWO_OPERATORS (node) = false;
   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
+  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
 
   unsigned i;
+  stmt_vec_info stmt_info;
   FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
     STMT_VINFO_NUM_SLP_USES (stmt_info)++;
 
   return node;
 }
 
+/* Create an SLP node for OPS.  */
+
+static slp_tree
+vect_create_new_slp_node (vec<tree> ops)
+{
+  slp_tree node = new _slp_tree;
+  SLP_TREE_SCALAR_OPS (node) = ops;
+  SLP_TREE_DEF_TYPE (node) = vect_external_def;
+  return node;
+}
+
 
 /* This structure is used in creation of an SLP tree.  Each instance
    corresponds to the same operand in a group of scalar stmts in an SLP
@@ -142,13 +162,14 @@ typedef struct _slp_oprnd_info
 {
   /* Def-stmts for the operands.  */
   vec<stmt_vec_info> def_stmts;
+  /* Operands.  */
+  vec<tree> ops;
   /* Information about the first statement, its vector def-type, type, the
      operand itself in case it's constant, and an indication if it's a pattern
      stmt.  */
   tree first_op_type;
   enum vect_def_type first_dt;
-  bool first_pattern;
-  bool second_pattern;
+  bool any_pattern;
 } *slp_oprnd_info;
 
 
@@ -166,10 +187,10 @@ vect_create_oprnd_info (int nops, int group_size)
     {
       oprnd_info = XNEW (struct _slp_oprnd_info);
       oprnd_info->def_stmts.create (group_size);
+      oprnd_info->ops.create (group_size);
       oprnd_info->first_dt = vect_uninitialized_def;
       oprnd_info->first_op_type = NULL_TREE;
-      oprnd_info->first_pattern = false;
-      oprnd_info->second_pattern = false;
+      oprnd_info->any_pattern = false;
       oprnds_info.quick_push (oprnd_info);
     }
 
@@ -188,6 +209,7 @@ vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
       oprnd_info->def_stmts.release ();
+      oprnd_info->ops.release ();
       XDELETE (oprnd_info);
     }
 
@@ -195,6 +217,19 @@ vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
 }
 
 
+/* Return true if STMTS contains a pattern statement.  */
+
+static bool
+vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
+{
+  stmt_vec_info stmt_info;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
+    if (is_pattern_stmt_p (stmt_info))
+      return true;
+  return false;
+}
+
 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
    of the chain.  */
@@ -222,33 +257,45 @@ vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
   return -1;
 }
 
-/* Check whether it is possible to load COUNT elements of type ELT_MODE
+/* Check whether it is possible to load COUNT elements of type ELT_TYPE
    using the method implemented by duplicate_and_interleave.  Return true
    if so, returning the number of intermediate vectors in *NVECTORS_OUT
    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
    (if nonnull).  */
 
 bool
-can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
-                               unsigned int *nvectors_out,
+can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
+                               tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
 {
-  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
-  poly_int64 nelts;
+  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
+  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
+    return false;
+
+  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
+  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
   unsigned int nvectors = 1;
   for (;;)
     {
       scalar_int_mode int_mode;
       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
-      if (multiple_p (current_vector_size, elt_bytes, &nelts)
-         && int_mode_for_size (elt_bits, 0).exists (&int_mode))
+      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
+         /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
-         tree vector_type = build_vector_type (int_type, nelts);
-         if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
+         tree vector_type
+           = get_vectype_for_scalar_type (vinfo, int_type, count);
+         if (vector_type
+             && VECTOR_MODE_P (TYPE_MODE (vector_type))
+             && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
+                          GET_MODE_SIZE (base_vector_mode)))
            {
+             /* Try fusing consecutive sequences of COUNT / NVECTORS elements
+                together into elements of type INT_TYPE and using the result
+                to build NVECTORS vectors.  */
+             poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
@@ -306,13 +353,11 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
   tree oprnd;
   unsigned int i, number_of_oprnds;
   enum vect_def_type dt = vect_uninitialized_def;
-  bool pattern = false;
   slp_oprnd_info oprnd_info;
   int first_op_idx = 1;
   unsigned int commutative_op = -1U;
   bool first_op_cond = false;
   bool first = stmt_num == 0;
-  bool second = stmt_num == 1;
 
   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
     {
@@ -322,6 +367,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
+
+         /* Masked load, only look at mask.  */
+         if (ifn == IFN_MASK_LOAD)
+           {
+             number_of_oprnds = 1;
+             /* Mask operand index.  */
+             first_op_idx = 5;
+           }
        }
     }
   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
@@ -330,7 +383,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
       number_of_oprnds = gimple_num_ops (stmt) - 1;
       /* Swap can only be done for cond_expr if asked to, otherwise we
         could result in different comparison code to the first stmt.  */
-      if (gimple_assign_rhs_code (stmt) == COND_EXPR
+      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
@@ -361,6 +414,8 @@ again:
        }
       else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
+      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
+       oprnd = TREE_OPERAND (oprnd, 0);
 
       oprnd_info = (*oprnds_info)[i];
 
@@ -368,23 +423,26 @@ again:
       if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "Build SLP failed: can't analyze def for ");
-             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
-              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-           }
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Build SLP failed: can't analyze def for %T\n",
+                            oprnd);
 
          return -1;
        }
 
-      if (second)
-       oprnd_info->second_pattern = pattern;
+      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
+       oprnd_info->any_pattern = true;
 
       if (first)
        {
+         /* For the swapping logic below force vect_reduction_def
+            for the reduction op in a SLP reduction group.  */
+         if (!STMT_VINFO_DATA_REF (stmt_info)
+             && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+             && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
+             && def_stmt_info)
+           dt = vect_reduction_def;
          oprnd_info->first_dt = dt;
-         oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
       else
@@ -393,20 +451,35 @@ again:
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
-            vect_internal_def.  */
+            end in the reduction chain.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
-                   && dt == vect_internal_def)
+                   && !STMT_VINFO_DATA_REF (stmt_info)
+                   && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+                   && def_stmt_info
+                   && !STMT_VINFO_DATA_REF (def_stmt_info)
+                   && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
+                       == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
-             || !types_compatible_p (oprnd_info->first_op_type, type))
+             || !types_compatible_p (oprnd_info->first_op_type, type)
+             || (!STMT_VINFO_DATA_REF (stmt_info)
+                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+                 && ((!def_stmt_info
+                      || STMT_VINFO_DATA_REF (def_stmt_info)
+                      || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
+                          != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
+                     != (oprnd_info->first_dt != vect_reduction_def))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_NOTE, vect_location,
+                                    "trying swapped operands\n");
                  swapped = true;
                  goto again;
                }
@@ -419,19 +492,15 @@ again:
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
-             && !current_vector_size.is_constant ()
+             && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
-                 || !can_duplicate_and_interleave_p (stmts.length (),
-                                                     TYPE_MODE (type))))
+                 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
+                                                     type)))
            {
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                  "Build SLP failed: invalid type of def "
-                                  "for variable-length SLP ");
-                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
-                 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-               }
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "Build SLP failed: invalid type of def "
+                                "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }
@@ -439,25 +508,45 @@ again:
       /* Check the types of the definitions.  */
       switch (dt)
        {
-       case vect_constant_def:
        case vect_external_def:
+         /* Make sure to demote the overall operand to external.  */
+         oprnd_info->first_dt = vect_external_def;
+         /* Fallthru.  */
+       case vect_constant_def:
+         oprnd_info->def_stmts.quick_push (NULL);
+         oprnd_info->ops.quick_push (oprnd);
          break;
 
+       case vect_internal_def:
        case vect_reduction_def:
+         if (oprnd_info->first_dt == vect_reduction_def
+             && !STMT_VINFO_DATA_REF (stmt_info)
+             && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+             && !STMT_VINFO_DATA_REF (def_stmt_info)
+             && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
+                 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
+           {
+             /* For a SLP reduction chain we want to duplicate the
+                reduction to each of the chain members.  That gets
+                us a sane SLP graph (still the stmts are not 100%
+                correct wrt the initial values).  */
+             gcc_assert (!first);
+             oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
+             oprnd_info->ops.quick_push (oprnd_info->ops[0]);
+             break;
+           }
+         /* Fallthru.  */
        case vect_induction_def:
-       case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
+         oprnd_info->ops.quick_push (oprnd);
          break;
 
        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "Build SLP failed: illegal type of def ");
-             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
-              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-           }
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Build SLP failed: illegal type of def %T\n",
+                            oprnd);
 
          return -1;
        }
@@ -466,62 +555,85 @@ again:
   /* Swap operands.  */
   if (swapped)
     {
-      /* If there are already uses of this stmt in a SLP instance then
-         we've committed to the operand order and can't swap it.  */
-      if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
-       {
-         if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "Build SLP failed: cannot swap operands of "
-                              "shared stmt ");
-             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                               stmt_info->stmt, 0);
-           }
-         return -1;
-       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "swapped operands to match def types in %G",
+                        stmt_info->stmt);
+    }
 
-      if (first_op_cond)
-       {
-         gassign *stmt = as_a <gassign *> (stmt_info->stmt);
-         tree cond = gimple_assign_rhs1 (stmt);
-         enum tree_code code = TREE_CODE (cond);
+  *swap = swapped;
+  return 0;
+}
 
-         /* Swap.  */
-         if (*swap == 1)
-           {
-             swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
-                                &TREE_OPERAND (cond, 1));
-             TREE_SET_CODE (cond, swap_tree_comparison (code));
-           }
-         /* Invert.  */
-         else
-           {
-             swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
-                                gimple_assign_rhs3_ptr (stmt));
-             bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
-             code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
-             gcc_assert (code != ERROR_MARK);
-             TREE_SET_CODE (cond, code);
-           }
-       }
-      else
-       {
-         unsigned int op = commutative_op + first_op_idx;
-         swap_ssa_operands (stmt_info->stmt,
-                            gimple_op_ptr (stmt_info->stmt, op),
-                            gimple_op_ptr (stmt_info->stmt, op + 1));
-       }
-      if (dump_enabled_p ())
+/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
+   Return true if we can, meaning that this choice doesn't conflict with
+   existing SLP nodes that use STMT_INFO.  */
+
+static bool
+vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
+{
+  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
+  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
+    return true;
+
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
+    {
+      /* We maintain the invariant that if any statement in the group is
+        used, all other members of the group have the same vector type.  */
+      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      stmt_vec_info member_info = first_info;
+      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
+       if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
+           || is_pattern_stmt_p (member_info))
+         break;
+
+      if (!member_info)
        {
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "swapped operands to match def types in ");
-         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
+         for (member_info = first_info; member_info;
+              member_info = DR_GROUP_NEXT_ELEMENT (member_info))
+           STMT_VINFO_VECTYPE (member_info) = vectype;
+         return true;
        }
     }
+  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
+          && !is_pattern_stmt_p (stmt_info))
+    {
+      STMT_VINFO_VECTYPE (stmt_info) = vectype;
+      return true;
+    }
 
-  *swap = swapped;
-  return 0;
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                      "Build SLP failed: incompatible vector"
+                      " types for: %G", stmt_info->stmt);
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "    old vector type: %T\n", old_vectype);
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "    new vector type: %T\n", vectype);
+    }
+  return false;
+}
+
+/* Try to infer and assign a vector type to all the statements in STMTS.
+   Used only for BB vectorization.  */
+
+static bool
+vect_update_all_shared_vectypes (vec_info *vinfo, vec<stmt_vec_info> stmts)
+{
+  tree vectype, nunits_vectype;
+  if (!vect_get_vector_types_for_stmt (vinfo, stmts[0], &vectype,
+                                      &nunits_vectype, stmts.length ()))
+    return false;
+
+  stmt_vec_info stmt_info;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
+    if (!vect_update_shared_vectype (stmt_info, vectype))
+      return false;
+
+  return true;
 }
 
 /* Return true if call statements CALL1 and CALL2 are similar enough
@@ -567,19 +679,16 @@ compatible_calls_p (gcall *call1, gcall *call2)
    vect_build_slp_tree.  */
 
 static bool
-vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
+vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
+                       unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
 {
   if (!vectype)
     {
       if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "Build SLP failed: unsupported data-type in ");
-         dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                           stmt_info->stmt, 0);
-         dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-       }
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "Build SLP failed: unsupported data-type in %G\n",
+                        stmt_info->stmt);
       /* Fatal mismatch.  */
       return false;
     }
@@ -588,13 +697,14 @@ vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
      before adjusting *max_nunits for basic-block vectorization.  */
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   unsigned HOST_WIDE_INT const_nunits;
-  if (STMT_VINFO_BB_VINFO (stmt_info)
+  if (is_a <bb_vec_info> (vinfo)
       && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
     {
-      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                      "Build SLP failed: unrolling required "
-                      "in basic block SLP\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "Build SLP failed: unrolling required "
+                        "in basic block SLP\n");
       /* Fatal mismatch.  */
       return false;
     }
@@ -640,7 +750,7 @@ vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
    is false then this indicates the comparison could not be
    carried out or the stmts will never be vectorized by SLP.
 
-   Note COND_EXPR is possibly ismorphic to another one after swapping its
+   Note COND_EXPR is possibly isomorphic to another one after swapping its
    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
    the first stmt by swapping the two operands of comparison; set SWAP[i]
    to 2 if stmt I is isormorphic to the first stmt by inverting the code
@@ -648,7 +758,7 @@ vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
 
 static bool
-vect_build_slp_tree_1 (unsigned char *swap,
+vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
@@ -667,6 +777,7 @@ vect_build_slp_tree_1 (unsigned char *swap,
   machine_mode optab_op2_mode;
   machine_mode vec_mode;
   stmt_vec_info first_load = NULL, prev_first_load = NULL;
+  bool load_p = false;
 
   /* For every stmt in NODE find its def stmt/s.  */
   stmt_vec_info stmt_info;
@@ -677,20 +788,15 @@ vect_build_slp_tree_1 (unsigned char *swap,
       matches[i] = false;
 
       if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
-         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
-       }
+       dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 
       /* Fail to vectorize statements marked as unvectorizable.  */
       if (!STMT_VINFO_VECTORIZABLE (stmt_info))
         {
           if (dump_enabled_p ())
-            {
-              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "Build SLP failed: unvectorizable statement ");
-              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
-            }
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Build SLP failed: unvectorizable statement %G",
+                            stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
           return false;
@@ -700,22 +806,19 @@ vect_build_slp_tree_1 (unsigned char *swap,
       if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "Build SLP failed: not GIMPLE_ASSIGN nor "
-                              "GIMPLE_CALL ");
-             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
-           }
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Build SLP failed: not GIMPLE_ASSIGN nor "
+                            "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }
 
       tree nunits_vectype;
-      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
-                                          &nunits_vectype)
+      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
+                                          &nunits_vectype, group_size)
          || (nunits_vectype
-             && !vect_record_max_nunits (stmt_info, group_size,
+             && !vect_record_max_nunits (vinfo, stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
@@ -725,31 +828,39 @@ vect_build_slp_tree_1 (unsigned char *swap,
 
       gcc_assert (vectype);
 
-      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
+      if (is_a <bb_vec_info> (vinfo)
+         && !vect_update_shared_vectype (stmt_info, vectype))
+       continue;
+
+      gcall *call_stmt = dyn_cast <gcall *> (stmt);
+      if (call_stmt)
        {
          rhs_code = CALL_EXPR;
-         if ((gimple_call_internal_p (call_stmt)
-              && (!vectorizable_internal_fn_p
-                  (gimple_call_internal_fn (call_stmt))))
-             || gimple_call_tail_p (call_stmt)
-             || gimple_call_noreturn_p (call_stmt)
-             || !gimple_call_nothrow_p (call_stmt)
-             || gimple_call_chain (call_stmt))
+
+         if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
+           load_p = true;
+         else if ((gimple_call_internal_p (call_stmt)
+                   && (!vectorizable_internal_fn_p
+                       (gimple_call_internal_fn (call_stmt))))
+                  || gimple_call_tail_p (call_stmt)
+                  || gimple_call_noreturn_p (call_stmt)
+                  || !gimple_call_nothrow_p (call_stmt)
+                  || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                  "Build SLP failed: unsupported call type ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                   call_stmt, 0);
-               }
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "Build SLP failed: unsupported call type %G",
+                                call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
       else
-       rhs_code = gimple_assign_rhs_code (stmt);
+       {
+         rhs_code = gimple_assign_rhs_code (stmt);
+         load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
+       }
 
       /* Check the operation.  */
       if (i == 0)
@@ -762,17 +873,6 @@ vect_build_slp_tree_1 (unsigned char *swap,
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
-             if (vectype == boolean_type_node)
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "Build SLP failed: shift of a"
-                                    " boolean.\n");
-                 /* Fatal mismatch.  */
-                 matches[0] = false;
-                 return false;
-               }
-
              vec_mode = TYPE_MODE (vectype);
 
              /* First see if we have a vector/vector shift.  */
@@ -819,6 +919,12 @@ vect_build_slp_tree_1 (unsigned char *swap,
               need_same_oprnds = true;
               first_op1 = gimple_assign_rhs2 (stmt);
             }
+         else if (call_stmt
+                  && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
+           {
+             need_same_oprnds = true;
+             first_op1 = gimple_call_arg (call_stmt, 1);
+           }
        }
       else
        {
@@ -848,43 +954,39 @@ vect_build_slp_tree_1 (unsigned char *swap,
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
                                   "Build SLP failed: different operation "
-                                  "in stmt ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+                                  "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                  "original stmt ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                   first_stmt_info->stmt, 0);
+                                  "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }
 
-         if (need_same_oprnds
-             && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
+         if (need_same_oprnds)
            {
-             if (dump_enabled_p ())
+             tree other_op1 = (call_stmt
+                               ? gimple_call_arg (call_stmt, 1)
+                               : gimple_assign_rhs2 (stmt));
+             if (!operand_equal_p (first_op1, other_op1, 0))
                {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                  "Build SLP failed: different shift "
-                                  "arguments in ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Build SLP failed: different shift "
+                                    "arguments in %G", stmt);
+                 /* Mismatch.  */
+                 continue;
                }
-             /* Mismatch.  */
-             continue;
            }
 
-         if (rhs_code == CALL_EXPR)
+         if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
-                   {
-                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                      "Build SLP failed: different calls in ");
-                     dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                       stmt, 0);
-                   }
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Build SLP failed: different calls in %G",
+                                    stmt);
                  /* Mismatch.  */
                  continue;
                }
@@ -910,14 +1012,11 @@ vect_build_slp_tree_1 (unsigned char *swap,
                   if (prev_first_load != first_load)
                     {
                       if (dump_enabled_p ())
-                        {
-                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                          vect_location, 
-                                          "Build SLP failed: different "
-                                          "interleaving chains in one node ");
-                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                           stmt, 0);
-                        }
+                       dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                                        vect_location,
+                                        "Build SLP failed: different "
+                                        "interleaving chains in one node %G",
+                                        stmt);
                      /* Mismatch.  */
                      continue;
                     }
@@ -928,15 +1027,12 @@ vect_build_slp_tree_1 (unsigned char *swap,
         } /* Grouped access.  */
       else
        {
-         if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
+         if (load_p)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                  "Build SLP failed: not grouped load ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
-               }
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "Build SLP failed: not grouped load %G", stmt);
 
              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
@@ -949,15 +1045,13 @@ vect_build_slp_tree_1 (unsigned char *swap,
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
+             && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                  "Build SLP failed: operation");
-                 dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
-               }
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "Build SLP failed: operation unsupported %G",
+                                stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
@@ -990,13 +1084,9 @@ vect_build_slp_tree_1 (unsigned char *swap,
              else
                {
                  if (dump_enabled_p ())
-                   {
-                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                      "Build SLP failed: different"
-                                      " operation");
-                     dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                       stmt, 0);
-                   }
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Build SLP failed: different"
+                                    " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
@@ -1015,9 +1105,8 @@ vect_build_slp_tree_1 (unsigned char *swap,
   if (alt_stmt_code != ERROR_MARK
       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
     {
-      if (vectype == boolean_type_node
-         || !vect_two_operations_perm_ok_p (stmts, group_size,
-                                            vectype, alt_stmt_code))
+      if (!vect_two_operations_perm_ok_p (stmts, group_size,
+                                         vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
@@ -1027,13 +1116,9 @@ vect_build_slp_tree_1 (unsigned char *swap,
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
-                                    "in stmt ");
-                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                     stmts[i]->stmt, 0);
+                                    "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "original stmt ");
-                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                     first_stmt_info->stmt, 0);
+                                    "original stmt %G", first_stmt_info->stmt);
                  }
              }
          return false;
@@ -1055,6 +1140,7 @@ struct bst_traits
   static inline bool equal (value_type existing, value_type candidate);
   static inline bool is_empty (value_type x) { return !x.exists (); }
   static inline bool is_deleted (value_type x) { return !x.exists (); }
+  static const bool empty_zero_p = true;
   static inline void mark_empty (value_type &x) { x.release (); }
   static inline void mark_deleted (value_type &x) { x.release (); }
   static inline void remove (value_type &x) { x.release (); }
@@ -1078,9 +1164,6 @@ bst_traits::equal (value_type existing, value_type candidate)
   return true;
 }
 
-typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
-static scalar_stmts_set_t *bst_fail;
-
 typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
   scalar_stmts_to_slp_tree_map_t;
@@ -1089,32 +1172,40 @@ static slp_tree
 vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
-                      vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
-                      unsigned max_tree_size);
+                      scalar_stmts_to_slp_tree_map_t *bst_map);
 
 static slp_tree
 vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
-                    poly_uint64 *max_nunits, vec<slp_tree> *loads,
+                    poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
-                    unsigned max_tree_size)
+                    scalar_stmts_to_slp_tree_map_t *bst_map)
 {
-  if (bst_fail->contains (stmts))
-    return NULL;
-  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
-                                       loads, matches, npermutes, tree_size,
-                                       max_tree_size);
-  /* When SLP build fails for stmts record this, otherwise SLP build
-     can be exponential in time when we allow to construct parts from
-     scalars, see PR81723.  */
-  if (! res)
-    {
-      vec <stmt_vec_info> x;
-      x.create (stmts.length ());
-      x.splice (stmts);
-      bst_fail->add (x);
+  if (slp_tree *leader = bst_map->get (stmts))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
+                        *leader ? "" : "failed ", *leader);
+      if (*leader)
+       {
+         (*leader)->refcnt++;
+         vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
+       }
+      return *leader;
+    }
+  poly_uint64 this_max_nunits = 1;
+  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
+                                       &this_max_nunits,
+                                       matches, npermutes, tree_size, bst_map);
+  if (res)
+    {
+      res->max_nunits = this_max_nunits;
+      vect_update_max_nunits (max_nunits, this_max_nunits);
+      /* Keep a reference for the bst_map use.  */
+      res->refcnt++;
     }
+  bst_map->put (stmts.copy (), res);
   return res;
 }
 
@@ -1129,9 +1220,8 @@ static slp_tree
 vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
-                      vec<slp_tree> *loads,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
-                      unsigned max_tree_size)
+                      scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   unsigned nops, i, this_tree_size = 0;
   poly_uint64 this_max_nunits = *max_nunits;
@@ -1158,8 +1248,9 @@ vect_build_slp_tree_2 (vec_info *vinfo,
   if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
     {
       tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
-      tree vectype = get_vectype_for_scalar_type (scalar_type);
-      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
+      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
+      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
+                                  max_nunits))
        return NULL;
 
       vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
@@ -1171,39 +1262,64 @@ vect_build_slp_tree_2 (vec_info *vinfo,
            if (stmt_info != other_info)
              return NULL;
        }
-      else
+      else if (def_type == vect_reduction_def
+              || def_type == vect_double_reduction_def
+              || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
-           {
-             /* But for reduction chains only check on the first stmt.  */
-             if (REDUC_GROUP_FIRST_ELEMENT (other_info)
-                 && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
-               continue;
-             if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
-               return NULL;
-           }
+           if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
+             return NULL;
        }
-      node = vect_create_new_slp_node (stmts);
+      else
+       return NULL;
+      (*tree_size)++;
+      node = vect_create_new_slp_node (stmts, 0);
       return node;
     }
 
 
   bool two_operators = false;
   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
-  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
+  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
     return NULL;
 
-  /* If the SLP node is a load, terminate the recursion.  */
+  /* If the SLP node is a load, terminate the recursion unless masked.  */
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
     {
-      *max_nunits = this_max_nunits;
-      node = vect_create_new_slp_node (stmts);
-      loads->safe_push (node);
-      return node;
+      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
+       {
+         /* Masked load.  */
+         gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
+         nops = 1;
+       }
+      else
+       {
+         *max_nunits = this_max_nunits;
+         (*tree_size)++;
+         node = vect_create_new_slp_node (stmts, 0);
+         /* And compute the load permutation.  Whether it is actually
+            a permutation depends on the unrolling factor which is
+            decided later.  */
+         vec<unsigned> load_permutation;
+         int j;
+         stmt_vec_info load_info;
+         load_permutation.create (group_size);
+         stmt_vec_info first_stmt_info
+           = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+           {
+             int load_place = vect_get_place_in_interleaving_chain
+                 (load_info, first_stmt_info);
+             gcc_assert (load_place != -1);
+             load_permutation.safe_push (load_place);
+           }
+         SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
+         return node;
+       }
     }
 
   /* Get at the operands, verifying they are compatible.  */
@@ -1226,67 +1342,74 @@ vect_build_slp_tree_2 (vec_info *vinfo,
       }
 
   auto_vec<slp_tree, 4> children;
-  auto_vec<slp_tree> this_loads;
 
   stmt_info = stmts[0];
 
-  if (tree_size)
-    max_tree_size -= *tree_size;
-
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
       slp_tree child;
-      unsigned old_nloads = this_loads.length ();
       unsigned old_tree_size = this_tree_size;
       unsigned int j;
 
+      if (oprnd_info->first_dt == vect_uninitialized_def)
+       {
+         /* COND_EXPR have one too many eventually if the condition
+            is a SSA name.  */
+         gcc_assert (i == 3 && nops == 4);
+         continue;
+       }
+
       if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
-        continue;
-
-      if (++this_tree_size > max_tree_size)
        {
-         FOR_EACH_VEC_ELT (children, j, child)
-           vect_free_slp_tree (child, false);
-         vect_free_oprnd_info (oprnds_info);
-         return NULL;
+         slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
+         SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
+         oprnd_info->ops = vNULL;
+         children.safe_push (invnode);
+         continue;
        }
 
       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
-                                       &this_loads, matches, npermutes,
-                                       &this_tree_size,
-                                       max_tree_size)) != NULL)
+                                       matches, npermutes,
+                                       &this_tree_size, bst_map)) != NULL)
        {
-         /* If we have all children of child built up from scalars then just
-            throw that away and build it up this node from scalars.  */
-         if (!SLP_TREE_CHILDREN (child).is_empty ()
+         /* If we have all children of a non-unary child built up from
+            scalars then just throw that away and build it up this node
+            from scalars.  */
+         if (is_a <bb_vec_info> (vinfo)
+             && SLP_TREE_CHILDREN (child).length () > 1
              /* ???  Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
-             && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
+             && !oprnd_info->any_pattern)
            {
              slp_tree grandchild;
 
              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
-               if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
+               if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                  break;
-             if (!grandchild)
+             if (!grandchild
+                 && vect_update_all_shared_vectypes (vinfo,
+                                                     oprnd_info->def_stmts))
                {
                  /* Roll back.  */
-                 this_loads.truncate (old_nloads);
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);
 
-                 dump_printf_loc (MSG_NOTE, vect_location,
-                                  "Building parent vector operands from "
-                                  "scalars instead\n");
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_NOTE, vect_location,
+                                    "Building parent vector operands from "
+                                    "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
+                 SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
+                 oprnd_info->ops = vNULL;
+                 ++this_tree_size;
                  children.safe_push (child);
                  continue;
                }
@@ -1310,13 +1433,19 @@ vect_build_slp_tree_2 (vec_info *vinfo,
          /* ???  Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
-         && !is_pattern_stmt_p (stmt_info))
+         && !is_pattern_stmt_p (stmt_info)
+         && !oprnd_info->any_pattern
+         && vect_update_all_shared_vectypes (vinfo, oprnd_info->def_stmts))
        {
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "Building vector operands from scalars\n");
-         child = vect_create_new_slp_node (oprnd_info->def_stmts);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Building vector operands from scalars\n");
+         this_tree_size++;
+         child = vect_create_new_slp_node (oprnd_info->def_stmts, 0);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
+         SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
+         oprnd_info->ops = vNULL;
          oprnd_info->def_stmts = vNULL;
          continue;
        }
@@ -1332,6 +1461,9 @@ vect_build_slp_tree_2 (vec_info *vinfo,
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
+         /* Swapping operands for reductions breaks assumptions later on.  */
+         && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
+         && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of not successful permutes was nor more
             than a cut-ff as re-trying the recursive match on
             possibly each level of the tree would expose exponential
@@ -1358,107 +1490,68 @@ vect_build_slp_tree_2 (vec_info *vinfo,
                      swap_not_matching = false;
                      break;
                    }
-                 /* Verify if we can safely swap or if we committed to a
-                    specific operand order already.
-                    ???  Instead of modifying GIMPLE stmts here we could
-                    record whether we want to swap operands in the SLP
-                    node and temporarily do that when processing it
-                    (or wrap operand accessors in a helper).  */
-                 else if (swap[j] != 0
-                          || STMT_VINFO_NUM_SLP_USES (stmt_info))
-                   {
-                     if (!swap_not_matching)
-                       {
-                         if (dump_enabled_p ())
-                           {
-                             dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                              vect_location,
-                                              "Build SLP failed: cannot swap "
-                                              "operands of shared stmt ");
-                             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
-                                               TDF_SLIM, stmts[j]->stmt, 0);
-                           }
-                         goto fail;
-                       }
-                     swap_not_matching = false;
-                     break;
-                   }
                }
            }
          while (j != group_size);
 
          /* Swap mismatched definition stmts.  */
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "Re-trying with swapped operands of stmts ");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
-               dump_printf (MSG_NOTE, "%d ", j);
+               std::swap (oprnds_info[0]->ops[j],
+                          oprnds_info[1]->ops[j]);
+               if (dump_enabled_p ())
+                 dump_printf (MSG_NOTE, "%d ", j);
              }
-         dump_printf (MSG_NOTE, "\n");
+         if (dump_enabled_p ())
+           dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ... */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
-                                           &this_loads, tem, npermutes,
-                                           &this_tree_size,
-                                           max_tree_size)) != NULL)
+                                           tem, npermutes,
+                                           &this_tree_size, bst_map)) != NULL)
            {
-             /* ... so if successful we can apply the operand swapping
-                to the GIMPLE IL.  This is necessary because for example
-                vect_get_slp_defs uses operand indexes and thus expects
-                canonical operand order.  This is also necessary even
-                if we end up building the operand from scalars as
-                we'll continue to process swapped operand two.  */
-             for (j = 0; j < group_size; ++j)
-               gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false);
-             for (j = 0; j < group_size; ++j)
-               if (matches[j] == !swap_not_matching)
-                 {
-                   gassign *stmt = as_a <gassign *> (stmts[j]->stmt);
-                   /* Avoid swapping operands twice.  */
-                   if (gimple_plf (stmt, GF_PLF_1))
-                     continue;
-                   swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
-                                      gimple_assign_rhs2_ptr (stmt));
-                   gimple_set_plf (stmt, GF_PLF_1, true);
-                 }
-             /* Verify we swap all duplicates or none.  */
-             if (flag_checking)
-               for (j = 0; j < group_size; ++j)
-                 gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1)
-                             == (matches[j] == !swap_not_matching));
-
-             /* If we have all children of child built up from scalars then
-                just throw that away and build it up this node from scalars.  */
-             if (!SLP_TREE_CHILDREN (child).is_empty ()
+             /* If we have all children of a non-unary child built up from
+                scalars then just throw that away and build it up this node
+                from scalars.  */
+             if (is_a <bb_vec_info> (vinfo)
+                 && SLP_TREE_CHILDREN (child).length () > 1
                  /* ???  Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
-                 && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
+                 && !oprnd_info->any_pattern)
                {
                  unsigned int j;
                  slp_tree grandchild;
 
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
-                   if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
+                   if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                      break;
-                 if (!grandchild)
+                 if (!grandchild
+                     && (vect_update_all_shared_vectypes
+                           (vinfo, oprnd_info->def_stmts)))
                    {
                      /* Roll back.  */
-                     this_loads.truncate (old_nloads);
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);
 
-                     dump_printf_loc (MSG_NOTE, vect_location,
-                                      "Building parent vector operands from "
-                                      "scalars instead\n");
+                     if (dump_enabled_p ())
+                       dump_printf_loc (MSG_NOTE, vect_location,
+                                        "Building parent vector operands from "
+                                        "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
+                     SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
+                     oprnd_info->ops = vNULL;
+                     ++this_tree_size;
                      children.safe_push (child);
                      continue;
                    }
@@ -1482,68 +1575,100 @@ fail:
 
   vect_free_oprnd_info (oprnds_info);
 
-  if (tree_size)
-    *tree_size += this_tree_size;
+  *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
-  loads->safe_splice (this_loads);
 
-  node = vect_create_new_slp_node (stmts);
+  node = vect_create_new_slp_node (stmts, nops);
   SLP_TREE_TWO_OPERATORS (node) = two_operators;
   SLP_TREE_CHILDREN (node).splice (children);
   return node;
 }
 
-/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
+/* Dump a single SLP tree NODE.  */
 
 static void
 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
 {
-  int i;
-  stmt_vec_info stmt_info;
+  unsigned i, j;
   slp_tree child;
-
-  dump_printf_loc (dump_kind, loc, "node%s\n",
-                  SLP_TREE_DEF_TYPE (node) != vect_internal_def
-                  ? " (external)" : "");
-  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+  stmt_vec_info stmt_info;
+  tree op;
+
+  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
+  dump_user_location_t user_loc = loc.get_user_location ();
+  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
+                  SLP_TREE_DEF_TYPE (node) == vect_external_def
+                  ? " (external)"
+                  : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
+                     ? " (constant)"
+                     : ""), node,
+                  estimated_poly_value (node->max_nunits), node->refcnt);
+  if (SLP_TREE_SCALAR_STMTS (node).exists ())
+    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
+  else
+    {
+      dump_printf_loc (metadata, user_loc, "\t{ ");
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+       dump_printf (metadata, "%T%s ", op,
+                    i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
+      dump_printf (metadata, "}\n");
+    }
+  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
     {
-      dump_printf_loc (dump_kind, loc, "\tstmt %d ", i);
-      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt_info->stmt, 0);
+      dump_printf_loc (metadata, user_loc, "\tload permutation {");
+      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
+       dump_printf (dump_kind, " %u", j);
+      dump_printf (dump_kind, " }\n");
     }
+  if (SLP_TREE_CHILDREN (node).is_empty ())
+    return;
+  dump_printf_loc (metadata, user_loc, "\tchildren");
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_print_slp_tree (dump_kind, loc, child);
+    dump_printf (dump_kind, " %p", (void *)child);
+  dump_printf (dump_kind, "\n");
 }
 
+DEBUG_FUNCTION void
+debug (slp_tree node)
+{
+  debug_dump_context ctx;
+  vect_print_slp_tree (MSG_NOTE,
+                      dump_location_t::from_location_t (UNKNOWN_LOCATION),
+                      node);
+}
 
-/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
-   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
-   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
-   stmts in NODE are to be marked.  */
+/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
 
 static void
-vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
+vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
+                     slp_tree node, hash_set<slp_tree> &visited)
 {
-  int i;
-  stmt_vec_info stmt_info;
+  unsigned i;
   slp_tree child;
 
-  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
+  if (visited.add (node))
     return;
 
-  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
-    if (j < 0 || i == j)
-      STMT_SLP_TYPE (stmt_info) = mark;
+  vect_print_slp_tree (dump_kind, loc, node);
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_mark_slp_stmts (child, mark, j);
+    vect_print_slp_graph (dump_kind, loc, child, visited);
 }
 
+static void
+vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
+                     slp_tree entry)
+{
+  hash_set<slp_tree> visited;
+  vect_print_slp_graph (dump_kind, loc, entry, visited);
+}
 
-/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
+/* Mark the tree rooted at NODE with PURE_SLP.  */
 
 static void
-vect_mark_slp_stmts_relevant (slp_tree node)
+vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
 {
   int i;
   stmt_vec_info stmt_info;
@@ -1552,6 +1677,38 @@ vect_mark_slp_stmts_relevant (slp_tree node)
   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
     return;
 
+  if (visited.add (node))
+    return;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+    STMT_SLP_TYPE (stmt_info) = pure_slp;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    vect_mark_slp_stmts (child, visited);
+}
+
+static void
+vect_mark_slp_stmts (slp_tree node)
+{
+  hash_set<slp_tree> visited;
+  vect_mark_slp_stmts (node, visited);
+}
+
+/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
+
+static void
+vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
+{
+  int i;
+  stmt_vec_info stmt_info;
+  slp_tree child;
+
+  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
+    return;
+
+  if (visited.add (node))
+    return;
+
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
     {
       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
@@ -1560,33 +1717,99 @@ vect_mark_slp_stmts_relevant (slp_tree node)
     }
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_mark_slp_stmts_relevant (child);
+    vect_mark_slp_stmts_relevant (child, visited);
+}
+
+static void
+vect_mark_slp_stmts_relevant (slp_tree node)
+{
+  hash_set<slp_tree> visited;
+  vect_mark_slp_stmts_relevant (node, visited);
 }
 
+/* Copy the SLP subtree rooted at NODE.  */
+
+static slp_tree
+slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
+{
+  unsigned i;
+
+  bool existed_p;
+  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
+  if (existed_p)
+    return copy_ref;
+
+  copy_ref = new _slp_tree;
+  slp_tree copy = copy_ref;
+  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
+  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
+  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
+  copy->max_nunits = node->max_nunits;
+  copy->refcnt = 0;
+  if (SLP_TREE_SCALAR_STMTS (node).exists ())
+    {
+      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
+      stmt_vec_info stmt_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+       STMT_VINFO_NUM_SLP_USES (stmt_info)++;
+    }
+  if (SLP_TREE_SCALAR_OPS (node).exists ())
+    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
+  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
+  if (SLP_TREE_CHILDREN (node).exists ())
+    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
+  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
+
+  slp_tree child;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
+    {
+      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
+      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
+    }
+  return copy;
+}
 
 /* Rearrange the statements of NODE according to PERMUTATION.  */
 
 static void
 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
-                          vec<unsigned> permutation)
+                          vec<unsigned> permutation,
+                         hash_set<slp_tree> &visited)
 {
-  stmt_vec_info stmt_info;
-  vec<stmt_vec_info> tmp_stmts;
   unsigned int i;
   slp_tree child;
 
-  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_slp_rearrange_stmts (child, group_size, permutation);
-
-  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
-  tmp_stmts.create (group_size);
-  tmp_stmts.quick_grow_cleared (group_size);
+  if (visited.add (node))
+    return;
 
-  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
-    tmp_stmts[permutation[i]] = stmt_info;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    vect_slp_rearrange_stmts (child, group_size, permutation, visited);
 
-  SLP_TREE_SCALAR_STMTS (node).release ();
-  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
+  if (SLP_TREE_SCALAR_STMTS (node).exists ())
+    {
+      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
+      vec<stmt_vec_info> tmp_stmts;
+      tmp_stmts.create (group_size);
+      tmp_stmts.quick_grow (group_size);
+      stmt_vec_info stmt_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+       tmp_stmts[permutation[i]] = stmt_info;
+      SLP_TREE_SCALAR_STMTS (node).release ();
+      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
+    }
+  if (SLP_TREE_SCALAR_OPS (node).exists ())
+    {
+      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
+      vec<tree> tmp_ops;
+      tmp_ops.create (group_size);
+      tmp_ops.quick_grow (group_size);
+      tree op;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
+       tmp_ops[permutation[i]] = op;
+      SLP_TREE_SCALAR_OPS (node).release ();
+      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
+    }
 }
 
 
@@ -1597,22 +1820,26 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
 static bool
 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
 {
-  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
   unsigned int i, j;
   unsigned int lidx;
   slp_tree node, load;
 
+  if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
+    return false;
+
   /* Compare all the permutation sequences to the first one.  We know
      that at least one load is permuted.  */
   node = SLP_INSTANCE_LOADS (slp_instn)[0];
-  if (!node->load_permutation.exists ())
+  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
     return false;
+  unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
     {
-      if (!load->load_permutation.exists ())
+      if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
+         || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
        return false;
-      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
-       if (lidx != node->load_permutation[j])
+      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
+       if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
          return false;
     }
 
@@ -1637,8 +1864,21 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
      statements in the nodes is not important unless they are memory
      accesses, we can rearrange the statements in all the nodes
      according to the order of the loads.  */
+
+  /* We have to unshare the SLP tree we modify.  */
+  hash_map<slp_tree, slp_tree> map;
+  slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
+  vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
+  unshared->refcnt++;
+  SLP_INSTANCE_TREE (slp_instn) = unshared;
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
+  node = SLP_INSTANCE_LOADS (slp_instn)[0];
+
+  /* Do the actual re-arrangement.  */
+  hash_set<slp_tree> visited;
   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
-                           node->load_permutation);
+                           node->load_permutation, visited);
 
   /* We are done, no actual permutations need to be generated.  */
   poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
@@ -1660,132 +1900,38 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
   return true;
 }
 
-/* Check if the required load permutations in the SLP instance
-   SLP_INSTN are supported.  */
+/* Gather loads in the SLP graph NODE and populate the INST loads array.  */
 
-static bool
-vect_supported_load_permutation_p (slp_instance slp_instn)
+static void
+vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
+                      hash_set<slp_tree> &visited)
 {
-  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
-  unsigned int i, j, k, next;
-  slp_tree node;
+  if (visited.add (node))
+    return;
 
-  if (dump_enabled_p ())
+  if (SLP_TREE_CHILDREN (node).length () == 0)
     {
-      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-       if (node->load_permutation.exists ())
-         FOR_EACH_VEC_ELT (node->load_permutation, j, next)
-           dump_printf (MSG_NOTE, "%d ", next);
-       else
-         for (k = 0; k < group_size; ++k)
-           dump_printf (MSG_NOTE, "%d ", k);
-      dump_printf (MSG_NOTE, "\n");
+      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
+       return;
+      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+         && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
+       loads.safe_push (node);
     }
-
-  /* In case of reduction every load permutation is allowed, since the order
-     of the reduction statements is not important (as opposed to the case of
-     grouped stores).  The only condition we need to check is that all the
-     load nodes are of the same size and have the same permutation (and then
-     rearrange all the nodes of the SLP instance according to this 
-     permutation).  */
-
-  /* Check that all the load nodes are of the same size.  */
-  /* ???  Can't we assert this? */
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
-      return false;
-
-  node = SLP_INSTANCE_TREE (slp_instn);
-  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-
-  /* Reduction (there are no data-refs in the root).
-     In reduction chain the order of the loads is not important.  */
-  if (!STMT_VINFO_DATA_REF (stmt_info)
-      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
-    vect_attempt_slp_rearrange_stmts (slp_instn);
-
-  /* In basic block vectorization we allow any subchain of an interleaving
-     chain.
-     FORNOW: not supported in loop SLP because of realignment compications.  */
-  if (STMT_VINFO_BB_VINFO (stmt_info))
+  else
     {
-      /* Check whether the loads in an instance form a subchain and thus
-         no permutation is necessary.  */
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-        {
-         if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
-           continue;
-         bool subchain_p = true;
-         stmt_vec_info next_load_info = NULL;
-         stmt_vec_info load_info;
-         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
-           {
-             if (j != 0
-                 && (next_load_info != load_info
-                     || DR_GROUP_GAP (load_info) != 1))
-               {
-                 subchain_p = false;
-                 break;
-               }
-             next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
-           }
-         if (subchain_p)
-           SLP_TREE_LOAD_PERMUTATION (node).release ();
-         else
-           {
-             stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
-             group_info = DR_GROUP_FIRST_ELEMENT (group_info);
-             unsigned HOST_WIDE_INT nunits;
-             unsigned k, maxk = 0;
-             FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
-               if (k > maxk)
-                 maxk = k;
-             /* In BB vectorization we may not actually use a loaded vector
-                accessing elements in excess of DR_GROUP_SIZE.  */
-             tree vectype = STMT_VINFO_VECTYPE (group_info);
-             if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
-               {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                  "BB vectorization with gaps at the end of "
-                                  "a load is not supported\n");
-                 return false;
-               }
-
-             /* Verify the permutation can be generated.  */
-             vec<tree> tem;
-             unsigned n_perms;
-             if (!vect_transform_slp_perm_load (node, tem, NULL,
-                                                1, slp_instn, true, &n_perms))
-               {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                  vect_location,
-                                  "unsupported load permutation\n");
-                 return false;
-               }
-           }
-        }
-      return true;
+      unsigned i;
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+       vect_gather_slp_loads (loads, child, visited);
     }
+}
 
-  /* For loop vectorization verify we can generate the permutation.  Be
-     conservative about the vectorization factor, there are permutations
-     that will use three vector inputs only starting from a specific factor
-     and the vectorization factor is not yet final.
-     ???  The SLP instance unrolling factor might not be the maximum one.  */
-  unsigned n_perms;
-  poly_uint64 test_vf
-    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
-                            LOOP_VINFO_VECT_FACTOR
-                            (STMT_VINFO_LOOP_VINFO (stmt_info)));
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    if (node->load_permutation.exists ()
-       && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
-                                         slp_instn, true, &n_perms))
-      return false;
-
-  return true;
+static void
+vect_gather_slp_loads (slp_instance inst, slp_tree node)
+{
+  hash_set<slp_tree> visited;
+  vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
 }
 
 
@@ -1868,6 +2014,7 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
 
 static bool
 vect_analyze_slp_instance (vec_info *vinfo,
+                          scalar_stmts_to_slp_tree_map_t *bst_map,
                           stmt_vec_info stmt_info, unsigned max_tree_size)
 {
   slp_instance new_instance;
@@ -1875,15 +2022,15 @@ vect_analyze_slp_instance (vec_info *vinfo,
   unsigned int group_size;
   tree vectype, scalar_type = NULL_TREE;
   unsigned int i;
-  vec<slp_tree> loads;
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   vec<stmt_vec_info> scalar_stmts;
+  bool constructor = false;
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       scalar_type = TREE_TYPE (DR_REF (dr));
-      vectype = get_vectype_for_scalar_type (scalar_type);
       group_size = DR_GROUP_SIZE (stmt_info);
+      vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
     }
   else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
     {
@@ -1891,6 +2038,13 @@ vect_analyze_slp_instance (vec_info *vinfo,
       vectype = STMT_VINFO_VECTYPE (stmt_info);
       group_size = REDUC_GROUP_SIZE (stmt_info);
     }
+  else if (is_gimple_assign (stmt_info->stmt)
+           && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
+    {
+      vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
+      group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
+      constructor = true;
+    }
   else
     {
       gcc_assert (is_a <loop_vec_info> (vinfo));
@@ -1901,12 +2055,9 @@ vect_analyze_slp_instance (vec_info *vinfo,
   if (!vectype)
     {
       if (dump_enabled_p ())
-        {
-          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "Build SLP failed: unsupported data-type ");
-          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
-          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-        }
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "Build SLP failed: unsupported data-type %T\n",
+                        scalar_type);
 
       return false;
     }
@@ -1936,7 +2087,34 @@ vect_analyze_slp_instance (vec_info *vinfo,
       /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
-      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
+      STMT_VINFO_DEF_TYPE (stmt_info)
+       = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
+      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
+       = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
+    }
+  else if (constructor)
+    {
+      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
+      tree val;
+      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
+       {
+         if (TREE_CODE (val) == SSA_NAME)
+           {
+             gimple* def = SSA_NAME_DEF_STMT (val);
+             stmt_vec_info def_info = vinfo->lookup_stmt (def);
+             /* Value is defined in another basic block.  */
+             if (!def_info)
+               return false;
+             def_info = vect_stmt_to_vectorize (def_info);
+             scalar_stmts.safe_push (def_info);
+           }
+         else
+           return false;
+       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Analyzing vectorizable constructor: %G\n",
+                        stmt_info->stmt);
     }
   else
     {
@@ -1946,17 +2124,14 @@ vect_analyze_slp_instance (vec_info *vinfo,
        scalar_stmts.safe_push (next_info);
     }
 
-  loads.create (group_size);
-
   /* Build the tree for the SLP instance.  */
   bool *matches = XALLOCAVEC (bool, group_size);
   unsigned npermutes = 0;
-  bst_fail = new scalar_stmts_set_t ();
   poly_uint64 max_nunits = nunits;
+  unsigned tree_size = 0;
   node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
-                             &max_nunits, &loads, matches, &npermutes,
-                             NULL, max_tree_size);
-  delete bst_fail;
+                             &max_nunits, matches, &npermutes,
+                             &tree_size, bst_map);
   if (node != NULL)
     {
       /* Calculate the unrolling factor based on the smallest type.  */
@@ -1976,123 +2151,127 @@ vect_analyze_slp_instance (vec_info *vinfo,
                                 "size not a multiple of the vector size "
                                 "in basic block SLP\n");
              vect_free_slp_tree (node, false);
-             loads.release ();
              return false;
            }
          /* Fatal mismatch.  */
          matches[group_size / const_max_nunits * const_max_nunits] = false;
          vect_free_slp_tree (node, false);
-         loads.release ();
        }
       else
        {
-      /* Create a new SLP instance.  */
-      new_instance = XNEW (struct _slp_instance);
-      SLP_INSTANCE_TREE (new_instance) = node;
-      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
-      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
-      SLP_INSTANCE_LOADS (new_instance) = loads;
-
-      /* Compute the load permutation.  */
-      slp_tree load_node;
-      bool loads_permuted = false;
-      FOR_EACH_VEC_ELT (loads, i, load_node)
-       {
-         vec<unsigned> load_permutation;
-         int j;
-         stmt_vec_info load_info;
-         bool this_load_permuted = false;
-         load_permutation.create (group_size);
-         stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
-           (SLP_TREE_SCALAR_STMTS (load_node)[0]);
-         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+         /* Create a new SLP instance.  */
+         new_instance = XNEW (class _slp_instance);
+         SLP_INSTANCE_TREE (new_instance) = node;
+         SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+         SLP_INSTANCE_LOADS (new_instance) = vNULL;
+         SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
+
+         vect_gather_slp_loads (new_instance, node);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "SLP size %u vs. limit %u.\n",
+                            tree_size, max_tree_size);
+
+         /* Check whether any load is possibly permuted.  */
+         slp_tree load_node;
+         bool loads_permuted = false;
+         FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
            {
-             int load_place = vect_get_place_in_interleaving_chain
-               (load_info, first_stmt_info);
-             gcc_assert (load_place != -1);
-             if (load_place != j)
-               this_load_permuted = true;
-             load_permutation.safe_push (load_place);
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned j;
+             stmt_vec_info load_info;
+             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
            }
-         if (!this_load_permuted
-             /* The load requires permutation when unrolling exposes
-                a gap either because the group is larger than the SLP
-                group-size or because there is a gap between the groups.  */
-             && (known_eq (unrolling_factor, 1U)
-                 || (group_size == DR_GROUP_SIZE (first_stmt_info)
-                     && DR_GROUP_GAP (first_stmt_info) == 0)))
+
+         /* If the loads and stores can be handled with load/store-lane
+            instructions do not generate this SLP instance.  */
+         if (is_a <loop_vec_info> (vinfo)
+             && loads_permuted
+             && dr && vect_store_lanes_supported (vectype, group_size, false))
            {
-             load_permutation.release ();
-             continue;
+             slp_tree load_node;
+             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
+               {
+                 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                     (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+                 /* Use SLP for strided accesses (or if we can't load-lanes).  */
+                 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                     || ! vect_load_lanes_supported
+                     (STMT_VINFO_VECTYPE (stmt_vinfo),
+                      DR_GROUP_SIZE (stmt_vinfo), false))
+                   break;
+               }
+             if (i == SLP_INSTANCE_LOADS (new_instance).length ())
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Built SLP cancelled: can use "
+                                    "load/store-lanes\n");
+                 vect_free_slp_instance (new_instance, false);
+                 return false;
+               }
            }
-         SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
-         loads_permuted = true;
-       }
-
-      if (loads_permuted)
-        {
-          if (!vect_supported_load_permutation_p (new_instance))
-            {
-              if (dump_enabled_p ())
-                {
-                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                  "Build SLP failed: unsupported load "
-                                  "permutation ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
-                                   TDF_SLIM, stmt_info->stmt, 0);
-                }
-             vect_free_slp_instance (new_instance, false);
-              return false;
-            }
-        }
 
-         /* If the loads and stores can be handled with load/store-lan
-        instructions do not generate this SLP instance.  */
-      if (is_a <loop_vec_info> (vinfo)
-         && loads_permuted
-         && dr && vect_store_lanes_supported (vectype, group_size, false))
-       {
-         slp_tree load_node;
-         FOR_EACH_VEC_ELT (loads, i, load_node)
+         /* If this is a reduction chain with a conversion in front
+            amend the SLP tree with a node for that.  */
+         if (!dr
+             && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+             && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
            {
-             stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
-               (SLP_TREE_SCALAR_STMTS (load_node)[0]);
-             /* Use SLP for strided accesses (or if we can't load-lanes).  */
-             if (STMT_VINFO_STRIDED_P (stmt_vinfo)
-                 || ! vect_load_lanes_supported
-                       (STMT_VINFO_VECTYPE (stmt_vinfo),
-                        DR_GROUP_SIZE (stmt_vinfo), false))
-               break;
+             /* Get at the conversion stmt - we know it's the single use
+                of the last stmt of the reduction chain.  */
+             gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
+             use_operand_p use_p;
+             gimple *use_stmt;
+             bool r = single_imm_use (gimple_assign_lhs (tem),
+                                      &use_p, &use_stmt);
+             gcc_assert (r);
+             next_info = vinfo->lookup_stmt (use_stmt);
+             next_info = vect_stmt_to_vectorize (next_info);
+             scalar_stmts = vNULL;
+             scalar_stmts.create (group_size);
+             for (unsigned i = 0; i < group_size; ++i)
+               scalar_stmts.quick_push (next_info);
+             slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
+             SLP_TREE_CHILDREN (conv).quick_push (node);
+             SLP_INSTANCE_TREE (new_instance) = conv;
+             /* We also have to fake this conversion stmt as SLP reduction
+                group so we don't have to mess with too much code
+                elsewhere.  */
+             REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
+             REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
            }
-         if (i == loads.length ())
+
+         vinfo->slp_instances.safe_push (new_instance);
+
+         /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+            the number of scalar stmts in the root in a few places.
+            Verify that assumption holds.  */
+         gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+                       .length () == group_size);
+
+         if (dump_enabled_p ())
            {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "Built SLP cancelled: can use "
-                                "load/store-lanes\n");
-             vect_free_slp_instance (new_instance, false);
-             return false;
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "Final SLP tree for instance:\n");
+             vect_print_slp_graph (MSG_NOTE, vect_location,
+                                   SLP_INSTANCE_TREE (new_instance));
            }
-       }
-
-      vinfo->slp_instances.safe_push (new_instance);
 
-      if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "Final SLP tree for instance:\n");
-         vect_print_slp_tree (MSG_NOTE, vect_location, node);
+         return true;
        }
-
-      return true;
-    }
     }
   else
     {
-  /* Failed to SLP.  */
-  /* Free the allocated memory.  */
-  scalar_stmts.release ();
-  loads.release ();
+      /* Failed to SLP.  */
+      /* Free the allocated memory.  */
+      scalar_stmts.release ();
     }
 
   /* For basic block SLP, try to break the group up into multiples of the
@@ -2116,7 +2295,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
 
          stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
                                                           group1_size);
-         bool res = vect_analyze_slp_instance (vinfo, stmt_info,
+         bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
                                                max_tree_size);
          /* If the first non-match was in the middle of a vector,
             skip the rest of that vector.  */
@@ -2127,7 +2306,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
                rest = vect_split_slp_store_group (rest, const_nunits);
            }
          if (i < group_size)
-           res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
+           res |= vect_analyze_slp_instance (vinfo, bst_map,
+                                             rest, max_tree_size);
          return res;
        }
       /* Even though the first vector did not all match, we might be able to SLP
@@ -2141,7 +2321,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
    trees of packed scalar stmts if SLP is possible.  */
 
-bool
+opt_result
 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 {
   unsigned int i;
@@ -2149,9 +2329,12 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 
   DUMP_VECT_SCOPE ("vect_analyze_slp");
 
+  scalar_stmts_to_slp_tree_map_t *bst_map
+    = new scalar_stmts_to_slp_tree_map_t ();
+
   /* Find SLP sequences starting from groups of grouped stores.  */
   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
-    vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
+    vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
 
   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
     {
@@ -2159,29 +2342,124 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
        {
          /* Find SLP sequences starting from reduction chains.  */
          FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
-           if (! vect_analyze_slp_instance (vinfo, first_element,
+           if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
                                             max_tree_size))
              {
                /* Dissolve reduction chain group.  */
                stmt_vec_info vinfo = first_element;
+               stmt_vec_info last = NULL;
                while (vinfo)
                  {
                    stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
                    REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
                    REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+                   last = vinfo;
                    vinfo = next;
                  }
                STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
+               /* It can be still vectorized as part of an SLP reduction.  */
+               loop_vinfo->reductions.safe_push (last);
              }
        }
 
       /* Find SLP sequences starting from groups of reductions.  */
       if (loop_vinfo->reductions.length () > 1)
-       vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
+       vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
                                   max_tree_size);
     }
 
-  return true;
+  /* The map keeps a reference on SLP nodes built, release that.  */
+  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
+       it != bst_map->end (); ++it)
+    if ((*it).second)
+      vect_free_slp_tree ((*it).second, false);
+  delete bst_map;
+
+  /* Optimize permutations in SLP reductions.  */
+  slp_instance instance;
+  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+    {
+      slp_tree node = SLP_INSTANCE_TREE (instance);
+      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+      /* Reduction (there are no data-refs in the root).
+        In reduction chain the order of the loads is not important.  */
+      if (!STMT_VINFO_DATA_REF (stmt_info)
+         && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+       vect_attempt_slp_rearrange_stmts (instance);
+    }
+
+  /* Gather all loads in the SLP graph.  */
+  hash_set<slp_tree> visited;
+  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+    vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
+                          visited);
+
+  return opt_result::success ();
+}
+
+void
+vect_optimize_slp (vec_info *vinfo)
+{
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
+    {
+      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+       continue;
+
+      /* In basic block vectorization we allow any subchain of an interleaving
+        chain.
+        FORNOW: not in loop SLP because of realignment complications.  */
+      if (is_a <bb_vec_info> (vinfo))
+       {
+         bool subchain_p = true;
+         stmt_vec_info next_load_info = NULL;
+         stmt_vec_info load_info;
+         unsigned j;
+         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+           {
+             if (j != 0
+                 && (next_load_info != load_info
+                     || DR_GROUP_GAP (load_info) != 1))
+               {
+                 subchain_p = false;
+                 break;
+               }
+             next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
+           }
+         if (subchain_p)
+           {
+             SLP_TREE_LOAD_PERMUTATION (node).release ();
+             continue;
+           }
+       }
+      else
+       {
+         stmt_vec_info load_info;
+         bool this_load_permuted = false;
+         unsigned j;
+         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+           if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
+             {
+               this_load_permuted = true;
+               break;
+             }
+         stmt_vec_info first_stmt_info
+           = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+         if (!this_load_permuted
+             /* The load requires permutation when unrolling exposes
+                a gap either because the group is larger than the SLP
+                group-size or because there is a gap between the groups.  */
+             && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
+                 || ((SLP_TREE_SCALAR_STMTS (node).length ()
+                      == DR_GROUP_SIZE (first_stmt_info))
+                     && DR_GROUP_GAP (first_stmt_info) == 0)))
+           {
+             SLP_TREE_LOAD_PERMUTATION (node).release ();
+             continue;
+           }
+       }
+    }
 }
 
 
@@ -2203,8 +2481,11 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       /* FORNOW: SLP if you can.  */
-      /* All unroll factors have the form current_vector_size * X for some
-        rational X, so they must have a common multiple.  */
+      /* All unroll factors have the form:
+
+          GET_MODE_SIZE (vinfo->vector_mode) * X
+
+        for some rational X, so they must have a common multiple.  */
       unrolling_factor
        = force_common_multiple (unrolling_factor,
                                 SLP_INSTANCE_UNROLLING_FACTOR (instance));
@@ -2212,7 +2493,7 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
-      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
+      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
       decided_to_slp++;
     }
 
@@ -2230,138 +2511,62 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
   return (decided_to_slp > 0);
 }
 
-
-/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
-   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
-
-static void
-vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
+/* Private data for vect_detect_hybrid_slp.  */
+struct vdhs_data
 {
-  stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
-  imm_use_iterator imm_iter;
-  gimple *use_stmt;
-  stmt_vec_info use_vinfo;
-  slp_tree child;
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
-  int j;
-
-  /* Propagate hybrid down the SLP tree.  */
-  if (stype == hybrid)
-    ;
-  else if (HYBRID_SLP_STMT (stmt_vinfo))
-    stype = hybrid;
-  else
-    {
-      /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
-      gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
-      /* If we get a pattern stmt here we have to use the LHS of the
-         original stmt for immediate uses.  */
-      gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
-      tree def;
-      if (gimple_code (stmt) == GIMPLE_PHI)
-       def = gimple_phi_result (stmt);
-      else
-       def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
-      if (def)
-       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
-         {
-           use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
-           if (!use_vinfo)
-             continue;
-           use_vinfo = vect_stmt_to_vectorize (use_vinfo);
-           if (!STMT_SLP_TYPE (use_vinfo)
-               && (STMT_VINFO_RELEVANT (use_vinfo)
-                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
-               && !(gimple_code (use_stmt) == GIMPLE_PHI
-                    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
-             {
-               if (dump_enabled_p ())
-                 {
-                   dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
-                                    "def in non-SLP stmt: ");
-                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
-                 }
-               stype = hybrid;
-             }
-         }
-    }
-
-  if (stype == hybrid
-      && !HYBRID_SLP_STMT (stmt_vinfo))
-    {
-      if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
-         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_vinfo->stmt, 0);
-       }
-      STMT_SLP_TYPE (stmt_vinfo) = hybrid;
-    }
-
-  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-    if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
-      vect_detect_hybrid_slp_stmts (child, i, stype);
-}
+  loop_vec_info loop_vinfo;
+  vec<stmt_vec_info> *worklist;
+};
 
-/* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
+/* Walker for walk_gimple_op.  */
 
 static tree
-vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
+vect_detect_hybrid_slp (tree *tp, int *, void *data)
 {
   walk_stmt_info *wi = (walk_stmt_info *)data;
-  loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
+  vdhs_data *dat = (vdhs_data *)wi->info;
 
   if (wi->is_lhs)
     return NULL_TREE;
 
-  stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
-  if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
+  stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
+  if (!def_stmt_info)
+    return NULL_TREE;
+  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
+  if (PURE_SLP_STMT (def_stmt_info))
     {
       if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
-         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt_info->stmt, 0);
-       }
+       dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
+                        def_stmt_info->stmt);
       STMT_SLP_TYPE (def_stmt_info) = hybrid;
+      dat->worklist->safe_push (def_stmt_info);
     }
 
   return NULL_TREE;
 }
 
-static tree
-vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
-                         walk_stmt_info *wi)
-{
-  loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
-  stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
-  /* If the stmt is in a SLP instance then this isn't a reason
-     to mark use definitions in other SLP instances as hybrid.  */
-  if (! STMT_SLP_TYPE (use_vinfo)
-      && (STMT_VINFO_RELEVANT (use_vinfo)
-         || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
-      && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
-           && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
-    ;
-  else
-    *handled = true;
-  return NULL_TREE;
-}
-
 /* Find stmts that must be both vectorized and SLPed.  */
 
 void
 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
 {
-  unsigned int i;
-  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
-  slp_instance instance;
-
   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
 
-  /* First walk all pattern stmt in the loop and mark defs of uses as
-     hybrid because immediate uses in them are not recorded.  */
-  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
+  /* All stmts participating in SLP are marked pure_slp, all other
+     stmts are loop_vect.
+     First collect all loop_vect stmts into a worklist.  */
+  auto_vec<stmt_vec_info> worklist;
+  for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
     {
       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
+      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
+          gsi_next (&gsi))
+       {
+         gphi *phi = gsi.phi ();
+         stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
+         if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
+           worklist.safe_push (stmt_info);
+       }
       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
@@ -2369,28 +2574,40 @@ vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
            {
-             walk_stmt_info wi;
-             memset (&wi, 0, sizeof (wi));
-             wi.info = loop_vinfo;
-             gimple_stmt_iterator gsi2
-               = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
-             walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
-                               vect_detect_hybrid_slp_1, &wi);
-             walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
-                              vect_detect_hybrid_slp_2,
-                              vect_detect_hybrid_slp_1, &wi);
+             for (gimple_stmt_iterator gsi2
+                    = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
+                  !gsi_end_p (gsi2); gsi_next (&gsi2))
+               {
+                 stmt_vec_info patt_info
+                   = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
+                 if (!STMT_SLP_TYPE (patt_info)
+                     && STMT_VINFO_RELEVANT (patt_info))
+                   worklist.safe_push (patt_info);
+               }
+             stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
            }
+         if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
+           worklist.safe_push (stmt_info);
        }
     }
 
-  /* Then walk the SLP instance trees marking stmts with uses in
-     non-SLP stmts as hybrid, also propagating hybrid down the
-     SLP tree, collecting the above info on-the-fly.  */
-  FOR_EACH_VEC_ELT (slp_instances, i, instance)
+  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
+     mark any SLP vectorized stmt as hybrid.
+     ???  We're visiting def stmts N times (once for each non-SLP and
+     once for each hybrid-SLP use).  */
+  walk_stmt_info wi;
+  vdhs_data dat;
+  dat.worklist = &worklist;
+  dat.loop_vinfo = loop_vinfo;
+  memset (&wi, 0, sizeof (wi));
+  wi.info = (void *)&dat;
+  while (!worklist.is_empty ())
     {
-      for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
-       vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
-                                     i, pure_slp);
+      stmt_vec_info stmt_info = worklist.pop ();
+      /* Since SSA operands are not set up for pattern stmts we need
+        to use walk_gimple_op.  */
+      wi.is_lhs = 0;
+      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
     }
 }
 
@@ -2442,36 +2659,9 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
                                    slp_instance node_instance,
                                    stmt_vector_for_cost *cost_vec)
 {
-  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
 
-  /* For BB vectorization vector types are assigned here.
-     Memory accesses already got their vector type assigned
-     in vect_analyze_data_refs.  */
-  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
-  if (bb_vinfo
-      && ! STMT_VINFO_DATA_REF (stmt_info))
-    {
-      tree vectype, nunits_vectype;
-      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
-                                          &nunits_vectype))
-       /* We checked this when building the node.  */
-       gcc_unreachable ();
-      if (vectype == boolean_type_node)
-       {
-         vectype = vect_get_mask_type_for_stmt (stmt_info);
-         if (!vectype)
-           /* vect_get_mask_type_for_stmt has already explained the
-              failure.  */
-           return false;
-       }
-
-      stmt_vec_info sstmt_info;
-      unsigned int i;
-      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
-       STMT_VINFO_VECTYPE (sstmt_info) = vectype;
-    }
-
   /* Calculate the number of vector statements to be created for the
      scalar stmts in this node.  For SLP reductions it is equal to the
      number of vector statements in the children (which has already been
@@ -2480,8 +2670,15 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
      VF divided by the number of elements in a vector.  */
   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
-    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
-      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
+    {
+      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
+       if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
+         {
+           SLP_TREE_NUMBER_OF_VEC_STMTS (node)
+             = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
+           break;
+         }
+    }
   else
     {
       poly_uint64 vf;
@@ -2489,14 +2686,102 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
        vf = loop_vinfo->vectorization_factor;
       else
        vf = 1;
-      unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
+      unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
        = vect_get_num_vectors (vf * group_size, vectype);
     }
 
   bool dummy;
-  return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
+  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
+                           node, node_instance, cost_vec);
+}
+
+/* Try to build NODE from scalars, returning true on success.
+   NODE_INSTANCE is the SLP instance that contains NODE.  */
+
+static bool
+vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
+                             slp_instance node_instance)
+{
+  stmt_vec_info stmt_info;
+  unsigned int i;
+
+  if (!is_a <bb_vec_info> (vinfo)
+      || node == SLP_INSTANCE_TREE (node_instance)
+      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
+    return false;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "Building vector operands from scalars instead\n");
+
+  /* Don't remove and free the child nodes here, since they could be
+     referenced by other structures.  The analysis and scheduling phases
+     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
+  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
+  SLP_TREE_DEF_TYPE (node) = vect_external_def;
+  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+    {
+      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
+      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
+    }
+  return true;
+}
+
+/* Compute the prologue cost for invariant or constant operands represented
+   by NODE.  */
+
+static void
+vect_prologue_cost_for_slp (slp_tree node,
+                           stmt_vector_for_cost *cost_vec)
+{
+  /* Without looking at the actual initializer a vector of
+     constants can be implemented as load from the constant pool.
+     When all elements are the same we can use a splat.  */
+  tree vectype = SLP_TREE_VECTYPE (node);
+  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
+  unsigned num_vects_to_check;
+  unsigned HOST_WIDE_INT const_nunits;
+  unsigned nelt_limit;
+  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
+      && ! multiple_p (const_nunits, group_size))
+    {
+      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
+      nelt_limit = const_nunits;
+    }
+  else
+    {
+      /* If either the vector has variable length or the vectors
+        are composed of repeated whole groups we only need to
+        cost construction once.  All vectors will be the same.  */
+      num_vects_to_check = 1;
+      nelt_limit = group_size;
+    }
+  tree elt = NULL_TREE;
+  unsigned nelt = 0;
+  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
+    {
+      unsigned si = j % group_size;
+      if (nelt == 0)
+       elt = SLP_TREE_SCALAR_OPS (node)[si];
+      /* ???  We're just tracking whether all operands of a single
+        vector initializer are the same, ideally we'd check if
+        we emitted the same one already.  */
+      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
+       elt = NULL_TREE;
+      nelt++;
+      if (nelt == nelt_limit)
+       {
+         record_stmt_cost (cost_vec, 1,
+                           SLP_TREE_DEF_TYPE (node) == vect_external_def
+                           ? (elt ? scalar_to_vec : vec_construct)
+                           : vector_load,
+                           NULL, vectype, 0, vect_prologue);
+         nelt = 0;
+       }
+    }
 }
 
 /* Analyze statements contained in SLP tree NODE after recursively analyzing
@@ -2507,53 +2792,128 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
 static bool
 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
                                  slp_instance node_instance,
-                                 scalar_stmts_to_slp_tree_map_t *visited,
-                                 scalar_stmts_to_slp_tree_map_t *lvisited,
+                                 hash_set<slp_tree> &visited,
+                                 hash_set<slp_tree> &lvisited,
                                  stmt_vector_for_cost *cost_vec)
 {
   int i, j;
   slp_tree child;
 
+  /* Assume we can code-generate all invariants.  */
   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
     return true;
 
   /* If we already analyzed the exact same set of scalar stmts we're done.
-     We share the generated vector stmts for those.  */
-  slp_tree *leader;
-  if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
-      || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
-    {
-      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
-       = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
-      return true;
-    }
-
-  /* The SLP graph is acyclic so not caching whether we failed or succeeded
+     We share the generated vector stmts for those.
+     The SLP graph is acyclic so not caching whether we failed or succeeded
      doesn't result in any issue since we throw away the lvisited set
      when we fail.  */
-  lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
+  if (visited.contains (node)
+      || lvisited.add (node))
+    return true;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
                                           visited, lvisited, cost_vec))
       return false;
 
+  /* ???  We have to catch the case late where two first scalar stmts appear
+     in multiple SLP children with different def type and fail.  Remember
+     original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
+     match it when that is vect_internal_def.  */
+  auto_vec<vect_def_type, 4> dt;
+  dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+    if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
+      dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
+
   /* Push SLP node def-type to stmt operands.  */
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
+    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
+       && SLP_TREE_SCALAR_STMTS (child).length () != 0)
       STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
        = SLP_TREE_DEF_TYPE (child);
-  bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
-                                                cost_vec);
+
+  /* Check everything worked out.  */
+  bool res = true;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+      if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
+       {
+         if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
+           {
+             if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
+                 != SLP_TREE_DEF_TYPE (child))
+               res = false;
+           }
+         else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
+                  != dt[j])
+           res = false;
+       }
+  if (!res && dump_enabled_p ())
+    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                    "not vectorized: same operand with different "
+                    "def type in stmt.\n");
+
+  if (res)
+    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
+                                             cost_vec);
+
   /* Restore def-types.  */
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
-      STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
-       = vect_internal_def;
-  if (! res)
-    return false;
+    if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
+      STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
+
+  /* When the node can be vectorized cost invariant nodes it references.
+     This is not done in DFS order to allow the refering node
+     vectorizable_* calls to nail down the invariant nodes vector type
+     and possibly unshare it if it needs a different vector type than
+     other referrers.  */
+  if (res)
+    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+      if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
+          || SLP_TREE_DEF_TYPE (child) == vect_external_def)
+         /* Perform usual caching, note code-generation still
+            code-gens these nodes multiple times but we expect
+            to CSE them later.  */
+         && !visited.contains (child)
+         && !lvisited.add (child))
+       {
+         /* ???  After auditing more code paths make a "default"
+            and push the vector type from NODE to all children
+            if it is not already set.  */
+         /* Compute the number of vectors to be generated.  */
+         tree vector_type = SLP_TREE_VECTYPE (child);
+         if (!vector_type)
+           {
+             /* For shifts with a scalar argument we don't need
+                to cost or code-generate anything.
+                ???  Represent this more explicitely.  */
+             gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_SCALAR_STMTS (node)[0])
+                          == shift_vec_info_type)
+                         && j == 1);
+             continue;
+           }
+         unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
+         poly_uint64 vf = 1;
+         if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
+           vf = loop_vinfo->vectorization_factor;
+         SLP_TREE_NUMBER_OF_VEC_STMTS (child)
+           = vect_get_num_vectors (vf * group_size, vector_type);
+         /* And cost them.  */
+         vect_prologue_cost_for_slp (child, cost_vec);
+       }
 
-  return true;
+  /* If this node can't be vectorized, try pruning the tree here rather
+     than felling the whole thing.  */
+  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
+    {
+      /* We'll need to revisit this for invariant costing and number
+        of vectorized stmt setting.   */
+      lvisited.remove (node);
+      res = true;
+    }
+
+  return res;
 }
 
 
@@ -2568,39 +2928,43 @@ vect_slp_analyze_operations (vec_info *vinfo)
 
   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
 
-  scalar_stmts_to_slp_tree_map_t *visited
-    = new scalar_stmts_to_slp_tree_map_t ();
+  hash_set<slp_tree> visited;
   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
     {
-      scalar_stmts_to_slp_tree_map_t lvisited;
+      hash_set<slp_tree> lvisited;
       stmt_vector_for_cost cost_vec;
       cost_vec.create (2);
       if (!vect_slp_analyze_node_operations (vinfo,
                                             SLP_INSTANCE_TREE (instance),
-                                            instance, visited, &lvisited,
-                                            &cost_vec))
+                                            instance, visited, lvisited,
+                                            &cost_vec)
+         /* Instances with a root stmt require vectorized defs for the
+            SLP tree root.  */
+         || (SLP_INSTANCE_ROOT_STMT (instance)
+             && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
+                 != vect_internal_def)))
         {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "removing SLP instance operations starting from: ");
-         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "removing SLP instance operations starting from: %G",
+                            stmt_info->stmt);
          vect_free_slp_instance (instance, false);
           vinfo->slp_instances.ordered_remove (i);
          cost_vec.release ();
        }
       else
        {
-         for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
+         for (hash_set<slp_tree>::iterator x = lvisited.begin();
               x != lvisited.end(); ++x)
-           visited->put ((*x).first.copy (), (*x).second);
+           visited.add (*x);
          i++;
 
-         add_stmt_costs (vinfo->target_cost_data, &cost_vec);
+         add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
          cost_vec.release ();
        }
     }
-  delete visited;
 
   return !vinfo->slp_instances.is_empty ();
 }
@@ -2611,18 +2975,21 @@ vect_slp_analyze_operations (vec_info *vinfo)
    update LIFE according to uses of NODE.  */
 
 static void 
-vect_bb_slp_scalar_cost (basic_block bb,
+vect_bb_slp_scalar_cost (vec_info *vinfo,
                         slp_tree node, vec<bool, va_heap> *life,
-                        stmt_vector_for_cost *cost_vec)
+                        stmt_vector_for_cost *cost_vec,
+                        hash_set<slp_tree> &visited)
 {
   unsigned i;
   stmt_vec_info stmt_info;
   slp_tree child;
 
+  if (visited.add (node))
+    return; 
+
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
     {
       gimple *stmt = stmt_info->stmt;
-      vec_info *vinfo = stmt_info->vinfo;
       ssa_op_iter op_iter;
       def_operand_p def_p;
 
@@ -2665,6 +3032,8 @@ vect_bb_slp_scalar_cost (basic_block bb,
           else
            kind = scalar_store;
         }
+      else if (vect_nop_conversion_p (stmt_info))
+       continue;
       else
        kind = scalar_stmt;
       record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
@@ -2678,7 +3047,8 @@ vect_bb_slp_scalar_cost (basic_block bb,
          /* Do not directly pass LIFE to the recursive call, copy it to
             confine changes in the callee to the current child/subtree.  */
          subtree_life.safe_splice (*life);
-         vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec);
+         vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
+                                  visited);
          subtree_life.truncate (0);
        }
     }
@@ -2698,16 +3068,18 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
   /* Calculate scalar cost.  */
   stmt_vector_for_cost scalar_costs;
   scalar_costs.create (0);
+  hash_set<slp_tree> visited;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       auto_vec<bool, 20> life;
-      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
-      vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
+      life.safe_grow_cleared
+       (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)).length ());
+      vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
-                              &life, &scalar_costs);
+                              &life, &scalar_costs, visited);
     }
   void *target_cost_data = init_cost (NULL);
-  add_stmt_costs (target_cost_data, &scalar_costs);
+  add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
   scalar_costs.release ();
   unsigned dummy;
   finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
@@ -2745,17 +3117,45 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
   return true;
 }
 
-/* Check if the basic block can be vectorized.  Returns a bb_vec_info
-   if so and sets fatal to true if failure is independent of
-   current_vector_size.  */
+/* Find any vectorizable constructors and add them to the grouped_store
+   array.  */
 
-static bb_vec_info
-vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
-                      gimple_stmt_iterator region_end,
-                      vec<data_reference_p> datarefs, int n_stmts,
-                      bool &fatal, vec_info_shared *shared)
+static void
+vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
 {
-  bb_vec_info bb_vinfo;
+  gimple_stmt_iterator gsi;
+
+  for (gsi = bb_vinfo->region_begin;
+       gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
+    {
+      gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
+      if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
+       continue;
+
+      tree rhs = gimple_assign_rhs1 (stmt);
+      if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
+         || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
+                      CONSTRUCTOR_NELTS (rhs))
+         || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
+         || uniform_vector_p (rhs))
+       continue;
+
+      stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
+      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
+    }
+}
+
+/* Check if the region described by BB_VINFO can be vectorized, returning
+   true if so.  When returning false, set FATAL to true if the same failure
+   would prevent vectorization at other vector sizes, false if it is still
+   worth trying other sizes.  N_STMTS is the number of statements in the
+   region.  */
+
+static bool
+vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
+{
+  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
+
   slp_instance instance;
   int i;
   poly_uint64 min_vf = 2;
@@ -2763,34 +3163,15 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
   /* The first group of checks is independent of the vector size.  */
   fatal = true;
 
-  if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: too many instructions in "
-                        "basic block.\n");
-      free_data_refs (datarefs);
-      return NULL;
-    }
-
-  bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
-  if (!bb_vinfo)
-    return NULL;
-
-  BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
-  bb_vinfo->shared->save_datarefs ();
-
   /* Analyze the data references.  */
 
-  if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
+  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
@@ -2799,9 +3180,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: not enough data-refs in "
                         "basic block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   if (!vect_analyze_data_ref_accesses (bb_vinfo))
@@ -2810,11 +3189,11 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                        "not vectorized: unhandled data access in "
                        "basic block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
+  vect_slp_check_for_constructors (bb_vinfo);
+
   /* If there are no grouped stores in the region there is no need
      to continue with pattern recog as vect_analyze_slp will fail
      anyway.  */
@@ -2824,9 +3203,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: no grouped stores in "
                         "basic block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   /* While the rest of the analysis below depends on it in some way.  */
@@ -2846,100 +3223,206 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
                           "not vectorized: failed to find SLP opportunities "
                           "in basic block.\n");
        }
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
+  /* Optimize permutations.  */
+  vect_optimize_slp (bb_vinfo);
+
   vect_record_base_alignments (bb_vinfo);
 
   /* Analyze and verify the alignment of data references and the
      dependence in the SLP instances.  */
   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
     {
-      if (! vect_slp_analyze_and_verify_instance_alignment (instance)
-         || ! vect_slp_analyze_instance_dependence (instance))
+      if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
+         || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
+       {
+         slp_tree node = SLP_INSTANCE_TREE (instance);
+         stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "removing SLP instance operations starting from: %G",
+                            stmt_info->stmt);
+         vect_free_slp_instance (instance, false);
+         BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
+         continue;
+       }
+
+      /* Mark all the statements that we want to vectorize as pure SLP and
+        relevant.  */
+      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
+      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
+      if (SLP_INSTANCE_ROOT_STMT (instance))
+       STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
+
+      i++;
+    }
+  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
+    return false;
+
+  if (!vect_slp_analyze_operations (bb_vinfo))
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: bad operation in basic block.\n");
+      return false;
+    }
+
+  /* Cost model: check if the vectorization is worthwhile.  */
+  if (!unlimited_cost_model (NULL)
+      && !vect_bb_vectorization_profitable_p (bb_vinfo))
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: vectorization is not "
+                        "profitable.\n");
+      return false;
+    }
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "Basic block will be vectorized using SLP\n");
+  return true;
+}
+
+/* Subroutine of vect_slp_bb.  Try to vectorize the statements between
+   REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
+   on success.  The region has N_STMTS statements and has the datarefs
+   given by DATAREFS.  */
+
+static bool
+vect_slp_bb_region (gimple_stmt_iterator region_begin,
+                   gimple_stmt_iterator region_end,
+                   vec<data_reference_p> datarefs,
+                   unsigned int n_stmts)
+{
+  bb_vec_info bb_vinfo;
+  auto_vector_modes vector_modes;
+
+  /* Autodetect first vector size we try.  */
+  machine_mode next_vector_mode = VOIDmode;
+  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
+  unsigned int mode_i = 0;
+
+  vec_info_shared shared;
+
+  machine_mode autodetected_vector_mode = VOIDmode;
+  while (1)
+    {
+      bool vectorized = false;
+      bool fatal = false;
+      bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
+
+      bool first_time_p = shared.datarefs.is_empty ();
+      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
+      if (first_time_p)
+       bb_vinfo->shared->save_datarefs ();
+      else
+       bb_vinfo->shared->check_datarefs ();
+      bb_vinfo->vector_mode = next_vector_mode;
+
+      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
+         && dbg_cnt (vect_slp))
+       {
+         if (dump_enabled_p ())
+           {
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "***** Analysis succeeded with vector mode"
+                              " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
+             dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
+           }
+
+         bb_vinfo->shared->check_datarefs ();
+         vect_schedule_slp (bb_vinfo);
+
+         unsigned HOST_WIDE_INT bytes;
+         if (dump_enabled_p ())
+           {
+             if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
+               dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+                                "basic block part vectorized using %wu byte "
+                                "vectors\n", bytes);
+             else
+               dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+                                "basic block part vectorized using variable "
+                                "length vectors\n");
+           }
+
+         vectorized = true;
+       }
+      else
        {
-         slp_tree node = SLP_INSTANCE_TREE (instance);
-         stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "removing SLP instance operations starting from: ");
-         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
-         vect_free_slp_instance (instance, false);
-         BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
-         continue;
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Analysis failed with vector mode %s\n",
+                            GET_MODE_NAME (bb_vinfo->vector_mode));
        }
 
-      /* Mark all the statements that we want to vectorize as pure SLP and
-        relevant.  */
-      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
-      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
+      if (mode_i == 0)
+       autodetected_vector_mode = bb_vinfo->vector_mode;
+
+      if (!fatal)
+       while (mode_i < vector_modes.length ()
+              && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "***** The result for vector mode %s would"
+                              " be the same\n",
+                              GET_MODE_NAME (vector_modes[mode_i]));
+           mode_i += 1;
+         }
 
-      i++;
-    }
-  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
-    {
       delete bb_vinfo;
-      return NULL;
-    }
 
-  if (!vect_slp_analyze_operations (bb_vinfo))
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: bad operation in basic block.\n");
+      if (mode_i < vector_modes.length ()
+         && VECTOR_MODE_P (autodetected_vector_mode)
+         && (related_vector_mode (vector_modes[mode_i],
+                                  GET_MODE_INNER (autodetected_vector_mode))
+             == autodetected_vector_mode)
+         && (related_vector_mode (autodetected_vector_mode,
+                                  GET_MODE_INNER (vector_modes[mode_i]))
+             == vector_modes[mode_i]))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Skipping vector mode %s, which would"
+                            " repeat the analysis for %s\n",
+                            GET_MODE_NAME (vector_modes[mode_i]),
+                            GET_MODE_NAME (autodetected_vector_mode));
+         mode_i += 1;
+       }
 
-      delete bb_vinfo;
-      return NULL;
-    }
+      if (vectorized
+         || mode_i == vector_modes.length ()
+         || autodetected_vector_mode == VOIDmode
+         /* If vect_slp_analyze_bb_1 signaled that analysis for all
+            vector sizes will fail do not bother iterating.  */
+         || fatal)
+       return vectorized;
 
-  /* Cost model: check if the vectorization is worthwhile.  */
-  if (!unlimited_cost_model (NULL)
-      && !vect_bb_vectorization_profitable_p (bb_vinfo))
-    {
+      /* Try the next biggest vector size.  */
+      next_vector_mode = vector_modes[mode_i++];
       if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: vectorization is not "
-                        "profitable.\n");
-
-      delete bb_vinfo;
-      return NULL;
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "***** Re-trying analysis with vector mode %s\n",
+                        GET_MODE_NAME (next_vector_mode));
     }
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                    "Basic block will be vectorized using SLP\n");
-
-  return bb_vinfo;
 }
 
-
 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
    true if anything in the basic-block was vectorized.  */
 
 bool
 vect_slp_bb (basic_block bb)
 {
-  bb_vec_info bb_vinfo;
   gimple_stmt_iterator gsi;
   bool any_vectorized = false;
-  auto_vector_sizes vector_sizes;
-
-  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
-
-  /* Autodetect first vector size we try.  */
-  current_vector_size = 0;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
-  unsigned int next_size = 0;
-
-  gsi = gsi_start_bb (bb);
 
-  poly_uint64 autodetected_vector_size = 0;
-  while (1)
+  gsi = gsi_after_labels (bb);
+  while (!gsi_end_p (gsi))
     {
-      if (gsi_end_p (gsi))
-       break;
-
       gimple_stmt_iterator region_begin = gsi;
       vec<data_reference_p> datarefs = vNULL;
       int insns = 0;
@@ -2967,130 +3450,27 @@ vect_slp_bb (basic_block bb)
 
       gimple_stmt_iterator region_end = gsi;
 
-      bool vectorized = false;
-      bool fatal = false;
-      vec_info_shared shared;
-      bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
-                                       datarefs, insns, fatal, &shared);
-      if (bb_vinfo
-         && dbg_cnt (vect_slp))
+      if (insns > param_slp_max_insns_in_bb)
        {
          if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
-
-         bb_vinfo->shared->check_datarefs ();
-         vect_schedule_slp (bb_vinfo);
-
-         unsigned HOST_WIDE_INT bytes;
-         if (current_vector_size.is_constant (&bytes))
-           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
-                            "basic block part vectorized using "
-                            HOST_WIDE_INT_PRINT_UNSIGNED " byte "
-                            "vectors\n", bytes);
-         else
-           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
-                            "basic block part vectorized using variable "
-                            "length vectors\n");
-
-         vectorized = true;
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: too many instructions in "
+                            "basic block.\n");
        }
-      delete bb_vinfo;
-
-      any_vectorized |= vectorized;
-
-      if (next_size == 0)
-       autodetected_vector_size = current_vector_size;
-
-      if (next_size < vector_sizes.length ()
-         && known_eq (vector_sizes[next_size], autodetected_vector_size))
-       next_size += 1;
-
-      if (vectorized
-         || next_size == vector_sizes.length ()
-         || known_eq (current_vector_size, 0U)
-         /* If vect_slp_analyze_bb_1 signaled that analysis for all
-            vector sizes will fail do not bother iterating.  */
-         || fatal)
-       {
-         if (gsi_end_p (region_end))
-           break;
-
-         /* Skip the unhandled stmt.  */
-         gsi_next (&gsi);
+      else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
+       any_vectorized = true;
 
-         /* And reset vector sizes.  */
-         current_vector_size = 0;
-         next_size = 0;
-       }
-      else
-       {
-         /* Try the next biggest vector size.  */
-         current_vector_size = vector_sizes[next_size++];
-         if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_NOTE, vect_location,
-                              "***** Re-trying analysis with "
-                              "vector size ");
-             dump_dec (MSG_NOTE, current_vector_size);
-             dump_printf (MSG_NOTE, "\n");
-           }
+      if (gsi_end_p (region_end))
+       break;
 
-         /* Start over.  */
-         gsi = region_begin;
-       }
+      /* Skip the unhandled stmt.  */
+      gsi_next (&gsi);
     }
 
   return any_vectorized;
 }
 
 
-/* Return 1 if vector type of boolean constant which is OPNUM
-   operand in statement STMT_VINFO is a boolean vector.  */
-
-static bool
-vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo, int opnum)
-{
-  enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
-  tree op, vectype;
-  enum vect_def_type dt;
-
-  /* For comparison and COND_EXPR type is chosen depending
-     on the other comparison operand.  */
-  if (TREE_CODE_CLASS (code) == tcc_comparison)
-    {
-      gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
-      if (opnum)
-       op = gimple_assign_rhs1 (stmt);
-      else
-       op = gimple_assign_rhs2 (stmt);
-
-      if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
-       gcc_unreachable ();
-
-      return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
-    }
-
-  if (code == COND_EXPR)
-    {
-      gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
-      tree cond = gimple_assign_rhs1 (stmt);
-
-      if (TREE_CODE (cond) == SSA_NAME)
-       op = cond;
-      else if (opnum)
-       op = TREE_OPERAND (cond, 1);
-      else
-       op = TREE_OPERAND (cond, 0);
-
-      if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
-       gcc_unreachable ();
-
-      return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
-    }
-
-  return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
-}
-
 /* Build a variable-length vector in which the elements in ELTS are repeated
    to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
    RESULTS and add any new instructions to SEQ.
@@ -3114,8 +3494,9 @@ vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo, int opnum)
    to cut down on the number of interleaves.  */
 
 void
-duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
-                         unsigned int nresults, vec<tree> &results)
+duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
+                         vec<tree> elts, unsigned int nresults,
+                         vec<tree> &results)
 {
   unsigned int nelts = elts.length ();
   tree element_type = TREE_TYPE (vector_type);
@@ -3124,7 +3505,7 @@ duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
   unsigned int nvectors = 1;
   tree new_vector_type;
   tree permutes[2];
-  if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
+  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
                                       &nvectors, &new_vector_type,
                                       permutes))
     gcc_unreachable ();
@@ -3204,54 +3585,31 @@ duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
 }
 
 
-/* For constant and loop invariant defs of SLP_NODE this function returns
-   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
-   OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
-   scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
-   REDUC_INDEX is the index of the reduction operand in the statements, unless
-   it is -1.  */
+/* For constant and loop invariant defs in OP_NODE this function creates
+   vector defs that will be used in the vectorized stmts and stores them
+   to SLP_TREE_VEC_DEFS of OP_NODE.  */
 
 static void
-vect_get_constant_vectors (tree op, slp_tree slp_node,
-                           vec<tree> *vec_oprnds,
-                          unsigned int op_num, unsigned int number_of_vectors)
+vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
 {
-  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
-  stmt_vec_info stmt_vinfo = stmts[0];
-  gimple *stmt = stmt_vinfo->stmt;
   unsigned HOST_WIDE_INT nunits;
   tree vec_cst;
   unsigned j, number_of_places_left_in_vector;
   tree vector_type;
   tree vop;
-  int group_size = stmts.length ();
+  int group_size = op_node->ops.length ();
   unsigned int vec_num, i;
   unsigned number_of_copies = 1;
-  vec<tree> voprnds;
-  voprnds.create (number_of_vectors);
-  bool constant_p, is_store;
-  tree neutral_op = NULL;
-  enum tree_code code = gimple_expr_code (stmt);
+  bool constant_p;
   gimple_seq ctor_seq = NULL;
   auto_vec<tree, 16> permute_results;
 
-  /* Check if vector type is a boolean vector.  */
-  if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
-      && vect_mask_constant_operand_p (stmt_vinfo, op_num))
-    vector_type
-      = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
-  else
-    vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
-
-  if (STMT_VINFO_DATA_REF (stmt_vinfo))
-    {
-      is_store = true;
-      op = gimple_assign_rhs1 (stmt);
-    }
-  else
-    is_store = false;
+  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
+  vector_type = SLP_TREE_VECTYPE (op_node);
 
-  gcc_assert (op);
+  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
+  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
+  auto_vec<tree> voprnds (number_of_vectors);
 
   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
      created vectors. It is greater than 1 if unrolling is performed.
@@ -3280,59 +3638,12 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
   constant_p = true;
   tree_vector_builder elts (vector_type, nunits, 1);
   elts.quick_grow (nunits);
-  bool place_after_defs = false;
+  stmt_vec_info insert_after = NULL;
   for (j = 0; j < number_of_copies; j++)
     {
-      for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
+      tree op;
+      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
         {
-         stmt = stmt_vinfo->stmt;
-          if (is_store)
-            op = gimple_assign_rhs1 (stmt);
-          else
-           {
-             switch (code)
-               {
-                 case COND_EXPR:
-                   {
-                     tree cond = gimple_assign_rhs1 (stmt);
-                     if (TREE_CODE (cond) == SSA_NAME)
-                       op = gimple_op (stmt, op_num + 1);
-                     else if (op_num == 0 || op_num == 1)
-                       op = TREE_OPERAND (cond, op_num);
-                     else
-                       {
-                         if (op_num == 2)
-                           op = gimple_assign_rhs2 (stmt);
-                         else
-                           op = gimple_assign_rhs3 (stmt);
-                       }
-                   }
-                   break;
-
-                 case CALL_EXPR:
-                   op = gimple_call_arg (stmt, op_num);
-                   break;
-
-                 case LSHIFT_EXPR:
-                 case RSHIFT_EXPR:
-                 case LROTATE_EXPR:
-                 case RROTATE_EXPR:
-                   op = gimple_op (stmt, op_num + 1);
-                   /* Unlike the other binary operators, shifts/rotates have
-                      the shift count being int, instead of the same type as
-                      the lhs, so make sure the scalar is the right type if
-                      we are dealing with vectors of
-                      long long/long/short/char.  */
-                   if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
-                     op = fold_convert (TREE_TYPE (vector_type), op);
-                   break;
-
-                 default:
-                   op = gimple_op (stmt, op_num + 1);
-                   break;
-               }
-           }
-
           /* Create 'vect_ = {op0,op1,...,opn}'.  */
           number_of_places_left_in_vector--;
          tree orig_op = op;
@@ -3387,12 +3698,20 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;
+         /* For BB vectorization we have to compute an insert location
+            when a def is inside the analyzed region since we cannot
+            simply insert at the BB start in this case.  */
+         stmt_vec_info opdef;
          if (TREE_CODE (orig_op) == SSA_NAME
              && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
-             && STMT_VINFO_BB_VINFO (stmt_vinfo)
-             && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
-                 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
-           place_after_defs = true;
+             && is_a <bb_vec_info> (vinfo)
+             && (opdef = vinfo->lookup_def (orig_op)))
+           {
+             if (!insert_after)
+               insert_after = opdef;
+             else
+               insert_after = get_later_stmt (insert_after, opdef);
+           }
 
           if (number_of_places_left_in_vector == 0)
             {
@@ -3402,33 +3721,33 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
                vec_cst = gimple_build_vector (&ctor_seq, &elts);
              else
                {
-                 if (vec_oprnds->is_empty ())
-                   duplicate_and_interleave (&ctor_seq, vector_type, elts,
-                                             number_of_vectors,
+                 if (permute_results.is_empty ())
+                   duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
+                                             elts, number_of_vectors,
                                              permute_results);
                  vec_cst = permute_results[number_of_vectors - j - 1];
                }
              tree init;
-             gimple_stmt_iterator gsi;
-             if (place_after_defs)
+             if (insert_after)
                {
-                 stmt_vec_info last_stmt_info
-                   = vect_find_last_scalar_stmt_in_slp (slp_node);
-                 gsi = gsi_for_stmt (last_stmt_info->stmt);
-                 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
-                                          &gsi);
+                 gimple_stmt_iterator gsi = gsi_for_stmt (insert_after->stmt);
+                 /* vect_init_vector inserts before.  */
+                 gsi_next (&gsi);
+                 init = vect_init_vector (vinfo, NULL, vec_cst,
+                                          vector_type, &gsi);
                }
              else
-               init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
-                                        NULL);
+               init = vect_init_vector (vinfo, NULL, vec_cst,
+                                        vector_type, NULL);
              if (ctor_seq != NULL)
                {
-                 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
+                 gimple_stmt_iterator gsi
+                   = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
                  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
                  ctor_seq = NULL;
                }
              voprnds.quick_push (init);
-             place_after_defs = false;
+             insert_after = NULL;
               number_of_places_left_in_vector = nunits;
              constant_p = true;
              elts.new_vector (vector_type, nunits, 1);
@@ -3443,32 +3762,17 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
   for (j = vec_num; j != 0; j--)
     {
       vop = voprnds[j - 1];
-      vec_oprnds->quick_push (vop);
+      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
     }
 
-  voprnds.release ();
-
   /* In case that VF is greater than the unrolling factor needed for the SLP
      group of stmts, NUMBER_OF_VECTORS to be created is greater than
      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
      to replicate the vectors.  */
-  while (number_of_vectors > vec_oprnds->length ())
-    {
-      tree neutral_vec = NULL;
-
-      if (neutral_op)
-        {
-          if (!neutral_vec)
-           neutral_vec = build_vector_from_val (vector_type, neutral_op);
-
-          vec_oprnds->quick_push (neutral_vec);
-        }
-      else
-        {
-          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
-            vec_oprnds->quick_push (vop);
-        }
-    }
+  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
+    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
+        i++)
+      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
 }
 
 
@@ -3478,117 +3782,38 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
 static void
 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
 {
-  tree vec_oprnd;
   stmt_vec_info vec_def_stmt_info;
   unsigned int i;
 
   gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
 
   FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
-    {
-      gcc_assert (vec_def_stmt_info);
-      if (gphi *vec_def_phi = dyn_cast <gphi *> (vec_def_stmt_info->stmt))
-       vec_oprnd = gimple_phi_result (vec_def_phi);
-      else
-       vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt);
-      vec_oprnds->quick_push (vec_oprnd);
-    }
+    vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
 }
 
 
-/* Get vectorized definitions for SLP_NODE.
-   If the scalar definitions are loop invariants or constants, collect them and
-   call vect_get_constant_vectors() to create vector stmts.
-   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
-   must be stored in the corresponding child of SLP_NODE, and we call
-   vect_get_slp_vect_defs () to retrieve them.  */
+/* Get N vectorized definitions for SLP_NODE.  */
 
 void
-vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
-                  vec<vec<tree> > *vec_oprnds)
-{
-  int number_of_vects = 0, i;
-  unsigned int child_index = 0;
-  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
-  slp_tree child = NULL;
-  vec<tree> vec_defs;
-  tree oprnd;
-  bool vectorized_defs;
+vect_get_slp_defs (vec_info *,
+                  slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
+{
+  if (n == -1U)
+    n = SLP_TREE_CHILDREN (slp_node).length ();
 
-  stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-  FOR_EACH_VEC_ELT (ops, i, oprnd)
+  for (unsigned i = 0; i < n; ++i)
     {
-      /* For each operand we check if it has vectorized definitions in a child
-        node or we need to create them (for invariants and constants).  We
-        check if the LHS of the first stmt of the next child matches OPRND.
-        If it does, we found the correct child.  Otherwise, we call
-        vect_get_constant_vectors (), and not advance CHILD_INDEX in order
-        to check this child node for the next operand.  */
-      vectorized_defs = false;
-      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
-        {
-          child = SLP_TREE_CHILDREN (slp_node)[child_index];
-
-         /* We have to check both pattern and original def, if available.  */
-         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
-           {
-             stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0];
-             stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info);
-             tree first_def_op;
-
-             if (gphi *first_def = dyn_cast <gphi *> (first_def_info->stmt))
-               first_def_op = gimple_phi_result (first_def);
-             else
-               first_def_op = gimple_get_lhs (first_def_info->stmt);
-             if (operand_equal_p (oprnd, first_def_op, 0)
-                 || (related
-                     && operand_equal_p (oprnd,
-                                         gimple_get_lhs (related->stmt), 0)))
-               {
-                 /* The number of vector defs is determined by the number of
-                    vector statements in the node from which we get those
-                    statements.  */
-                 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
-                 vectorized_defs = true;
-                 child_index++;
-               }
-           }
-         else
-           child_index++;
-        }
-
-      if (!vectorized_defs)
-        {
-          if (i == 0)
-            {
-              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-              /* Number of vector stmts was calculated according to LHS in
-                 vect_schedule_slp_instance (), fix it by replacing LHS with
-                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
-                 details.  */
-             vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit,
-                                            &rhs_size_unit);
-              if (rhs_size_unit != lhs_size_unit)
-                {
-                  number_of_vects *= rhs_size_unit;
-                  number_of_vects /= lhs_size_unit;
-                }
-            }
-        }
+      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
 
-      /* Allocate memory for vectorized defs.  */
-      vec_defs = vNULL;
-      vec_defs.create (number_of_vects);
+      vec<tree> vec_defs = vNULL;
 
-      /* For reduction defs we call vect_get_constant_vectors (), since we are
-         looking for initial loop invariant values.  */
-      if (vectorized_defs)
-        /* The defs are already vectorized.  */
+      /* For each operand we check if it has vectorized definitions in a child
+        node or we need to create them (for invariants and constants).  */
+      vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
+      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
        vect_get_slp_vect_defs (child, &vec_defs);
       else
-       /* Build vectors from scalar defs.  */
-       vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
-                                  number_of_vects);
+       vec_defs.splice (SLP_TREE_VEC_DEFS (child));
 
       vec_oprnds->quick_push (vec_defs);
     }
@@ -3596,24 +3821,20 @@ vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
 
 /* Generate vector permute statements from a list of loads in DR_CHAIN.
    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
-   permute statements for the SLP node NODE of the SLP instance
-   SLP_NODE_INSTANCE.  */
+   permute statements for the SLP node NODE.  */
 
 bool
-vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
+vect_transform_slp_perm_load (vec_info *vinfo,
+                             slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
-                             slp_instance slp_node_instance, bool analyze_only,
-                             unsigned *n_perms)
+                             bool analyze_only, unsigned *n_perms)
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-  vec_info *vinfo = stmt_info->vinfo;
-  tree mask_element_type = NULL_TREE, mask_type;
   int vec_index = 0;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   unsigned int mask_element;
   machine_mode mode;
-  unsigned HOST_WIDE_INT nunits, const_vf;
 
   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
     return false;
@@ -3621,22 +3842,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
 
   mode = TYPE_MODE (vectype);
-
-  /* At the moment, all permutations are represented using per-element
-     indices, so we can't cope with variable vector lengths or
-     vectorization factors.  */
-  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-      || !vf.is_constant (&const_vf))
-    return false;
-
-  /* The generic VEC_PERM_EXPR code always uses an integral type of the
-     same size as the vector element being permuted.  */
-  mask_element_type = lang_hooks.types.type_for_mode
-    (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
-  mask_type = get_vectype_for_scalar_type (mask_element_type);
-  vec_perm_builder mask (nunits, nunits, 1);
-  mask.quick_grow (nunits);
-  vec_perm_indices indices;
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
   /* Initialize the vect stmts of NODE to properly insert the generated
      stmts later.  */
@@ -3670,14 +3876,53 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   bool noop_p = true;
   *n_perms = 0;
 
-  for (unsigned int j = 0; j < const_vf; j++)
+  vec_perm_builder mask;
+  unsigned int nelts_to_build;
+  unsigned int nvectors_per_build;
+  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+                     && multiple_p (nunits, group_size));
+  if (repeating_p)
+    {
+      /* A single vector contains a whole number of copies of the node, so:
+        (a) all permutes can use the same mask; and
+        (b) the permutes only need a single vector input.  */
+      mask.new_vector (nunits, group_size, 3);
+      nelts_to_build = mask.encoded_nelts ();
+      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+    }
+  else
+    {
+      /* We need to construct a separate mask for each vector statement.  */
+      unsigned HOST_WIDE_INT const_nunits, const_vf;
+      if (!nunits.is_constant (&const_nunits)
+         || !vf.is_constant (&const_vf))
+       return false;
+      mask.new_vector (const_nunits, const_nunits, 1);
+      nelts_to_build = const_vf * group_size;
+      nvectors_per_build = 1;
+    }
+
+  unsigned int count = mask.encoded_nelts ();
+  mask.quick_grow (count);
+  vec_perm_indices indices;
+
+  for (unsigned int j = 0; j < nelts_to_build; j++)
     {
-      for (int k = 0; k < group_size; k++)
+      unsigned int iter_num = j / group_size;
+      unsigned int stmt_num = j % group_size;
+      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
+                       + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+      if (repeating_p)
+       {
+         first_vec_index = 0;
+         mask_element = i;
+       }
+      else
        {
-         unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
-                           + j * DR_GROUP_SIZE (stmt_info));
-         vec_index = i / nunits;
-         mask_element = i % nunits;
+         /* Enforced before the loop when !repeating_p.  */
+         unsigned int const_nunits = nunits.to_constant ();
+         vec_index = i / const_nunits;
+         mask_element = i % const_nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
@@ -3687,66 +3932,67 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
-             mask_element += nunits;
+             mask_element += const_nunits;
            }
          else
            {
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                  "permutation requires at "
-                                  "least three vectors ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                   stmt_info->stmt, 0);
-               }
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "permutation requires at "
+                                "least three vectors %G",
+                                stmt_info->stmt);
              gcc_assert (analyze_only);
              return false;
            }
 
-         gcc_assert (mask_element < 2 * nunits);
-         if (mask_element != index)
-           noop_p = false;
-         mask[index++] = mask_element;
+         gcc_assert (mask_element < 2 * const_nunits);
+       }
+
+      if (mask_element != index)
+       noop_p = false;
+      mask[index++] = mask_element;
 
-         if (index == nunits && !noop_p)
+      if (index == count && !noop_p)
+       {
+         indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
+         if (!can_vec_perm_const_p (mode, indices))
            {
-             indices.new_vector (mask, 2, nunits);
-             if (!can_vec_perm_const_p (mode, indices))
+             if (dump_enabled_p ())
                {
-                 if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                                  vect_location,
+                                  "unsupported vect permute { ");
+                 for (i = 0; i < count; ++i)
                    {
-                     dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                      vect_location, 
-                                      "unsupported vect permute { ");
-                     for (i = 0; i < nunits; ++i)
-                       {
-                         dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
-                         dump_printf (MSG_MISSED_OPTIMIZATION, " ");
-                       }
-                     dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
+                     dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
+                     dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                    }
-                 gcc_assert (analyze_only);
-                 return false;
+                 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                }
-
-             ++*n_perms;
+             gcc_assert (analyze_only);
+             return false;
            }
 
-         if (index == nunits)
+         ++*n_perms;
+       }
+
+      if (index == count)
+       {
+         if (!analyze_only)
            {
-             if (!analyze_only)
-               {
-                 tree mask_vec = NULL_TREE;
+             tree mask_vec = NULL_TREE;
                  
-                 if (! noop_p)
-                   mask_vec = vec_perm_indices_to_tree (mask_type, indices);
+             if (! noop_p)
+               mask_vec = vect_gen_perm_mask_checked (vectype, indices);
 
-                 if (second_vec_index == -1)
-                   second_vec_index = first_vec_index;
+             if (second_vec_index == -1)
+               second_vec_index = first_vec_index;
 
+             for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
+               {
                  /* Generate the permute statement if necessary.  */
-                 tree first_vec = dr_chain[first_vec_index];
-                 tree second_vec = dr_chain[second_vec_index];
+                 tree first_vec = dr_chain[first_vec_index + ri];
+                 tree second_vec = dr_chain[second_vec_index + ri];
                  stmt_vec_info perm_stmt_info;
                  if (! noop_p)
                    {
@@ -3760,7 +4006,8 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                                               first_vec, second_vec,
                                               mask_vec);
                      perm_stmt_info
-                       = vect_finish_stmt_generation (stmt_info, perm_stmt,
+                       = vect_finish_stmt_generation (vinfo,
+                                                      stmt_info, perm_stmt,
                                                       gsi);
                    }
                  else
@@ -3772,12 +4019,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
                    = perm_stmt_info;
                }
-
-             index = 0;
-             first_vec_index = -1;
-             second_vec_index = -1;
-             noop_p = true;
            }
+
+         index = 0;
+         first_vec_index = -1;
+         second_vec_index = -1;
+         noop_p = true;
        }
     }
 
@@ -3787,30 +4034,36 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 /* Vectorize SLP instance tree in postorder.  */
 
 static void
-vect_schedule_slp_instance (slp_tree node, slp_instance instance,
-                           scalar_stmts_to_slp_tree_map_t *bst_map)
+vect_schedule_slp_instance (vec_info *vinfo,
+                           slp_tree node, slp_instance instance)
 {
   gimple_stmt_iterator si;
-  stmt_vec_info stmt_info;
-  unsigned int group_size;
-  tree vectype;
   int i, j;
   slp_tree child;
 
-  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
+  /* See if we have already vectorized the node in the graph of the
+     SLP instance.  */
+  if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
+       && SLP_TREE_VEC_STMTS (node).exists ())
+      || SLP_TREE_VEC_DEFS (node).exists ())
     return;
 
-  /* See if we have already vectorized the same set of stmts and reuse their
-     vectorized stmts.  */
-  if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
+  /* Vectorize externals and constants.  */
+  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
+      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
     {
-      SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
+      /* ???  vectorizable_shift can end up using a scalar operand which is
+        currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
+        node in this case.  */
+      if (!SLP_TREE_VECTYPE (node))
+       return;
+
+      vect_create_constant_vectors (vinfo, node);
       return;
     }
 
-  bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_schedule_slp_instance (child, instance, bst_map);
+    vect_schedule_slp_instance (vinfo, child, instance);
 
   /* Push SLP node def-type to stmts.  */
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
@@ -3821,42 +4074,29 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
          STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
       }
 
-  stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
 
   /* VECTYPE is the type of the destination.  */
-  vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
-  group_size = SLP_INSTANCE_GROUP_SIZE (instance);
+  unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
 
   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
-  if (!SLP_TREE_VEC_STMTS (node).exists ())
-    SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
+  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
 
   if (dump_enabled_p ())
-    {
-      dump_printf_loc (MSG_NOTE,vect_location,
-                      "------>vectorizing SLP node starting from: ");
-      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
-    }
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "------>vectorizing SLP node starting from: %G",
+                    stmt_info->stmt);
 
   /* Vectorized stmts go before the last scalar stmt which is where
      all uses are ready.  */
   stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
   si = gsi_for_stmt (last_stmt_info->stmt);
 
-  /* Mark the first element of the reduction chain as reduction to properly
-     transform the node.  In the analysis phase only the last element of the
-     chain is marked as reduction.  */
-  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
-      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
-      && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
-    {
-      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
-      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
-    }
-
   /* Handle two-operation SLP nodes by vectorizing the group with
      both operations and then performing a merge.  */
+  bool done_p = false;
   if (SLP_TREE_TWO_OPERATORS (node))
     {
       gassign *stmt = as_a <gassign *> (stmt_info->stmt);
@@ -3881,11 +4121,11 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
          vec<stmt_vec_info> v1;
          unsigned j;
          tree tmask = NULL_TREE;
-         vect_transform_stmt (stmt_info, &si, node, instance);
+         vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
          v0 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          gimple_assign_set_rhs_code (stmt, ocode);
-         vect_transform_stmt (stmt_info, &si, node, instance);
+         vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
          gimple_assign_set_rhs_code (stmt, code0);
          v1 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
@@ -3923,14 +4163,15 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                                           gimple_assign_lhs (v1[j]->stmt),
                                           tmask);
              SLP_TREE_VEC_STMTS (node).quick_push
-               (vect_finish_stmt_generation (stmt_info, vstmt, &si));
+               (vect_finish_stmt_generation (vinfo, stmt_info, vstmt, &si));
            }
          v0.release ();
          v1.release ();
-         return;
+         done_p = true;
        }
     }
-  vect_transform_stmt (stmt_info, &si, node, instance);
+  if (!done_p)
+    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
 
   /* Restore stmt def-types.  */
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
@@ -3948,7 +4189,8 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
    SLP instances may refer to the same scalar stmt.  */
 
 static void
-vect_remove_slp_scalar_calls (slp_tree node)
+vect_remove_slp_scalar_calls (vec_info *vinfo,
+                             slp_tree node, hash_set<slp_tree> &visited)
 {
   gimple *new_stmt;
   gimple_stmt_iterator gsi;
@@ -3960,8 +4202,11 @@ vect_remove_slp_scalar_calls (slp_tree node)
   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
     return;
 
+  if (visited.add (node))
+    return;
+
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_remove_slp_scalar_calls (child);
+    vect_remove_slp_scalar_calls (vinfo, child, visited);
 
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
     {
@@ -3974,11 +4219,68 @@ vect_remove_slp_scalar_calls (slp_tree node)
       lhs = gimple_call_lhs (stmt);
       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
       gsi = gsi_for_stmt (stmt);
-      stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
+      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
     }
 }
 
+static void
+vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
+{
+  hash_set<slp_tree> visited;
+  vect_remove_slp_scalar_calls (vinfo, node, visited);
+}
+
+/* Vectorize the instance root.  */
+
+void
+vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
+{
+  gassign *rstmt = NULL;
+
+  if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
+    {
+      stmt_vec_info child_stmt_info;
+      int j;
+
+      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
+       {
+         tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
+         tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
+         if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
+                                         TREE_TYPE (vect_lhs)))
+           vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
+                              vect_lhs);
+         rstmt = gimple_build_assign (root_lhs, vect_lhs);
+         break;
+       }
+    }
+  else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
+    {
+      int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
+      stmt_vec_info child_stmt_info;
+      int j;
+      vec<constructor_elt, va_gc> *v;
+      vec_alloc (v, nelts);
+
+      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
+       {
+         CONSTRUCTOR_APPEND_ELT (v,
+                                 NULL_TREE,
+                                 gimple_get_lhs (child_stmt_info->stmt));
+       }
+      tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
+      tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
+      tree r_constructor = build_constructor (rtype, v);
+      rstmt = gimple_build_assign (lhs, r_constructor);
+    }
+
+    gcc_assert (rstmt);
+
+    gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
+    gsi_replace (&rgsi, rstmt, true);
+}
+
 /* Generate vector code for all SLP instances in the loop/basic block.  */
 
 void
@@ -3988,19 +4290,20 @@ vect_schedule_slp (vec_info *vinfo)
   slp_instance instance;
   unsigned int i;
 
-  scalar_stmts_to_slp_tree_map_t *bst_map
-    = new scalar_stmts_to_slp_tree_map_t ();
   slp_instances = vinfo->slp_instances;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
+      slp_tree node = SLP_INSTANCE_TREE (instance);
       /* Schedule the tree of INSTANCE.  */
-      vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
-                                 instance, bst_map);
+      vect_schedule_slp_instance (vinfo, node, instance);
+
+      if (SLP_INSTANCE_ROOT_STMT (instance))
+       vectorize_slp_instance_root_stmt (node, instance);
+
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                          "vectorizing stmts using SLP.\n");
     }
-  delete bst_map;
 
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
@@ -4016,14 +4319,16 @@ vect_schedule_slp (vec_info *vinfo)
         stmts starting from the SLP tree root if they have no
         uses.  */
       if (is_a <loop_vec_info> (vinfo))
-       vect_remove_slp_scalar_calls (root);
+       vect_remove_slp_scalar_calls (vinfo, root);
 
-      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
-                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
+      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
         {
          if (!STMT_VINFO_DATA_REF (store_info))
            break;
 
+         if (SLP_INSTANCE_ROOT_STMT (instance))
+           continue;
+
          store_info = vect_orig_stmt (store_info);
          /* Free the attached stmt_vec_info and remove the stmt.  */
          vinfo->remove_stmt (store_info);