]> git.ipfire.org Git - thirdparty/gcc.git/blobdiff - gcc/tree-vect-slp.c
[Ada] Use Standard.Natural on bit references to packed arrays
[thirdparty/gcc.git] / gcc / tree-vect-slp.c
index e32731b6db16b06508bee4c373536fd611d07f4d..836defce9901683b000216f98c809451c7c82c5d 100644 (file)
@@ -1,5 +1,5 @@
 /* SLP - Basic Block Vectorization
-   Copyright (C) 2007-2019 Free Software Foundation, Inc.
+   Copyright (C) 2007-2020 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
    and Ira Rosen <irar@il.ibm.com>
 
@@ -32,7 +32,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "optabs-tree.h"
 #include "insn-config.h"
 #include "recog.h"             /* FIXME: for insn_data */
-#include "params.h"
 #include "fold-const.h"
 #include "stor-layout.h"
 #include "gimple-iterator.h"
@@ -45,8 +44,40 @@ along with GCC; see the file COPYING3.  If not see
 #include "vec-perm-indices.h"
 #include "gimple-fold.h"
 #include "internal-fn.h"
+#include "dump-context.h"
 
 
+/* Initialize a SLP node.  */
+
+_slp_tree::_slp_tree ()
+{
+  SLP_TREE_SCALAR_STMTS (this) = vNULL;
+  SLP_TREE_SCALAR_OPS (this) = vNULL;
+  SLP_TREE_VEC_STMTS (this) = vNULL;
+  SLP_TREE_VEC_DEFS (this) = vNULL;
+  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
+  SLP_TREE_CHILDREN (this) = vNULL;
+  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
+  SLP_TREE_TWO_OPERATORS (this) = false;
+  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
+  SLP_TREE_VECTYPE (this) = NULL_TREE;
+  SLP_TREE_REPRESENTATIVE (this) = NULL;
+  this->refcnt = 1;
+  this->max_nunits = 1;
+}
+
+/* Tear down a SLP node.  */
+
+_slp_tree::~_slp_tree ()
+{
+  SLP_TREE_CHILDREN (this).release ();
+  SLP_TREE_SCALAR_STMTS (this).release ();
+  SLP_TREE_SCALAR_OPS (this).release ();
+  SLP_TREE_VEC_STMTS (this).release ();
+  SLP_TREE_VEC_DEFS (this).release ();
+  SLP_TREE_LOAD_PERMUTATION (this).release ();
+}
+
 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
    FINAL_P is true if we have vectorized the instance or if we have
    made a final decision not to vectorize the statements in any way.  */
@@ -77,13 +108,7 @@ vect_free_slp_tree (slp_tree node, bool final_p)
        }
     }
 
-  SLP_TREE_CHILDREN (node).release ();
-  SLP_TREE_SCALAR_STMTS (node).release ();
-  SLP_TREE_SCALAR_OPS (node).release ();
-  SLP_TREE_VEC_STMTS (node).release ();
-  SLP_TREE_LOAD_PERMUTATION (node).release ();
-
-  free (node);
+  delete node;
 }
 
 /* Free the memory allocated for the SLP instance.  FINAL_P is true if we
@@ -102,38 +127,16 @@ vect_free_slp_instance (slp_instance instance, bool final_p)
 /* Create an SLP node for SCALAR_STMTS.  */
 
 static slp_tree
-vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
+vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
 {
-  slp_tree node;
-  stmt_vec_info stmt_info = scalar_stmts[0];
-  unsigned int nops;
-
-  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
-    nops = gimple_call_num_args (stmt);
-  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
-    {
-      nops = gimple_num_ops (stmt) - 1;
-      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
-       nops++;
-    }
-  else if (is_a <gphi *> (stmt_info->stmt))
-    nops = 0;
-  else
-    return NULL;
-
-  node = XNEW (struct _slp_tree);
+  slp_tree node = new _slp_tree;
   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
-  SLP_TREE_SCALAR_OPS (node) = vNULL;
-  SLP_TREE_VEC_STMTS (node).create (0);
-  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
   SLP_TREE_CHILDREN (node).create (nops);
-  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
-  SLP_TREE_TWO_OPERATORS (node) = false;
   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
-  node->refcnt = 1;
-  node->max_nunits = 1;
+  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
 
   unsigned i;
+  stmt_vec_info stmt_info;
   FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
     STMT_VINFO_NUM_SLP_USES (stmt_info)++;
 
@@ -145,20 +148,9 @@ vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
 static slp_tree
 vect_create_new_slp_node (vec<tree> ops)
 {
-  slp_tree node;
-
-  node = XNEW (struct _slp_tree);
-  SLP_TREE_SCALAR_STMTS (node) = vNULL;
+  slp_tree node = new _slp_tree;
   SLP_TREE_SCALAR_OPS (node) = ops;
-  SLP_TREE_VEC_STMTS (node).create (0);
-  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
-  SLP_TREE_CHILDREN (node) = vNULL;
-  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
-  SLP_TREE_TWO_OPERATORS (node) = false;
   SLP_TREE_DEF_TYPE (node) = vect_external_def;
-  node->refcnt = 1;
-  node->max_nunits = 1;
-
   return node;
 }
 
@@ -225,6 +217,19 @@ vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
 }
 
 
+/* Return true if STMTS contains a pattern statement.  */
+
+static bool
+vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
+{
+  stmt_vec_info stmt_info;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
+    if (is_pattern_stmt_p (stmt_info))
+      return true;
+  return false;
+}
+
 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
    of the chain.  */
@@ -252,7 +257,7 @@ vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
   return -1;
 }
 
-/* Check whether it is possible to load COUNT elements of type ELT_MODE
+/* Check whether it is possible to load COUNT elements of type ELT_TYPE
    using the method implemented by duplicate_and_interleave.  Return true
    if so, returning the number of intermediate vectors in *NVECTORS_OUT
    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
@@ -260,26 +265,37 @@ vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
 
 bool
 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
-                               machine_mode elt_mode,
-                               unsigned int *nvectors_out,
+                               tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
 {
-  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
-  poly_int64 nelts;
+  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
+  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
+    return false;
+
+  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
+  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
   unsigned int nvectors = 1;
   for (;;)
     {
       scalar_int_mode int_mode;
       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
-      if (multiple_p (vinfo->vector_size, elt_bytes, &nelts)
-         && int_mode_for_size (elt_bits, 0).exists (&int_mode))
+      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
+         /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
-         tree vector_type = build_vector_type (int_type, nelts);
-         if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
+         tree vector_type
+           = get_vectype_for_scalar_type (vinfo, int_type, count);
+         if (vector_type
+             && VECTOR_MODE_P (TYPE_MODE (vector_type))
+             && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
+                          GET_MODE_SIZE (base_vector_mode)))
            {
+             /* Try fusing consecutive sequences of COUNT / NVECTORS elements
+                together into elements of type INT_TYPE and using the result
+                to build NVECTORS vectors.  */
+             poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
@@ -419,6 +435,13 @@ again:
 
       if (first)
        {
+         /* For the swapping logic below force vect_reduction_def
+            for the reduction op in a SLP reduction group.  */
+         if (!STMT_VINFO_DATA_REF (stmt_info)
+             && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+             && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
+             && def_stmt_info)
+           dt = vect_reduction_def;
          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
@@ -469,10 +492,10 @@ again:
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
-             && !vinfo->vector_size.is_constant ()
+             && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
-                                                     TYPE_MODE (type))))
+                                                     type)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -532,48 +555,6 @@ again:
   /* Swap operands.  */
   if (swapped)
     {
-      if (first_op_cond)
-       {
-         /* If there are already uses of this stmt in a SLP instance then
-            we've committed to the operand order and can't swap it.  */
-         if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "Build SLP failed: cannot swap operands of "
-                                "shared stmt %G", stmt_info->stmt);
-             return -1;
-           }
-
-         /* To get rid of this swapping we have to move the stmt code
-            to the SLP tree as well (and gather it here per stmt).  */
-         gassign *stmt = as_a <gassign *> (stmt_info->stmt);
-         tree cond = gimple_assign_rhs1 (stmt);
-         enum tree_code code = TREE_CODE (cond);
-
-         /* Swap.  */
-         if (*swap == 1)
-           {
-             swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
-                                &TREE_OPERAND (cond, 1));
-             TREE_SET_CODE (cond, swap_tree_comparison (code));
-           }
-         /* Invert.  */
-         else
-           {
-             swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
-                                gimple_assign_rhs3_ptr (stmt));
-             bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
-             code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
-             gcc_assert (code != ERROR_MARK);
-             TREE_SET_CODE (cond, code);
-           }
-       }
-      else
-       {
-         /* Commutative ops need not reflect swapping, ops are in
-            the SLP tree.  */
-       }
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
@@ -584,6 +565,77 @@ again:
   return 0;
 }
 
+/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
+   Return true if we can, meaning that this choice doesn't conflict with
+   existing SLP nodes that use STMT_INFO.  */
+
+static bool
+vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
+{
+  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
+  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
+    return true;
+
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
+    {
+      /* We maintain the invariant that if any statement in the group is
+        used, all other members of the group have the same vector type.  */
+      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      stmt_vec_info member_info = first_info;
+      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
+       if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
+           || is_pattern_stmt_p (member_info))
+         break;
+
+      if (!member_info)
+       {
+         for (member_info = first_info; member_info;
+              member_info = DR_GROUP_NEXT_ELEMENT (member_info))
+           STMT_VINFO_VECTYPE (member_info) = vectype;
+         return true;
+       }
+    }
+  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
+          && !is_pattern_stmt_p (stmt_info))
+    {
+      STMT_VINFO_VECTYPE (stmt_info) = vectype;
+      return true;
+    }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                      "Build SLP failed: incompatible vector"
+                      " types for: %G", stmt_info->stmt);
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "    old vector type: %T\n", old_vectype);
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "    new vector type: %T\n", vectype);
+    }
+  return false;
+}
+
+/* Try to infer and assign a vector type to all the statements in STMTS.
+   Used only for BB vectorization.  */
+
+static bool
+vect_update_all_shared_vectypes (vec_info *vinfo, vec<stmt_vec_info> stmts)
+{
+  tree vectype, nunits_vectype;
+  if (!vect_get_vector_types_for_stmt (vinfo, stmts[0], &vectype,
+                                      &nunits_vectype, stmts.length ()))
+    return false;
+
+  stmt_vec_info stmt_info;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
+    if (!vect_update_shared_vectype (stmt_info, vectype))
+      return false;
+
+  return true;
+}
+
 /* Return true if call statements CALL1 and CALL2 are similar enough
    to be combined into the same SLP group.  */
 
@@ -627,7 +679,8 @@ compatible_calls_p (gcall *call1, gcall *call2)
    vect_build_slp_tree.  */
 
 static bool
-vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
+vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
+                       unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
 {
   if (!vectype)
@@ -644,7 +697,7 @@ vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
      before adjusting *max_nunits for basic-block vectorization.  */
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   unsigned HOST_WIDE_INT const_nunits;
-  if (STMT_VINFO_BB_VINFO (stmt_info)
+  if (is_a <bb_vec_info> (vinfo)
       && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
     {
@@ -705,7 +758,7 @@ vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
 
 static bool
-vect_build_slp_tree_1 (unsigned char *swap,
+vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
@@ -762,10 +815,10 @@ vect_build_slp_tree_1 (unsigned char *swap,
        }
 
       tree nunits_vectype;
-      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
-                                          &nunits_vectype)
+      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
+                                          &nunits_vectype, group_size)
          || (nunits_vectype
-             && !vect_record_max_nunits (stmt_info, group_size,
+             && !vect_record_max_nunits (vinfo, stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
@@ -775,7 +828,12 @@ vect_build_slp_tree_1 (unsigned char *swap,
 
       gcc_assert (vectype);
 
-      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
+      if (is_a <bb_vec_info> (vinfo)
+         && !vect_update_shared_vectype (stmt_info, vectype))
+       continue;
+
+      gcall *call_stmt = dyn_cast <gcall *> (stmt);
+      if (call_stmt)
        {
          rhs_code = CALL_EXPR;
 
@@ -815,17 +873,6 @@ vect_build_slp_tree_1 (unsigned char *swap,
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
-             if (vectype == boolean_type_node)
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "Build SLP failed: shift of a"
-                                    " boolean.\n");
-                 /* Fatal mismatch.  */
-                 matches[0] = false;
-                 return false;
-               }
-
              vec_mode = TYPE_MODE (vectype);
 
              /* First see if we have a vector/vector shift.  */
@@ -872,6 +919,12 @@ vect_build_slp_tree_1 (unsigned char *swap,
               need_same_oprnds = true;
               first_op1 = gimple_assign_rhs2 (stmt);
             }
+         else if (call_stmt
+                  && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
+           {
+             need_same_oprnds = true;
+             first_op1 = gimple_call_arg (call_stmt, 1);
+           }
        }
       else
        {
@@ -909,15 +962,20 @@ vect_build_slp_tree_1 (unsigned char *swap,
              continue;
            }
 
-         if (need_same_oprnds
-             && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
+         if (need_same_oprnds)
            {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "Build SLP failed: different shift "
-                                "arguments in %G", stmt);
-             /* Mismatch.  */
-             continue;
+             tree other_op1 = (call_stmt
+                               ? gimple_call_arg (call_stmt, 1)
+                               : gimple_assign_rhs2 (stmt));
+             if (!operand_equal_p (first_op1, other_op1, 0))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Build SLP failed: different shift "
+                                    "arguments in %G", stmt);
+                 /* Mismatch.  */
+                 continue;
+               }
            }
 
          if (!load_p && rhs_code == CALL_EXPR)
@@ -1047,9 +1105,8 @@ vect_build_slp_tree_1 (unsigned char *swap,
   if (alt_stmt_code != ERROR_MARK
       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
     {
-      if (vectype == boolean_type_node
-         || !vect_two_operations_perm_ok_p (stmts, group_size,
-                                            vectype, alt_stmt_code))
+      if (!vect_two_operations_perm_ok_p (stmts, group_size,
+                                         vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
@@ -1083,6 +1140,7 @@ struct bst_traits
   static inline bool equal (value_type existing, value_type candidate);
   static inline bool is_empty (value_type x) { return !x.exists (); }
   static inline bool is_deleted (value_type x) { return !x.exists (); }
+  static const bool empty_zero_p = true;
   static inline void mark_empty (value_type &x) { x.release (); }
   static inline void mark_deleted (value_type &x) { x.release (); }
   static inline void remove (value_type &x) { x.release (); }
@@ -1137,7 +1195,8 @@ vect_build_slp_tree (vec_info *vinfo,
       return *leader;
     }
   poly_uint64 this_max_nunits = 1;
-  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
+  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
+                                       &this_max_nunits,
                                        matches, npermutes, tree_size, bst_map);
   if (res)
     {
@@ -1190,7 +1249,8 @@ vect_build_slp_tree_2 (vec_info *vinfo,
     {
       tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
-      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
+      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
+                                  max_nunits))
        return NULL;
 
       vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
@@ -1215,14 +1275,14 @@ vect_build_slp_tree_2 (vec_info *vinfo,
       else
        return NULL;
       (*tree_size)++;
-      node = vect_create_new_slp_node (stmts);
+      node = vect_create_new_slp_node (stmts, 0);
       return node;
     }
 
 
   bool two_operators = false;
   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
-  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
+  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
     return NULL;
 
@@ -1240,7 +1300,24 @@ vect_build_slp_tree_2 (vec_info *vinfo,
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
-         node = vect_create_new_slp_node (stmts);
+         node = vect_create_new_slp_node (stmts, 0);
+         /* And compute the load permutation.  Whether it is actually
+            a permutation depends on the unrolling factor which is
+            decided later.  */
+         vec<unsigned> load_permutation;
+         int j;
+         stmt_vec_info load_info;
+         load_permutation.create (group_size);
+         stmt_vec_info first_stmt_info
+           = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+           {
+             int load_place = vect_get_place_in_interleaving_chain
+                 (load_info, first_stmt_info);
+             gcc_assert (load_place != -1);
+             load_permutation.safe_push (load_place);
+           }
+         SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
          return node;
        }
     }
@@ -1299,10 +1376,11 @@ vect_build_slp_tree_2 (vec_info *vinfo,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
-         /* If we have all children of child built up from scalars then just
-            throw that away and build it up this node from scalars.  */
+         /* If we have all children of a non-unary child built up from
+            scalars then just throw that away and build it up this node
+            from scalars.  */
          if (is_a <bb_vec_info> (vinfo)
-             && !SLP_TREE_CHILDREN (child).is_empty ()
+             && SLP_TREE_CHILDREN (child).length () > 1
              /* ???  Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
@@ -1313,7 +1391,9 @@ vect_build_slp_tree_2 (vec_info *vinfo,
              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                  break;
-             if (!grandchild)
+             if (!grandchild
+                 && vect_update_all_shared_vectypes (vinfo,
+                                                     oprnd_info->def_stmts))
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
@@ -1354,13 +1434,14 @@ vect_build_slp_tree_2 (vec_info *vinfo,
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
-         && !oprnd_info->any_pattern)
+         && !oprnd_info->any_pattern
+         && vect_update_all_shared_vectypes (vinfo, oprnd_info->def_stmts))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          this_tree_size++;
-         child = vect_create_new_slp_node (oprnd_info->def_stmts);
+         child = vect_create_new_slp_node (oprnd_info->def_stmts, 0);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
@@ -1436,10 +1517,11 @@ vect_build_slp_tree_2 (vec_info *vinfo,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
-             /* If we have all children of child built up from scalars then
-                just throw that away and build it up this node from scalars.  */
+             /* If we have all children of a non-unary child built up from
+                scalars then just throw that away and build it up this node
+                from scalars.  */
              if (is_a <bb_vec_info> (vinfo)
-                 && !SLP_TREE_CHILDREN (child).is_empty ()
+                 && SLP_TREE_CHILDREN (child).length () > 1
                  /* ???  Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
@@ -1451,7 +1533,9 @@ vect_build_slp_tree_2 (vec_info *vinfo,
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                      break;
-                 if (!grandchild)
+                 if (!grandchild
+                     && (vect_update_all_shared_vectypes
+                           (vinfo, oprnd_info->def_stmts)))
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
@@ -1494,35 +1578,32 @@ fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
-  node = vect_create_new_slp_node (stmts);
+  node = vect_create_new_slp_node (stmts, nops);
   SLP_TREE_TWO_OPERATORS (node) = two_operators;
   SLP_TREE_CHILDREN (node).splice (children);
   return node;
 }
 
-/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
+/* Dump a single SLP tree NODE.  */
 
 static void
 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
-                    slp_tree node, hash_set<slp_tree> &visited)
+                    slp_tree node)
 {
-  unsigned i;
-  stmt_vec_info stmt_info;
+  unsigned i, j;
   slp_tree child;
+  stmt_vec_info stmt_info;
   tree op;
 
-  if (visited.add (node))
-    return;
-
   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
   dump_user_location_t user_loc = loc.get_user_location ();
-  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
+  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
                   SLP_TREE_DEF_TYPE (node) == vect_external_def
                   ? " (external)"
                   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
                      ? " (constant)"
                      : ""), node,
-                  estimated_poly_value (node->max_nunits));
+                  estimated_poly_value (node->max_nunits), node->refcnt);
   if (SLP_TREE_SCALAR_STMTS (node).exists ())
     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
@@ -1534,22 +1615,54 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
       dump_printf (metadata, "}\n");
     }
+  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+    {
+      dump_printf_loc (metadata, user_loc, "\tload permutation {");
+      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
+       dump_printf (dump_kind, " %u", j);
+      dump_printf (dump_kind, " }\n");
+    }
   if (SLP_TREE_CHILDREN (node).is_empty ())
     return;
   dump_printf_loc (metadata, user_loc, "\tchildren");
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     dump_printf (dump_kind, " %p", (void *)child);
   dump_printf (dump_kind, "\n");
+}
+
+DEBUG_FUNCTION void
+debug (slp_tree node)
+{
+  debug_dump_context ctx;
+  vect_print_slp_tree (MSG_NOTE,
+                      dump_location_t::from_location_t (UNKNOWN_LOCATION),
+                      node);
+}
+
+/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
+
+static void
+vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
+                     slp_tree node, hash_set<slp_tree> &visited)
+{
+  unsigned i;
+  slp_tree child;
+
+  if (visited.add (node))
+    return;
+
+  vect_print_slp_tree (dump_kind, loc, node);
+
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_print_slp_tree (dump_kind, loc, child, visited);
+    vect_print_slp_graph (dump_kind, loc, child, visited);
 }
 
 static void
-vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
-                    slp_tree node)
+vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
+                     slp_tree entry)
 {
   hash_set<slp_tree> visited;
-  vect_print_slp_tree (dump_kind, loc, node, visited);
+  vect_print_slp_graph (dump_kind, loc, entry, visited);
 }
 
 /* Mark the tree rooted at NODE with PURE_SLP.  */
@@ -1614,6 +1727,48 @@ vect_mark_slp_stmts_relevant (slp_tree node)
   vect_mark_slp_stmts_relevant (node, visited);
 }
 
+/* Copy the SLP subtree rooted at NODE.  */
+
+static slp_tree
+slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
+{
+  unsigned i;
+
+  bool existed_p;
+  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
+  if (existed_p)
+    return copy_ref;
+
+  copy_ref = new _slp_tree;
+  slp_tree copy = copy_ref;
+  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
+  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
+  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
+  copy->max_nunits = node->max_nunits;
+  copy->refcnt = 0;
+  if (SLP_TREE_SCALAR_STMTS (node).exists ())
+    {
+      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
+      stmt_vec_info stmt_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+       STMT_VINFO_NUM_SLP_USES (stmt_info)++;
+    }
+  if (SLP_TREE_SCALAR_OPS (node).exists ())
+    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
+  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
+  if (SLP_TREE_CHILDREN (node).exists ())
+    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
+  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
+
+  slp_tree child;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
+    {
+      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
+      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
+    }
+  return copy;
+}
 
 /* Rearrange the statements of NODE according to PERMUTATION.  */
 
@@ -1665,22 +1820,26 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
 static bool
 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
 {
-  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
   unsigned int i, j;
   unsigned int lidx;
   slp_tree node, load;
 
+  if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
+    return false;
+
   /* Compare all the permutation sequences to the first one.  We know
      that at least one load is permuted.  */
   node = SLP_INSTANCE_LOADS (slp_instn)[0];
-  if (!node->load_permutation.exists ())
+  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
     return false;
+  unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
   for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
     {
-      if (!load->load_permutation.exists ())
+      if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
+         || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
        return false;
-      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
-       if (lidx != node->load_permutation[j])
+      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
+       if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
          return false;
     }
 
@@ -1705,6 +1864,18 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
      statements in the nodes is not important unless they are memory
      accesses, we can rearrange the statements in all the nodes
      according to the order of the loads.  */
+
+  /* We have to unshare the SLP tree we modify.  */
+  hash_map<slp_tree, slp_tree> map;
+  slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
+  vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
+  unshared->refcnt++;
+  SLP_INSTANCE_TREE (slp_instn) = unshared;
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
+  node = SLP_INSTANCE_LOADS (slp_instn)[0];
+
+  /* Do the actual re-arrangement.  */
   hash_set<slp_tree> visited;
   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation, visited);
@@ -1732,7 +1903,7 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
 
 static void
-vect_gather_slp_loads (slp_instance inst, slp_tree node,
+vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
                       hash_set<slp_tree> &visited)
 {
   if (visited.add (node))
@@ -1745,14 +1916,14 @@ vect_gather_slp_loads (slp_instance inst, slp_tree node,
       stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
-       SLP_INSTANCE_LOADS (inst).safe_push (node);
+       loads.safe_push (node);
     }
   else
     {
       unsigned i;
       slp_tree child;
       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-       vect_gather_slp_loads (inst, child, visited);
+       vect_gather_slp_loads (loads, child, visited);
     }
 }
 
@@ -1760,137 +1931,7 @@ static void
 vect_gather_slp_loads (slp_instance inst, slp_tree node)
 {
   hash_set<slp_tree> visited;
-  vect_gather_slp_loads (inst, node, visited);
-}
-
-/* Check if the required load permutations in the SLP instance
-   SLP_INSTN are supported.  */
-
-static bool
-vect_supported_load_permutation_p (slp_instance slp_instn)
-{
-  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
-  unsigned int i, j, k, next;
-  slp_tree node;
-
-  if (dump_enabled_p ())
-    {
-      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-       if (node->load_permutation.exists ())
-         FOR_EACH_VEC_ELT (node->load_permutation, j, next)
-           dump_printf (MSG_NOTE, "%d ", next);
-       else
-         for (k = 0; k < group_size; ++k)
-           dump_printf (MSG_NOTE, "%d ", k);
-      dump_printf (MSG_NOTE, "\n");
-    }
-
-  /* In case of reduction every load permutation is allowed, since the order
-     of the reduction statements is not important (as opposed to the case of
-     grouped stores).  The only condition we need to check is that all the
-     load nodes are of the same size and have the same permutation (and then
-     rearrange all the nodes of the SLP instance according to this 
-     permutation).  */
-
-  /* Check that all the load nodes are of the same size.  */
-  /* ???  Can't we assert this? */
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
-      return false;
-
-  node = SLP_INSTANCE_TREE (slp_instn);
-  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-
-  /* Reduction (there are no data-refs in the root).
-     In reduction chain the order of the loads is not important.  */
-  if (!STMT_VINFO_DATA_REF (stmt_info)
-      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
-    vect_attempt_slp_rearrange_stmts (slp_instn);
-
-  /* In basic block vectorization we allow any subchain of an interleaving
-     chain.
-     FORNOW: not supported in loop SLP because of realignment compications.  */
-  if (STMT_VINFO_BB_VINFO (stmt_info))
-    {
-      /* Check whether the loads in an instance form a subchain and thus
-         no permutation is necessary.  */
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-        {
-         if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
-           continue;
-         bool subchain_p = true;
-         stmt_vec_info next_load_info = NULL;
-         stmt_vec_info load_info;
-         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
-           {
-             if (j != 0
-                 && (next_load_info != load_info
-                     || DR_GROUP_GAP (load_info) != 1))
-               {
-                 subchain_p = false;
-                 break;
-               }
-             next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
-           }
-         if (subchain_p)
-           SLP_TREE_LOAD_PERMUTATION (node).release ();
-         else
-           {
-             stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
-             group_info = DR_GROUP_FIRST_ELEMENT (group_info);
-             unsigned HOST_WIDE_INT nunits;
-             unsigned k, maxk = 0;
-             FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
-               if (k > maxk)
-                 maxk = k;
-             /* In BB vectorization we may not actually use a loaded vector
-                accessing elements in excess of DR_GROUP_SIZE.  */
-             tree vectype = STMT_VINFO_VECTYPE (group_info);
-             if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "BB vectorization with gaps at the end of "
-                                    "a load is not supported\n");
-                 return false;
-               }
-
-             /* Verify the permutation can be generated.  */
-             vec<tree> tem;
-             unsigned n_perms;
-             if (!vect_transform_slp_perm_load (node, tem, NULL,
-                                                1, slp_instn, true, &n_perms))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                    vect_location,
-                                    "unsupported load permutation\n");
-                 return false;
-               }
-           }
-        }
-      return true;
-    }
-
-  /* For loop vectorization verify we can generate the permutation.  Be
-     conservative about the vectorization factor, there are permutations
-     that will use three vector inputs only starting from a specific factor
-     and the vectorization factor is not yet final.
-     ???  The SLP instance unrolling factor might not be the maximum one.  */
-  unsigned n_perms;
-  poly_uint64 test_vf
-    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
-                            LOOP_VINFO_VECT_FACTOR
-                            (STMT_VINFO_LOOP_VINFO (stmt_info)));
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    if (node->load_permutation.exists ()
-       && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
-                                         slp_instn, true, &n_perms))
-      return false;
-
-  return true;
+  vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
 }
 
 
@@ -1973,6 +2014,7 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
 
 static bool
 vect_analyze_slp_instance (vec_info *vinfo,
+                          scalar_stmts_to_slp_tree_map_t *bst_map,
                           stmt_vec_info stmt_info, unsigned max_tree_size)
 {
   slp_instance new_instance;
@@ -1982,12 +2024,13 @@ vect_analyze_slp_instance (vec_info *vinfo,
   unsigned int i;
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   vec<stmt_vec_info> scalar_stmts;
+  bool constructor = false;
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       scalar_type = TREE_TYPE (DR_REF (dr));
-      vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
       group_size = DR_GROUP_SIZE (stmt_info);
+      vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
     }
   else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
     {
@@ -1995,6 +2038,13 @@ vect_analyze_slp_instance (vec_info *vinfo,
       vectype = STMT_VINFO_VECTYPE (stmt_info);
       group_size = REDUC_GROUP_SIZE (stmt_info);
     }
+  else if (is_gimple_assign (stmt_info->stmt)
+           && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
+    {
+      vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
+      group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
+      constructor = true;
+    }
   else
     {
       gcc_assert (is_a <loop_vec_info> (vinfo));
@@ -2037,10 +2087,35 @@ vect_analyze_slp_instance (vec_info *vinfo,
       /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
-      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
+      STMT_VINFO_DEF_TYPE (stmt_info)
+       = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
        = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
     }
+  else if (constructor)
+    {
+      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
+      tree val;
+      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
+       {
+         if (TREE_CODE (val) == SSA_NAME)
+           {
+             gimple* def = SSA_NAME_DEF_STMT (val);
+             stmt_vec_info def_info = vinfo->lookup_stmt (def);
+             /* Value is defined in another basic block.  */
+             if (!def_info)
+               return false;
+             def_info = vect_stmt_to_vectorize (def_info);
+             scalar_stmts.safe_push (def_info);
+           }
+         else
+           return false;
+       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Analyzing vectorizable constructor: %G\n",
+                        stmt_info->stmt);
+    }
   else
     {
       /* Collect reduction statements.  */
@@ -2052,19 +2127,11 @@ vect_analyze_slp_instance (vec_info *vinfo,
   /* Build the tree for the SLP instance.  */
   bool *matches = XALLOCAVEC (bool, group_size);
   unsigned npermutes = 0;
-  scalar_stmts_to_slp_tree_map_t *bst_map
-    = new scalar_stmts_to_slp_tree_map_t ();
   poly_uint64 max_nunits = nunits;
   unsigned tree_size = 0;
   node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
                              &max_nunits, matches, &npermutes,
                              &tree_size, bst_map);
-  /* The map keeps a reference on SLP nodes built, release that.  */
-  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
-       it != bst_map->end (); ++it)
-    if ((*it).second)
-      vect_free_slp_tree ((*it).second, false);
-  delete bst_map;
   if (node != NULL)
     {
       /* Calculate the unrolling factor based on the smallest type.  */
@@ -2095,65 +2162,34 @@ vect_analyze_slp_instance (vec_info *vinfo,
          /* Create a new SLP instance.  */
          new_instance = XNEW (class _slp_instance);
          SLP_INSTANCE_TREE (new_instance) = node;
-         SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
          SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
          SLP_INSTANCE_LOADS (new_instance) = vNULL;
+         SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
+
          vect_gather_slp_loads (new_instance, node);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "SLP size %u vs. limit %u.\n",
                             tree_size, max_tree_size);
 
-         /* Compute the load permutation.  */
+         /* Check whether any load is possibly permuted.  */
          slp_tree load_node;
          bool loads_permuted = false;
          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
            {
-             vec<unsigned> load_permutation;
-             int j;
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned j;
              stmt_vec_info load_info;
-             bool this_load_permuted = false;
-             load_permutation.create (group_size);
-             stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
-                 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
              FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
-               {
-                 int load_place = vect_get_place_in_interleaving_chain
-                     (load_info, first_stmt_info);
-                 gcc_assert (load_place != -1);
-                 if (load_place != j)
-                   this_load_permuted = true;
-                 load_permutation.safe_push (load_place);
-               }
-             if (!this_load_permuted
-                 /* The load requires permutation when unrolling exposes
-                    a gap either because the group is larger than the SLP
-                    group-size or because there is a gap between the groups.  */
-                 && (known_eq (unrolling_factor, 1U)
-                     || (group_size == DR_GROUP_SIZE (first_stmt_info)
-                         && DR_GROUP_GAP (first_stmt_info) == 0)))
-               {
-                 load_permutation.release ();
-                 continue;
-               }
-             SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
-             loads_permuted = true;
-           }
-
-         if (loads_permuted)
-           {
-             if (!vect_supported_load_permutation_p (new_instance))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "Build SLP failed: unsupported load "
-                                    "permutation %G", stmt_info->stmt);
-                 vect_free_slp_instance (new_instance, false);
-                 return false;
-               }
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
            }
 
-         /* If the loads and stores can be handled with load/store-lan
+         /* If the loads and stores can be handled with load/store-lane
             instructions do not generate this SLP instance.  */
          if (is_a <loop_vec_info> (vinfo)
              && loads_permuted
@@ -2182,13 +2218,50 @@ vect_analyze_slp_instance (vec_info *vinfo,
                }
            }
 
+         /* If this is a reduction chain with a conversion in front
+            amend the SLP tree with a node for that.  */
+         if (!dr
+             && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+             && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
+           {
+             /* Get at the conversion stmt - we know it's the single use
+                of the last stmt of the reduction chain.  */
+             gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
+             use_operand_p use_p;
+             gimple *use_stmt;
+             bool r = single_imm_use (gimple_assign_lhs (tem),
+                                      &use_p, &use_stmt);
+             gcc_assert (r);
+             next_info = vinfo->lookup_stmt (use_stmt);
+             next_info = vect_stmt_to_vectorize (next_info);
+             scalar_stmts = vNULL;
+             scalar_stmts.create (group_size);
+             for (unsigned i = 0; i < group_size; ++i)
+               scalar_stmts.quick_push (next_info);
+             slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
+             SLP_TREE_CHILDREN (conv).quick_push (node);
+             SLP_INSTANCE_TREE (new_instance) = conv;
+             /* We also have to fake this conversion stmt as SLP reduction
+                group so we don't have to mess with too much code
+                elsewhere.  */
+             REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
+             REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
+           }
+
          vinfo->slp_instances.safe_push (new_instance);
 
+         /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+            the number of scalar stmts in the root in a few places.
+            Verify that assumption holds.  */
+         gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+                       .length () == group_size);
+
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Final SLP tree for instance:\n");
-             vect_print_slp_tree (MSG_NOTE, vect_location, node);
+             vect_print_slp_graph (MSG_NOTE, vect_location,
+                                   SLP_INSTANCE_TREE (new_instance));
            }
 
          return true;
@@ -2222,7 +2295,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
 
          stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
                                                           group1_size);
-         bool res = vect_analyze_slp_instance (vinfo, stmt_info,
+         bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
                                                max_tree_size);
          /* If the first non-match was in the middle of a vector,
             skip the rest of that vector.  */
@@ -2233,7 +2306,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
                rest = vect_split_slp_store_group (rest, const_nunits);
            }
          if (i < group_size)
-           res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
+           res |= vect_analyze_slp_instance (vinfo, bst_map,
+                                             rest, max_tree_size);
          return res;
        }
       /* Even though the first vector did not all match, we might be able to SLP
@@ -2255,9 +2329,12 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 
   DUMP_VECT_SCOPE ("vect_analyze_slp");
 
+  scalar_stmts_to_slp_tree_map_t *bst_map
+    = new scalar_stmts_to_slp_tree_map_t ();
+
   /* Find SLP sequences starting from groups of grouped stores.  */
   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
-    vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
+    vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
 
   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
     {
@@ -2265,7 +2342,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
        {
          /* Find SLP sequences starting from reduction chains.  */
          FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
-           if (! vect_analyze_slp_instance (vinfo, first_element,
+           if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
                                             max_tree_size))
              {
                /* Dissolve reduction chain group.  */
@@ -2287,13 +2364,104 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 
       /* Find SLP sequences starting from groups of reductions.  */
       if (loop_vinfo->reductions.length () > 1)
-       vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
+       vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
                                   max_tree_size);
     }
 
+  /* The map keeps a reference on SLP nodes built, release that.  */
+  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
+       it != bst_map->end (); ++it)
+    if ((*it).second)
+      vect_free_slp_tree ((*it).second, false);
+  delete bst_map;
+
+  /* Optimize permutations in SLP reductions.  */
+  slp_instance instance;
+  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+    {
+      slp_tree node = SLP_INSTANCE_TREE (instance);
+      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+      /* Reduction (there are no data-refs in the root).
+        In reduction chain the order of the loads is not important.  */
+      if (!STMT_VINFO_DATA_REF (stmt_info)
+         && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+       vect_attempt_slp_rearrange_stmts (instance);
+    }
+
+  /* Gather all loads in the SLP graph.  */
+  hash_set<slp_tree> visited;
+  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+    vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
+                          visited);
+
   return opt_result::success ();
 }
 
+void
+vect_optimize_slp (vec_info *vinfo)
+{
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
+    {
+      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+       continue;
+
+      /* In basic block vectorization we allow any subchain of an interleaving
+        chain.
+        FORNOW: not in loop SLP because of realignment complications.  */
+      if (is_a <bb_vec_info> (vinfo))
+       {
+         bool subchain_p = true;
+         stmt_vec_info next_load_info = NULL;
+         stmt_vec_info load_info;
+         unsigned j;
+         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+           {
+             if (j != 0
+                 && (next_load_info != load_info
+                     || DR_GROUP_GAP (load_info) != 1))
+               {
+                 subchain_p = false;
+                 break;
+               }
+             next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
+           }
+         if (subchain_p)
+           {
+             SLP_TREE_LOAD_PERMUTATION (node).release ();
+             continue;
+           }
+       }
+      else
+       {
+         stmt_vec_info load_info;
+         bool this_load_permuted = false;
+         unsigned j;
+         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+           if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
+             {
+               this_load_permuted = true;
+               break;
+             }
+         stmt_vec_info first_stmt_info
+           = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+         if (!this_load_permuted
+             /* The load requires permutation when unrolling exposes
+                a gap either because the group is larger than the SLP
+                group-size or because there is a gap between the groups.  */
+             && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
+                 || ((SLP_TREE_SCALAR_STMTS (node).length ()
+                      == DR_GROUP_SIZE (first_stmt_info))
+                     && DR_GROUP_GAP (first_stmt_info) == 0)))
+           {
+             SLP_TREE_LOAD_PERMUTATION (node).release ();
+             continue;
+           }
+       }
+    }
+}
+
 
 /* For each possible SLP instance decide whether to SLP it and calculate overall
    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
@@ -2313,8 +2481,11 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       /* FORNOW: SLP if you can.  */
-      /* All unroll factors have the form vinfo->vector_size * X for some
-        rational X, so they must have a common multiple.  */
+      /* All unroll factors have the form:
+
+          GET_MODE_SIZE (vinfo->vector_mode) * X
+
+        for some rational X, so they must have a common multiple.  */
       unrolling_factor
        = force_common_multiple (unrolling_factor,
                                 SLP_INSTANCE_UNROLLING_FACTOR (instance));
@@ -2340,145 +2511,62 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
   return (decided_to_slp > 0);
 }
 
-
-/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
-   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
-
-static void
-vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
-                             hash_map<slp_tree, unsigned> &visited)
+/* Private data for vect_detect_hybrid_slp.  */
+struct vdhs_data
 {
-  stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
-  imm_use_iterator imm_iter;
-  gimple *use_stmt;
-  stmt_vec_info use_vinfo;
-  slp_tree child;
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
-  int j;
-
-  /* We need to union stype over the incoming graph edges but we still
-     want to limit recursion to stay O(N+E).  */
-  bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
-
-  /* Propagate hybrid down the SLP tree.  */
-  if (stype == hybrid)
-    ;
-  else if (HYBRID_SLP_STMT (stmt_vinfo))
-    stype = hybrid;
-  else if (!only_edge)
-    {
-      /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
-      gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
-      /* If we get a pattern stmt here we have to use the LHS of the
-         original stmt for immediate uses.  */
-      gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
-      tree def;
-      if (gimple_code (stmt) == GIMPLE_PHI)
-       def = gimple_phi_result (stmt);
-      else
-       def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
-      if (def)
-       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
-         {
-           use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
-           if (!use_vinfo)
-             continue;
-           use_vinfo = vect_stmt_to_vectorize (use_vinfo);
-           if (!STMT_SLP_TYPE (use_vinfo)
-               && (STMT_VINFO_RELEVANT (use_vinfo)
-                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
-               && !(gimple_code (use_stmt) == GIMPLE_PHI
-                    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
-             {
-               if (dump_enabled_p ())
-                 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
-                                  "def in non-SLP stmt: %G", use_stmt);
-               stype = hybrid;
-             }
-         }
-    }
-
-  if (stype == hybrid
-      && !HYBRID_SLP_STMT (stmt_vinfo))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
-                        stmt_vinfo->stmt);
-      STMT_SLP_TYPE (stmt_vinfo) = hybrid;
-    }
-
-  if (!only_edge)
-    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
-      if (SLP_TREE_DEF_TYPE (child) != vect_external_def
-         && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
-       vect_detect_hybrid_slp_stmts (child, i, stype, visited);
-}
-
-static void
-vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
-{
-  hash_map<slp_tree, unsigned> visited;
-  vect_detect_hybrid_slp_stmts (node, i, stype, visited);
-}
+  loop_vec_info loop_vinfo;
+  vec<stmt_vec_info> *worklist;
+};
 
-/* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
+/* Walker for walk_gimple_op.  */
 
 static tree
-vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
+vect_detect_hybrid_slp (tree *tp, int *, void *data)
 {
   walk_stmt_info *wi = (walk_stmt_info *)data;
-  loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
+  vdhs_data *dat = (vdhs_data *)wi->info;
 
   if (wi->is_lhs)
     return NULL_TREE;
 
-  stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
-  if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
+  stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
+  if (!def_stmt_info)
+    return NULL_TREE;
+  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
+  if (PURE_SLP_STMT (def_stmt_info))
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
                         def_stmt_info->stmt);
       STMT_SLP_TYPE (def_stmt_info) = hybrid;
+      dat->worklist->safe_push (def_stmt_info);
     }
 
   return NULL_TREE;
 }
 
-static tree
-vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
-                         walk_stmt_info *wi)
-{
-  loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
-  stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
-  /* If the stmt is in a SLP instance then this isn't a reason
-     to mark use definitions in other SLP instances as hybrid.  */
-  if (! STMT_SLP_TYPE (use_vinfo)
-      && (STMT_VINFO_RELEVANT (use_vinfo)
-         || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
-      && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
-           && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
-    ;
-  else
-    *handled = true;
-  return NULL_TREE;
-}
-
 /* Find stmts that must be both vectorized and SLPed.  */
 
 void
 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
 {
-  unsigned int i;
-  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
-  slp_instance instance;
-
   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
 
-  /* First walk all pattern stmt in the loop and mark defs of uses as
-     hybrid because immediate uses in them are not recorded.  */
-  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
+  /* All stmts participating in SLP are marked pure_slp, all other
+     stmts are loop_vect.
+     First collect all loop_vect stmts into a worklist.  */
+  auto_vec<stmt_vec_info> worklist;
+  for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
     {
       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
+      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
+          gsi_next (&gsi))
+       {
+         gphi *phi = gsi.phi ();
+         stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
+         if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
+           worklist.safe_push (stmt_info);
+       }
       for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
@@ -2486,28 +2574,40 @@ vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
            {
-             walk_stmt_info wi;
-             memset (&wi, 0, sizeof (wi));
-             wi.info = loop_vinfo;
-             gimple_stmt_iterator gsi2
-               = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
-             walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
-                               vect_detect_hybrid_slp_1, &wi);
-             walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
-                              vect_detect_hybrid_slp_2,
-                              vect_detect_hybrid_slp_1, &wi);
+             for (gimple_stmt_iterator gsi2
+                    = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
+                  !gsi_end_p (gsi2); gsi_next (&gsi2))
+               {
+                 stmt_vec_info patt_info
+                   = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
+                 if (!STMT_SLP_TYPE (patt_info)
+                     && STMT_VINFO_RELEVANT (patt_info))
+                   worklist.safe_push (patt_info);
+               }
+             stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
            }
+         if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
+           worklist.safe_push (stmt_info);
        }
     }
 
-  /* Then walk the SLP instance trees marking stmts with uses in
-     non-SLP stmts as hybrid, also propagating hybrid down the
-     SLP tree, collecting the above info on-the-fly.  */
-  FOR_EACH_VEC_ELT (slp_instances, i, instance)
+  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
+     mark any SLP vectorized stmt as hybrid.
+     ???  We're visiting def stmts N times (once for each non-SLP and
+     once for each hybrid-SLP use).  */
+  walk_stmt_info wi;
+  vdhs_data dat;
+  dat.worklist = &worklist;
+  dat.loop_vinfo = loop_vinfo;
+  memset (&wi, 0, sizeof (wi));
+  wi.info = (void *)&dat;
+  while (!worklist.is_empty ())
     {
-      for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
-       vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
-                                     i, pure_slp);
+      stmt_vec_info stmt_info = worklist.pop ();
+      /* Since SSA operands are not set up for pattern stmts we need
+        to use walk_gimple_op.  */
+      wi.is_lhs = 0;
+      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
     }
 }
 
@@ -2559,36 +2659,9 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
                                    slp_instance node_instance,
                                    stmt_vector_for_cost *cost_vec)
 {
-  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
 
-  /* For BB vectorization vector types are assigned here.
-     Memory accesses already got their vector type assigned
-     in vect_analyze_data_refs.  */
-  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
-  if (bb_vinfo
-      && ! STMT_VINFO_DATA_REF (stmt_info))
-    {
-      tree vectype, nunits_vectype;
-      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
-                                          &nunits_vectype))
-       /* We checked this when building the node.  */
-       gcc_unreachable ();
-      if (vectype == boolean_type_node)
-       {
-         vectype = vect_get_mask_type_for_stmt (stmt_info);
-         if (!vectype)
-           /* vect_get_mask_type_for_stmt has already explained the
-              failure.  */
-           return false;
-       }
-
-      stmt_vec_info sstmt_info;
-      unsigned int i;
-      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
-       STMT_VINFO_VECTYPE (sstmt_info) = vectype;
-    }
-
   /* Calculate the number of vector statements to be created for the
      scalar stmts in this node.  For SLP reductions it is equal to the
      number of vector statements in the children (which has already been
@@ -2613,14 +2686,102 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
        vf = loop_vinfo->vectorization_factor;
       else
        vf = 1;
-      unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
+      unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
        = vect_get_num_vectors (vf * group_size, vectype);
     }
 
   bool dummy;
-  return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
+  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
+                           node, node_instance, cost_vec);
+}
+
+/* Try to build NODE from scalars, returning true on success.
+   NODE_INSTANCE is the SLP instance that contains NODE.  */
+
+static bool
+vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
+                             slp_instance node_instance)
+{
+  stmt_vec_info stmt_info;
+  unsigned int i;
+
+  if (!is_a <bb_vec_info> (vinfo)
+      || node == SLP_INSTANCE_TREE (node_instance)
+      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
+    return false;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "Building vector operands from scalars instead\n");
+
+  /* Don't remove and free the child nodes here, since they could be
+     referenced by other structures.  The analysis and scheduling phases
+     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
+  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
+  SLP_TREE_DEF_TYPE (node) = vect_external_def;
+  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+    {
+      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
+      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
+    }
+  return true;
+}
+
+/* Compute the prologue cost for invariant or constant operands represented
+   by NODE.  */
+
+static void
+vect_prologue_cost_for_slp (slp_tree node,
+                           stmt_vector_for_cost *cost_vec)
+{
+  /* Without looking at the actual initializer a vector of
+     constants can be implemented as load from the constant pool.
+     When all elements are the same we can use a splat.  */
+  tree vectype = SLP_TREE_VECTYPE (node);
+  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
+  unsigned num_vects_to_check;
+  unsigned HOST_WIDE_INT const_nunits;
+  unsigned nelt_limit;
+  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
+      && ! multiple_p (const_nunits, group_size))
+    {
+      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
+      nelt_limit = const_nunits;
+    }
+  else
+    {
+      /* If either the vector has variable length or the vectors
+        are composed of repeated whole groups we only need to
+        cost construction once.  All vectors will be the same.  */
+      num_vects_to_check = 1;
+      nelt_limit = group_size;
+    }
+  tree elt = NULL_TREE;
+  unsigned nelt = 0;
+  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
+    {
+      unsigned si = j % group_size;
+      if (nelt == 0)
+       elt = SLP_TREE_SCALAR_OPS (node)[si];
+      /* ???  We're just tracking whether all operands of a single
+        vector initializer are the same, ideally we'd check if
+        we emitted the same one already.  */
+      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
+       elt = NULL_TREE;
+      nelt++;
+      if (nelt == nelt_limit)
+       {
+         record_stmt_cost (cost_vec, 1,
+                           SLP_TREE_DEF_TYPE (node) == vect_external_def
+                           ? (elt ? scalar_to_vec : vec_construct)
+                           : vector_load,
+                           NULL, vectype, 0, vect_prologue);
+         nelt = 0;
+       }
+    }
 }
 
 /* Analyze statements contained in SLP tree NODE after recursively analyzing
@@ -2631,31 +2792,25 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
 static bool
 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
                                  slp_instance node_instance,
-                                 scalar_stmts_to_slp_tree_map_t *visited,
-                                 scalar_stmts_to_slp_tree_map_t *lvisited,
+                                 hash_set<slp_tree> &visited,
+                                 hash_set<slp_tree> &lvisited,
                                  stmt_vector_for_cost *cost_vec)
 {
   int i, j;
   slp_tree child;
 
+  /* Assume we can code-generate all invariants.  */
   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
     return true;
 
   /* If we already analyzed the exact same set of scalar stmts we're done.
-     We share the generated vector stmts for those.  */
-  slp_tree *leader;
-  if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
-      || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
-    {
-      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
-       = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
-      return true;
-    }
-
-  /* The SLP graph is acyclic so not caching whether we failed or succeeded
+     We share the generated vector stmts for those.
+     The SLP graph is acyclic so not caching whether we failed or succeeded
      doesn't result in any issue since we throw away the lvisited set
      when we fail.  */
-  lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
+  if (visited.contains (node)
+      || lvisited.add (node))
+    return true;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
@@ -2708,6 +2863,56 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
     if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
       STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
 
+  /* When the node can be vectorized cost invariant nodes it references.
+     This is not done in DFS order to allow the refering node
+     vectorizable_* calls to nail down the invariant nodes vector type
+     and possibly unshare it if it needs a different vector type than
+     other referrers.  */
+  if (res)
+    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+      if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
+          || SLP_TREE_DEF_TYPE (child) == vect_external_def)
+         /* Perform usual caching, note code-generation still
+            code-gens these nodes multiple times but we expect
+            to CSE them later.  */
+         && !visited.contains (child)
+         && !lvisited.add (child))
+       {
+         /* ???  After auditing more code paths make a "default"
+            and push the vector type from NODE to all children
+            if it is not already set.  */
+         /* Compute the number of vectors to be generated.  */
+         tree vector_type = SLP_TREE_VECTYPE (child);
+         if (!vector_type)
+           {
+             /* For shifts with a scalar argument we don't need
+                to cost or code-generate anything.
+                ???  Represent this more explicitely.  */
+             gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_SCALAR_STMTS (node)[0])
+                          == shift_vec_info_type)
+                         && j == 1);
+             continue;
+           }
+         unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
+         poly_uint64 vf = 1;
+         if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
+           vf = loop_vinfo->vectorization_factor;
+         SLP_TREE_NUMBER_OF_VEC_STMTS (child)
+           = vect_get_num_vectors (vf * group_size, vector_type);
+         /* And cost them.  */
+         vect_prologue_cost_for_slp (child, cost_vec);
+       }
+
+  /* If this node can't be vectorized, try pruning the tree here rather
+     than felling the whole thing.  */
+  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
+    {
+      /* We'll need to revisit this for invariant costing and number
+        of vectorized stmt setting.   */
+      lvisited.remove (node);
+      res = true;
+    }
+
   return res;
 }
 
@@ -2723,17 +2928,21 @@ vect_slp_analyze_operations (vec_info *vinfo)
 
   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
 
-  scalar_stmts_to_slp_tree_map_t *visited
-    = new scalar_stmts_to_slp_tree_map_t ();
+  hash_set<slp_tree> visited;
   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
     {
-      scalar_stmts_to_slp_tree_map_t lvisited;
+      hash_set<slp_tree> lvisited;
       stmt_vector_for_cost cost_vec;
       cost_vec.create (2);
       if (!vect_slp_analyze_node_operations (vinfo,
                                             SLP_INSTANCE_TREE (instance),
-                                            instance, visited, &lvisited,
-                                            &cost_vec))
+                                            instance, visited, lvisited,
+                                            &cost_vec)
+         /* Instances with a root stmt require vectorized defs for the
+            SLP tree root.  */
+         || (SLP_INSTANCE_ROOT_STMT (instance)
+             && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
+                 != vect_internal_def)))
         {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
@@ -2747,16 +2956,15 @@ vect_slp_analyze_operations (vec_info *vinfo)
        }
       else
        {
-         for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
+         for (hash_set<slp_tree>::iterator x = lvisited.begin();
               x != lvisited.end(); ++x)
-           visited->put ((*x).first.copy (), (*x).second);
+           visited.add (*x);
          i++;
 
-         add_stmt_costs (vinfo->target_cost_data, &cost_vec);
+         add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
          cost_vec.release ();
        }
     }
-  delete visited;
 
   return !vinfo->slp_instances.is_empty ();
 }
@@ -2767,7 +2975,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
    update LIFE according to uses of NODE.  */
 
 static void 
-vect_bb_slp_scalar_cost (basic_block bb,
+vect_bb_slp_scalar_cost (vec_info *vinfo,
                         slp_tree node, vec<bool, va_heap> *life,
                         stmt_vector_for_cost *cost_vec,
                         hash_set<slp_tree> &visited)
@@ -2782,7 +2990,6 @@ vect_bb_slp_scalar_cost (basic_block bb,
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
     {
       gimple *stmt = stmt_info->stmt;
-      vec_info *vinfo = stmt_info->vinfo;
       ssa_op_iter op_iter;
       def_operand_p def_p;
 
@@ -2825,6 +3032,8 @@ vect_bb_slp_scalar_cost (basic_block bb,
           else
            kind = scalar_store;
         }
+      else if (vect_nop_conversion_p (stmt_info))
+       continue;
       else
        kind = scalar_stmt;
       record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
@@ -2838,22 +3047,13 @@ vect_bb_slp_scalar_cost (basic_block bb,
          /* Do not directly pass LIFE to the recursive call, copy it to
             confine changes in the callee to the current child/subtree.  */
          subtree_life.safe_splice (*life);
-         vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
+         vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
                                   visited);
          subtree_life.truncate (0);
        }
     }
 }
 
-static void 
-vect_bb_slp_scalar_cost (basic_block bb,
-                        slp_tree node, vec<bool, va_heap> *life,
-                        stmt_vector_for_cost *cost_vec)
-{
-  hash_set<slp_tree> visited;
-  vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
-}
-
 /* Check if vectorization of the basic block is profitable.  */
 
 static bool
@@ -2868,16 +3068,18 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
   /* Calculate scalar cost.  */
   stmt_vector_for_cost scalar_costs;
   scalar_costs.create (0);
+  hash_set<slp_tree> visited;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       auto_vec<bool, 20> life;
-      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
-      vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
+      life.safe_grow_cleared
+       (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)).length ());
+      vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
-                              &life, &scalar_costs);
+                              &life, &scalar_costs, visited);
     }
   void *target_cost_data = init_cost (NULL);
-  add_stmt_costs (target_cost_data, &scalar_costs);
+  add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
   scalar_costs.release ();
   unsigned dummy;
   finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
@@ -2915,6 +3117,34 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
   return true;
 }
 
+/* Find any vectorizable constructors and add them to the grouped_store
+   array.  */
+
+static void
+vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
+{
+  gimple_stmt_iterator gsi;
+
+  for (gsi = bb_vinfo->region_begin;
+       gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
+    {
+      gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
+      if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
+       continue;
+
+      tree rhs = gimple_assign_rhs1 (stmt);
+      if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
+         || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
+                      CONSTRUCTOR_NELTS (rhs))
+         || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
+         || uniform_vector_p (rhs))
+       continue;
+
+      stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
+      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
+    }
+}
+
 /* Check if the region described by BB_VINFO can be vectorized, returning
    true if so.  When returning false, set FATAL to true if the same failure
    would prevent vectorization at other vector sizes, false if it is still
@@ -2962,6 +3192,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
       return false;
     }
 
+  vect_slp_check_for_constructors (bb_vinfo);
+
   /* If there are no grouped stores in the region there is no need
      to continue with pattern recog as vect_analyze_slp will fail
      anyway.  */
@@ -2994,14 +3226,17 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
       return false;
     }
 
+  /* Optimize permutations.  */
+  vect_optimize_slp (bb_vinfo);
+
   vect_record_base_alignments (bb_vinfo);
 
   /* Analyze and verify the alignment of data references and the
      dependence in the SLP instances.  */
   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
     {
-      if (! vect_slp_analyze_and_verify_instance_alignment (instance)
-         || ! vect_slp_analyze_instance_dependence (instance))
+      if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
+         || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
        {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
@@ -3018,6 +3253,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
         relevant.  */
       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
+      if (SLP_INSTANCE_ROOT_STMT (instance))
+       STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
 
       i++;
     }
@@ -3061,16 +3298,16 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
                    unsigned int n_stmts)
 {
   bb_vec_info bb_vinfo;
-  auto_vector_sizes vector_sizes;
+  auto_vector_modes vector_modes;
 
   /* Autodetect first vector size we try.  */
-  poly_uint64 next_vector_size = 0;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
-  unsigned int next_size = 0;
+  machine_mode next_vector_mode = VOIDmode;
+  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
+  unsigned int mode_i = 0;
 
   vec_info_shared shared;
 
-  poly_uint64 autodetected_vector_size = 0;
+  machine_mode autodetected_vector_mode = VOIDmode;
   while (1)
     {
       bool vectorized = false;
@@ -3083,13 +3320,18 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
        bb_vinfo->shared->save_datarefs ();
       else
        bb_vinfo->shared->check_datarefs ();
-      bb_vinfo->vector_size = next_vector_size;
+      bb_vinfo->vector_mode = next_vector_mode;
 
       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
          && dbg_cnt (vect_slp))
        {
          if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
+           {
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "***** Analysis succeeded with vector mode"
+                              " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
+             dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
+           }
 
          bb_vinfo->shared->check_datarefs ();
          vect_schedule_slp (bb_vinfo);
@@ -3097,7 +3339,7 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
          unsigned HOST_WIDE_INT bytes;
          if (dump_enabled_p ())
            {
-             if (bb_vinfo->vector_size.is_constant (&bytes))
+             if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
                dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                                 "basic block part vectorized using %wu byte "
                                 "vectors\n", bytes);
@@ -3109,34 +3351,63 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
 
          vectorized = true;
        }
+      else
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Analysis failed with vector mode %s\n",
+                            GET_MODE_NAME (bb_vinfo->vector_mode));
+       }
 
-      if (next_size == 0)
-       autodetected_vector_size = bb_vinfo->vector_size;
+      if (mode_i == 0)
+       autodetected_vector_mode = bb_vinfo->vector_mode;
+
+      if (!fatal)
+       while (mode_i < vector_modes.length ()
+              && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "***** The result for vector mode %s would"
+                              " be the same\n",
+                              GET_MODE_NAME (vector_modes[mode_i]));
+           mode_i += 1;
+         }
 
       delete bb_vinfo;
 
-      if (next_size < vector_sizes.length ()
-         && known_eq (vector_sizes[next_size], autodetected_vector_size))
-       next_size += 1;
+      if (mode_i < vector_modes.length ()
+         && VECTOR_MODE_P (autodetected_vector_mode)
+         && (related_vector_mode (vector_modes[mode_i],
+                                  GET_MODE_INNER (autodetected_vector_mode))
+             == autodetected_vector_mode)
+         && (related_vector_mode (autodetected_vector_mode,
+                                  GET_MODE_INNER (vector_modes[mode_i]))
+             == vector_modes[mode_i]))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Skipping vector mode %s, which would"
+                            " repeat the analysis for %s\n",
+                            GET_MODE_NAME (vector_modes[mode_i]),
+                            GET_MODE_NAME (autodetected_vector_mode));
+         mode_i += 1;
+       }
 
       if (vectorized
-         || next_size == vector_sizes.length ()
-         || known_eq (autodetected_vector_size, 0U)
+         || mode_i == vector_modes.length ()
+         || autodetected_vector_mode == VOIDmode
          /* If vect_slp_analyze_bb_1 signaled that analysis for all
             vector sizes will fail do not bother iterating.  */
          || fatal)
        return vectorized;
 
       /* Try the next biggest vector size.  */
-      next_vector_size = vector_sizes[next_size++];
+      next_vector_mode = vector_modes[mode_i++];
       if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "***** Re-trying analysis with "
-                          "vector size ");
-         dump_dec (MSG_NOTE, next_vector_size);
-         dump_printf (MSG_NOTE, "\n");
-       }
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "***** Re-trying analysis with vector mode %s\n",
+                        GET_MODE_NAME (next_vector_mode));
     }
 }
 
@@ -3149,7 +3420,7 @@ vect_slp_bb (basic_block bb)
   gimple_stmt_iterator gsi;
   bool any_vectorized = false;
 
-  gsi = gsi_start_bb (bb);
+  gsi = gsi_after_labels (bb);
   while (!gsi_end_p (gsi))
     {
       gimple_stmt_iterator region_begin = gsi;
@@ -3179,7 +3450,7 @@ vect_slp_bb (basic_block bb)
 
       gimple_stmt_iterator region_end = gsi;
 
-      if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
+      if (insns > param_slp_max_insns_in_bb)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -3200,47 +3471,6 @@ vect_slp_bb (basic_block bb)
 }
 
 
-/* Return 1 if vector type STMT_VINFO is a boolean vector.  */
-
-static bool
-vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
-{
-  enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
-  tree op, vectype;
-  enum vect_def_type dt;
-
-  /* For comparison and COND_EXPR type is chosen depending
-     on the non-constant other comparison operand.  */
-  if (TREE_CODE_CLASS (code) == tcc_comparison)
-    {
-      gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
-      op = gimple_assign_rhs1 (stmt);
-
-      if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
-       gcc_unreachable ();
-
-      return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
-    }
-
-  if (code == COND_EXPR)
-    {
-      gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
-      tree cond = gimple_assign_rhs1 (stmt);
-
-      if (TREE_CODE (cond) == SSA_NAME)
-       op = cond;
-      else
-       op = TREE_OPERAND (cond, 0);
-
-      if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
-       gcc_unreachable ();
-
-      return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
-    }
-
-  return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
-}
-
 /* Build a variable-length vector in which the elements in ELTS are repeated
    to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
    RESULTS and add any new instructions to SEQ.
@@ -3275,7 +3505,7 @@ duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
   unsigned int nvectors = 1;
   tree new_vector_type;
   tree permutes[2];
-  if (!can_duplicate_and_interleave_p (vinfo, nelts, TYPE_MODE (element_type),
+  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
                                       &nvectors, &new_vector_type,
                                       permutes))
     gcc_unreachable ();
@@ -3355,17 +3585,13 @@ duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
 }
 
 
-/* For constant and loop invariant defs of SLP_NODE this function returns
-   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
-   OP_NODE determines the node for the operand containing the scalar
-   operands.  */
+/* For constant and loop invariant defs in OP_NODE this function creates
+   vector defs that will be used in the vectorized stmts and stores them
+   to SLP_TREE_VEC_DEFS of OP_NODE.  */
 
 static void
-vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
-                           vec<tree> *vec_oprnds)
+vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
 {
-  stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-  vec_info *vinfo = stmt_vinfo->vinfo;
   unsigned HOST_WIDE_INT nunits;
   tree vec_cst;
   unsigned j, number_of_places_left_in_vector;
@@ -3375,27 +3601,14 @@ vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
   unsigned int vec_num, i;
   unsigned number_of_copies = 1;
   bool constant_p;
-  tree neutral_op = NULL;
   gimple_seq ctor_seq = NULL;
   auto_vec<tree, 16> permute_results;
 
-  /* ???  SLP analysis should compute the vector type for the
-     constant / invariant and store it in the SLP node.  */
-  tree op = op_node->ops[0];
-  /* Check if vector type is a boolean vector.  */
-  tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
-  if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
-      && vect_mask_constant_operand_p (stmt_vinfo))
-    vector_type
-      = build_same_sized_truth_vector_type (stmt_vectype);
-  else
-    vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op));
+  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
+  vector_type = SLP_TREE_VECTYPE (op_node);
 
-  unsigned int number_of_vectors
-    = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
-                           * TYPE_VECTOR_SUBPARTS (stmt_vectype),
-                           vector_type);
-  vec_oprnds->create (number_of_vectors);
+  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
+  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
   auto_vec<tree> voprnds (number_of_vectors);
 
   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
@@ -3425,9 +3638,10 @@ vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
   constant_p = true;
   tree_vector_builder elts (vector_type, nunits, 1);
   elts.quick_grow (nunits);
-  bool place_after_defs = false;
+  stmt_vec_info insert_after = NULL;
   for (j = 0; j < number_of_copies; j++)
     {
+      tree op;
       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
         {
           /* Create 'vect_ = {op0,op1,...,opn}'.  */
@@ -3484,12 +3698,20 @@ vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;
+         /* For BB vectorization we have to compute an insert location
+            when a def is inside the analyzed region since we cannot
+            simply insert at the BB start in this case.  */
+         stmt_vec_info opdef;
          if (TREE_CODE (orig_op) == SSA_NAME
              && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
-             && STMT_VINFO_BB_VINFO (stmt_vinfo)
-             && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
-                 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
-           place_after_defs = true;
+             && is_a <bb_vec_info> (vinfo)
+             && (opdef = vinfo->lookup_def (orig_op)))
+           {
+             if (!insert_after)
+               insert_after = opdef;
+             else
+               insert_after = get_later_stmt (insert_after, opdef);
+           }
 
           if (number_of_places_left_in_vector == 0)
             {
@@ -3506,26 +3728,26 @@ vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
                  vec_cst = permute_results[number_of_vectors - j - 1];
                }
              tree init;
-             gimple_stmt_iterator gsi;
-             if (place_after_defs)
+             if (insert_after)
                {
-                 stmt_vec_info last_stmt_info
-                   = vect_find_last_scalar_stmt_in_slp (slp_node);
-                 gsi = gsi_for_stmt (last_stmt_info->stmt);
-                 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
-                                          &gsi);
+                 gimple_stmt_iterator gsi = gsi_for_stmt (insert_after->stmt);
+                 /* vect_init_vector inserts before.  */
+                 gsi_next (&gsi);
+                 init = vect_init_vector (vinfo, NULL, vec_cst,
+                                          vector_type, &gsi);
                }
              else
-               init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
-                                        NULL);
+               init = vect_init_vector (vinfo, NULL, vec_cst,
+                                        vector_type, NULL);
              if (ctor_seq != NULL)
                {
-                 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
+                 gimple_stmt_iterator gsi
+                   = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
                  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
                  ctor_seq = NULL;
                }
              voprnds.quick_push (init);
-             place_after_defs = false;
+             insert_after = NULL;
               number_of_places_left_in_vector = nunits;
              constant_p = true;
              elts.new_vector (vector_type, nunits, 1);
@@ -3540,30 +3762,17 @@ vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
   for (j = vec_num; j != 0; j--)
     {
       vop = voprnds[j - 1];
-      vec_oprnds->quick_push (vop);
+      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
     }
 
   /* In case that VF is greater than the unrolling factor needed for the SLP
      group of stmts, NUMBER_OF_VECTORS to be created is greater than
      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
      to replicate the vectors.  */
-  while (number_of_vectors > vec_oprnds->length ())
-    {
-      tree neutral_vec = NULL;
-
-      if (neutral_op)
-        {
-          if (!neutral_vec)
-           neutral_vec = build_vector_from_val (vector_type, neutral_op);
-
-          vec_oprnds->quick_push (neutral_vec);
-        }
-      else
-        {
-          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
-            vec_oprnds->quick_push (vop);
-        }
-    }
+  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
+    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
+        i++)
+      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
 }
 
 
@@ -3583,15 +3792,11 @@ vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
 }
 
 
-/* Get N vectorized definitions for SLP_NODE.
-   If the scalar definitions are loop invariants or constants, collect them and
-   call vect_get_constant_vectors() to create vector stmts.
-   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
-   must be stored in the corresponding child of SLP_NODE, and we call
-   vect_get_slp_vect_defs () to retrieve them.  */
+/* Get N vectorized definitions for SLP_NODE.  */
 
 void
-vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
+vect_get_slp_defs (vec_info *,
+                  slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
 {
   if (n == -1U)
     n = SLP_TREE_CHILDREN (slp_node).length ();
@@ -3604,13 +3809,11 @@ vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
 
       /* For each operand we check if it has vectorized definitions in a child
         node or we need to create them (for invariants and constants).  */
+      vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
-       {
-         vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
-         vect_get_slp_vect_defs (child, &vec_defs);
-       }
+       vect_get_slp_vect_defs (child, &vec_defs);
       else
-       vect_get_constant_vectors (child, slp_node, &vec_defs);
+       vec_defs.splice (SLP_TREE_VEC_DEFS (child));
 
       vec_oprnds->quick_push (vec_defs);
     }
@@ -3618,20 +3821,18 @@ vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
 
 /* Generate vector permute statements from a list of loads in DR_CHAIN.
    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
-   permute statements for the SLP node NODE of the SLP instance
-   SLP_NODE_INSTANCE.  */
+   permute statements for the SLP node NODE.  */
 
 bool
-vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
+vect_transform_slp_perm_load (vec_info *vinfo,
+                             slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
-                             slp_instance slp_node_instance, bool analyze_only,
-                             unsigned *n_perms)
+                             bool analyze_only, unsigned *n_perms)
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-  vec_info *vinfo = stmt_info->vinfo;
   int vec_index = 0;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   unsigned int mask_element;
   machine_mode mode;
 
@@ -3805,7 +4006,8 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                                               first_vec, second_vec,
                                               mask_vec);
                      perm_stmt_info
-                       = vect_finish_stmt_generation (stmt_info, perm_stmt,
+                       = vect_finish_stmt_generation (vinfo,
+                                                      stmt_info, perm_stmt,
                                                       gsi);
                    }
                  else
@@ -3832,35 +4034,36 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 /* Vectorize SLP instance tree in postorder.  */
 
 static void
-vect_schedule_slp_instance (slp_tree node, slp_instance instance,
-                           scalar_stmts_to_slp_tree_map_t *bst_map)
+vect_schedule_slp_instance (vec_info *vinfo,
+                           slp_tree node, slp_instance instance)
 {
   gimple_stmt_iterator si;
-  stmt_vec_info stmt_info;
-  unsigned int group_size;
-  tree vectype;
   int i, j;
   slp_tree child;
 
-  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
-    return;
-
   /* See if we have already vectorized the node in the graph of the
      SLP instance.  */
-  if (SLP_TREE_VEC_STMTS (node).exists ())
+  if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
+       && SLP_TREE_VEC_STMTS (node).exists ())
+      || SLP_TREE_VEC_DEFS (node).exists ())
     return;
 
-  /* See if we have already vectorized the same set of stmts and reuse their
-     vectorized stmts across instances.  */
-  if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
+  /* Vectorize externals and constants.  */
+  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
+      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
     {
-      SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
+      /* ???  vectorizable_shift can end up using a scalar operand which is
+        currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
+        node in this case.  */
+      if (!SLP_TREE_VECTYPE (node))
+       return;
+
+      vect_create_constant_vectors (vinfo, node);
       return;
     }
 
-  bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_schedule_slp_instance (child, instance, bst_map);
+    vect_schedule_slp_instance (vinfo, child, instance);
 
   /* Push SLP node def-type to stmts.  */
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
@@ -3871,12 +4074,12 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
          STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
       }
 
-  stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
 
   /* VECTYPE is the type of the destination.  */
-  vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
-  group_size = SLP_INSTANCE_GROUP_SIZE (instance);
+  unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
 
   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
   SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
@@ -3893,6 +4096,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
 
   /* Handle two-operation SLP nodes by vectorizing the group with
      both operations and then performing a merge.  */
+  bool done_p = false;
   if (SLP_TREE_TWO_OPERATORS (node))
     {
       gassign *stmt = as_a <gassign *> (stmt_info->stmt);
@@ -3917,11 +4121,11 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
          vec<stmt_vec_info> v1;
          unsigned j;
          tree tmask = NULL_TREE;
-         vect_transform_stmt (stmt_info, &si, node, instance);
+         vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
          v0 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
          gimple_assign_set_rhs_code (stmt, ocode);
-         vect_transform_stmt (stmt_info, &si, node, instance);
+         vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
          gimple_assign_set_rhs_code (stmt, code0);
          v1 = SLP_TREE_VEC_STMTS (node).copy ();
          SLP_TREE_VEC_STMTS (node).truncate (0);
@@ -3959,14 +4163,15 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
                                           gimple_assign_lhs (v1[j]->stmt),
                                           tmask);
              SLP_TREE_VEC_STMTS (node).quick_push
-               (vect_finish_stmt_generation (stmt_info, vstmt, &si));
+               (vect_finish_stmt_generation (vinfo, stmt_info, vstmt, &si));
            }
          v0.release ();
          v1.release ();
-         return;
+         done_p = true;
        }
     }
-  vect_transform_stmt (stmt_info, &si, node, instance);
+  if (!done_p)
+    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
 
   /* Restore stmt def-types.  */
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
@@ -3984,7 +4189,8 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
    SLP instances may refer to the same scalar stmt.  */
 
 static void
-vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
+vect_remove_slp_scalar_calls (vec_info *vinfo,
+                             slp_tree node, hash_set<slp_tree> &visited)
 {
   gimple *new_stmt;
   gimple_stmt_iterator gsi;
@@ -4000,7 +4206,7 @@ vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
     return;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_remove_slp_scalar_calls (child, visited);
+    vect_remove_slp_scalar_calls (vinfo, child, visited);
 
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
     {
@@ -4013,16 +4219,66 @@ vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
       lhs = gimple_call_lhs (stmt);
       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
       gsi = gsi_for_stmt (stmt);
-      stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
+      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
     }
 }
 
 static void
-vect_remove_slp_scalar_calls (slp_tree node)
+vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
 {
   hash_set<slp_tree> visited;
-  vect_remove_slp_scalar_calls (node, visited);
+  vect_remove_slp_scalar_calls (vinfo, node, visited);
+}
+
+/* Vectorize the instance root.  */
+
+void
+vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
+{
+  gassign *rstmt = NULL;
+
+  if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
+    {
+      stmt_vec_info child_stmt_info;
+      int j;
+
+      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
+       {
+         tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
+         tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
+         if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
+                                         TREE_TYPE (vect_lhs)))
+           vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
+                              vect_lhs);
+         rstmt = gimple_build_assign (root_lhs, vect_lhs);
+         break;
+       }
+    }
+  else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
+    {
+      int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
+      stmt_vec_info child_stmt_info;
+      int j;
+      vec<constructor_elt, va_gc> *v;
+      vec_alloc (v, nelts);
+
+      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
+       {
+         CONSTRUCTOR_APPEND_ELT (v,
+                                 NULL_TREE,
+                                 gimple_get_lhs (child_stmt_info->stmt));
+       }
+      tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
+      tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
+      tree r_constructor = build_constructor (rtype, v);
+      rstmt = gimple_build_assign (lhs, r_constructor);
+    }
+
+    gcc_assert (rstmt);
+
+    gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
+    gsi_replace (&rgsi, rstmt, true);
 }
 
 /* Generate vector code for all SLP instances in the loop/basic block.  */
@@ -4034,19 +4290,20 @@ vect_schedule_slp (vec_info *vinfo)
   slp_instance instance;
   unsigned int i;
 
-  scalar_stmts_to_slp_tree_map_t *bst_map
-    = new scalar_stmts_to_slp_tree_map_t ();
   slp_instances = vinfo->slp_instances;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
+      slp_tree node = SLP_INSTANCE_TREE (instance);
       /* Schedule the tree of INSTANCE.  */
-      vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
-                                 instance, bst_map);
+      vect_schedule_slp_instance (vinfo, node, instance);
+
+      if (SLP_INSTANCE_ROOT_STMT (instance))
+       vectorize_slp_instance_root_stmt (node, instance);
+
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                          "vectorizing stmts using SLP.\n");
     }
-  delete bst_map;
 
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
@@ -4062,14 +4319,16 @@ vect_schedule_slp (vec_info *vinfo)
         stmts starting from the SLP tree root if they have no
         uses.  */
       if (is_a <loop_vec_info> (vinfo))
-       vect_remove_slp_scalar_calls (root);
+       vect_remove_slp_scalar_calls (vinfo, root);
 
-      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
-                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
+      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
         {
          if (!STMT_VINFO_DATA_REF (store_info))
            break;
 
+         if (SLP_INSTANCE_ROOT_STMT (instance))
+           continue;
+
          store_info = vect_orig_stmt (store_info);
          /* Free the attached stmt_vec_info and remove the stmt.  */
          vinfo->remove_stmt (store_info);