diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 3ac4aee02b3e6d612ddbe0fcce8f3745279256b3..b0cbbac0cb5ba1ffce706715d3dbb9139063803d 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1,5 +1,5 @@
 /* Loop Vectorization
-   Copyright (C) 2003-2018 Free Software Foundation, Inc.
+   Copyright (C) 2003-2019 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
    Ira Rosen <irar@il.ibm.com>
 
@@ -159,7 +159,7 @@ static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
    may already be set for general statements (not just data refs).  */
 
-static bool
+static opt_result
 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
                              bool vectype_maybe_set_p,
                              poly_uint64 *vf,
@@ -173,13 +173,14 @@ vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
-      return true;
+      return opt_result::success ();
     }
 
   tree stmt_vectype, nunits_vectype;
-  if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
-                                      &nunits_vectype))
-    return false;
+  opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
+                                                  &nunits_vectype);
+  if (!res)
+    return res;
 
   if (stmt_vectype)
     {
@@ -199,7 +200,7 @@ vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
   if (nunits_vectype)
     vect_update_max_nunits (vf, nunits_vectype);
 
-  return true;
+  return opt_result::success ();
 }
 
 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
@@ -209,18 +210,18 @@ vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
    add them to MASK_PRODUCERS.  Return true on success or false if
    something prevented vectorization.  */
 
-static bool
+static opt_result
 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
                            vec<stmt_vec_info > *mask_producers)
 {
   vec_info *vinfo = stmt_info->vinfo;
   if (dump_enabled_p ())
-    {
-      dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
-      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
-    }
-  if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
-    return false;
+    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
+                    stmt_info->stmt);
+  opt_result res
+    = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
+  if (!res)
+    return res;
 
   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
       && STMT_VINFO_RELATED_STMT (stmt_info))
@@ -234,28 +235,27 @@ vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
        {
          stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_NOTE, vect_location,
-                              "==> examining pattern def stmt: ");
-             dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
-                               def_stmt_info->stmt, 0);
-           }
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "==> examining pattern def stmt: %G",
+                            def_stmt_info->stmt);
-         if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
-                                            vf, mask_producers))
-           return false;
+         res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
+                                             vf, mask_producers);
+         if (!res)
+           return res;
        }
 
       if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "==> examining pattern statement: ");
-         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
-       }
-      if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
-       return false;
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "==> examining pattern statement: %G",
+                        stmt_info->stmt);
+      res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
+      if (!res)
+       return res;
     }
 
-  return true;
+  return opt_result::success ();
 }
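The conversions in these hunks all follow one pattern: a predicate that used to return bool now returns an opt_result, reports failures through failure_at with a statement or location plus a message, and lets callers forward the failure unchanged with "if (!res) return res;".  A minimal standalone sketch of that flow follows; the class is purely illustrative and much simpler than GCC's opt_result, which also ties into the -fopt-info dump machinery.

#include <cstdio>
#include <string>

/* Illustrative stand-in for opt_result: success, or failure plus a reason.  */
class result
{
  bool m_ok;
  std::string m_reason;
  result (bool ok, const std::string &reason) : m_ok (ok), m_reason (reason) {}
public:
  static result success () { return result (true, ""); }
  static result failure_at (const char *where, const char *why)
  { return result (false, std::string (where) + ": " + why); }
  explicit operator bool () const { return m_ok; }
  const std::string &reason () const { return m_reason; }
};

/* A leaf analysis either succeeds or says why it failed.  */
static result
analyze_niters (int niters)
{
  if (niters == 0)
    return result::failure_at ("loop", "number of iterations = 0");
  return result::success ();
}

/* Callers forward failures unchanged, as the patched code does.  */
static result
analyze_loop (int niters)
{
  result res = analyze_niters (niters);
  if (!res)
    return res;
  return result::success ();
}

int
main ()
{
  result res = analyze_loop (0);
  if (!res)
    printf ("not vectorized: %s\n", res.reason ().c_str ());
  return 0;
}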
 
 /* Function vect_determine_vectorization_factor
@@ -283,10 +283,10 @@ vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
         }
 */
 
-static bool
+static opt_result
 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   unsigned nbbs = loop->num_nodes;
   poly_uint64 vectorization_factor = 1;
@@ -309,10 +309,8 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
          phi = si.phi ();
          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
-             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
-           }
+           dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
+                            phi);
 
          gcc_assert (stmt_info);
 
@@ -323,35 +321,21 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_NOTE, vect_location,
-                                   "get vectype for scalar type:  ");
-                 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
-                  dump_printf (MSG_NOTE, "\n");
-               }
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "get vectype for scalar type:  %T\n",
+                                scalar_type);
 
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
-               {
-                 if (dump_enabled_p ())
-                   {
-                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                       "not vectorized: unsupported "
-                                       "data-type ");
-                     dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                         scalar_type);
-                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                   }
-                 return false;
-               }
+               return opt_result::failure_at (phi,
+                                              "not vectorized: unsupported "
+                                              "data-type %T\n",
+                                              scalar_type);
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
 
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
-                 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
-                  dump_printf (MSG_NOTE, "\n");
-               }
+               dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
+                                vectype);
 
              if (dump_enabled_p ())
                {
@@ -368,9 +352,11 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
           gsi_next (&si))
        {
          stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
-         if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
-                                          &mask_producers))
-           return false;
+         opt_result res
+           = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
+                                         &mask_producers);
+         if (!res)
+           return res;
         }
     }
 
@@ -383,24 +369,20 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
     }
 
   if (known_le (vectorization_factor, 1U))
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "not vectorized: unsupported data-type\n");
-      return false;
-    }
+    return opt_result::failure_at (vect_location,
+                                  "not vectorized: unsupported data-type\n");
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 
   for (i = 0; i < mask_producers.length (); i++)
     {
       stmt_info = mask_producers[i];
-      tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
+      opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
       if (!mask_type)
-       return false;
+       return opt_result::propagate_failure (mask_type);
       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
     }
 
-  return true;
+  return opt_result::success ();
 }
 
 
@@ -432,13 +414,8 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 
   if (dump_enabled_p ())
-    {
-      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
-      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
-      dump_printf (MSG_NOTE, ",  init: ");
-      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
-      dump_printf (MSG_NOTE, "\n");
-    }
+    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
+                    step_expr, init_expr);
 
   *init = init_expr;
   *step = step_expr;
@@ -462,6 +439,40 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
   return true;
 }
 
+/* Return true if PHI, described by STMT_INFO, is the inner PHI in
+   what we are assuming is a double reduction.  For example, given
+   a structure like this:
+
+      outer1:
+       x_1 = PHI <x_4(outer2), ...>;
+       ...
+
+      inner:
+       x_2 = PHI <x_1(outer1), ...>;
+       ...
+       x_3 = ...;
+       ...
+
+      outer2:
+       x_4 = PHI <x_3(inner)>;
+       ...
+
+   outer loop analysis would treat x_1 as a double reduction phi and
+   this function would then return true for x_2.  */
+
+static bool
+vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
+{
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  use_operand_p use_p;
+  ssa_op_iter op_iter;
+  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
+    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
+      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
+       return true;
+  return false;
+}
+
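At the source level, the structure in the comment corresponds to a reduction that is accumulated across both loops of a nest; a hypothetical example (illustrative names only):

/* x_1/x_2/x_3/x_4 in the comment above are roughly the SSA names this sum
   gets: x is reduced in the inner loop and carried around the outer loop,
   so during outer-loop analysis the inner phi must not be treated as a
   simple induction.  */
int
sum2d (int a[16][16])
{
  int x = 0;
  for (int i = 0; i < 16; i++)   /* outer1 / outer2 */
    for (int j = 0; j < 16; j++) /* inner */
      x += a[i][j];
  return x;
}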
 /* Function vect_analyze_scalar_cycles_1.
 
    Examine the cross iteration def-use cycles of scalar variables
@@ -470,11 +481,11 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
    enclosing LOOP).  */
 
 static void
-vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
+vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
 {
   basic_block bb = loop->header;
   tree init, step;
-  auto_vec<gimple *, 64> worklist;
+  auto_vec<stmt_vec_info, 64> worklist;
   gphi_iterator gsi;
   bool double_reduc;
 
@@ -491,10 +502,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 
       if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
-         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
-       }
+       dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 
       /* Skip virtual phi's.  The data dependences that are associated with
          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
@@ -509,12 +517,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
        {
          STRIP_NOPS (access_fn);
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_NOTE, vect_location,
-                               "Access function of PHI: ");
-             dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
-              dump_printf (MSG_NOTE, "\n");
-           }
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Access function of PHI: %T\n", access_fn);
          STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
            = initial_condition_in_loop_num (access_fn, loop->num);
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
@@ -522,11 +526,12 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
        }
 
       if (!access_fn
+         || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
        {
-         worklist.safe_push (phi);
+         worklist.safe_push (stmt_vinfo);
          continue;
        }
 
@@ -543,15 +548,12 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
   /* Second - identify all reductions and nested cycles.  */
   while (worklist.length () > 0)
     {
-      gimple *phi = worklist.pop ();
+      stmt_vec_info stmt_vinfo = worklist.pop ();
+      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
       tree def = PHI_RESULT (phi);
-      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 
       if (dump_enabled_p ())
-        {
-          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
-          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
-        }
+       dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
 
       gcc_assert (!virtual_operand_p (def)
                  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
@@ -631,7 +633,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 static void
 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 
   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 
@@ -648,12 +650,12 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 }
 
-/* Transfer group and reduction information from STMT to its pattern stmt.  */
+/* Transfer group and reduction information from STMT_INFO to its
+   pattern stmt.  */
 
 static void
-vect_fixup_reduc_chain (gimple *stmt)
+vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 {
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
   stmt_vec_info stmtp;
   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
@@ -661,14 +663,14 @@ vect_fixup_reduc_chain (gimple *stmt)
   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
   do
     {
-      stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
+      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
-      stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
-      if (stmt)
+      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
+      if (stmt_info)
        REDUC_GROUP_NEXT_ELEMENT (stmtp)
-         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
+         = STMT_VINFO_RELATED_STMT (stmt_info);
     }
-  while (stmt);
+  while (stmt_info);
   STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
 }
 
@@ -677,18 +679,18 @@ vect_fixup_reduc_chain (gimple *stmt)
 static void
 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 {
-  gimple *first;
+  stmt_vec_info first;
   unsigned i;
 
   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
-    if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
+    if (STMT_VINFO_IN_PATTERN_P (first))
       {
-       gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
+       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
        while (next)
          {
-           if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
+           if (! STMT_VINFO_IN_PATTERN_P (next))
              break;
-           next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
+           next = REDUC_GROUP_NEXT_ELEMENT (next);
          }
        /* If not all stmt in the chain are patterns try to handle
           the chain without patterns.  */
@@ -696,7 +698,7 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
          {
            vect_fixup_reduc_chain (first);
            LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
-             = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
+             = STMT_VINFO_RELATED_STMT (first);
          }
       }
 }
@@ -712,11 +714,11 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 
 
 static gcond *
-vect_get_loop_niters (struct loop *loop, tree *assumptions,
+vect_get_loop_niters (class loop *loop, tree *assumptions,
                      tree *number_of_iterations, tree *number_of_iterationsm1)
 {
   edge exit = single_exit (loop);
-  struct tree_niter_desc niter_desc;
+  class tree_niter_desc niter_desc;
   tree niter_assumptions, niter, may_be_zero;
   gcond *cond = get_loop_exit_condition (loop);
 
@@ -728,9 +730,7 @@ vect_get_loop_niters (struct loop *loop, tree *assumptions,
   if (!exit)
     return cond;
 
-  niter = chrec_dont_know;
   may_be_zero = NULL_TREE;
-  niter_assumptions = boolean_true_node;
   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
       || chrec_contains_undetermined (niter_desc.niter))
     return cond;
@@ -793,7 +793,7 @@ vect_get_loop_niters (struct loop *loop, tree *assumptions,
 static bool
 bb_in_loop_p (const_basic_block bb, const void *data)
 {
-  const struct loop *const loop = (const struct loop *)data;
+  const class loop *const loop = (const class loop *)data;
   if (flow_bb_inside_loop_p (loop, bb))
     return true;
   return false;
@@ -803,7 +803,7 @@ bb_in_loop_p (const_basic_block bb, const void *data)
 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 
-_loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
+_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
   : vec_info (vec_info::loop, init_cost (loop_in), shared),
     loop (loop_in),
     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
@@ -817,10 +817,12 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
     mask_compare_type (NULL_TREE),
+    simd_if_cond (NULL_TREE),
     unaligned_dr (NULL),
     peeling_for_alignment (0),
     ptr_mask (0),
     ivexpr_map (NULL),
+    scan_map (NULL),
     slp_unrolling_factor (1),
     single_scalar_iteration_cost (0),
     vectorizable (false),
@@ -831,14 +833,22 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
     operands_swapped (false),
     no_data_dependencies (false),
     has_mask_store (false),
+    scalar_loop_scaling (profile_probability::uninitialized ()),
     scalar_loop (NULL),
     orig_loop_info (NULL)
 {
-  /* Create/Update stmt_info for all stmts in the loop.  */
-  basic_block *body = get_loop_body (loop);
-  for (unsigned int i = 0; i < loop->num_nodes; i++)
+  /* CHECKME: We want to visit all BBs before their successors (except for
+     latch blocks, for which this assertion wouldn't hold).  In the simple
+     case of the loop forms we allow, a dfs order of the BBs would the same
+     as reversed postorder traversal, so we are safe.  */
+
+  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
+                                         bbs, loop->num_nodes, loop);
+  gcc_assert (nbbs == loop->num_nodes);
+
+  for (unsigned int i = 0; i < nbbs; i++)
     {
-      basic_block bb = body[i];
+      basic_block bb = bbs[i];
       gimple_stmt_iterator si;
 
       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
@@ -853,18 +863,28 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
          gimple *stmt = gsi_stmt (si);
          gimple_set_uid (stmt, 0);
          add_stmt (stmt);
+         /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
+            third argument is the #pragma omp simd if (x) condition, when 0,
+            loop shouldn't be vectorized, when non-zero constant, it should
+            be vectorized normally, otherwise versioned with vectorized loop
+            done if the condition is non-zero at runtime.  */
+         if (loop_in->simduid
+             && is_gimple_call (stmt)
+             && gimple_call_internal_p (stmt)
+             && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
+             && gimple_call_num_args (stmt) >= 3
+             && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
+             && (loop_in->simduid
+                 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
+           {
+             tree arg = gimple_call_arg (stmt, 2);
+             if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
+               simd_if_cond = arg;
+             else
+               gcc_assert (integer_nonzerop (arg));
+           }
        }
     }
-  free (body);
-
-  /* CHECKME: We want to visit all BBs before their successors (except for
-     latch blocks, for which this assertion wouldn't hold).  In the simple
-     case of the loop forms we allow, a dfs order of the BBs would the same
-     as reversed postorder traversal, so we are safe.  */
-
-  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
-                                         bbs, loop->num_nodes, loop);
-  gcc_assert (nbbs == loop->num_nodes);
 }
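The new simd_if_cond handling keys off the extra argument of the .GOMP_SIMD_LANE internal call; a hypothetical source loop that produces such a call when compiled with OpenMP SIMD support (the function and variable names are made up for illustration):

/* The `if (flag)' clause survives into the vectoriser as the third
   argument of the .GOMP_SIMD_LANE call for this loop: if it is literal 0
   the loop is not vectorised, if it is a non-zero constant the loop is
   vectorised normally, and otherwise the loop is versioned on the
   run-time value of flag, as the comment above describes.  */
void
scale (float *x, int n, int flag)
{
#pragma omp simd if (flag)
  for (int i = 0; i < n; i++)
    x[i] *= 2.0f;
}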
 
 /* Free all levels of MASKS.  */
@@ -888,15 +908,10 @@ _loop_vec_info::~_loop_vec_info ()
   gimple_stmt_iterator si;
   int j;
 
-  /* ???  We're releasing loop_vinfos en-block.  */
-  set_stmt_vec_info_vec (&stmt_vec_infos);
   nbbs = loop->num_nodes;
   for (j = 0; j < nbbs; j++)
     {
       basic_block bb = bbs[j];
-      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
-        free_stmt_vec_info (gsi_stmt (si));
-
       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
         {
          gimple *stmt = gsi_stmt (si);
@@ -936,9 +951,6 @@ _loop_vec_info::~_loop_vec_info ()
                    }
                }
            }
-
-         /* Free stmt_vec_info.  */
-         free_stmt_vec_info (stmt);
           gsi_next (&si);
         }
     }
@@ -947,6 +959,7 @@ _loop_vec_info::~_loop_vec_info ()
 
   release_vec_loop_masks (&masks);
   delete ivexpr_map;
+  delete scan_map;
 
   loop->aux = NULL;
 }
@@ -1016,8 +1029,10 @@ vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 static bool
 vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
 
   /* Use a normal loop if there are no statements that need masking.
      This only happens in rare degenerate cases: it means that the loop
@@ -1036,7 +1051,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     max_ni = wi::smin (max_ni, max_back_edges + 1);
 
   /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
 
   /* Work out how many bits we need to represent the limit.  */
   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1044,6 +1059,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  unsigned int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+                                     UNSIGNED);
+
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1055,10 +1078,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
              && can_produce_all_loop_masks_p (loop_vinfo, this_type))
            {
              /* Although we could stop as soon as we find a valid mode,
-                it's often better to continue until we hit Pmode, since the
-                operands to the WHILE are more likely to be reusable in
-                address calculations.  */
-             cmp_type = this_type;
+                there are at least two reasons why that's not always the
+                best choice:
+
+                - An IV that's Pmode or wider is more likely to be reusable
+                  in address calculations than an IV that's narrower than
+                  Pmode.
+
+                - Doing the comparison in IV_PRECISION or wider allows
+                  a natural 0-based IV, whereas using a narrower comparison
+                  type requires mitigations against wrap-around.
+
+                Conversely, if the IV limit is variable, doing the comparison
+                in a wider type than the original type can introduce
+                unnecessary extensions, so picking the widest valid mode
+                is not always a good choice either.
+
+                Here we prefer the first IV type that's Pmode or wider,
+                and the first comparison type that's IV_PRECISION or wider.
+                (The comparison type must be no wider than the IV type,
+                to avoid extensions in the vector loop.)
+
+                ??? We might want to try continuing beyond Pmode for ILP32
+                targets if CMP_BITS < IV_PRECISION.  */
+             iv_type = this_type;
+             if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
+               cmp_type = this_type;
              if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
                break;
            }
@@ -1069,6 +1114,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     return false;
 
   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
   return true;
 }
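A rough model of the selection policy spelled out in the new comment, with the mode walk reduced to an ascending list of candidate bit widths and with the simplifying assumption that every candidate can implement WHILE_ULT and the required loop masks; all names below are invented for the sketch.

#include <cstdio>
#include <vector>

struct type_choice { unsigned cmp_bits; unsigned iv_bits; };

/* Walk candidate widths from narrowest to widest, keeping the first
   comparison width that reaches IV_PRECISION and the first IV width that
   reaches the pointer width, as the comment above describes.  */
static type_choice
pick_widths (const std::vector<unsigned> &widths, unsigned min_ni_width,
	     unsigned iv_precision, unsigned pmode_bits)
{
  type_choice c = { 0, 0 };
  for (unsigned bits : widths)
    {
      if (bits < min_ni_width)
	continue;
      c.iv_bits = bits;
      if (c.cmp_bits == 0 || iv_precision > c.cmp_bits)
	c.cmp_bits = bits;
      if (bits >= pmode_bits)
	break;
    }
  return c;
}

int
main ()
{
  /* E.g. a 16-bit iteration limit, a 32-bit 0-based IV and 64-bit Pmode:
     the comparison is done in 32 bits, the IV in 64 bits.  */
  type_choice c = pick_widths ({8, 16, 32, 64}, 16, 32, 64);
  printf ("cmp_bits=%u iv_bits=%u\n", c.cmp_bits, c.iv_bits);
  return 0;
}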
 
@@ -1076,11 +1122,13 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 static void
 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes, factor;
   int innerloop_iters, i;
 
+  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
+
   /* Gather costs for statements in the scalar loop.  */
 
   /* FORNOW.  */
@@ -1107,11 +1155,11 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
             continue;
 
           /* Skip stmts that are not vectorized inside the loop.  */
-          if (stmt_info
-              && !STMT_VINFO_RELEVANT_P (stmt_info)
-              && (!STMT_VINFO_LIVE_P (stmt_info)
-                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
-             && !STMT_VINFO_IN_PATTERN_P (stmt_info))
+         stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
+          if (!STMT_VINFO_RELEVANT_P (vstmt_info)
+              && (!STMT_VINFO_LIVE_P (vstmt_info)
+                  || !VECTORIZABLE_CYCLE_DEF
+                       (STMT_VINFO_DEF_TYPE (vstmt_info))))
             continue;
 
          vect_cost_for_stmt kind;
@@ -1136,13 +1184,9 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
   int j;
   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                    j, si)
-    {
-      struct _stmt_vec_info *stmt_info
-       = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
-      (void) add_stmt_cost (target_cost_data, si->count,
-                           si->kind, stmt_info, si->misalign,
-                           vect_body);
-    }
+    (void) add_stmt_cost (target_cost_data, si->count,
+                         si->kind, si->stmt_info, si->misalign,
+                         vect_body);
   unsigned dummy, body_cost = 0;
   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
   destroy_cost_data (target_cost_data);
@@ -1159,8 +1203,8 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
    - the number of iterations can be analyzed, i.e, a countable loop.  The
      niter could be analyzed under some assumptions.  */
 
-bool
-vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
+opt_result
+vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
                          tree *assumptions, tree *number_of_iterationsm1,
                          tree *number_of_iterations, gcond **inner_loop_cond)
 {
@@ -1185,24 +1229,17 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
                         (exit-bb)  */
 
       if (loop->num_nodes != 2)
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: control flow in loop.\n");
-          return false;
-        }
+       return opt_result::failure_at (vect_location,
+                                      "not vectorized:"
+                                      " control flow in loop.\n");
 
       if (empty_block_p (loop->header))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: empty loop.\n");
-         return false;
-       }
+       return opt_result::failure_at (vect_location,
+                                      "not vectorized: empty loop.\n");
     }
   else
     {
-      struct loop *innerloop = loop->inner;
+      class loop *innerloop = loop->inner;
       edge entryedge;
 
       /* Nested loop. We currently require that the loop is doubly-nested,
@@ -1223,75 +1260,60 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
         as described above.  */
 
       if ((loop->inner)->inner || (loop->inner)->next)
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: multiple nested loops.\n");
-         return false;
-       }
+       return opt_result::failure_at (vect_location,
+                                      "not vectorized:"
+                                      " multiple nested loops.\n");
 
       if (loop->num_nodes != 5)
-        {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: control flow in loop.\n");
-         return false;
-        }
+       return opt_result::failure_at (vect_location,
+                                      "not vectorized:"
+                                      " control flow in loop.\n");
 
       entryedge = loop_preheader_edge (innerloop);
       if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: unsupported outerloop form.\n");
-         return false;
-       }
+       return opt_result::failure_at (vect_location,
+                                      "not vectorized:"
+                                      " unsupported outerloop form.\n");
 
       /* Analyze the inner-loop.  */
       tree inner_niterm1, inner_niter, inner_assumptions;
-      if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
-                                     &inner_assumptions, &inner_niterm1,
-                                     &inner_niter, NULL)
-         /* Don't support analyzing niter under assumptions for inner
-            loop.  */
-         || !integer_onep (inner_assumptions))
+      opt_result res
+       = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
+                                   &inner_assumptions, &inner_niterm1,
+                                   &inner_niter, NULL);
+      if (!res)
        {
          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: Bad inner loop.\n");
-         return false;
+         return res;
        }
 
+      /* Don't support analyzing niter under assumptions for inner
+        loop.  */
+      if (!integer_onep (inner_assumptions))
+       return opt_result::failure_at (vect_location,
+                                      "not vectorized: Bad inner loop.\n");
+
       if (!expr_invariant_in_loop_p (loop, inner_niter))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: inner-loop count not"
-                             " invariant.\n");
-         return false;
-       }
+       return opt_result::failure_at (vect_location,
+                                      "not vectorized: inner-loop count not"
+                                      " invariant.\n");
 
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
                         "Considering outer-loop vectorization.\n");
     }
 
-  if (!single_exit (loop)
-      || EDGE_COUNT (loop->header->preds) != 2)
-    {
-      if (dump_enabled_p ())
-        {
-          if (!single_exit (loop))
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: multiple exits.\n");
-          else if (EDGE_COUNT (loop->header->preds) != 2)
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: too many incoming edges.\n");
-        }
-      return false;
-    }
+  if (!single_exit (loop))
+    return opt_result::failure_at (vect_location,
+                                  "not vectorized: multiple exits.\n");
+  if (EDGE_COUNT (loop->header->preds) != 2)
+    return opt_result::failure_at (vect_location,
+                                  "not vectorized:"
+                                  " too many incoming edges.\n");
 
   /* We assume that the loop exit condition is at the end of the loop. i.e,
      that the loop is represented as a do-while (with a proper if-guard
@@ -1299,67 +1321,52 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
      executable statements, and the latch is empty.  */
   if (!empty_block_p (loop->latch)
       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: latch block not empty.\n");
-      return false;
-    }
+    return opt_result::failure_at (vect_location,
+                                  "not vectorized: latch block not empty.\n");
 
   /* Make sure the exit is not abnormal.  */
   edge e = single_exit (loop);
   if (e->flags & EDGE_ABNORMAL)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: abnormal loop exit edge.\n");
-      return false;
-    }
+    return opt_result::failure_at (vect_location,
+                                  "not vectorized:"
+                                  " abnormal loop exit edge.\n");
 
   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
                                     number_of_iterationsm1);
   if (!*loop_cond)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: complicated exit condition.\n");
-      return false;
-    }
+    return opt_result::failure_at
+      (vect_location,
+       "not vectorized: complicated exit condition.\n");
 
   if (integer_zerop (*assumptions)
       || !*number_of_iterations
       || chrec_contains_undetermined (*number_of_iterations))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: number of iterations cannot be "
-                        "computed.\n");
-      return false;
-    }
+    return opt_result::failure_at
+      (*loop_cond,
+       "not vectorized: number of iterations cannot be computed.\n");
 
   if (integer_zerop (*number_of_iterations))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: number of iterations = 0.\n");
-      return false;
-    }
+    return opt_result::failure_at
+      (*loop_cond,
+       "not vectorized: number of iterations = 0.\n");
 
-  return true;
+  return opt_result::success ();
 }
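For reference, a hypothetical loop nest with the shape these checks accept for outer-loop vectorization: a doubly nested counted loop, a single exit per level, empty latches after canonicalization, and an inner iteration count that is invariant in the outer loop (the function and parameters are made up):

void
saxpy_rows (float *x, const float *y, int n, int m, float a)
{
  /* The outer loop is the candidate for outer-loop vectorization; the
     inner loop's iteration count M does not vary in the outer loop and
     each loop has exactly one exit.  */
  for (int i = 0; i < n; i++)
    for (int j = 0; j < m; j++)
      x[i * m + j] += a * y[i * m + j];
}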
 
 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
 
-loop_vec_info
-vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
+opt_loop_vec_info
+vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
 {
   tree assumptions, number_of_iterations, number_of_iterationsm1;
   gcond *loop_cond, *inner_loop_cond = NULL;
 
-  if (! vect_analyze_loop_form_1 (loop, &loop_cond,
-                                 &assumptions, &number_of_iterationsm1,
-                                 &number_of_iterations, &inner_loop_cond))
-    return NULL;
+  opt_result res
+    = vect_analyze_loop_form_1 (loop, &loop_cond,
+                               &assumptions, &number_of_iterationsm1,
+                               &number_of_iterations, &inner_loop_cond);
+  if (!res)
+    return opt_loop_vec_info::propagate_failure (res);
 
   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
@@ -1401,7 +1408,7 @@ vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
 
   gcc_assert (!loop->aux);
   loop->aux = loop_vinfo;
-  return loop_vinfo;
+  return opt_loop_vec_info::success (loop_vinfo);
 }
 
 
@@ -1412,7 +1419,7 @@ vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
 static void
 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
   poly_uint64 vectorization_factor;
@@ -1436,9 +1443,7 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo)
           gsi_next (&si))
        {
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
-         if (STMT_VINFO_IN_PATTERN_P (stmt_info)
-             && STMT_VINFO_RELATED_STMT (stmt_info))
-           stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
+         stmt_info = vect_stmt_to_vectorize (stmt_info);
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
@@ -1449,14 +1454,16 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo)
 
   if (only_slp_in_loop)
     {
-      dump_printf_loc (MSG_NOTE, vect_location,
-                      "Loop contains only SLP stmts\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Loop contains only SLP stmts\n");
       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
     }
   else
     {
-      dump_printf_loc (MSG_NOTE, vect_location,
-                      "Loop contains SLP and non-SLP stmts\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Loop contains SLP and non-SLP stmts\n");
       /* Both the vectorization factor and unroll factor have the form
         current_vector_size * X for some rational X, so they must have
         a common multiple.  */
@@ -1505,10 +1512,10 @@ vect_active_double_reduction_p (stmt_vec_info stmt_info)
 
    Scan the loop stmts and make sure they are all vectorizable.  */
 
-static bool
+static opt_result
 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
   int i;
@@ -1518,8 +1525,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 
   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
 
-  stmt_vector_for_cost cost_vec;
-  cost_vec.create (2);
+  auto_vec<stmt_info_for_cost> cost_vec;
 
   for (i = 0; i < nbbs; i++)
     {
@@ -1533,10 +1539,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 
          stmt_info = loop_vinfo->lookup_stmt (phi);
           if (dump_enabled_p ())
-            {
-              dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
-              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
-            }
+           dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
          if (virtual_operand_p (gimple_phi_result (phi)))
            continue;
 
@@ -1550,13 +1553,9 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
                  requires to actually do something here.  */
               if (STMT_VINFO_LIVE_P (stmt_info)
                  && !vect_active_double_reduction_p (stmt_info))
-                {
-                  if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "Unsupported loop-closed phi in "
-                                    "outer-loop.\n");
-                  return false;
-                }
+               return opt_result::failure_at (phi,
+                                              "Unsupported loop-closed phi"
+                                              " in outer-loop.\n");
 
               /* If PHI is used in the outer loop, we check that its operand
                  is defined in the inner loop.  */
@@ -1565,17 +1564,17 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
                   tree phi_op;
 
                   if (gimple_phi_num_args (phi) != 1)
-                    return false;
+                    return opt_result::failure_at (phi, "unsupported phi");
 
                   phi_op = PHI_ARG_DEF (phi, 0);
                  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
                  if (!op_def_info)
-                    return false;
+                   return opt_result::failure_at (phi, "unsupported phi");
 
                  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
                      && (STMT_VINFO_RELEVANT (op_def_info)
                          != vect_used_in_outer_by_reduction))
-                   return false;
+                   return opt_result::failure_at (phi, "unsupported phi");
                 }
 
               continue;
@@ -1586,24 +1585,22 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
                || STMT_VINFO_LIVE_P (stmt_info))
               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
-            {
-              /* A scalar-dependence cycle that we don't support.  */
-              if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "not vectorized: scalar dependence cycle.\n");
-              return false;
-            }
+           /* A scalar-dependence cycle that we don't support.  */
+           return opt_result::failure_at (phi,
+                                          "not vectorized:"
+                                          " scalar dependence cycle.\n");
 
           if (STMT_VINFO_RELEVANT_P (stmt_info))
             {
               need_to_vectorize = true;
               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
                  && ! PURE_SLP_STMT (stmt_info))
-                ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
+               ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
+                                            &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                        || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
                       && ! PURE_SLP_STMT (stmt_info))
-               ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
+               ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
                                             &cost_vec);
             }
 
@@ -1611,35 +1608,33 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
          if (ok
              && STMT_VINFO_LIVE_P (stmt_info)
              && !PURE_SLP_STMT (stmt_info))
-           ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
+           ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
                                              &cost_vec);
 
           if (!ok)
-            {
-              if (dump_enabled_p ())
-                {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                  "not vectorized: relevant phi not "
-                                  "supported: ");
-                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
-                }
-             return false;
-            }
+           return opt_result::failure_at (phi,
+                                          "not vectorized: relevant phi not "
+                                          "supported: %G",
+                                          static_cast <gimple *> (phi));
         }
 
       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
         {
          gimple *stmt = gsi_stmt (si);
-         if (!gimple_clobber_p (stmt)
-             && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
-                                    &cost_vec))
-           return false;
+         if (!gimple_clobber_p (stmt))
+           {
+             opt_result res
+               = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
+                                    &need_to_vectorize,
+                                    NULL, NULL, &cost_vec);
+             if (!res)
+               return res;
+           }
         }
     } /* bbs */
 
   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
-  cost_vec.release ();
 
   /* All operations in the loop are either irrelevant (deal with loop
      control, or dead), or only used outside the loop and can be moved
@@ -1651,14 +1646,12 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
                         "All the computation can be taken out of the loop.\n");
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: redundant loop. no profit to "
-                        "vectorize.\n");
-      return false;
+      return opt_result::failure_at
+       (vect_location,
+        "not vectorized: redundant loop. no profit to vectorize.\n");
     }
 
-  return true;
+  return opt_result::success ();
 }
 
 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
@@ -1668,7 +1661,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 static int
 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
 
   /* Only fully-masked loops can have iteration counts less than the
@@ -1756,7 +1749,7 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
   return 1;
 }
 
-static bool
+static opt_result
 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
                           vec<data_reference_p> *datarefs,
                           unsigned int *n_stmts)
@@ -1770,7 +1763,8 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
        if (is_gimple_debug (stmt))
          continue;
        ++(*n_stmts);
-       if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
+       opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
+       if (!res)
          {
            if (is_gimple_call (stmt) && loop->safelen)
              {
@@ -1802,15 +1796,60 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
                      }
                  }
              }
-           return false;
+           return res;
          }
        /* If dependence analysis will give up due to the limit on the
           number of datarefs stop here and fail fatally.  */
        if (datarefs->length ()
            > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
-         return false;
+         return opt_result::failure_at (stmt, "exceeded param "
+                                        "loop-max-datarefs-for-datadeps\n");
       }
-  return true;
+  return opt_result::success ();
+}
+
+/* Look for SLP-only access groups and turn each individual access into its own
+   group.  */
+static void
+vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
+{
+  unsigned int i;
+  struct data_reference *dr;
+
+  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
+
+  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
+  FOR_EACH_VEC_ELT (datarefs, i, dr)
+    {
+      gcc_assert (DR_REF (dr));
+      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
+
+      /* Check if the load is a part of an interleaving chain.  */
+      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+       {
+         stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         unsigned int group_size = DR_GROUP_SIZE (first_element);
+
+         /* Check if SLP-only groups.  */
+         if (!STMT_SLP_TYPE (stmt_info)
+             && STMT_VINFO_SLP_VECT_ONLY (first_element))
+           {
+             /* Dissolve the group.  */
+             STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
+
+             stmt_vec_info vinfo = first_element;
+             while (vinfo)
+               {
+                 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
+                 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
+                 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+                 DR_GROUP_SIZE (vinfo) = 1;
+                 DR_GROUP_GAP (vinfo) = group_size - 1;
+                 vinfo = next;
+               }
+           }
+       }
+    }
 }
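A toy rendering of what dissolving a group means here: each element of an interleaving chain becomes its own singleton group whose gap covers the rest of the original group's stride.  The struct below only loosely mirrors the DR_GROUP_* accessors and is not GCC's representation.

#include <cstdio>

struct access
{
  access *first; /* DR_GROUP_FIRST_ELEMENT */
  access *next;  /* DR_GROUP_NEXT_ELEMENT  */
  unsigned size; /* DR_GROUP_SIZE          */
  unsigned gap;  /* DR_GROUP_GAP           */
};

/* Break an interleaving chain of GROUP_SIZE accesses into singleton
   groups, mirroring the loop in vect_dissolve_slp_only_groups above.  */
static void
dissolve (access *first_element, unsigned group_size)
{
  for (access *a = first_element; a;)
    {
      access *next = a->next;
      a->first = a;
      a->next = nullptr;
      a->size = 1;
      a->gap = group_size - 1;
      a = next;
    }
}

int
main ()
{
  access a2 = { nullptr, nullptr, 0, 0 };
  access a1 = { nullptr, &a2, 0, 0 };
  access a0 = { nullptr, &a1, 3, 0 };
  a0.first = &a0;                      /* a0 leads a group of three.  */
  dissolve (&a0, a0.size);
  printf ("%u %u\n", a0.size, a0.gap); /* prints "1 2" */
  return 0;
}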
 
 /* Function vect_analyze_loop_2.
@@ -1818,10 +1857,10 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
    Apply a set of analyses on LOOP, and create a loop_vec_info struct
    for it.  The different analyses will record information in the
    loop_vec_info struct.  */
-static bool
+static opt_result
 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
 {
-  bool ok;
+  opt_result ok = opt_result::success ();
   int res;
   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
   poly_uint64 min_vf = 2;
@@ -1829,6 +1868,11 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   /* The first group of checks is independent of the vector size.  */
   fatal = true;
 
+  if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
+      && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
+    return opt_result::failure_at (vect_location,
+                                  "not vectorized: simd if(0)\n");
+
   /* Find all data references in the loop (which correspond to vdefs/vuses)
      and analyze their evolution in the loop.  */
 
@@ -1837,16 +1881,18 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   /* Gather the data references and count stmts in the loop.  */
   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
     {
-      if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
-                                     &LOOP_VINFO_DATAREFS (loop_vinfo),
-                                     n_stmts))
+      opt_result res
+       = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
+                                    &LOOP_VINFO_DATAREFS (loop_vinfo),
+                                    n_stmts);
+      if (!res)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: loop contains function "
                             "calls or data references that cannot "
                             "be analyzed\n");
-         return false;
+         return res;
        }
       loop_vinfo->shared->save_datarefs ();
     }
@@ -1856,13 +1902,13 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   /* Analyze the data references and also adjust the minimal
      vectorization factor according to the loads and stores.  */
 
-  ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
+  ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
   if (!ok)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data references.\n");
-      return false;
+      return ok;
     }
 
   /* Classify all cross-iteration scalar data-flow cycles.
@@ -1882,18 +1928,18 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data access.\n");
-      return false;
+      return ok;
     }
 
   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
 
-  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
+  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
   if (!ok)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected pattern.\n");
-      return false;
+      return ok;
     }
 
   /* While the rest of the analysis below depends on it in some way.  */
@@ -1905,15 +1951,16 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
      FORNOW: fail at the first data dependence that we encounter.  */
 
   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
-  if (!ok
-      || (max_vf != MAX_VECTORIZATION_FACTOR
-         && maybe_lt (max_vf, min_vf)))
+  if (!ok)
     {
       if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "bad data dependence.\n");
-      return false;
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data dependence.\n");
+      return ok;
     }
+  if (max_vf != MAX_VECTORIZATION_FACTOR
+      && maybe_lt (max_vf, min_vf))
+    return opt_result::failure_at (vect_location, "bad data dependence.\n");
   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
 
   ok = vect_determine_vectorization_factor (loop_vinfo);
@@ -1922,16 +1969,11 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine vectorization factor.\n");
-      return false;
+      return ok;
     }
   if (max_vf != MAX_VECTORIZATION_FACTOR
       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data dependence.\n");
-      return false;
-    }
+    return opt_result::failure_at (vect_location, "bad data dependence.\n");
 
   /* Compute the scalar iteration cost.  */
   vect_compute_single_scalar_iteration_cost (loop_vinfo);
@@ -1942,7 +1984,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
   if (!ok)
-    return false;
+    return ok;
 
   /* If there are any SLP instances mark them as pure_slp.  */
   bool slp = vect_make_slp_decision (loop_vinfo);
@@ -1973,7 +2015,7 @@ start_over:
       dump_printf_loc (MSG_NOTE, vect_location,
                       "vectorization_factor = ");
       dump_dec (MSG_NOTE, vectorization_factor);
-      dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
+      dump_printf (MSG_NOTE, ", niters = %wd\n",
                   LOOP_VINFO_INT_NITERS (loop_vinfo));
     }
 
@@ -1989,7 +2031,7 @@ start_over:
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.\n");
-      return false;
+      return ok;
     }
 
   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
@@ -1997,23 +2039,19 @@ start_over:
      since we use grouping information gathered by interleaving analysis.  */
   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
   if (!ok)
-    return false;
+    return ok;
 
-  /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
-     vectorization.  */
+  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
+     vectorization, since we do not want to add extra peeling or
+     add versioning for alignment.  */
   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
-    {
     /* This pass will decide on using loop versioning and/or loop peeling in
        order to enhance the alignment of data references in the loop.  */
     ok = vect_enhance_data_refs_alignment (loop_vinfo);
-    if (!ok)
-      {
-       if (dump_enabled_p ())
-         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "bad data alignment.\n");
-        return false;
-      }
-    }
+  else
+    ok = vect_verify_datarefs_alignment (loop_vinfo);
+  if (!ok)
+    return ok;
 
   if (slp)
     {
@@ -2023,9 +2061,16 @@ start_over:
       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
       vect_slp_analyze_operations (loop_vinfo);
       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
-       goto again;
+       {
+         ok = opt_result::failure_at (vect_location,
+                                      "unsupported SLP instances\n");
+         goto again;
+       }
     }
 
+  /* Dissolve SLP-only groups.  */
+  vect_dissolve_slp_only_groups (loop_vinfo);
+
   /* Scan all the remaining operations in the loop that are not subject
      to SLP and make sure they are vectorizable.  */
   ok = vect_analyze_loop_operations (loop_vinfo);
@@ -2034,7 +2079,7 @@ start_over:
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad operation or unsupported loop bound.\n");
-      return false;
+      return ok;
     }
 
   /* Decide whether to use a fully-masked loop for this vectorization
@@ -2063,26 +2108,22 @@ start_over:
       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
 
       if (known_lt (wi::to_widest (scalar_niters), vf))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location,
-                            "loop has no enough iterations to support"
-                            " peeling for gaps.\n");
-         return false;
-       }
+       return opt_result::failure_at (vect_location,
+                                      "loop has not enough iterations to"
+                                      " support peeling for gaps.\n");
     }
 
   /* Check the costings of the loop make vectorizing worthwhile.  */
   res = vect_analyze_loop_costing (loop_vinfo);
   if (res < 0)
-    goto again;
-  if (!res)
     {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "Loop costings not worthwhile.\n");
-      return false;
+      ok = opt_result::failure_at (vect_location,
+                                  "Loop costings may not be worthwhile.\n");
+      goto again;
     }
+  if (!res)
+    return opt_result::failure_at (vect_location,
+                                  "Loop costings not worthwhile.\n");
 
   /* Decide whether we need to create an epilogue loop to handle
      remaining scalar iterations.  */
@@ -2093,14 +2134,22 @@ start_over:
     /* The main loop handles all iterations.  */
     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
     {
-      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
-                      - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
+      /* Work out the (constant) number of iterations that need to be
+        peeled for reasons other than niters.  */
+      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+       peel_niter += 1;
+      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
                       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
        LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
     }
   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+          /* ??? When peeling for gaps but not alignment, we could
+             try to check whether the (variable) niters is known to be
+             VF * N + 1.  That's something of a niche case though.  */
+          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
           || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
           || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
                < (unsigned) exact_log2 (const_vf))
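The peeling logic above now counts the extra iteration consumed by peeling for
gaps together with the constant alignment peel before testing divisibility by
the vectorization factor.  A rough illustration with made-up numbers (not taken
from the patch): for a known niters of 17, VF = 4, 2 iterations peeled for
alignment and LOOP_VINFO_PEELING_FOR_GAPS set,

   peel_niter = 2 + 1 = 3
   (17 - 3) % 4 = 2     /* not a multiple of VF */

so LOOP_VINFO_PEELING_FOR_NITER is set and an epilogue loop will be needed.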
@@ -2123,10 +2172,9 @@ start_over:
                                           single_exit (LOOP_VINFO_LOOP
                                                         (loop_vinfo))))
         {
-          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: can't create required "
-                            "epilog loop\n");
+         ok = opt_result::failure_at (vect_location,
+                                      "not vectorized: can't create required "
+                                      "epilog loop\n");
           goto again;
         }
     }
@@ -2144,8 +2192,8 @@ start_over:
          /* Niters for peeled prolog loop.  */
          if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            {
-             struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
-             tree vectype = STMT_VINFO_VECTYPE (vect_dr_stmt (dr));
+             dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+             tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
              niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
            }
          else
@@ -2165,17 +2213,20 @@ start_over:
                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
 
   /* Ok to vectorize!  */
-  return true;
+  return opt_result::success ();
 
 again:
+  /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
+  gcc_assert (!ok);
+
   /* Try again with SLP forced off but if we didn't do any SLP there is
      no point in re-trying.  */
   if (!slp)
-    return false;
+    return ok;
 
   /* If there are reduction chains re-trying will fail anyway.  */
   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
-    return false;
+    return ok;
 
   /* Likewise if the grouped loads or stores in the SLP cannot be handled
      via interleaving or lane instructions.  */
@@ -2188,24 +2239,26 @@ again:
       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
        continue;
-      vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
+      vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
       unsigned int size = DR_GROUP_SIZE (vinfo);
       tree vectype = STMT_VINFO_VECTYPE (vinfo);
       if (! vect_store_lanes_supported (vectype, size, false)
         && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
         && ! vect_grouped_store_supported (vectype, size))
-       return false;
+       return opt_result::failure_at (vinfo->stmt,
+                                      "unsupported grouped store\n");
       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
        {
          vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
-         vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
+         vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
          bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
          size = DR_GROUP_SIZE (vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo);
          if (! vect_load_lanes_supported (vectype, size, false)
              && ! vect_grouped_load_supported (vectype, single_element_p,
                                                size))
-           return false;
+           return opt_result::failure_at (vinfo->stmt,
+                                          "unsupported grouped load\n");
        }
     }
 
@@ -2274,16 +2327,16 @@ again:
    for it.  The different analyses will record information in the
    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
    be vectorized.  */
-loop_vec_info
-vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
+opt_loop_vec_info
+vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
                   vec_info_shared *shared)
 {
-  loop_vec_info loop_vinfo;
   auto_vector_sizes vector_sizes;
 
   /* Autodetect first vector size we try.  */
   current_vector_size = 0;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
+                                               loop->simdlen != 0);
   unsigned int next_size = 0;
 
   DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2291,35 +2344,31 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
   if (loop_outer (loop)
       && loop_vec_info_for_loop (loop_outer (loop))
       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "outer-loop already vectorized.\n");
-      return NULL;
-    }
+    return opt_loop_vec_info::failure_at (vect_location,
+                                         "outer-loop already vectorized.\n");
 
   if (!find_loop_nest (loop, &shared->loop_nest))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: loop nest containing two "
-                        "or more consecutive inner loops cannot be "
-                        "vectorized\n");
-      return NULL;
-    }
+    return opt_loop_vec_info::failure_at
+      (vect_location,
+       "not vectorized: loop nest containing two or more consecutive inner"
+       " loops cannot be vectorized\n");
 
   unsigned n_stmts = 0;
   poly_uint64 autodetected_vector_size = 0;
+  opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
+  poly_uint64 first_vector_size = 0;
   while (1)
     {
       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
-      loop_vinfo = vect_analyze_loop_form (loop, shared);
+      opt_loop_vec_info loop_vinfo
+       = vect_analyze_loop_form (loop, shared);
       if (!loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "bad loop form.\n");
-         return NULL;
+         gcc_checking_assert (first_loop_vinfo == NULL);
+         return loop_vinfo;
        }
 
       bool fatal = false;
@@ -2327,14 +2376,32 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
       if (orig_loop_vinfo)
        LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
 
-      if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
+      opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
+      if (res)
        {
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
 
-         return loop_vinfo;
+         if (loop->simdlen
+             && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                          (unsigned HOST_WIDE_INT) loop->simdlen))
+           {
+             if (first_loop_vinfo == NULL)
+               {
+                 first_loop_vinfo = loop_vinfo;
+                 first_vector_size = current_vector_size;
+                 loop->aux = NULL;
+               }
+             else
+               delete loop_vinfo;
+           }
+         else
+           {
+             delete first_loop_vinfo;
+             return loop_vinfo;
+           }
        }
-
-      delete loop_vinfo;
+      else
+       delete loop_vinfo;
 
       if (next_size == 0)
        autodetected_vector_size = current_vector_size;
@@ -2343,10 +2410,31 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
          && known_eq (vector_sizes[next_size], autodetected_vector_size))
        next_size += 1;
 
-      if (fatal
-         || next_size == vector_sizes.length ()
+      if (fatal)
+       {
+         gcc_checking_assert (first_loop_vinfo == NULL);
+         return opt_loop_vec_info::propagate_failure (res);
+       }
+
+      if (next_size == vector_sizes.length ()
          || known_eq (current_vector_size, 0U))
-       return NULL;
+       {
+         if (first_loop_vinfo)
+           {
+             current_vector_size = first_vector_size;
+             loop->aux = (loop_vec_info) first_loop_vinfo;
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                  "***** Choosing vector size ");
+                 dump_dec (MSG_NOTE, current_vector_size);
+                 dump_printf (MSG_NOTE, "\n");
+               }
+             return first_loop_vinfo;
+           }
+         else
+           return opt_loop_vec_info::propagate_failure (res);
+       }
 
       /* Try the next biggest vector size.  */
       current_vector_size = vector_sizes[next_size++];
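A simplified sketch of the retry loop above (illustrative control flow only,
not code from the patch):

   /* for each candidate vector size:
        analyze the loop form and run vect_analyze_loop_2;
        on success:
          if loop->simdlen is unset or VF == loop->simdlen, return this loop_vinfo;
          otherwise remember the first successful loop_vinfo and keep trying;
        on a fatal failure, propagate it immediately;
      when all sizes are exhausted:
        if a loop_vinfo was remembered, restore its vector size and return it;
        otherwise propagate the last failure.  */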
@@ -2444,7 +2532,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
   stmt_vec_info stmt_vinfo = stmts[0];
   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
   tree scalar_type = TREE_TYPE (vector_type);
-  struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
+  class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
   gcc_assert (loop);
 
   switch (code)
@@ -2485,8 +2573,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
 static void
 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
 {
-  dump_printf_loc (msg_type, vect_location, "%s", msg);
-  dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
+  dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
 }
 
 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
@@ -2524,11 +2611,11 @@ static bool
 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
                       gimple *first_stmt)
 {
-  struct loop *loop = (gimple_bb (phi))->loop_father;
-  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
+  class loop *loop = (gimple_bb (phi))->loop_father;
+  class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
   enum tree_code code;
-  gimple *loop_use_stmt = NULL, *first, *next_stmt;
-  stmt_vec_info use_stmt_info, current_stmt_info = NULL;
+  gimple *loop_use_stmt = NULL;
+  stmt_vec_info use_stmt_info;
   tree lhs;
   imm_use_iterator imm_iter;
   use_operand_p use_p;
@@ -2538,6 +2625,7 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
   if (loop != vect_loop)
     return false;
 
+  auto_vec<stmt_vec_info, 8> reduc_chain;
   lhs = PHI_RESULT (phi);
   code = gimple_assign_rhs_code (first_stmt);
   while (1)
@@ -2590,17 +2678,9 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
 
       /* Insert USE_STMT into reduction chain.  */
       use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
-      if (current_stmt_info)
-        {
-         REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
-          REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
-            = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
-        }
-      else
-       REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
+      reduc_chain.safe_push (use_stmt_info);
 
       lhs = gimple_assign_lhs (loop_use_stmt);
-      current_stmt_info = use_stmt_info;
       size++;
    }
 
@@ -2610,9 +2690,9 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
   /* Swap the operands, if needed, to make the reduction operand be the second
      operand.  */
   lhs = PHI_RESULT (phi);
-  next_stmt = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
-  while (next_stmt)
+  for (unsigned i = 0; i < reduc_chain.length (); ++i)
     {
+      gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
       if (gimple_assign_rhs2 (next_stmt) == lhs)
        {
          tree op = gimple_assign_rhs1 (next_stmt);
@@ -2626,7 +2706,6 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
              && vect_valid_reduction_input_p (def_stmt_info))
            {
              lhs = gimple_assign_lhs (next_stmt);
-             next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
              continue;
            }
 
@@ -2645,10 +2724,8 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
              && vect_valid_reduction_input_p (def_stmt_info))
            {
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
-                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
-               }
+               dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
+                                next_stmt);
 
              swap_ssa_operands (next_stmt,
                                 gimple_assign_rhs1_ptr (next_stmt),
@@ -2663,13 +2740,20 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
         }
 
       lhs = gimple_assign_lhs (next_stmt);
-      next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
     }
 
+  /* Build up the actual chain.  */
+  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
+    {
+      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
+      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
+    }
+  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
+  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
+
   /* Save the chain for further analysis in SLP detection.  */
-  first = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
-  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
-  REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
+  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
+  REDUC_GROUP_SIZE (reduc_chain[0]) = size;
 
   return true;
 }
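With the auto_vec-based rewrite above, the REDUC_GROUP links are only wired up
once the whole chain has been validated.  For a chain of three statements
s1, s2, s3 pushed into reduc_chain, the resulting links are (sketch):

   /* REDUC_GROUP_FIRST_ELEMENT: s1 -> s1, s2 -> s1, s3 -> s1
      REDUC_GROUP_NEXT_ELEMENT:  s1 -> s2, s2 -> s3, s3 -> NULL  */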
@@ -2779,10 +2863,7 @@ pop:
       unsigned i;
       std::pair<ssa_op_iter, use_operand_p> *x;
       FOR_EACH_VEC_ELT (path, i, x)
-       {
-         dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
-         dump_printf (MSG_NOTE, " ");
-       }
+       dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
       dump_printf (MSG_NOTE, "\n");
     }
 
@@ -2870,13 +2951,13 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
                          enum vect_reduction_type *v_reduc_type)
 {
   gphi *phi = as_a <gphi *> (phi_info->stmt);
-  struct loop *loop = (gimple_bb (phi))->loop_father;
-  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
+  class loop *loop = (gimple_bb (phi))->loop_father;
+  class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
+  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
   gimple *phi_use_stmt = NULL;
   enum tree_code orig_code, code;
   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
   tree type;
-  int nloop_uses;
   tree name;
   imm_use_iterator imm_iter;
   use_operand_p use_p;
@@ -2892,7 +2973,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
      can be constant.  See PR60382.  */
   if (has_zero_uses (phi_name))
     return NULL;
-  nloop_uses = 0;
+  unsigned nphi_def_loop_uses = 0;
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
     {
       gimple *use_stmt = USE_STMT (use_p);
@@ -2908,15 +2989,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
-      nloop_uses++;
-      if (nloop_uses > 1)
-        {
-          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction value used in loop.\n");
-          return NULL;
-        }
-
+      nphi_def_loop_uses++;
       phi_use_stmt = use_stmt;
     }
 
@@ -2925,17 +2998,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
   if (TREE_CODE (loop_arg) != SSA_NAME)
     {
       if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "reduction: not ssa_name: ");
-         dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
-          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-       }
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "reduction: not ssa_name: %T\n", loop_arg);
       return NULL;
     }
 
   stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
-  if (!def_stmt_info)
+  if (!def_stmt_info
+      || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
     return NULL;
 
   if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
@@ -2951,36 +3021,45 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
   else
     {
       if (dump_enabled_p ())
-       {
-         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "reduction: unhandled reduction operation: ");
-         dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                           def_stmt_info->stmt, 0);
-       }
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "reduction: unhandled reduction operation: %G",
+                        def_stmt_info->stmt);
       return NULL;
     }
 
-  nloop_uses = 0;
+  unsigned nlatch_def_loop_uses = 0;
   auto_vec<gphi *, 3> lcphis;
+  bool inner_loop_of_double_reduc = false;
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
     {
       gimple *use_stmt = USE_STMT (use_p);
       if (is_gimple_debug (use_stmt))
        continue;
       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
-       nloop_uses++;
+       nlatch_def_loop_uses++;
       else
-       /* We can have more than one loop-closed PHI.  */
-       lcphis.safe_push (as_a <gphi *> (use_stmt));
-      if (nloop_uses > 1)
        {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction used in loop.\n");
-         return NULL;
+         /* We can have more than one loop-closed PHI.  */
+         lcphis.safe_push (as_a <gphi *> (use_stmt));
+         if (nested_in_vect_loop
+             && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
+                 == vect_double_reduction_def))
+           inner_loop_of_double_reduc = true;
        }
     }
 
+  /* If this isn't a nested cycle or if the nested cycle reduction value
+     is used outside of the inner loop, we cannot handle uses of the reduction
+     value.  */
+  if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
+      && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "reduction used in loop.\n");
+      return NULL;
+    }
+
   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
      defined in the inner loop.  */
   if (phi_def)
@@ -3004,6 +3083,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           && loop->inner
           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
           && is_gimple_assign (def1)
+         && is_a <gphi *> (phi_use_stmt)
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
         {
           if (dump_enabled_p ())
@@ -3038,9 +3118,31 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
     }
 
   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
-  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
   code = orig_code = gimple_assign_rhs_code (def_stmt);
 
+  if (nested_in_vect_loop && !check_reduction)
+    {
+      /* FIXME: Even for non-reductions code generation is funneled
+        through vectorizable_reduction for the stmt defining the
+        PHI latch value.  So we have to artificially restrict ourselves
+        to the supported operations.  */
+      switch (get_gimple_rhs_class (code))
+       {
+       case GIMPLE_BINARY_RHS:
+       case GIMPLE_TERNARY_RHS:
+         break;
+       default:
+         /* Not supported by vectorizable_reduction.  */
+         if (dump_enabled_p ())
+           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                           "nested cycle: not handled operation: ");
+         return NULL;
+       }
+      if (dump_enabled_p ())
+       report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
+      return def_stmt_info;
+    }
+
   /* We can handle "res -= x[i]", which is non-associative by
      simply rewriting this into "res += -x[i]".  Avoid changing
      gimple instruction for the first simple tests and only do this
@@ -3113,27 +3215,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       if (dump_enabled_p ())
         {
           dump_printf_loc (MSG_NOTE, vect_location,
-                          "reduction: multiple types: operation type: ");
-          dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
-          dump_printf (MSG_NOTE, ", operands types: ");
-          dump_generic_expr (MSG_NOTE, TDF_SLIM,
-                            TREE_TYPE (op1));
-          dump_printf (MSG_NOTE, ",");
-          dump_generic_expr (MSG_NOTE, TDF_SLIM,
-                            TREE_TYPE (op2));
+                          "reduction: multiple types: operation type: "
+                          "%T, operands types: %T,%T",
+                          type, TREE_TYPE (op1), TREE_TYPE (op2));
           if (op3)
-            {
-              dump_printf (MSG_NOTE, ",");
-              dump_generic_expr (MSG_NOTE, TDF_SLIM,
-                                TREE_TYPE (op3));
-            }
+           dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
 
           if (op4)
-            {
-              dump_printf (MSG_NOTE, ",");
-              dump_generic_expr (MSG_NOTE, TDF_SLIM,
-                                TREE_TYPE (op4));
-            }
+           dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
           dump_printf (MSG_NOTE, "\n");
         }
 
@@ -3174,6 +3263,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       && def2_info->stmt == phi
       && (code == COND_EXPR
          || !def1_info
+         || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
          || vect_valid_reduction_input_p (def1_info)))
     {
       if (dump_enabled_p ())
@@ -3185,6 +3275,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       && def1_info->stmt == phi
       && (code == COND_EXPR
          || !def2_info
+         || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
          || vect_valid_reduction_input_p (def2_info)))
     {
       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
@@ -3253,16 +3344,6 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       return def_stmt_info;
     }
 
-  /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
-  gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
-  while (first)
-    {
-      gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
-      REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
-      REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
-      first = next;
-    }
-
   /* Look for the expression computing loop_arg from loop PHI result.  */
   if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
     return def_stmt_info;
@@ -3323,8 +3404,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
          iterations are unknown, count a taken branch per peeled loop.  */
       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
                                 NULL, 0, vect_prologue);
-      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
-                                NULL, 0, vect_epilogue);
+      retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
+                                 NULL, 0, vect_epilogue);
     }
   else
     {
@@ -3342,24 +3423,16 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
   int j;
   if (peel_iters_prologue)
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
-       {
-         stmt_vec_info stmt_info
-           = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
-         retval += record_stmt_cost (prologue_cost_vec,
-                                     si->count * peel_iters_prologue,
-                                     si->kind, stmt_info, si->misalign,
-                                     vect_prologue);
-       }
+      retval += record_stmt_cost (prologue_cost_vec,
+                                 si->count * peel_iters_prologue,
+                                 si->kind, si->stmt_info, si->misalign,
+                                 vect_prologue);
   if (*peel_iters_epilogue)
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
-       {
-         stmt_vec_info stmt_info
-           = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
-         retval += record_stmt_cost (epilogue_cost_vec,
-                                     si->count * *peel_iters_epilogue,
-                                     si->kind, stmt_info, si->misalign,
-                                     vect_epilogue);
-       }
+      retval += record_stmt_cost (epilogue_cost_vec,
+                                 si->count * *peel_iters_epilogue,
+                                 si->kind, si->stmt_info, si->misalign,
+                                 vect_epilogue);
 
   return retval;
 }
@@ -3400,7 +3473,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
   /* Cost model disabled.  */
   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
-      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
       *ret_min_profitable_niters = 0;
       *ret_min_profitable_estimate = 0;
       return;
@@ -3413,9 +3487,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
                            vect_prologue);
-      dump_printf (MSG_NOTE,
-                   "cost model: Adding cost of checks for loop "
-                   "versioning to treat misalignment.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE,
+                    "cost model: Adding cost of checks for loop "
+                    "versioning to treat misalignment.\n");
     }
 
   /* Requires loop versioning with alias checks.  */
@@ -3442,9 +3517,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
          (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
                                NULL, 0, vect_prologue);
        }
-      dump_printf (MSG_NOTE,
-                   "cost model: Adding cost of checks for loop "
-                   "versioning aliasing.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE,
+                    "cost model: Adding cost of checks for loop "
+                    "versioning aliasing.\n");
     }
 
   /* Requires loop versioning with niter checks.  */
@@ -3453,9 +3529,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
       /*  FIXME: Make cost depend on complexity of individual check.  */
       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
                            vect_prologue);
-      dump_printf (MSG_NOTE,
-                  "cost model: Adding cost of checks for loop "
-                  "versioning niters.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE,
+                    "cost model: Adding cost of checks for loop "
+                    "versioning niters.\n");
     }
 
   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
@@ -3495,27 +3572,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
          int j;
          FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                            j, si)
-           {
-             struct _stmt_vec_info *stmt_info
-               = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
-             (void) add_stmt_cost (target_cost_data, si->count,
-                                   si->kind, stmt_info, si->misalign,
-                                   vect_epilogue);
-           }
+           (void) add_stmt_cost (target_cost_data, si->count,
+                                 si->kind, si->stmt_info, si->misalign,
+                                 vect_epilogue);
        }
     }
   else if (npeel < 0)
     {
       peel_iters_prologue = assumed_vf / 2;
-      dump_printf (MSG_NOTE, "cost model: "
-                   "prologue peel iters set to vf/2.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "prologue peel iters set to vf/2.\n");
 
       /* If peeling for alignment is unknown, loop bound of main loop becomes
          unknown.  */
       peel_iters_epilogue = assumed_vf / 2;
-      dump_printf (MSG_NOTE, "cost model: "
-                   "epilogue peel iters set to vf/2 because "
-                   "peeling for alignment is unknown.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "epilogue peel iters set to vf/2 because "
+                    "peeling for alignment is unknown.\n");
 
       /* If peeled iterations are unknown, count a taken branch and a not taken
          branch per peeled loop. Even if scalar loop iterations are known,
@@ -3533,15 +3608,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
       int j;
       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
        {
-         struct _stmt_vec_info *stmt_info
-           = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
          (void) add_stmt_cost (target_cost_data,
                                si->count * peel_iters_prologue,
-                               si->kind, stmt_info, si->misalign,
+                               si->kind, si->stmt_info, si->misalign,
                                vect_prologue);
          (void) add_stmt_cost (target_cost_data,
                                si->count * peel_iters_epilogue,
-                               si->kind, stmt_info, si->misalign,
+                               si->kind, si->stmt_info, si->misalign,
                                vect_epilogue);
        }
     }
@@ -3564,20 +3637,12 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                          &epilogue_cost_vec);
 
       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
-       {
-         struct _stmt_vec_info *stmt_info
-           = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
-         (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
-                               si->misalign, vect_prologue);
-       }
+       (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
+                             si->misalign, vect_prologue);
 
       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
-       {
-         struct _stmt_vec_info *stmt_info
-           = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
-         (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
-                               si->misalign, vect_epilogue);
-       }
+       (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
+                             si->misalign, vect_epilogue);
 
       prologue_cost_vec.release ();
       epilogue_cost_vec.release ();
@@ -3684,14 +3749,89 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
   /* Calculate number of iterations required to make the vector version
      profitable, relative to the loop bodies only.  The following condition
      must hold true:
-     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
+     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
      where
      SIC = scalar iteration cost, VIC = vector iteration cost,
      VOC = vector outside cost, VF = vectorization factor,
-     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
+     NPEEL = prologue iterations + epilogue iterations,
      SOC = scalar outside cost for run time cost model check.  */
 
-  if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
+  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
+                         - vec_inside_cost);
+  if (saving_per_viter <= 0)
+    {
+      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
+       warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
+                   "vectorization did not happen for a simd loop");
+
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "cost model: the vector iteration cost = %d "
+                        "divided by the scalar iteration cost = %d "
+                        "is greater than or equal to the vectorization factor = %d"
+                         ".\n",
+                        vec_inside_cost, scalar_single_iter_cost, assumed_vf);
+      *ret_min_profitable_niters = -1;
+      *ret_min_profitable_estimate = -1;
+      return;
+    }
+
+  /* ??? The "if" arm is written to handle all cases; see below for what
+     we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* Rewriting the condition above in terms of the number of
+        vector iterations (vniters) rather than the number of
+        scalar iterations (niters) gives:
+
+        SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
+
+        <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
+
+        For integer N, X and Y when X > 0:
+
+        N * X > Y <==> N >= (Y /[floor] X) + 1.  */
+      int outside_overhead = (vec_outside_cost
+                             - scalar_single_iter_cost * peel_iters_prologue
+                             - scalar_single_iter_cost * peel_iters_epilogue
+                             - scalar_outside_cost);
+      /* We're only interested in cases that require at least one
+        vector iteration.  */
+      int min_vec_niters = 1;
+      if (outside_overhead > 0)
+       min_vec_niters = outside_overhead / saving_per_viter + 1;
+
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
+                    min_vec_niters);
+
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         /* Now that we know the minimum number of vector iterations,
+            find the minimum niters for which the scalar cost is larger:
+
+            SIC * niters > VIC * vniters + VOC - SOC
+
+            We know that the minimum niters is no more than
+            vniters * VF + NPEEL, but it might be (and often is) less
+            than that if a partial vector iteration is cheaper than the
+            equivalent scalar code.  */
+         int threshold = (vec_inside_cost * min_vec_niters
+                          + vec_outside_cost
+                          - scalar_outside_cost);
+         if (threshold <= 0)
+           min_profitable_iters = 1;
+         else
+           min_profitable_iters = threshold / scalar_single_iter_cost + 1;
+       }
+      else
+       /* Convert the number of vector iterations into a number of
+          scalar iterations.  */
+       min_profitable_iters = (min_vec_niters * assumed_vf
+                               + peel_iters_prologue
+                               + peel_iters_epilogue);
+    }
+  else
     {
       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
                              * assumed_vf
@@ -3701,8 +3841,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
         min_profitable_iters = 0;
       else
        {
-         min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
-                                  - vec_inside_cost);
+         min_profitable_iters /= saving_per_viter;
 
          if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
              <= (((int) vec_inside_cost * min_profitable_iters)
@@ -3711,28 +3850,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
            min_profitable_iters++;
        }
     }
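A worked example of the fully-masked branch above, using illustrative costs
that are not taken from the patch: with SIC = 4, VIC = 10, VF = 4, VOC = 30,
SOC = 0 and no peeling,

   saving_per_viter     = 4 * 4 - 10       = 6
   outside_overhead     = 30 - 0 - 0 - 0   = 30
   min_vec_niters       = 30 / 6 + 1       = 6
   threshold            = 10 * 6 + 30 - 0  = 90
   min_profitable_iters = 90 / 4 + 1       = 23

i.e. 23 is the smallest niters for which the scalar cost (4 * 23 = 92) exceeds
the cost of the corresponding masked vector loop (10 * 6 + 30 = 90).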
-  /* vector version will never be profitable.  */
-  else
-    {
-      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
-       warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
-                   "vectorization did not happen for a simd loop");
-
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "cost model: the vector iteration cost = %d "
-                        "divided by the scalar iteration cost = %d "
-                        "is greater or equal to the vectorization factor = %d"
-                         ".\n",
-                        vec_inside_cost, scalar_single_iter_cost, assumed_vf);
-      *ret_min_profitable_niters = -1;
-      *ret_min_profitable_estimate = -1;
-      return;
-    }
 
-  dump_printf (MSG_NOTE,
-              "  Calculated minimum iters for profitability: %d\n",
-              min_profitable_iters);
+  if (dump_enabled_p ())
+    dump_printf (MSG_NOTE,
+                "  Calculated minimum iters for profitability: %d\n",
+                min_profitable_iters);
 
   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
@@ -3751,10 +3873,34 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
      Non-vectorized variant is SIC * niters and it must win over vector
      variant on the expected loop trip count.  The following condition must hold true:
-     SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
+     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
 
   if (vec_outside_cost <= 0)
     min_profitable_estimate = 0;
+  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* This is a repeat of the code above, but with + SOC rather
+        than - SOC.  */
+      int outside_overhead = (vec_outside_cost
+                             - scalar_single_iter_cost * peel_iters_prologue
+                             - scalar_single_iter_cost * peel_iters_epilogue
+                             + scalar_outside_cost);
+      int min_vec_niters = 1;
+      if (outside_overhead > 0)
+       min_vec_niters = outside_overhead / saving_per_viter + 1;
+
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         int threshold = (vec_inside_cost * min_vec_niters
+                          + vec_outside_cost
+                          + scalar_outside_cost);
+         min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
+       }
+      else
+       min_profitable_estimate = (min_vec_niters * assumed_vf
+                                  + peel_iters_prologue
+                                  + peel_iters_epilogue);
+    }
   else
     {
       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
@@ -3831,7 +3977,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
   tree vectype;
   machine_mode mode;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = NULL;
+  class loop *loop = NULL;
 
   if (loop_vinfo)
     loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -3844,10 +3990,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
 
   vectype = STMT_VINFO_VECTYPE (stmt_info);
   mode = TYPE_MODE (vectype);
-  stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
-
-  if (!orig_stmt_info)
-    orig_stmt_info = stmt_info;
+  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
 
   code = gimple_assign_rhs_code (orig_stmt_info->stmt);
 
@@ -4020,15 +4163,15 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
 /* Function get_initial_def_for_reduction
 
    Input:
-   STMT - a stmt that performs a reduction operation in the loop.
+   STMT_VINFO - a stmt that performs a reduction operation in the loop.
    INIT_VAL - the initial value of the reduction variable
 
    Output:
    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
         of the reduction (used for adjusting the epilog - see below).
-   Return a vector variable, initialized according to the operation that STMT
-        performs. This vector will be used as the initial value of the
-        vector of partial results.
+   Return a vector variable, initialized according to the operation that
+       STMT_VINFO performs. This vector will be used as the initial value
+       of the vector of partial results.
 
    Option1 (adjust in epilog): Initialize the vector as follows:
      add/bit or/xor:    [0,0,...,0,0]
@@ -4049,7 +4192,7 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
    for (i=0;i<n;i++)
      s = s + a[i];
 
-   STMT is 's = s + a[i]', and the reduction variable is 's'.
+   STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
    For a vector of 4 units, we want to return either [0,0,0,init_val],
    or [0,0,0,0] and let the caller know that it needs to adjust
    the result at the end by 'init_val'.
@@ -4061,15 +4204,14 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
    A cost model should help decide between these two schemes.  */
 
 tree
-get_initial_def_for_reduction (gimple *stmt, tree init_val,
+get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
                                tree *adjustment_def)
 {
-  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (scalar_type);
-  enum tree_code code = gimple_assign_rhs_code (stmt);
+  enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
   tree def_for_init;
   tree init_def;
   REAL_VALUE_TYPE real_init_val = dconst0;
@@ -4081,8 +4223,8 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
              || SCALAR_FLOAT_TYPE_P (scalar_type));
 
-  gcc_assert (nested_in_vect_loop_p (loop, stmt)
-             || loop == (gimple_bb (stmt))->loop_father);
+  gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
+             || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
 
   vect_reduction_type reduction_type
     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
@@ -4151,7 +4293,7 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
            if (reduction_type != COND_REDUCTION
                && reduction_type != EXTRACT_LAST_REDUCTION)
              {
-               init_def = vect_get_vec_def_for_operand (init_val, stmt);
+               init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
                break;
              }
          }
@@ -4185,14 +4327,9 @@ get_initial_defs_for_reduction (slp_tree slp_node,
   unsigned HOST_WIDE_INT nunits;
   unsigned j, number_of_places_left_in_vector;
   tree vector_type;
-  tree vop;
-  int group_size = stmts.length ();
-  unsigned int vec_num, i;
-  unsigned number_of_copies = 1;
-  vec<tree> voprnds;
-  voprnds.create (number_of_vectors);
-  struct loop *loop;
-  auto_vec<tree, 16> permute_results;
+  unsigned int group_size = stmts.length ();
+  unsigned int i;
+  class loop *loop;
 
   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
 
@@ -4223,116 +4360,76 @@ get_initial_defs_for_reduction (slp_tree slp_node,
   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
     nunits = group_size;
 
-  number_of_copies = nunits * number_of_vectors / group_size;
-
   number_of_places_left_in_vector = nunits;
   bool constant_p = true;
   tree_vector_builder elts (vector_type, nunits, 1);
   elts.quick_grow (nunits);
-  for (j = 0; j < number_of_copies; j++)
+  gimple_seq ctor_seq = NULL;
+  for (j = 0; j < nunits * number_of_vectors; ++j)
     {
-      for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
-        {
-         tree op;
-         /* Get the def before the loop.  In reduction chain we have only
-            one initial value.  */
-         if ((j != (number_of_copies - 1)
-              || (reduc_chain && i != 0))
-             && neutral_op)
-           op = neutral_op;
-         else
-           op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
-
-          /* Create 'vect_ = {op0,op1,...,opn}'.  */
-          number_of_places_left_in_vector--;
-         elts[number_of_places_left_in_vector] = op;
-         if (!CONSTANT_CLASS_P (op))
-           constant_p = false;
+      tree op;
+      i = j % group_size;
+      stmt_vinfo = stmts[i];
 
-          if (number_of_places_left_in_vector == 0)
-            {
-             gimple_seq ctor_seq = NULL;
-             tree init;
-             if (constant_p && !neutral_op
-                 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
-                 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
-               /* Build the vector directly from ELTS.  */
-               init = gimple_build_vector (&ctor_seq, &elts);
-             else if (neutral_op)
-               {
-                 /* Build a vector of the neutral value and shift the
-                    other elements into place.  */
-                 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
-                                                      neutral_op);
-                 int k = nunits;
-                 while (k > 0 && elts[k - 1] == neutral_op)
-                   k -= 1;
-                 while (k > 0)
-                   {
-                     k -= 1;
-                     init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
-                                          vector_type, init, elts[k]);
-                   }
-               }
-             else
+      /* Get the def before the loop.  In a reduction chain we have only
+        one initial value.  Otherwise we have as many initial values as
+        there are PHIs in the group.  */
+      if (reduc_chain)
+       op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
+      else if (((vec_oprnds->length () + 1) * nunits
+               - number_of_places_left_in_vector >= group_size)
+              && neutral_op)
+       op = neutral_op;
+      else
+       op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
+
+      /* Create 'vect_ = {op0,op1,...,opn}'.  */
+      number_of_places_left_in_vector--;
+      elts[nunits - number_of_places_left_in_vector - 1] = op;
+      if (!CONSTANT_CLASS_P (op))
+       constant_p = false;
+
+      if (number_of_places_left_in_vector == 0)
+       {
+         tree init;
+         if (constant_p && !neutral_op
+             ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
+             : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
+           /* Build the vector directly from ELTS.  */
+           init = gimple_build_vector (&ctor_seq, &elts);
+         else if (neutral_op)
+           {
+             /* Build a vector of the neutral value and shift the
+                other elements into place.  */
+             init = gimple_build_vector_from_val (&ctor_seq, vector_type,
+                                                  neutral_op);
+             int k = nunits;
+             while (k > 0 && elts[k - 1] == neutral_op)
+               k -= 1;
+             while (k > 0)
                {
-                 /* First time round, duplicate ELTS to fill the
-                    required number of vectors, then cherry pick the
-                    appropriate result for each iteration.  */
-                 if (vec_oprnds->is_empty ())
-                   duplicate_and_interleave (&ctor_seq, vector_type, elts,
-                                             number_of_vectors,
-                                             permute_results);
-                 init = permute_results[number_of_vectors - j - 1];
+                 k -= 1;
+                 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
+                                      vector_type, init, elts[k]);
                }
-             if (ctor_seq != NULL)
-               gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
-             voprnds.quick_push (init);
-
-              number_of_places_left_in_vector = nunits;
-             elts.new_vector (vector_type, nunits, 1);
-             elts.quick_grow (nunits);
-             constant_p = true;
-            }
-        }
-    }
-
-  /* Since the vectors are created in the reverse order, we should invert
-     them.  */
-  vec_num = voprnds.length ();
-  for (j = vec_num; j != 0; j--)
-    {
-      vop = voprnds[j - 1];
-      vec_oprnds->quick_push (vop);
-    }
-
-  voprnds.release ();
-
-  /* In case that VF is greater than the unrolling factor needed for the SLP
-     group of stmts, NUMBER_OF_VECTORS to be created is greater than
-     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
-     to replicate the vectors.  */
-  tree neutral_vec = NULL;
-  while (number_of_vectors > vec_oprnds->length ())
-    {
-      if (neutral_op)
-        {
-          if (!neutral_vec)
+           }
+         else
            {
-             gimple_seq ctor_seq = NULL;
-             neutral_vec = gimple_build_vector_from_val
-               (&ctor_seq, vector_type, neutral_op);
-             if (ctor_seq != NULL)
-               gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
+             /* First time round, duplicate ELTS to fill the
+                required number of vectors.  */
+             duplicate_and_interleave (&ctor_seq, vector_type, elts,
+                                       number_of_vectors, *vec_oprnds);
+             break;
            }
-          vec_oprnds->quick_push (neutral_vec);
-        }
-      else
-        {
-          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
-            vec_oprnds->quick_push (vop);
-        }
+         vec_oprnds->quick_push (init);
+
+         number_of_places_left_in_vector = nunits;
+         elts.new_vector (vector_type, nunits, 1);
+         elts.quick_grow (nunits);
+         constant_p = true;
+       }
     }
+  if (ctor_seq != NULL)
+    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
 }
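A rough illustration of the rewritten filling loop above (made-up example, not
from the patch): for a non-chained SLP reduction over two PHIs with initial
values init0 and init1, a four-element vector type, a neutral value of 0 and
two vectors requested, the initial defs come out roughly as

   /* vec_oprnds[0] = { init0, init1, 0, 0 }   -- each initial value used once
      vec_oprnds[1] = { 0, 0, 0, 0 }           -- remainder filled with the neutral value  */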
 
 
@@ -4343,7 +4440,7 @@ get_initial_defs_for_reduction (slp_tree slp_node,
   
    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector 
      reduction statements. 
-   STMT is the scalar reduction stmt that is being vectorized.
+   STMT_INFO is the scalar reduction stmt that is being vectorized.
    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
      number of elements that we can fit in a vectype (nunits).  In this case
      we have to generate more than one vector stmt - i.e - we need to "unroll"
@@ -4356,7 +4453,7 @@ get_initial_defs_for_reduction (slp_tree slp_node,
      statement that is defined by REDUCTION_PHI.
    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
    SLP_NODE is an SLP node containing a group of reduction statements. The 
-     first one in this group is STMT.
+     first one in this group is STMT_INFO.
    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
@@ -4381,8 +4478,8 @@ get_initial_defs_for_reduction (slp_tree slp_node,
 
         loop:
           vec_def = phi <null, null>            # REDUCTION_PHI
-          VECT_DEF = vector_stmt                # vectorized form of STMT
-          s_loop = scalar_stmt                  # (scalar) STMT
+          VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
+          s_loop = scalar_stmt                  # (scalar) STMT_INFO
         loop_exit:
           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
           use <s_out0>
@@ -4392,8 +4489,8 @@ get_initial_defs_for_reduction (slp_tree slp_node,
 
         loop:
           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
-          VECT_DEF = vector_stmt                # vectorized form of STMT
-          s_loop = scalar_stmt                  # (scalar) STMT
+          VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
+          s_loop = scalar_stmt                  # (scalar) STMT_INFO
         loop_exit:
           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
@@ -4405,7 +4502,8 @@ get_initial_defs_for_reduction (slp_tree slp_node,
 */
 
 static void
-vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
+vect_create_epilog_for_reduction (vec<tree> vect_defs,
+                                 stmt_vec_info stmt_info,
                                  gimple *reduc_def_stmt,
                                  int ncopies, internal_fn reduc_fn,
                                  vec<stmt_vec_info> reduction_phis,
@@ -4415,12 +4513,11 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
                                  tree induc_val, enum tree_code induc_code,
                                  tree neutral_op)
 {
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   stmt_vec_info prev_phi_info;
   tree vectype;
   machine_mode mode;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
   basic_block exit_bb;
   tree scalar_dest;
   tree scalar_type;
@@ -4430,7 +4527,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
   tree vec_dest;
   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
   gimple *epilog_stmt = NULL;
-  enum tree_code code = gimple_assign_rhs_code (stmt);
+  enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
   gimple *exit_phi;
   tree bitsize;
   tree adjustment_def = NULL;
@@ -4444,7 +4541,6 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
   bool nested_in_vect_loop = false;
   auto_vec<gimple *> new_phis;
   auto_vec<stmt_vec_info> inner_phis;
-  enum vect_def_type dt = vect_unknown_def_type;
   int j, i;
   auto_vec<tree> scalar_results;
   unsigned int group_size = 1, k, ratio;
@@ -4459,7 +4555,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
   if (slp_node)
     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 
 
-  if (nested_in_vect_loop_p (loop, stmt))
+  if (nested_in_vect_loop_p (loop, stmt_info))
     {
       outer_loop = loop;
       loop = loop->inner;
@@ -4528,11 +4624,13 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
          /* Do not use an adjustment def as that case is not supported
             correctly if ncopies is not one.  */
          vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
-         vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
+         vec_initial_def = vect_get_vec_def_for_operand (initial_def,
+                                                         stmt_info);
        }
       else
-       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
-                                                        &adjustment_def);
+       vec_initial_def
+         = get_initial_def_for_reduction (stmt_info, initial_def,
+                                          &adjustment_def);
       vec_initial_defs.create (1);
       vec_initial_defs.quick_push (vec_initial_def);
     }
@@ -4549,8 +4647,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
              phi_info = STMT_VINFO_RELATED_STMT (phi_info);
              if (nested_in_vect_loop)
                vec_init_def
-                 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
-                                                   vec_init_def);
+                 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
            }
 
          /* Set the loop-entry arg of the reduction-phi.  */
@@ -4577,17 +4674,14 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
 
           /* Set the loop-latch arg for the reduction-phi.  */
           if (j > 0)
-            def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
+           def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
 
          add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
 
           if (dump_enabled_p ())
-            {
-              dump_printf_loc (MSG_NOTE, vect_location,
-                              "transform reduction: created def-use cycle: ");
-              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
-              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
-            }
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "transform reduction: created def-use cycle: %G%G",
+                            phi, SSA_NAME_DEF_STMT (def));
         }
     }
 
@@ -4718,7 +4812,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
             new_phis.quick_push (phi);
           else
            {
-             def = vect_get_vec_def_for_stmt_copy (dt, def);
+             def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
              STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
            }
 
@@ -4769,13 +4863,8 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
          Otherwise (it is a regular reduction) - the tree-code and scalar-def
          are taken from STMT.  */
 
-  stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
-  if (!orig_stmt_info)
-    {
-      /* Regular reduction  */
-      orig_stmt_info = stmt_info;
-    }
-  else
+  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
+  if (orig_stmt_info != stmt_info)
     {
       /* Reduction pattern  */
       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
@@ -4808,7 +4897,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
      # b1 = phi <b2, b0>
      a2 = operation (a1)
      b2 = operation (b1)  */
-  slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
+  slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 
   /* True if we should implement SLP_REDUC using native reduction operations
      instead of scalar operations.  */
@@ -4823,7 +4912,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
 
      we may end up with more than one vector result.  Here we reduce them to
      one vector.  */
-  if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
+  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
     {
       tree first_vect = PHI_RESULT (new_phis[0]);
       gassign *new_vec_stmt = NULL;
@@ -4853,11 +4942,11 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
       tree first_vect = PHI_RESULT (new_phis[0]);
       gassign *new_vec_stmt = NULL;
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
-      gimple *next_phi = new_phis[0];
+      stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
       for (int k = 1; k < ncopies; ++k)
        {
-         next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
-         tree second_vect = PHI_RESULT (next_phi);
+         next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
+         tree second_vect = PHI_RESULT (next_phi_info->stmt);
           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
           new_vec_stmt = gimple_build_assign (tem, code,
                                              first_vect, second_vect);
@@ -5023,7 +5112,6 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
          if (off != 0)
            {
              tree new_idx_val = idx_val;
-             tree new_val = val;
              if (off != v_size - el_size)
                {
                  new_idx_val = make_ssa_name (idx_eltype);
@@ -5032,7 +5120,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
                                                     old_idx_val);
                  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
                }
-             new_val = make_ssa_name (data_eltype);
+             tree new_val = make_ssa_name (data_eltype);
              epilog_stmt = gimple_build_assign (new_val,
                                                 COND_EXPR,
                                                 build2 (GT_EXPR,
@@ -5202,6 +5290,9 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
          if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
              == INTEGER_INDUC_COND_REDUCTION)
            code = induc_code;
+         else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
+                  == CONST_COND_REDUCTION)
+           code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
          else
            code = MAX_EXPR;
        }
@@ -5210,14 +5301,13 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
         in a vector mode of smaller size and first reduce upper/lower
         halves against each other.  */
       enum machine_mode mode1 = mode;
-      tree vectype1 = vectype;
       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
       unsigned sz1 = sz;
       if (!slp_reduc
          && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
        sz1 = GET_MODE_SIZE (mode1).to_constant ();
 
-      vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
+      tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
       reduce_with_shift = have_whole_vector_shift (mode1);
       if (!VECTOR_MODE_P (mode1))
        reduce_with_shift = false;
@@ -5338,7 +5428,6 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
             dump_printf_loc (MSG_NOTE, vect_location,
                             "Reduce using vector shifts\n");
 
-         mode1 = TYPE_MODE (vectype1);
           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
           for (elt_offset = nelements / 2;
                elt_offset >= 1;
@@ -5568,14 +5657,10 @@ vect_finalize_reduction:
      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
      LHS of the last stmt in the reduction chain, since we are looking for
      the loop exit phi node.  */
-  if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
+  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
     {
       stmt_vec_info dest_stmt_info
-       = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
-      /* Handle reduction patterns.  */
-      if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
-       dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
-
+       = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
       scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
       group_size = 1;
     }
@@ -5594,11 +5679,12 @@ vect_finalize_reduction:
   else
     ratio = 1;
 
+  stmt_vec_info epilog_stmt_info = NULL;
   for (k = 0; k < group_size; k++)
     {
       if (k % ratio == 0)
         {
-          epilog_stmt = new_phis[k / ratio];
+         epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
          reduction_phi_info = reduction_phis[k / ratio];
          if (double_reduc)
            inner_phi = inner_phis[k / ratio];
@@ -5634,18 +5720,10 @@ vect_finalize_reduction:
                = loop_vinfo->lookup_stmt (exit_phi);
               gphi *vect_phi;
 
-              /* FORNOW. Currently not supporting the case that an inner-loop
-                 reduction is not used in the outer-loop (but only outside the
-                 outer-loop), unless it is double reduction.  */
-              gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
-                           && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
-                          || double_reduc);
-
              if (double_reduc)
                STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
              else
-               STMT_VINFO_VEC_STMT (exit_phi_vinfo)
-                 = vinfo_for_stmt (epilog_stmt);
+               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
               if (!double_reduc
                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
                       != vect_double_reduction_def)
@@ -5700,7 +5778,7 @@ vect_finalize_reduction:
                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
                                              loop_preheader_edge (outer_loop));
                   vect_phi_init = get_initial_def_for_reduction
-                   (stmt, preheader_arg, NULL);
+                   (stmt_info, preheader_arg, NULL);
 
                   /* Update phi node arguments with vs0 and vs2.  */
                   add_phi_arg (vect_phi, vect_phi_init,
@@ -5709,11 +5787,9 @@ vect_finalize_reduction:
                  add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
                               loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
                   if (dump_enabled_p ())
-                    {
-                      dump_printf_loc (MSG_NOTE, vect_location,
-                                      "created double reduction phi node: ");
-                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
-                    }
+                   dump_printf_loc (MSG_NOTE, vect_location,
+                                    "created double reduction phi node: %G",
+                                    vect_phi);
 
                   vect_phi_res = PHI_RESULT (vect_phi);
 
@@ -5836,9 +5912,33 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
   return lhs;
 }
 
-/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
+/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
+   type of the vector input.  */
+
+static internal_fn
+get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
+{
+  internal_fn mask_reduc_fn;
+
+  switch (reduc_fn)
+    {
+    case IFN_FOLD_LEFT_PLUS:
+      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
+      break;
+
+    default:
+      return IFN_LAST;
+    }
+
+  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
+                                     OPTIMIZE_FOR_SPEED))
+    return mask_reduc_fn;
+  return IFN_LAST;
+}
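IFN_MASK_FOLD_LEFT_PLUS is assumed to behave like IFN_FOLD_LEFT_PLUS except that inactive lanes are skipped, which is why the merge-with-identity step can be bypassed further down when the target provides the masked form.  A scalar sketch of that assumed semantics (the function name and array representation are illustrative only):

/* Illustrative model only, not part of the patch.  */
#include <stddef.h>

static double
model_mask_fold_left_plus (double acc, const double *vec,
                           const unsigned char *mask, size_t nunits)
{
  /* Lanes are accumulated strictly in order (this is what makes it an
     in-order, FOLD_LEFT reduction); inactive lanes contribute nothing.  */
  for (size_t i = 0; i < nunits; ++i)
    if (mask[i])
      acc += vec[i];
  return acc;
}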
+
+/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
-   statement.  CODE is the operation performed by STMT and OPS are
+   statement.  CODE is the operation performed by STMT_INFO and OPS are
    its scalar operands.  REDUC_INDEX is the index of the operand in
    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
    implements in-order reduction, or IFN_LAST if we should open-code it.
@@ -5846,18 +5946,19 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
    that should be used to control the operation in a fully-masked loop.  */
 
 static bool
-vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
+vectorize_fold_left_reduction (stmt_vec_info stmt_info,
+                              gimple_stmt_iterator *gsi,
                               stmt_vec_info *vec_stmt, slp_tree slp_node,
                               gimple *reduc_def_stmt,
                               tree_code code, internal_fn reduc_fn,
                               tree ops[3], tree vectype_in,
                               int reduc_index, vec_loop_masks *masks)
 {
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   stmt_vec_info new_stmt_info = NULL;
+  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
 
   int ncopies;
   if (slp_node)
@@ -5865,7 +5966,7 @@ vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   else
     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
 
-  gcc_assert (!nested_in_vect_loop_p (loop, stmt));
+  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
   gcc_assert (ncopies == 1);
   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
@@ -5883,13 +5984,20 @@ vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   auto_vec<tree> vec_oprnds0;
   if (slp_node)
     {
-      vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
+      auto_vec<vec<tree> > vec_defs (2);
+      auto_vec<tree> sops(2);
+      sops.quick_push (ops[0]);
+      sops.quick_push (ops[1]);
+      vect_get_slp_defs (sops, slp_node, &vec_defs);
+      vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
+      vec_defs[0].release ();
+      vec_defs[1].release ();
       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
     }
   else
     {
-      tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
+      tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
       vec_oprnds0.create (1);
       vec_oprnds0.quick_push (loop_vec_def0);
       scalar_dest_def_info = stmt_info;
@@ -5927,16 +6035,21 @@ vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
          def0 = negated;
        }
 
-      if (mask)
+      if (mask && mask_reduc_fn == IFN_LAST)
        def0 = merge_with_identity (gsi, mask, vectype_out, def0,
                                    vector_identity);
 
       /* On the first iteration the input is simply the scalar phi
         result, and for subsequent iterations it is the output of
         the preceding operation.  */
-      if (reduc_fn != IFN_LAST)
+      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
        {
-         new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
+         if (mask && mask_reduc_fn != IFN_LAST)
+           new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
+                                                  def0, mask);
+         else
+           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
+                                                  def0);
          /* For chained SLP reductions the output of the previous reduction
             operation serves as the input of the next. For the final statement
             the output cannot be a temporary - we reuse the original
@@ -5956,7 +6069,7 @@ vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
          /* Remove the statement, so that we can use the same code paths
             as for statements that we've just created.  */
          gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
-         gsi_remove (&tmp_gsi, false);
+         gsi_remove (&tmp_gsi, true);
        }
 
       if (i == vec_num - 1)
@@ -5981,16 +6094,16 @@ vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
 /* Function is_nonwrapping_integer_induction.
 
-   Check if STMT (which is part of loop LOOP) both increments and
+   Check if STMT_VINFO (which is part of loop LOOP) both increments and

    does not cause overflow.  */
 
 static bool
-is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
+is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
 {
-  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
+  gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
-  tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
+  tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
   widest_int ni, max_loop_value, lhs_max;
   wi::overflow_type overflow = wi::OVF_NONE;
 
@@ -6021,19 +6134,81 @@ is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
          <= TYPE_PRECISION (lhs_type));
 }
 
+/* Check if masking can be supported by inserting a conditional expression.
+   CODE is the code for the operation.  COND_FN is the conditional internal
+   function, if it exists.  VECTYPE_IN is the type of the vector input.  */
+static bool
+use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
+                        tree vectype_in)
+{
+  if (cond_fn != IFN_LAST
+      && direct_internal_fn_supported_p (cond_fn, vectype_in,
+                                        OPTIMIZE_FOR_SPEED))
+    return false;
+
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+    case SAD_EXPR:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* Insert a conditional expression to enable masked vectorization.  CODE is the
+   code for the operation.  VOP is the array of operands.  MASK is the loop
+   mask.  GSI is a statement iterator used to place the new conditional
+   expression.  */
+static void
+build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
+                     gimple_stmt_iterator *gsi)
+{
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+      {
+       tree vectype = TREE_TYPE (vop[1]);
+       tree zero = build_zero_cst (vectype);
+       tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+       gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+                                              mask, vop[1], zero);
+       gsi_insert_before (gsi, select, GSI_SAME_STMT);
+       vop[1] = masked_op1;
+       break;
+      }
+
+    case SAD_EXPR:
+      {
+       tree vectype = TREE_TYPE (vop[1]);
+       tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+       gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+                                              mask, vop[1], vop[0]);
+       gsi_insert_before (gsi, select, GSI_SAME_STMT);
+       vop[1] = masked_op1;
+       break;
+      }
+
+    default:
+      gcc_unreachable ();
+    }
+}
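The choice of the VEC_COND_EXPR else-value above is what makes this work: for DOT_PROD_EXPR an inactive lane of operand 1 becomes zero, so its product contributes nothing, while for SAD_EXPR it becomes operand 0 itself, so the lane's absolute difference is zero.  A per-lane scalar sketch (helper names are illustrative assumptions, not vectorizer APIs):

/* Illustrative model only, not part of the patch.  */
static int
model_masked_dot_prod_lane (int acc, signed char op0, signed char op1,
                            int lane_active)
{
  signed char sel1 = lane_active ? op1 : 0;      /* VEC_COND_EXPR <mask, op1, 0> */
  return acc + (int) op0 * (int) sel1;           /* inactive lane adds 0 */
}

static int
model_masked_sad_lane (int acc, unsigned char op0, unsigned char op1,
                       int lane_active)
{
  unsigned char sel1 = lane_active ? op1 : op0;  /* VEC_COND_EXPR <mask, op1, op0> */
  int diff = (int) op0 - (int) sel1;             /* |op0 - op0| == 0 when inactive */
  return acc + (diff < 0 ? -diff : diff);
}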
+
 /* Function vectorizable_reduction.
 
-   Check if STMT performs a reduction operation that can be vectorized.
-   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
+   Check if STMT_INFO performs a reduction operation that can be vectorized.
+   If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
-   Return FALSE if not a vectorizable STMT, TRUE otherwise.
+   Return true if STMT_INFO is vectorizable in this way.
 
    This function also handles reduction idioms (patterns) that have been
-   recognized in advance during vect_pattern_recog.  In this case, STMT may be
-   of this form:
+   recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
+   may be of this form:
      X = pattern_expr (arg0, arg1, ..., X)
-   and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
-   sequence that had been detected and replaced by the pattern-stmt (STMT).
+   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
+   sequence that had been detected and replaced by the pattern-stmt
+   (STMT_INFO).
 
    This function also handles reduction of condition expressions, for example:
      for (int i = 0; i < N; i++)
@@ -6045,9 +6220,9 @@ is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
    index into the vector of results.
 
    In some cases of reduction patterns, the type of the reduction variable X is
-   different than the type of the other arguments of STMT.
-   In such cases, the vectype that is used when transforming STMT into a vector
-   stmt is different than the vectype that is used to determine the
+   different than the type of the other arguments of STMT_INFO.
+   In such cases, the vectype that is used when transforming STMT_INFO into
+   a vector stmt is different than the vectype that is used to determine the
    vectorization factor, because it consists of a different number of elements
    than the actual number of elements that are being operated upon in parallel.
 
@@ -6071,18 +6246,17 @@ is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
    does *NOT* necessarily hold for reduction patterns.  */
 
 bool
-vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
+vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                        stmt_vec_info *vec_stmt, slp_tree slp_node,
                        slp_instance slp_node_instance,
                        stmt_vector_for_cost *cost_vec)
 {
   tree vec_dest;
   tree scalar_dest;
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   tree vectype_in = NULL_TREE;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   enum tree_code code, orig_code;
   internal_fn reduc_fn;
   machine_mode vec_mode;
@@ -6090,7 +6264,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   optab optab;
   tree new_temp = NULL_TREE;
   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
-  gimple *cond_reduc_def_stmt = NULL;
+  stmt_vec_info cond_stmt_vinfo = NULL;
   enum tree_code cond_reduc_op_code = ERROR_MARK;
   tree scalar_type;
   bool is_simple_use;
@@ -6106,7 +6280,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   bool nested_cycle = false, found_nested_cycle_def = false;
   bool double_reduc = false;
   basic_block def_bb;
-  struct loop * def_stmt_loop;
+  class loop * def_stmt_loop;
   tree def_arg;
   auto_vec<tree> vec_oprnds0;
   auto_vec<tree> vec_oprnds1;
@@ -6119,22 +6293,23 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   tree cond_reduc_val = NULL_TREE;
 
   /* Make sure it was already recognized as a reduction computation.  */
-  if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
-      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
+      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
     return false;
 
-  if (nested_in_vect_loop_p (loop, stmt))
+  if (nested_in_vect_loop_p (loop, stmt_info))
     {
       loop = loop->inner;
       nested_cycle = true;
     }
 
   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
-    gcc_assert (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt);
+    gcc_assert (slp_node
+               && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
 
-  if (gimple_code (stmt) == GIMPLE_PHI)
+  if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
     {
-      tree phi_result = gimple_phi_result (stmt);
+      tree phi_result = gimple_phi_result (phi);
       /* Analysis is fully done on the reduction stmt invocation.  */
       if (! vec_stmt)
        {
@@ -6152,8 +6327,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
        return true;
 
       stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
-      if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
-       reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
+      reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
 
       if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
          == EXTRACT_LAST_REDUCTION)
@@ -6161,13 +6335,17 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
        return true;
 
       gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
+      code = gimple_assign_rhs_code (reduc_stmt);
       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
        {
          tree op = gimple_op (reduc_stmt, k);
-         if (op == gimple_phi_result (stmt))
+         if (op == phi_result)
+           continue;
+         if (k == 1 && code == COND_EXPR)
            continue;
-         if (k == 1
-             && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
+         bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
+         gcc_assert (is_simple_use);
+         if (dt == vect_constant_def || dt == vect_external_def)
            continue;
          if (!vectype_in
              || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
@@ -6175,6 +6353,10 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
            vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
          break;
        }
+      /* For a nested cycle we might end up with an operation like
+         phi_result * phi_result.  */
+      if (!vectype_in)
+       vectype_in = STMT_VINFO_VECTYPE (stmt_info);
       gcc_assert (vectype_in);
 
       if (slp_node)
@@ -6186,8 +6368,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       if (ncopies > 1
          && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
          && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
-         && (use_stmt_info == reduc_stmt_info
-             || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt))
+         && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
        single_defuse_cycle = true;
 
       /* Create the destination vector  */
@@ -6265,7 +6446,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
         inside the loop body. The last operand is the reduction variable,
         which is defined by the loop-header-phi.  */
 
-  gcc_assert (is_gimple_assign (stmt));
+  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
 
   /* Flatten RHS.  */
   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
@@ -6311,7 +6492,14 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
      The last use is the reduction variable.  In case of nested cycle this
      assumption is not true: we use reduc_index to record the index of the
      reduction variable.  */
-  stmt_vec_info reduc_def_info = NULL;
+  stmt_vec_info reduc_def_info;
+  if (orig_stmt_info)
+    reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
+  else
+    reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
+  gcc_assert (reduc_def_info);
+  gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
+  tree reduc_def = PHI_RESULT (reduc_def_phi);
   int reduc_index = -1;
   for (i = 0; i < op_type; i++)
     {
@@ -6324,9 +6512,9 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
                                          &def_stmt_info);
       dt = dts[i];
       gcc_assert (is_simple_use);
-      if (dt == vect_reduction_def)
+      if (dt == vect_reduction_def
+         && ops[i] == reduc_def)
        {
-         reduc_def_info = def_stmt_info;
          reduc_index = i;
          continue;
        }
@@ -6347,10 +6535,10 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
           && !(dt == vect_nested_cycle && nested_cycle))
        return false;
 
-      if (dt == vect_nested_cycle)
+      if (dt == vect_nested_cycle
+         && ops[i] == reduc_def)
        {
          found_nested_cycle_def = true;
-         reduc_def_info = def_stmt_info;
          reduc_index = i;
        }
 
@@ -6367,7 +6555,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
              && is_nonwrapping_integer_induction (def_stmt_info, loop))
            {
              cond_reduc_dt = dt;
-             cond_reduc_def_stmt = def_stmt_info;
+             cond_stmt_vinfo = def_stmt_info;
            }
        }
     }
@@ -6386,20 +6574,8 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
                             "in-order reduction chain without SLP.\n");
          return false;
        }
-
-      if (orig_stmt_info)
-       reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
-      else
-       reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
     }
 
-  if (! reduc_def_info)
-    return false;
-
-  gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
-  if (!reduc_def_phi)
-    return false;
-
   if (!(reduc_index == -1
        || dts[reduc_index] == vect_reduction_def
        || dts[reduc_index] == vect_nested_cycle
@@ -6473,7 +6649,6 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
        }
       else if (cond_reduc_dt == vect_induction_def)
        {
-         stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
          tree base
            = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
          tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
@@ -6567,13 +6742,37 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   vec_mode = TYPE_MODE (vectype_in);
   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
 
+  if (nested_cycle)
+    {
+      def_bb = gimple_bb (reduc_def_phi);
+      def_stmt_loop = def_bb->loop_father;
+      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                                       loop_preheader_edge (def_stmt_loop));
+      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
+      if (def_arg_stmt_info
+         && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
+             == vect_double_reduction_def))
+        double_reduc = true;
+    }
+
+  vect_reduction_type reduction_type
+    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
+  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
+      && ncopies > 1)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "multiple types in double reduction or condition "
+                        "reduction.\n");
+      return false;
+    }
+
   if (code == COND_EXPR)
     {
       /* Only call during the analysis stage, otherwise we'll lose
         STMT_VINFO_TYPE.  */
-      if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
-                                               ops[reduc_index], 0, NULL,
-                                               cost_vec))
+      if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
+                                               true, NULL, cost_vec))
         {
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6581,20 +6780,26 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
          return false;
         }
     }
-  else
+  else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
+          || code == LROTATE_EXPR || code == RROTATE_EXPR)
     {
-      /* 4. Supportable by target?  */
-
-      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
-         || code == LROTATE_EXPR || code == RROTATE_EXPR)
+      /* Only call during the analysis stage, otherwise we'll lose
+        STMT_VINFO_TYPE.  We only support this for nested cycles
+        without double reductions at the moment.  */
+      if (!nested_cycle
+         || double_reduc
+         || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
+                                               NULL, cost_vec)))
        {
-         /* Shifts and rotates are only supported by vectorizable_shifts,
-            not vectorizable_reduction.  */
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported shift or rotation.\n");
+                            "unsupported shift or rotation in reduction\n");
          return false;
        }
+    }
+  else
+    {
+      /* 4. Supportable by target?  */
 
       /* 4.1. check support for the operation in the loop  */
       optab = optab_for_tree_code (code, vectype_in, optab_default);
@@ -6667,8 +6872,6 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
           (and also the same tree-code) when generating the epilog code and
           when generating the code inside the loop.  */
 
-  vect_reduction_type reduction_type
-    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
   if (orig_stmt_info
       && (reduction_type == TREE_CODE_REDUCTION
          || reduction_type == FOLD_LEFT_REDUCTION))
@@ -6699,19 +6902,6 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
        orig_code = cond_reduc_op_code;
     }
 
-  if (nested_cycle)
-    {
-      def_bb = gimple_bb (reduc_def_phi);
-      def_stmt_loop = def_bb->loop_father;
-      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                       loop_preheader_edge (def_stmt_loop));
-      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
-      if (def_arg_stmt_info
-         && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
-             == vect_double_reduction_def))
-        double_reduc = true;
-    }
-
   reduc_fn = IFN_LAST;
 
   if (reduction_type == TREE_CODE_REDUCTION
@@ -6760,6 +6950,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
     }
 
   if (reduction_type != EXTRACT_LAST_REDUCTION
+      && (!nested_cycle || double_reduc)
       && reduc_fn == IFN_LAST
       && !nunits_out.is_constant ())
     {
@@ -6770,22 +6961,12 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       return false;
     }
 
-  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
-      && ncopies > 1)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "multiple types in double reduction or condition "
-                        "reduction.\n");
-      return false;
-    }
-
   /* For SLP reductions, see if there is a neutral value we can use.  */
   tree neutral_op = NULL_TREE;
   if (slp_node)
     neutral_op = neutral_op_for_slp_reduction
-                  (slp_node_instance->reduc_phis, code,
-                   REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
+      (slp_node_instance->reduc_phis, code,
+       REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
 
   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
     {
@@ -6812,7 +6993,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   if (reduction_type == FOLD_LEFT_REDUCTION
       && slp_node
-      && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
+      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
     {
       /* We cannot use in-order reductions in this case because there is
         an implicit reassociation of the operations involved.  */
@@ -6841,7 +7022,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   /* Check extra constraints for variable-length unchained SLP reductions.  */
   if (STMT_SLP_TYPE (stmt_info)
-      && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
+      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
       && !nunits_out.is_constant ())
     {
       /* We checked above that we could build the initial vector when
@@ -6957,8 +7138,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   if (ncopies > 1
       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
       && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
-      && (use_stmt_info == stmt_info
-         || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt))
+      && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
     {
       single_defuse_cycle = true;
       epilog_copies = 1;
@@ -6988,6 +7168,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   internal_fn cond_fn = get_conditional_internal_fn (code);
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
 
   if (!vec_stmt) /* transformation not required.  */
     {
@@ -6995,6 +7176,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
        {
          if (reduction_type != FOLD_LEFT_REDUCTION
+             && !mask_by_cond_expr
              && (cond_fn == IFN_LAST
                  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
                                                      OPTIMIZE_FOR_SPEED)))
@@ -7038,14 +7220,14 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   if (reduction_type == FOLD_LEFT_REDUCTION)
     return vectorize_fold_left_reduction
-      (stmt, gsi, vec_stmt, slp_node, reduc_def_phi, code,
+      (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
        reduc_fn, ops, vectype_in, reduc_index, masks);
 
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     {
       gcc_assert (!slp_node);
-      return vectorizable_condition (stmt, gsi, vec_stmt,
-                                    NULL, reduc_index, NULL, NULL);
+      return vectorizable_condition (stmt_info, gsi, vec_stmt,
+                                    true, NULL, NULL);
     }
 
   /* Create the destination vector  */
@@ -7076,12 +7258,16 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       if (code == COND_EXPR)
         {
           gcc_assert (!slp_node);
-         vectorizable_condition (stmt, gsi, vec_stmt,
-                                 PHI_RESULT (phis[0]->stmt),
-                                 reduc_index, NULL, NULL);
-          /* Multiple types are not supported for condition.  */
+         vectorizable_condition (stmt_info, gsi, vec_stmt,
+                                 true, NULL, NULL);
           break;
         }
+      if (code == LSHIFT_EXPR
+         || code == RSHIFT_EXPR)
+       {
+         vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
+         break;
+       }
 
       /* Handle uses.  */
       if (j == 0)
@@ -7113,12 +7299,12 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
           else
            {
               vec_oprnds0.quick_push
-               (vect_get_vec_def_for_operand (ops[0], stmt));
+               (vect_get_vec_def_for_operand (ops[0], stmt_info));
               vec_oprnds1.quick_push
-               (vect_get_vec_def_for_operand (ops[1], stmt));
+               (vect_get_vec_def_for_operand (ops[1], stmt_info));
               if (op_type == ternary_op)
                vec_oprnds2.quick_push 
-                 (vect_get_vec_def_for_operand (ops[2], stmt));
+                 (vect_get_vec_def_for_operand (ops[2], stmt_info));
            }
         }
       else
@@ -7131,19 +7317,22 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
                vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
              else
                vec_oprnds0[0]
-                 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
+                 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
+                                                   vec_oprnds0[0]);
              if (single_defuse_cycle && reduc_index == 1)
                vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
              else
                vec_oprnds1[0]
-                 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
+                 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
+                                                   vec_oprnds1[0]);
              if (op_type == ternary_op)
                {
                  if (single_defuse_cycle && reduc_index == 2)
                    vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
                  else
                    vec_oprnds2[0] 
-                     = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
+                     = vect_get_vec_def_for_stmt_copy (loop_vinfo,
+                                                       vec_oprnds2[0]);
                }
             }
         }
@@ -7151,7 +7340,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
         {
          tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-         if (masked_loop_p)
+         if (masked_loop_p && !mask_by_cond_expr)
            {
              /* Make sure that the reduction accumulator is vop[0].  */
              if (reduc_index == 1)
@@ -7167,19 +7356,28 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
              new_temp = make_ssa_name (vec_dest, call);
              gimple_call_set_lhs (call, new_temp);
              gimple_call_set_nothrow (call, true);
-             new_stmt_info = vect_finish_stmt_generation (stmt, call, gsi);
+             new_stmt_info
+               = vect_finish_stmt_generation (stmt_info, call, gsi);
            }
          else
            {
              if (op_type == ternary_op)
                vop[2] = vec_oprnds2[i];
 
+             if (masked_loop_p && mask_by_cond_expr)
+               {
+                 tree mask = vect_get_loop_mask (gsi, masks,
+                                                 vec_num * ncopies,
+                                                 vectype_in, i * ncopies + j);
+                 build_vect_cond_expr (code, vop, mask, gsi);
+               }
+
              gassign *new_stmt = gimple_build_assign (vec_dest, code,
                                                       vop[0], vop[1], vop[2]);
              new_temp = make_ssa_name (vec_dest, new_stmt);
              gimple_assign_set_lhs (new_stmt, new_temp);
              new_stmt_info
-               = vect_finish_stmt_generation (stmt, new_stmt, gsi);
+               = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
            }
 
           if (slp_node)
@@ -7207,7 +7405,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
     vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
 
-  vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_phi,
+  vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
                                    epilog_copies, reduc_fn, phis,
                                    double_reduc, slp_node, slp_node_instance,
                                    cond_reduc_val, cond_reduc_op_code,
@@ -7258,23 +7456,22 @@ vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
 
 /* Function vectorizable_induction
 
-   Check if PHI performs an induction computation that can be vectorized.
+   Check if STMT_INFO performs an induction computation that can be vectorized.
    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
-   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
+   Return true if STMT_INFO is vectorizable in this way.  */
 
 bool
-vectorizable_induction (gimple *phi,
+vectorizable_induction (stmt_vec_info stmt_info,
                        gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                        stmt_vec_info *vec_stmt, slp_tree slp_node,
                        stmt_vector_for_cost *cost_vec)
 {
-  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned ncopies;
   bool nested_in_vect_loop = false;
-  struct loop *iv_loop;
+  class loop *iv_loop;
   tree vec_def;
   edge pe = loop_preheader_edge (loop);
   basic_block new_bb;
@@ -7294,9 +7491,9 @@ vectorizable_induction (gimple *phi,
   edge latch_e;
   tree loop_arg;
   gimple_stmt_iterator si;
-  basic_block bb = gimple_bb (phi);
 
-  if (gimple_code (phi) != GIMPLE_PHI)
+  gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
+  if (!phi)
     return false;
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info))
@@ -7316,7 +7513,7 @@ vectorizable_induction (gimple *phi,
   gcc_assert (ncopies >= 1);
 
   /* FORNOW. These restrictions should be relaxed.  */
-  if (nested_in_vect_loop_p (loop, phi))
+  if (nested_in_vect_loop_p (loop, stmt_info))
     {
       imm_use_iterator imm_iter;
       use_operand_p use_p;
@@ -7444,6 +7641,7 @@ vectorizable_induction (gimple *phi,
     }
 
   /* Find the first insertion point in the BB.  */
+  basic_block bb = gimple_bb (phi);
   si = gsi_after_labels (bb);
 
   /* For SLP induction we have to generate several IVs as for example
@@ -7466,10 +7664,10 @@ vectorizable_induction (gimple *phi,
       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
       if (! CONSTANT_CLASS_P (new_name))
-       new_name = vect_init_vector (phi, new_name,
+       new_name = vect_init_vector (stmt_info, new_name,
                                     TREE_TYPE (step_expr), NULL);
       new_vec = build_vector_from_val (vectype, new_name);
-      vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
+      vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
 
       /* Now generate the IVs.  */
       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
@@ -7536,10 +7734,10 @@ vectorizable_induction (gimple *phi,
          new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                                  expr, step_expr);
          if (! CONSTANT_CLASS_P (new_name))
-           new_name = vect_init_vector (phi, new_name,
+           new_name = vect_init_vector (stmt_info, new_name,
                                         TREE_TYPE (step_expr), NULL);
          new_vec = build_vector_from_val (vectype, new_name);
-         vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
+         vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
          for (; ivn < nvects; ++ivn)
            {
              gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
@@ -7572,7 +7770,7 @@ vectorizable_induction (gimple *phi,
       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts.  We obtain it
         from the STMT_VINFO_VEC_STMT of the defining stmt.  */
-      vec_init = vect_get_vec_def_for_operand (init_expr, phi);
+      vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
       /* If the initial value is not of proper type, convert it.  */
       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
        {
@@ -7674,7 +7872,7 @@ vectorizable_induction (gimple *phi,
   gcc_assert (CONSTANT_CLASS_P (new_name)
              || TREE_CODE (new_name) == SSA_NAME);
   new_vec = build_vector_from_val (vectype, t);
-  vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
+  vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
 
 
   /* Create the following def-use cycle:
@@ -7740,7 +7938,7 @@ vectorizable_induction (gimple *phi,
       gcc_assert (CONSTANT_CLASS_P (new_name)
                  || TREE_CODE (new_name) == SSA_NAME);
       new_vec = build_vector_from_val (vectype, t);
-      vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
+      vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
 
       vec_def = induc_def;
       prev_stmt_vinfo = induction_phi_info;
@@ -7786,42 +7984,35 @@ vectorizable_induction (gimple *phi,
 
          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_NOTE, vect_location,
-                              "vector of inductions after inner-loop:");
-             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
-           }
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "vector of inductions after inner-loop:%G",
+                            new_stmt);
        }
     }
 
 
   if (dump_enabled_p ())
-    {
-      dump_printf_loc (MSG_NOTE, vect_location,
-                      "transform induction: created def-use cycle: ");
-      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
-      dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
-                       SSA_NAME_DEF_STMT (vec_def), 0);
-    }
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "transform induction: created def-use cycle: %G%G",
+                    induction_phi, SSA_NAME_DEF_STMT (vec_def));
 
   return true;
 }
 
 /* Function vectorizable_live_operation.
 
-   STMT computes a value that is used outside the loop.  Check if
+   STMT_INFO computes a value that is used outside the loop.  Check if
    it can be supported.  */
 
 bool
-vectorizable_live_operation (gimple *stmt,
+vectorizable_live_operation (stmt_vec_info stmt_info,
                             gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                             slp_tree slp_node, int slp_index,
                             stmt_vec_info *vec_stmt,
                             stmt_vector_for_cost *)
 {
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   imm_use_iterator imm_iter;
   tree lhs, lhs_type, bitsize, vec_bitsize;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
@@ -7838,7 +8029,7 @@ vectorizable_live_operation (gimple *stmt,
     return false;
 
   /* FORNOW.  CHECKME.  */
-  if (nested_in_vect_loop_p (loop, stmt))
+  if (nested_in_vect_loop_p (loop, stmt_info))
     return false;
 
   /* If STMT is not relevant and it is a simple assignment and its inputs are
@@ -7846,7 +8037,7 @@ vectorizable_live_operation (gimple *stmt,
      scalar value that it computes will be used.  */
   if (!STMT_VINFO_RELEVANT_P (stmt_info))
     {
-      gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
+      gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "statement is simple and uses invariant.  Leaving in "
@@ -7925,9 +8116,8 @@ vectorizable_live_operation (gimple *stmt,
       return true;
     }
 
-  /* If stmt has a related stmt, then use that for getting the lhs.  */
-  if (is_pattern_stmt_p (stmt_info))
-    stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+  /* Use the lhs of the original scalar statement.  */
+  gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
 
   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
        : gimple_get_lhs (stmt);
@@ -7964,8 +8154,7 @@ vectorizable_live_operation (gimple *stmt,
 
       /* For multiple copies, get the last copy.  */
       for (int i = 1; i < ncopies; ++i)
-       vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
-                                                 vec_lhs);
+       vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
 
       /* Get the last lane in the vector.  */
       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
@@ -8028,17 +8217,17 @@ vectorizable_live_operation (gimple *stmt,
   return true;
 }
 
-/* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
+/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
 
 static void
-vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
+vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
 {
   ssa_op_iter op_iter;
   imm_use_iterator imm_iter;
   def_operand_p def_p;
   gimple *ustmt;
 
-  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
+  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
     {
       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
        {
@@ -8087,7 +8276,7 @@ loop_niters_no_overflow (loop_vec_info loop_vinfo)
     }
 
   widest_int max;
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   /* Check the upper bound of loop niters.  */
   if (get_max_loop_iterations (loop, &max))
     {
@@ -8195,7 +8384,7 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
    by factor VF.  */
 
 static void
-scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
+scale_profile_for_vect_loop (class loop *loop, unsigned vf)
 {
   edge preheader = loop_preheader_edge (loop);
   /* Reduce loop iterations by the vectorization factor.  */
@@ -8225,31 +8414,23 @@ scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
 }
 
-/* Vectorize STMT if relevant, inserting any new instructions before GSI.
-   When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
-   *SLP_SCHEDULE is a running record of whether we have called
-   vect_schedule_slp.  */
+/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
+   When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
+   stmt_vec_info.  */
 
 static void
-vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
-                         gimple_stmt_iterator *gsi,
-                         stmt_vec_info *seen_store, bool *slp_scheduled)
+vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+                         gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
-  if (!stmt_info)
-    return;
 
   if (dump_enabled_p ())
-    {
-      dump_printf_loc (MSG_NOTE, vect_location,
-                      "------>vectorizing statement: ");
-      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
-    }
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "------>vectorizing statement: %G", stmt_info->stmt);
 
   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
-    vect_loop_kill_debug_uses (loop, stmt);
+    vect_loop_kill_debug_uses (loop, stmt_info);
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
@@ -8267,30 +8448,15 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
        dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
     }
 
-  /* SLP.  Schedule all the SLP instances when the first SLP stmt is
-     reached.  */
-  if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
-    {
-
-      if (!*slp_scheduled)
-       {
-         *slp_scheduled = true;
-
-         DUMP_VECT_SCOPE ("scheduling SLP instances");
-
-         vect_schedule_slp (loop_vinfo);
-       }
-
-      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
-      if (slptype == pure_slp)
-       return;
-    }
+  /* Pure SLP statements have already been vectorized.  We still need
+     to apply loop vectorization to hybrid SLP statements.  */
+  if (PURE_SLP_STMT (stmt_info))
+    return;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
 
-  bool grouped_store = false;
-  if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
+  if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
     *seen_store = stmt_info;
 }
 
@@ -8301,11 +8467,11 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
    stmts in the loop, and update the loop exit condition.
    Returns scalar epilogue loop if any.  */
 
-struct loop *
+class loop *
 vect_transform_loop (loop_vec_info loop_vinfo)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  struct loop *epilogue = NULL;
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *epilogue = NULL;
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
   int i;
@@ -8314,7 +8480,6 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   tree niters_vector_mult_vf = NULL_TREE;
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned int lowest_vf = constant_lower_bound (vf);
-  bool slp_scheduled = false;
   gimple *stmt;
   bool check_profitability = false;
   unsigned int th;
@@ -8344,7 +8509,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   edge e = single_exit (loop);
   if (! single_pred_p (e->dest))
     {
-      split_loop_exit_edge (e);
+      split_loop_exit_edge (e, true);
       if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "split exit edge\n");
     }
@@ -8363,8 +8528,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                                              versioning_threshold);
          check_profitability = false;
        }
-      vect_loop_versioning (loop_vinfo, th, check_profitability,
-                           versioning_threshold);
+      class loop *sloop
+       = vect_loop_versioning (loop_vinfo, th, check_profitability,
+                               versioning_threshold);
+      sloop->force_vectorize = false;
       check_profitability = false;
     }
 
@@ -8378,7 +8545,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
       if (! single_pred_p (e->dest))
        {
-         split_loop_exit_edge (e);
+         split_loop_exit_edge (e, true);
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
        }
@@ -8391,6 +8558,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
                              &step_vector, &niters_vector_mult_vf, th,
                              check_profitability, niters_no_overflow);
+  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
+      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
+    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
+                           LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
 
   if (niters_vector == NULL_TREE)
     {
@@ -8420,6 +8591,14 @@ vect_transform_loop (loop_vec_info loop_vinfo)
     /* This will deal with any possible peeling.  */
     vect_prepare_for_masked_peels (loop_vinfo);
 
+  /* Schedule the SLP instances first, then handle loop vectorization
+     below.  */
+  if (!loop_vinfo->slp_instances.is_empty ())
+    {
+      DUMP_VECT_SCOPE ("scheduling SLP instances");
+      vect_schedule_slp (loop_vinfo);
+    }
+
   /* FORNOW: the vectorizer supports only loops which body consist
      of one basic block (header + empty latch). When the vectorizer will
      support more involved loop forms, the order by which the BBs are
@@ -8435,17 +8614,14 @@ vect_transform_loop (loop_vec_info loop_vinfo)
         {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_NOTE, vect_location,
-                               "------>vectorizing phi: ");
-             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
-           }
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "------>vectorizing phi: %G", phi);
          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (!stmt_info)
            continue;
 
          if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
-           vect_loop_kill_debug_uses (loop, phi);
+           vect_loop_kill_debug_uses (loop, stmt_info);
 
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
@@ -8464,7 +8640,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
-             vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
+             vect_transform_stmt (stmt_info, NULL, NULL, NULL);
            }
        }
 
@@ -8494,39 +8670,32 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
                           !gsi_end_p (subsi); gsi_next (&subsi))
-                       vect_transform_loop_stmt (loop_vinfo,
-                                                 gsi_stmt (subsi), &si,
-                                                 &seen_store,
-                                                 &slp_scheduled);
-                     gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
-                     vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
-                                               &seen_store, &slp_scheduled);
+                       {
+                         stmt_vec_info pat_stmt_info
+                           = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
+                         vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
+                                                   &si, &seen_store);
+                       }
+                     stmt_vec_info pat_stmt_info
+                       = STMT_VINFO_RELATED_STMT (stmt_info);
+                     vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
+                                               &seen_store);
                    }
-                 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
-                                           &seen_store, &slp_scheduled);
+                 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
+                                           &seen_store);
                }
+             gsi_next (&si);
              if (seen_store)
                {
                  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
-                   {
-                     /* Interleaving.  If IS_STORE is TRUE, the
-                        vectorization of the interleaving chain was
-                        completed - free all the stores in the chain.  */
-                     gsi_next (&si);
-                     vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
-                   }
+                   /* Interleaving.  If IS_STORE is TRUE, the
+                      vectorization of the interleaving chain was
+                      completed - free all the stores in the chain.  */
+                   vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
                  else
-                   {
-                     /* Free the attached stmt_vec_info and remove the
-                        stmt.  */
-                     free_stmt_vec_info (stmt);
-                     unlink_stmt_vdef (stmt);
-                     gsi_remove (&si, true);
-                     release_defs (stmt);
-                   }
+                   /* Free the attached stmt_vec_info and remove the stmt.  */
+                   loop_vinfo->remove_stmt (stmt_info);
                }
-             else
-               gsi_next (&si);
            }
        }
 
@@ -8626,6 +8795,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
        }
     }
 
+  /* Loops vectorized with a variable factor won't benefit from
+     unrolling/peeling.  */
+  if (!vf.is_constant ())
+    {
+      loop->unroll = 1;
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
+                        " variable-length vectorization factor\n");
+    }
   /* Free SLP instances here because otherwise stmt reference counting
      won't work.  */
   slp_instance instance;
@@ -8646,18 +8824,21 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   if (epilogue)
     {
       auto_vector_sizes vector_sizes;
-      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
       unsigned int next_size = 0;
 
+      /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
+         on niters already adjusted for the iterations of the prologue.  */
       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-         && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
          && known_eq (vf, lowest_vf))
        {
-         unsigned int eiters
+         unsigned HOST_WIDE_INT eiters
            = (LOOP_VINFO_INT_NITERS (loop_vinfo)
-              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
-         eiters = eiters % lowest_vf;
+              - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+         eiters
+           = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
          epilogue->nb_iterations_upper_bound = eiters - 1;
+         epilogue->any_upper_bound = true;
 
          unsigned int ratio;
          while (next_size < vector_sizes.length ()
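   A worked example of the adjusted epilogue bound computed above, using
   hypothetical values (103 scalar iterations, a constant VF of 8, one
   iteration peeled for gaps):

      #include <stdio.h>

      int
      main (void)
      {
        unsigned long niters = 103;        /* LOOP_VINFO_INT_NITERS */
        unsigned long peel_for_gaps = 1;   /* LOOP_VINFO_PEELING_FOR_GAPS */
        unsigned long lowest_vf = 8;       /* constant vectorization factor */
        /* Same formula as in the hunk above: iterations left over for
           the epilogue loop.  */
        unsigned long eiters
          = (niters - peel_for_gaps) % lowest_vf + peel_for_gaps;
        /* As in the code above, the recorded upper bound is eiters - 1.  */
        printf ("epilogue upper bound = %lu\n", eiters - 1);  /* prints 6 */
        return 0;
      }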
@@ -8713,16 +8894,17 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 */
 
 void
-optimize_mask_stores (struct loop *loop)
+optimize_mask_stores (class loop *loop)
 {
   basic_block *bbs = get_loop_body (loop);
   unsigned nbbs = loop->num_nodes;
   unsigned i;
   basic_block bb;
-  struct loop *bb_loop;
+  class loop *bb_loop;
   gimple_stmt_iterator gsi;
   gimple *stmt;
   auto_vec<gimple *> worklist;
+  auto_purge_vect_location sentinel;
 
   vect_location = find_loop_location (loop);
   /* Pick up all masked stores in loop if any.  */
@@ -8816,11 +8998,8 @@ optimize_mask_stores (struct loop *loop)
          /* Setup GSI_TO to the non-empty block start.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_NOTE, vect_location,
-                              "Move stmt to created bb\n");
-             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
-           }
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Move stmt to created bb\n%G", last);
          /* Move all stored value producers if possible.  */
          while (!gsi_end_p (gsi))
            {
@@ -8884,11 +9063,8 @@ optimize_mask_stores (struct loop *loop)
 
              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_NOTE, vect_location,
-                                  "Move stmt to created bb\n");
-                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
-               }
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "Move stmt to created bb\n%G", stmt1);
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
@@ -8903,3 +9079,45 @@ optimize_mask_stores (struct loop *loop)
       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
     }
 }
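   A rough scalar analogue of what optimize_mask_stores aims for, as far as
   can be read from the code above: each masked store, together with the
   producers of its stored value where possible, is diverted into a block
   that is only entered when the mask is not all-false, so no work is done
   for an all-zero mask.  A hypothetical, simplified C illustration (the
   function name and constants are made up for this sketch):

      /* Guard the store sequence so that nothing is executed when every
         mask element is false.  */
      void
      masked_store_sketch (int *dst, const int *src, const char *mask, int n)
      {
        int any_set = 0;
        for (int i = 0; i < n; i++)
          any_set |= mask[i];
        if (any_set)                    /* mask != { 0, 0, ... } guard */
          for (int i = 0; i < n; i++)
            if (mask[i])
              dst[i] = src[i] + 2;      /* stored-value producer + store */
      }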
+
+/* Decide whether it is possible to use a zero-based induction variable
+   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
+   return the value that the induction variable must be able to hold
+   in order to ensure that the loop ends with an all-false mask.
+   Return -1 otherwise.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+       {
+         /* Add the maximum number of skipped iterations to the
+            maximum iteration count.  */
+         if (TREE_CODE (niters_skip) == INTEGER_CST)
+           iv_limit += wi::to_widest (niters_skip);
+         else
+           iv_limit += max_vf - 1;
+       }
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+       /* Make a conservatively-correct assumption.  */
+       iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+        the maximum in-range IV value.  Round this value down to the previous
+        vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
+
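   A worked example of the rounding at the end of
   vect_iv_limit_for_full_masking, with hypothetical values for a
   variable-length target whose VF is always a multiple of 4 and at
   most 16:

      #include <stdio.h>

      int
      main (void)
      {
        long long iv_limit = 1003;   /* maximum number of latch iterations */
        int vf_alignment = 4;        /* known_alignment (vf) */
        int max_vf = 16;             /* vect_max_vf (loop_vinfo) */
        /* Round down to the previous vector alignment boundary and add
           one full maximum-size vector iteration, as in the code above.  */
        iv_limit = (iv_limit & -(long long) vf_alignment) + max_vf;
        printf ("iv_limit = %lld\n", iv_limit);   /* prints 1016 */
        return 0;
      }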