[Ada] Add the System.Bitfield_Utils runtime unit

[thirdparty/gcc.git] / gcc / tree-vect-loop.c
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index 6ea1e77d4baaf4c1190b1a4a935623c1035880e7..b0cbbac0cb5ba1ffce706715d3dbb9139063803d 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1,5 +1,5 @@
  /* Loop Vectorization
-   Copyright (C) 2003-2018 Free Software Foundation, Inc.
+   Copyright (C) 2003-2019 Free Software Foundation, Inc.
     Contributed by Dorit Naishlos <dorit@il.ibm.com> and
     Ira Rosen <irar@il.ibm.com>
  
@@ -286,7 +286,7 @@ vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
  static opt_result
  vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
    unsigned nbbs = loop->num_nodes;
    poly_uint64 vectorization_factor = 1;
@@ -481,7 +481,7 @@ vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
     enclosing LOOP).  */
  
  static void
-vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
+vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
  {
    basic_block bb = loop->header;
    tree init, step;
@@ -633,7 +633,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
  static void
  vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
  
@@ -714,11 +714,11 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
  
  
  static gcond *
-vect_get_loop_niters (struct loop *loop, tree *assumptions,
+vect_get_loop_niters (class loop *loop, tree *assumptions,
                       tree *number_of_iterations, tree *number_of_iterationsm1)
  {
    edge exit = single_exit (loop);
-  struct tree_niter_desc niter_desc;
+  class tree_niter_desc niter_desc;
    tree niter_assumptions, niter, may_be_zero;
    gcond *cond = get_loop_exit_condition (loop);
  
@@ -730,9 +730,7 @@ vect_get_loop_niters (struct loop *loop, tree *assumptions,
    if (!exit)
      return cond;
  
-  niter = chrec_dont_know;
    may_be_zero = NULL_TREE;
-  niter_assumptions = boolean_true_node;
    if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
        || chrec_contains_undetermined (niter_desc.niter))
      return cond;
@@ -795,7 +793,7 @@ vect_get_loop_niters (struct loop *loop, tree *assumptions,
  static bool
  bb_in_loop_p (const_basic_block bb, const void *data)
  {
-  const struct loop *const loop = (const struct loop *)data;
+  const class loop *const loop = (const class loop *)data;
    if (flow_bb_inside_loop_p (loop, bb))
      return true;
    return false;
@@ -805,7 +803,7 @@ bb_in_loop_p (const_basic_block bb, const void *data)
  /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
     stmt_vec_info structs for all the stmts in LOOP_IN.  */
  
-_loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
+_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
    : vec_info (vec_info::loop, init_cost (loop_in), shared),
      loop (loop_in),
      bbs (XCNEWVEC (basic_block, loop->num_nodes)),
@@ -819,10 +817,12 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
      max_vectorization_factor (0),
      mask_skip_niters (NULL_TREE),
      mask_compare_type (NULL_TREE),
+    simd_if_cond (NULL_TREE),
      unaligned_dr (NULL),
      peeling_for_alignment (0),
      ptr_mask (0),
      ivexpr_map (NULL),
+    scan_map (NULL),
      slp_unrolling_factor (1),
      single_scalar_iteration_cost (0),
      vectorizable (false),
@@ -833,6 +833,7 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
      operands_swapped (false),
      no_data_dependencies (false),
      has_mask_store (false),
+    scalar_loop_scaling (profile_probability::uninitialized ()),
      scalar_loop (NULL),
      orig_loop_info (NULL)
  {
@@ -862,6 +863,26 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
           gimple *stmt = gsi_stmt (si);
           gimple_set_uid (stmt, 0);
           add_stmt (stmt);
+         /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
+            third argument is the #pragma omp simd if (x) condition, when 0,
+            loop shouldn't be vectorized, when non-zero constant, it should
+            be vectorized normally, otherwise versioned with vectorized loop
+            done if the condition is non-zero at runtime.  */
+         if (loop_in->simduid
+             && is_gimple_call (stmt)
+             && gimple_call_internal_p (stmt)
+             && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
+             && gimple_call_num_args (stmt) >= 3
+             && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
+             && (loop_in->simduid
+                 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
+           {
+             tree arg = gimple_call_arg (stmt, 2);
+             if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
+               simd_if_cond = arg;
+             else
+               gcc_assert (integer_nonzerop (arg));
+           }
         }
      }
  }
@@ -938,6 +959,7 @@ _loop_vec_info::~_loop_vec_info ()
  
    release_vec_loop_masks (&masks);
    delete ivexpr_map;
+  delete scan_map;
  
    loop->aux = NULL;
  }
@@ -1007,8 +1029,10 @@ vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
  static bool
  vect_verify_full_masking (loop_vec_info loop_vinfo)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
  
    /* Use a normal loop if there are no statements that need masking.
       This only happens in rare degenerate cases: it means that the loop
@@ -1027,7 +1051,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
      max_ni = wi::smin (max_ni, max_back_edges + 1);
  
    /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
  
    /* Work out how many bits we need to represent the limit.  */
    min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1035,6 +1059,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
    /* Find a scalar mode for which WHILE_ULT is supported.  */
    opt_scalar_int_mode cmp_mode_iter;
    tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  unsigned int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+                                     UNSIGNED);
+
    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
      {
        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1046,10 +1078,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
             {
               /* Although we could stop as soon as we find a valid mode,
-                it's often better to continue until we hit Pmode, since the
-                operands to the WHILE are more likely to be reusable in
-                address calculations.  */
-             cmp_type = this_type;
+                there are at least two reasons why that's not always the
+                best choice:
+
+                - An IV that's Pmode or wider is more likely to be reusable
+                  in address calculations than an IV that's narrower than
+                  Pmode.
+
+                - Doing the comparison in IV_PRECISION or wider allows
+                  a natural 0-based IV, whereas using a narrower comparison
+                  type requires mitigations against wrap-around.
+
+                Conversely, if the IV limit is variable, doing the comparison
+                in a wider type than the original type can introduce
+                unnecessary extensions, so picking the widest valid mode
+                is not always a good choice either.
+
+                Here we prefer the first IV type that's Pmode or wider,
+                and the first comparison type that's IV_PRECISION or wider.
+                (The comparison type must be no wider than the IV type,
+                to avoid extensions in the vector loop.)
+
+                ??? We might want to try continuing beyond Pmode for ILP32
+                targets if CMP_BITS < IV_PRECISION.  */
+             iv_type = this_type;
+             if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
+               cmp_type = this_type;
               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
                 break;
             }
@@ -1060,6 +1114,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
      return false;
  
    LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
    return true;
  }
  
@@ -1067,11 +1122,13 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
  static void
  vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
    int nbbs = loop->num_nodes, factor;
    int innerloop_iters, i;
  
+  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
+
    /* Gather costs for statements in the scalar loop.  */
  
    /* FORNOW.  */
@@ -1098,11 +1155,11 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
              continue;
  
            /* Skip stmts that are not vectorized inside the loop.  */
-          if (stmt_info
-              && !STMT_VINFO_RELEVANT_P (stmt_info)
-              && (!STMT_VINFO_LIVE_P (stmt_info)
-                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
-             && !STMT_VINFO_IN_PATTERN_P (stmt_info))
+         stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
+          if (!STMT_VINFO_RELEVANT_P (vstmt_info)
+              && (!STMT_VINFO_LIVE_P (vstmt_info)
+                  || !VECTORIZABLE_CYCLE_DEF
+                       (STMT_VINFO_DEF_TYPE (vstmt_info))))
              continue;
  
           vect_cost_for_stmt kind;
@@ -1147,7 +1204,7 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
       niter could be analyzed under some assumptions.  */
  
  opt_result
-vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
+vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
                           tree *assumptions, tree *number_of_iterationsm1,
                           tree *number_of_iterations, gcond **inner_loop_cond)
  {
@@ -1182,7 +1239,7 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
      }
    else
      {
-      struct loop *innerloop = loop->inner;
+      class loop *innerloop = loop->inner;
        edge entryedge;
  
        /* Nested loop. We currently require that the loop is doubly-nested,
@@ -1299,7 +1356,7 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
  /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
  
  opt_loop_vec_info
-vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
+vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
  {
    tree assumptions, number_of_iterations, number_of_iterationsm1;
    gcond *loop_cond, *inner_loop_cond = NULL;
@@ -1362,7 +1419,7 @@ vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
  static void
  vect_update_vf_for_slp (loop_vec_info loop_vinfo)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
    int nbbs = loop->num_nodes;
    poly_uint64 vectorization_factor;
@@ -1397,14 +1454,16 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo)
  
    if (only_slp_in_loop)
      {
-      dump_printf_loc (MSG_NOTE, vect_location,
-                      "Loop contains only SLP stmts\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Loop contains only SLP stmts\n");
        vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
      }
    else
      {
-      dump_printf_loc (MSG_NOTE, vect_location,
-                      "Loop contains SLP and non-SLP stmts\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Loop contains SLP and non-SLP stmts\n");
        /* Both the vectorization factor and unroll factor have the form
          current_vector_size * X for some rational X, so they must have
          a common multiple.  */
@@ -1456,7 +1515,7 @@ vect_active_double_reduction_p (stmt_vec_info stmt_info)
  static opt_result
  vect_analyze_loop_operations (loop_vec_info loop_vinfo)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
    int nbbs = loop->num_nodes;
    int i;
@@ -1466,8 +1525,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
  
    DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
  
-  stmt_vector_for_cost cost_vec;
-  cost_vec.create (2);
+  auto_vec<stmt_info_for_cost> cost_vec;
  
    for (i = 0; i < nbbs; i++)
      {
@@ -1577,7 +1635,6 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
      } /* bbs */
  
    add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
-  cost_vec.release ();
  
    /* All operations in the loop are either irrelevant (deal with loop
       control, or dead), or only used outside the loop and can be moved
@@ -1604,7 +1661,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
  static int
  vect_analyze_loop_costing (loop_vec_info loop_vinfo)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  
    /* Only fully-masked loops can have iteration counts less than the
@@ -1751,6 +1808,50 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
    return opt_result::success ();
  }
  
+/* Look for SLP-only access groups and turn each individual access into its own
+   group.  */
+static void
+vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
+{
+  unsigned int i;
+  struct data_reference *dr;
+
+  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
+
+  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
+  FOR_EACH_VEC_ELT (datarefs, i, dr)
+    {
+      gcc_assert (DR_REF (dr));
+      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
+
+      /* Check if the load is a part of an interleaving chain.  */
+      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+       {
+         stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         unsigned int group_size = DR_GROUP_SIZE (first_element);
+
+         /* Check if SLP-only groups.  */
+         if (!STMT_SLP_TYPE (stmt_info)
+             && STMT_VINFO_SLP_VECT_ONLY (first_element))
+           {
+             /* Dissolve the group.  */
+             STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
+
+             stmt_vec_info vinfo = first_element;
+             while (vinfo)
+               {
+                 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
+                 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
+                 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+                 DR_GROUP_SIZE (vinfo) = 1;
+                 DR_GROUP_GAP (vinfo) = group_size - 1;
+                 vinfo = next;
+               }
+           }
+       }
+    }
+}
+
  /* Function vect_analyze_loop_2.
  
     Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -1767,6 +1868,11 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
    /* The first group of checks is independent of the vector size.  */
    fatal = true;
  
+  if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
+      && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
+    return opt_result::failure_at (vect_location,
+                                  "not vectorized: simd if(0)\n");
+
    /* Find all data references in the loop (which correspond to vdefs/vuses)
       and analyze their evolution in the loop.  */
  
@@ -1796,7 +1902,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
    /* Analyze the data references and also adjust the minimal
       vectorization factor according to the loads and stores.  */
  
-  ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
+  ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
    if (!ok)
      {
        if (dump_enabled_p ())
@@ -1827,7 +1933,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
  
    /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
  
-  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
+  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
    if (!ok)
      {
        if (dump_enabled_p ())
@@ -1962,6 +2068,9 @@ start_over:
         }
      }
  
+  /* Dissolve SLP-only groups.  */
+  vect_dissolve_slp_only_groups (loop_vinfo);
+
    /* Scan all the remaining operations in the loop that are not subject
       to SLP and make sure they are vectorizable.  */
    ok = vect_analyze_loop_operations (loop_vinfo);
@@ -2219,14 +2328,15 @@ again:
     loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
     be vectorized.  */
  opt_loop_vec_info
-vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
+vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
                    vec_info_shared *shared)
  {
    auto_vector_sizes vector_sizes;
  
    /* Autodetect first vector size we try.  */
    current_vector_size = 0;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
+                                               loop->simdlen != 0);
    unsigned int next_size = 0;
  
    DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2245,6 +2355,8 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
  
    unsigned n_stmts = 0;
    poly_uint64 autodetected_vector_size = 0;
+  opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
+  poly_uint64 first_vector_size = 0;
    while (1)
      {
        /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
@@ -2255,6 +2367,7 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                              "bad loop form.\n");
+         gcc_checking_assert (first_loop_vinfo == NULL);
           return loop_vinfo;
         }
  
@@ -2268,10 +2381,27 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
         {
           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
  
-         return loop_vinfo;
+         if (loop->simdlen
+             && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                          (unsigned HOST_WIDE_INT) loop->simdlen))
+           {
+             if (first_loop_vinfo == NULL)
+               {
+                 first_loop_vinfo = loop_vinfo;
+                 first_vector_size = current_vector_size;
+                 loop->aux = NULL;
+               }
+             else
+               delete loop_vinfo;
+           }
+         else
+           {
+             delete first_loop_vinfo;
+             return loop_vinfo;
+           }
         }
-
-      delete loop_vinfo;
+      else
+       delete loop_vinfo;
  
        if (next_size == 0)
         autodetected_vector_size = current_vector_size;
@@ -2280,10 +2410,31 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
           && known_eq (vector_sizes[next_size], autodetected_vector_size))
         next_size += 1;
  
-      if (fatal
-         || next_size == vector_sizes.length ()
+      if (fatal)
+       {
+         gcc_checking_assert (first_loop_vinfo == NULL);
+         return opt_loop_vec_info::propagate_failure (res);
+       }
+
+      if (next_size == vector_sizes.length ()
           || known_eq (current_vector_size, 0U))
-       return opt_loop_vec_info::propagate_failure (res);
+       {
+         if (first_loop_vinfo)
+           {
+             current_vector_size = first_vector_size;
+             loop->aux = (loop_vec_info) first_loop_vinfo;
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                  "***** Choosing vector size ");
+                 dump_dec (MSG_NOTE, current_vector_size);
+                 dump_printf (MSG_NOTE, "\n");
+               }
+             return first_loop_vinfo;
+           }
+         else
+           return opt_loop_vec_info::propagate_failure (res);
+       }
  
        /* Try the next biggest vector size.  */
        current_vector_size = vector_sizes[next_size++];
@@ -2381,7 +2532,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
    stmt_vec_info stmt_vinfo = stmts[0];
    tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
    tree scalar_type = TREE_TYPE (vector_type);
-  struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
+  class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
    gcc_assert (loop);
  
    switch (code)
@@ -2460,11 +2611,11 @@ static bool
  vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
                        gimple *first_stmt)
  {
-  struct loop *loop = (gimple_bb (phi))->loop_father;
-  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
+  class loop *loop = (gimple_bb (phi))->loop_father;
+  class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
    enum tree_code code;
    gimple *loop_use_stmt = NULL;
-  stmt_vec_info use_stmt_info, current_stmt_info = NULL;
+  stmt_vec_info use_stmt_info;
    tree lhs;
    imm_use_iterator imm_iter;
    use_operand_p use_p;
@@ -2474,6 +2625,7 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
    if (loop != vect_loop)
      return false;
  
+  auto_vec<stmt_vec_info, 8> reduc_chain;
    lhs = PHI_RESULT (phi);
    code = gimple_assign_rhs_code (first_stmt);
    while (1)
@@ -2526,17 +2678,9 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
  
        /* Insert USE_STMT into reduction chain.  */
        use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
-      if (current_stmt_info)
-        {
-         REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
-          REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
-            = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
-        }
-      else
-       REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
+      reduc_chain.safe_push (use_stmt_info);
  
        lhs = gimple_assign_lhs (loop_use_stmt);
-      current_stmt_info = use_stmt_info;
        size++;
     }
  
@@ -2546,10 +2690,9 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
    /* Swap the operands, if needed, to make the reduction operand be the second
       operand.  */
    lhs = PHI_RESULT (phi);
-  stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
-  while (next_stmt_info)
+  for (unsigned i = 0; i < reduc_chain.length (); ++i)
      {
-      gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
+      gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
        if (gimple_assign_rhs2 (next_stmt) == lhs)
         {
           tree op = gimple_assign_rhs1 (next_stmt);
@@ -2563,7 +2706,6 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
               && vect_valid_reduction_input_p (def_stmt_info))
             {
               lhs = gimple_assign_lhs (next_stmt);
-             next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
               continue;
             }
  
@@ -2598,14 +2740,20 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
          }
  
        lhs = gimple_assign_lhs (next_stmt);
-      next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
      }
  
+  /* Build up the actual chain.  */
+  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
+    {
+      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
+      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
+    }
+  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
+  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
+
    /* Save the chain for further analysis in SLP detection.  */
-  stmt_vec_info first_stmt_info
-    = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
-  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
-  REDUC_GROUP_SIZE (first_stmt_info) = size;
+  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
+  REDUC_GROUP_SIZE (reduc_chain[0]) = size;
  
    return true;
  }
@@ -2803,13 +2951,13 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
                           enum vect_reduction_type *v_reduc_type)
  {
    gphi *phi = as_a <gphi *> (phi_info->stmt);
-  struct loop *loop = (gimple_bb (phi))->loop_father;
-  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
+  class loop *loop = (gimple_bb (phi))->loop_father;
+  class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
+  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
    gimple *phi_use_stmt = NULL;
    enum tree_code orig_code, code;
    tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
    tree type;
-  int nloop_uses;
    tree name;
    imm_use_iterator imm_iter;
    use_operand_p use_p;
@@ -2825,7 +2973,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       can be constant.  See PR60382.  */
    if (has_zero_uses (phi_name))
      return NULL;
-  nloop_uses = 0;
+  unsigned nphi_def_loop_uses = 0;
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
      {
        gimple *use_stmt = USE_STMT (use_p);
@@ -2841,15 +2989,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
            return NULL;
          }
  
-      nloop_uses++;
-      if (nloop_uses > 1)
-        {
-          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction value used in loop.\n");
-          return NULL;
-        }
-
+      nphi_def_loop_uses++;
        phi_use_stmt = use_stmt;
      }
  
@@ -2887,27 +3027,39 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
        return NULL;
      }
  
-  nloop_uses = 0;
+  unsigned nlatch_def_loop_uses = 0;
    auto_vec<gphi *, 3> lcphis;
+  bool inner_loop_of_double_reduc = false;
    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
      {
        gimple *use_stmt = USE_STMT (use_p);
        if (is_gimple_debug (use_stmt))
         continue;
        if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
-       nloop_uses++;
+       nlatch_def_loop_uses++;
        else
-       /* We can have more than one loop-closed PHI.  */
-       lcphis.safe_push (as_a <gphi *> (use_stmt));
-      if (nloop_uses > 1)
         {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction used in loop.\n");
-         return NULL;
+         /* We can have more than one loop-closed PHI.  */
+         lcphis.safe_push (as_a <gphi *> (use_stmt));
+         if (nested_in_vect_loop
+             && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
+                 == vect_double_reduction_def))
+           inner_loop_of_double_reduc = true;
         }
      }
  
+  /* If this isn't a nested cycle or if the nested cycle reduction value
+     is used ouside of the inner loop we cannot handle uses of the reduction
+     value.  */
+  if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
+      && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "reduction used in loop.\n");
+      return NULL;
+    }
+
    /* If DEF_STMT is a phi node itself, we expect it to have a single argument
       defined in the inner loop.  */
    if (phi_def)
@@ -2966,9 +3118,31 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
      }
  
    gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
-  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
    code = orig_code = gimple_assign_rhs_code (def_stmt);
  
+  if (nested_in_vect_loop && !check_reduction)
+    {
+      /* FIXME: Even for non-reductions code generation is funneled
+        through vectorizable_reduction for the stmt defining the
+        PHI latch value.  So we have to artificially restrict ourselves
+        for the supported operations.  */
+      switch (get_gimple_rhs_class (code))
+       {
+       case GIMPLE_BINARY_RHS:
+       case GIMPLE_TERNARY_RHS:
+         break;
+       default:
+         /* Not supported by vectorizable_reduction.  */
+         if (dump_enabled_p ())
+           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                           "nested cycle: not handled operation: ");
+         return NULL;
+       }
+      if (dump_enabled_p ())
+       report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
+      return def_stmt_info;
+    }
+
    /* We can handle "res -= x[i]", which is non-associative by
       simply rewriting this into "res += -x[i]".  Avoid changing
       gimple instruction for the first simple tests and only do this
@@ -3170,16 +3344,6 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
        return def_stmt_info;
      }
  
-  /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
-  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
-  while (first)
-    {
-      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
-      REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
-      REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
-      first = next;
-    }
-
    /* Look for the expression computing loop_arg from loop PHI result.  */
    if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
      return def_stmt_info;
@@ -3240,8 +3404,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
           iterations are unknown, count a taken branch per peeled loop.  */
        retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
                                  NULL, 0, vect_prologue);
-      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
-                                NULL, 0, vect_epilogue);
+      retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
+                                 NULL, 0, vect_epilogue);
      }
    else
      {
@@ -3309,7 +3473,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
    /* Cost model disabled.  */
    if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
      {
-      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
        *ret_min_profitable_niters = 0;
        *ret_min_profitable_estimate = 0;
        return;
@@ -3322,9 +3487,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
        unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
        (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
                             vect_prologue);
-      dump_printf (MSG_NOTE,
-                   "cost model: Adding cost of checks for loop "
-                   "versioning to treat misalignment.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE,
+                    "cost model: Adding cost of checks for loop "
+                    "versioning to treat misalignment.\n");
      }
  
    /* Requires loop versioning with alias checks.  */
@@ -3351,9 +3517,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
                                 NULL, 0, vect_prologue);
         }
-      dump_printf (MSG_NOTE,
-                   "cost model: Adding cost of checks for loop "
-                   "versioning aliasing.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE,
+                    "cost model: Adding cost of checks for loop "
+                    "versioning aliasing.\n");
      }
  
    /* Requires loop versioning with niter checks.  */
@@ -3362,9 +3529,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
        /*  FIXME: Make cost depend on complexity of individual check.  */
        (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
                             vect_prologue);
-      dump_printf (MSG_NOTE,
-                  "cost model: Adding cost of checks for loop "
-                  "versioning niters.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE,
+                    "cost model: Adding cost of checks for loop "
+                    "versioning niters.\n");
      }
  
    if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
@@ -3412,15 +3580,17 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
    else if (npeel < 0)
      {
        peel_iters_prologue = assumed_vf / 2;
-      dump_printf (MSG_NOTE, "cost model: "
-                   "prologue peel iters set to vf/2.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "prologue peel iters set to vf/2.\n");
  
        /* If peeling for alignment is unknown, loop bound of main loop becomes
           unknown.  */
        peel_iters_epilogue = assumed_vf / 2;
-      dump_printf (MSG_NOTE, "cost model: "
-                   "epilogue peel iters set to vf/2 because "
-                   "peeling for alignment is unknown.\n");
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "epilogue peel iters set to vf/2 because "
+                    "peeling for alignment is unknown.\n");
  
        /* If peeled iterations are unknown, count a taken branch and a not taken
           branch per peeled loop. Even if scalar loop iterations are known,
@@ -3579,14 +3749,89 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
    /* Calculate number of iterations required to make the vector version
       profitable, relative to the loop bodies only.  The following condition
       must hold true:
-     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
+     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
       where
       SIC = scalar iteration cost, VIC = vector iteration cost,
       VOC = vector outside cost, VF = vectorization factor,
-     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
+     NPEEL = prologue iterations + epilogue iterations,
       SOC = scalar outside cost for run time cost model check.  */
  
-  if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
+  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
+                         - vec_inside_cost);
+  if (saving_per_viter <= 0)
+    {
+      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
+       warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
+                   "vectorization did not happen for a simd loop");
+
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "cost model: the vector iteration cost = %d "
+                        "divided by the scalar iteration cost = %d "
+                        "is greater or equal to the vectorization factor = %d"
+                         ".\n",
+                        vec_inside_cost, scalar_single_iter_cost, assumed_vf);
+      *ret_min_profitable_niters = -1;
+      *ret_min_profitable_estimate = -1;
+      return;
+    }
+
+  /* ??? The "if" arm is written to handle all cases; see below for what
+     we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* Rewriting the condition above in terms of the number of
+        vector iterations (vniters) rather than the number of
+        scalar iterations (niters) gives:
+
+        SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
+
+        <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
+
+        For integer N, X and Y when X > 0:
+
+        N * X > Y <==> N >= (Y /[floor] X) + 1.  */
+      int outside_overhead = (vec_outside_cost
+                             - scalar_single_iter_cost * peel_iters_prologue
+                             - scalar_single_iter_cost * peel_iters_epilogue
+                             - scalar_outside_cost);
+      /* We're only interested in cases that require at least one
+        vector iteration.  */
+      int min_vec_niters = 1;
+      if (outside_overhead > 0)
+       min_vec_niters = outside_overhead / saving_per_viter + 1;
+
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
+                    min_vec_niters);
+
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         /* Now that we know the minimum number of vector iterations,
+            find the minimum niters for which the scalar cost is larger:
+
+            SIC * niters > VIC * vniters + VOC - SOC
+
+            We know that the minimum niters is no more than
+            vniters * VF + NPEEL, but it might be (and often is) less
+            than that if a partial vector iteration is cheaper than the
+            equivalent scalar code.  */
+         int threshold = (vec_inside_cost * min_vec_niters
+                          + vec_outside_cost
+                          - scalar_outside_cost);
+         if (threshold <= 0)
+           min_profitable_iters = 1;
+         else
+           min_profitable_iters = threshold / scalar_single_iter_cost + 1;
+       }
+      else
+       /* Convert the number of vector iterations into a number of
+          scalar iterations.  */
+       min_profitable_iters = (min_vec_niters * assumed_vf
+                               + peel_iters_prologue
+                               + peel_iters_epilogue);
+    }
+  else
      {
        min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
                               * assumed_vf
@@ -3596,8 +3841,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
          min_profitable_iters = 0;
        else
         {
-         min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
-                                  - vec_inside_cost);
+         min_profitable_iters /= saving_per_viter;
  
           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
               <= (((int) vec_inside_cost * min_profitable_iters)
@@ -3606,28 +3850,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
             min_profitable_iters++;
         }
      }
-  /* vector version will never be profitable.  */
-  else
-    {
-      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
-       warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
-                   "vectorization did not happen for a simd loop");
  
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "cost model: the vector iteration cost = %d "
-                        "divided by the scalar iteration cost = %d "
-                        "is greater or equal to the vectorization factor = %d"
-                         ".\n",
-                        vec_inside_cost, scalar_single_iter_cost, assumed_vf);
-      *ret_min_profitable_niters = -1;
-      *ret_min_profitable_estimate = -1;
-      return;
-    }
-
-  dump_printf (MSG_NOTE,
-              "  Calculated minimum iters for profitability: %d\n",
-              min_profitable_iters);
+  if (dump_enabled_p ())
+    dump_printf (MSG_NOTE,
+                "  Calculated minimum iters for profitability: %d\n",
+                min_profitable_iters);
  
    if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
        && min_profitable_iters < (assumed_vf + peel_iters_prologue))
@@ -3646,10 +3873,34 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
  
       Non-vectorized variant is SIC * niters and it must win over vector
       variant on the expected loop trip count.  The following condition must hold true:
-     SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
+     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
  
    if (vec_outside_cost <= 0)
      min_profitable_estimate = 0;
+  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* This is a repeat of the code above, but with + SOC rather
+        than - SOC.  */
+      int outside_overhead = (vec_outside_cost
+                             - scalar_single_iter_cost * peel_iters_prologue
+                             - scalar_single_iter_cost * peel_iters_epilogue
+                             + scalar_outside_cost);
+      int min_vec_niters = 1;
+      if (outside_overhead > 0)
+       min_vec_niters = outside_overhead / saving_per_viter + 1;
+
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         int threshold = (vec_inside_cost * min_vec_niters
+                          + vec_outside_cost
+                          + scalar_outside_cost);
+         min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
+       }
+      else
+       min_profitable_estimate = (min_vec_niters * assumed_vf
+                                  + peel_iters_prologue
+                                  + peel_iters_epilogue);
+    }
    else
      {
        min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
@@ -3726,7 +3977,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
    tree vectype;
    machine_mode mode;
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = NULL;
+  class loop *loop = NULL;
  
    if (loop_vinfo)
      loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -3957,7 +4208,7 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
                                 tree *adjustment_def)
  {
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    tree scalar_type = TREE_TYPE (init_val);
    tree vectype = get_vectype_for_scalar_type (scalar_type);
    enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
@@ -4076,14 +4327,9 @@ get_initial_defs_for_reduction (slp_tree slp_node,
    unsigned HOST_WIDE_INT nunits;
    unsigned j, number_of_places_left_in_vector;
    tree vector_type;
-  tree vop;
-  int group_size = stmts.length ();
-  unsigned int vec_num, i;
-  unsigned number_of_copies = 1;
-  vec<tree> voprnds;
-  voprnds.create (number_of_vectors);
-  struct loop *loop;
-  auto_vec<tree, 16> permute_results;
+  unsigned int group_size = stmts.length ();
+  unsigned int i;
+  class loop *loop;
  
    vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
  
@@ -4114,116 +4360,76 @@ get_initial_defs_for_reduction (slp_tree slp_node,
    if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
      nunits = group_size;
  
-  number_of_copies = nunits * number_of_vectors / group_size;
-
    number_of_places_left_in_vector = nunits;
    bool constant_p = true;
    tree_vector_builder elts (vector_type, nunits, 1);
    elts.quick_grow (nunits);
-  for (j = 0; j < number_of_copies; j++)
+  gimple_seq ctor_seq = NULL;
+  for (j = 0; j < nunits * number_of_vectors; ++j)
      {
-      for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
-        {
-         tree op;
-         /* Get the def before the loop.  In reduction chain we have only
-            one initial value.  */
-         if ((j != (number_of_copies - 1)
-              || (reduc_chain && i != 0))
-             && neutral_op)
-           op = neutral_op;
-         else
-           op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
+      tree op;
+      i = j % group_size;
+      stmt_vinfo = stmts[i];
+
+      /* Get the def before the loop.  In reduction chain we have only
+        one initial value.  Else we have as many as PHIs in the group.  */
+      if (reduc_chain)
+       op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
+      else if (((vec_oprnds->length () + 1) * nunits
+               - number_of_places_left_in_vector >= group_size)
+              && neutral_op)
+       op = neutral_op;
+      else
+       op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
  
-          /* Create 'vect_ = {op0,op1,...,opn}'.  */
-          number_of_places_left_in_vector--;
-         elts[number_of_places_left_in_vector] = op;
-         if (!CONSTANT_CLASS_P (op))
-           constant_p = false;
+      /* Create 'vect_ = {op0,op1,...,opn}'.  */
+      number_of_places_left_in_vector--;
+      elts[nunits - number_of_places_left_in_vector - 1] = op;
+      if (!CONSTANT_CLASS_P (op))
+       constant_p = false;
  
-          if (number_of_places_left_in_vector == 0)
-            {
-             gimple_seq ctor_seq = NULL;
-             tree init;
-             if (constant_p && !neutral_op
-                 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
-                 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
-               /* Build the vector directly from ELTS.  */
-               init = gimple_build_vector (&ctor_seq, &elts);
-             else if (neutral_op)
-               {
-                 /* Build a vector of the neutral value and shift the
-                    other elements into place.  */
-                 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
-                                                      neutral_op);
-                 int k = nunits;
-                 while (k > 0 && elts[k - 1] == neutral_op)
-                   k -= 1;
-                 while (k > 0)
-                   {
-                     k -= 1;
-                     init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
-                                          vector_type, init, elts[k]);
-                   }
-               }
-             else
+      if (number_of_places_left_in_vector == 0)
+       {
+         tree init;
+         if (constant_p && !neutral_op
+             ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
+             : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
+           /* Build the vector directly from ELTS.  */
+           init = gimple_build_vector (&ctor_seq, &elts);
+         else if (neutral_op)
+           {
+             /* Build a vector of the neutral value and shift the
+                other elements into place.  */
+             init = gimple_build_vector_from_val (&ctor_seq, vector_type,
+                                                  neutral_op);
+             int k = nunits;
+             while (k > 0 && elts[k - 1] == neutral_op)
+               k -= 1;
+             while (k > 0)
                 {
-                 /* First time round, duplicate ELTS to fill the
-                    required number of vectors, then cherry pick the
-                    appropriate result for each iteration.  */
-                 if (vec_oprnds->is_empty ())
-                   duplicate_and_interleave (&ctor_seq, vector_type, elts,
-                                             number_of_vectors,
-                                             permute_results);
-                 init = permute_results[number_of_vectors - j - 1];
+                 k -= 1;
+                 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
+                                      vector_type, init, elts[k]);
                 }
-             if (ctor_seq != NULL)
-               gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
-             voprnds.quick_push (init);
-
-              number_of_places_left_in_vector = nunits;
-             elts.new_vector (vector_type, nunits, 1);
-             elts.quick_grow (nunits);
-             constant_p = true;
-            }
-        }
-    }
-
-  /* Since the vectors are created in the reverse order, we should invert
-     them.  */
-  vec_num = voprnds.length ();
-  for (j = vec_num; j != 0; j--)
-    {
-      vop = voprnds[j - 1];
-      vec_oprnds->quick_push (vop);
-    }
-
-  voprnds.release ();
-
-  /* In case that VF is greater than the unrolling factor needed for the SLP
-     group of stmts, NUMBER_OF_VECTORS to be created is greater than
-     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
-     to replicate the vectors.  */
-  tree neutral_vec = NULL;
-  while (number_of_vectors > vec_oprnds->length ())
-    {
-      if (neutral_op)
-        {
-          if (!neutral_vec)
+           }
+         else
             {
-             gimple_seq ctor_seq = NULL;
-             neutral_vec = gimple_build_vector_from_val
-               (&ctor_seq, vector_type, neutral_op);
-             if (ctor_seq != NULL)
-               gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
+             /* First time round, duplicate ELTS to fill the
+                required number of vectors.  */
+             duplicate_and_interleave (&ctor_seq, vector_type, elts,
+                                       number_of_vectors, *vec_oprnds);
+             break;
             }
-          vec_oprnds->quick_push (neutral_vec);
-        }
-      else
-        {
-          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
-            vec_oprnds->quick_push (vop);
-        }
+         vec_oprnds->quick_push (init);
+
+         number_of_places_left_in_vector = nunits;
+         elts.new_vector (vector_type, nunits, 1);
+         elts.quick_grow (nunits);
+         constant_p = true;
+       }
      }
+  if (ctor_seq != NULL)
+    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
  }
  
  
@@ -4311,7 +4517,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
    tree vectype;
    machine_mode mode;
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
    basic_block exit_bb;
    tree scalar_dest;
    tree scalar_type;
@@ -4906,7 +5112,6 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
           if (off != 0)
             {
               tree new_idx_val = idx_val;
-             tree new_val = val;
               if (off != v_size - el_size)
                 {
                   new_idx_val = make_ssa_name (idx_eltype);
@@ -4915,7 +5120,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
                                                      old_idx_val);
                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
                 }
-             new_val = make_ssa_name (data_eltype);
+             tree new_val = make_ssa_name (data_eltype);
               epilog_stmt = gimple_build_assign (new_val,
                                                  COND_EXPR,
                                                  build2 (GT_EXPR,
@@ -5096,14 +5301,13 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
          in a vector mode of smaller size and first reduce upper/lower
          halves against each other.  */
        enum machine_mode mode1 = mode;
-      tree vectype1 = vectype;
        unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
        unsigned sz1 = sz;
        if (!slp_reduc
           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
         sz1 = GET_MODE_SIZE (mode1).to_constant ();
  
-      vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
+      tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
        reduce_with_shift = have_whole_vector_shift (mode1);
        if (!VECTOR_MODE_P (mode1))
         reduce_with_shift = false;
@@ -5224,7 +5428,6 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs,
              dump_printf_loc (MSG_NOTE, vect_location,
                              "Reduce using vector shifts\n");
  
-         mode1 = TYPE_MODE (vectype1);
            vec_dest = vect_create_destination_var (scalar_dest, vectype1);
            for (elt_offset = nelements / 2;
                 elt_offset >= 1;
@@ -5517,13 +5720,6 @@ vect_finalize_reduction:
                 = loop_vinfo->lookup_stmt (exit_phi);
                gphi *vect_phi;
  
-              /* FORNOW. Currently not supporting the case that an inner-loop
-                 reduction is not used in the outer-loop (but only outside the
-                 outer-loop), unless it is double reduction.  */
-              gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
-                           && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
-                          || double_reduc);
-
               if (double_reduc)
                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
               else
@@ -5716,6 +5912,30 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
    return lhs;
  }
  
+/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
+   type of the vector input.  */
+
+static internal_fn
+get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
+{
+  internal_fn mask_reduc_fn;
+
+  switch (reduc_fn)
+    {
+    case IFN_FOLD_LEFT_PLUS:
+      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
+      break;
+
+    default:
+      return IFN_LAST;
+    }
+
+  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
+                                     OPTIMIZE_FOR_SPEED))
+    return mask_reduc_fn;
+  return IFN_LAST;
+}
+
  /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
     statement that sets the live-out value.  REDUC_DEF_STMT is the phi
     statement.  CODE is the operation performed by STMT_INFO and OPS are
@@ -5735,9 +5955,10 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
                                int reduc_index, vec_loop_masks *masks)
  {
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
    stmt_vec_info new_stmt_info = NULL;
+  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
  
    int ncopies;
    if (slp_node)
@@ -5763,8 +5984,14 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
    auto_vec<tree> vec_oprnds0;
    if (slp_node)
      {
-      vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
-                        slp_node);
+      auto_vec<vec<tree> > vec_defs (2);
+      auto_vec<tree> sops(2);
+      sops.quick_push (ops[0]);
+      sops.quick_push (ops[1]);
+      vect_get_slp_defs (sops, slp_node, &vec_defs);
+      vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
+      vec_defs[0].release ();
+      vec_defs[1].release ();
        group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
        scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
      }
@@ -5808,16 +6035,21 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
           def0 = negated;
         }
  
-      if (mask)
+      if (mask && mask_reduc_fn == IFN_LAST)
         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
                                     vector_identity);
  
        /* On the first iteration the input is simply the scalar phi
          result, and for subsequent iterations it is the output of
          the preceding operation.  */
-      if (reduc_fn != IFN_LAST)
+      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
         {
-         new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
+         if (mask && mask_reduc_fn != IFN_LAST)
+           new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
+                                                  def0, mask);
+         else
+           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
+                                                  def0);
           /* For chained SLP reductions the output of the previous reduction
              operation serves as the input of the next. For the final statement
              the output cannot be a temporary - we reuse the original
@@ -5837,7 +6069,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
           /* Remove the statement, so that we can use the same code paths
              as for statements that we've just created.  */
           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
-         gsi_remove (&tmp_gsi, false);
+         gsi_remove (&tmp_gsi, true);
         }
  
        if (i == vec_num - 1)
@@ -5866,7 +6098,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
     does not cause overflow.  */
  
  static bool
-is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
+is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
  {
    gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
    tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
@@ -5902,6 +6134,67 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
           <= TYPE_PRECISION (lhs_type));
  }
  
+/* Check if masking can be supported by inserting a conditional expression.
+   CODE is the code for the operation.  COND_FN is the conditional internal
+   function, if it exists.  VECTYPE_IN is the type of the vector input.  */
+static bool
+use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
+                        tree vectype_in)
+{
+  if (cond_fn != IFN_LAST
+      && direct_internal_fn_supported_p (cond_fn, vectype_in,
+                                        OPTIMIZE_FOR_SPEED))
+    return false;
+
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+    case SAD_EXPR:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* Insert a conditional expression to enable masked vectorization.  CODE is the
+   code for the operation.  VOP is the array of operands.  MASK is the loop
+   mask.  GSI is a statement iterator used to place the new conditional
+   expression.  */
+static void
+build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
+                     gimple_stmt_iterator *gsi)
+{
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+      {
+       tree vectype = TREE_TYPE (vop[1]);
+       tree zero = build_zero_cst (vectype);
+       tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+       gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+                                              mask, vop[1], zero);
+       gsi_insert_before (gsi, select, GSI_SAME_STMT);
+       vop[1] = masked_op1;
+       break;
+      }
+
+    case SAD_EXPR:
+      {
+       tree vectype = TREE_TYPE (vop[1]);
+       tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+       gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+                                              mask, vop[1], vop[0]);
+       gsi_insert_before (gsi, select, GSI_SAME_STMT);
+       vop[1] = masked_op1;
+       break;
+      }
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
  /* Function vectorizable_reduction.
  
     Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -5963,7 +6256,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
    tree vectype_in = NULL_TREE;
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    enum tree_code code, orig_code;
    internal_fn reduc_fn;
    machine_mode vec_mode;
@@ -5987,7 +6280,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
    bool nested_cycle = false, found_nested_cycle_def = false;
    bool double_reduc = false;
    basic_block def_bb;
-  struct loop * def_stmt_loop;
+  class loop * def_stmt_loop;
    tree def_arg;
    auto_vec<tree> vec_oprnds0;
    auto_vec<tree> vec_oprnds1;
@@ -6042,13 +6335,17 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
         return true;
  
        gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
+      code = gimple_assign_rhs_code (reduc_stmt);
        for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
         {
           tree op = gimple_op (reduc_stmt, k);
           if (op == phi_result)
             continue;
-         if (k == 1
-             && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
+         if (k == 1 && code == COND_EXPR)
+           continue;
+         bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
+         gcc_assert (is_simple_use);
+         if (dt == vect_constant_def || dt == vect_external_def)
             continue;
           if (!vectype_in
               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
@@ -6056,6 +6353,10 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
           break;
         }
+      /* For a nested cycle we might end up with an operation like
+         phi_result * phi_result.  */
+      if (!vectype_in)
+       vectype_in = STMT_VINFO_VECTYPE (stmt_info);
        gcc_assert (vectype_in);
  
        if (slp_node)
@@ -6191,7 +6492,14 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       The last use is the reduction variable.  In case of nested cycle this
       assumption is not true: we use reduc_index to record the index of the
       reduction variable.  */
-  stmt_vec_info reduc_def_info = NULL;
+  stmt_vec_info reduc_def_info;
+  if (orig_stmt_info)
+    reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
+  else
+    reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
+  gcc_assert (reduc_def_info);
+  gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
+  tree reduc_def = PHI_RESULT (reduc_def_phi);
    int reduc_index = -1;
    for (i = 0; i < op_type; i++)
      {
@@ -6204,9 +6512,9 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                                           &def_stmt_info);
        dt = dts[i];
        gcc_assert (is_simple_use);
-      if (dt == vect_reduction_def)
+      if (dt == vect_reduction_def
+         && ops[i] == reduc_def)
         {
-         reduc_def_info = def_stmt_info;
           reduc_index = i;
           continue;
         }
@@ -6227,10 +6535,10 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
            && !(dt == vect_nested_cycle && nested_cycle))
         return false;
  
-      if (dt == vect_nested_cycle)
+      if (dt == vect_nested_cycle
+         && ops[i] == reduc_def)
         {
           found_nested_cycle_def = true;
-         reduc_def_info = def_stmt_info;
           reduc_index = i;
         }
  
@@ -6266,20 +6574,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                              "in-order reduction chain without SLP.\n");
           return false;
         }
-
-      if (orig_stmt_info)
-       reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
-      else
-       reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
      }
  
-  if (! reduc_def_info)
-    return false;
-
-  gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
-  if (!reduc_def_phi)
-    return false;
-
    if (!(reduc_index == -1
         || dts[reduc_index] == vect_reduction_def
         || dts[reduc_index] == vect_nested_cycle
@@ -6446,13 +6742,37 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
    vec_mode = TYPE_MODE (vectype_in);
    poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
  
+  if (nested_cycle)
+    {
+      def_bb = gimple_bb (reduc_def_phi);
+      def_stmt_loop = def_bb->loop_father;
+      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                                       loop_preheader_edge (def_stmt_loop));
+      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
+      if (def_arg_stmt_info
+         && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
+             == vect_double_reduction_def))
+        double_reduc = true;
+    }
+
+  vect_reduction_type reduction_type
+    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
+  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
+      && ncopies > 1)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "multiple types in double reduction or condition "
+                        "reduction.\n");
+      return false;
+    }
+
    if (code == COND_EXPR)
      {
        /* Only call during the analysis stage, otherwise we'll lose
          STMT_VINFO_TYPE.  */
        if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
-                                               ops[reduc_index], 0, NULL,
-                                               cost_vec))
+                                               true, NULL, cost_vec))
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6460,20 +6780,26 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
           return false;
          }
      }
-  else
+  else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
+          || code == LROTATE_EXPR || code == RROTATE_EXPR)
      {
-      /* 4. Supportable by target?  */
-
-      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
-         || code == LROTATE_EXPR || code == RROTATE_EXPR)
+      /* Only call during the analysis stage, otherwise we'll lose
+        STMT_VINFO_TYPE.  We only support this for nested cycles
+        without double reductions at the moment.  */
+      if (!nested_cycle
+         || double_reduc
+         || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
+                                               NULL, cost_vec)))
         {
-         /* Shifts and rotates are only supported by vectorizable_shifts,
-            not vectorizable_reduction.  */
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported shift or rotation.\n");
+                            "unsupported shift or rotation in reduction\n");
           return false;
         }
+    }
+  else
+    {
+      /* 4. Supportable by target?  */
  
        /* 4.1. check support for the operation in the loop  */
        optab = optab_for_tree_code (code, vectype_in, optab_default);
@@ -6546,8 +6872,6 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
            (and also the same tree-code) when generating the epilog code and
            when generating the code inside the loop.  */
  
-  vect_reduction_type reduction_type
-    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
    if (orig_stmt_info
        && (reduction_type == TREE_CODE_REDUCTION
           || reduction_type == FOLD_LEFT_REDUCTION))
@@ -6578,19 +6902,6 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
         orig_code = cond_reduc_op_code;
      }
  
-  if (nested_cycle)
-    {
-      def_bb = gimple_bb (reduc_def_phi);
-      def_stmt_loop = def_bb->loop_father;
-      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                       loop_preheader_edge (def_stmt_loop));
-      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
-      if (def_arg_stmt_info
-         && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
-             == vect_double_reduction_def))
-        double_reduc = true;
-    }
-
    reduc_fn = IFN_LAST;
  
    if (reduction_type == TREE_CODE_REDUCTION
@@ -6650,16 +6961,6 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
        return false;
      }
  
-  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
-      && ncopies > 1)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "multiple types in double reduction or condition "
-                        "reduction.\n");
-      return false;
-    }
-
    /* For SLP reductions, see if there is a neutral value we can use.  */
    tree neutral_op = NULL_TREE;
    if (slp_node)
@@ -6867,6 +7168,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
  
    internal_fn cond_fn = get_conditional_internal_fn (code);
    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
  
    if (!vec_stmt) /* transformation not required.  */
      {
@@ -6874,6 +7176,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
        if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
         {
           if (reduction_type != FOLD_LEFT_REDUCTION
+             && !mask_by_cond_expr
               && (cond_fn == IFN_LAST
                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
                                                       OPTIMIZE_FOR_SPEED)))
@@ -6924,7 +7227,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
      {
        gcc_assert (!slp_node);
        return vectorizable_condition (stmt_info, gsi, vec_stmt,
-                                    NULL, reduc_index, NULL, NULL);
+                                    true, NULL, NULL);
      }
  
    /* Create the destination vector  */
@@ -6956,11 +7259,15 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
          {
            gcc_assert (!slp_node);
           vectorizable_condition (stmt_info, gsi, vec_stmt,
-                                 PHI_RESULT (phis[0]->stmt),
-                                 reduc_index, NULL, NULL);
-          /* Multiple types are not supported for condition.  */
+                                 true, NULL, NULL);
            break;
          }
+      if (code == LSHIFT_EXPR
+         || code == RSHIFT_EXPR)
+       {
+         vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
+         break;
+       }
  
        /* Handle uses.  */
        if (j == 0)
@@ -7033,7 +7340,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
        FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
          {
           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-         if (masked_loop_p)
+         if (masked_loop_p && !mask_by_cond_expr)
             {
               /* Make sure that the reduction accumulator is vop[0].  */
               if (reduc_index == 1)
@@ -7057,6 +7364,14 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
               if (op_type == ternary_op)
                 vop[2] = vec_oprnds2[i];
  
+             if (masked_loop_p && mask_by_cond_expr)
+               {
+                 tree mask = vect_get_loop_mask (gsi, masks,
+                                                 vec_num * ncopies,
+                                                 vectype_in, i * ncopies + j);
+                 build_vect_cond_expr (code, vop, mask, gsi);
+               }
+
               gassign *new_stmt = gimple_build_assign (vec_dest, code,
                                                        vop[0], vop[1], vop[2]);
               new_temp = make_ssa_name (vec_dest, new_stmt);
@@ -7153,10 +7468,10 @@ vectorizable_induction (stmt_vec_info stmt_info,
                         stmt_vector_for_cost *cost_vec)
  {
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    unsigned ncopies;
    bool nested_in_vect_loop = false;
-  struct loop *iv_loop;
+  class loop *iv_loop;
    tree vec_def;
    edge pe = loop_preheader_edge (loop);
    basic_block new_bb;
@@ -7697,7 +8012,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
                              stmt_vector_for_cost *)
  {
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    imm_use_iterator imm_iter;
    tree lhs, lhs_type, bitsize, vec_bitsize;
    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
@@ -7905,7 +8220,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
  /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
  
  static void
-vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
+vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
  {
    ssa_op_iter op_iter;
    imm_use_iterator imm_iter;
@@ -7961,7 +8276,7 @@ loop_niters_no_overflow (loop_vec_info loop_vinfo)
      }
  
    widest_int max;
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    /* Check the upper bound of loop niters.  */
    if (get_max_loop_iterations (loop, &max))
      {
@@ -8069,7 +8384,7 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
     by factor VF.  */
  
  static void
-scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
+scale_profile_for_vect_loop (class loop *loop, unsigned vf)
  {
    edge preheader = loop_preheader_edge (loop);
    /* Reduce loop iterations by the vectorization factor.  */
@@ -8107,7 +8422,7 @@ static void
  vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  
    if (dump_enabled_p ())
@@ -8152,11 +8467,11 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
     stmts in the loop, and update the loop exit condition.
     Returns scalar epilogue loop if any.  */
  
-struct loop *
+class loop *
  vect_transform_loop (loop_vec_info loop_vinfo)
  {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  struct loop *epilogue = NULL;
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  class loop *epilogue = NULL;
    basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
    int nbbs = loop->num_nodes;
    int i;
@@ -8194,7 +8509,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
    edge e = single_exit (loop);
    if (! single_pred_p (e->dest))
      {
-      split_loop_exit_edge (e);
+      split_loop_exit_edge (e, true);
        if (dump_enabled_p ())
         dump_printf (MSG_NOTE, "split exit edge\n");
      }
@@ -8213,8 +8528,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                                               versioning_threshold);
           check_profitability = false;
         }
-      vect_loop_versioning (loop_vinfo, th, check_profitability,
-                           versioning_threshold);
+      class loop *sloop
+       = vect_loop_versioning (loop_vinfo, th, check_profitability,
+                               versioning_threshold);
+      sloop->force_vectorize = false;
        check_profitability = false;
      }
  
@@ -8228,7 +8545,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
        e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
        if (! single_pred_p (e->dest))
         {
-         split_loop_exit_edge (e);
+         split_loop_exit_edge (e, true);
           if (dump_enabled_p ())
             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
         }
@@ -8241,6 +8558,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)
    epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
                               &step_vector, &niters_vector_mult_vf, th,
                               check_profitability, niters_no_overflow);
+  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
+      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
+    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
+                           LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
  
    if (niters_vector == NULL_TREE)
      {
@@ -8474,6 +8795,15 @@ vect_transform_loop (loop_vec_info loop_vinfo)
         }
      }
  
+  /* Loops vectorized with a variable factor won't benefit from
+     unrolling/peeling.  */
+  if (!vf.is_constant ())
+    {
+      loop->unroll = 1;
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
+                        " variable-length vectorization factor\n");
+    }
    /* Free SLP instances here because otherwise stmt reference counting
       won't work.  */
    slp_instance instance;
@@ -8494,18 +8824,21 @@ vect_transform_loop (loop_vec_info loop_vinfo)
    if (epilogue)
      {
        auto_vector_sizes vector_sizes;
-      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
        unsigned int next_size = 0;
  
+      /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
+         on niters already ajusted for the iterations of the prologue.  */
        if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-         && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
           && known_eq (vf, lowest_vf))
         {
-         unsigned int eiters
+         unsigned HOST_WIDE_INT eiters
             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
-              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
-         eiters = eiters % lowest_vf;
+              - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+         eiters
+           = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
           epilogue->nb_iterations_upper_bound = eiters - 1;
+         epilogue->any_upper_bound = true;
  
           unsigned int ratio;
           while (next_size < vector_sizes.length ()
@@ -8561,16 +8894,17 @@ vect_transform_loop (loop_vec_info loop_vinfo)
  */
  
  void
-optimize_mask_stores (struct loop *loop)
+optimize_mask_stores (class loop *loop)
  {
    basic_block *bbs = get_loop_body (loop);
    unsigned nbbs = loop->num_nodes;
    unsigned i;
    basic_block bb;
-  struct loop *bb_loop;
+  class loop *bb_loop;
    gimple_stmt_iterator gsi;
    gimple *stmt;
    auto_vec<gimple *> worklist;
+  auto_purge_vect_location sentinel;
  
    vect_location = find_loop_location (loop);
    /* Pick up all masked stores in loop if any.  */
@@ -8745,3 +9079,45 @@ optimize_mask_stores (struct loop *loop)
        add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
      }
  }
+
+/* Decide whether it is possible to use a zero-based induction variable
+   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
+   return the value that the induction variable must be able to hold
+   in order to ensure that the loop ends with an all-false mask.
+   Return -1 otherwise.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+       {
+         /* Add the maximum number of skipped iterations to the
+            maximum iteration count.  */
+         if (TREE_CODE (niters_skip) == INTEGER_CST)
+           iv_limit += wi::to_widest (niters_skip);
+         else
+           iv_limit += max_vf - 1;
+       }
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+       /* Make a conservatively-correct assumption.  */
+       iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+        the maximum in-range IV value.  Round this value down to the previous
+        vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
+