/* Reduction chain backedge defs are filled manually.
??? Need a better way to identify a SLP reduction chain PHI.
Or a better overall way to SLP match those. */
- if (all_same && def_type == vect_reduction_def)
+ if (stmts.length () > 1
+ && all_same && def_type == vect_reduction_def)
skip_args[loop_latch_edge (loop)->dest_idx] = true;
}
else if (def_type != vect_internal_def)
}
/* Find SLP sequences starting from groups of reductions. */
- if (loop_vinfo->reductions.length () > 1)
+ if (loop_vinfo->reductions.length () > 0)
{
- /* Collect reduction statements. */
+ /* Collect reduction statements we can combine into
+ a SLP reduction. */
vec<stmt_vec_info> scalar_stmts;
scalar_stmts.create (loop_vinfo->reductions.length ());
for (auto next_info : loop_vinfo->reductions)
reduction path. In that case we'd have to reverse
engineer that conversion stmt following the chain using
reduc_idx and from the PHI using reduc_def. */
- && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
- /* Do not discover SLP reductions for lane-reducing ops, that
- will fail later. */
- && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
- || !lane_reducing_op_p (gimple_assign_rhs_code (g))))
- scalar_stmts.quick_push (next_info);
+ && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
+ {
+ /* Do not discover SLP reductions that combine lane-reducing
+ ops; those will fail later. */
+ if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
+ || !lane_reducing_op_p (gimple_assign_rhs_code (g)))
+ scalar_stmts.quick_push (next_info);
+ else
+ {
+ /* Do SLP discovery for single-lane reductions. */
+ vec<stmt_vec_info> stmts;
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (next_info);
+ vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, NULL);
+ }
+ }
}
- if (scalar_stmts.length () > 1)
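+ /* Keep a copy of the collected statements so they can be
+ re-processed as single-lane reductions when no SLP reduction
+ group is built from them. */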
+ vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ if (scalar_stmts.length () <= 1
+ || !vect_build_slp_instance (loop_vinfo,
+ slp_inst_kind_reduc_group,
+ scalar_stmts, roots, remain,
+ max_tree_size, &limit, bst_map,
+ NULL))
{
- vec<stmt_vec_info> roots = vNULL;
- vec<tree> remain = vNULL;
- vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group,
- scalar_stmts, roots, remain,
- max_tree_size, &limit, bst_map, NULL);
+ if (scalar_stmts.length () <= 1)
+ scalar_stmts.release ();
+ /* Do SLP discovery for single-lane reductions. */
+ for (auto stmt_info : saved_stmts)
+ {
+ vec<stmt_vec_info> stmts;
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
+ vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, &limit,
+ bst_map, NULL);
+ }
+ saved_stmts.release ();
}
- else
- scalar_stmts.release ();
}
}
}
overrun_p = true;
}
+
+ /* If this is single-element interleaving with an element
+ distance that leaves unused vector loads around, punt: we
+ would at least create very sub-optimal code in that case
+ (and blow up memory, see PR65518). */
+ if (loop_vinfo
+ && *memory_access_type == VMAT_CONTIGUOUS
+ && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+ && single_element_p
+ && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads\n");
+ return false;
+ }
}
}
else
gcc_assert (ncopies >= 1);
/* FORNOW. This restriction should be relaxed. */
- if (loop && nested_in_vect_loop_p (loop, stmt_info) && ncopies > 1)
+ if (loop
+ && nested_in_vect_loop_p (loop, stmt_info)
+ && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
gcc_assert (ncopies >= 1);
/* FORNOW. This restriction should be relaxed. */
- if (nested_in_vect_loop && ncopies > 1)
+ if (nested_in_vect_loop
+ && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,