2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "tree-pass.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
69 as if it was manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 for (i=0; i<N/8; i++){
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS which base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors, for now will need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations which vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
156 static stmt_vec_info
vect_is_simple_reduction (loop_vec_info
, stmt_vec_info
,
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
164 vect_determine_vf_for_stmt_1 (vec_info
*vinfo
, stmt_vec_info stmt_info
,
165 bool vectype_maybe_set_p
,
168 gimple
*stmt
= stmt_info
->stmt
;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
171 && !STMT_VINFO_LIVE_P (stmt_info
))
172 || gimple_clobber_p (stmt
))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype
, nunits_vectype
;
180 opt_result res
= vect_get_vector_types_for_stmt (vinfo
, stmt_info
,
188 if (STMT_VINFO_VECTYPE (stmt_info
))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
193 || vectype_maybe_set_p
)
194 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
196 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
200 vect_update_max_nunits (vf
, nunits_vectype
);
202 return opt_result::success ();
/* NOTE(review): fragmented extraction with missing original lines
   (e.g. 209-210, 216, 218-220, 237-240, 244-249); confirm against
   upstream tree-vect-loop.c.

   Visible behavior: runs vect_determine_vf_for_stmt_1 on STMT_INFO,
   then, when the statement is part of a recognized pattern
   (STMT_VINFO_IN_PATTERN_P with a related stmt), walks the pattern
   def sequence and the pattern statement itself, analyzing each with
   vectype_maybe_set_p = true.  Returns opt_result::success () at the
   end of the visible path.  */
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
211 vect_determine_vf_for_stmt (vec_info
*vinfo
,
212 stmt_vec_info stmt_info
, poly_uint64
*vf
)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
/* Analyze the original statement first (not yet a pattern stmt).  */
217 opt_result res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, false, vf
);
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
222 && STMT_VINFO_RELATED_STMT (stmt_info
))
224 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
/* Switch over to the pattern statement that replaces STMT_INFO.  */
225 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
229 !gsi_end_p (si
); gsi_next (&si
))
231 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE
, vect_location
,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info
->stmt
);
236 res
= vect_determine_vf_for_stmt_1 (vinfo
, def_stmt_info
, true, vf
);
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE
, vect_location
,
243 "==> examining pattern statement: %G",
245 res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, true, vf
);
250 return opt_result::success ();
/* NOTE(review): fragmented extraction; several original lines are
   missing (loop bodies, some dump arguments, braces) -- restore from
   upstream tree-vect-loop.c before relying on this text.

   Visible behavior: iterates over every basic block of the loop; for
   each PHI that is relevant or live it derives a vectype from the PHI
   result type via get_vectype_for_scalar_type and folds it into the
   running vectorization factor; for each non-PHI statement it defers
   to vect_determine_vf_for_stmt.  Fails (opt_result::failure_at) when
   no vectype can be produced or when the final factor is <= 1;
   otherwise stores the factor in LOOP_VINFO_VECT_FACTOR and returns
   success.  */
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4byte elements,
258 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
281 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
282 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
283 unsigned nbbs
= loop
->num_nodes
;
284 poly_uint64 vectorization_factor
= 1;
285 tree scalar_type
= NULL_TREE
;
288 stmt_vec_info stmt_info
;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
/* Walk every basic block of the loop body.  */
293 for (i
= 0; i
< nbbs
; i
++)
295 basic_block bb
= bbs
[i
];
/* First pass over this block: the loop-header PHIs.  */
297 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
301 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
306 gcc_assert (stmt_info
);
308 if (STMT_VINFO_RELEVANT_P (stmt_info
)
309 || STMT_VINFO_LIVE_P (stmt_info
))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
312 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE
, vect_location
,
316 "get vectype for scalar type: %T\n",
319 vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
/* No vector form of the PHI's type => the loop cannot be
   vectorized.  */
321 return opt_result::failure_at (phi
,
322 "not vectorized: unsupported "
325 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
334 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
335 dump_printf (MSG_NOTE
, "\n");
338 vect_update_max_nunits (&vectorization_factor
, vectype
);
/* Second pass: the ordinary statements of this block.  */
342 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
345 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
347 = vect_determine_vf_for_stmt (loop_vinfo
,
348 stmt_info
, &vectorization_factor
);
354 /* TODO: Analyze cost. Decide if worth while to vectorize. */
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
358 dump_dec (MSG_NOTE
, vectorization_factor
);
359 dump_printf (MSG_NOTE
, "\n");
/* A factor of 1 (or unknown) means nothing was vectorizable.  */
362 if (known_le (vectorization_factor
, 1U))
363 return opt_result::failure_at (vect_location
,
364 "not vectorized: unsupported data-type\n");
365 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
366 return opt_result::success ();
/* NOTE(review): fragmented extraction; the return statements and part
   of the parameter list (the *STEP out-parameter, per the &step caller
   at original line 510) are among the missing lines -- restore from
   upstream tree-vect-loop.c.

   Visible behavior: extracts the evolution part and initial condition
   of ACCESS_FN in loop LOOP_NB; bails out when there is no evolution
   or when the evolution is itself a chrec (degree >= 2); otherwise
   validates that the step is an invariant constant/SSA name defined
   outside the loop (float steps only with -fassociative-math).  */
370 /* Function vect_is_simple_iv_evolution.
372 FORNOW: A simple evolution of an induction variables in the loop is
373 considered a polynomial evolution. */
376 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
381 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
384 /* When there is no evolution in this loop, the evolution function
386 if (evolution_part
== NULL_TREE
)
389 /* When the evolution is a polynomial of degree >= 2
390 the evolution function is not "simple". */
391 if (tree_is_chrec (evolution_part
))
394 step_expr
= evolution_part
;
395 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
397 if (dump_enabled_p ())
398 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
399 step_expr
, init_expr
);
/* The step must be loop-invariant: an INTEGER_CST, an SSA name not
   defined inside this loop (with an integral type, or a float type
   under -fassociative-math), or a REAL_CST under -fassociative-math.  */
404 if (TREE_CODE (step_expr
) != INTEGER_CST
405 && (TREE_CODE (step_expr
) != SSA_NAME
406 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
407 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
408 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
409 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
410 || !flag_associative_math
)))
411 && (TREE_CODE (step_expr
) != REAL_CST
412 || !flag_associative_math
))
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
/* NOTE(review): fragmented extraction; the operand-iterator
   declarations and the return statements are among the missing lines
   -- restore from upstream tree-vect-loop.c.

   Visible behavior: scans every PHI argument; when an argument's
   defining statement inside the loop is classified as
   vect_double_reduction_def, the PHI participates in a double
   reduction (the surviving comment documents the intended true/false
   result).  */
423 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
424 what we are assuming is a double reduction. For example, given
425 a structure like this:
428 x_1 = PHI <x_4(outer2), ...>;
432 x_2 = PHI <x_1(outer1), ...>;
438 x_4 = PHI <x_3(inner)>;
441 outer loop analysis would treat x_1 as a double reduction phi and
442 this function would then return true for x_2. */
445 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo
, gphi
*phi
)
449 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
450 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
451 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
/* NOTE(review): fragmented extraction with many missing original
   lines (braces, `continue`s, parts of conditions such as the one at
   original lines 506-513) -- restore from upstream tree-vect-loop.c.

   Visible behavior, phase 1: for each loop-header PHI, skip virtual
   PHIs, analyze the scalar evolution of the PHI result, cache the
   evolution base/part in the stmt_vec_info, and either classify the
   PHI as vect_induction_def or push it on a worklist.  Phase 2: pop
   each worklist entry and classify it via vect_is_simple_reduction as
   a double reduction, nested cycle, or reduction (recording
   STMT_VINFO_REDUC_DEF links and pushing plain reductions onto
   LOOP_VINFO_REDUCTIONS); anything else is dumped as an unknown
   def-use cycle.  */
456 /* Function vect_analyze_scalar_cycles_1.
458 Examine the cross iteration def-use cycles of scalar variables
459 in LOOP. LOOP_VINFO represents the loop that is now being
460 considered for vectorization (can be LOOP, or an outer-loop
464 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, class loop
*loop
)
466 basic_block bb
= loop
->header
;
468 auto_vec
<stmt_vec_info
, 64> worklist
;
470 bool double_reduc
, reduc_chain
;
472 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
474 /* First - identify all inductions. Reduction detection assumes that all the
475 inductions have been identified, therefore, this order must not be
477 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
479 gphi
*phi
= gsi
.phi ();
480 tree access_fn
= NULL
;
481 tree def
= PHI_RESULT (phi
);
482 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
487 /* Skip virtual phi's. The data dependences that are associated with
488 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
489 if (virtual_operand_p (def
))
/* Until proven otherwise, the cycle kind is unknown.  */
492 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
494 /* Analyze the evolution function. */
495 access_fn
= analyze_scalar_evolution (loop
, def
);
498 STRIP_NOPS (access_fn
);
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE
, vect_location
,
501 "Access function of PHI: %T\n", access_fn
);
/* Cache the evolution's base and step for later phases.  */
502 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
503 = initial_condition_in_loop_num (access_fn
, loop
->num
);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
505 = evolution_part_in_loop_num (access_fn
, loop
->num
);
/* Not a simple induction => queue for reduction/cycle analysis.  */
509 || vect_inner_phi_in_double_reduction_p (loop_vinfo
, phi
)
510 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
511 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
512 && TREE_CODE (step
) != INTEGER_CST
))
514 worklist
.safe_push (stmt_vinfo
);
518 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
524 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
528 /* Second - identify all reductions and nested cycles. */
529 while (worklist
.length () > 0)
531 stmt_vec_info stmt_vinfo
= worklist
.pop ();
532 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
533 tree def
= PHI_RESULT (phi
);
535 if (dump_enabled_p ())
536 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
538 gcc_assert (!virtual_operand_p (def
)
539 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
541 stmt_vec_info reduc_stmt_info
542 = vect_is_simple_reduction (loop_vinfo
, stmt_vinfo
, &double_reduc
,
/* Link the PHI and the reduction statement to each other.  */
546 STMT_VINFO_REDUC_DEF (stmt_vinfo
) = reduc_stmt_info
;
547 STMT_VINFO_REDUC_DEF (reduc_stmt_info
) = stmt_vinfo
;
550 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE
, vect_location
,
552 "Detected double reduction.\n");
554 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
555 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_double_reduction_def
;
/* Inside the inner loop of an outer-loop vectorization the cycle
   is a nested cycle rather than a reduction.  */
559 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
561 if (dump_enabled_p ())
562 dump_printf_loc (MSG_NOTE
, vect_location
,
563 "Detected vectorizable nested cycle.\n");
565 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE
, vect_location
,
571 "Detected reduction.\n");
573 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
574 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
575 /* Store the reduction cycles for possible vectorization in
576 loop-aware SLP if it was not detected as reduction
579 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
587 "Unknown def-use cycle pattern.\n");
/* NOTE(review): fragmented extraction; the guard around the inner-loop
   call (original line ~629, presumably `if (loop->inner)`) is missing
   -- TODO confirm against upstream tree-vect-loop.c.

   Visible behavior: runs vect_analyze_scalar_cycles_1 on the loop
   described by LOOP_VINFO, and again on its inner loop.  */
592 /* Function vect_analyze_scalar_cycles.
594 Examine the cross iteration def-use cycles of scalar variables, by
595 analyzing the loop-header PHIs of scalar variables. Classify each
596 cycle as one of the following: invariant, induction, reduction, unknown.
597 We do that for the loop represented by LOOP_VINFO, and also to its
598 inner-loop, if exists.
599 Examples for scalar cycles:
614 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
616 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
/* Analyze the outer (or only) loop first.  */
618 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
620 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
621 Reductions in such inner-loop therefore have different properties than
622 the reductions in the nest that gets vectorized:
623 1. When vectorized, they are executed in the same order as in the original
624 scalar loop, so we can't change the order of computation when
626 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
627 current checks are too strict. */
630 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
/* NOTE(review): fragmented extraction; the loop construct that walks
   the chain (around original lines 644-651) is missing, so the control
   flow below is incomplete -- restore from upstream tree-vect-loop.c.

   Visible behavior: copies reduction-group data from STMT_INFO's chain
   onto the corresponding pattern statements: the pattern first element
   gets the group size, and each pattern stmt in the chain gets
   REDUC_GROUP_FIRST_ELEMENT / REDUC_GROUP_NEXT_ELEMENT links pointing
   at the related pattern stmts.  */
633 /* Transfer group and reduction information from STMT_INFO to its
637 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
639 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
641 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
642 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
643 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
/* Walk the chain, mirroring group membership onto pattern stmts.  */
646 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
647 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp
)
648 == STMT_VINFO_DEF_TYPE (stmt_info
));
649 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
650 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
652 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
653 = STMT_VINFO_RELATED_STMT (stmt_info
);
/* NOTE(review): fragmented extraction; the loop over NEXT and part of
   the final condition (original lines 674, 679-680) are missing --
   restore from upstream tree-vect-loop.c.

   Visible behavior: for each recorded reduction chain whose first stmt
   was replaced by a pattern, verifies that the chain's stmts are all
   patterns with valid STMT_VINFO_REDUC_IDX; when so, transfers the
   chain onto the pattern stmts via vect_fixup_reduc_chain and
   re-points the LOOP_VINFO_REDUCTION_CHAINS entry at the pattern
   stmt.  */
658 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
666 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
667 if (STMT_VINFO_IN_PATTERN_P (first
))
669 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
672 if (! STMT_VINFO_IN_PATTERN_P (next
)
673 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next
)) == -1)
675 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
677 /* If not all stmt in the chain are patterns or if we failed
678 to update STMT_VINFO_REDUC_IDX try to handle the chain
681 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first
)) != -1)
683 vect_fixup_reduc_chain (first
);
684 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
685 = STMT_VINFO_RELATED_STMT (first
);
/* NOTE(review): fragmented extraction; early-return paths and parts of
   the may_be_zero folding (original lines 713-716, 720-721, 737,
   739-741, 745, 752-757) are missing -- restore from upstream
   tree-vect-loop.c.

   Visible behavior: queries number_of_iterations_exit_assumptions for
   the single loop exit, normalizes the may_be_zero condition (folding
   it either into the assumptions or into a COND_EXPR around niter),
   then stores the assumptions, the latch-iteration count (M1) and the
   header-execution count (latch count + 1) through the out-parameters,
   returning the loop exit condition per the surviving comment.  */
690 /* Function vect_get_loop_niters.
692 Determine how many iterations the loop is executed and place it
693 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
694 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
695 niter information holds in ASSUMPTIONS.
697 Return the loop exit condition. */
701 vect_get_loop_niters (class loop
*loop
, tree
*assumptions
,
702 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
704 edge exit
= single_exit (loop
);
705 class tree_niter_desc niter_desc
;
706 tree niter_assumptions
, niter
, may_be_zero
;
707 gcond
*cond
= get_loop_exit_condition (loop
);
/* Pessimistic defaults in case the analysis fails.  */
709 *assumptions
= boolean_true_node
;
710 *number_of_iterationsm1
= chrec_dont_know
;
711 *number_of_iterations
= chrec_dont_know
;
712 DUMP_VECT_SCOPE ("get_loop_niters");
717 may_be_zero
= NULL_TREE
;
718 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
719 || chrec_contains_undetermined (niter_desc
.niter
))
722 niter_assumptions
= niter_desc
.assumptions
;
723 may_be_zero
= niter_desc
.may_be_zero
;
724 niter
= niter_desc
.niter
;
/* A trivially-false may_be_zero can simply be dropped.  */
726 if (may_be_zero
&& integer_zerop (may_be_zero
))
727 may_be_zero
= NULL_TREE
;
731 if (COMPARISON_CLASS_P (may_be_zero
))
733 /* Try to combine may_be_zero with assumptions, this can simplify
734 computation of niter expression. */
735 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
736 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
738 fold_build1 (TRUTH_NOT_EXPR
,
/* Otherwise fold may_be_zero into the niter expression itself:
   niter = may_be_zero ? 0 : niter.  */
742 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
743 build_int_cst (TREE_TYPE (niter
), 0),
744 rewrite_to_non_trapping_overflow (niter
));
746 may_be_zero
= NULL_TREE
;
748 else if (integer_nonzerop (may_be_zero
))
/* The latch is provably never executed: 0 latch iterations,
   1 header execution.  */
750 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
751 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
758 *assumptions
= niter_assumptions
;
759 *number_of_iterationsm1
= niter
;
761 /* We want the number of loop header executions which is the number
762 of latch executions plus one.
763 ??? For UINT_MAX latch executions this number overflows to zero
764 for loops like do { n++; } while (n != 0); */
765 if (niter
&& !chrec_contains_undetermined (niter
))
766 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), unshare_expr (niter
),
767 build_int_cst (TREE_TYPE (niter
), 1));
768 *number_of_iterations
= niter
;
/* NOTE(review): fragmented extraction; the return type and the
   true/false return statements are missing -- restore from upstream
   tree-vect-loop.c.

   Visible behavior: dfs_enumerate_from callback -- casts DATA back to
   the loop and tests whether BB belongs to it via
   flow_bb_inside_loop_p.  */
773 /* Function bb_in_loop_p
775 Used as predicate for dfs order traversal of the loop bbs. */
778 bb_in_loop_p (const_basic_block bb
, const void *data
)
780 const class loop
*const loop
= (const class loop
*)data
;
781 if (flow_bb_inside_loop_p (loop
, bb
))
/* NOTE(review): fragmented extraction; several initializer-list
   entries (e.g. original lines 792, 798, 805, 807-809, 813, 822) and
   parts of the GOMP_SIMD_LANE condition are missing -- restore from
   upstream tree-vect-loop.c.

   Visible behavior: constructor for the per-loop vectorization info.
   It initializes the member fields shown, enumerates the loop's basic
   blocks in a DFS order (asserting complete coverage), zeroes the uid
   of every PHI and statement, inspects IFN_GOMP_SIMD_LANE calls for a
   `#pragma omp simd if (x)` condition (third argument), and creates
   the epilogue_vinfos vector.  */
787 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
788 stmt_vec_info structs for all the stmts in LOOP_IN. */
790 _loop_vec_info::_loop_vec_info (class loop
*loop_in
, vec_info_shared
*shared
)
791 : vec_info (vec_info::loop
, init_cost (loop_in
), shared
),
793 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
794 num_itersm1 (NULL_TREE
),
795 num_iters (NULL_TREE
),
796 num_iters_unchanged (NULL_TREE
),
797 num_iters_assumptions (NULL_TREE
),
799 versioning_threshold (0),
800 vectorization_factor (0),
801 max_vectorization_factor (0),
802 mask_skip_niters (NULL_TREE
),
803 mask_compare_type (NULL_TREE
),
804 simd_if_cond (NULL_TREE
),
806 peeling_for_alignment (0),
810 slp_unrolling_factor (1),
811 single_scalar_iteration_cost (0),
812 vec_outside_cost (0),
814 vectorizable (false),
815 can_fully_mask_p (true),
816 fully_masked_p (false),
817 peeling_for_gaps (false),
818 peeling_for_niter (false),
819 no_data_dependencies (false),
820 has_mask_store (false),
821 scalar_loop_scaling (profile_probability::uninitialized ()),
823 orig_loop_info (NULL
)
825 /* CHECKME: We want to visit all BBs before their successors (except for
826 latch blocks, for which this assertion wouldn't hold). In the simple
827 case of the loop forms we allow, a dfs order of the BBs would the same
828 as reversed postorder traversal, so we are safe. */
830 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
831 bbs
, loop
->num_nodes
, loop
);
832 gcc_assert (nbbs
== loop
->num_nodes
);
/* Reset stmt uids so the vectorizer can use them for its own
   bookkeeping.  */
834 for (unsigned int i
= 0; i
< nbbs
; i
++)
836 basic_block bb
= bbs
[i
];
837 gimple_stmt_iterator si
;
839 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
841 gimple
*phi
= gsi_stmt (si
);
842 gimple_set_uid (phi
, 0);
846 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
848 gimple
*stmt
= gsi_stmt (si
);
849 gimple_set_uid (stmt
, 0);
851 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
852 third argument is the #pragma omp simd if (x) condition, when 0,
853 loop shouldn't be vectorized, when non-zero constant, it should
854 be vectorized normally, otherwise versioned with vectorized loop
855 done if the condition is non-zero at runtime. */
857 && is_gimple_call (stmt
)
858 && gimple_call_internal_p (stmt
)
859 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
860 && gimple_call_num_args (stmt
) >= 3
861 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
863 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
865 tree arg
= gimple_call_arg (stmt
, 2);
866 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
869 gcc_assert (integer_nonzerop (arg
));
874 epilogue_vinfos
.create (6);
/* NOTE(review): fragmented extraction; the declarations of the
   iteration variables and the final release of MASKS itself (around
   original lines 881-883, 886) are missing -- restore from upstream
   tree-vect-loop.c.

   Visible behavior: releases the per-rgroup mask vectors held in
   MASKS.  */
877 /* Free all levels of MASKS. */
880 release_vec_loop_masks (vec_loop_masks
*masks
)
884 FOR_EACH_VEC_ELT (*masks
, i
, rgm
)
885 rgm
->masks
.release ();
/* NOTE(review): fragmented extraction; other teardown statements
   (original lines 893-895, 897-898) are missing -- restore from
   upstream tree-vect-loop.c.

   Visible behavior: releases the rgroup masks and the epilogue-info
   vector.  */
889 /* Free all memory used by the _loop_vec_info, as well as all the
890 stmt_vec_info structs of all the stmts in the loop. */
892 _loop_vec_info::~_loop_vec_info ()
896 release_vec_loop_masks (&masks
);
899 epilogue_vinfos
.release ();
/* NOTE(review): fragmented extraction; the early return of EXPR, the
   cache-hit return, and the final return of the cached value (around
   original lines 912-913, 917-918, 922-923, 926-927) are missing --
   restore from upstream tree-vect-loop.c.

   Visible behavior: trivially-invariant or register EXPRs need no
   work; otherwise the result is memoized in loop_vinfo->ivexpr_map
   (created on first use), gimplifying a fresh copy of EXPR and
   inserting the generated statements on the loop preheader edge.  */
904 /* Return an invariant or register for EXPR and emit necessary
905 computations in the LOOP_VINFO loop preheader. */
908 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
910 if (is_gimple_reg (expr
)
911 || is_gimple_min_invariant (expr
))
/* Lazily create the memoization map.  */
914 if (! loop_vinfo
->ivexpr_map
)
915 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
916 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
919 gimple_seq stmts
= NULL
;
920 cached
= force_gimple_operand (unshare_expr (expr
),
921 &stmts
, true, NULL_TREE
);
/* Materialize the computation once, in the preheader.  */
924 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
925 gsi_insert_seq_on_edge_immediate (e
, stmts
);
/* NOTE(review): fragmented extraction; the iteration-variable
   declarations, the failing `return false` and the final
   `return true` (around original lines 936-938, 943-945) are missing
   -- restore from upstream tree-vect-loop.c.

   Visible behavior: checks, for every rgroup mask type recorded in
   LOOP_VINFO_MASKS, that the target supports IFN_WHILE_ULT from
   CMP_TYPE to that mask type.  */
931 /* Return true if we can use CMP_TYPE as the comparison type to produce
932 all masks required to mask LOOP_VINFO. */
935 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
939 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
940 if (rgm
->mask_type
!= NULL_TREE
941 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
942 cmp_type
, rgm
->mask_type
,
/* NOTE(review): fragmented extraction; the iteration-variable
   declarations and the `return res` (around original lines 955-956,
   959) are missing -- restore from upstream tree-vect-loop.c.

   Visible behavior: computes the maximum of max_nscalars_per_iter
   over all rgroups in LOOP_VINFO_MASKS, starting from 1.  */
948 /* Calculate the maximum number of scalars per iteration for every
949 rgroup in LOOP_VINFO. */
952 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
954 unsigned int res
= 1;
957 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
958 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
/* NOTE(review): fragmented extraction; several guards, `break`s and
   the final return (e.g. original lines 978-979, 1002-1006, 1008,
   1012, 1014, 1045-1052, 1055-1056) are missing -- restore from
   upstream tree-vect-loop.c.

   Visible behavior: bounds the iteration count (counter-type max,
   refined by max_loop_iterations, scaled by the rgroup replication
   factor), derives the minimum bit-width needed, then walks the
   scalar integer modes looking for a comparison type wide enough for
   which the target supports all required WHILE_ULT masks; prefers an
   IV type that is Pmode or wider and a comparison type of at least
   IV_PRECISION (see the surviving in-line rationale).  On success the
   chosen types are stored in LOOP_VINFO_MASK_COMPARE_TYPE and
   LOOP_VINFO_MASK_IV_TYPE.  */
962 /* Each statement in LOOP_VINFO can be masked where necessary. Check
963 whether we can actually generate the masks required. Return true if so,
964 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
967 vect_verify_full_masking (loop_vec_info loop_vinfo
)
969 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
970 unsigned int min_ni_width
;
971 unsigned int max_nscalars_per_iter
972 = vect_get_max_nscalars_per_iter (loop_vinfo
);
974 /* Use a normal loop if there are no statements that need masking.
975 This only happens in rare degenerate cases: it means that the loop
976 has no loads, no stores, and no live-out values. */
977 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
983 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges
;
987 if (max_loop_iterations (loop
, &max_back_edges
))
988 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
990 /* Account for rgroup masks, in which each bit is replicated N times. */
991 max_ni
*= max_nscalars_per_iter
;
993 /* Work out how many bits we need to represent the limit. */
994 min_ni_width
= wi::min_precision (max_ni
, UNSIGNED
);
996 /* Find a scalar mode for which WHILE_ULT is supported. */
997 opt_scalar_int_mode cmp_mode_iter
;
998 tree cmp_type
= NULL_TREE
;
999 tree iv_type
= NULL_TREE
;
1000 widest_int iv_limit
= vect_iv_limit_for_full_masking (loop_vinfo
);
1001 unsigned int iv_precision
= UINT_MAX
;
1004 iv_precision
= wi::min_precision (iv_limit
* max_nscalars_per_iter
,
1007 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1009 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1010 if (cmp_bits
>= min_ni_width
1011 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1013 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1015 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1017 /* Although we could stop as soon as we find a valid mode,
1018 there are at least two reasons why that's not always the
1021 - An IV that's Pmode or wider is more likely to be reusable
1022 in address calculations than an IV that's narrower than
1025 - Doing the comparison in IV_PRECISION or wider allows
1026 a natural 0-based IV, whereas using a narrower comparison
1027 type requires mitigations against wrap-around.
1029 Conversely, if the IV limit is variable, doing the comparison
1030 in a wider type than the original type can introduce
1031 unnecessary extensions, so picking the widest valid mode
1032 is not always a good choice either.
1034 Here we prefer the first IV type that's Pmode or wider,
1035 and the first comparison type that's IV_PRECISION or wider.
1036 (The comparison type must be no wider than the IV type,
1037 to avoid extensions in the vector loop.)
1039 ??? We might want to try continuing beyond Pmode for ILP32
1040 targets if CMP_BITS < IV_PRECISION. */
1041 iv_type
= this_type
;
1042 if (!cmp_type
|| iv_precision
> TYPE_PRECISION (cmp_type
))
1043 cmp_type
= this_type
;
1044 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
/* Record the chosen types for the masking code generators.  */
1053 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1054 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo
) = iv_type
;
/* NOTE(review): fragmented extraction; several guards, `continue`s,
   the `else` arms assigning scalar_load / scalar_stmt kinds, and the
   FOR_EACH_VEC_ELT index variable (e.g. original lines 1066, 1070-1071,
   1073, 1083-1085, 1092-1093, 1100-1101, 1104, 1106-1107, 1109,
   1111-1114, 1125) are missing -- restore from upstream
   tree-vect-loop.c.

   Visible behavior: walks every statement of every loop block,
   weighting inner-loop blocks by an estimated trip count (50, marked
   FIXME upstream), skips statements that will not be vectorized,
   classifies each remaining stmt as a load/store/other cost kind, and
   records it in LOOP_VINFO_SCALAR_ITERATION_COST.  It then replays the
   recorded costs through the target cost model (init_cost /
   add_stmt_cost / finish_cost) and stores the resulting body cost in
   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  */
1058 /* Calculate the cost of one scalar iteration of the loop. */
1060 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1062 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1063 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1064 int nbbs
= loop
->num_nodes
, factor
;
1065 int innerloop_iters
, i
;
1067 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1069 /* Gather costs for statements in the scalar loop. */
1072 innerloop_iters
= 1;
1074 innerloop_iters
= 50; /* FIXME */
1076 for (i
= 0; i
< nbbs
; i
++)
1078 gimple_stmt_iterator si
;
1079 basic_block bb
= bbs
[i
];
/* Statements inside the inner loop execute innerloop_iters times
   per outer iteration.  */
1081 if (bb
->loop_father
== loop
->inner
)
1082 factor
= innerloop_iters
;
1086 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1088 gimple
*stmt
= gsi_stmt (si
);
1089 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
1091 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1094 /* Skip stmts that are not vectorized inside the loop. */
1095 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1096 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1097 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1098 || !VECTORIZABLE_CYCLE_DEF
1099 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
1102 vect_cost_for_stmt kind
;
1103 if (STMT_VINFO_DATA_REF (stmt_info
))
1105 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1108 kind
= scalar_store
;
1110 else if (vect_nop_conversion_p (stmt_info
))
1115 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1116 factor
, kind
, stmt_info
, 0, vect_prologue
);
1120 /* Now accumulate cost. */
1121 void *target_cost_data
= init_cost (loop
);
1122 stmt_info_for_cost
*si
;
1124 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1126 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, si
->count
,
1127 si
->kind
, si
->stmt_info
, si
->vectype
,
1128 si
->misalign
, vect_body
);
1129 unsigned dummy
, body_cost
= 0;
1130 finish_cost (target_cost_data
, &dummy
, &body_cost
, &dummy
);
1131 destroy_cost_data (target_cost_data
);
1132 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
) = body_cost
;
1136 /* Function vect_analyze_loop_form_1.
1138 Verify that certain CFG restrictions hold, including:
1139 - the loop has a pre-header
1140 - the loop has a single entry and exit
1141 - the loop exit condition is simple enough
1142 - the number of iterations can be analyzed, i.e, a countable loop. The
1143 niter could be analyzed under some assumptions. */
/* NOTE(review): lossy extraction -- statements span multiple physical
   lines and some lines (braces, the if (loop->inner) split between the
   inner-most and nested cases, some failure-message lines) are missing.
   Returns an opt_result: success with *LOOP_COND, *ASSUMPTIONS,
   *NUMBER_OF_ITERATIONS(M1) filled in, or a failure_at diagnostic.  */
1146 vect_analyze_loop_form_1 (class loop
*loop
, gcond
**loop_cond
,
1147 tree
*assumptions
, tree
*number_of_iterationsm1
,
1148 tree
*number_of_iterations
, gcond
**inner_loop_cond
)
1150 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1152 /* Different restrictions apply when we are considering an inner-most loop,
1153 vs. an outer (nested) loop.
1154 (FORNOW. May want to relax some of these restrictions in the future). */
1158 /* Inner-most loop. We currently require that the number of BBs is
1159 exactly 2 (the header and latch). Vectorizable inner-most loops
1170 if (loop
->num_nodes
!= 2)
1171 return opt_result::failure_at (vect_location
,
1173 " control flow in loop.\n");
1175 if (empty_block_p (loop
->header
))
1176 return opt_result::failure_at (vect_location
,
1177 "not vectorized: empty loop.\n");
1181 class loop
*innerloop
= loop
->inner
;
1184 /* Nested loop. We currently require that the loop is doubly-nested,
1185 contains a single inner loop, and the number of BBs is exactly 5.
1186 Vectorizable outer-loops look like this:
1198 The inner-loop has the properties expected of inner-most loops
1199 as described above. */
1201 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1202 return opt_result::failure_at (vect_location
,
1204 " multiple nested loops.\n");
1206 if (loop
->num_nodes
!= 5)
1207 return opt_result::failure_at (vect_location
,
1209 " control flow in loop.\n");
/* The inner loop must be entered from the outer header and its single
   exit must feed the outer latch.  */
1211 entryedge
= loop_preheader_edge (innerloop
);
1212 if (entryedge
->src
!= loop
->header
1213 || !single_exit (innerloop
)
1214 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1215 return opt_result::failure_at (vect_location
,
1217 " unsupported outerloop form.\n");
1219 /* Analyze the inner-loop. */
/* Recurse on the inner loop; NULL for inner_loop_cond because nesting
   deeper than two levels is rejected above.  */
1220 tree inner_niterm1
, inner_niter
, inner_assumptions
;
1222 = vect_analyze_loop_form_1 (loop
->inner
, inner_loop_cond
,
1223 &inner_assumptions
, &inner_niterm1
,
1224 &inner_niter
, NULL
);
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1229 "not vectorized: Bad inner loop.\n");
1233 /* Don't support analyzing niter under assumptions for inner
1235 if (!integer_onep (inner_assumptions
))
1236 return opt_result::failure_at (vect_location
,
1237 "not vectorized: Bad inner loop.\n");
/* The inner trip count must not vary across outer iterations.  */
1239 if (!expr_invariant_in_loop_p (loop
, inner_niter
))
1240 return opt_result::failure_at (vect_location
,
1241 "not vectorized: inner-loop count not"
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_NOTE
, vect_location
,
1246 "Considering outer-loop vectorization.\n");
/* Checks common to both the inner-most and nested cases follow.  */
1249 if (!single_exit (loop
))
1250 return opt_result::failure_at (vect_location
,
1251 "not vectorized: multiple exits.\n");
1252 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1253 return opt_result::failure_at (vect_location
,
1255 " too many incoming edges.\n");
1257 /* We assume that the loop exit condition is at the end of the loop. i.e,
1258 that the loop is represented as a do-while (with a proper if-guard
1259 before the loop if needed), where the loop header contains all the
1260 executable statements, and the latch is empty. */
1261 if (!empty_block_p (loop
->latch
)
1262 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1263 return opt_result::failure_at (vect_location
,
1264 "not vectorized: latch block not empty.\n");
1266 /* Make sure the exit is not abnormal. */
1267 edge e
= single_exit (loop
);
1268 if (e
->flags
& EDGE_ABNORMAL
)
1269 return opt_result::failure_at (vect_location
,
1271 " abnormal loop exit edge.\n");
/* Extract the controlling exit condition and the symbolic iteration
   counts (NITERS and NITERS - 1).  */
1273 *loop_cond
= vect_get_loop_niters (loop
, assumptions
, number_of_iterations
,
1274 number_of_iterationsm1
);
1276 return opt_result::failure_at
1278 "not vectorized: complicated exit condition.\n");
1280 if (integer_zerop (*assumptions
)
1281 || !*number_of_iterations
1282 || chrec_contains_undetermined (*number_of_iterations
))
1283 return opt_result::failure_at
1285 "not vectorized: number of iterations cannot be computed.\n");
1287 if (integer_zerop (*number_of_iterations
))
1288 return opt_result::failure_at
1290 "not vectorized: number of iterations = 0.\n");
1292 return opt_result::success ();
1295 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
/* NOTE(review): lossy extraction -- statement lines are split and some
   original lines are missing; tokens kept byte-identical.
   Wrapper over vect_analyze_loop_form_1: on success allocates a
   _loop_vec_info for LOOP/SHARED, records the iteration counts and any
   versioning assumptions, tags the exit gconds, and stashes the result
   in loop->aux.  */
1298 vect_analyze_loop_form (class loop
*loop
, vec_info_shared
*shared
)
1300 tree assumptions
, number_of_iterations
, number_of_iterationsm1
;
1301 gcond
*loop_cond
, *inner_loop_cond
= NULL
;
1304 = vect_analyze_loop_form_1 (loop
, &loop_cond
,
1305 &assumptions
, &number_of_iterationsm1
,
1306 &number_of_iterations
, &inner_loop_cond
);
1308 return opt_loop_vec_info::propagate_failure (res
);
1310 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
, shared
);
1311 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1312 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1313 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
/* Non-trivial ASSUMPTIONS means the niter analysis only holds
   conditionally; prepare for vectorizing under loop versioning.  */
1314 if (!integer_onep (assumptions
))
1316 /* We consider to vectorize this loop by versioning it under
1317 some assumptions. In order to do this, we need to clear
1318 existing information computed by scev and niter analyzer. */
1320 free_numbers_of_iterations_estimates (loop
);
1321 /* Also set flag for this loop so that following scev and niter
1322 analysis are done under the assumptions. */
1323 loop_constraint_set (loop
, LOOP_C_FINITE
);
1324 /* Also record the assumptions for versioning. */
1325 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = assumptions
;
1328 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1330 if (dump_enabled_p ())
1332 dump_printf_loc (MSG_NOTE
, vect_location
,
1333 "Symbolic number of iterations is ");
1334 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1335 dump_printf (MSG_NOTE
, "\n");
/* Mark the exit condition(s) so later stmt analysis treats them as
   loop-exit control rather than vectorizable computation.  */
1339 stmt_vec_info loop_cond_info
= loop_vinfo
->lookup_stmt (loop_cond
);
1340 STMT_VINFO_TYPE (loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1341 if (inner_loop_cond
)
1343 stmt_vec_info inner_loop_cond_info
1344 = loop_vinfo
->lookup_stmt (inner_loop_cond
);
1345 STMT_VINFO_TYPE (inner_loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1348 gcc_assert (!loop
->aux
);
1349 loop
->aux
= loop_vinfo
;
1350 return opt_loop_vec_info::success (loop_vinfo
);
1355 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1356 statements update the vectorization factor. */
/* NOTE(review): lossy extraction -- lines are split and some (braces,
   loop-increment lines) are missing; tokens kept byte-identical.
   If every relevant stmt is pure SLP the VF becomes the SLP unrolling
   factor; otherwise the VF is raised to a common multiple of both.  */
1359 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1361 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1362 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1363 int nbbs
= loop
->num_nodes
;
1364 poly_uint64 vectorization_factor
;
1367 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1369 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1370 gcc_assert (known_ne (vectorization_factor
, 0U));
1372 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1373 vectorization factor of the loop is the unrolling factor required by
1374 the SLP instances. If that unrolling factor is 1, we say, that we
1375 perform pure SLP on loop - cross iteration parallelism is not
1377 bool only_slp_in_loop
= true;
1378 for (i
= 0; i
< nbbs
; i
++)
1380 basic_block bb
= bbs
[i
];
/* First scan the PHIs of this BB ...  */
1381 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1384 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (si
.phi ());
1387 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1388 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1389 && !PURE_SLP_STMT (stmt_info
))
1390 /* STMT needs both SLP and loop-based vectorization. */
1391 only_slp_in_loop
= false;
/* ... then the ordinary stmts, resolved to their vectorized
   representative (pattern stmt) first.  */
1393 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1396 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
1397 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
1398 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1399 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1400 && !PURE_SLP_STMT (stmt_info
))
1401 /* STMT needs both SLP and loop-based vectorization. */
1402 only_slp_in_loop
= false;
1406 if (only_slp_in_loop
)
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_NOTE
, vect_location
,
1410 "Loop contains only SLP stmts\n");
1411 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_NOTE
, vect_location
,
1417 "Loop contains SLP and non-SLP stmts\n");
1418 /* Both the vectorization factor and unroll factor have the form
1419 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1420 so they must have a common multiple. */
1421 vectorization_factor
1422 = force_common_multiple (vectorization_factor
,
1423 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1426 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1427 if (dump_enabled_p ())
1429 dump_printf_loc (MSG_NOTE
, vect_location
,
1430 "Updating vectorization factor to ");
1431 dump_dec (MSG_NOTE
, vectorization_factor
);
1432 dump_printf (MSG_NOTE
, ".\n");
1436 /* Return true if STMT_INFO describes a double reduction phi and if
1437 the other phi in the reduction is also relevant for vectorization.
1438 This rejects cases such as:
1441 x_1 = PHI <x_3(outer2), ...>;
1449 x_3 = PHI <x_2(inner)>;
1451 if nothing in x_2 or elsewhere makes x_1 relevant. */
/* NOTE(review): lossy extraction -- the return type line, braces and the
   early "return false" line are missing from this chunk; tokens kept
   byte-identical.  Predicate: STMT_INFO must be a double-reduction def
   AND its paired reduction phi (STMT_VINFO_REDUC_DEF) must be relevant.  */
1454 vect_active_double_reduction_p (stmt_vec_info stmt_info
)
1456 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1459 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info
));
1462 /* Function vect_analyze_loop_operations.
1464 Scan the loop stmts and make sure they are all vectorizable. */
/* NOTE(review): lossy extraction -- lines are split and several original
   lines (braces, continue stmts, some condition halves) are missing;
   tokens kept byte-identical.
   Walks all PHIs and stmts of the loop body: rejects unsupported
   scalar-dependence cycles and loop-closed phis, checks inductions,
   reductions and live operations for non-SLP vectorizability, and fails
   with NEED_TO_VECTORIZE still false when nothing requires loop-based
   vectorization at all.  */
1467 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1469 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1470 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1471 int nbbs
= loop
->num_nodes
;
1473 stmt_vec_info stmt_info
;
1474 bool need_to_vectorize
= false;
1477 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
/* Costs from the per-stmt analyses are collected here and added to the
   target cost data at the end.  */
1479 auto_vec
<stmt_info_for_cost
> cost_vec
;
1481 for (i
= 0; i
< nbbs
; i
++)
1483 basic_block bb
= bbs
[i
];
1485 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1488 gphi
*phi
= si
.phi ();
1491 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: %G", phi
);
/* Virtual (memory) PHIs are not vectorized.  */
1494 if (virtual_operand_p (gimple_phi_result (phi
)))
1497 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1498 (i.e., a phi in the tail of the outer-loop). */
1499 if (! is_loop_header_bb_p (bb
))
1501 /* FORNOW: we currently don't support the case that these phis
1502 are not used in the outerloop (unless it is double reduction,
1503 i.e., this phi is vect_reduction_def), cause this case
1504 requires to actually do something here. */
1505 if (STMT_VINFO_LIVE_P (stmt_info
)
1506 && !vect_active_double_reduction_p (stmt_info
))
1507 return opt_result::failure_at (phi
,
1508 "Unsupported loop-closed phi"
1509 " in outer-loop.\n");
1511 /* If PHI is used in the outer loop, we check that its operand
1512 is defined in the inner loop. */
1513 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1517 if (gimple_phi_num_args (phi
) != 1)
1518 return opt_result::failure_at (phi
, "unsupported phi");
1520 phi_op
= PHI_ARG_DEF (phi
, 0);
1521 stmt_vec_info op_def_info
= loop_vinfo
->lookup_def (phi_op
);
1523 return opt_result::failure_at (phi
, "unsupported phi\n");
/* The operand's def must be flagged as used in the outer loop
   (directly or via a reduction).  */
1525 if (STMT_VINFO_RELEVANT (op_def_info
) != vect_used_in_outer
1526 && (STMT_VINFO_RELEVANT (op_def_info
)
1527 != vect_used_in_outer_by_reduction
))
1528 return opt_result::failure_at (phi
, "unsupported phi\n");
1530 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
1531 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1532 == vect_double_reduction_def
))
1533 && !vectorizable_lc_phi (loop_vinfo
,
1534 stmt_info
, NULL
, NULL
))
1535 return opt_result::failure_at (phi
, "unsupported phi\n");
1541 gcc_assert (stmt_info
);
/* A used/live header PHI that is not an induction forms a scalar
   cycle the vectorizer cannot handle.  */
1543 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1544 || STMT_VINFO_LIVE_P (stmt_info
))
1545 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1546 /* A scalar-dependence cycle that we don't support. */
1547 return opt_result::failure_at (phi
,
1549 " scalar dependence cycle.\n");
1551 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1553 need_to_vectorize
= true;
/* Dispatch the non-SLP vectorizability check by def type.  */
1554 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
1555 && ! PURE_SLP_STMT (stmt_info
))
1556 ok
= vectorizable_induction (loop_vinfo
,
1557 stmt_info
, NULL
, NULL
, NULL
,
1559 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
1560 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1561 == vect_double_reduction_def
)
1562 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
1563 && ! PURE_SLP_STMT (stmt_info
))
1564 ok
= vectorizable_reduction (loop_vinfo
,
1565 stmt_info
, NULL
, NULL
, &cost_vec
);
1568 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1570 && STMT_VINFO_LIVE_P (stmt_info
)
1571 && !PURE_SLP_STMT (stmt_info
))
1572 ok
= vectorizable_live_operation (loop_vinfo
,
1573 stmt_info
, NULL
, NULL
, NULL
,
1574 -1, false, &cost_vec
);
1577 return opt_result::failure_at (phi
,
1578 "not vectorized: relevant phi not "
1580 static_cast <gimple
*> (phi
));
/* Now the non-PHI stmts; clobbers are skipped.  */
1583 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1586 gimple
*stmt
= gsi_stmt (si
);
1587 if (!gimple_clobber_p (stmt
))
1590 = vect_analyze_stmt (loop_vinfo
,
1591 loop_vinfo
->lookup_stmt (stmt
),
1593 NULL
, NULL
, &cost_vec
);
/* Commit the collected per-stmt costs to the target cost model.  */
1600 add_stmt_costs (loop_vinfo
, loop_vinfo
->target_cost_data
, &cost_vec
);
1602 /* All operations in the loop are either irrelevant (deal with loop
1603 control, or dead), or only used outside the loop and can be moved
1604 out of the loop (e.g. invariants, inductions). The loop can be
1605 optimized away by scalar optimizations. We're better off not
1606 touching this loop. */
1607 if (!need_to_vectorize
)
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_NOTE
, vect_location
,
1611 "All the computation can be taken out of the loop.\n");
1612 return opt_result::failure_at
1614 "not vectorized: redundant loop. no profit to vectorize.\n");
1617 return opt_result::success ();
1620 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1621 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1622 definitely no, or -1 if it's worth retrying. */
/* NOTE(review): lossy extraction -- lines are split and several original
   lines (return stmts, condition halves, the final "return 1") are
   missing; tokens kept byte-identical.  Compares known/estimated
   iteration counts against the profitability thresholds computed by
   vect_estimate_min_profitable_iters.  */
1625 vect_analyze_loop_costing (loop_vec_info loop_vinfo
)
1627 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1628 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1630 /* Only fully-masked loops can have iteration counts less than the
1631 vectorization factor. */
1632 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
1634 HOST_WIDE_INT max_niter
;
/* Use the exact count when known, else the static upper bound.  */
1636 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1637 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
1639 max_niter
= max_stmt_executions_int (loop
);
1642 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1646 "not vectorized: iteration count smaller than "
1647 "vectorization factor.\n");
/* Ask the target cost model for the break-even iteration counts.
   A negative result means "never profitable".  */
1652 int min_profitable_iters
, min_profitable_estimate
;
1653 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1654 &min_profitable_estimate
);
1656 if (min_profitable_iters
< 0)
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1660 "not vectorized: vectorization not profitable.\n");
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1663 "not vectorized: vector version will never be "
1668 int min_scalar_loop_bound
= (param_min_vect_loop_bound
1671 /* Use the cost model only if it is more conservative than user specified
1673 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
1674 min_profitable_iters
);
1676 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1679 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1683 "not vectorized: vectorization not profitable.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_NOTE
, vect_location
,
1686 "not vectorized: iteration count smaller than user "
1687 "specified loop bound parameter or minimum profitable "
1688 "iterations (whichever is more conservative).\n");
1692 /* The static profitablity threshold min_profitable_estimate includes
1693 the cost of having to check at runtime whether the scalar loop
1694 should be used instead. If it turns out that we don't need or want
1695 such a check, the threshold we should use for the static estimate
1696 is simply the point at which the vector loop becomes more profitable
1697 than the scalar loop. */
1698 if (min_profitable_estimate
> min_profitable_iters
1699 && !LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1700 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
)
1701 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1702 && !vect_apply_runtime_profitability_check_p (loop_vinfo
))
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_NOTE
, vect_location
, "no need for a runtime"
1706 " choice between the scalar and vector loops\n");
1707 min_profitable_estimate
= min_profitable_iters
;
1710 HOST_WIDE_INT estimated_niter
;
1712 /* If we are vectorizing an epilogue then we know the maximum number of
1713 scalar iterations it will cover is at least one lower than the
1714 vectorization factor of the main loop. */
1715 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
1717 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
)) - 1;
/* Otherwise fall back to the profile estimate, then the likely max.  */
1720 estimated_niter
= estimated_stmt_executions_int (loop
);
1721 if (estimated_niter
== -1)
1722 estimated_niter
= likely_max_stmt_executions_int (loop
);
1724 if (estimated_niter
!= -1
1725 && ((unsigned HOST_WIDE_INT
) estimated_niter
1726 < MAX (th
, (unsigned) min_profitable_estimate
)))
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1730 "not vectorized: estimated iteration count too "
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_NOTE
, vect_location
,
1734 "not vectorized: estimated iteration count smaller "
1735 "than specified loop bound parameter or minimum "
1736 "profitable iterations (whichever is more "
1737 "conservative).\n");
/* NOTE(review): lossy extraction -- the return type line, the leading
   doc comment, braces and several condition lines (e.g. the failure
   handling after vect_find_stmt_data_reference, the *n_stmts update)
   are missing; tokens kept byte-identical.
   Collects all data references of LOOP's stmts into *DATAREFS; for
   safelen loops it additionally inspects calls with simd clones so
   #pragma omp declare simd calls without data refs can be tolerated.
   Fails fatally once the dataref count exceeds the datadeps param.  */
1745 vect_get_datarefs_in_loop (loop_p loop
, basic_block
*bbs
,
1746 vec
<data_reference_p
> *datarefs
,
1747 unsigned int *n_stmts
)
1750 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1751 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
1752 !gsi_end_p (gsi
); gsi_next (&gsi
))
1754 gimple
*stmt
= gsi_stmt (gsi
);
/* Debug stmts carry no data references.  */
1755 if (is_gimple_debug (stmt
))
1758 opt_result res
= vect_find_stmt_data_reference (loop
, stmt
, datarefs
);
/* For safelen loops, calls to functions with simd clones may still
   be vectorizable even though dataref discovery failed on them.  */
1761 if (is_gimple_call (stmt
) && loop
->safelen
)
1763 tree fndecl
= gimple_call_fndecl (stmt
), op
;
1764 if (fndecl
!= NULL_TREE
)
1766 cgraph_node
*node
= cgraph_node::get (fndecl
);
1767 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
/* The call qualifies only if no argument (and no lhs) is a
   memory reference -- checked below.  */
1769 unsigned int j
, n
= gimple_call_num_args (stmt
);
1770 for (j
= 0; j
< n
; j
++)
1772 op
= gimple_call_arg (stmt
, j
);
1774 || (REFERENCE_CLASS_P (op
)
1775 && get_base_address (op
)))
1778 op
= gimple_call_lhs (stmt
);
1779 /* Ignore #pragma omp declare simd functions
1780 if they don't have data references in the
1781 call stmt itself. */
1785 || (REFERENCE_CLASS_P (op
)
1786 && get_base_address (op
)))))
1793 /* If dependence analysis will give up due to the limit on the
1794 number of datarefs stop here and fail fatally. */
1795 if (datarefs
->length ()
1796 > (unsigned)param_loop_max_datarefs_for_datadeps
)
1797 return opt_result::failure_at (stmt
, "exceeded param "
1798 "loop-max-datarefs-for-datadeps\n");
1800 return opt_result::success ();
1803 /* Look for SLP-only access groups and turn each individual access into its own
/* NOTE(review): lossy extraction -- return type, braces, the chain-walk
   loop header and the "vinfo = next" advance are missing; tokens kept
   byte-identical.
   For every grouped data reference whose group was marked SLP-only but
   whose stmt ended up not being SLP-vectorized, dissolve the group:
   each member becomes its own single-element group, with gap 0 for
   strided accesses and group_size - 1 otherwise.  */
1806 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo
)
1809 struct data_reference
*dr
;
1811 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1813 vec
<data_reference_p
> datarefs
= loop_vinfo
->shared
->datarefs
;
1814 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1816 gcc_assert (DR_REF (dr
));
1817 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (DR_STMT (dr
));
1819 /* Check if the load is a part of an interleaving chain. */
1820 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1822 stmt_vec_info first_element
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1823 unsigned int group_size
= DR_GROUP_SIZE (first_element
);
1825 /* Check if SLP-only groups. */
1826 if (!STMT_SLP_TYPE (stmt_info
)
1827 && STMT_VINFO_SLP_VECT_ONLY (first_element
))
1829 /* Dissolve the group. */
1830 STMT_VINFO_SLP_VECT_ONLY (first_element
) = false;
/* Walk the group chain, resetting each member to a
   stand-alone group of size one.  */
1832 stmt_vec_info vinfo
= first_element
;
1835 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (vinfo
);
1836 DR_GROUP_FIRST_ELEMENT (vinfo
) = vinfo
;
1837 DR_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
1838 DR_GROUP_SIZE (vinfo
) = 1;
1839 if (STMT_VINFO_STRIDED_P (first_element
))
1840 DR_GROUP_GAP (vinfo
) = 0;
1842 DR_GROUP_GAP (vinfo
) = group_size
- 1;
1851 /* Decides whether we need to create an epilogue loop to handle
1852 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
/* NOTE(review): lossy extraction -- return type, braces and parts of the
   threshold-lookup line are missing; tokens kept byte-identical.
   Sets LOOP_VINFO_PEELING_FOR_NITER: false for fully-masked loops,
   otherwise true whenever the (known or variable) iteration count is
   not provably a multiple of the vectorization factor after subtracting
   the constant peeling already planned for alignment/gaps.  */
1855 determine_peel_for_niter (loop_vec_info loop_vinfo
)
1857 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
1859 unsigned HOST_WIDE_INT const_vf
;
1860 HOST_WIDE_INT max_niter
1861 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
/* Inherit the cost-model threshold from the main loop when analyzing
   an epilogue (TH would otherwise be zero here).  */
1863 unsigned th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
1864 if (!th
&& LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
))
1865 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1868 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
1869 /* The main loop handles all iterations. */
1870 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
1871 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1872 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
1874 /* Work out the (constant) number of iterations that need to be
1875 peeled for reasons other than niters. */
1876 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
1877 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
/* An epilogue is needed unless the remaining count divides VF.  */
1879 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
1880 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1881 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1883 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1884 /* ??? When peeling for gaps but not alignment, we could
1885 try to check whether the (variable) niters is known to be
1886 VF * N + 1. That's something of a niche case though. */
1887 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1888 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
1889 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1890 < (unsigned) exact_log2 (const_vf
))
1891 /* In case of versioning, check if the maximum number of
1892 iterations is greater than th. If they are identical,
1893 the epilogue is unnecessary. */
1894 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1895 || ((unsigned HOST_WIDE_INT
) max_niter
1896 > (th
/ const_vf
) * const_vf
))))
1897 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1901 /* Function vect_analyze_loop_2.
1903 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1904 for it. The different analyses will record information in the
1905 loop_vec_info struct. */
1907 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
, unsigned *n_stmts
)
1909 opt_result ok
= opt_result::success ();
1911 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
1912 poly_uint64 min_vf
= 2;
1913 loop_vec_info orig_loop_vinfo
= NULL
;
1915 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1916 loop_vec_info of the first vectorized loop. */
1917 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
1918 orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
1920 orig_loop_vinfo
= loop_vinfo
;
1921 gcc_assert (orig_loop_vinfo
);
1923 /* The first group of checks is independent of the vector size. */
1926 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
1927 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
1928 return opt_result::failure_at (vect_location
,
1929 "not vectorized: simd if(0)\n");
1931 /* Find all data references in the loop (which correspond to vdefs/vuses)
1932 and analyze their evolution in the loop. */
1934 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1936 /* Gather the data references and count stmts in the loop. */
1937 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
1940 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
1941 &LOOP_VINFO_DATAREFS (loop_vinfo
),
1945 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1947 "not vectorized: loop contains function "
1948 "calls or data references that cannot "
1952 loop_vinfo
->shared
->save_datarefs ();
1955 loop_vinfo
->shared
->check_datarefs ();
1957 /* Analyze the data references and also adjust the minimal
1958 vectorization factor according to the loads and stores. */
1960 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
, &fatal
);
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1965 "bad data references.\n");
1969 /* Classify all cross-iteration scalar data-flow cycles.
1970 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1971 vect_analyze_scalar_cycles (loop_vinfo
);
1973 vect_pattern_recog (loop_vinfo
);
1975 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
1977 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1978 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1980 ok
= vect_analyze_data_ref_accesses (loop_vinfo
);
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1985 "bad data access.\n");
1989 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1991 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
, &fatal
);
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1996 "unexpected pattern.\n");
2000 /* While the rest of the analysis below depends on it in some way. */
2003 /* Analyze data dependences between the data-refs in the loop
2004 and adjust the maximum vectorization factor according to
2006 FORNOW: fail at the first data dependence that we encounter. */
2008 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2013 "bad data dependence.\n");
2016 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2017 && maybe_lt (max_vf
, min_vf
))
2018 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2019 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
2021 ok
= vect_determine_vectorization_factor (loop_vinfo
);
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2026 "can't determine vectorization factor.\n");
2029 if (max_vf
!= MAX_VECTORIZATION_FACTOR
2030 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2031 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
2033 /* Compute the scalar iteration cost. */
2034 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
2036 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2038 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2039 ok
= vect_analyze_slp (loop_vinfo
, *n_stmts
);
2043 /* If there are any SLP instances mark them as pure_slp. */
2044 bool slp
= vect_make_slp_decision (loop_vinfo
);
2047 /* Find stmts that need to be both vectorized and SLPed. */
2048 vect_detect_hybrid_slp (loop_vinfo
);
2050 /* Update the vectorization factor based on the SLP decision. */
2051 vect_update_vf_for_slp (loop_vinfo
);
2053 /* Optimize the SLP graph with the vectorization factor fixed. */
2054 vect_optimize_slp (loop_vinfo
);
2057 bool saved_can_fully_mask_p
= LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
);
2059 /* We don't expect to have to roll back to anything other than an empty
2061 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2063 /* This is the point where we can re-start analysis with SLP forced off. */
2066 /* Now the vectorization factor is final. */
2067 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2068 gcc_assert (known_ne (vectorization_factor
, 0U));
2070 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE
, vect_location
,
2073 "vectorization_factor = ");
2074 dump_dec (MSG_NOTE
, vectorization_factor
);
2075 dump_printf (MSG_NOTE
, ", niters = %wd\n",
2076 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2079 /* Analyze the alignment of the data-refs in the loop.
2080 Fail if a data reference is found that cannot be vectorized. */
2082 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2087 "bad data alignment.\n");
2091 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2092 It is important to call pruning after vect_analyze_data_ref_accesses,
2093 since we use grouping information gathered by interleaving analysis. */
2094 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2098 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2099 vectorization, since we do not want to add extra peeling or
2100 add versioning for alignment. */
2101 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2102 /* This pass will decide on using loop versioning and/or loop peeling in
2103 order to enhance the alignment of data references in the loop. */
2104 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2106 ok
= vect_verify_datarefs_alignment (loop_vinfo
);
2112 /* Analyze operations in the SLP instances. Note this may
2113 remove unsupported SLP instances which makes the above
2114 SLP kind detection invalid. */
2115 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2116 vect_slp_analyze_operations (loop_vinfo
);
2117 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2119 ok
= opt_result::failure_at (vect_location
,
2120 "unsupported SLP instances\n");
2125 /* Dissolve SLP-only groups. */
2126 vect_dissolve_slp_only_groups (loop_vinfo
);
2128 /* Scan all the remaining operations in the loop that are not subject
2129 to SLP and make sure they are vectorizable. */
2130 ok
= vect_analyze_loop_operations (loop_vinfo
);
2133 if (dump_enabled_p ())
2134 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2135 "bad operation or unsupported loop bound.\n");
2139 /* Decide whether to use a fully-masked loop for this vectorization
2141 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
2142 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
)
2143 && vect_verify_full_masking (loop_vinfo
));
2144 if (dump_enabled_p ())
2146 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2147 dump_printf_loc (MSG_NOTE
, vect_location
,
2148 "using a fully-masked loop.\n");
2150 dump_printf_loc (MSG_NOTE
, vect_location
,
2151 "not using a fully-masked loop.\n");
2154 /* If epilog loop is required because of data accesses with gaps,
2155 one additional iteration needs to be peeled. Check if there is
2156 enough iterations for vectorization. */
2157 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2158 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2159 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2161 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2162 tree scalar_niters
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
2164 if (known_lt (wi::to_widest (scalar_niters
), vf
))
2165 return opt_result::failure_at (vect_location
,
2166 "loop has no enough iterations to"
2167 " support peeling for gaps.\n");
2170 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2171 loop or a loop that has a lower VF than the main loop. */
2172 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
2173 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
2174 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2175 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo
)))
2176 return opt_result::failure_at (vect_location
,
2177 "Vectorization factor too high for"
2178 " epilogue loop.\n");
2180 /* Check the costings of the loop make vectorizing worthwhile. */
2181 res
= vect_analyze_loop_costing (loop_vinfo
);
2184 ok
= opt_result::failure_at (vect_location
,
2185 "Loop costings may not be worthwhile.\n");
2189 return opt_result::failure_at (vect_location
,
2190 "Loop costings not worthwhile.\n");
2192 determine_peel_for_niter (loop_vinfo
);
2193 /* If an epilogue loop is required make sure we can create one. */
2194 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2195 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2197 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2199 if (!vect_can_advance_ivs_p (loop_vinfo
)
2200 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2201 single_exit (LOOP_VINFO_LOOP
2204 ok
= opt_result::failure_at (vect_location
,
2205 "not vectorized: can't create required "
2211 /* During peeling, we need to check if number of loop iterations is
2212 enough for both peeled prolog loop and vector loop. This check
2213 can be merged along with threshold check of loop versioning, so
2214 increase threshold for this case if necessary.
2216 If we are analyzing an epilogue we still want to check what its
2217 versioning threshold would be. If we decide to vectorize the epilogues we
2218 will want to use the lowest versioning threshold of all epilogues and main
2219 loop. This will enable us to enter a vectorized epilogue even when
2220 versioning the loop. We can't simply check whether the epilogue requires
2221 versioning though since we may have skipped some versioning checks when
2222 analyzing the epilogue. For instance, checks for alias versioning will be
2223 skipped when dealing with epilogues as we assume we already checked them
2224 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2225 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo
))
2227 poly_uint64 niters_th
= 0;
2228 unsigned int th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2230 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2232 /* Niters for peeled prolog loop. */
2233 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2235 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2236 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2237 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
2240 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2243 /* Niters for at least one iteration of vectorized loop. */
2244 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2245 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2246 /* One additional iteration because of peeling for gap. */
2247 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2250 /* Use the same condition as vect_transform_loop to decide when to use
2251 the cost to determine a versioning threshold. */
2252 if (vect_apply_runtime_profitability_check_p (loop_vinfo
)
2253 && ordered_p (th
, niters_th
))
2254 niters_th
= ordered_max (poly_uint64 (th
), niters_th
);
2256 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2259 gcc_assert (known_eq (vectorization_factor
,
2260 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2262 /* Ok to vectorize! */
2263 return opt_result::success ();
2266 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2269 /* Try again with SLP forced off but if we didn't do any SLP there is
2270 no point in re-trying. */
2274 /* If there are reduction chains re-trying will fail anyway. */
2275 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2278 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2279 via interleaving or lane instructions. */
2280 slp_instance instance
;
2283 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2285 stmt_vec_info vinfo
;
2286 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2287 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2289 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2290 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2291 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2292 if (! vect_store_lanes_supported (vectype
, size
, false)
2293 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2294 && ! vect_grouped_store_supported (vectype
, size
))
2295 return opt_result::failure_at (vinfo
->stmt
,
2296 "unsupported grouped store\n");
2297 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2299 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2300 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2301 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2302 size
= DR_GROUP_SIZE (vinfo
);
2303 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2304 if (! vect_load_lanes_supported (vectype
, size
, false)
2305 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2307 return opt_result::failure_at (vinfo
->stmt
,
2308 "unsupported grouped load\n");
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE
, vect_location
,
2314 "re-trying with SLP disabled\n");
2316 /* Roll back state appropriately. No SLP this time. */
2318 /* Restore vectorization factor as it were without SLP. */
2319 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2320 /* Free the SLP instances. */
2321 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2322 vect_free_slp_instance (instance
, false);
2323 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2324 /* Reset SLP type to loop_vect on all stmts. */
2325 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2327 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2328 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2329 !gsi_end_p (si
); gsi_next (&si
))
2331 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2332 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2333 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
2334 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
2336 /* vectorizable_reduction adjusts reduction stmt def-types,
2337 restore them to that of the PHI. */
2338 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info
))
2339 = STMT_VINFO_DEF_TYPE (stmt_info
);
2340 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2341 (STMT_VINFO_REDUC_DEF (stmt_info
)))
2342 = STMT_VINFO_DEF_TYPE (stmt_info
);
2345 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2346 !gsi_end_p (si
); gsi_next (&si
))
2348 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2349 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2350 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2352 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2353 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
2354 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2355 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2356 !gsi_end_p (pi
); gsi_next (&pi
))
2357 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2362 /* Free optimized alias test DDRS. */
2363 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2364 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2365 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2366 /* Reset target cost data. */
2367 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
2368 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
2369 = init_cost (LOOP_VINFO_LOOP (loop_vinfo
));
2370 /* Reset accumulated rgroup information. */
2371 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo
));
2372 /* Reset assorted flags. */
2373 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2374 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2375 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2376 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2377 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = saved_can_fully_mask_p
;
2382 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2383 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2384 OLD_LOOP_VINFO is better unless something specifically indicates
2387 Note that this deliberately isn't a partial order. */
2390 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo
,
2391 loop_vec_info old_loop_vinfo
)
2393 struct loop
*loop
= LOOP_VINFO_LOOP (new_loop_vinfo
);
2394 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo
) == loop
);
2396 poly_int64 new_vf
= LOOP_VINFO_VECT_FACTOR (new_loop_vinfo
);
2397 poly_int64 old_vf
= LOOP_VINFO_VECT_FACTOR (old_loop_vinfo
);
2399 /* Always prefer a VF of loop->simdlen over any other VF. */
2402 bool new_simdlen_p
= known_eq (new_vf
, loop
->simdlen
);
2403 bool old_simdlen_p
= known_eq (old_vf
, loop
->simdlen
);
2404 if (new_simdlen_p
!= old_simdlen_p
)
2405 return new_simdlen_p
;
2408 /* Limit the VFs to what is likely to be the maximum number of iterations,
2409 to handle cases in which at least one loop_vinfo is fully-masked. */
2410 HOST_WIDE_INT estimated_max_niter
= likely_max_stmt_executions_int (loop
);
2411 if (estimated_max_niter
!= -1)
2413 if (known_le (estimated_max_niter
, new_vf
))
2414 new_vf
= estimated_max_niter
;
2415 if (known_le (estimated_max_niter
, old_vf
))
2416 old_vf
= estimated_max_niter
;
2419 /* Check whether the (fractional) cost per scalar iteration is lower
2420 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2421 poly_widest_int rel_new
= (new_loop_vinfo
->vec_inside_cost
2422 * poly_widest_int (old_vf
));
2423 poly_widest_int rel_old
= (old_loop_vinfo
->vec_inside_cost
2424 * poly_widest_int (new_vf
));
2425 if (maybe_lt (rel_old
, rel_new
))
2427 /* When old_loop_vinfo uses a variable vectorization factor,
2428 we know that it has a lower cost for at least one runtime VF.
2429 However, we don't know how likely that VF is.
2431 One option would be to compare the costs for the estimated VFs.
2432 The problem is that that can put too much pressure on the cost
2433 model. E.g. if the estimated VF is also the lowest possible VF,
2434 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2435 for the estimated VF, we'd then choose new_loop_vinfo even
2436 though (a) new_loop_vinfo might not actually be better than
2437 old_loop_vinfo for that VF and (b) it would be significantly
2438 worse at larger VFs.
2440 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2441 no more expensive than old_loop_vinfo even after doubling the
2442 estimated old_loop_vinfo VF. For all but trivial loops, this
2443 ensures that we only pick new_loop_vinfo if it is significantly
2444 better than old_loop_vinfo at the estimated VF. */
2445 if (rel_new
.is_constant ())
2448 HOST_WIDE_INT new_estimated_vf
= estimated_poly_value (new_vf
);
2449 HOST_WIDE_INT old_estimated_vf
= estimated_poly_value (old_vf
);
2450 widest_int estimated_rel_new
= (new_loop_vinfo
->vec_inside_cost
2451 * widest_int (old_estimated_vf
));
2452 widest_int estimated_rel_old
= (old_loop_vinfo
->vec_inside_cost
2453 * widest_int (new_estimated_vf
));
2454 return estimated_rel_new
* 2 <= estimated_rel_old
;
2456 if (known_lt (rel_new
, rel_old
))
2459 /* If there's nothing to choose between the loop bodies, see whether
2460 there's a difference in the prologue and epilogue costs. */
2461 if (new_loop_vinfo
->vec_outside_cost
!= old_loop_vinfo
->vec_outside_cost
)
2462 return new_loop_vinfo
->vec_outside_cost
< old_loop_vinfo
->vec_outside_cost
;
2467 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2468 true if we should. */
2471 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo
,
2472 loop_vec_info old_loop_vinfo
)
2474 if (!vect_better_loop_vinfo_p (new_loop_vinfo
, old_loop_vinfo
))
2477 if (dump_enabled_p ())
2478 dump_printf_loc (MSG_NOTE
, vect_location
,
2479 "***** Preferring vector mode %s to vector mode %s\n",
2480 GET_MODE_NAME (new_loop_vinfo
->vector_mode
),
2481 GET_MODE_NAME (old_loop_vinfo
->vector_mode
));
2485 /* Function vect_analyze_loop.
2487 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2488 for it. The different analyses will record information in the
2489 loop_vec_info struct. */
2491 vect_analyze_loop (class loop
*loop
, vec_info_shared
*shared
)
2493 auto_vector_modes vector_modes
;
2495 /* Autodetect first vector size we try. */
2496 unsigned int autovec_flags
2497 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
2498 loop
->simdlen
!= 0);
2499 unsigned int mode_i
= 0;
2501 DUMP_VECT_SCOPE ("analyze_loop_nest");
2503 if (loop_outer (loop
)
2504 && loop_vec_info_for_loop (loop_outer (loop
))
2505 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2506 return opt_loop_vec_info::failure_at (vect_location
,
2507 "outer-loop already vectorized.\n");
2509 if (!find_loop_nest (loop
, &shared
->loop_nest
))
2510 return opt_loop_vec_info::failure_at
2512 "not vectorized: loop nest containing two or more consecutive inner"
2513 " loops cannot be vectorized\n");
2515 unsigned n_stmts
= 0;
2516 machine_mode autodetected_vector_mode
= VOIDmode
;
2517 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
2518 machine_mode next_vector_mode
= VOIDmode
;
2519 poly_uint64 lowest_th
= 0;
2520 unsigned vectorized_loops
= 0;
2521 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
2522 && !unlimited_cost_model (loop
));
2524 bool vect_epilogues
= false;
2525 opt_result res
= opt_result::success ();
2526 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
2529 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2530 opt_loop_vec_info loop_vinfo
= vect_analyze_loop_form (loop
, shared
);
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2535 "bad loop form.\n");
2536 gcc_checking_assert (first_loop_vinfo
== NULL
);
2539 loop_vinfo
->vector_mode
= next_vector_mode
;
2543 /* When pick_lowest_cost_p is true, we should in principle iterate
2544 over all the loop_vec_infos that LOOP_VINFO could replace and
2545 try to vectorize LOOP_VINFO under the same conditions.
2546 E.g. when trying to replace an epilogue loop, we should vectorize
2547 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2548 to replace the main loop, we should vectorize LOOP_VINFO as a main
2551 However, autovectorize_vector_modes is usually sorted as follows:
2553 - Modes that naturally produce lower VFs usually follow modes that
2554 naturally produce higher VFs.
2556 - When modes naturally produce the same VF, maskable modes
2557 usually follow unmaskable ones, so that the maskable mode
2558 can be used to vectorize the epilogue of the unmaskable mode.
2560 This order is preferred because it leads to the maximum
2561 epilogue vectorization opportunities. Targets should only use
2562 a different order if they want to make wide modes available while
2563 disparaging them relative to earlier, smaller modes. The assumption
2564 in that case is that the wider modes are more expensive in some
2565 way that isn't reflected directly in the costs.
2567 There should therefore be few interesting cases in which
2568 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2569 treated as a standalone loop, and ends up being genuinely cheaper
2570 than FIRST_LOOP_VINFO. */
2572 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = first_loop_vinfo
;
2574 res
= vect_analyze_loop_2 (loop_vinfo
, fatal
, &n_stmts
);
2576 autodetected_vector_mode
= loop_vinfo
->vector_mode
;
2577 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE
, vect_location
,
2581 "***** Analysis succeeded with vector mode %s\n",
2582 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2584 dump_printf_loc (MSG_NOTE
, vect_location
,
2585 "***** Analysis failed with vector mode %s\n",
2586 GET_MODE_NAME (loop_vinfo
->vector_mode
));
2592 while (mode_i
< vector_modes
.length ()
2593 && vect_chooses_same_modes_p (loop_vinfo
, vector_modes
[mode_i
]))
2595 if (dump_enabled_p ())
2596 dump_printf_loc (MSG_NOTE
, vect_location
,
2597 "***** The result for vector mode %s would"
2599 GET_MODE_NAME (vector_modes
[mode_i
]));
2605 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
2608 /* Once we hit the desired simdlen for the first time,
2609 discard any previous attempts. */
2611 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
2613 delete first_loop_vinfo
;
2614 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
2615 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = NULL
;
2618 else if (pick_lowest_cost_p
&& first_loop_vinfo
)
2620 /* Keep trying to roll back vectorization attempts while the
2621 loop_vec_infos they produced were worse than this one. */
2622 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
2623 while (!vinfos
.is_empty ()
2624 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
2626 gcc_assert (vect_epilogues
);
2627 delete vinfos
.pop ();
2629 if (vinfos
.is_empty ()
2630 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
2632 delete first_loop_vinfo
;
2633 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
2634 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = NULL
;
2638 if (first_loop_vinfo
== NULL
)
2640 first_loop_vinfo
= loop_vinfo
;
2641 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
2643 else if (vect_epilogues
2644 /* For now only allow one epilogue loop. */
2645 && first_loop_vinfo
->epilogue_vinfos
.is_empty ())
2647 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
2648 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
2649 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
2650 || maybe_ne (lowest_th
, 0U));
2651 /* Keep track of the known smallest versioning
2653 if (ordered_p (lowest_th
, th
))
2654 lowest_th
= ordered_min (lowest_th
, th
);
2659 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2660 enabled, SIMDUID is not set, it is the innermost loop and we have
2661 either already found the loop's SIMDLEN or there was no SIMDLEN to
2663 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2664 vect_epilogues
= (!simdlen
2665 && loop
->inner
== NULL
2666 && param_vect_epilogues_nomask
2667 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
2669 /* For now only allow one epilogue loop, but allow
2670 pick_lowest_cost_p to replace it. */
2671 && (first_loop_vinfo
->epilogue_vinfos
.is_empty ()
2672 || pick_lowest_cost_p
));
2674 /* Commit to first_loop_vinfo if we have no reason to try
2676 if (!simdlen
&& !vect_epilogues
&& !pick_lowest_cost_p
)
2684 gcc_checking_assert (first_loop_vinfo
== NULL
);
2689 if (mode_i
< vector_modes
.length ()
2690 && VECTOR_MODE_P (autodetected_vector_mode
)
2691 && (related_vector_mode (vector_modes
[mode_i
],
2692 GET_MODE_INNER (autodetected_vector_mode
))
2693 == autodetected_vector_mode
)
2694 && (related_vector_mode (autodetected_vector_mode
,
2695 GET_MODE_INNER (vector_modes
[mode_i
]))
2696 == vector_modes
[mode_i
]))
2698 if (dump_enabled_p ())
2699 dump_printf_loc (MSG_NOTE
, vect_location
,
2700 "***** Skipping vector mode %s, which would"
2701 " repeat the analysis for %s\n",
2702 GET_MODE_NAME (vector_modes
[mode_i
]),
2703 GET_MODE_NAME (autodetected_vector_mode
));
2707 if (mode_i
== vector_modes
.length ()
2708 || autodetected_vector_mode
== VOIDmode
)
2711 /* Try the next biggest vector size. */
2712 next_vector_mode
= vector_modes
[mode_i
++];
2713 if (dump_enabled_p ())
2714 dump_printf_loc (MSG_NOTE
, vect_location
,
2715 "***** Re-trying analysis with vector mode %s\n",
2716 GET_MODE_NAME (next_vector_mode
));
2719 if (first_loop_vinfo
)
2721 loop
->aux
= (loop_vec_info
) first_loop_vinfo
;
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_NOTE
, vect_location
,
2724 "***** Choosing vector mode %s\n",
2725 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
2726 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
2727 return first_loop_vinfo
;
2730 return opt_loop_vec_info::propagate_failure (res
);
2733 /* Return true if there is an in-order reduction function for CODE, storing
2734 it in *REDUC_FN if so. */
2737 fold_left_reduction_fn (tree_code code
, internal_fn
*reduc_fn
)
2742 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
2750 /* Function reduction_fn_for_scalar_code
2753 CODE - tree_code of a reduction operations.
2756 REDUC_FN - the corresponding internal function to be used to reduce the
2757 vector of partial results into a single scalar result, or IFN_LAST
2758 if the operation is a supported reduction operation, but does not have
2759 such an internal function.
2761 Return FALSE if CODE currently cannot be vectorized as reduction. */
2764 reduction_fn_for_scalar_code (enum tree_code code
, internal_fn
*reduc_fn
)
2769 *reduc_fn
= IFN_REDUC_MAX
;
2773 *reduc_fn
= IFN_REDUC_MIN
;
2777 *reduc_fn
= IFN_REDUC_PLUS
;
2781 *reduc_fn
= IFN_REDUC_AND
;
2785 *reduc_fn
= IFN_REDUC_IOR
;
2789 *reduc_fn
= IFN_REDUC_XOR
;
2794 *reduc_fn
= IFN_LAST
;
2802 /* If there is a neutral value X such that SLP reduction NODE would not
2803 be affected by the introduction of additional X elements, return that X,
2804 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2805 is the vector type that would hold element X. REDUC_CHAIN is true if
2806 the SLP statements perform a single reduction, false if each statement
2807 performs an independent reduction. */
2810 neutral_op_for_slp_reduction (slp_tree slp_node
, tree vector_type
,
2811 tree_code code
, bool reduc_chain
)
2813 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
2814 stmt_vec_info stmt_vinfo
= stmts
[0];
2815 tree scalar_type
= TREE_TYPE (vector_type
);
2816 class loop
*loop
= gimple_bb (stmt_vinfo
->stmt
)->loop_father
;
2821 case WIDEN_SUM_EXPR
:
2828 return build_zero_cst (scalar_type
);
2831 return build_one_cst (scalar_type
);
2834 return build_all_ones_cst (scalar_type
);
2838 /* For MIN/MAX the initial values are neutral. A reduction chain
2839 has only a single initial value, so that value is neutral for
2842 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
,
2843 loop_preheader_edge (loop
));
2851 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2852 STMT is printed with a message MSG. */
2855 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
2857 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
2860 /* Return true if we need an in-order reduction for operation CODE
2861 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2862 overflow must wrap. */
2865 needs_fold_left_reduction_p (tree type
, tree_code code
)
2867 /* CHECKME: check for !flag_finite_math_only too? */
2868 if (SCALAR_FLOAT_TYPE_P (type
))
2876 return !flag_associative_math
;
2879 if (INTEGRAL_TYPE_P (type
))
2881 if (!operation_no_trapping_overflow (type
, code
))
2886 if (SAT_FIXED_POINT_TYPE_P (type
))
2892 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2893 has a handled computation expression. Store the main reduction
2894 operation in *CODE. */
2897 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
2898 tree loop_arg
, enum tree_code
*code
,
2899 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
2901 auto_bitmap visited
;
2902 tree lookfor
= PHI_RESULT (phi
);
2904 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
2905 while (USE_FROM_PTR (curr
) != loop_arg
)
2906 curr
= op_iter_next_use (&curri
);
2907 curri
.i
= curri
.numops
;
2910 path
.safe_push (std::make_pair (curri
, curr
));
2911 tree use
= USE_FROM_PTR (curr
);
2914 gimple
*def
= SSA_NAME_DEF_STMT (use
);
2915 if (gimple_nop_p (def
)
2916 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
2921 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
2925 curr
= op_iter_next_use (&curri
);
2926 /* Skip already visited or non-SSA operands (from iterating
2928 while (curr
!= NULL_USE_OPERAND_P
2929 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
2930 || ! bitmap_set_bit (visited
,
2932 (USE_FROM_PTR (curr
)))));
2934 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
2935 if (curr
== NULL_USE_OPERAND_P
)
2940 if (gimple_code (def
) == GIMPLE_PHI
)
2941 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
2943 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
2944 while (curr
!= NULL_USE_OPERAND_P
2945 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
2946 || ! bitmap_set_bit (visited
,
2948 (USE_FROM_PTR (curr
)))))
2949 curr
= op_iter_next_use (&curri
);
2950 if (curr
== NULL_USE_OPERAND_P
)
2955 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2957 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
2959 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
2960 FOR_EACH_VEC_ELT (path
, i
, x
)
2961 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
2962 dump_printf (MSG_NOTE
, "\n");
2965 /* Check whether the reduction path detected is valid. */
2966 bool fail
= path
.length () == 0;
2970 for (unsigned i
= 1; i
< path
.length (); ++i
)
2972 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
2973 tree op
= USE_FROM_PTR (path
[i
].second
);
2974 if (! is_gimple_assign (use_stmt
)
2975 /* The following make sure we can compute the operand index
2976 easily plus it mostly disallows chaining via COND_EXPR condition
2978 || (gimple_assign_rhs1_ptr (use_stmt
) != path
[i
].second
->use
2979 && (gimple_num_ops (use_stmt
) <= 2
2980 || gimple_assign_rhs2_ptr (use_stmt
) != path
[i
].second
->use
)
2981 && (gimple_num_ops (use_stmt
) <= 3
2982 || gimple_assign_rhs3_ptr (use_stmt
) != path
[i
].second
->use
)))
2987 /* Check there's only a single stmt the op is used on inside
2989 imm_use_iterator imm_iter
;
2990 gimple
*op_use_stmt
;
2992 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
)
2993 if (!is_gimple_debug (op_use_stmt
)
2994 && flow_bb_inside_loop_p (loop
, gimple_bb (op_use_stmt
)))
2996 /* We want to allow x + x but not x < 1 ? x : 2. */
2997 if (is_gimple_assign (op_use_stmt
)
2998 && gimple_assign_rhs_code (op_use_stmt
) == COND_EXPR
)
3000 use_operand_p use_p
;
3001 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
3012 tree_code use_code
= gimple_assign_rhs_code (use_stmt
);
3013 if (use_code
== MINUS_EXPR
)
3015 use_code
= PLUS_EXPR
;
3016 /* Track whether we negate the reduction value each iteration. */
3017 if (gimple_assign_rhs2 (use_stmt
) == op
)
3020 if (CONVERT_EXPR_CODE_P (use_code
)
3021 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt
)),
3022 TREE_TYPE (gimple_assign_rhs1 (use_stmt
))))
3024 else if (*code
== ERROR_MARK
)
3027 sign
= TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt
)));
3029 else if (use_code
!= *code
)
3034 else if ((use_code
== MIN_EXPR
3035 || use_code
== MAX_EXPR
)
3036 && sign
!= TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt
))))
3042 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
3046 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3047 tree loop_arg
, enum tree_code code
)
3049 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3050 enum tree_code code_
;
3051 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
3057 /* Function vect_is_simple_reduction
3059 (1) Detect a cross-iteration def-use cycle that represents a simple
3060 reduction computation. We look for the following pattern:
3065 a2 = operation (a3, a1)
3072 a2 = operation (a3, a1)
3075 1. operation is commutative and associative and it is safe to
3076 change the order of the computation
3077 2. no uses for a2 in the loop (a2 is used out of the loop)
3078 3. no uses of a1 in the loop besides the reduction operation
3079 4. no uses of a1 outside the loop.
3081 Conditions 1,4 are tested here.
3082 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3084 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3087 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3091 inner loop (def of a3)
3094 (4) Detect condition expressions, ie:
3095 for (int i = 0; i < N; i++)
3101 static stmt_vec_info
3102 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
3103 bool *double_reduc
, bool *reduc_chain_p
)
3105 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
3106 gimple
*phi_use_stmt
= NULL
;
3107 imm_use_iterator imm_iter
;
3108 use_operand_p use_p
;
3110 *double_reduc
= false;
3111 *reduc_chain_p
= false;
3112 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
3114 tree phi_name
= PHI_RESULT (phi
);
3115 /* ??? If there are no uses of the PHI result the inner loop reduction
3116 won't be detected as possibly double-reduction by vectorizable_reduction
3117 because that tries to walk the PHI arg from the preheader edge which
3118 can be constant. See PR60382. */
3119 if (has_zero_uses (phi_name
))
3121 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
3122 unsigned nphi_def_loop_uses
= 0;
3123 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
3125 gimple
*use_stmt
= USE_STMT (use_p
);
3126 if (is_gimple_debug (use_stmt
))
3129 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3133 "intermediate value used outside loop.\n");
3138 nphi_def_loop_uses
++;
3139 phi_use_stmt
= use_stmt
;
3142 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
3143 if (TREE_CODE (latch_def
) != SSA_NAME
)
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3147 "reduction: not ssa_name: %T\n", latch_def
);
3151 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
3153 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
3156 bool nested_in_vect_loop
3157 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
3158 unsigned nlatch_def_loop_uses
= 0;
3159 auto_vec
<gphi
*, 3> lcphis
;
3160 bool inner_loop_of_double_reduc
= false;
3161 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
3163 gimple
*use_stmt
= USE_STMT (use_p
);
3164 if (is_gimple_debug (use_stmt
))
3166 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
3167 nlatch_def_loop_uses
++;
3170 /* We can have more than one loop-closed PHI. */
3171 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
3172 if (nested_in_vect_loop
3173 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
3174 == vect_double_reduction_def
))
3175 inner_loop_of_double_reduc
= true;
3179 /* If we are vectorizing an inner reduction we are executing that
3180 in the original order only in case we are not dealing with a
3181 double reduction. */
3182 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
3184 if (dump_enabled_p ())
3185 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
3186 "detected nested cycle: ");
3187 return def_stmt_info
;
3190 /* If this isn't a nested cycle or if the nested cycle reduction value
3191 is used outside of the inner loop we cannot handle uses of the reduction
3193 if (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1)
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3197 "reduction used in loop.\n");
3201 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3202 defined in the inner loop. */
3203 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
3205 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
3206 if (gimple_phi_num_args (def_stmt
) != 1
3207 || TREE_CODE (op1
) != SSA_NAME
)
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3211 "unsupported phi node definition.\n");
3216 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
3217 if (gimple_bb (def1
)
3218 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
3220 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
3221 && is_gimple_assign (def1
)
3222 && is_a
<gphi
*> (phi_use_stmt
)
3223 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
3225 if (dump_enabled_p ())
3226 report_vect_op (MSG_NOTE
, def_stmt
,
3227 "detected double reduction: ");
3229 *double_reduc
= true;
3230 return def_stmt_info
;
3236 /* Look for the expression computing latch_def from then loop PHI result. */
3237 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
3238 enum tree_code code
;
3239 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
3242 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
3243 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
3244 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
3246 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3247 reduction chain for which the additional restriction is that
3248 all operations in the chain are the same. */
3249 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
3251 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
3252 for (i
= path
.length () - 1; i
>= 1; --i
)
3254 gimple
*stmt
= USE_STMT (path
[i
].second
);
3255 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
3256 STMT_VINFO_REDUC_IDX (stmt_info
)
3257 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (stmt
);
3258 enum tree_code stmt_code
= gimple_assign_rhs_code (stmt
);
3259 bool leading_conversion
= (CONVERT_EXPR_CODE_P (stmt_code
)
3260 && (i
== 1 || i
== path
.length () - 1));
3261 if ((stmt_code
!= code
&& !leading_conversion
)
3262 /* We can only handle the final value in epilogue
3263 generation for reduction chains. */
3264 || (i
!= 1 && !has_single_use (gimple_assign_lhs (stmt
))))
3265 is_slp_reduc
= false;
3266 /* For reduction chains we support a trailing/leading
3267 conversions. We do not store those in the actual chain. */
3268 if (leading_conversion
)
3270 reduc_chain
.safe_push (stmt_info
);
3272 if (is_slp_reduc
&& reduc_chain
.length () > 1)
3274 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
3276 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
3277 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
3279 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
3280 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
3282 /* Save the chain for further analysis in SLP detection. */
3283 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
3284 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
3286 *reduc_chain_p
= true;
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_NOTE
, vect_location
,
3289 "reduction: detected reduction chain\n");
3291 else if (dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE
, vect_location
,
3293 "reduction: detected reduction\n");
3295 return def_stmt_info
;
3298 if (dump_enabled_p ())
3299 dump_printf_loc (MSG_NOTE
, vect_location
,
3300 "reduction: unknown pattern\n");
3305 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3307 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
3308 int *peel_iters_epilogue
,
3309 stmt_vector_for_cost
*scalar_cost_vec
,
3310 stmt_vector_for_cost
*prologue_cost_vec
,
3311 stmt_vector_for_cost
*epilogue_cost_vec
)
3314 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3316 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
3318 *peel_iters_epilogue
= assumed_vf
/ 2;
3319 if (dump_enabled_p ())
3320 dump_printf_loc (MSG_NOTE
, vect_location
,
3321 "cost model: epilogue peel iters set to vf/2 "
3322 "because loop iterations are unknown .\n");
3324 /* If peeled iterations are known but number of scalar loop
3325 iterations are unknown, count a taken branch per peeled loop. */
3326 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3327 NULL
, NULL_TREE
, 0, vect_prologue
);
3328 retval
+= record_stmt_cost (epilogue_cost_vec
, 1, cond_branch_taken
,
3329 NULL
, NULL_TREE
, 0, vect_epilogue
);
3333 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
3334 peel_iters_prologue
= niters
< peel_iters_prologue
?
3335 niters
: peel_iters_prologue
;
3336 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
3337 /* If we need to peel for gaps, but no peeling is required, we have to
3338 peel VF iterations. */
3339 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
3340 *peel_iters_epilogue
= assumed_vf
;
3343 stmt_info_for_cost
*si
;
3345 if (peel_iters_prologue
)
3346 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3347 retval
+= record_stmt_cost (prologue_cost_vec
,
3348 si
->count
* peel_iters_prologue
,
3349 si
->kind
, si
->stmt_info
, si
->misalign
,
3351 if (*peel_iters_epilogue
)
3352 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3353 retval
+= record_stmt_cost (epilogue_cost_vec
,
3354 si
->count
* *peel_iters_epilogue
,
3355 si
->kind
, si
->stmt_info
, si
->misalign
,
3361 /* Function vect_estimate_min_profitable_iters
3363 Return the number of iterations required for the vector version of the
3364 loop to be profitable relative to the cost of the scalar version of the
3367 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3368 of iterations for vectorization. -1 value means loop vectorization
3369 is not profitable. This returned value may be used for dynamic
3370 profitability check.
3372 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3373 for static check against estimated number of iterations. */
3376 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
3377 int *ret_min_profitable_niters
,
3378 int *ret_min_profitable_estimate
)
3380 int min_profitable_iters
;
3381 int min_profitable_estimate
;
3382 int peel_iters_prologue
;
3383 int peel_iters_epilogue
;
3384 unsigned vec_inside_cost
= 0;
3385 int vec_outside_cost
= 0;
3386 unsigned vec_prologue_cost
= 0;
3387 unsigned vec_epilogue_cost
= 0;
3388 int scalar_single_iter_cost
= 0;
3389 int scalar_outside_cost
= 0;
3390 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3391 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
3392 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3394 /* Cost model disabled. */
3395 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
3399 *ret_min_profitable_niters
= 0;
3400 *ret_min_profitable_estimate
= 0;
3404 /* Requires loop versioning tests to handle misalignment. */
3405 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
3407 /* FIXME: Make cost depend on complexity of individual check. */
3408 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
3409 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
, vector_stmt
,
3410 NULL
, NULL_TREE
, 0, vect_prologue
);
3411 if (dump_enabled_p ())
3412 dump_printf (MSG_NOTE
,
3413 "cost model: Adding cost of checks for loop "
3414 "versioning to treat misalignment.\n");
3417 /* Requires loop versioning with alias checks. */
3418 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3420 /* FIXME: Make cost depend on complexity of individual check. */
3421 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
3422 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
, vector_stmt
,
3423 NULL
, NULL_TREE
, 0, vect_prologue
);
3424 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
3426 /* Count LEN - 1 ANDs and LEN comparisons. */
3427 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, len
* 2 - 1,
3428 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
3429 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
3432 /* Count LEN - 1 ANDs and LEN comparisons. */
3433 unsigned int nstmts
= len
* 2 - 1;
3434 /* +1 for each bias that needs adding. */
3435 for (unsigned int i
= 0; i
< len
; ++i
)
3436 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
3438 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, nstmts
,
3439 scalar_stmt
, NULL
, NULL_TREE
, 0, vect_prologue
);
3441 if (dump_enabled_p ())
3442 dump_printf (MSG_NOTE
,
3443 "cost model: Adding cost of checks for loop "
3444 "versioning aliasing.\n");
3447 /* Requires loop versioning with niter checks. */
3448 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
3450 /* FIXME: Make cost depend on complexity of individual check. */
3451 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, vector_stmt
,
3452 NULL
, NULL_TREE
, 0, vect_prologue
);
3453 if (dump_enabled_p ())
3454 dump_printf (MSG_NOTE
,
3455 "cost model: Adding cost of checks for loop "
3456 "versioning niters.\n");
3459 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3460 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
3461 NULL
, NULL_TREE
, 0, vect_prologue
);
3463 /* Count statements in scalar loop. Using this as scalar cost for a single
3466 TODO: Add outer loop support.
3468 TODO: Consider assigning different costs to different scalar
3471 scalar_single_iter_cost
3472 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
);
3474 /* Add additional cost for the peeled instructions in prologue and epilogue
3475 loop. (For fully-masked loops there will be no peeling.)
3477 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3478 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3480 TODO: Build an expression that represents peel_iters for prologue and
3481 epilogue to be used in a run-time test. */
3483 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3485 peel_iters_prologue
= 0;
3486 peel_iters_epilogue
= 0;
3488 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
3490 /* We need to peel exactly one iteration. */
3491 peel_iters_epilogue
+= 1;
3492 stmt_info_for_cost
*si
;
3494 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
3496 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, si
->count
,
3497 si
->kind
, si
->stmt_info
, si
->vectype
,
3498 si
->misalign
, vect_epilogue
);
3501 /* Calculate how many masks we need to generate. */
3502 unsigned int num_masks
= 0;
3504 unsigned int num_vectors_m1
;
3505 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), num_vectors_m1
, rgm
)
3507 num_masks
+= num_vectors_m1
+ 1;
3508 gcc_assert (num_masks
> 0);
3510 /* In the worst case, we need to generate each mask in the prologue
3511 and in the loop body. One of the loop body mask instructions
3512 replaces the comparison in the scalar loop, and since we don't
3513 count the scalar comparison against the scalar body, we shouldn't
3514 count that vector instruction against the vector body either.
3516 Sometimes we can use unpacks instead of generating prologue
3517 masks and sometimes the prologue mask will fold to a constant,
3518 so the actual prologue cost might be smaller. However, it's
3519 simpler and safer to use the worst-case cost; if this ends up
3520 being the tie-breaker between vectorizing or not, then it's
3521 probably better not to vectorize. */
3522 (void) add_stmt_cost (loop_vinfo
,
3523 target_cost_data
, num_masks
, vector_stmt
,
3524 NULL
, NULL_TREE
, 0, vect_prologue
);
3525 (void) add_stmt_cost (loop_vinfo
,
3526 target_cost_data
, num_masks
- 1, vector_stmt
,
3527 NULL
, NULL_TREE
, 0, vect_body
);
3531 peel_iters_prologue
= assumed_vf
/ 2;
3532 if (dump_enabled_p ())
3533 dump_printf (MSG_NOTE
, "cost model: "
3534 "prologue peel iters set to vf/2.\n");
3536 /* If peeling for alignment is unknown, loop bound of main loop becomes
3538 peel_iters_epilogue
= assumed_vf
/ 2;
3539 if (dump_enabled_p ())
3540 dump_printf (MSG_NOTE
, "cost model: "
3541 "epilogue peel iters set to vf/2 because "
3542 "peeling for alignment is unknown.\n");
3544 /* If peeled iterations are unknown, count a taken branch and a not taken
3545 branch per peeled loop. Even if scalar loop iterations are known,
3546 vector iterations are not known since peeled prologue iterations are
3547 not known. Hence guards remain the same. */
3548 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
3549 NULL
, NULL_TREE
, 0, vect_prologue
);
3550 (void) add_stmt_cost (loop_vinfo
,
3551 target_cost_data
, 1, cond_branch_not_taken
,
3552 NULL
, NULL_TREE
, 0, vect_prologue
);
3553 (void) add_stmt_cost (loop_vinfo
, target_cost_data
, 1, cond_branch_taken
,
3554 NULL
, NULL_TREE
, 0, vect_epilogue
);
3555 (void) add_stmt_cost (loop_vinfo
,
3556 target_cost_data
, 1, cond_branch_not_taken
,
3557 NULL
, NULL_TREE
, 0, vect_epilogue
);
3558 stmt_info_for_cost
*si
;
3560 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
3562 (void) add_stmt_cost (loop_vinfo
, target_cost_data
,
3563 si
->count
* peel_iters_prologue
,
3564 si
->kind
, si
->stmt_info
, si
->vectype
,
3567 (void) add_stmt_cost (loop_vinfo
, target_cost_data
,
3568 si
->count
* peel_iters_epilogue
,
3569 si
->kind
, si
->stmt_info
, si
->vectype
,
3576 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
3577 stmt_info_for_cost
*si
;
3579 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3581 prologue_cost_vec
.create (2);
3582 epilogue_cost_vec
.create (2);
3583 peel_iters_prologue
= npeel
;
3585 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
3586 &peel_iters_epilogue
,
3587 &LOOP_VINFO_SCALAR_ITERATION_COST
3590 &epilogue_cost_vec
);
3592 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
3593 (void) add_stmt_cost (loop_vinfo
,
3594 data
, si
->count
, si
->kind
, si
->stmt_info
,
3595 si
->vectype
, si
->misalign
, vect_prologue
);
3597 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
3598 (void) add_stmt_cost (loop_vinfo
,
3599 data
, si
->count
, si
->kind
, si
->stmt_info
,
3600 si
->vectype
, si
->misalign
, vect_epilogue
);
3602 prologue_cost_vec
.release ();
3603 epilogue_cost_vec
.release ();
3606 /* FORNOW: The scalar outside cost is incremented in one of the
3609 1. The vectorizer checks for alignment and aliasing and generates
3610 a condition that allows dynamic vectorization. A cost model
3611 check is ANDED with the versioning condition. Hence scalar code
3612 path now has the added cost of the versioning check.
3614 if (cost > th & versioning_check)
3617 Hence run-time scalar is incremented by not-taken branch cost.
3619 2. The vectorizer then checks if a prologue is required. If the
3620 cost model check was not done before during versioning, it has to
3621 be done before the prologue check.
3624 prologue = scalar_iters
3629 if (prologue == num_iters)
3632 Hence the run-time scalar cost is incremented by a taken branch,
3633 plus a not-taken branch, plus a taken branch cost.
3635 3. The vectorizer then checks if an epilogue is required. If the
3636 cost model check was not done before during prologue check, it
3637 has to be done with the epilogue check.
3643 if (prologue == num_iters)
3646 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3649 Hence the run-time scalar cost should be incremented by 2 taken
3652 TODO: The back end may reorder the BBS's differently and reverse
3653 conditions/branch directions. Change the estimates below to
3654 something more reasonable. */
3656 /* If the number of iterations is known and we do not do versioning, we can
3657 decide whether to vectorize at compile time. Hence the scalar version
3658 do not carry cost model guard costs. */
3659 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
3660 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3662 /* Cost model check occurs at versioning. */
3663 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3664 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
3667 /* Cost model check occurs at prologue generation. */
3668 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
3669 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
3670 + vect_get_stmt_cost (cond_branch_not_taken
);
3671 /* Cost model check occurs at epilogue generation. */
3673 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
3677 /* Complete the target-specific cost calculations. */
3678 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
3679 &vec_inside_cost
, &vec_epilogue_cost
);
3681 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
3683 /* Stash the costs so that we can compare two loop_vec_infos. */
3684 loop_vinfo
->vec_inside_cost
= vec_inside_cost
;
3685 loop_vinfo
->vec_outside_cost
= vec_outside_cost
;
3687 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3690 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
3692 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
3694 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
3696 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
3697 scalar_single_iter_cost
);
3698 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
3699 scalar_outside_cost
);
3700 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
3702 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
3703 peel_iters_prologue
);
3704 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
3705 peel_iters_epilogue
);
3708 /* Calculate number of iterations required to make the vector version
3709 profitable, relative to the loop bodies only. The following condition
3711 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3713 SIC = scalar iteration cost, VIC = vector iteration cost,
3714 VOC = vector outside cost, VF = vectorization factor,
3715 NPEEL = prologue iterations + epilogue iterations,
3716 SOC = scalar outside cost for run time cost model check. */
3718 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
3720 if (saving_per_viter
<= 0)
3722 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
3723 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
3724 "vectorization did not happen for a simd loop");
3726 if (dump_enabled_p ())
3727 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3728 "cost model: the vector iteration cost = %d "
3729 "divided by the scalar iteration cost = %d "
3730 "is greater or equal to the vectorization factor = %d"
3732 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
3733 *ret_min_profitable_niters
= -1;
3734 *ret_min_profitable_estimate
= -1;
3738 /* ??? The "if" arm is written to handle all cases; see below for what
3739 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3740 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3742 /* Rewriting the condition above in terms of the number of
3743 vector iterations (vniters) rather than the number of
3744 scalar iterations (niters) gives:
3746 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3748 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3750 For integer N, X and Y when X > 0:
3752 N * X > Y <==> N >= (Y /[floor] X) + 1. */
3753 int outside_overhead
= (vec_outside_cost
3754 - scalar_single_iter_cost
* peel_iters_prologue
3755 - scalar_single_iter_cost
* peel_iters_epilogue
3756 - scalar_outside_cost
);
3757 /* We're only interested in cases that require at least one
3758 vector iteration. */
3759 int min_vec_niters
= 1;
3760 if (outside_overhead
> 0)
3761 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
3763 if (dump_enabled_p ())
3764 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
3767 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3769 /* Now that we know the minimum number of vector iterations,
3770 find the minimum niters for which the scalar cost is larger:
3772 SIC * niters > VIC * vniters + VOC - SOC
3774 We know that the minimum niters is no more than
3775 vniters * VF + NPEEL, but it might be (and often is) less
3776 than that if a partial vector iteration is cheaper than the
3777 equivalent scalar code. */
3778 int threshold
= (vec_inside_cost
* min_vec_niters
3780 - scalar_outside_cost
);
3782 min_profitable_iters
= 1;
3784 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
3787 /* Convert the number of vector iterations into a number of
3788 scalar iterations. */
3789 min_profitable_iters
= (min_vec_niters
* assumed_vf
3790 + peel_iters_prologue
3791 + peel_iters_epilogue
);
3795 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
3797 - vec_inside_cost
* peel_iters_prologue
3798 - vec_inside_cost
* peel_iters_epilogue
);
3799 if (min_profitable_iters
<= 0)
3800 min_profitable_iters
= 0;
3803 min_profitable_iters
/= saving_per_viter
;
3805 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
3806 <= (((int) vec_inside_cost
* min_profitable_iters
)
3807 + (((int) vec_outside_cost
- scalar_outside_cost
)
3809 min_profitable_iters
++;
3813 if (dump_enabled_p ())
3814 dump_printf (MSG_NOTE
,
3815 " Calculated minimum iters for profitability: %d\n",
3816 min_profitable_iters
);
3818 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
3819 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
3820 /* We want the vectorized loop to execute at least once. */
3821 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
3823 if (dump_enabled_p ())
3824 dump_printf_loc (MSG_NOTE
, vect_location
,
3825 " Runtime profitability threshold = %d\n",
3826 min_profitable_iters
);
3828 *ret_min_profitable_niters
= min_profitable_iters
;
3830 /* Calculate number of iterations required to make the vector version
3831 profitable, relative to the loop bodies only.
3833 Non-vectorized variant is SIC * niters and it must win over vector
3834 variant on the expected loop trip count. The following condition must hold true:
3835 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3837 if (vec_outside_cost
<= 0)
3838 min_profitable_estimate
= 0;
3839 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3841 /* This is a repeat of the code above, but with + SOC rather
3843 int outside_overhead
= (vec_outside_cost
3844 - scalar_single_iter_cost
* peel_iters_prologue
3845 - scalar_single_iter_cost
* peel_iters_epilogue
3846 + scalar_outside_cost
);
3847 int min_vec_niters
= 1;
3848 if (outside_overhead
> 0)
3849 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
3851 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3853 int threshold
= (vec_inside_cost
* min_vec_niters
3855 + scalar_outside_cost
);
3856 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
3859 min_profitable_estimate
= (min_vec_niters
* assumed_vf
3860 + peel_iters_prologue
3861 + peel_iters_epilogue
);
3865 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
3867 - vec_inside_cost
* peel_iters_prologue
3868 - vec_inside_cost
* peel_iters_epilogue
)
3869 / ((scalar_single_iter_cost
* assumed_vf
)
3872 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
3873 if (dump_enabled_p ())
3874 dump_printf_loc (MSG_NOTE
, vect_location
,
3875 " Static estimate profitability threshold = %d\n",
3876 min_profitable_estimate
);
3878 *ret_min_profitable_estimate
= min_profitable_estimate
;
3881 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3882 vector elements (not bits) for a vector with NELT elements. */
3884 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
3885 vec_perm_builder
*sel
)
3887 /* The encoding is a single stepped pattern. Any wrap-around is handled
3888 by vec_perm_indices. */
3889 sel
->new_vector (nelt
, 1, 3);
3890 for (unsigned int i
= 0; i
< 3; i
++)
3891 sel
->quick_push (i
+ offset
);
3894 /* Checks whether the target supports whole-vector shifts for vectors of mode
3895 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3896 it supports vec_perm_const with masks for all necessary shift amounts. */
3898 have_whole_vector_shift (machine_mode mode
)
3900 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
3903 /* Variable-length vectors should be handled via the optab. */
3905 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
3908 vec_perm_builder sel
;
3909 vec_perm_indices indices
;
3910 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
3912 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
3913 indices
.new_vector (sel
, 2, nelt
);
3914 if (!can_vec_perm_const_p (mode
, indices
, false))
3920 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3921 functions. Design better to avoid maintenance issues. */
3923 /* Function vect_model_reduction_cost.
3925 Models cost for a reduction operation, including the vector ops
3926 generated within the strip-mine loop, the initial definition before
3927 the loop, and the epilogue code that must be generated. */
3930 vect_model_reduction_cost (loop_vec_info loop_vinfo
,
3931 stmt_vec_info stmt_info
, internal_fn reduc_fn
,
3932 vect_reduction_type reduction_type
,
3933 int ncopies
, stmt_vector_for_cost
*cost_vec
)
3935 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
;
3936 enum tree_code code
;
3940 class loop
*loop
= NULL
;
3943 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3945 /* Condition reductions generate two reductions in the loop. */
3946 if (reduction_type
== COND_REDUCTION
)
3949 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3950 mode
= TYPE_MODE (vectype
);
3951 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3953 code
= gimple_assign_rhs_code (orig_stmt_info
->stmt
);
3955 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
3956 /* No extra instructions are needed in the prologue. The loop body
3957 operations are costed in vectorizable_condition. */
3959 else if (reduction_type
== FOLD_LEFT_REDUCTION
)
3961 /* No extra instructions needed in the prologue. */
3964 if (reduc_fn
!= IFN_LAST
)
3965 /* Count one reduction-like operation per vector. */
3966 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
3967 stmt_info
, 0, vect_body
);
3970 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3971 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
3972 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
3973 vec_to_scalar
, stmt_info
, 0,
3975 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
3976 scalar_stmt
, stmt_info
, 0,
3982 /* Add in cost for initial definition.
3983 For cond reduction we have four vectors: initial index, step,
3984 initial result of the data reduction, initial value of the index
3986 int prologue_stmts
= reduction_type
== COND_REDUCTION
? 4 : 1;
3987 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
3988 scalar_to_vec
, stmt_info
, 0,
3991 /* Cost of reduction op inside loop. */
3992 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
3993 stmt_info
, 0, vect_body
);
3996 /* Determine cost of epilogue code.
3998 We have a reduction operator that will reduce the vector in one statement.
3999 Also requires scalar extract. */
4001 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
4003 if (reduc_fn
!= IFN_LAST
)
4005 if (reduction_type
== COND_REDUCTION
)
4007 /* An EQ stmt and an COND_EXPR stmt. */
4008 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
4009 vector_stmt
, stmt_info
, 0,
4011 /* Reduction of the max index and a reduction of the found
4013 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
4014 vec_to_scalar
, stmt_info
, 0,
4016 /* A broadcast of the max value. */
4017 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4018 scalar_to_vec
, stmt_info
, 0,
4023 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
4024 stmt_info
, 0, vect_epilogue
);
4025 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4026 vec_to_scalar
, stmt_info
, 0,
4030 else if (reduction_type
== COND_REDUCTION
)
4032 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
4033 /* Extraction of scalar elements. */
4034 epilogue_cost
+= record_stmt_cost (cost_vec
,
4035 2 * estimated_nunits
,
4036 vec_to_scalar
, stmt_info
, 0,
4038 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4039 epilogue_cost
+= record_stmt_cost (cost_vec
,
4040 2 * estimated_nunits
- 3,
4041 scalar_stmt
, stmt_info
, 0,
4044 else if (reduction_type
== EXTRACT_LAST_REDUCTION
4045 || reduction_type
== FOLD_LEFT_REDUCTION
)
4046 /* No extra instructions need in the epilogue. */
4050 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
4052 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info
->stmt
)));
4053 int element_bitsize
= tree_to_uhwi (bitsize
);
4054 int nelements
= vec_size_in_bits
/ element_bitsize
;
4056 if (code
== COND_EXPR
)
4059 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
4061 /* We have a whole vector shift available. */
4062 if (optab
!= unknown_optab
4063 && VECTOR_MODE_P (mode
)
4064 && optab_handler (optab
, mode
) != CODE_FOR_nothing
4065 && have_whole_vector_shift (mode
))
4067 /* Final reduction via vector shifts and the reduction operator.
4068 Also requires scalar extract. */
4069 epilogue_cost
+= record_stmt_cost (cost_vec
,
4070 exact_log2 (nelements
) * 2,
4071 vector_stmt
, stmt_info
, 0,
4073 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
4074 vec_to_scalar
, stmt_info
, 0,
4078 /* Use extracts and reduction op for final reduction. For N
4079 elements, we have N extracts and N-1 reduction ops. */
4080 epilogue_cost
+= record_stmt_cost (cost_vec
,
4081 nelements
+ nelements
- 1,
4082 vector_stmt
, stmt_info
, 0,
4087 if (dump_enabled_p ())
4088 dump_printf (MSG_NOTE
,
4089 "vect_model_reduction_cost: inside_cost = %d, "
4090 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
4091 prologue_cost
, epilogue_cost
);
4095 /* Function vect_model_induction_cost.
4097 Models cost for induction operations. */
4100 vect_model_induction_cost (stmt_vec_info stmt_info
, int ncopies
,
4101 stmt_vector_for_cost
*cost_vec
)
4103 unsigned inside_cost
, prologue_cost
;
4105 if (PURE_SLP_STMT (stmt_info
))
4108 /* loop cost for vec_loop. */
4109 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
4110 stmt_info
, 0, vect_body
);
4112 /* prologue cost for vec_init and vec_step. */
4113 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
4114 stmt_info
, 0, vect_prologue
);
4116 if (dump_enabled_p ())
4117 dump_printf_loc (MSG_NOTE
, vect_location
,
4118 "vect_model_induction_cost: inside_cost = %d, "
4119 "prologue_cost = %d .\n", inside_cost
, prologue_cost
);
4124 /* Function get_initial_def_for_reduction
4127 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4128 INIT_VAL - the initial value of the reduction variable
4131 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4132 of the reduction (used for adjusting the epilog - see below).
4133 Return a vector variable, initialized according to the operation that
4134 STMT_VINFO performs. This vector will be used as the initial value
4135 of the vector of partial results.
4137 Option1 (adjust in epilog): Initialize the vector as follows:
4138 add/bit or/xor: [0,0,...,0,0]
4139 mult/bit and: [1,1,...,1,1]
4140 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4141 and when necessary (e.g. add/mult case) let the caller know
4142 that it needs to adjust the result by init_val.
4144 Option2: Initialize the vector as follows:
4145 add/bit or/xor: [init_val,0,0,...,0]
4146 mult/bit and: [init_val,1,1,...,1]
4147 min/max/cond_expr: [init_val,init_val,...,init_val]
4148 and no adjustments are needed.
4150 For example, for the following code:
4156 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4157 For a vector of 4 units, we want to return either [0,0,0,init_val],
4158 or [0,0,0,0] and let the caller know that it needs to adjust
4159 the result at the end by 'init_val'.
4161 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4162 initialization vector is simpler (same element in all entries), if
4163 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4165 A cost model should help decide between these two schemes. */
/* NOTE(review): the embedded source line numbers in this extract jump
   (e.g. 4166-4167, 4194-4196 and 4254-4261 are absent), so parts of this
   function -- including the return type, the switch head and the final
   return statement -- are not visible here.  Verify any edit against the
   complete file.  */
4168 get_initial_def_for_reduction (loop_vec_info loop_vinfo
,
4169 stmt_vec_info stmt_vinfo
,
4170 enum tree_code code
, tree init_val
,
4171 tree
*adjustment_def
)
/* The loop being vectorized, and the vector type corresponding to
   INIT_VAL's scalar type.  */
4173 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4174 tree scalar_type
= TREE_TYPE (init_val
);
4175 tree vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
/* Default neutral-element candidates (0.0 / 0); overridden below for
   MULT_EXPR, and apparently also for BIT_AND_EXPR in elided lines.  */
4178 REAL_VALUE_TYPE real_init_val
= dconst0
;
4179 int int_init_val
= 0;
4180 gimple_seq stmts
= NULL
;
/* Sanity checks: a vector type must exist, the scalar type must be
   pointer/integral/float, and STMT_VINFO must belong to LOOP (or to a
   loop nested inside it).  */
4182 gcc_assert (vectype
);
4184 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
4185 || SCALAR_FLOAT_TYPE_P (scalar_type
));
4187 gcc_assert (nested_in_vect_loop_p (loop
, stmt_vinfo
)
4188 || loop
== (gimple_bb (stmt_vinfo
->stmt
))->loop_father
);
4190 /* ADJUSTMENT_DEF is NULL when called from
4191 vect_create_epilog_for_reduction to vectorize double reduction. */
4193 *adjustment_def
= NULL
;
/* Per-CODE selection of DEF_FOR_INIT, the scalar neutral element of
   the reduction (the switch head itself is elided in this extract).  */
4197 case WIDEN_SUM_EXPR
:
4207 if (code
== MULT_EXPR
)
4209 real_init_val
= dconst1
;
4213 if (code
== BIT_AND_EXPR
)
4216 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
4217 def_for_init
= build_real (scalar_type
, real_init_val
);
4219 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
/* Option1: the caller can adjust in the epilogue (ADJUSTMENT_DEF
   non-null), or the neutral value already equals INIT_VAL -- splat
   DEF_FOR_INIT and report INIT_VAL as the needed adjustment.  */
4221 if (adjustment_def
|| operand_equal_p (def_for_init
, init_val
, 0))
4223 /* Option1: the first element is '0' or '1' as well. */
4224 if (!operand_equal_p (def_for_init
, init_val
, 0))
4225 *adjustment_def
= init_val
;
4226 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
/* Variable-length vectors: splat the neutral value, then shift
   INIT_VAL into element 0 with VEC_SHL_INSERT.  */
4229 else if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant ())
4231 /* Option2 (variable length): the first element is INIT_VAL. */
4232 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
4234 init_def
= gimple_build (&stmts
, CFN_VEC_SHL_INSERT
,
4235 vectype
, init_def
, init_val
);
4239 /* Option2: the first element is INIT_VAL. */
4240 tree_vector_builder
elts (vectype
, 1, 2);
4241 elts
.quick_push (init_val
);
4242 elts
.quick_push (def_for_init
);
4243 init_def
= gimple_build_vector (&stmts
, &elts
);
/* Presumably the min/max/cond_expr arm of the elided switch: the
   initial vector is simply INIT_VAL splat -- TODO confirm against the
   full file.  */
4252 init_val
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_val
);
4253 init_def
= gimple_build_vector_from_val (&stmts
, vectype
, init_val
);
/* Materialize any statements built above on the loop preheader edge.  */
4262 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4266 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4267 NUMBER_OF_VECTORS is the number of vector defs to create.
4268 If NEUTRAL_OP is nonnull, introducing extra elements of that
4269 value will not change the result. */
/* NOTE(review): this extract elides several original lines (the embedded
   numbering jumps, e.g. 4270-4271, 4273, 4277, 4322-4324, 4333-4335,
   4357-4363, 4374-4377, 4381-4383), including the SLP_NODE parameter,
   local declarations and closing braces.  Verify edits against the
   complete file.  */
4272 get_initial_defs_for_reduction (vec_info
*vinfo
,
4274 vec
<tree
> *vec_oprnds
,
4275 unsigned int number_of_vectors
,
4276 bool reduc_chain
, tree neutral_op
)
/* The SLP group's scalar stmts; the first stmt supplies the vector type
   and must be a reduction def.  */
4278 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
4279 stmt_vec_info stmt_vinfo
= stmts
[0];
4280 unsigned HOST_WIDE_INT nunits
;
4281 unsigned j
, number_of_places_left_in_vector
;
4283 unsigned int group_size
= stmts
.length ();
4287 vector_type
= STMT_VINFO_VECTYPE (stmt_vinfo
);
4289 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_reduction_def
);
/* Initial values are fetched from the loop preheader edge of the loop
   containing the first scalar stmt.  */
4291 loop
= (gimple_bb (stmt_vinfo
->stmt
))->loop_father
;
4293 edge pe
= loop_preheader_edge (loop
);
/* A reduction chain must come with a neutral value.  */
4295 gcc_assert (!reduc_chain
|| neutral_op
);
4297 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4298 created vectors. It is greater than 1 if unrolling is performed.
4300 For example, we have two scalar operands, s1 and s2 (e.g., group of
4301 strided accesses of size two), while NUNITS is four (i.e., four scalars
4302 of this type can be packed in a vector). The output vector will contain
4303 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4306 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4307 vectors containing the operands.
4309 For example, NUNITS is four as before, and the group size is 8
4310 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4311 {s5, s6, s7, s8}. */
/* Variable-length vectors: fall back to using GROUP_SIZE elements per
   constructed chunk.  */
4313 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
4314 nunits
= group_size
;
/* Fill NUNITS-element chunks one element at a time; CONSTANT_P tracks
   whether every element pushed so far is a constant.  */
4316 number_of_places_left_in_vector
= nunits
;
4317 bool constant_p
= true;
4318 tree_vector_builder
elts (vector_type
, nunits
, 1);
4319 elts
.quick_grow (nunits
);
4320 gimple_seq ctor_seq
= NULL
;
4321 for (j
= 0; j
< nunits
* number_of_vectors
; ++j
)
4325 stmt_vinfo
= stmts
[i
];
4327 /* Get the def before the loop. In reduction chain we have only
4328 one initial value. Else we have as many as PHIs in the group. */
4330 op
= j
!= 0 ? neutral_op
: PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
, pe
);
4331 else if (((vec_oprnds
->length () + 1) * nunits
4332 - number_of_places_left_in_vector
>= group_size
)
4336 op
= PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
, pe
);
4338 /* Create 'vect_ = {op0,op1,...,opn}'. */
4339 number_of_places_left_in_vector
--;
4340 elts
[nunits
- number_of_places_left_in_vector
- 1] = op
;
4341 if (!CONSTANT_CLASS_P (op
))
/* A chunk of NUNITS elements is complete: turn it into a vector def.  */
4344 if (number_of_places_left_in_vector
== 0)
4347 if (constant_p
&& !neutral_op
4348 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4349 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4350 /* Build the vector directly from ELTS. */
4351 init
= gimple_build_vector (&ctor_seq
, &elts
);
4352 else if (neutral_op
)
4354 /* Build a vector of the neutral value and shift the
4355 other elements into place. */
4356 init
= gimple_build_vector_from_val (&ctor_seq
, vector_type
,
/* Trailing neutral elements need not be inserted.  */
4359 while (k
> 0 && elts
[k
- 1] == neutral_op
)
4364 init
= gimple_build (&ctor_seq
, CFN_VEC_SHL_INSERT
,
4365 vectype
, init
, elts
[k
]);
4370 /* First time round, duplicate ELTS to fill the
4371 required number of vectors. */
4372 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
, elts
,
4373 number_of_vectors
, *vec_oprnds
);
4376 vec_oprnds
->quick_push (init
);
/* Reset the builder state for the next chunk.  */
4378 number_of_places_left_in_vector
= nunits
;
4379 elts
.new_vector (vector_type
, nunits
, 1);
4380 elts
.quick_grow (nunits
);
/* Emit any constructor statements on the preheader edge.  */
4384 if (ctor_seq
!= NULL
)
4385 gsi_insert_seq_on_edge_immediate (pe
, ctor_seq
);
4388 /* For a statement STMT_INFO taking part in a reduction operation return
4389 the stmt_vec_info the meta information is stored on. */
/* NOTE(review): the embedded line numbers jump (4390-4391, 4393, 4400,
   4403, 4405, 4407, 4410-4414 absent), so the return type, braces, the
   declaration consumed by line 4408 and the final return are elided in
   this extract.  Verify edits against the complete file.  */
4392 info_for_reduction (vec_info
*vinfo
, stmt_vec_info stmt_info
)
/* Step back to the original (pre-pattern) stmt, which must be part of a
   reduction, then move to the reduction PHI if STMT_INFO is not itself
   a PHI.  */
4394 stmt_info
= vect_orig_stmt (stmt_info
);
4395 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info
));
4396 if (!is_a
<gphi
*> (stmt_info
->stmt
))
4397 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
4398 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
/* Double reduction: a single-argument PHI is the loop-closed PHI of the
   inner loop; hop once more to the reduction def carrying the info.  */
4399 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
4401 if (gimple_phi_num_args (phi
) == 1)
4402 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
/* Nested cycle: look up the def flowing in over the preheader edge;
   if that is a double-reduction def the meta info lives there
   (the declaration initialized by line 4408 is elided here).  */
4404 else if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
4406 edge pe
= loop_preheader_edge (gimple_bb (phi
)->loop_father
);
4408 = vinfo
->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi
, pe
));
4409 if (info
&& STMT_VINFO_DEF_TYPE (info
) == vect_double_reduction_def
)
4415 /* Function vect_create_epilog_for_reduction
4417 Create code at the loop-epilog to finalize the result of a reduction
4420 STMT_INFO is the scalar reduction stmt that is being vectorized.
4421 SLP_NODE is an SLP node containing a group of reduction statements. The
4422 first one in this group is STMT_INFO.
4423 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4424 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4428 1. Completes the reduction def-use cycles.
4429 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4430 by calling the function specified by REDUC_FN if available, or by
4431 other means (whole-vector shifts or a scalar loop).
4432 The function also creates a new phi node at the loop exit to preserve
4433 loop-closed form, as illustrated below.
4435 The flow at the entry to this function:
4438 vec_def = phi <vec_init, null> # REDUCTION_PHI
4439 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4440 s_loop = scalar_stmt # (scalar) STMT_INFO
4442 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4446 The above is transformed by this function into:
4449 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4450 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4451 s_loop = scalar_stmt # (scalar) STMT_INFO
4453 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4454 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4455 v_out2 = reduce <v_out1>
4456 s_out3 = extract_field <v_out2, 0>
4457 s_out4 = adjust_result <s_out3>
4463 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo
,
4464 stmt_vec_info stmt_info
,
4466 slp_instance slp_node_instance
)
4468 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
4469 gcc_assert (reduc_info
->is_reduc_info
);
4470 /* For double reductions we need to get at the inner loop reduction
4471 stmt which has the meta info attached. Our stmt_info is that of the
4472 loop-closed PHI of the inner loop which we remember as
4473 def for the reduction PHI generation. */
4474 bool double_reduc
= false;
4475 stmt_vec_info rdef_info
= stmt_info
;
4476 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
4478 gcc_assert (!slp_node
);
4479 double_reduc
= true;
4480 stmt_info
= loop_vinfo
->lookup_def (gimple_phi_arg_def
4481 (stmt_info
->stmt
, 0));
4482 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
4484 gphi
*reduc_def_stmt
4485 = as_a
<gphi
*> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))->stmt
);
4486 enum tree_code code
= STMT_VINFO_REDUC_CODE (reduc_info
);
4487 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
4488 stmt_vec_info prev_phi_info
;
4491 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
4492 basic_block exit_bb
;
4495 gimple
*new_phi
= NULL
, *phi
;
4496 stmt_vec_info phi_info
;
4497 gimple_stmt_iterator exit_gsi
;
4498 tree new_temp
= NULL_TREE
, new_name
, new_scalar_dest
;
4499 gimple
*epilog_stmt
= NULL
;
4503 tree orig_name
, scalar_result
;
4504 imm_use_iterator imm_iter
, phi_imm_iter
;
4505 use_operand_p use_p
, phi_use_p
;
4507 bool nested_in_vect_loop
= false;
4508 auto_vec
<gimple
*> new_phis
;
4510 auto_vec
<tree
> scalar_results
;
4511 unsigned int group_size
= 1, k
;
4512 auto_vec
<gimple
*> phis
;
4513 bool slp_reduc
= false;
4514 bool direct_slp_reduc
;
4515 tree new_phi_result
;
4516 tree induction_index
= NULL_TREE
;
4519 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
4521 if (nested_in_vect_loop_p (loop
, stmt_info
))
4525 nested_in_vect_loop
= true;
4526 gcc_assert (!slp_node
);
4528 gcc_assert (!nested_in_vect_loop
|| double_reduc
);
4530 vectype
= STMT_VINFO_REDUC_VECTYPE (reduc_info
);
4531 gcc_assert (vectype
);
4532 mode
= TYPE_MODE (vectype
);
4534 tree initial_def
= NULL
;
4535 tree induc_val
= NULL_TREE
;
4536 tree adjustment_def
= NULL
;
4541 /* Get at the scalar def before the loop, that defines the initial value
4542 of the reduction variable. */
4543 initial_def
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
4544 loop_preheader_edge (loop
));
4545 /* Optimize: for induction condition reduction, if we can't use zero
4546 for induc_val, use initial_def. */
4547 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
4548 induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
4549 else if (double_reduc
)
4551 else if (nested_in_vect_loop
)
4554 adjustment_def
= STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
);
4561 vec_num
= SLP_TREE_VEC_STMTS (slp_node_instance
->reduc_phis
).length ();
4568 phi_info
= STMT_VINFO_VEC_STMT (loop_vinfo
->lookup_stmt (reduc_def_stmt
));
4572 phi_info
= STMT_VINFO_RELATED_STMT (phi_info
);
4577 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4578 which is updated with the current index of the loop for every match of
4579 the original loop's cond_expr (VEC_STMT). This results in a vector
4580 containing the last time the condition passed for that vector lane.
4581 The first match will be a 1 to allow 0 to be used for non-matching
4582 indexes. If there are no matches at all then the vector will be all
4585 PR92772: This algorithm is broken for architectures that support
4586 masked vectors, but do not provide fold_extract_last. */
4587 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
4589 auto_vec
<std::pair
<tree
, bool>, 2> ccompares
;
4590 stmt_vec_info cond_info
= STMT_VINFO_REDUC_DEF (reduc_info
);
4591 cond_info
= vect_stmt_to_vectorize (cond_info
);
4592 while (cond_info
!= reduc_info
)
4594 if (gimple_assign_rhs_code (cond_info
->stmt
) == COND_EXPR
)
4596 gimple
*vec_stmt
= STMT_VINFO_VEC_STMT (cond_info
)->stmt
;
4597 gcc_assert (gimple_assign_rhs_code (vec_stmt
) == VEC_COND_EXPR
);
4599 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt
)),
4600 STMT_VINFO_REDUC_IDX (cond_info
) == 2));
4603 = loop_vinfo
->lookup_def (gimple_op (cond_info
->stmt
,
4604 1 + STMT_VINFO_REDUC_IDX
4606 cond_info
= vect_stmt_to_vectorize (cond_info
);
4608 gcc_assert (ccompares
.length () != 0);
4610 tree indx_before_incr
, indx_after_incr
;
4611 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype
);
4612 int scalar_precision
4613 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype
)));
4614 tree cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
4615 tree cr_index_vector_type
= get_related_vectype_for_scalar_type
4616 (TYPE_MODE (vectype
), cr_index_scalar_type
,
4617 TYPE_VECTOR_SUBPARTS (vectype
));
4619 /* First we create a simple vector induction variable which starts
4620 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4621 vector size (STEP). */
4623 /* Create a {1,2,3,...} vector. */
4624 tree series_vect
= build_index_vector (cr_index_vector_type
, 1, 1);
4626 /* Create a vector of the step value. */
4627 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
4628 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
4630 /* Create an induction variable. */
4631 gimple_stmt_iterator incr_gsi
;
4633 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4634 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
4635 insert_after
, &indx_before_incr
, &indx_after_incr
);
4637 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4638 filled with zeros (VEC_ZERO). */
4640 /* Create a vector of 0s. */
4641 tree zero
= build_zero_cst (cr_index_scalar_type
);
4642 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
4644 /* Create a vector phi node. */
4645 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
4646 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
4647 loop_vinfo
->add_stmt (new_phi
);
4648 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
4649 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4651 /* Now take the condition from the loops original cond_exprs
4652 and produce a new cond_exprs (INDEX_COND_EXPR) which for
4653 every match uses values from the induction variable
4654 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4656 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4657 the new cond_expr (INDEX_COND_EXPR). */
4658 gimple_seq stmts
= NULL
;
4659 for (int i
= ccompares
.length () - 1; i
!= -1; --i
)
4661 tree ccompare
= ccompares
[i
].first
;
4662 if (ccompares
[i
].second
)
4663 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
4664 cr_index_vector_type
,
4666 indx_before_incr
, new_phi_tree
);
4668 new_phi_tree
= gimple_build (&stmts
, VEC_COND_EXPR
,
4669 cr_index_vector_type
,
4671 new_phi_tree
, indx_before_incr
);
4673 gsi_insert_seq_before (&incr_gsi
, stmts
, GSI_SAME_STMT
);
4674 stmt_vec_info index_vec_info
4675 = loop_vinfo
->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree
));
4676 STMT_VINFO_VECTYPE (index_vec_info
) = cr_index_vector_type
;
4678 /* Update the phi with the vec cond. */
4679 induction_index
= new_phi_tree
;
4680 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
4681 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
4684 /* 2. Create epilog code.
4685 The reduction epilog code operates across the elements of the vector
4686 of partial results computed by the vectorized loop.
4687 The reduction epilog code consists of:
4689 step 1: compute the scalar result in a vector (v_out2)
4690 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4691 step 3: adjust the scalar result (s_out3) if needed.
4693 Step 1 can be accomplished using one the following three schemes:
4694 (scheme 1) using reduc_fn, if available.
4695 (scheme 2) using whole-vector shifts, if available.
4696 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4699 The overall epilog code looks like this:
4701 s_out0 = phi <s_loop> # original EXIT_PHI
4702 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4703 v_out2 = reduce <v_out1> # step 1
4704 s_out3 = extract_field <v_out2, 0> # step 2
4705 s_out4 = adjust_result <s_out3> # step 3
4707 (step 3 is optional, and steps 1 and 2 may be combined).
4708 Lastly, the uses of s_out0 are replaced by s_out4. */
4711 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4712 v_out1 = phi <VECT_DEF>
4713 Store them in NEW_PHIS. */
4716 exit_bb
= single_exit (loop
)->dest
;
4717 prev_phi_info
= NULL
;
4718 new_phis
.create (slp_node
? vec_num
: ncopies
);
4719 for (unsigned i
= 0; i
< vec_num
; i
++)
4722 def
= gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]->stmt
);
4724 def
= gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info
)->stmt
);
4725 for (j
= 0; j
< ncopies
; j
++)
4727 tree new_def
= copy_ssa_name (def
);
4728 phi
= create_phi_node (new_def
, exit_bb
);
4729 stmt_vec_info phi_info
= loop_vinfo
->add_stmt (phi
);
4731 new_phis
.quick_push (phi
);
4734 def
= vect_get_vec_def_for_stmt_copy (loop_vinfo
, def
);
4735 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi_info
;
4738 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
4739 prev_phi_info
= phi_info
;
4743 exit_gsi
= gsi_after_labels (exit_bb
);
4745 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4746 (i.e. when reduc_fn is not available) and in the final adjustment
4747 code (if needed). Also get the original scalar reduction variable as
4748 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4749 represents a reduction pattern), the tree-code and scalar-def are
4750 taken from the original stmt that the pattern-stmt (STMT) replaces.
4751 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4752 are taken from STMT. */
4754 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4755 if (orig_stmt_info
!= stmt_info
)
4757 /* Reduction pattern */
4758 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
4759 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info
) == stmt_info
);
4762 scalar_dest
= gimple_assign_lhs (orig_stmt_info
->stmt
);
4763 scalar_type
= TREE_TYPE (scalar_dest
);
4764 scalar_results
.create (group_size
);
4765 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
4766 bitsize
= TYPE_SIZE (scalar_type
);
4768 /* SLP reduction without reduction chain, e.g.,
4772 b2 = operation (b1) */
4773 slp_reduc
= (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
4775 /* True if we should implement SLP_REDUC using native reduction operations
4776 instead of scalar operations. */
4777 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
4779 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
4781 /* In case of reduction chain, e.g.,
4784 a3 = operation (a2),
4786 we may end up with more than one vector result. Here we reduce them to
4788 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) || direct_slp_reduc
)
4790 gimple_seq stmts
= NULL
;
4791 tree first_vect
= PHI_RESULT (new_phis
[0]);
4792 first_vect
= gimple_convert (&stmts
, vectype
, first_vect
);
4793 for (k
= 1; k
< new_phis
.length (); k
++)
4795 gimple
*next_phi
= new_phis
[k
];
4796 tree second_vect
= PHI_RESULT (next_phi
);
4797 second_vect
= gimple_convert (&stmts
, vectype
, second_vect
);
4798 first_vect
= gimple_build (&stmts
, code
, vectype
,
4799 first_vect
, second_vect
);
4801 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
4803 new_phi_result
= first_vect
;
4804 new_phis
.truncate (0);
4805 new_phis
.safe_push (SSA_NAME_DEF_STMT (first_vect
));
4807 /* Likewise if we couldn't use a single defuse cycle. */
4808 else if (ncopies
> 1)
4810 gcc_assert (new_phis
.length () == 1);
4811 gimple_seq stmts
= NULL
;
4812 tree first_vect
= PHI_RESULT (new_phis
[0]);
4813 first_vect
= gimple_convert (&stmts
, vectype
, first_vect
);
4814 stmt_vec_info next_phi_info
= loop_vinfo
->lookup_stmt (new_phis
[0]);
4815 for (int k
= 1; k
< ncopies
; ++k
)
4817 next_phi_info
= STMT_VINFO_RELATED_STMT (next_phi_info
);
4818 tree second_vect
= PHI_RESULT (next_phi_info
->stmt
);
4819 second_vect
= gimple_convert (&stmts
, vectype
, second_vect
);
4820 first_vect
= gimple_build (&stmts
, code
, vectype
,
4821 first_vect
, second_vect
);
4823 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
4824 new_phi_result
= first_vect
;
4825 new_phis
.truncate (0);
4826 new_phis
.safe_push (SSA_NAME_DEF_STMT (first_vect
));
4829 new_phi_result
= PHI_RESULT (new_phis
[0]);
4831 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
4832 && reduc_fn
!= IFN_LAST
)
4834 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4835 various data values where the condition matched and another vector
4836 (INDUCTION_INDEX) containing all the indexes of those matches. We
4837 need to extract the last matching index (which will be the index with
4838 highest value) and use this to index into the data vector.
4839 For the case where there were no matches, the data vector will contain
4840 all default values and the index vector will be all zeros. */
4842 /* Get various versions of the type of the vector of indexes. */
4843 tree index_vec_type
= TREE_TYPE (induction_index
);
4844 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
4845 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
4846 tree index_vec_cmp_type
= truth_type_for (index_vec_type
);
4848 /* Get an unsigned integer version of the type of the data vector. */
4849 int scalar_precision
4850 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
4851 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
4852 tree vectype_unsigned
= build_vector_type
4853 (scalar_type_unsigned
, TYPE_VECTOR_SUBPARTS (vectype
));
4855 /* First we need to create a vector (ZERO_VEC) of zeros and another
4856 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4857 can create using a MAX reduction and then expanding.
4858 In the case where the loop never made any matches, the max index will
4861 /* Vector of {0, 0, 0,...}. */
4862 tree zero_vec
= build_zero_cst (vectype
);
4864 gimple_seq stmts
= NULL
;
4865 new_phi_result
= gimple_convert (&stmts
, vectype
, new_phi_result
);
4866 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
4868 /* Find maximum value from the vector of found indexes. */
4869 tree max_index
= make_ssa_name (index_scalar_type
);
4870 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
4871 1, induction_index
);
4872 gimple_call_set_lhs (max_index_stmt
, max_index
);
4873 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
4875 /* Vector of {max_index, max_index, max_index,...}. */
4876 tree max_index_vec
= make_ssa_name (index_vec_type
);
4877 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
4879 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
4881 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
4883 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4884 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4885 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4886 otherwise. Only one value should match, resulting in a vector
4887 (VEC_COND) with one data value and the rest zeros.
4888 In the case where the loop never made any matches, every index will
4889 match, resulting in a vector with all data values (which will all be
4890 the default value). */
4892 /* Compare the max index vector to the vector of found indexes to find
4893 the position of the max value. */
4894 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
4895 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
4898 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
4900 /* Use the compare to choose either values from the data vector or
4902 tree vec_cond
= make_ssa_name (vectype
);
4903 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
4904 vec_compare
, new_phi_result
,
4906 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
4908 /* Finally we need to extract the data value from the vector (VEC_COND)
4909 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4910 reduction, but because this doesn't exist, we can use a MAX reduction
4911 instead. The data value might be signed or a float so we need to cast
4913 In the case where the loop never made any matches, the data values are
4914 all identical, and so will reduce down correctly. */
4916 /* Make the matched data values unsigned. */
4917 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
4918 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
4920 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
4923 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
4925 /* Reduce down to a scalar value. */
4926 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
4927 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
4929 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
4930 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
4932 /* Convert the reduced value back to the result type and set as the
4935 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
4937 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
4938 scalar_results
.safe_push (new_temp
);
4940 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
4941 && reduc_fn
== IFN_LAST
)
4943 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4945 idx_val = induction_index[0];
4946 val = data_reduc[0];
4947 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4948 if (induction_index[i] > idx_val)
4949 val = data_reduc[i], idx_val = induction_index[i];
4952 tree data_eltype
= TREE_TYPE (TREE_TYPE (new_phi_result
));
4953 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
4954 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
4955 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
4956 /* Enforced by vectorizable_reduction, which ensures we have target
4957 support before allowing a conditional reduction on variable-length
4959 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
4960 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
4961 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
4963 tree old_idx_val
= idx_val
;
4965 idx_val
= make_ssa_name (idx_eltype
);
4966 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
4967 build3 (BIT_FIELD_REF
, idx_eltype
,
4969 bitsize_int (el_size
),
4970 bitsize_int (off
)));
4971 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4972 val
= make_ssa_name (data_eltype
);
4973 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
4974 build3 (BIT_FIELD_REF
,
4977 bitsize_int (el_size
),
4978 bitsize_int (off
)));
4979 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4982 tree new_idx_val
= idx_val
;
4983 if (off
!= v_size
- el_size
)
4985 new_idx_val
= make_ssa_name (idx_eltype
);
4986 epilog_stmt
= gimple_build_assign (new_idx_val
,
4989 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4991 tree new_val
= make_ssa_name (data_eltype
);
4992 epilog_stmt
= gimple_build_assign (new_val
,
4999 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5000 idx_val
= new_idx_val
;
5004 /* Convert the reduced value back to the result type and set as the
5006 gimple_seq stmts
= NULL
;
5007 val
= gimple_convert (&stmts
, scalar_type
, val
);
5008 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5009 scalar_results
.safe_push (val
);
5012 /* 2.3 Create the reduction code, using one of the three schemes described
5013 above. In SLP we simply need to extract all the elements from the
5014 vector (without reducing them), so we use scalar shifts. */
5015 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
5021 v_out2 = reduc_expr <v_out1> */
5023 if (dump_enabled_p ())
5024 dump_printf_loc (MSG_NOTE
, vect_location
,
5025 "Reduce using direct vector reduction.\n");
5027 gimple_seq stmts
= NULL
;
5028 new_phi_result
= gimple_convert (&stmts
, vectype
, new_phi_result
);
5029 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
5030 new_temp
= gimple_build (&stmts
, as_combined_fn (reduc_fn
),
5031 vec_elem_type
, new_phi_result
);
5032 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
5033 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5035 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5038 /* Earlier we set the initial value to be a vector if induc_val
5039 values. Check the result and if it is induc_val then replace
5040 with the original initial value, unless induc_val is
5041 the same as initial_def already. */
5042 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5045 tmp
= make_ssa_name (new_scalar_dest
);
5046 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5047 initial_def
, new_temp
);
5048 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5052 scalar_results
.safe_push (new_temp
);
5054 else if (direct_slp_reduc
)
5056 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5057 with the elements for other SLP statements replaced with the
5058 neutral value. We can then do a normal reduction on each vector. */
5060 /* Enforced by vectorizable_reduction. */
5061 gcc_assert (new_phis
.length () == 1);
5062 gcc_assert (pow2p_hwi (group_size
));
5064 slp_tree orig_phis_slp_node
= slp_node_instance
->reduc_phis
;
5065 vec
<stmt_vec_info
> orig_phis
5066 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node
);
5067 gimple_seq seq
= NULL
;
5069 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5070 and the same element size as VECTYPE. */
5071 tree index
= build_index_vector (vectype
, 0, 1);
5072 tree index_type
= TREE_TYPE (index
);
5073 tree index_elt_type
= TREE_TYPE (index_type
);
5074 tree mask_type
= truth_type_for (index_type
);
5076 /* Create a vector that, for each element, identifies which of
5077 the REDUC_GROUP_SIZE results should use it. */
5078 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
5079 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
5080 build_vector_from_val (index_type
, index_mask
));
5082 /* Get a neutral vector value. This is simply a splat of the neutral
5083 scalar value if we have one, otherwise the initial scalar value
5084 is itself a neutral value. */
5085 tree vector_identity
= NULL_TREE
;
5086 tree neutral_op
= NULL_TREE
;
5089 stmt_vec_info first
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
5091 = neutral_op_for_slp_reduction (slp_node_instance
->reduc_phis
,
5092 vectype
, code
, first
!= NULL
);
5095 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5097 for (unsigned int i
= 0; i
< group_size
; ++i
)
5099 /* If there's no univeral neutral value, we can use the
5100 initial scalar value from the original PHI. This is used
5101 for MIN and MAX reduction, for example. */
5105 = PHI_ARG_DEF_FROM_EDGE (orig_phis
[i
]->stmt
,
5106 loop_preheader_edge (loop
));
5107 scalar_value
= gimple_convert (&seq
, TREE_TYPE (vectype
),
5109 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5113 /* Calculate the equivalent of:
5115 sel[j] = (index[j] == i);
5117 which selects the elements of NEW_PHI_RESULT that should
5118 be included in the result. */
5119 tree compare_val
= build_int_cst (index_elt_type
, i
);
5120 compare_val
= build_vector_from_val (index_type
, compare_val
);
5121 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
5122 index
, compare_val
);
5124 /* Calculate the equivalent of:
5126 vec = seq ? new_phi_result : vector_identity;
5128 VEC is now suitable for a full vector reduction. */
5129 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
5130 sel
, new_phi_result
, vector_identity
);
5132 /* Do the reduction and convert it to the appropriate type. */
5133 tree scalar
= gimple_build (&seq
, as_combined_fn (reduc_fn
),
5134 TREE_TYPE (vectype
), vec
);
5135 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
5136 scalar_results
.safe_push (scalar
);
5138 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
5142 bool reduce_with_shift
;
5145 gcc_assert (slp_reduc
|| new_phis
.length () == 1);
5147 /* See if the target wants to do the final (shift) reduction
5148 in a vector mode of smaller size and first reduce upper/lower
5149 halves against each other. */
5150 enum machine_mode mode1
= mode
;
5151 tree stype
= TREE_TYPE (vectype
);
5152 unsigned nunits
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5153 unsigned nunits1
= nunits
;
5154 if ((mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
5155 && new_phis
.length () == 1)
5157 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
5158 /* For SLP reductions we have to make sure lanes match up, but
5159 since we're doing individual element final reduction reducing
5160 vector width here is even more important.
5161 ??? We can also separate lanes with permutes, for the common
5162 case of power-of-two group-size odd/even extracts would work. */
5163 if (slp_reduc
&& nunits
!= nunits1
)
5165 nunits1
= least_common_multiple (nunits1
, group_size
);
5166 gcc_assert (exact_log2 (nunits1
) != -1 && nunits1
<= nunits
);
5170 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
5171 nunits1
= GET_MODE_NUNITS (mode1
).to_constant ();
5173 tree vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
5175 reduce_with_shift
= have_whole_vector_shift (mode1
);
5176 if (!VECTOR_MODE_P (mode1
))
5177 reduce_with_shift
= false;
5180 optab optab
= optab_for_tree_code (code
, vectype1
, optab_default
);
5181 if (optab_handler (optab
, mode1
) == CODE_FOR_nothing
)
5182 reduce_with_shift
= false;
5185 /* First reduce the vector to the desired vector size we should
5186 do shift reduction on by combining upper and lower halves. */
5187 new_temp
= new_phi_result
;
5188 while (nunits
> nunits1
)
5191 vectype1
= get_related_vectype_for_scalar_type (TYPE_MODE (vectype
),
5193 unsigned int bitsize
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5195 /* The target has to make sure we support lowpart/highpart
5196 extraction, either via direct vector extract or through
5197 an integer mode punning. */
5199 if (convert_optab_handler (vec_extract_optab
,
5200 TYPE_MODE (TREE_TYPE (new_temp
)),
5201 TYPE_MODE (vectype1
))
5202 != CODE_FOR_nothing
)
5204 /* Extract sub-vectors directly once vec_extract becomes
5205 a conversion optab. */
5206 dst1
= make_ssa_name (vectype1
);
5208 = gimple_build_assign (dst1
, BIT_FIELD_REF
,
5209 build3 (BIT_FIELD_REF
, vectype1
,
5210 new_temp
, TYPE_SIZE (vectype1
),
5212 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5213 dst2
= make_ssa_name (vectype1
);
5215 = gimple_build_assign (dst2
, BIT_FIELD_REF
,
5216 build3 (BIT_FIELD_REF
, vectype1
,
5217 new_temp
, TYPE_SIZE (vectype1
),
5218 bitsize_int (bitsize
)));
5219 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5223 /* Extract via punning to appropriately sized integer mode
5225 tree eltype
= build_nonstandard_integer_type (bitsize
, 1);
5226 tree etype
= build_vector_type (eltype
, 2);
5227 gcc_assert (convert_optab_handler (vec_extract_optab
,
5230 != CODE_FOR_nothing
);
5231 tree tem
= make_ssa_name (etype
);
5232 epilog_stmt
= gimple_build_assign (tem
, VIEW_CONVERT_EXPR
,
5233 build1 (VIEW_CONVERT_EXPR
,
5235 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5237 tem
= make_ssa_name (eltype
);
5239 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5240 build3 (BIT_FIELD_REF
, eltype
,
5241 new_temp
, TYPE_SIZE (eltype
),
5243 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5244 dst1
= make_ssa_name (vectype1
);
5245 epilog_stmt
= gimple_build_assign (dst1
, VIEW_CONVERT_EXPR
,
5246 build1 (VIEW_CONVERT_EXPR
,
5248 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5249 tem
= make_ssa_name (eltype
);
5251 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5252 build3 (BIT_FIELD_REF
, eltype
,
5253 new_temp
, TYPE_SIZE (eltype
),
5254 bitsize_int (bitsize
)));
5255 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5256 dst2
= make_ssa_name (vectype1
);
5257 epilog_stmt
= gimple_build_assign (dst2
, VIEW_CONVERT_EXPR
,
5258 build1 (VIEW_CONVERT_EXPR
,
5260 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5263 new_temp
= make_ssa_name (vectype1
);
5264 epilog_stmt
= gimple_build_assign (new_temp
, code
, dst1
, dst2
);
5265 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5266 new_phis
[0] = epilog_stmt
;
5269 if (reduce_with_shift
&& !slp_reduc
)
5271 int element_bitsize
= tree_to_uhwi (bitsize
);
5272 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5273 for variable-length vectors and also requires direct target support
5274 for loop reductions. */
5275 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5276 int nelements
= vec_size_in_bits
/ element_bitsize
;
5277 vec_perm_builder sel
;
5278 vec_perm_indices indices
;
5282 tree zero_vec
= build_zero_cst (vectype1
);
5284 for (offset = nelements/2; offset >= 1; offset/=2)
5286 Create: va' = vec_shift <va, offset>
5287 Create: va = vop <va, va'>
5292 if (dump_enabled_p ())
5293 dump_printf_loc (MSG_NOTE
, vect_location
,
5294 "Reduce using vector shifts\n");
5296 gimple_seq stmts
= NULL
;
5297 new_temp
= gimple_convert (&stmts
, vectype1
, new_temp
);
5298 for (elt_offset
= nelements
/ 2;
5302 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
5303 indices
.new_vector (sel
, 2, nelements
);
5304 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
5305 new_name
= gimple_build (&stmts
, VEC_PERM_EXPR
, vectype1
,
5306 new_temp
, zero_vec
, mask
);
5307 new_temp
= gimple_build (&stmts
, code
,
5308 vectype1
, new_name
, new_temp
);
5310 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5312 /* 2.4 Extract the final scalar result. Create:
5313 s_out3 = extract_field <v_out2, bitpos> */
5315 if (dump_enabled_p ())
5316 dump_printf_loc (MSG_NOTE
, vect_location
,
5317 "extract scalar result\n");
5319 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
5320 bitsize
, bitsize_zero_node
);
5321 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5322 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5323 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5324 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5325 scalar_results
.safe_push (new_temp
);
5330 s = extract_field <v_out2, 0>
5331 for (offset = element_size;
5332 offset < vector_size;
5333 offset += element_size;)
5335 Create: s' = extract_field <v_out2, offset>
5336 Create: s = op <s, s'> // For non SLP cases
5339 if (dump_enabled_p ())
5340 dump_printf_loc (MSG_NOTE
, vect_location
,
5341 "Reduce using scalar code.\n");
5343 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5344 int element_bitsize
= tree_to_uhwi (bitsize
);
5345 tree compute_type
= TREE_TYPE (vectype
);
5346 gimple_seq stmts
= NULL
;
5347 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
5350 if (gimple_code (new_phi
) == GIMPLE_PHI
)
5351 vec_temp
= PHI_RESULT (new_phi
);
5353 vec_temp
= gimple_assign_lhs (new_phi
);
5354 new_temp
= gimple_build (&stmts
, BIT_FIELD_REF
, compute_type
,
5355 vec_temp
, bitsize
, bitsize_zero_node
);
5357 /* In SLP we don't need to apply reduction operation, so we just
5358 collect s' values in SCALAR_RESULTS. */
5360 scalar_results
.safe_push (new_temp
);
5362 for (bit_offset
= element_bitsize
;
5363 bit_offset
< vec_size_in_bits
;
5364 bit_offset
+= element_bitsize
)
5366 tree bitpos
= bitsize_int (bit_offset
);
5367 new_name
= gimple_build (&stmts
, BIT_FIELD_REF
,
5368 compute_type
, vec_temp
,
5372 /* In SLP we don't need to apply reduction operation, so
5373 we just collect s' values in SCALAR_RESULTS. */
5374 new_temp
= new_name
;
5375 scalar_results
.safe_push (new_name
);
5378 new_temp
= gimple_build (&stmts
, code
, compute_type
,
5379 new_name
, new_temp
);
5383 /* The only case where we need to reduce scalar results in SLP, is
5384 unrolling. If the size of SCALAR_RESULTS is greater than
5385 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5386 REDUC_GROUP_SIZE. */
5389 tree res
, first_res
, new_res
;
5391 /* Reduce multiple scalar results in case of SLP unrolling. */
5392 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
5395 first_res
= scalar_results
[j
% group_size
];
5396 new_res
= gimple_build (&stmts
, code
, compute_type
,
5398 scalar_results
[j
% group_size
] = new_res
;
5400 for (k
= 0; k
< group_size
; k
++)
5401 scalar_results
[k
] = gimple_convert (&stmts
, scalar_type
,
5406 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5407 new_temp
= gimple_convert (&stmts
, scalar_type
, new_temp
);
5408 scalar_results
.safe_push (new_temp
);
5411 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5414 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5417 /* Earlier we set the initial value to be a vector if induc_val
5418 values. Check the result and if it is induc_val then replace
5419 with the original initial value, unless induc_val is
5420 the same as initial_def already. */
5421 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5424 tree tmp
= make_ssa_name (new_scalar_dest
);
5425 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5426 initial_def
, new_temp
);
5427 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5428 scalar_results
[0] = tmp
;
5432 /* 2.5 Adjust the final result by the initial value of the reduction
5433 variable. (When such adjustment is not needed, then
5434 'adjustment_def' is zero). For example, if code is PLUS we create:
5435 new_temp = loop_exit_def + adjustment_def */
5439 gcc_assert (!slp_reduc
);
5440 gimple_seq stmts
= NULL
;
5441 if (nested_in_vect_loop
)
5443 new_phi
= new_phis
[0];
5444 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def
)));
5445 adjustment_def
= gimple_convert (&stmts
, vectype
, adjustment_def
);
5446 new_temp
= gimple_build (&stmts
, code
, vectype
,
5447 PHI_RESULT (new_phi
), adjustment_def
);
5451 new_temp
= scalar_results
[0];
5452 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
5453 adjustment_def
= gimple_convert (&stmts
, scalar_type
, adjustment_def
);
5454 new_temp
= gimple_build (&stmts
, code
, scalar_type
,
5455 new_temp
, adjustment_def
);
5458 epilog_stmt
= gimple_seq_last_stmt (stmts
);
5459 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5460 if (nested_in_vect_loop
)
5462 stmt_vec_info epilog_stmt_info
= loop_vinfo
->add_stmt (epilog_stmt
);
5463 STMT_VINFO_RELATED_STMT (epilog_stmt_info
)
5464 = STMT_VINFO_RELATED_STMT (loop_vinfo
->lookup_stmt (new_phi
));
5467 scalar_results
.quick_push (new_temp
);
5469 scalar_results
[0] = new_temp
;
5472 scalar_results
[0] = new_temp
;
5474 new_phis
[0] = epilog_stmt
;
5480 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5481 phis with new adjusted scalar results, i.e., replace use <s_out0>
5486 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5487 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5488 v_out2 = reduce <v_out1>
5489 s_out3 = extract_field <v_out2, 0>
5490 s_out4 = adjust_result <s_out3>
5497 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5498 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5499 v_out2 = reduce <v_out1>
5500 s_out3 = extract_field <v_out2, 0>
5501 s_out4 = adjust_result <s_out3>
5506 /* In SLP reduction chain we reduce vector results into one vector if
5507 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5508 LHS of the last stmt in the reduction chain, since we are looking for
5509 the loop exit phi node. */
5510 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5512 stmt_vec_info dest_stmt_info
5513 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1]);
5514 scalar_dest
= gimple_assign_lhs (dest_stmt_info
->stmt
);
5518 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5519 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5520 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5521 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5522 correspond to the first vector stmt, etc.
5523 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5524 if (group_size
> new_phis
.length ())
5525 gcc_assert (!(group_size
% new_phis
.length ()));
5527 for (k
= 0; k
< group_size
; k
++)
5531 stmt_vec_info scalar_stmt_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
5533 orig_stmt_info
= STMT_VINFO_RELATED_STMT (scalar_stmt_info
);
5534 /* SLP statements can't participate in patterns. */
5535 gcc_assert (!orig_stmt_info
);
5536 scalar_dest
= gimple_assign_lhs (scalar_stmt_info
->stmt
);
5539 if (nested_in_vect_loop
)
5548 /* Find the loop-closed-use at the loop exit of the original scalar
5549 result. (The reduction result is expected to have two immediate uses,
5550 one at the latch block, and one at the loop exit). For double
5551 reductions we are looking for exit phis of the outer loop. */
5552 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
5554 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
5556 if (!is_gimple_debug (USE_STMT (use_p
)))
5557 phis
.safe_push (USE_STMT (use_p
));
5561 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
5563 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
5565 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
5567 if (!flow_bb_inside_loop_p (loop
,
5568 gimple_bb (USE_STMT (phi_use_p
)))
5569 && !is_gimple_debug (USE_STMT (phi_use_p
)))
5570 phis
.safe_push (USE_STMT (phi_use_p
));
5576 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
5578 /* Replace the uses: */
5579 orig_name
= PHI_RESULT (exit_phi
);
5580 scalar_result
= scalar_results
[k
];
5581 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
5583 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
5584 SET_USE (use_p
, scalar_result
);
5585 update_stmt (use_stmt
);
5593 /* Return a vector of type VECTYPE that is equal to the vector select
5594 operation "MASK ? VEC : IDENTITY". Insert the select statements
5598 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
5599 tree vec
, tree identity
)
5601 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
5602 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
5603 mask
, vec
, identity
);
5604 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5608 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5609 order, starting with LHS. Insert the extraction statements before GSI and
5610 associate the new scalar SSA names with variable SCALAR_DEST.
5611 Return the SSA name for the result. */
5614 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
5615 tree_code code
, tree lhs
, tree vector_rhs
)
5617 tree vectype
= TREE_TYPE (vector_rhs
);
5618 tree scalar_type
= TREE_TYPE (vectype
);
5619 tree bitsize
= TYPE_SIZE (scalar_type
);
5620 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
5621 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
5623 for (unsigned HOST_WIDE_INT bit_offset
= 0;
5624 bit_offset
< vec_size_in_bits
;
5625 bit_offset
+= element_bitsize
)
5627 tree bitpos
= bitsize_int (bit_offset
);
5628 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
5631 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
);
5632 rhs
= make_ssa_name (scalar_dest
, stmt
);
5633 gimple_assign_set_lhs (stmt
, rhs
);
5634 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
5636 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
5637 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
5638 gimple_assign_set_lhs (stmt
, new_name
);
5639 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
5645 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5646 type of the vector input. */
5649 get_masked_reduction_fn (internal_fn reduc_fn
, tree vectype_in
)
5651 internal_fn mask_reduc_fn
;
5655 case IFN_FOLD_LEFT_PLUS
:
5656 mask_reduc_fn
= IFN_MASK_FOLD_LEFT_PLUS
;
5663 if (direct_internal_fn_supported_p (mask_reduc_fn
, vectype_in
,
5664 OPTIMIZE_FOR_SPEED
))
5665 return mask_reduc_fn
;
5669 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5670 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5671 statement. CODE is the operation performed by STMT_INFO and OPS are
5672 its scalar operands. REDUC_INDEX is the index of the operand in
5673 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5674 implements in-order reduction, or IFN_LAST if we should open-code it.
5675 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5676 that should be used to control the operation in a fully-masked loop. */
5679 vectorize_fold_left_reduction (loop_vec_info loop_vinfo
,
5680 stmt_vec_info stmt_info
,
5681 gimple_stmt_iterator
*gsi
,
5682 stmt_vec_info
*vec_stmt
, slp_tree slp_node
,
5683 gimple
*reduc_def_stmt
,
5684 tree_code code
, internal_fn reduc_fn
,
5685 tree ops
[3], tree vectype_in
,
5686 int reduc_index
, vec_loop_masks
*masks
)
5688 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5689 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
5690 stmt_vec_info new_stmt_info
= NULL
;
5691 internal_fn mask_reduc_fn
= get_masked_reduction_fn (reduc_fn
, vectype_in
);
5697 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
5699 gcc_assert (!nested_in_vect_loop_p (loop
, stmt_info
));
5700 gcc_assert (ncopies
== 1);
5701 gcc_assert (TREE_CODE_LENGTH (code
) == binary_op
);
5704 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
5705 TYPE_VECTOR_SUBPARTS (vectype_in
)));
5707 tree op0
= ops
[1 - reduc_index
];
5710 stmt_vec_info scalar_dest_def_info
;
5711 auto_vec
<tree
> vec_oprnds0
;
5714 auto_vec
<vec
<tree
> > vec_defs (2);
5715 vect_get_slp_defs (loop_vinfo
, slp_node
, &vec_defs
);
5716 vec_oprnds0
.safe_splice (vec_defs
[1 - reduc_index
]);
5717 vec_defs
[0].release ();
5718 vec_defs
[1].release ();
5719 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
5720 scalar_dest_def_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
5724 tree loop_vec_def0
= vect_get_vec_def_for_operand (loop_vinfo
,
5726 vec_oprnds0
.create (1);
5727 vec_oprnds0
.quick_push (loop_vec_def0
);
5728 scalar_dest_def_info
= stmt_info
;
5731 tree scalar_dest
= gimple_assign_lhs (scalar_dest_def_info
->stmt
);
5732 tree scalar_type
= TREE_TYPE (scalar_dest
);
5733 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
5735 int vec_num
= vec_oprnds0
.length ();
5736 gcc_assert (vec_num
== 1 || slp_node
);
5737 tree vec_elem_type
= TREE_TYPE (vectype_out
);
5738 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
5740 tree vector_identity
= NULL_TREE
;
5741 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
5742 vector_identity
= build_zero_cst (vectype_out
);
5744 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
5747 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
5750 tree mask
= NULL_TREE
;
5751 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
5752 mask
= vect_get_loop_mask (gsi
, masks
, vec_num
, vectype_in
, i
);
5754 /* Handle MINUS by adding the negative. */
5755 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
5757 tree negated
= make_ssa_name (vectype_out
);
5758 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
5759 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5763 if (mask
&& mask_reduc_fn
== IFN_LAST
)
5764 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
5767 /* On the first iteration the input is simply the scalar phi
5768 result, and for subsequent iterations it is the output of
5769 the preceding operation. */
5770 if (reduc_fn
!= IFN_LAST
|| (mask
&& mask_reduc_fn
!= IFN_LAST
))
5772 if (mask
&& mask_reduc_fn
!= IFN_LAST
)
5773 new_stmt
= gimple_build_call_internal (mask_reduc_fn
, 3, reduc_var
,
5776 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
,
5778 /* For chained SLP reductions the output of the previous reduction
5779 operation serves as the input of the next. For the final statement
5780 the output cannot be a temporary - we reuse the original
5781 scalar destination of the last statement. */
5782 if (i
!= vec_num
- 1)
5784 gimple_set_lhs (new_stmt
, scalar_dest_var
);
5785 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
5786 gimple_set_lhs (new_stmt
, reduc_var
);
5791 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
, code
,
5793 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
5794 /* Remove the statement, so that we can use the same code paths
5795 as for statements that we've just created. */
5796 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
5797 gsi_remove (&tmp_gsi
, true);
5800 if (i
== vec_num
- 1)
5802 gimple_set_lhs (new_stmt
, scalar_dest
);
5803 new_stmt_info
= vect_finish_replace_stmt (loop_vinfo
,
5804 scalar_dest_def_info
,
5808 new_stmt_info
= vect_finish_stmt_generation (loop_vinfo
,
5809 scalar_dest_def_info
,
5813 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt_info
);
5817 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt_info
;
5822 /* Function is_nonwrapping_integer_induction.
5824 Check if STMT_VINO (which is part of loop LOOP) both increments and
5825 does not cause overflow. */
5828 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo
, class loop
*loop
)
5830 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
5831 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
5832 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
5833 tree lhs_type
= TREE_TYPE (gimple_phi_result (phi
));
5834 widest_int ni
, max_loop_value
, lhs_max
;
5835 wi::overflow_type overflow
= wi::OVF_NONE
;
5837 /* Make sure the loop is integer based. */
5838 if (TREE_CODE (base
) != INTEGER_CST
5839 || TREE_CODE (step
) != INTEGER_CST
)
5842 /* Check that the max size of the loop will not wrap. */
5844 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
5847 if (! max_stmt_executions (loop
, &ni
))
5850 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
5855 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
5856 TYPE_SIGN (lhs_type
), &overflow
);
5860 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
5861 <= TYPE_PRECISION (lhs_type
));
5864 /* Check if masking can be supported by inserting a conditional expression.
5865 CODE is the code for the operation. COND_FN is the conditional internal
5866 function, if it exists. VECTYPE_IN is the type of the vector input. */
5868 use_mask_by_cond_expr_p (enum tree_code code
, internal_fn cond_fn
,
5871 if (cond_fn
!= IFN_LAST
5872 && direct_internal_fn_supported_p (cond_fn
, vectype_in
,
5873 OPTIMIZE_FOR_SPEED
))
5887 /* Insert a conditional expression to enable masked vectorization. CODE is the
5888 code for the operation. VOP is the array of operands. MASK is the loop
5889 mask. GSI is a statement iterator used to place the new conditional
5892 build_vect_cond_expr (enum tree_code code
, tree vop
[3], tree mask
,
5893 gimple_stmt_iterator
*gsi
)
5899 tree vectype
= TREE_TYPE (vop
[1]);
5900 tree zero
= build_zero_cst (vectype
);
5901 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
5902 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
5903 mask
, vop
[1], zero
);
5904 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
5905 vop
[1] = masked_op1
;
5911 tree vectype
= TREE_TYPE (vop
[1]);
5912 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
5913 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
5914 mask
, vop
[1], vop
[0]);
5915 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
5916 vop
[1] = masked_op1
;
5925 /* Function vectorizable_reduction.
5927 Check if STMT_INFO performs a reduction operation that can be vectorized.
5928 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5929 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5930 Return true if STMT_INFO is vectorizable in this way.
5932 This function also handles reduction idioms (patterns) that have been
5933 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5934 may be of this form:
5935 X = pattern_expr (arg0, arg1, ..., X)
5936 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5937 sequence that had been detected and replaced by the pattern-stmt
5940 This function also handles reduction of condition expressions, for example:
5941 for (int i = 0; i < N; i++)
5944 This is handled by vectorising the loop and creating an additional vector
5945 containing the loop indexes for which "a[i] < value" was true. In the
5946 function epilogue this is reduced to a single max value and then used to
5947 index into the vector of results.
5949 In some cases of reduction patterns, the type of the reduction variable X is
5950 different than the type of the other arguments of STMT_INFO.
5951 In such cases, the vectype that is used when transforming STMT_INFO into
5952 a vector stmt is different than the vectype that is used to determine the
5953 vectorization factor, because it consists of a different number of elements
5954 than the actual number of elements that are being operated upon in parallel.
5956 For example, consider an accumulation of shorts into an int accumulator.
5957 On some targets it's possible to vectorize this pattern operating on 8
5958 shorts at a time (hence, the vectype for purposes of determining the
5959 vectorization factor should be V8HI); on the other hand, the vectype that
5960 is used to create the vector form is actually V4SI (the type of the result).
5962 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5963 indicates what is the actual level of parallelism (V8HI in the example), so
5964 that the right vectorization factor would be derived. This vectype
5965 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5966 be used to create the vectorized stmt. The right vectype for the vectorized
5967 stmt is obtained from the type of the result X:
5968 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5970 This means that, contrary to "regular" reductions (or "regular" stmts in
5971 general), the following equation:
5972 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5973 does *NOT* necessarily hold for reduction patterns. */
5976 vectorizable_reduction (loop_vec_info loop_vinfo
,
5977 stmt_vec_info stmt_info
, slp_tree slp_node
,
5978 slp_instance slp_node_instance
,
5979 stmt_vector_for_cost
*cost_vec
)
5982 tree vectype_in
= NULL_TREE
;
5983 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5984 enum vect_def_type cond_reduc_dt
= vect_unknown_def_type
;
5985 stmt_vec_info cond_stmt_vinfo
= NULL
;
5989 bool single_defuse_cycle
= false;
5990 bool nested_cycle
= false;
5991 bool double_reduc
= false;
5994 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
5995 tree cond_reduc_val
= NULL_TREE
;
5997 /* Make sure it was already recognized as a reduction computation. */
5998 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
5999 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
6000 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
6003 /* The stmt we store reduction analysis meta on. */
6004 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
6005 reduc_info
->is_reduc_info
= true;
6007 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
6009 if (is_a
<gphi
*> (stmt_info
->stmt
))
6010 /* Analysis for double-reduction is done on the outer
6011 loop PHI, nested cycles have no further restrictions. */
6012 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
6014 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6018 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
6019 stmt_vec_info phi_info
= stmt_info
;
6020 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
6021 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
6023 if (!is_a
<gphi
*> (stmt_info
->stmt
))
6025 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6030 slp_node_instance
->reduc_phis
= slp_node
;
6031 /* ??? We're leaving slp_node to point to the PHIs, we only
6032 need it to get at the number of vector stmts which wasn't
6033 yet initialized for the instance root. */
6035 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
6036 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info
));
6037 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6039 use_operand_p use_p
;
6041 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
6044 phi_info
= loop_vinfo
->lookup_stmt (use_stmt
);
6045 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
6049 /* PHIs should not participate in patterns. */
6050 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6051 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
6053 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6054 and compute the reduction chain length. */
6055 tree reduc_def
= PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
6056 loop_latch_edge (loop
));
6057 unsigned reduc_chain_length
= 0;
6058 bool only_slp_reduc_chain
= true;
6060 while (reduc_def
!= PHI_RESULT (reduc_def_phi
))
6062 stmt_vec_info def
= loop_vinfo
->lookup_def (reduc_def
);
6063 stmt_vec_info vdef
= vect_stmt_to_vectorize (def
);
6064 if (STMT_VINFO_REDUC_IDX (vdef
) == -1)
6066 if (dump_enabled_p ())
6067 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6068 "reduction chain broken by patterns.\n");
6071 if (!REDUC_GROUP_FIRST_ELEMENT (vdef
))
6072 only_slp_reduc_chain
= false;
6073 /* ??? For epilogue generation live members of the chain need
6074 to point back to the PHI via their original stmt for
6075 info_for_reduction to work. */
6076 if (STMT_VINFO_LIVE_P (vdef
))
6077 STMT_VINFO_REDUC_DEF (def
) = phi_info
;
6078 gassign
*assign
= dyn_cast
<gassign
*> (vdef
->stmt
);
6081 if (dump_enabled_p ())
6082 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6083 "reduction chain includes calls.\n");
6086 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign
)))
6088 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign
)),
6089 TREE_TYPE (gimple_assign_rhs1 (assign
))))
6091 if (dump_enabled_p ())
6092 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6093 "conversion in the reduction chain.\n");
6097 else if (!stmt_info
)
6098 /* First non-conversion stmt. */
6100 reduc_def
= gimple_op (vdef
->stmt
, 1 + STMT_VINFO_REDUC_IDX (vdef
));
6101 reduc_chain_length
++;
6103 /* PHIs should not participate in patterns. */
6104 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
6106 if (nested_in_vect_loop_p (loop
, stmt_info
))
6109 nested_cycle
= true;
6112 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6114 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6116 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
6117 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
6119 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6120 gcc_assert (slp_node
6121 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
6123 /* 1. Is vectorizable reduction? */
6124 /* Not supportable if the reduction variable is used in the loop, unless
6125 it's a reduction chain. */
6126 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6127 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6130 /* Reductions that are not used even in an enclosing outer-loop,
6131 are expected to be "live" (used out of the loop). */
6132 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6133 && !STMT_VINFO_LIVE_P (stmt_info
))
6136 /* 2. Has this been recognized as a reduction pattern?
6138 Check if STMT represents a pattern that has been recognized
6139 in earlier analysis stages. For stmts that represent a pattern,
6140 the STMT_VINFO_RELATED_STMT field records the last stmt in
6141 the original sequence that constitutes the pattern. */
6143 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
6146 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6147 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
6150 /* 3. Check the operands of the operation. The first operands are defined
6151 inside the loop body. The last operand is the reduction variable,
6152 which is defined by the loop-header-phi. */
6154 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6155 STMT_VINFO_REDUC_VECTYPE (reduc_info
) = vectype_out
;
6156 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
6157 enum tree_code code
= gimple_assign_rhs_code (stmt
);
6158 bool lane_reduc_code_p
6159 = (code
== DOT_PROD_EXPR
|| code
== WIDEN_SUM_EXPR
|| code
== SAD_EXPR
);
6160 int op_type
= TREE_CODE_LENGTH (code
);
6162 scalar_dest
= gimple_assign_lhs (stmt
);
6163 scalar_type
= TREE_TYPE (scalar_dest
);
6164 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
6165 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
6168 /* Do not try to vectorize bit-precision reductions. */
6169 if (!type_has_mode_precision_p (scalar_type
))
6172 /* For lane-reducing ops we're reducing the number of reduction PHIs
6173 which means the only use of that may be in the lane-reducing operation. */
6174 if (lane_reduc_code_p
6175 && reduc_chain_length
!= 1
6176 && !only_slp_reduc_chain
)
6178 if (dump_enabled_p ())
6179 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6180 "lane-reducing reduction with extra stmts.\n");
6184 /* All uses but the last are expected to be defined in the loop.
6185 The last use is the reduction variable. In case of nested cycle this
6186 assumption is not true: we use reduc_index to record the index of the
6187 reduction variable. */
6188 /* ??? To get at invariant/constant uses on the SLP node we have to
6189 get to it here, slp_node is still the reduction PHI. */
6190 slp_tree slp_for_stmt_info
= NULL
;
6193 slp_for_stmt_info
= slp_node_instance
->root
;
6194 /* And then there's reduction chain with a conversion ... */
6195 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info
) != stmt_info
)
6196 slp_for_stmt_info
= SLP_TREE_CHILDREN (slp_for_stmt_info
)[0];
6197 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info
) == stmt_info
);
6199 slp_tree
*slp_op
= XALLOCAVEC (slp_tree
, op_type
);
6200 for (i
= 0; i
< op_type
; i
++)
6202 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6203 if (i
== 0 && code
== COND_EXPR
)
6206 stmt_vec_info def_stmt_info
;
6207 enum vect_def_type dt
;
6209 if (!vect_is_simple_use (loop_vinfo
, stmt_info
, slp_for_stmt_info
,
6210 i
, &op
, &slp_op
[i
], &dt
, &tem
,
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6215 "use not simple.\n");
6218 if (i
== STMT_VINFO_REDUC_IDX (stmt_info
))
6221 /* There should be only one cycle def in the stmt, the one
6222 leading to reduc_def. */
6223 if (VECTORIZABLE_CYCLE_DEF (dt
))
6226 /* To properly compute ncopies we are interested in the widest
6227 non-reduction input type in case we're looking at a widening
6228 accumulation that we later handle in vect_transform_reduction. */
6229 if (lane_reduc_code_p
6232 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6233 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
))))))
6236 if (code
== COND_EXPR
)
6238 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6239 if (dt
== vect_constant_def
)
6242 cond_reduc_val
= op
;
6244 if (dt
== vect_induction_def
6246 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
6249 cond_stmt_vinfo
= def_stmt_info
;
6254 vectype_in
= STMT_VINFO_VECTYPE (phi_info
);
6255 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
6257 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
6258 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
6259 /* If we have a condition reduction, see if we can simplify it further. */
6260 if (v_reduc_type
== COND_REDUCTION
)
6265 /* When the condition uses the reduction value in the condition, fail. */
6266 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 0)
6268 if (dump_enabled_p ())
6269 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6270 "condition depends on previous iteration\n");
6274 if (reduc_chain_length
== 1
6275 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
6276 vectype_in
, OPTIMIZE_FOR_SPEED
))
6278 if (dump_enabled_p ())
6279 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6280 "optimizing condition reduction with"
6281 " FOLD_EXTRACT_LAST.\n");
6282 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
6284 else if (cond_reduc_dt
== vect_induction_def
)
6287 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
6288 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
6290 gcc_assert (TREE_CODE (base
) == INTEGER_CST
6291 && TREE_CODE (step
) == INTEGER_CST
);
6292 cond_reduc_val
= NULL_TREE
;
6293 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
6294 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
6295 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
6297 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6298 above base; punt if base is the minimum value of the type for
6299 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6300 else if (tree_int_cst_sgn (step
) == -1)
6302 cond_reduc_op_code
= MIN_EXPR
;
6303 if (tree_int_cst_sgn (base
) == -1)
6304 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6305 else if (tree_int_cst_lt (base
,
6306 TYPE_MAX_VALUE (TREE_TYPE (base
))))
6308 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
6312 cond_reduc_op_code
= MAX_EXPR
;
6313 if (tree_int_cst_sgn (base
) == 1)
6314 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6315 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
6318 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_NOTE
, vect_location
,
6324 "condition expression based on "
6325 "integer induction.\n");
6326 STMT_VINFO_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
6327 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
6329 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
6332 else if (cond_reduc_dt
== vect_constant_def
)
6334 enum vect_def_type cond_initial_dt
;
6335 tree cond_initial_val
6336 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
, loop_preheader_edge (loop
));
6338 gcc_assert (cond_reduc_val
!= NULL_TREE
);
6339 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
6340 if (cond_initial_dt
== vect_constant_def
6341 && types_compatible_p (TREE_TYPE (cond_initial_val
),
6342 TREE_TYPE (cond_reduc_val
)))
6344 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
6345 cond_initial_val
, cond_reduc_val
);
6346 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
6348 if (dump_enabled_p ())
6349 dump_printf_loc (MSG_NOTE
, vect_location
,
6350 "condition expression based on "
6351 "compile time constant.\n");
6352 /* Record reduction code at analysis stage. */
6353 STMT_VINFO_REDUC_CODE (reduc_info
)
6354 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
6355 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
6361 if (STMT_VINFO_LIVE_P (phi_info
))
6367 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6369 gcc_assert (ncopies
>= 1);
6371 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6375 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
6376 == vect_double_reduction_def
);
6377 double_reduc
= true;
6380 /* 4.2. Check support for the epilog operation.
6382 If STMT represents a reduction pattern, then the type of the
6383 reduction variable may be different than the type of the rest
6384 of the arguments. For example, consider the case of accumulation
6385 of shorts into an int accumulator; The original code:
6386 S1: int_a = (int) short_a;
6387 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6390 STMT: int_acc = widen_sum <short_a, int_acc>
6393 1. The tree-code that is used to create the vector operation in the
6394 epilog code (that reduces the partial results) is not the
6395 tree-code of STMT, but is rather the tree-code of the original
6396 stmt from the pattern that STMT is replacing. I.e, in the example
6397 above we want to use 'widen_sum' in the loop, but 'plus' in the
6399 2. The type (mode) we use to check available target support
6400 for the vector operation to be created in the *epilog*, is
6401 determined by the type of the reduction variable (in the example
6402 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6403 However the type (mode) we use to check available target support
6404 for the vector operation to be created *inside the loop*, is
6405 determined by the type of the other arguments to STMT (in the
6406 example we'd check this: optab_handler (widen_sum_optab,
6409 This is contrary to "regular" reductions, in which the types of all
6410 the arguments are the same as the type of the reduction variable.
6411 For "regular" reductions we can therefore use the same vector type
6412 (and also the same tree-code) when generating the epilog code and
6413 when generating the code inside the loop. */
6415 enum tree_code orig_code
= STMT_VINFO_REDUC_CODE (phi_info
);
6416 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
6418 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
6419 if (reduction_type
== TREE_CODE_REDUCTION
)
6421 /* Check whether it's ok to change the order of the computation.
6422 Generally, when vectorizing a reduction we change the order of the
6423 computation. This may change the behavior of the program in some
6424 cases, so we need to check that this is ok. One exception is when
6425 vectorizing an outer-loop: the inner-loop is executed sequentially,
6426 and therefore vectorizing reductions in the inner-loop during
6427 outer-loop vectorization is safe. */
6428 if (needs_fold_left_reduction_p (scalar_type
, orig_code
))
6430 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6431 is not directy used in stmt. */
6432 if (!only_slp_reduc_chain
6433 && reduc_chain_length
!= 1)
6435 if (dump_enabled_p ())
6436 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6437 "in-order reduction chain without SLP.\n");
6440 STMT_VINFO_REDUC_TYPE (reduc_info
)
6441 = reduction_type
= FOLD_LEFT_REDUCTION
;
6443 else if (!commutative_tree_code (orig_code
)
6444 || !associative_tree_code (orig_code
))
6446 if (dump_enabled_p ())
6447 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6448 "reduction: not commutative/associative");
6453 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
6456 if (dump_enabled_p ())
6457 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6458 "multiple types in double reduction or condition "
6459 "reduction or fold-left reduction.\n");
6463 internal_fn reduc_fn
= IFN_LAST
;
6464 if (reduction_type
== TREE_CODE_REDUCTION
6465 || reduction_type
== FOLD_LEFT_REDUCTION
6466 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
6467 || reduction_type
== CONST_COND_REDUCTION
)
6469 if (reduction_type
== FOLD_LEFT_REDUCTION
6470 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
6471 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
6473 if (reduc_fn
!= IFN_LAST
6474 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
6475 OPTIMIZE_FOR_SPEED
))
6477 if (dump_enabled_p ())
6478 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6479 "reduc op not supported by target.\n");
6481 reduc_fn
= IFN_LAST
;
6486 if (!nested_cycle
|| double_reduc
)
6488 if (dump_enabled_p ())
6489 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6490 "no reduc code for scalar code.\n");
6496 else if (reduction_type
== COND_REDUCTION
)
6498 int scalar_precision
6499 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
6500 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
6501 cr_index_vector_type
= build_vector_type (cr_index_scalar_type
,
6504 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
6505 OPTIMIZE_FOR_SPEED
))
6506 reduc_fn
= IFN_REDUC_MAX
;
6508 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
6510 if (reduction_type
!= EXTRACT_LAST_REDUCTION
6511 && (!nested_cycle
|| double_reduc
)
6512 && reduc_fn
== IFN_LAST
6513 && !nunits_out
.is_constant ())
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6517 "missing target support for reduction on"
6518 " variable-length vectors.\n");
6522 /* For SLP reductions, see if there is a neutral value we can use. */
6523 tree neutral_op
= NULL_TREE
;
6525 neutral_op
= neutral_op_for_slp_reduction
6526 (slp_node_instance
->reduc_phis
, vectype_out
, orig_code
,
6527 REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
);
6529 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
6531 /* We can't support in-order reductions of code such as this:
6533 for (int i = 0; i < n1; ++i)
6534 for (int j = 0; j < n2; ++j)
6537 since GCC effectively transforms the loop when vectorizing:
6539 for (int i = 0; i < n1 / VF; ++i)
6540 for (int j = 0; j < n2; ++j)
6541 for (int k = 0; k < VF; ++k)
6544 which is a reassociation of the original operation. */
6545 if (dump_enabled_p ())
6546 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6547 "in-order double reduction not supported.\n");
6552 if (reduction_type
== FOLD_LEFT_REDUCTION
6554 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6556 /* We cannot use in-order reductions in this case because there is
6557 an implicit reassociation of the operations involved. */
6558 if (dump_enabled_p ())
6559 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6560 "in-order unchained SLP reductions not supported.\n");
6564 /* For double reductions, and for SLP reductions with a neutral value,
6565 we construct a variable-length initial vector by loading a vector
6566 full of the neutral value and then shift-and-inserting the start
6567 values into the low-numbered elements. */
6568 if ((double_reduc
|| neutral_op
)
6569 && !nunits_out
.is_constant ()
6570 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
6571 vectype_out
, OPTIMIZE_FOR_SPEED
))
6573 if (dump_enabled_p ())
6574 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6575 "reduction on variable-length vectors requires"
6576 " target support for a vector-shift-and-insert"
6581 /* Check extra constraints for variable-length unchained SLP reductions. */
6582 if (STMT_SLP_TYPE (stmt_info
)
6583 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
6584 && !nunits_out
.is_constant ())
6586 /* We checked above that we could build the initial vector when
6587 there's a neutral element value. Check here for the case in
6588 which each SLP statement has its own initial value and in which
6589 that value needs to be repeated for every instance of the
6590 statement within the initial vector. */
6591 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
6593 && !can_duplicate_and_interleave_p (loop_vinfo
, group_size
,
6594 TREE_TYPE (vectype_out
)))
6596 if (dump_enabled_p ())
6597 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6598 "unsupported form of SLP reduction for"
6599 " variable-length vectors: cannot build"
6600 " initial vector.\n");
6603 /* The epilogue code relies on the number of elements being a multiple
6604 of the group size. The duplicate-and-interleave approach to setting
6605 up the initial vector does too. */
6606 if (!multiple_p (nunits_out
, group_size
))
6608 if (dump_enabled_p ())
6609 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6610 "unsupported form of SLP reduction for"
6611 " variable-length vectors: the vector size"
6612 " is not a multiple of the number of results.\n");
6617 if (reduction_type
== COND_REDUCTION
)
6621 if (! max_loop_iterations (loop
, &ni
))
6623 if (dump_enabled_p ())
6624 dump_printf_loc (MSG_NOTE
, vect_location
,
6625 "loop count not known, cannot create cond "
6629 /* Convert backedges to iterations. */
6632 /* The additional index will be the same type as the condition. Check
6633 that the loop can fit into this less one (because we'll use up the
6634 zero slot for when there are no matches). */
6635 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
6636 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
6638 if (dump_enabled_p ())
6639 dump_printf_loc (MSG_NOTE
, vect_location
,
6640 "loop size is greater than data size.\n");
6645 /* In case the vectorization factor (VF) is bigger than the number
6646 of elements that we can fit in a vectype (nunits), we have to generate
6647 more than one vector stmt - i.e - we need to "unroll" the
6648 vector stmt by a factor VF/nunits. For more details see documentation
6649 in vectorizable_operation. */
6651 /* If the reduction is used in an outer loop we need to generate
6652 VF intermediate results, like so (e.g. for ncopies=2):
6657 (i.e. we generate VF results in 2 registers).
6658 In this case we have a separate def-use cycle for each copy, and therefore
6659 for each copy we get the vector def for the reduction variable from the
6660 respective phi node created for this copy.
6662 Otherwise (the reduction is unused in the loop nest), we can combine
6663 together intermediate results, like so (e.g. for ncopies=2):
6667 (i.e. we generate VF/2 results in a single register).
6668 In this case for each copy we get the vector def for the reduction variable
6669 from the vectorized reduction operation generated in the previous iteration.
6671 This only works when we see both the reduction PHI and its only consumer
6672 in vectorizable_reduction and there are no intermediate stmts
6675 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
6676 && reduc_chain_length
== 1)
6677 single_defuse_cycle
= true;
6679 if (single_defuse_cycle
|| lane_reduc_code_p
)
6681 gcc_assert (code
!= COND_EXPR
);
6683 /* 4. Supportable by target? */
6686 /* 4.1. check support for the operation in the loop */
6687 optab optab
= optab_for_tree_code (code
, vectype_in
, optab_vector
);
6690 if (dump_enabled_p ())
6691 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6696 machine_mode vec_mode
= TYPE_MODE (vectype_in
);
6697 if (ok
&& optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
6699 if (dump_enabled_p ())
6700 dump_printf (MSG_NOTE
, "op not supported by target.\n");
6701 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
6702 || !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
6705 if (dump_enabled_p ())
6706 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
6709 /* Worthwhile without SIMD support? */
6711 && !VECTOR_MODE_P (TYPE_MODE (vectype_in
))
6712 && !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
6714 if (dump_enabled_p ())
6715 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6716 "not worthwhile without SIMD support.\n");
6720 /* lane-reducing operations have to go through vect_transform_reduction.
6721 For the other cases try without the single cycle optimization. */
6724 if (lane_reduc_code_p
)
6727 single_defuse_cycle
= false;
6730 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
;
6732 /* If the reduction stmt is one of the patterns that have lane
6733 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6734 if ((ncopies
> 1 && ! single_defuse_cycle
)
6735 && lane_reduc_code_p
)
6737 if (dump_enabled_p ())
6738 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6739 "multi def-use cycle not possible for lane-reducing "
6740 "reduction operation\n");
6745 && !(!single_defuse_cycle
6746 && code
!= DOT_PROD_EXPR
6747 && code
!= WIDEN_SUM_EXPR
6749 && reduction_type
!= FOLD_LEFT_REDUCTION
))
6750 for (i
= 0; i
< op_type
; i
++)
6751 if (!vect_maybe_update_slp_op_vectype (slp_op
[i
], vectype_in
))
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6755 "incompatible vector types for invariants\n");
6760 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
6764 vect_model_reduction_cost (loop_vinfo
, stmt_info
, reduc_fn
,
6765 reduction_type
, ncopies
, cost_vec
);
6766 if (dump_enabled_p ()
6767 && reduction_type
== FOLD_LEFT_REDUCTION
)
6768 dump_printf_loc (MSG_NOTE
, vect_location
,
6769 "using an in-order (fold-left) reduction.\n");
6770 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
6771 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6772 reductions go through their own vectorizable_* routines. */
6773 if (!single_defuse_cycle
6774 && code
!= DOT_PROD_EXPR
6775 && code
!= WIDEN_SUM_EXPR
6777 && reduction_type
!= FOLD_LEFT_REDUCTION
)
6780 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info
));
6781 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (tem
))
6783 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem
));
6784 tem
= REDUC_GROUP_FIRST_ELEMENT (tem
);
6786 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem
)) = vect_internal_def
;
6787 STMT_VINFO_DEF_TYPE (tem
) = vect_internal_def
;
6789 else if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
))
6791 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
6792 internal_fn cond_fn
= get_conditional_internal_fn (code
);
6794 if (reduction_type
!= FOLD_LEFT_REDUCTION
6795 && !use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
)
6796 && (cond_fn
== IFN_LAST
6797 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
6798 OPTIMIZE_FOR_SPEED
)))
6800 if (dump_enabled_p ())
6801 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6802 "can't use a fully-masked loop because no"
6803 " conditional operation is available.\n");
6804 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
6806 else if (reduction_type
== FOLD_LEFT_REDUCTION
6807 && reduc_fn
== IFN_LAST
6808 && !expand_vec_cond_expr_p (vectype_in
,
6809 truth_type_for (vectype_in
),
6812 if (dump_enabled_p ())
6813 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6814 "can't use a fully-masked loop because no"
6815 " conditional operation is available.\n");
6816 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
6819 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
6825 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6829 vect_transform_reduction (loop_vec_info loop_vinfo
,
6830 stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
6831 stmt_vec_info
*vec_stmt
, slp_tree slp_node
)
6833 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6834 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6840 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
6841 gcc_assert (reduc_info
->is_reduc_info
);
6843 if (nested_in_vect_loop_p (loop
, stmt_info
))
6846 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
6849 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
6850 enum tree_code code
= gimple_assign_rhs_code (stmt
);
6851 int op_type
= TREE_CODE_LENGTH (code
);
6855 switch (get_gimple_rhs_class (code
))
6857 case GIMPLE_TERNARY_RHS
:
6858 ops
[2] = gimple_assign_rhs3 (stmt
);
6860 case GIMPLE_BINARY_RHS
:
6861 ops
[0] = gimple_assign_rhs1 (stmt
);
6862 ops
[1] = gimple_assign_rhs2 (stmt
);
6868 /* All uses but the last are expected to be defined in the loop.
6869 The last use is the reduction variable. In case of nested cycle this
6870 assumption is not true: we use reduc_index to record the index of the
6871 reduction variable. */
6872 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
6873 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
6874 int reduc_index
= STMT_VINFO_REDUC_IDX (stmt_info
);
6875 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
6880 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
6884 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6888 internal_fn cond_fn
= get_conditional_internal_fn (code
);
6889 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
6890 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
6893 stmt_vec_info new_stmt_info
= NULL
;
6894 stmt_vec_info prev_stmt_info
;
6895 tree new_temp
= NULL_TREE
;
6896 auto_vec
<tree
> vec_oprnds0
;
6897 auto_vec
<tree
> vec_oprnds1
;
6898 auto_vec
<tree
> vec_oprnds2
;
6901 if (dump_enabled_p ())
6902 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
6904 /* FORNOW: Multiple types are not supported for condition. */
6905 if (code
== COND_EXPR
)
6906 gcc_assert (ncopies
== 1);
6908 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
6910 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
6911 if (reduction_type
== FOLD_LEFT_REDUCTION
)
6913 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
6914 return vectorize_fold_left_reduction
6915 (loop_vinfo
, stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
, code
,
6916 reduc_fn
, ops
, vectype_in
, reduc_index
, masks
);
6919 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
6920 gcc_assert (single_defuse_cycle
6921 || code
== DOT_PROD_EXPR
6922 || code
== WIDEN_SUM_EXPR
6923 || code
== SAD_EXPR
);
6925 /* Create the destination vector */
6926 tree scalar_dest
= gimple_assign_lhs (stmt
);
6927 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
6929 prev_stmt_info
= NULL
;
6932 vec_oprnds0
.create (1);
6933 vec_oprnds1
.create (1);
6934 if (op_type
== ternary_op
)
6935 vec_oprnds2
.create (1);
6938 for (j
= 0; j
< ncopies
; j
++)
6945 /* Get vec defs for all the operands except the reduction index,
6946 ensuring the ordering of the ops in the vector is kept. */
6947 auto_vec
<vec
<tree
>, 3> vec_defs
;
6948 vect_get_slp_defs (loop_vinfo
, slp_node
, &vec_defs
);
6949 vec_oprnds0
.safe_splice (vec_defs
[0]);
6950 vec_defs
[0].release ();
6951 vec_oprnds1
.safe_splice (vec_defs
[1]);
6952 vec_defs
[1].release ();
6953 if (op_type
== ternary_op
)
6955 vec_oprnds2
.safe_splice (vec_defs
[2]);
6956 vec_defs
[2].release ();
6961 vec_oprnds0
.quick_push
6962 (vect_get_vec_def_for_operand (loop_vinfo
, ops
[0], stmt_info
));
6963 vec_oprnds1
.quick_push
6964 (vect_get_vec_def_for_operand (loop_vinfo
, ops
[1], stmt_info
));
6965 if (op_type
== ternary_op
)
6966 vec_oprnds2
.quick_push
6967 (vect_get_vec_def_for_operand (loop_vinfo
, ops
[2], stmt_info
));
6974 gcc_assert (reduc_index
!= -1 || ! single_defuse_cycle
);
6976 if (single_defuse_cycle
&& reduc_index
== 0)
6977 vec_oprnds0
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
6980 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
6982 if (single_defuse_cycle
&& reduc_index
== 1)
6983 vec_oprnds1
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
6986 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
6988 if (op_type
== ternary_op
)
6990 if (single_defuse_cycle
&& reduc_index
== 2)
6991 vec_oprnds2
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
6994 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
7000 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7002 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
7003 if (masked_loop_p
&& !mask_by_cond_expr
)
7005 /* Make sure that the reduction accumulator is vop[0]. */
7006 if (reduc_index
== 1)
7008 gcc_assert (commutative_tree_code (code
));
7009 std::swap (vop
[0], vop
[1]);
7011 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7012 vectype_in
, i
* ncopies
+ j
);
7013 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
7016 new_temp
= make_ssa_name (vec_dest
, call
);
7017 gimple_call_set_lhs (call
, new_temp
);
7018 gimple_call_set_nothrow (call
, true);
7020 = vect_finish_stmt_generation (loop_vinfo
,
7021 stmt_info
, call
, gsi
);
7025 if (op_type
== ternary_op
)
7026 vop
[2] = vec_oprnds2
[i
];
7028 if (masked_loop_p
&& mask_by_cond_expr
)
7030 tree mask
= vect_get_loop_mask (gsi
, masks
,
7032 vectype_in
, i
* ncopies
+ j
);
7033 build_vect_cond_expr (code
, vop
, mask
, gsi
);
7036 gassign
*new_stmt
= gimple_build_assign (vec_dest
, code
,
7037 vop
[0], vop
[1], vop
[2]);
7038 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7039 gimple_assign_set_lhs (new_stmt
, new_temp
);
7041 = vect_finish_stmt_generation (loop_vinfo
,
7042 stmt_info
, new_stmt
, gsi
);
7046 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt_info
);
7049 if (slp_node
|| single_defuse_cycle
)
7053 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt_info
;
7055 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt_info
;
7057 prev_stmt_info
= new_stmt_info
;
7060 if (single_defuse_cycle
&& !slp_node
)
7061 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt_info
;
7066 /* Transform phase of a cycle PHI. */
7069 vect_transform_cycle_phi (loop_vec_info loop_vinfo
,
7070 stmt_vec_info stmt_info
, stmt_vec_info
*vec_stmt
,
7071 slp_tree slp_node
, slp_instance slp_node_instance
)
7073 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
7074 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7077 stmt_vec_info prev_phi_info
;
7079 bool nested_cycle
= false;
7082 if (nested_in_vect_loop_p (loop
, stmt_info
))
7085 nested_cycle
= true;
7088 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
7089 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
7090 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7091 gcc_assert (reduc_info
->is_reduc_info
);
7093 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
7094 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
7095 /* Leave the scalar phi in place. */
7098 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
7099 /* For a nested cycle we do not fill the above. */
7101 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
7102 gcc_assert (vectype_in
);
7106 /* The size vect_schedule_slp_instance computes is off for us. */
7107 vec_num
= vect_get_num_vectors
7108 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
7109 * SLP_TREE_SCALAR_STMTS (slp_node
).length (), vectype_in
);
7115 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
7118 /* Check whether we should use a single PHI node and accumulate
7119 vectors to one before the backedge. */
7120 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
7123 /* Create the destination vector */
7124 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
7125 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
7128 /* Get the loop-entry arguments. */
7129 tree vec_initial_def
;
7130 auto_vec
<tree
> vec_initial_defs
;
7133 vec_initial_defs
.reserve (vec_num
);
7134 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
7135 stmt_vec_info first
= REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
);
7137 = neutral_op_for_slp_reduction (slp_node
, vectype_out
,
7138 STMT_VINFO_REDUC_CODE (reduc_info
),
7140 get_initial_defs_for_reduction (loop_vinfo
, slp_node_instance
->reduc_phis
,
7141 &vec_initial_defs
, vec_num
,
7142 first
!= NULL
, neutral_op
);
7146 /* Get at the scalar def before the loop, that defines the initial
7147 value of the reduction variable. */
7148 tree initial_def
= PHI_ARG_DEF_FROM_EDGE (phi
,
7149 loop_preheader_edge (loop
));
7150 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7151 and we can't use zero for induc_val, use initial_def. Similarly
7152 for REDUC_MIN and initial_def larger than the base. */
7153 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
7155 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
7156 if (TREE_CODE (initial_def
) == INTEGER_CST
7157 && !integer_zerop (induc_val
)
7158 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
7159 && tree_int_cst_lt (initial_def
, induc_val
))
7160 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
7161 && tree_int_cst_lt (induc_val
, initial_def
))))
7163 induc_val
= initial_def
;
7164 /* Communicate we used the initial_def to epilouge
7166 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
7168 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
7170 else if (nested_cycle
)
7172 /* Do not use an adjustment def as that case is not supported
7173 correctly if ncopies is not one. */
7174 vec_initial_def
= vect_get_vec_def_for_operand (loop_vinfo
,
7180 tree adjustment_def
= NULL_TREE
;
7181 tree
*adjustment_defp
= &adjustment_def
;
7182 enum tree_code code
= STMT_VINFO_REDUC_CODE (reduc_info
);
7183 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
7184 adjustment_defp
= NULL
;
7186 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
, code
,
7187 initial_def
, adjustment_defp
);
7188 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
) = adjustment_def
;
7190 vec_initial_defs
.create (1);
7191 vec_initial_defs
.quick_push (vec_initial_def
);
7194 /* Generate the reduction PHIs upfront. */
7195 prev_phi_info
= NULL
;
7196 for (i
= 0; i
< vec_num
; i
++)
7198 tree vec_init_def
= vec_initial_defs
[i
];
7199 for (j
= 0; j
< ncopies
; j
++)
7201 /* Create the reduction-phi that defines the reduction
7203 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
7204 stmt_vec_info new_phi_info
= loop_vinfo
->add_stmt (new_phi
);
7206 /* Set the loop-entry arg of the reduction-phi. */
7207 if (j
!= 0 && nested_cycle
)
7208 vec_init_def
= vect_get_vec_def_for_stmt_copy (loop_vinfo
,
7210 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
7213 /* The loop-latch arg is set in epilogue processing. */
7216 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi_info
);
7220 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_phi_info
;
7222 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi_info
;
7223 prev_phi_info
= new_phi_info
;
7231 /* Vectorizes LC PHIs. */
7234 vectorizable_lc_phi (loop_vec_info loop_vinfo
,
7235 stmt_vec_info stmt_info
, stmt_vec_info
*vec_stmt
,
7239 || !is_a
<gphi
*> (stmt_info
->stmt
)
7240 || gimple_phi_num_args (stmt_info
->stmt
) != 1)
7243 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
7244 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
7247 if (!vec_stmt
) /* transformation not required. */
7249 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
7253 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7254 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
7255 basic_block bb
= gimple_bb (stmt_info
->stmt
);
7256 edge e
= single_pred_edge (bb
);
7257 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
7258 vec
<tree
> vec_oprnds
= vNULL
;
7259 vect_get_vec_defs (loop_vinfo
,
7260 gimple_phi_arg_def (stmt_info
->stmt
, 0), NULL_TREE
,
7261 stmt_info
, &vec_oprnds
, NULL
, slp_node
);
7264 unsigned vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7265 gcc_assert (vec_oprnds
.length () == vec_num
);
7266 for (unsigned i
= 0; i
< vec_num
; i
++)
7268 /* Create the vectorized LC PHI node. */
7269 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
7270 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
7271 stmt_vec_info new_phi_info
= loop_vinfo
->add_stmt (new_phi
);
7272 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi_info
);
7277 unsigned ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7278 stmt_vec_info prev_phi_info
= NULL
;
7279 for (unsigned i
= 0; i
< ncopies
; i
++)
7282 vect_get_vec_defs_for_stmt_copy (loop_vinfo
, &vec_oprnds
, NULL
);
7283 /* Create the vectorized LC PHI node. */
7284 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
7285 add_phi_arg (new_phi
, vec_oprnds
[0], e
, UNKNOWN_LOCATION
);
7286 stmt_vec_info new_phi_info
= loop_vinfo
->add_stmt (new_phi
);
7288 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_phi_info
;
7290 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi_info
;
7291 prev_phi_info
= new_phi_info
;
7294 vec_oprnds
.release ();
7300 /* Function vect_min_worthwhile_factor.
7302 For a loop where we could vectorize the operation indicated by CODE,
7303 return the minimum vectorization factor that makes it worthwhile
7304 to use generic vectors. */
7306 vect_min_worthwhile_factor (enum tree_code code
)
7326 /* Return true if VINFO indicates we are doing loop vectorization and if
7327 it is worth decomposing CODE operations into scalar operations for
7328 that loop's vectorization factor. */
7331 vect_worthwhile_without_simd_p (vec_info
*vinfo
, tree_code code
)
7333 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
7334 unsigned HOST_WIDE_INT value
;
7336 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&value
)
7337 && value
>= vect_min_worthwhile_factor (code
));
7340 /* Function vectorizable_induction
7342 Check if STMT_INFO performs an induction computation that can be vectorized.
7343 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7344 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7345 Return true if STMT_INFO is vectorizable in this way. */
7348 vectorizable_induction (loop_vec_info loop_vinfo
,
7349 stmt_vec_info stmt_info
,
7350 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
7351 stmt_vec_info
*vec_stmt
, slp_tree slp_node
,
7352 stmt_vector_for_cost
*cost_vec
)
7354 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7356 bool nested_in_vect_loop
= false;
7357 class loop
*iv_loop
;
7359 edge pe
= loop_preheader_edge (loop
);
7361 tree new_vec
, vec_init
, vec_step
, t
;
7364 gphi
*induction_phi
;
7365 tree induc_def
, vec_dest
;
7366 tree init_expr
, step_expr
;
7367 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
7371 imm_use_iterator imm_iter
;
7372 use_operand_p use_p
;
7376 gimple_stmt_iterator si
;
7378 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
7382 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7385 /* Make sure it was recognized as induction computation. */
7386 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
7389 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7390 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7395 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7396 gcc_assert (ncopies
>= 1);
7398 /* FORNOW. These restrictions should be relaxed. */
7399 if (nested_in_vect_loop_p (loop
, stmt_info
))
7401 imm_use_iterator imm_iter
;
7402 use_operand_p use_p
;
7409 if (dump_enabled_p ())
7410 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7411 "multiple types in nested loop.\n");
7415 /* FORNOW: outer loop induction with SLP not supported. */
7416 if (STMT_SLP_TYPE (stmt_info
))
7420 latch_e
= loop_latch_edge (loop
->inner
);
7421 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7422 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7424 gimple
*use_stmt
= USE_STMT (use_p
);
7425 if (is_gimple_debug (use_stmt
))
7428 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
7430 exit_phi
= use_stmt
;
7436 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
7437 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
7438 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
7440 if (dump_enabled_p ())
7441 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7442 "inner-loop induction only used outside "
7443 "of the outer vectorized loop.\n");
7448 nested_in_vect_loop
= true;
7449 iv_loop
= loop
->inner
;
7453 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
7455 if (slp_node
&& !nunits
.is_constant ())
7457 /* The current SLP code creates the initial value element-by-element. */
7458 if (dump_enabled_p ())
7459 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7460 "SLP induction not supported for variable-length"
7465 if (!vec_stmt
) /* transformation not required. */
7467 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
7468 DUMP_VECT_SCOPE ("vectorizable_induction");
7469 vect_model_induction_cost (stmt_info
, ncopies
, cost_vec
);
7475 /* Compute a vector variable, initialized with the first VF values of
7476 the induction variable. E.g., for an iv with IV_PHI='X' and
7477 evolution S, for a vector of 4 units, we want to compute:
7478 [X, X + S, X + 2*S, X + 3*S]. */
7480 if (dump_enabled_p ())
7481 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
7483 latch_e
= loop_latch_edge (iv_loop
);
7484 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7486 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
7487 gcc_assert (step_expr
!= NULL_TREE
);
7488 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
7490 pe
= loop_preheader_edge (iv_loop
);
7491 init_expr
= PHI_ARG_DEF_FROM_EDGE (phi
,
7492 loop_preheader_edge (iv_loop
));
7495 if (!nested_in_vect_loop
)
7497 /* Convert the initial value to the IV update type. */
7498 tree new_type
= TREE_TYPE (step_expr
);
7499 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
7501 /* If we are using the loop mask to "peel" for alignment then we need
7502 to adjust the start value here. */
7503 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
7504 if (skip_niters
!= NULL_TREE
)
7506 if (FLOAT_TYPE_P (vectype
))
7507 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
7510 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
7511 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
7512 skip_niters
, step_expr
);
7513 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
7514 init_expr
, skip_step
);
7520 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7521 gcc_assert (!new_bb
);
7524 /* Find the first insertion point in the BB. */
7525 basic_block bb
= gimple_bb (phi
);
7526 si
= gsi_after_labels (bb
);
7528 /* For SLP induction we have to generate several IVs as for example
7529 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7530 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7531 [VF*S, VF*S, VF*S, VF*S] for all. */
7534 /* Enforced above. */
7535 unsigned int const_nunits
= nunits
.to_constant ();
7537 /* Generate [VF*S, VF*S, ... ]. */
7538 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7540 expr
= build_int_cst (integer_type_node
, vf
);
7541 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7544 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
7545 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7547 if (! CONSTANT_CLASS_P (new_name
))
7548 new_name
= vect_init_vector (loop_vinfo
, stmt_info
, new_name
,
7549 TREE_TYPE (step_expr
), NULL
);
7550 new_vec
= build_vector_from_val (step_vectype
, new_name
);
7551 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
7552 new_vec
, step_vectype
, NULL
);
7554 /* Now generate the IVs. */
7555 unsigned group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7556 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7557 unsigned elts
= const_nunits
* nvects
;
7558 /* Compute the number of distinct IVs we need. First reduce
7559 group_size if it is a multiple of const_nunits so we get
7560 one IV for a group_size of 4 but const_nunits 2. */
7561 unsigned group_sizep
= group_size
;
7562 if (group_sizep
% const_nunits
== 0)
7563 group_sizep
= group_sizep
/ const_nunits
;
7564 unsigned nivs
= least_common_multiple (group_sizep
,
7565 const_nunits
) / const_nunits
;
7566 gcc_assert (elts
% group_size
== 0);
7567 tree elt
= init_expr
;
7569 for (ivn
= 0; ivn
< nivs
; ++ivn
)
7571 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
7573 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
7575 if (ivn
*const_nunits
+ eltn
>= group_size
7576 && (ivn
* const_nunits
+ eltn
) % group_size
== 0)
7577 elt
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (elt
),
7579 elts
.quick_push (elt
);
7581 vec_init
= gimple_build_vector (&stmts
, &elts
);
7582 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
7585 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7586 gcc_assert (!new_bb
);
7589 /* Create the induction-phi that defines the induction-operand. */
7590 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7591 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7592 stmt_vec_info induction_phi_info
7593 = loop_vinfo
->add_stmt (induction_phi
);
7594 induc_def
= PHI_RESULT (induction_phi
);
7596 /* Create the iv update inside the loop */
7597 gimple_seq stmts
= NULL
;
7598 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
7599 vec_def
= gimple_build (&stmts
,
7600 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
7601 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
7602 loop_vinfo
->add_stmt (SSA_NAME_DEF_STMT (vec_def
));
7603 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
7605 /* Set the arguments of the phi node: */
7606 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
7607 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
7610 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi_info
);
7612 /* Fill up to the number of vectors we need for the whole group. */
7613 nivs
= least_common_multiple (group_size
,
7614 const_nunits
) / const_nunits
;
7615 for (; ivn
< nivs
; ++ivn
)
7616 SLP_TREE_VEC_STMTS (slp_node
)
7617 .quick_push (SLP_TREE_VEC_STMTS (slp_node
)[0]);
7619 /* Re-use IVs when we can. */
7623 = least_common_multiple (group_size
, const_nunits
) / group_size
;
7624 /* Generate [VF'*S, VF'*S, ... ]. */
7625 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7627 expr
= build_int_cst (integer_type_node
, vfp
);
7628 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7631 expr
= build_int_cst (TREE_TYPE (step_expr
), vfp
);
7632 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7634 if (! CONSTANT_CLASS_P (new_name
))
7635 new_name
= vect_init_vector (loop_vinfo
, stmt_info
, new_name
,
7636 TREE_TYPE (step_expr
), NULL
);
7637 new_vec
= build_vector_from_val (step_vectype
, new_name
);
7638 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
, new_vec
,
7639 step_vectype
, NULL
);
7640 for (; ivn
< nvects
; ++ivn
)
7642 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
]->stmt
;
7644 if (gimple_code (iv
) == GIMPLE_PHI
)
7645 def
= gimple_phi_result (iv
);
7647 def
= gimple_assign_lhs (iv
);
7648 gimple_seq stmts
= NULL
;
7649 def
= gimple_convert (&stmts
, step_vectype
, def
);
7650 def
= gimple_build (&stmts
,
7651 PLUS_EXPR
, step_vectype
, def
, vec_step
);
7652 def
= gimple_convert (&stmts
, vectype
, def
);
7653 if (gimple_code (iv
) == GIMPLE_PHI
)
7654 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
7657 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
7658 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
7660 SLP_TREE_VEC_STMTS (slp_node
).quick_push
7661 (loop_vinfo
->add_stmt (SSA_NAME_DEF_STMT (def
)));
7668 /* Create the vector that holds the initial_value of the induction. */
7669 if (nested_in_vect_loop
)
7671 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7672 been created during vectorization of previous stmts. We obtain it
7673 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7674 vec_init
= vect_get_vec_def_for_operand (loop_vinfo
,
7675 init_expr
, stmt_info
);
7676 /* If the initial value is not of proper type, convert it. */
7677 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
7680 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
7684 build1 (VIEW_CONVERT_EXPR
, vectype
,
7686 vec_init
= gimple_assign_lhs (new_stmt
);
7687 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
7689 gcc_assert (!new_bb
);
7690 loop_vinfo
->add_stmt (new_stmt
);
7695 /* iv_loop is the loop to be vectorized. Create:
7696 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7698 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
7700 unsigned HOST_WIDE_INT const_nunits
;
7701 if (nunits
.is_constant (&const_nunits
))
7703 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
7704 elts
.quick_push (new_name
);
7705 for (i
= 1; i
< const_nunits
; i
++)
7707 /* Create: new_name_i = new_name + step_expr */
7708 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
7709 new_name
, step_expr
);
7710 elts
.quick_push (new_name
);
7712 /* Create a vector from [new_name_0, new_name_1, ...,
7713 new_name_nunits-1] */
7714 vec_init
= gimple_build_vector (&stmts
, &elts
);
7716 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
7717 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7718 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
7719 new_name
, step_expr
);
7723 [base, base, base, ...]
7724 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7725 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
7726 gcc_assert (flag_associative_math
);
7727 tree index
= build_index_vector (step_vectype
, 0, 1);
7728 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
7730 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
7732 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
7733 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
7734 vec_init
, step_vec
);
7735 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
7736 vec_init
, base_vec
);
7738 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
7742 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7743 gcc_assert (!new_bb
);
7748 /* Create the vector that holds the step of the induction. */
7749 if (nested_in_vect_loop
)
7750 /* iv_loop is nested in the loop to be vectorized. Generate:
7751 vec_step = [S, S, S, S] */
7752 new_name
= step_expr
;
7755 /* iv_loop is the loop to be vectorized. Generate:
7756 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7757 gimple_seq seq
= NULL
;
7758 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7760 expr
= build_int_cst (integer_type_node
, vf
);
7761 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
7764 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
7765 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
7769 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
7770 gcc_assert (!new_bb
);
7774 t
= unshare_expr (new_name
);
7775 gcc_assert (CONSTANT_CLASS_P (new_name
)
7776 || TREE_CODE (new_name
) == SSA_NAME
);
7777 new_vec
= build_vector_from_val (step_vectype
, t
);
7778 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
7779 new_vec
, step_vectype
, NULL
);
7782 /* Create the following def-use cycle:
7787 vec_iv = PHI <vec_init, vec_loop>
7791 vec_loop = vec_iv + vec_step; */
7793 /* Create the induction-phi that defines the induction-operand. */
7794 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7795 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7796 stmt_vec_info induction_phi_info
= loop_vinfo
->add_stmt (induction_phi
);
7797 induc_def
= PHI_RESULT (induction_phi
);
7799 /* Create the iv update inside the loop */
7801 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
7802 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
7803 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
7804 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
7805 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
7806 stmt_vec_info new_stmt_info
= loop_vinfo
->add_stmt (new_stmt
);
7808 /* Set the arguments of the phi node: */
7809 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
7810 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
7813 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= induction_phi_info
;
7815 /* In case that vectorization factor (VF) is bigger than the number
7816 of elements that we can fit in a vectype (nunits), we have to generate
7817 more than one vector stmt - i.e - we need to "unroll" the
7818 vector stmt by a factor VF/nunits. For more details see documentation
7819 in vectorizable_operation. */
7823 gimple_seq seq
= NULL
;
7824 stmt_vec_info prev_stmt_vinfo
;
7825 /* FORNOW. This restriction should be relaxed. */
7826 gcc_assert (!nested_in_vect_loop
);
7828 /* Create the vector that holds the step of the induction. */
7829 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7831 expr
= build_int_cst (integer_type_node
, nunits
);
7832 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
7835 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
7836 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
7840 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
7841 gcc_assert (!new_bb
);
7844 t
= unshare_expr (new_name
);
7845 gcc_assert (CONSTANT_CLASS_P (new_name
)
7846 || TREE_CODE (new_name
) == SSA_NAME
);
7847 new_vec
= build_vector_from_val (step_vectype
, t
);
7848 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
7849 new_vec
, step_vectype
, NULL
);
7851 vec_def
= induc_def
;
7852 prev_stmt_vinfo
= induction_phi_info
;
7853 for (i
= 1; i
< ncopies
; i
++)
7855 /* vec_i = vec_prev + vec_step */
7856 gimple_seq stmts
= NULL
;
7857 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
7858 vec_def
= gimple_build (&stmts
,
7859 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
7860 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
7862 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
7863 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
7864 new_stmt_info
= loop_vinfo
->add_stmt (new_stmt
);
7865 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt_info
;
7866 prev_stmt_vinfo
= new_stmt_info
;
7870 if (nested_in_vect_loop
)
7872 /* Find the loop-closed exit-phi of the induction, and record
7873 the final vector of induction results: */
7875 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7877 gimple
*use_stmt
= USE_STMT (use_p
);
7878 if (is_gimple_debug (use_stmt
))
7881 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (use_stmt
)))
7883 exit_phi
= use_stmt
;
7889 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
7890 /* FORNOW. Currently not supporting the case that an inner-loop induction
7891 is not used in the outer-loop (i.e. only outside the outer-loop). */
7892 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
7893 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
7895 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt_info
;
7896 if (dump_enabled_p ())
7897 dump_printf_loc (MSG_NOTE
, vect_location
,
7898 "vector of inductions after inner-loop:%G",
7904 if (dump_enabled_p ())
7905 dump_printf_loc (MSG_NOTE
, vect_location
,
7906 "transform induction: created def-use cycle: %G%G",
7907 induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
7912 /* Function vectorizable_live_operation.
7914 STMT_INFO computes a value that is used outside the loop. Check if
7915 it can be supported. */
7918 vectorizable_live_operation (loop_vec_info loop_vinfo
,
7919 stmt_vec_info stmt_info
,
7920 gimple_stmt_iterator
*gsi
,
7921 slp_tree slp_node
, slp_instance slp_node_instance
,
7922 int slp_index
, bool vec_stmt_p
,
7923 stmt_vector_for_cost
*)
7925 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7926 imm_use_iterator imm_iter
;
7927 tree lhs
, lhs_type
, bitsize
, vec_bitsize
;
7928 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7929 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7932 auto_vec
<tree
> vec_oprnds
;
7934 poly_uint64 vec_index
= 0;
7936 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
7938 /* If a stmt of a reduction is live, vectorize it via
7939 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7940 validity so just trigger the transform here. */
7941 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
7947 /* For reduction chains the meta-info is attached to
7948 the group leader. */
7949 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7950 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
7951 /* For SLP reductions we vectorize the epilogue for
7952 all involved stmts together. */
7953 else if (slp_index
!= 0)
7956 /* For SLP reductions the meta-info is attached to
7957 the representative. */
7958 stmt_info
= SLP_TREE_REPRESENTATIVE (slp_node
);
7960 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
7961 gcc_assert (reduc_info
->is_reduc_info
);
7962 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
7963 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
7965 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
7970 /* FORNOW. CHECKME. */
7971 if (nested_in_vect_loop_p (loop
, stmt_info
))
7974 /* If STMT is not relevant and it is a simple assignment and its inputs are
7975 invariant then it can remain in place, unvectorized. The original last
7976 scalar value that it computes will be used. */
7977 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7979 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
7980 if (dump_enabled_p ())
7981 dump_printf_loc (MSG_NOTE
, vect_location
,
7982 "statement is simple and uses invariant. Leaving in "
7990 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7994 gcc_assert (slp_index
>= 0);
7996 int num_scalar
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7997 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7999 /* Get the last occurrence of the scalar index from the concatenation of
8000 all the slp vectors. Calculate which slp vector it is and the index
8002 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
8004 /* Calculate which vector contains the result, and which lane of
8005 that vector we need. */
8006 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
8008 if (dump_enabled_p ())
8009 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8010 "Cannot determine which vector holds the"
8011 " final result.\n");
8018 /* No transformation required. */
8019 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
))
8021 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
8022 OPTIMIZE_FOR_SPEED
))
8024 if (dump_enabled_p ())
8025 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8026 "can't use a fully-masked loop because "
8027 "the target doesn't support extract last "
8029 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
8033 if (dump_enabled_p ())
8034 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8035 "can't use a fully-masked loop because an "
8036 "SLP statement is live after the loop.\n");
8037 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
8039 else if (ncopies
> 1)
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
8043 "can't use a fully-masked loop because"
8044 " ncopies is greater than 1.\n");
8045 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
8049 gcc_assert (ncopies
== 1 && !slp_node
);
8050 vect_record_loop_mask (loop_vinfo
,
8051 &LOOP_VINFO_MASKS (loop_vinfo
),
8058 /* Use the lhs of the original scalar statement. */
8059 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
8061 lhs
= (is_a
<gphi
*> (stmt
)) ? gimple_phi_result (stmt
)
8062 : gimple_get_lhs (stmt
);
8063 lhs_type
= TREE_TYPE (lhs
);
8065 bitsize
= vector_element_bits_tree (vectype
);
8066 vec_bitsize
= TYPE_SIZE (vectype
);
8068 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8069 tree vec_lhs
, bitstart
;
8072 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8074 /* Get the correct slp vectorized stmt. */
8075 gimple
*vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
]->stmt
;
8076 if (gphi
*phi
= dyn_cast
<gphi
*> (vec_stmt
))
8077 vec_lhs
= gimple_phi_result (phi
);
8079 vec_lhs
= gimple_get_lhs (vec_stmt
);
8081 /* Get entry to use. */
8082 bitstart
= bitsize_int (vec_index
);
8083 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
8087 enum vect_def_type dt
= STMT_VINFO_DEF_TYPE (stmt_info
);
8088 vec_lhs
= vect_get_vec_def_for_operand_1 (stmt_info
, dt
);
8089 gcc_checking_assert (ncopies
== 1
8090 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8092 /* For multiple copies, get the last copy. */
8093 for (int i
= 1; i
< ncopies
; ++i
)
8094 vec_lhs
= vect_get_vec_def_for_stmt_copy (loop_vinfo
, vec_lhs
);
8096 /* Get the last lane in the vector. */
8097 bitstart
= int_const_binop (MINUS_EXPR
, vec_bitsize
, bitsize
);
8100 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8101 requirement, insert one phi node for it. It looks like:
8108 # vec_lhs' = PHI <vec_lhs>
8109 new_tree = lane_extract <vec_lhs', ...>;
8112 basic_block exit_bb
= single_exit (loop
)->dest
;
8113 gcc_assert (single_pred_p (exit_bb
));
8115 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
8116 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
8117 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, vec_lhs
);
8119 gimple_seq stmts
= NULL
;
8121 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8125 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8127 where VEC_LHS is the vectorized live-out result and MASK is
8128 the loop mask for the final iteration. */
8129 gcc_assert (ncopies
== 1 && !slp_node
);
8130 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
8131 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
), 1,
8133 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
8136 /* Convert the extracted vector element to the required scalar type. */
8137 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
8141 tree bftype
= TREE_TYPE (vectype
);
8142 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8143 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8144 new_tree
= build3 (BIT_FIELD_REF
, bftype
, vec_lhs_phi
, bitsize
, bitstart
);
8145 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8146 &stmts
, true, NULL_TREE
);
8151 gimple_stmt_iterator exit_gsi
= gsi_after_labels (exit_bb
);
8152 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
8154 /* Remove existing phi from lhs and create one copy from new_tree. */
8155 tree lhs_phi
= NULL_TREE
;
8156 gimple_stmt_iterator gsi
;
8157 for (gsi
= gsi_start_phis (exit_bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
8159 gimple
*phi
= gsi_stmt (gsi
);
8160 if ((gimple_phi_arg_def (phi
, 0) == lhs
))
8162 remove_phi_node (&gsi
, false);
8163 lhs_phi
= gimple_phi_result (phi
);
8164 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
8165 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
8171 /* Replace use of lhs with newly computed result. If the use stmt is a
8172 single arg PHI, just replace all uses of PHI result. It's necessary
8173 because lcssa PHI defining lhs may be before newly inserted stmt. */
8174 use_operand_p use_p
;
8175 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8176 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
8177 && !is_gimple_debug (use_stmt
))
8179 if (gimple_code (use_stmt
) == GIMPLE_PHI
8180 && gimple_phi_num_args (use_stmt
) == 1)
8182 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
8186 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8187 SET_USE (use_p
, new_tree
);
8189 update_stmt (use_stmt
);
8195 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8198 vect_loop_kill_debug_uses (class loop
*loop
, stmt_vec_info stmt_info
)
8200 ssa_op_iter op_iter
;
8201 imm_use_iterator imm_iter
;
8202 def_operand_p def_p
;
8205 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt_info
->stmt
, op_iter
, SSA_OP_DEF
)
8207 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
8211 if (!is_gimple_debug (ustmt
))
8214 bb
= gimple_bb (ustmt
);
8216 if (!flow_bb_inside_loop_p (loop
, bb
))
8218 if (gimple_debug_bind_p (ustmt
))
8220 if (dump_enabled_p ())
8221 dump_printf_loc (MSG_NOTE
, vect_location
,
8222 "killing debug use\n");
8224 gimple_debug_bind_reset_value (ustmt
);
8225 update_stmt (ustmt
);
8234 /* Given loop represented by LOOP_VINFO, return true if computation of
8235 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8239 loop_niters_no_overflow (loop_vec_info loop_vinfo
)
8241 /* Constant case. */
8242 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
8244 tree cst_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
8245 tree cst_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
8247 gcc_assert (TREE_CODE (cst_niters
) == INTEGER_CST
);
8248 gcc_assert (TREE_CODE (cst_nitersm1
) == INTEGER_CST
);
8249 if (wi::to_widest (cst_nitersm1
) < wi::to_widest (cst_niters
))
8254 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8255 /* Check the upper bound of loop niters. */
8256 if (get_max_loop_iterations (loop
, &max
))
8258 tree type
= TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
));
8259 signop sgn
= TYPE_SIGN (type
);
8260 widest_int type_max
= widest_int::from (wi::max_value (type
), sgn
);
8267 /* Return a mask type with half the number of elements as OLD_TYPE,
8268 given that it should have mode NEW_MODE. */
8271 vect_halve_mask_nunits (tree old_type
, machine_mode new_mode
)
8273 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (old_type
), 2);
8274 return build_truth_vector_type_for_mode (nunits
, new_mode
);
8277 /* Return a mask type with twice as many elements as OLD_TYPE,
8278 given that it should have mode NEW_MODE. */
8281 vect_double_mask_nunits (tree old_type
, machine_mode new_mode
)
8283 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (old_type
) * 2;
8284 return build_truth_vector_type_for_mode (nunits
, new_mode
);
8287 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8288 contain a sequence of NVECTORS masks that each control a vector of type
8289 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8290 these vector masks with the vector version of SCALAR_MASK. */
8293 vect_record_loop_mask (loop_vec_info loop_vinfo
, vec_loop_masks
*masks
,
8294 unsigned int nvectors
, tree vectype
, tree scalar_mask
)
8296 gcc_assert (nvectors
!= 0);
8297 if (masks
->length () < nvectors
)
8298 masks
->safe_grow_cleared (nvectors
);
8299 rgroup_masks
*rgm
= &(*masks
)[nvectors
- 1];
8300 /* The number of scalars per iteration and the number of vectors are
8301 both compile-time constants. */
8302 unsigned int nscalars_per_iter
8303 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
8304 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
8308 scalar_cond_masked_key
cond (scalar_mask
, nvectors
);
8309 loop_vinfo
->scalar_cond_masked_set
.add (cond
);
8312 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
8314 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
8315 rgm
->mask_type
= truth_type_for (vectype
);
8319 /* Given a complete set of masks MASKS, extract mask number INDEX
8320 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8321 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8323 See the comment above vec_loop_masks for more details about the mask
8327 vect_get_loop_mask (gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
8328 unsigned int nvectors
, tree vectype
, unsigned int index
)
8330 rgroup_masks
*rgm
= &(*masks
)[nvectors
- 1];
8331 tree mask_type
= rgm
->mask_type
;
8333 /* Populate the rgroup's mask array, if this is the first time we've
8335 if (rgm
->masks
.is_empty ())
8337 rgm
->masks
.safe_grow_cleared (nvectors
);
8338 for (unsigned int i
= 0; i
< nvectors
; ++i
)
8340 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
8341 /* Provide a dummy definition until the real one is available. */
8342 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
8343 rgm
->masks
[i
] = mask
;
8347 tree mask
= rgm
->masks
[index
];
8348 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
8349 TYPE_VECTOR_SUBPARTS (vectype
)))
8351 /* A loop mask for data type X can be reused for data type Y
8352 if X has N times more elements than Y and if Y's elements
8353 are N times bigger than X's. In this case each sequence
8354 of N elements in the loop mask will be all-zero or all-one.
8355 We can then view-convert the mask so that each sequence of
8356 N elements is replaced by a single element. */
8357 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
8358 TYPE_VECTOR_SUBPARTS (vectype
)));
8359 gimple_seq seq
= NULL
;
8360 mask_type
= truth_type_for (vectype
);
8361 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
8363 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
8368 /* Scale profiling counters by estimation for LOOP which is vectorized
8372 scale_profile_for_vect_loop (class loop
*loop
, unsigned vf
)
8374 edge preheader
= loop_preheader_edge (loop
);
8375 /* Reduce loop iterations by the vectorization factor. */
8376 gcov_type new_est_niter
= niter_for_unrolled_loop (loop
, vf
);
8377 profile_count freq_h
= loop
->header
->count
, freq_e
= preheader
->count ();
8379 if (freq_h
.nonzero_p ())
8381 profile_probability p
;
8383 /* Avoid dropping loop body profile counter to 0 because of zero count
8384 in loop's preheader. */
8385 if (!(freq_e
== profile_count::zero ()))
8386 freq_e
= freq_e
.force_nonzero ();
8387 p
= freq_e
.apply_scale (new_est_niter
+ 1, 1).probability_in (freq_h
);
8388 scale_loop_frequencies (loop
, p
);
8391 edge exit_e
= single_exit (loop
);
8392 exit_e
->probability
= profile_probability::always ()
8393 .apply_scale (1, new_est_niter
+ 1);
8395 edge exit_l
= single_pred_edge (loop
->latch
);
8396 profile_probability prob
= exit_l
->probability
;
8397 exit_l
->probability
= exit_e
->probability
.invert ();
8398 if (prob
.initialized_p () && exit_l
->probability
.initialized_p ())
8399 scale_bbs_frequencies (&loop
->latch
, 1, exit_l
->probability
/ prob
);
8402 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8403 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8407 vect_transform_loop_stmt (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
8408 gimple_stmt_iterator
*gsi
, stmt_vec_info
*seen_store
)
8410 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8411 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8413 if (dump_enabled_p ())
8414 dump_printf_loc (MSG_NOTE
, vect_location
,
8415 "------>vectorizing statement: %G", stmt_info
->stmt
);
8417 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
8418 vect_loop_kill_debug_uses (loop
, stmt_info
);
8420 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
8421 && !STMT_VINFO_LIVE_P (stmt_info
))
8424 if (STMT_VINFO_VECTYPE (stmt_info
))
8427 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
8428 if (!STMT_SLP_TYPE (stmt_info
)
8429 && maybe_ne (nunits
, vf
)
8430 && dump_enabled_p ())
8431 /* For SLP VF is set according to unrolling factor, and not
8432 to vector size, hence for SLP this print is not valid. */
8433 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
8436 /* Pure SLP statements have already been vectorized. We still need
8437 to apply loop vectorization to hybrid SLP statements. */
8438 if (PURE_SLP_STMT (stmt_info
))
8441 if (dump_enabled_p ())
8442 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
8444 if (vect_transform_stmt (loop_vinfo
, stmt_info
, gsi
, NULL
, NULL
))
8445 *seen_store
= stmt_info
;
8448 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8449 in the hash_map with its corresponding values. */
8452 find_in_mapping (tree t
, void *context
)
8454 hash_map
<tree
,tree
>* mapping
= (hash_map
<tree
, tree
>*) context
;
8456 tree
*value
= mapping
->get (t
);
8457 return value
? *value
: t
;
8460 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8461 original loop that has now been vectorized.
8463 The inits of the data_references need to be advanced with the number of
8464 iterations of the main loop. This has been computed in vect_do_peeling and
8465 is stored in parameter ADVANCE. We first restore the data_references
8466 initial offset with the values recored in ORIG_DRS_INIT.
8468 Since the loop_vec_info of this EPILOGUE was constructed for the original
8469 loop, its stmt_vec_infos all point to the original statements. These need
8470 to be updated to point to their corresponding copies as well as the SSA_NAMES
8471 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8473 The data_reference's connections also need to be updated. Their
8474 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8475 stmt_vec_infos, their statements need to point to their corresponding copy,
8476 if they are gather loads or scatter stores then their reference needs to be
8477 updated to point to its corresponding copy and finally we set
8478 'base_misaligned' to false as we have already peeled for alignment in the
8479 prologue of the main loop. */
8482 update_epilogue_loop_vinfo (class loop
*epilogue
, tree advance
)
8484 loop_vec_info epilogue_vinfo
= loop_vec_info_for_loop (epilogue
);
8485 auto_vec
<gimple
*> stmt_worklist
;
8486 hash_map
<tree
,tree
> mapping
;
8487 gimple
*orig_stmt
, *new_stmt
;
8488 gimple_stmt_iterator epilogue_gsi
;
8489 gphi_iterator epilogue_phi_gsi
;
8490 stmt_vec_info stmt_vinfo
= NULL
, related_vinfo
;
8491 basic_block
*epilogue_bbs
= get_loop_body (epilogue
);
8494 LOOP_VINFO_BBS (epilogue_vinfo
) = epilogue_bbs
;
8496 /* Advance data_reference's with the number of iterations of the previous
8497 loop and its prologue. */
8498 vect_update_inits_of_drs (epilogue_vinfo
, advance
, PLUS_EXPR
);
8501 /* The EPILOGUE loop is a copy of the original loop so they share the same
8502 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8503 point to the copied statements. We also create a mapping of all LHS' in
8504 the original loop and all the LHS' in the EPILOGUE and create worklists to
8505 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8506 for (unsigned i
= 0; i
< epilogue
->num_nodes
; ++i
)
8508 for (epilogue_phi_gsi
= gsi_start_phis (epilogue_bbs
[i
]);
8509 !gsi_end_p (epilogue_phi_gsi
); gsi_next (&epilogue_phi_gsi
))
8511 new_stmt
= epilogue_phi_gsi
.phi ();
8513 gcc_assert (gimple_uid (new_stmt
) > 0);
8515 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
8517 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
8518 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
8520 mapping
.put (gimple_phi_result (orig_stmt
),
8521 gimple_phi_result (new_stmt
));
8522 /* PHI nodes can not have patterns or related statements. */
8523 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
) == NULL
8524 && STMT_VINFO_RELATED_STMT (stmt_vinfo
) == NULL
);
8527 for (epilogue_gsi
= gsi_start_bb (epilogue_bbs
[i
]);
8528 !gsi_end_p (epilogue_gsi
); gsi_next (&epilogue_gsi
))
8530 new_stmt
= gsi_stmt (epilogue_gsi
);
8532 gcc_assert (gimple_uid (new_stmt
) > 0);
8534 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
8536 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
8537 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
8539 if (tree old_lhs
= gimple_get_lhs (orig_stmt
))
8540 mapping
.put (old_lhs
, gimple_get_lhs (new_stmt
));
8542 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
))
8544 gimple_seq seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
);
8545 for (gimple_stmt_iterator gsi
= gsi_start (seq
);
8546 !gsi_end_p (gsi
); gsi_next (&gsi
))
8547 stmt_worklist
.safe_push (gsi_stmt (gsi
));
8550 related_vinfo
= STMT_VINFO_RELATED_STMT (stmt_vinfo
);
8551 if (related_vinfo
!= NULL
&& related_vinfo
!= stmt_vinfo
)
8553 gimple
*stmt
= STMT_VINFO_STMT (related_vinfo
);
8554 stmt_worklist
.safe_push (stmt
);
8555 /* Set BB such that the assert in
8556 'get_initial_def_for_reduction' is able to determine that
8557 the BB of the related stmt is inside this loop. */
8558 gimple_set_bb (stmt
,
8559 gimple_bb (new_stmt
));
8560 related_vinfo
= STMT_VINFO_RELATED_STMT (related_vinfo
);
8561 gcc_assert (related_vinfo
== NULL
8562 || related_vinfo
== stmt_vinfo
);
8567 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8568 using the original main loop and thus need to be updated to refer to the
8569 cloned variables used in the epilogue. */
8570 for (unsigned i
= 0; i
< stmt_worklist
.length (); ++i
)
8572 gimple
*stmt
= stmt_worklist
[i
];
8575 for (unsigned j
= 1; j
< gimple_num_ops (stmt
); ++j
)
8577 tree op
= gimple_op (stmt
, j
);
8578 if ((new_op
= mapping
.get(op
)))
8579 gimple_set_op (stmt
, j
, *new_op
);
8582 /* PR92429: The last argument of simplify_replace_tree disables
8583 folding when replacing arguments. This is required as
8584 otherwise you might end up with different statements than the
8585 ones analyzed in vect_loop_analyze, leading to different
8587 op
= simplify_replace_tree (op
, NULL_TREE
, NULL_TREE
,
8588 &find_in_mapping
, &mapping
, false);
8589 gimple_set_op (stmt
, j
, op
);
8594 struct data_reference
*dr
;
8595 vec
<data_reference_p
> datarefs
= epilogue_vinfo
->shared
->datarefs
;
8596 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
8598 orig_stmt
= DR_STMT (dr
);
8599 gcc_assert (gimple_uid (orig_stmt
) > 0);
8600 stmt_vinfo
= epilogue_vinfo
->stmt_vec_infos
[gimple_uid (orig_stmt
) - 1];
8601 /* Data references for gather loads and scatter stores do not use the
8602 updated offset we set using ADVANCE. Instead we have to make sure the
8603 reference in the data references point to the corresponding copy of
8604 the original in the epilogue. */
8605 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo
))
8606 == VMAT_GATHER_SCATTER
)
8609 = simplify_replace_tree (DR_REF (dr
), NULL_TREE
, NULL_TREE
,
8610 &find_in_mapping
, &mapping
);
8611 DR_BASE_ADDRESS (dr
)
8612 = simplify_replace_tree (DR_BASE_ADDRESS (dr
), NULL_TREE
, NULL_TREE
,
8613 &find_in_mapping
, &mapping
);
8615 DR_STMT (dr
) = STMT_VINFO_STMT (stmt_vinfo
);
8616 stmt_vinfo
->dr_aux
.stmt
= stmt_vinfo
;
8617 /* The vector size of the epilogue is smaller than that of the main loop
8618 so the alignment is either the same or lower. This means the dr will
8619 thus by definition be aligned. */
8620 STMT_VINFO_DR_INFO (stmt_vinfo
)->base_misaligned
= false;
8623 epilogue_vinfo
->shared
->datarefs_copy
.release ();
8624 epilogue_vinfo
->shared
->save_datarefs ();
8627 /* Function vect_transform_loop.
8629 The analysis phase has determined that the loop is vectorizable.
8630 Vectorize the loop - created vectorized stmts to replace the scalar
8631 stmts in the loop, and update the loop exit condition.
8632 Returns scalar epilogue loop if any. */
8635 vect_transform_loop (loop_vec_info loop_vinfo
, gimple
*loop_vectorized_call
)
8637 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8638 class loop
*epilogue
= NULL
;
8639 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
8640 int nbbs
= loop
->num_nodes
;
8642 tree niters_vector
= NULL_TREE
;
8643 tree step_vector
= NULL_TREE
;
8644 tree niters_vector_mult_vf
= NULL_TREE
;
8645 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8646 unsigned int lowest_vf
= constant_lower_bound (vf
);
8648 bool check_profitability
= false;
8651 DUMP_VECT_SCOPE ("vec_transform_loop");
8653 loop_vinfo
->shared
->check_datarefs ();
8655 /* Use the more conservative vectorization threshold. If the number
8656 of iterations is constant assume the cost check has been performed
8657 by our caller. If the threshold makes all loops profitable that
8658 run at least the (estimated) vectorization factor number of times
8659 checking is pointless, too. */
8660 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
8661 if (vect_apply_runtime_profitability_check_p (loop_vinfo
))
8663 if (dump_enabled_p ())
8664 dump_printf_loc (MSG_NOTE
, vect_location
,
8665 "Profitability threshold is %d loop iterations.\n",
8667 check_profitability
= true;
8670 /* Make sure there exists a single-predecessor exit bb. Do this before
8672 edge e
= single_exit (loop
);
8673 if (! single_pred_p (e
->dest
))
8675 split_loop_exit_edge (e
, true);
8676 if (dump_enabled_p ())
8677 dump_printf (MSG_NOTE
, "split exit edge\n");
8680 /* Version the loop first, if required, so the profitability check
8683 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
8686 = vect_loop_versioning (loop_vinfo
, loop_vectorized_call
);
8687 sloop
->force_vectorize
= false;
8688 check_profitability
= false;
8691 /* Make sure there exists a single-predecessor exit bb also on the
8692 scalar loop copy. Do this after versioning but before peeling
8693 so CFG structure is fine for both scalar and if-converted loop
8694 to make slpeel_duplicate_current_defs_from_edges face matched
8695 loop closed PHI nodes on the exit. */
8696 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
8698 e
= single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
));
8699 if (! single_pred_p (e
->dest
))
8701 split_loop_exit_edge (e
, true);
8702 if (dump_enabled_p ())
8703 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
8707 tree niters
= vect_build_loop_niters (loop_vinfo
);
8708 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
8709 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
8710 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
8712 drs_init_vec orig_drs_init
;
8714 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
8715 &step_vector
, &niters_vector_mult_vf
, th
,
8716 check_profitability
, niters_no_overflow
,
8719 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
)
8720 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
).initialized_p ())
8721 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
),
8722 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
));
8724 if (niters_vector
== NULL_TREE
)
8726 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
8727 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
8728 && known_eq (lowest_vf
, vf
))
8731 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
8732 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
8733 step_vector
= build_one_cst (TREE_TYPE (niters
));
8736 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
8737 &step_vector
, niters_no_overflow
);
8740 /* 1) Make sure the loop header has exactly two entries
8741 2) Make sure we have a preheader basic block. */
8743 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
8745 split_edge (loop_preheader_edge (loop
));
8747 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
8748 && vect_use_loop_mask_for_alignment_p (loop_vinfo
))
8749 /* This will deal with any possible peeling. */
8750 vect_prepare_for_masked_peels (loop_vinfo
);
8752 /* Schedule the SLP instances first, then handle loop vectorization
8754 if (!loop_vinfo
->slp_instances
.is_empty ())
8756 DUMP_VECT_SCOPE ("scheduling SLP instances");
8757 vect_schedule_slp (loop_vinfo
);
8760 /* FORNOW: the vectorizer supports only loops which body consist
8761 of one basic block (header + empty latch). When the vectorizer will
8762 support more involved loop forms, the order by which the BBs are
8763 traversed need to be reconsidered. */
8765 for (i
= 0; i
< nbbs
; i
++)
8767 basic_block bb
= bbs
[i
];
8768 stmt_vec_info stmt_info
;
8770 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
8773 gphi
*phi
= si
.phi ();
8774 if (dump_enabled_p ())
8775 dump_printf_loc (MSG_NOTE
, vect_location
,
8776 "------>vectorizing phi: %G", phi
);
8777 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
8781 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
8782 vect_loop_kill_debug_uses (loop
, stmt_info
);
8784 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
8785 && !STMT_VINFO_LIVE_P (stmt_info
))
8788 if (STMT_VINFO_VECTYPE (stmt_info
)
8790 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
8791 && dump_enabled_p ())
8792 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
8794 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
8795 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
8796 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
8797 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
8798 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
8799 && ! PURE_SLP_STMT (stmt_info
))
8801 if (dump_enabled_p ())
8802 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
8803 vect_transform_stmt (loop_vinfo
, stmt_info
, NULL
, NULL
, NULL
);
8807 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
8810 stmt
= gsi_stmt (si
);
8811 /* During vectorization remove existing clobber stmts. */
8812 if (gimple_clobber_p (stmt
))
8814 unlink_stmt_vdef (stmt
);
8815 gsi_remove (&si
, true);
8816 release_defs (stmt
);
8820 stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
8822 /* vector stmts created in the outer-loop during vectorization of
8823 stmts in an inner-loop may not have a stmt_info, and do not
8824 need to be vectorized. */
8825 stmt_vec_info seen_store
= NULL
;
8828 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
8830 gimple
*def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
8831 for (gimple_stmt_iterator subsi
= gsi_start (def_seq
);
8832 !gsi_end_p (subsi
); gsi_next (&subsi
))
8834 stmt_vec_info pat_stmt_info
8835 = loop_vinfo
->lookup_stmt (gsi_stmt (subsi
));
8836 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
8839 stmt_vec_info pat_stmt_info
8840 = STMT_VINFO_RELATED_STMT (stmt_info
);
8841 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
, &si
,
8844 vect_transform_loop_stmt (loop_vinfo
, stmt_info
, &si
,
8850 if (STMT_VINFO_GROUPED_ACCESS (seen_store
))
8851 /* Interleaving. If IS_STORE is TRUE, the
8852 vectorization of the interleaving chain was
8853 completed - free all the stores in the chain. */
8854 vect_remove_stores (loop_vinfo
,
8855 DR_GROUP_FIRST_ELEMENT (seen_store
));
8857 /* Free the attached stmt_vec_info and remove the stmt. */
8858 loop_vinfo
->remove_stmt (stmt_info
);
8863 /* Stub out scalar statements that must not survive vectorization.
8864 Doing this here helps with grouped statements, or statements that
8865 are involved in patterns. */
8866 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
8867 !gsi_end_p (gsi
); gsi_next (&gsi
))
8869 gcall
*call
= dyn_cast
<gcall
*> (gsi_stmt (gsi
));
8870 if (call
&& gimple_call_internal_p (call
, IFN_MASK_LOAD
))
8872 tree lhs
= gimple_get_lhs (call
);
8873 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
8875 tree zero
= build_zero_cst (TREE_TYPE (lhs
));
8876 gimple
*new_stmt
= gimple_build_assign (lhs
, zero
);
8877 gsi_replace (&gsi
, new_stmt
, true);
8883 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8884 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8885 if (integer_onep (step_vector
))
8886 niters_no_overflow
= true;
8887 vect_set_loop_condition (loop
, loop_vinfo
, niters_vector
, step_vector
,
8888 niters_vector_mult_vf
, !niters_no_overflow
);
8890 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
8891 scale_profile_for_vect_loop (loop
, assumed_vf
);
8893 /* True if the final iteration might not handle a full vector's
8894 worth of scalar iterations. */
8895 bool final_iter_may_be_partial
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
8896 /* The minimum number of iterations performed by the epilogue. This
8897 is 1 when peeling for gaps because we always need a final scalar
8899 int min_epilogue_iters
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
8900 /* +1 to convert latch counts to loop iteration counts,
8901 -min_epilogue_iters to remove iterations that cannot be performed
8902 by the vector code. */
8903 int bias_for_lowest
= 1 - min_epilogue_iters
;
8904 int bias_for_assumed
= bias_for_lowest
;
8905 int alignment_npeels
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
8906 if (alignment_npeels
&& LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8908 /* When the amount of peeling is known at compile time, the first
8909 iteration will have exactly alignment_npeels active elements.
8910 In the worst case it will have at least one. */
8911 int min_first_active
= (alignment_npeels
> 0 ? alignment_npeels
: 1);
8912 bias_for_lowest
+= lowest_vf
- min_first_active
;
8913 bias_for_assumed
+= assumed_vf
- min_first_active
;
8915 /* In these calculations the "- 1" converts loop iteration counts
8916 back to latch counts. */
8917 if (loop
->any_upper_bound
)
8918 loop
->nb_iterations_upper_bound
8919 = (final_iter_may_be_partial
8920 ? wi::udiv_ceil (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
8922 : wi::udiv_floor (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
8924 if (loop
->any_likely_upper_bound
)
8925 loop
->nb_iterations_likely_upper_bound
8926 = (final_iter_may_be_partial
8927 ? wi::udiv_ceil (loop
->nb_iterations_likely_upper_bound
8928 + bias_for_lowest
, lowest_vf
) - 1
8929 : wi::udiv_floor (loop
->nb_iterations_likely_upper_bound
8930 + bias_for_lowest
, lowest_vf
) - 1);
8931 if (loop
->any_estimate
)
8932 loop
->nb_iterations_estimate
8933 = (final_iter_may_be_partial
8934 ? wi::udiv_ceil (loop
->nb_iterations_estimate
+ bias_for_assumed
,
8936 : wi::udiv_floor (loop
->nb_iterations_estimate
+ bias_for_assumed
,
8939 if (dump_enabled_p ())
8941 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
8943 dump_printf_loc (MSG_NOTE
, vect_location
,
8944 "LOOP VECTORIZED\n");
8946 dump_printf_loc (MSG_NOTE
, vect_location
,
8947 "OUTER LOOP VECTORIZED\n");
8948 dump_printf (MSG_NOTE
, "\n");
8951 dump_printf_loc (MSG_NOTE
, vect_location
,
8952 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8953 GET_MODE_NAME (loop_vinfo
->vector_mode
));
8956 /* Loops vectorized with a variable factor won't benefit from
8957 unrolling/peeling. */
8958 if (!vf
.is_constant ())
8961 if (dump_enabled_p ())
8962 dump_printf_loc (MSG_NOTE
, vect_location
, "Disabling unrolling due to"
8963 " variable-length vectorization factor\n");
8965 /* Free SLP instances here because otherwise stmt reference counting
8967 slp_instance instance
;
8968 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
8969 vect_free_slp_instance (instance
, true);
8970 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
8971 /* Clear-up safelen field since its value is invalid after vectorization
8972 since vectorized loop can have loop-carried dependencies. */
8977 update_epilogue_loop_vinfo (epilogue
, advance
);
8979 epilogue
->simduid
= loop
->simduid
;
8980 epilogue
->force_vectorize
= loop
->force_vectorize
;
8981 epilogue
->dont_vectorize
= false;
8987 /* The code below is trying to perform simple optimization - revert
8988 if-conversion for masked stores, i.e. if the mask of a store is zero
8989 do not perform it and all stored value producers also if possible.
8997 this transformation will produce the following semi-hammock:
8999 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9001 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9002 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9003 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9004 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9005 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9006 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9011 optimize_mask_stores (class loop
*loop
)
9013 basic_block
*bbs
= get_loop_body (loop
);
9014 unsigned nbbs
= loop
->num_nodes
;
9017 class loop
*bb_loop
;
9018 gimple_stmt_iterator gsi
;
9020 auto_vec
<gimple
*> worklist
;
9021 auto_purge_vect_location sentinel
;
9023 vect_location
= find_loop_location (loop
);
9024 /* Pick up all masked stores in loop if any. */
9025 for (i
= 0; i
< nbbs
; i
++)
9028 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
9031 stmt
= gsi_stmt (gsi
);
9032 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
9033 worklist
.safe_push (stmt
);
9038 if (worklist
.is_empty ())
9041 /* Loop has masked stores. */
9042 while (!worklist
.is_empty ())
9044 gimple
*last
, *last_store
;
9047 basic_block store_bb
, join_bb
;
9048 gimple_stmt_iterator gsi_to
;
9049 tree vdef
, new_vdef
;
9054 last
= worklist
.pop ();
9055 mask
= gimple_call_arg (last
, 2);
9056 bb
= gimple_bb (last
);
9057 /* Create then_bb and if-then structure in CFG, then_bb belongs to
9058 the same loop as if_bb. It could be different to LOOP when two
9059 level loop-nest is vectorized and mask_store belongs to the inner
9061 e
= split_block (bb
, last
);
9062 bb_loop
= bb
->loop_father
;
9063 gcc_assert (loop
== bb_loop
|| flow_loop_nested_p (loop
, bb_loop
));
9065 store_bb
= create_empty_bb (bb
);
9066 add_bb_to_loop (store_bb
, bb_loop
);
9067 e
->flags
= EDGE_TRUE_VALUE
;
9068 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
9069 /* Put STORE_BB to likely part. */
9070 efalse
->probability
= profile_probability::unlikely ();
9071 store_bb
->count
= efalse
->count ();
9072 make_single_succ_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
9073 if (dom_info_available_p (CDI_DOMINATORS
))
9074 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
9075 if (dump_enabled_p ())
9076 dump_printf_loc (MSG_NOTE
, vect_location
,
9077 "Create new block %d to sink mask stores.",
9079 /* Create vector comparison with boolean result. */
9080 vectype
= TREE_TYPE (mask
);
9081 zero
= build_zero_cst (vectype
);
9082 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
9083 gsi
= gsi_last_bb (bb
);
9084 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
9085 /* Create new PHI node for vdef of the last masked store:
9086 .MEM_2 = VDEF <.MEM_1>
9087 will be converted to
9088 .MEM.3 = VDEF <.MEM_1>
9089 and new PHI node will be created in join bb
9090 .MEM_2 = PHI <.MEM_1, .MEM_3>
9092 vdef
= gimple_vdef (last
);
9093 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
9094 gimple_set_vdef (last
, new_vdef
);
9095 phi
= create_phi_node (vdef
, join_bb
);
9096 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
9098 /* Put all masked stores with the same mask to STORE_BB if possible. */
9101 gimple_stmt_iterator gsi_from
;
9102 gimple
*stmt1
= NULL
;
9104 /* Move masked store to STORE_BB. */
9106 gsi
= gsi_for_stmt (last
);
9108 /* Shift GSI to the previous stmt for further traversal. */
9110 gsi_to
= gsi_start_bb (store_bb
);
9111 gsi_move_before (&gsi_from
, &gsi_to
);
9112 /* Setup GSI_TO to the non-empty block start. */
9113 gsi_to
= gsi_start_bb (store_bb
);
9114 if (dump_enabled_p ())
9115 dump_printf_loc (MSG_NOTE
, vect_location
,
9116 "Move stmt to created bb\n%G", last
);
9117 /* Move all stored value producers if possible. */
9118 while (!gsi_end_p (gsi
))
9121 imm_use_iterator imm_iter
;
9122 use_operand_p use_p
;
9125 /* Skip debug statements. */
9126 if (is_gimple_debug (gsi_stmt (gsi
)))
9131 stmt1
= gsi_stmt (gsi
);
9132 /* Do not consider statements writing to memory or having
9133 volatile operand. */
9134 if (gimple_vdef (stmt1
)
9135 || gimple_has_volatile_ops (stmt1
))
9139 lhs
= gimple_get_lhs (stmt1
);
9143 /* LHS of vectorized stmt must be SSA_NAME. */
9144 if (TREE_CODE (lhs
) != SSA_NAME
)
9147 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
9149 /* Remove dead scalar statement. */
9150 if (has_zero_uses (lhs
))
9152 gsi_remove (&gsi_from
, true);
9157 /* Check that LHS does not have uses outside of STORE_BB. */
9159 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
9162 use_stmt
= USE_STMT (use_p
);
9163 if (is_gimple_debug (use_stmt
))
9165 if (gimple_bb (use_stmt
) != store_bb
)
9174 if (gimple_vuse (stmt1
)
9175 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
9178 /* Can move STMT1 to STORE_BB. */
9179 if (dump_enabled_p ())
9180 dump_printf_loc (MSG_NOTE
, vect_location
,
9181 "Move stmt to created bb\n%G", stmt1
);
9182 gsi_move_before (&gsi_from
, &gsi_to
);
9183 /* Shift GSI_TO for further insertion. */
9186 /* Put other masked stores with the same mask to STORE_BB. */
9187 if (worklist
.is_empty ()
9188 || gimple_call_arg (worklist
.last (), 2) != mask
9189 || worklist
.last () != stmt1
)
9191 last
= worklist
.pop ();
9193 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);
9197 /* Decide whether it is possible to use a zero-based induction variable
9198 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9199 return the value that the induction variable must be able to hold
9200 in order to ensure that the loop ends with an all-false mask.
9201 Return -1 otherwise. */
9203 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo
)
9205 tree niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
9206 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9207 unsigned HOST_WIDE_INT max_vf
= vect_max_vf (loop_vinfo
);
9209 /* Calculate the value that the induction variable must be able
9210 to hit in order to ensure that we end the loop with an all-false mask.
9211 This involves adding the maximum number of inactive trailing scalar
9213 widest_int iv_limit
= -1;
9214 if (max_loop_iterations (loop
, &iv_limit
))
9218 /* Add the maximum number of skipped iterations to the
9219 maximum iteration count. */
9220 if (TREE_CODE (niters_skip
) == INTEGER_CST
)
9221 iv_limit
+= wi::to_widest (niters_skip
);
9223 iv_limit
+= max_vf
- 1;
9225 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
9226 /* Make a conservatively-correct assumption. */
9227 iv_limit
+= max_vf
- 1;
9229 /* IV_LIMIT is the maximum number of latch iterations, which is also
9230 the maximum in-range IV value. Round this value down to the previous
9231 vector alignment boundary and then add an extra full iteration. */
9232 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9233 iv_limit
= (iv_limit
& -(int) known_alignment (vf
)) + max_vf
;