   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
31 #include "tree-pass.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 for (i=0; i<N/8; i++){
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS which base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations which vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
157 static stmt_vec_info
vect_is_simple_reduction (loop_vec_info
, stmt_vec_info
,
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
165 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info
,
166 bool vectype_maybe_set_p
,
168 vec
<stmt_vec_info
> *mask_producers
)
170 gimple
*stmt
= stmt_info
->stmt
;
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
173 && !STMT_VINFO_LIVE_P (stmt_info
))
174 || gimple_clobber_p (stmt
))
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
178 return opt_result::success ();
181 tree stmt_vectype
, nunits_vectype
;
182 opt_result res
= vect_get_vector_types_for_stmt (stmt_info
, &stmt_vectype
,
189 if (STMT_VINFO_VECTYPE (stmt_info
))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
194 || vectype_maybe_set_p
)
195 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
196 else if (stmt_vectype
== boolean_type_node
)
197 mask_producers
->safe_push (stmt_info
);
199 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
203 vect_update_max_nunits (vf
, nunits_vectype
);
205 return opt_result::success ();
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. If some of the statements
211 produce a mask result whose vector type can only be calculated later,
212 add them to MASK_PRODUCERS. Return true on success or false if
213 something prevented vectorization. */
216 vect_determine_vf_for_stmt (stmt_vec_info stmt_info
, poly_uint64
*vf
,
217 vec
<stmt_vec_info
> *mask_producers
)
219 vec_info
*vinfo
= stmt_info
->vinfo
;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
224 = vect_determine_vf_for_stmt_1 (stmt_info
, false, vf
, mask_producers
);
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
229 && STMT_VINFO_RELATED_STMT (stmt_info
))
231 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
232 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
236 !gsi_end_p (si
); gsi_next (&si
))
238 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE
, vect_location
,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info
->stmt
);
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info
, true,
245 res
= vect_determine_vf_for_stmt_1 (def_stmt_info
, true,
251 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE
, vect_location
,
253 "==> examining pattern statement: %G",
255 res
= vect_determine_vf_for_stmt_1 (stmt_info
, true, vf
, mask_producers
);
260 return opt_result::success ();
263 /* Function vect_determine_vectorization_factor
265 Determine the vectorization factor (VF). VF is the number of data elements
266 that are operated upon in parallel in a single iteration of the vectorized
267 loop. For example, when vectorizing a loop that operates on 4byte elements,
268 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
269 elements can fit in a single vector register.
271 We currently support vectorization of loops in which all types operated upon
272 are of the same size. Therefore this function currently sets VF according to
273 the size of the types operated upon, and fails if there are multiple sizes
276 VF is also the factor by which the loop iterations are strip-mined, e.g.:
283 for (i=0; i<N; i+=VF){
284 a[i:VF] = b[i:VF] + c[i:VF];
289 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
291 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
292 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
293 unsigned nbbs
= loop
->num_nodes
;
294 poly_uint64 vectorization_factor
= 1;
295 tree scalar_type
= NULL_TREE
;
298 stmt_vec_info stmt_info
;
300 auto_vec
<stmt_vec_info
> mask_producers
;
302 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
304 for (i
= 0; i
< nbbs
; i
++)
306 basic_block bb
= bbs
[i
];
308 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
312 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
317 gcc_assert (stmt_info
);
319 if (STMT_VINFO_RELEVANT_P (stmt_info
)
320 || STMT_VINFO_LIVE_P (stmt_info
))
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
323 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
325 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE
, vect_location
,
327 "get vectype for scalar type: %T\n",
330 vectype
= get_vectype_for_scalar_type (scalar_type
);
332 return opt_result::failure_at (phi
,
333 "not vectorized: unsupported "
336 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
338 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
342 if (dump_enabled_p ())
344 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
345 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
346 dump_printf (MSG_NOTE
, "\n");
349 vect_update_max_nunits (&vectorization_factor
, vectype
);
353 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
356 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
358 = vect_determine_vf_for_stmt (stmt_info
, &vectorization_factor
,
365 /* TODO: Analyze cost. Decide if worth while to vectorize. */
366 if (dump_enabled_p ())
368 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
369 dump_dec (MSG_NOTE
, vectorization_factor
);
370 dump_printf (MSG_NOTE
, "\n");
373 if (known_le (vectorization_factor
, 1U))
374 return opt_result::failure_at (vect_location
,
375 "not vectorized: unsupported data-type\n");
376 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
378 for (i
= 0; i
< mask_producers
.length (); i
++)
380 stmt_info
= mask_producers
[i
];
381 opt_tree mask_type
= vect_get_mask_type_for_stmt (stmt_info
);
383 return opt_result::propagate_failure (mask_type
);
384 STMT_VINFO_VECTYPE (stmt_info
) = mask_type
;
387 return opt_result::success ();
391 /* Function vect_is_simple_iv_evolution.
393 FORNOW: A simple evolution of an induction variables in the loop is
394 considered a polynomial evolution. */
397 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
402 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
405 /* When there is no evolution in this loop, the evolution function
407 if (evolution_part
== NULL_TREE
)
410 /* When the evolution is a polynomial of degree >= 2
411 the evolution function is not "simple". */
412 if (tree_is_chrec (evolution_part
))
415 step_expr
= evolution_part
;
416 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
420 step_expr
, init_expr
);
425 if (TREE_CODE (step_expr
) != INTEGER_CST
426 && (TREE_CODE (step_expr
) != SSA_NAME
427 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
428 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
429 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
430 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
431 || !flag_associative_math
)))
432 && (TREE_CODE (step_expr
) != REAL_CST
433 || !flag_associative_math
))
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
445 what we are assuming is a double reduction. For example, given
446 a structure like this:
449 x_1 = PHI <x_4(outer2), ...>;
453 x_2 = PHI <x_1(outer1), ...>;
459 x_4 = PHI <x_3(inner)>;
462 outer loop analysis would treat x_1 as a double reduction phi and
463 this function would then return true for x_2. */
466 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info
, gphi
*phi
)
468 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
471 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
472 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
473 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
478 /* Function vect_analyze_scalar_cycles_1.
480 Examine the cross iteration def-use cycles of scalar variables
481 in LOOP. LOOP_VINFO represents the loop that is now being
482 considered for vectorization (can be LOOP, or an outer-loop
486 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, class loop
*loop
)
488 basic_block bb
= loop
->header
;
490 auto_vec
<stmt_vec_info
, 64> worklist
;
494 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
496 /* First - identify all inductions. Reduction detection assumes that all the
497 inductions have been identified, therefore, this order must not be
499 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
501 gphi
*phi
= gsi
.phi ();
502 tree access_fn
= NULL
;
503 tree def
= PHI_RESULT (phi
);
504 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
509 /* Skip virtual phi's. The data dependences that are associated with
510 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
511 if (virtual_operand_p (def
))
514 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
516 /* Analyze the evolution function. */
517 access_fn
= analyze_scalar_evolution (loop
, def
);
520 STRIP_NOPS (access_fn
);
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE
, vect_location
,
523 "Access function of PHI: %T\n", access_fn
);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
525 = initial_condition_in_loop_num (access_fn
, loop
->num
);
526 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
527 = evolution_part_in_loop_num (access_fn
, loop
->num
);
531 || vect_inner_phi_in_double_reduction_p (stmt_vinfo
, phi
)
532 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
533 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
534 && TREE_CODE (step
) != INTEGER_CST
))
536 worklist
.safe_push (stmt_vinfo
);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
542 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
544 if (dump_enabled_p ())
545 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
546 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
550 /* Second - identify all reductions and nested cycles. */
551 while (worklist
.length () > 0)
553 stmt_vec_info stmt_vinfo
= worklist
.pop ();
554 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
555 tree def
= PHI_RESULT (phi
);
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
560 gcc_assert (!virtual_operand_p (def
)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
563 stmt_vec_info reduc_stmt_info
564 = vect_is_simple_reduction (loop_vinfo
, stmt_vinfo
, &double_reduc
);
567 STMT_VINFO_REDUC_DEF (stmt_vinfo
) = reduc_stmt_info
;
568 STMT_VINFO_REDUC_DEF (reduc_stmt_info
) = stmt_vinfo
;
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE
, vect_location
,
573 "Detected double reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_double_reduction_def
;
580 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE
, vect_location
,
584 "Detected vectorizable nested cycle.\n");
586 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
590 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE
, vect_location
,
592 "Detected reduction.\n");
594 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
595 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
596 /* Store the reduction cycles for possible vectorization in
597 loop-aware SLP if it was not detected as reduction
599 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
600 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
608 "Unknown def-use cycle pattern.\n");
613 /* Function vect_analyze_scalar_cycles.
615 Examine the cross iteration def-use cycles of scalar variables, by
616 analyzing the loop-header PHIs of scalar variables. Classify each
617 cycle as one of the following: invariant, induction, reduction, unknown.
618 We do that for the loop represented by LOOP_VINFO, and also to its
619 inner-loop, if exists.
620 Examples for scalar cycles:
635 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
637 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
639 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
641 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
642 Reductions in such inner-loop therefore have different properties than
643 the reductions in the nest that gets vectorized:
644 1. When vectorized, they are executed in the same order as in the original
645 scalar loop, so we can't change the order of computation when
647 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
648 current checks are too strict. */
651 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
654 /* Transfer group and reduction information from STMT_INFO to its
658 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
660 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
662 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
663 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
664 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
667 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
668 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
669 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
671 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
672 = STMT_VINFO_RELATED_STMT (stmt_info
);
675 STMT_VINFO_DEF_TYPE (stmtp
) = vect_reduction_def
;
678 /* Fixup scalar cycles that now have their stmts detected as patterns. */
681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
686 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
687 if (STMT_VINFO_IN_PATTERN_P (first
))
689 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
692 if (! STMT_VINFO_IN_PATTERN_P (next
))
694 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
696 /* If not all stmt in the chain are patterns try to handle
697 the chain without patterns. */
700 vect_fixup_reduc_chain (first
);
701 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
702 = STMT_VINFO_RELATED_STMT (first
);
707 /* Function vect_get_loop_niters.
709 Determine how many iterations the loop is executed and place it
710 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
711 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
712 niter information holds in ASSUMPTIONS.
714 Return the loop exit condition. */
718 vect_get_loop_niters (class loop
*loop
, tree
*assumptions
,
719 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
721 edge exit
= single_exit (loop
);
722 class tree_niter_desc niter_desc
;
723 tree niter_assumptions
, niter
, may_be_zero
;
724 gcond
*cond
= get_loop_exit_condition (loop
);
726 *assumptions
= boolean_true_node
;
727 *number_of_iterationsm1
= chrec_dont_know
;
728 *number_of_iterations
= chrec_dont_know
;
729 DUMP_VECT_SCOPE ("get_loop_niters");
734 may_be_zero
= NULL_TREE
;
735 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
736 || chrec_contains_undetermined (niter_desc
.niter
))
739 niter_assumptions
= niter_desc
.assumptions
;
740 may_be_zero
= niter_desc
.may_be_zero
;
741 niter
= niter_desc
.niter
;
743 if (may_be_zero
&& integer_zerop (may_be_zero
))
744 may_be_zero
= NULL_TREE
;
748 if (COMPARISON_CLASS_P (may_be_zero
))
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
753 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
755 fold_build1 (TRUTH_NOT_EXPR
,
759 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
760 build_int_cst (TREE_TYPE (niter
), 0),
761 rewrite_to_non_trapping_overflow (niter
));
763 may_be_zero
= NULL_TREE
;
765 else if (integer_nonzerop (may_be_zero
))
767 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
768 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
775 *assumptions
= niter_assumptions
;
776 *number_of_iterationsm1
= niter
;
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter
&& !chrec_contains_undetermined (niter
))
783 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), unshare_expr (niter
),
784 build_int_cst (TREE_TYPE (niter
), 1));
785 *number_of_iterations
= niter
;
790 /* Function bb_in_loop_p
792 Used as predicate for dfs order traversal of the loop bbs. */
795 bb_in_loop_p (const_basic_block bb
, const void *data
)
797 const class loop
*const loop
= (const class loop
*)data
;
798 if (flow_bb_inside_loop_p (loop
, bb
))
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
807 _loop_vec_info::_loop_vec_info (class loop
*loop_in
, vec_info_shared
*shared
)
808 : vec_info (vec_info::loop
, init_cost (loop_in
), shared
),
810 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
811 num_itersm1 (NULL_TREE
),
812 num_iters (NULL_TREE
),
813 num_iters_unchanged (NULL_TREE
),
814 num_iters_assumptions (NULL_TREE
),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE
),
820 mask_compare_type (NULL_TREE
),
821 simd_if_cond (NULL_TREE
),
823 peeling_for_alignment (0),
827 slp_unrolling_factor (1),
828 single_scalar_iteration_cost (0),
829 vectorizable (false),
830 can_fully_mask_p (true),
831 fully_masked_p (false),
832 peeling_for_gaps (false),
833 peeling_for_niter (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop_scaling (profile_probability::uninitialized ()),
838 orig_loop_info (NULL
)
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would the same
843 as reversed postorder traversal, so we are safe. */
845 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
846 bbs
, loop
->num_nodes
, loop
);
847 gcc_assert (nbbs
== loop
->num_nodes
);
849 for (unsigned int i
= 0; i
< nbbs
; i
++)
851 basic_block bb
= bbs
[i
];
852 gimple_stmt_iterator si
;
854 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
856 gimple
*phi
= gsi_stmt (si
);
857 gimple_set_uid (phi
, 0);
861 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
863 gimple
*stmt
= gsi_stmt (si
);
864 gimple_set_uid (stmt
, 0);
866 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
867 third argument is the #pragma omp simd if (x) condition, when 0,
868 loop shouldn't be vectorized, when non-zero constant, it should
869 be vectorized normally, otherwise versioned with vectorized loop
870 done if the condition is non-zero at runtime. */
872 && is_gimple_call (stmt
)
873 && gimple_call_internal_p (stmt
)
874 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt
) >= 3
876 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
878 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
880 tree arg
= gimple_call_arg (stmt
, 2);
881 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
884 gcc_assert (integer_nonzerop (arg
));
890 /* Free all levels of MASKS. */
893 release_vec_loop_masks (vec_loop_masks
*masks
)
897 FOR_EACH_VEC_ELT (*masks
, i
, rgm
)
898 rgm
->masks
.release ();
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
905 _loop_vec_info::~_loop_vec_info ()
909 release_vec_loop_masks (&masks
);
916 /* Return an invariant or register for EXPR and emit necessary
917 computations in the LOOP_VINFO loop preheader. */
920 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
922 if (is_gimple_reg (expr
)
923 || is_gimple_min_invariant (expr
))
926 if (! loop_vinfo
->ivexpr_map
)
927 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
928 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
931 gimple_seq stmts
= NULL
;
932 cached
= force_gimple_operand (unshare_expr (expr
),
933 &stmts
, true, NULL_TREE
);
936 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
937 gsi_insert_seq_on_edge_immediate (e
, stmts
);
943 /* Return true if we can use CMP_TYPE as the comparison type to produce
944 all masks required to mask LOOP_VINFO. */
947 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
951 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
952 if (rgm
->mask_type
!= NULL_TREE
953 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
954 cmp_type
, rgm
->mask_type
,
960 /* Calculate the maximum number of scalars per iteration for every
961 rgroup in LOOP_VINFO. */
964 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
966 unsigned int res
= 1;
969 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
970 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
974 /* Each statement in LOOP_VINFO can be masked where necessary. Check
975 whether we can actually generate the masks required. Return true if so,
976 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
979 vect_verify_full_masking (loop_vec_info loop_vinfo
)
981 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
982 unsigned int min_ni_width
;
983 unsigned int max_nscalars_per_iter
984 = vect_get_max_nscalars_per_iter (loop_vinfo
);
986 /* Use a normal loop if there are no statements that need masking.
987 This only happens in rare degenerate cases: it means that the loop
988 has no loads, no stores, and no live-out values. */
989 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
992 /* Get the maximum number of iterations that is representable
993 in the counter type. */
994 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
995 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
997 /* Get a more refined estimate for the number of iterations. */
998 widest_int max_back_edges
;
999 if (max_loop_iterations (loop
, &max_back_edges
))
1000 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1002 /* Account for rgroup masks, in which each bit is replicated N times. */
1003 max_ni
*= max_nscalars_per_iter
;
1005 /* Work out how many bits we need to represent the limit. */
1006 min_ni_width
= wi::min_precision (max_ni
, UNSIGNED
);
1008 /* Find a scalar mode for which WHILE_ULT is supported. */
1009 opt_scalar_int_mode cmp_mode_iter
;
1010 tree cmp_type
= NULL_TREE
;
1011 tree iv_type
= NULL_TREE
;
1012 widest_int iv_limit
= vect_iv_limit_for_full_masking (loop_vinfo
);
1013 unsigned int iv_precision
= UINT_MAX
;
1016 iv_precision
= wi::min_precision (iv_limit
* max_nscalars_per_iter
,
1019 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1021 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1022 if (cmp_bits
>= min_ni_width
1023 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1025 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1027 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1029 /* Although we could stop as soon as we find a valid mode,
1030 there are at least two reasons why that's not always the
1033 - An IV that's Pmode or wider is more likely to be reusable
1034 in address calculations than an IV that's narrower than
1037 - Doing the comparison in IV_PRECISION or wider allows
1038 a natural 0-based IV, whereas using a narrower comparison
1039 type requires mitigations against wrap-around.
1041 Conversely, if the IV limit is variable, doing the comparison
1042 in a wider type than the original type can introduce
1043 unnecessary extensions, so picking the widest valid mode
1044 is not always a good choice either.
1046 Here we prefer the first IV type that's Pmode or wider,
1047 and the first comparison type that's IV_PRECISION or wider.
1048 (The comparison type must be no wider than the IV type,
1049 to avoid extensions in the vector loop.)
1051 ??? We might want to try continuing beyond Pmode for ILP32
1052 targets if CMP_BITS < IV_PRECISION. */
1053 iv_type
= this_type
;
1054 if (!cmp_type
|| iv_precision
> TYPE_PRECISION (cmp_type
))
1055 cmp_type
= this_type
;
1056 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
1065 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1066 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo
) = iv_type
;
/* NOTE(review): this chunk is a mangled extraction of GCC's tree-vect-loop.c;
   statements are split across lines and some original lines are missing (the
   embedded numbers jump, e.g. 1115->1117->1120).  Comments only are added
   here -- verify against upstream before changing any code.

   Purpose: estimate the cost of ONE scalar iteration of LOOP_VINFO's loop and
   record it via LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  Per-statement costs
   are first gathered into LOOP_VINFO_SCALAR_ITERATION_COST, then folded
   through the target cost hooks (init_cost/add_stmt_cost/finish_cost).
   Statements not relevant/live for vectorization are skipped; statements in
   the inner loop of an outer-loop vectorization are weighted by a hard-coded
   trip-count guess of 50 (see the FIXME below).  */
1070 /* Calculate the cost of one scalar iteration of the loop. */
1072 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1074 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1075 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1076 int nbbs
= loop
->num_nodes
, factor
;
1077 int innerloop_iters
, i
;
1079 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1081 /* Gather costs for statements in the scalar loop. */
/* Assume a single inner-loop iteration by default; the 50 below is the
   weight applied when the loop actually has an inner loop.  */
1084 innerloop_iters
= 1;
1086 innerloop_iters
= 50; /* FIXME */
1088 for (i
= 0; i
< nbbs
; i
++)
1090 gimple_stmt_iterator si
;
1091 basic_block bb
= bbs
[i
];
/* Blocks belonging to the inner loop are charged innerloop_iters times.  */
1093 if (bb
->loop_father
== loop
->inner
)
1094 factor
= innerloop_iters
;
1098 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1100 gimple
*stmt
= gsi_stmt (si
);
1101 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
/* Only assignments and calls contribute to the scalar cost.  */
1103 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1106 /* Skip stmts that are not vectorized inside the loop. */
1107 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1108 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1109 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1110 || !VECTORIZABLE_CYCLE_DEF
1111 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
/* Classify the statement for costing: loads/stores vs. plain stmts.
   NOTE(review): the scalar_load branch (original lines 1116-1124 region)
   is partially missing from this extraction.  */
1114 vect_cost_for_stmt kind
;
1115 if (STMT_VINFO_DATA_REF (stmt_info
))
1117 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1120 kind
= scalar_store
;
1125 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1126 factor
, kind
, stmt_info
, 0, vect_prologue
);
1130 /* Now accumulate cost. */
1131 void *target_cost_data
= init_cost (loop
);
1132 stmt_info_for_cost
*si
;
1134 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1136 (void) add_stmt_cost (target_cost_data
, si
->count
,
1137 si
->kind
, si
->stmt_info
, si
->misalign
,
/* Only the body cost is kept; prologue/epilogue results are discarded.  */
1139 unsigned dummy
, body_cost
= 0;
1140 finish_cost (target_cost_data
, &dummy
, &body_cost
, &dummy
);
1141 destroy_cost_data (target_cost_data
);
1142 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
) = body_cost
;
/* NOTE(review): mangled extraction of GCC's tree-vect-loop.c; some original
   lines are missing between the numbered fragments.  Comments only added.

   Checks that LOOP has a shape the vectorizer can handle and computes its
   iteration count.  Outputs (all by pointer): LOOP_COND -- the controlling
   exit condition; ASSUMPTIONS -- conditions under which the niter analysis
   holds; NUMBER_OF_ITERATIONS / NUMBER_OF_ITERATIONSM1 -- the trip count and
   trip count minus one; INNER_LOOP_COND -- the inner loop's exit condition
   when analyzing a nested (outer) loop.  Returns opt_result::success () on a
   vectorizable form, otherwise a failure_at result carrying the reason.
   Recurses on loop->inner for the outer-loop case.  */
1146 /* Function vect_analyze_loop_form_1.
1148 Verify that certain CFG restrictions hold, including:
1149 - the loop has a pre-header
1150 - the loop has a single entry and exit
1151 - the loop exit condition is simple enough
1152 - the number of iterations can be analyzed, i.e, a countable loop. The
1153 niter could be analyzed under some assumptions. */
1156 vect_analyze_loop_form_1 (class loop
*loop
, gcond
**loop_cond
,
1157 tree
*assumptions
, tree
*number_of_iterationsm1
,
1158 tree
*number_of_iterations
, gcond
**inner_loop_cond
)
1160 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1162 /* Different restrictions apply when we are considering an inner-most loop,
1163 vs. an outer (nested) loop.
1164 (FORNOW. May want to relax some of these restrictions in the future). */
1168 /* Inner-most loop. We currently require that the number of BBs is
1169 exactly 2 (the header and latch). Vectorizable inner-most loops
1180 if (loop
->num_nodes
!= 2)
1181 return opt_result::failure_at (vect_location
,
1183 " control flow in loop.\n");
1185 if (empty_block_p (loop
->header
))
1186 return opt_result::failure_at (vect_location
,
1187 "not vectorized: empty loop.\n");
1191 class loop
*innerloop
= loop
->inner
;
1194 /* Nested loop. We currently require that the loop is doubly-nested,
1195 contains a single inner loop, and the number of BBs is exactly 5.
1196 Vectorizable outer-loops look like this:
1208 The inner-loop has the properties expected of inner-most loops
1209 as described above. */
1211 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1212 return opt_result::failure_at (vect_location
,
1214 " multiple nested loops.\n");
1216 if (loop
->num_nodes
!= 5)
1217 return opt_result::failure_at (vect_location
,
1219 " control flow in loop.\n");
/* The inner loop's preheader must come from the outer header, and the
   inner exit must feed the outer latch's single predecessor.  */
1221 entryedge
= loop_preheader_edge (innerloop
);
1222 if (entryedge
->src
!= loop
->header
1223 || !single_exit (innerloop
)
1224 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1225 return opt_result::failure_at (vect_location
,
1227 " unsupported outerloop form.\n");
1229 /* Analyze the inner-loop. */
1230 tree inner_niterm1
, inner_niter
, inner_assumptions
;
1232 = vect_analyze_loop_form_1 (loop
->inner
, inner_loop_cond
,
1233 &inner_assumptions
, &inner_niterm1
,
1234 &inner_niter
, NULL
);
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1239 "not vectorized: Bad inner loop.\n");
1243 /* Don't support analyzing niter under assumptions for inner
1245 if (!integer_onep (inner_assumptions
))
1246 return opt_result::failure_at (vect_location
,
1247 "not vectorized: Bad inner loop.\n");
/* The inner trip count must not vary across outer iterations.  */
1249 if (!expr_invariant_in_loop_p (loop
, inner_niter
))
1250 return opt_result::failure_at (vect_location
,
1251 "not vectorized: inner-loop count not"
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_NOTE
, vect_location
,
1256 "Considering outer-loop vectorization.\n");
1259 if (!single_exit (loop
))
1260 return opt_result::failure_at (vect_location
,
1261 "not vectorized: multiple exits.\n");
1262 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1263 return opt_result::failure_at (vect_location
,
1265 " too many incoming edges.\n");
1267 /* We assume that the loop exit condition is at the end of the loop. i.e,
1268 that the loop is represented as a do-while (with a proper if-guard
1269 before the loop if needed), where the loop header contains all the
1270 executable statements, and the latch is empty. */
1271 if (!empty_block_p (loop
->latch
)
1272 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1273 return opt_result::failure_at (vect_location
,
1274 "not vectorized: latch block not empty.\n");
1276 /* Make sure the exit is not abnormal. */
1277 edge e
= single_exit (loop
);
1278 if (e
->flags
& EDGE_ABNORMAL
)
1279 return opt_result::failure_at (vect_location
,
1281 " abnormal loop exit edge.\n");
/* Compute the trip count; fails (below) if the exit condition is too
   complicated or the count cannot be determined.  */
1283 *loop_cond
= vect_get_loop_niters (loop
, assumptions
, number_of_iterations
,
1284 number_of_iterationsm1
);
1286 return opt_result::failure_at
1288 "not vectorized: complicated exit condition.\n");
1290 if (integer_zerop (*assumptions
)
1291 || !*number_of_iterations
1292 || chrec_contains_undetermined (*number_of_iterations
))
1293 return opt_result::failure_at
1295 "not vectorized: number of iterations cannot be computed.\n");
1297 if (integer_zerop (*number_of_iterations
))
1298 return opt_result::failure_at
1300 "not vectorized: number of iterations = 0.\n");
1302 return opt_result::success ();
/* NOTE(review): mangled extraction of GCC's tree-vect-loop.c; some original
   lines are missing between the numbered fragments.  Comments only added.

   Wrapper around vect_analyze_loop_form_1: on success, allocates a fresh
   _loop_vec_info for LOOP (with the given SHARED vec_info), records the
   computed trip counts and any niter assumptions (enabling versioning under
   those assumptions), tags the (inner) exit conditions as loop-exit control
   statements so they are not treated as vectorizable work, and stashes the
   loop_vec_info in loop->aux.  On failure, propagates the opt_problem.  */
1305 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1308 vect_analyze_loop_form (class loop
*loop
, vec_info_shared
*shared
)
1310 tree assumptions
, number_of_iterations
, number_of_iterationsm1
;
1311 gcond
*loop_cond
, *inner_loop_cond
= NULL
;
1314 = vect_analyze_loop_form_1 (loop
, &loop_cond
,
1315 &assumptions
, &number_of_iterationsm1
,
1316 &number_of_iterations
, &inner_loop_cond
);
1318 return opt_loop_vec_info::propagate_failure (res
);
1320 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
, shared
);
1321 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1322 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1323 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1324 if (!integer_onep (assumptions
))
1326 /* We consider to vectorize this loop by versioning it under
1327 some assumptions. In order to do this, we need to clear
1328 existing information computed by scev and niter analyzer. */
1330 free_numbers_of_iterations_estimates (loop
);
1331 /* Also set flag for this loop so that following scev and niter
1332 analysis are done under the assumptions. */
1333 loop_constraint_set (loop
, LOOP_C_FINITE
);
1334 /* Also record the assumptions for versioning. */
1335 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = assumptions
;
1338 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1340 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_NOTE
, vect_location
,
1343 "Symbolic number of iterations is ");
1344 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1345 dump_printf (MSG_NOTE
, "\n");
/* Mark the exit conditions so later analysis treats them as control flow,
   not as statements needing vectorization.  */
1349 stmt_vec_info loop_cond_info
= loop_vinfo
->lookup_stmt (loop_cond
);
1350 STMT_VINFO_TYPE (loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1351 if (inner_loop_cond
)
1353 stmt_vec_info inner_loop_cond_info
1354 = loop_vinfo
->lookup_stmt (inner_loop_cond
);
1355 STMT_VINFO_TYPE (inner_loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1358 gcc_assert (!loop
->aux
);
1359 loop
->aux
= loop_vinfo
;
1360 return opt_loop_vec_info::success (loop_vinfo
);
/* NOTE(review): mangled extraction of GCC's tree-vect-loop.c; some original
   lines are missing between the numbered fragments.  Comments only added.

   Re-derives LOOP_VINFO_VECT_FACTOR after the SLP decision: if every
   relevant statement is pure-SLP, the VF becomes the SLP unrolling factor;
   otherwise (hybrid SLP) the VF is forced to a common multiple of the
   current VF and the SLP unrolling factor.  Dumps the updated factor.  */
1365 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1366 statements update the vectorization factor. */
1369 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1371 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1372 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1373 int nbbs
= loop
->num_nodes
;
1374 poly_uint64 vectorization_factor
;
1377 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1379 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1380 gcc_assert (known_ne (vectorization_factor
, 0U));
1382 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1383 vectorization factor of the loop is the unrolling factor required by
1384 the SLP instances. If that unrolling factor is 1, we say, that we
1385 perform pure SLP on loop - cross iteration parallelism is not
1387 bool only_slp_in_loop
= true;
1388 for (i
= 0; i
< nbbs
; i
++)
1390 basic_block bb
= bbs
[i
];
1391 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1394 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
1395 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
/* Any relevant (or cycle-def) statement that is not pure SLP makes the
   loop hybrid.  */
1396 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1397 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1398 && !PURE_SLP_STMT (stmt_info
))
1399 /* STMT needs both SLP and loop-based vectorization. */
1400 only_slp_in_loop
= false;
1404 if (only_slp_in_loop
)
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE
, vect_location
,
1408 "Loop contains only SLP stmts\n");
1409 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE
, vect_location
,
1415 "Loop contains SLP and non-SLP stmts\n");
1416 /* Both the vectorization factor and unroll factor have the form
1417 current_vector_size * X for some rational X, so they must have
1418 a common multiple. */
1419 vectorization_factor
1420 = force_common_multiple (vectorization_factor
,
1421 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
1424 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1425 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE
, vect_location
,
1428 "Updating vectorization factor to ");
1429 dump_dec (MSG_NOTE
, vectorization_factor
);
1430 dump_printf (MSG_NOTE
, ".\n");
/* NOTE(review): mangled extraction; the 'return false;' arm and braces of
   this predicate are missing from this chunk -- compare with upstream
   tree-vect-loop.c before editing.
   Predicate: true iff STMT_INFO is a double-reduction phi whose paired
   reduction phi (STMT_VINFO_REDUC_DEF) is itself relevant, i.e. the double
   reduction is actually live for vectorization.  */
1434 /* Return true if STMT_INFO describes a double reduction phi and if
1435 the other phi in the reduction is also relevant for vectorization.
1436 This rejects cases such as:
1439 x_1 = PHI <x_3(outer2), ...>;
1447 x_3 = PHI <x_2(inner)>;
1449 if nothing in x_2 or elsewhere makes x_1 relevant. */
1452 vect_active_double_reduction_p (stmt_vec_info stmt_info
)
1454 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1457 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info
));
/* NOTE(review): mangled extraction of GCC's tree-vect-loop.c; several
   original lines are missing between the numbered fragments (e.g. the
   'continue' paths and some else-branches).  Comments only added.

   Walks every basic block of the loop and checks that each phi and each
   statement can be vectorized (dispatching to vectorizable_induction /
   vectorizable_reduction / vectorizable_lc_phi / vectorizable_live_operation
   / vect_analyze_stmt as appropriate), accumulating per-stmt costs into
   cost_vec and then into the target cost data.  Fails with a descriptive
   opt_result if any phi/stmt is unsupported, or if nothing in the loop needs
   vectorizing at all (need_to_vectorize stays false).  */
1460 /* Function vect_analyze_loop_operations.
1462 Scan the loop stmts and make sure they are all vectorizable. */
1465 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1467 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1468 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1469 int nbbs
= loop
->num_nodes
;
1471 stmt_vec_info stmt_info
;
1472 bool need_to_vectorize
= false;
1475 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1477 auto_vec
<stmt_info_for_cost
> cost_vec
;
1479 for (i
= 0; i
< nbbs
; i
++)
1481 basic_block bb
= bbs
[i
];
1483 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1486 gphi
*phi
= si
.phi ();
1489 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: %G", phi
);
/* Virtual (memory) phis carry no computation to vectorize.  */
1492 if (virtual_operand_p (gimple_phi_result (phi
)))
1495 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1496 (i.e., a phi in the tail of the outer-loop). */
1497 if (! is_loop_header_bb_p (bb
))
1499 /* FORNOW: we currently don't support the case that these phis
1500 are not used in the outerloop (unless it is double reduction,
1501 i.e., this phi is vect_reduction_def), cause this case
1502 requires to actually do something here. */
1503 if (STMT_VINFO_LIVE_P (stmt_info
)
1504 && !vect_active_double_reduction_p (stmt_info
))
1505 return opt_result::failure_at (phi
,
1506 "Unsupported loop-closed phi"
1507 " in outer-loop.\n");
1509 /* If PHI is used in the outer loop, we check that its operand
1510 is defined in the inner loop. */
1511 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1515 if (gimple_phi_num_args (phi
) != 1)
1516 return opt_result::failure_at (phi
, "unsupported phi");
1518 phi_op
= PHI_ARG_DEF (phi
, 0);
1519 stmt_vec_info op_def_info
= loop_vinfo
->lookup_def (phi_op
);
1521 return opt_result::failure_at (phi
, "unsupported phi\n");
1523 if (STMT_VINFO_RELEVANT (op_def_info
) != vect_used_in_outer
1524 && (STMT_VINFO_RELEVANT (op_def_info
)
1525 != vect_used_in_outer_by_reduction
))
1526 return opt_result::failure_at (phi
, "unsupported phi\n");
1528 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
1529 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1530 == vect_double_reduction_def
))
1531 && !vectorizable_lc_phi (stmt_info
, NULL
, NULL
))
1532 return opt_result::failure_at (phi
, "unsupported phi\n");
1538 gcc_assert (stmt_info
);
/* Header phis that are used in-loop (or live) but are not inductions
   form scalar dependence cycles we cannot vectorize.  */
1540 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1541 || STMT_VINFO_LIVE_P (stmt_info
))
1542 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1543 /* A scalar-dependence cycle that we don't support. */
1544 return opt_result::failure_at (phi
,
1546 " scalar dependence cycle.\n");
1548 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1550 need_to_vectorize
= true;
1551 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
1552 && ! PURE_SLP_STMT (stmt_info
))
1553 ok
= vectorizable_induction (stmt_info
, NULL
, NULL
, NULL
,
1555 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
1556 || (STMT_VINFO_DEF_TYPE (stmt_info
)
1557 == vect_double_reduction_def
)
1558 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
1559 && ! PURE_SLP_STMT (stmt_info
))
1560 ok
= vectorizable_reduction (stmt_info
, NULL
, NULL
, &cost_vec
);
1563 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1565 && STMT_VINFO_LIVE_P (stmt_info
)
1566 && !PURE_SLP_STMT (stmt_info
))
1567 ok
= vectorizable_live_operation (stmt_info
, NULL
, NULL
, NULL
,
1568 -1, false, &cost_vec
);
1571 return opt_result::failure_at (phi
,
1572 "not vectorized: relevant phi not "
1574 static_cast <gimple
*> (phi
));
/* Non-phi statements: each one must pass vect_analyze_stmt.  */
1577 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1580 gimple
*stmt
= gsi_stmt (si
);
1581 if (!gimple_clobber_p (stmt
))
1584 = vect_analyze_stmt (loop_vinfo
->lookup_stmt (stmt
),
1586 NULL
, NULL
, &cost_vec
);
1593 add_stmt_costs (loop_vinfo
->target_cost_data
, &cost_vec
);
1595 /* All operations in the loop are either irrelevant (deal with loop
1596 control, or dead), or only used outside the loop and can be moved
1597 out of the loop (e.g. invariants, inductions). The loop can be
1598 optimized away by scalar optimizations. We're better off not
1599 touching this loop. */
1600 if (!need_to_vectorize
)
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE
, vect_location
,
1604 "All the computation can be taken out of the loop.\n");
1605 return opt_result::failure_at
1607 "not vectorized: redundant loop. no profit to vectorize.\n");
1610 return opt_result::success ();
/* NOTE(review): mangled extraction of GCC's tree-vect-loop.c; the final
   return statements of this function (original lines after 1701) are
   missing from this chunk.  Comments only added.

   Cost-model gate for the loop: rejects loops whose (known or bounded)
   iteration count is below the vectorization factor, computes the minimum
   profitable iteration counts via vect_estimate_min_profitable_iters,
   combines them with the user's PARAM_MIN_VECT_LOOP_BOUND to set
   LOOP_VINFO_COST_MODEL_THRESHOLD, and rejects loops whose known or
   estimated trip count falls under that threshold.  Per the header comment,
   the (unseen) return value is 1 = yes, 0 = no, -1 = worth retrying.  */
1613 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1614 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1615 definitely no, or -1 if it's worth retrying. */
1618 vect_analyze_loop_costing (loop_vec_info loop_vinfo
)
1620 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1621 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1623 /* Only fully-masked loops can have iteration counts less than the
1624 vectorization factor. */
1625 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
1627 HOST_WIDE_INT max_niter
;
1629 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1630 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
1632 max_niter
= max_stmt_executions_int (loop
);
1635 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1639 "not vectorized: iteration count smaller than "
1640 "vectorization factor.\n");
1645 int min_profitable_iters
, min_profitable_estimate
;
1646 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1647 &min_profitable_estimate
);
/* A negative value means the vector version is never profitable.  */
1649 if (min_profitable_iters
< 0)
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1653 "not vectorized: vectorization not profitable.\n");
1654 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1656 "not vectorized: vector version will never be "
1661 int min_scalar_loop_bound
= (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1664 /* Use the cost model only if it is more conservative than user specified
1666 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
1667 min_profitable_iters
);
1669 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1671 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1672 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
1674 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1676 "not vectorized: vectorization not profitable.\n");
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_NOTE
, vect_location
,
1679 "not vectorized: iteration count smaller than user "
1680 "specified loop bound parameter or minimum profitable "
1681 "iterations (whichever is more conservative).\n");
/* Unknown trip count: fall back to the profile-based estimate.  */
1685 HOST_WIDE_INT estimated_niter
= estimated_stmt_executions_int (loop
);
1686 if (estimated_niter
== -1)
1687 estimated_niter
= likely_max_stmt_executions_int (loop
);
1688 if (estimated_niter
!= -1
1689 && ((unsigned HOST_WIDE_INT
) estimated_niter
1690 < MAX (th
, (unsigned) min_profitable_estimate
)))
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1694 "not vectorized: estimated iteration count too "
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_NOTE
, vect_location
,
1698 "not vectorized: estimated iteration count smaller "
1699 "than specified loop bound parameter or minimum "
1700 "profitable iterations (whichever is more "
1701 "conservative).\n");
/* NOTE(review): mangled extraction of GCC's tree-vect-loop.c; several
   original lines are missing between the numbered fragments (e.g. the
   stmt-count increment using N_STMTS and some failure/continue paths).
   Comments only added.

   Collects all data references of the loop into DATAREFS by scanning every
   statement of every block, via vect_find_stmt_data_reference.  When that
   lookup fails for a call inside a '#pragma omp simd' (loop->safelen) loop
   whose callee has SIMD clones, the call is tolerated provided neither its
   arguments nor its lhs contain memory references.  Fails fatally when the
   number of datarefs exceeds PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS, since
   dependence analysis would give up anyway.  Returns an opt_result.  */
1709 vect_get_datarefs_in_loop (loop_p loop
, basic_block
*bbs
,
1710 vec
<data_reference_p
> *datarefs
,
1711 unsigned int *n_stmts
)
1714 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1715 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
1716 !gsi_end_p (gsi
); gsi_next (&gsi
))
1718 gimple
*stmt
= gsi_stmt (gsi
);
/* Debug statements carry no data references.  */
1719 if (is_gimple_debug (stmt
))
1722 opt_result res
= vect_find_stmt_data_reference (loop
, stmt
, datarefs
);
1725 if (is_gimple_call (stmt
) && loop
->safelen
)
1727 tree fndecl
= gimple_call_fndecl (stmt
), op
;
1728 if (fndecl
!= NULL_TREE
)
1730 cgraph_node
*node
= cgraph_node::get (fndecl
);
1731 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
1733 unsigned int j
, n
= gimple_call_num_args (stmt
);
1734 for (j
= 0; j
< n
; j
++)
1736 op
= gimple_call_arg (stmt
, j
);
1738 || (REFERENCE_CLASS_P (op
)
1739 && get_base_address (op
)))
1742 op
= gimple_call_lhs (stmt
);
1743 /* Ignore #pragma omp declare simd functions
1744 if they don't have data references in the
1745 call stmt itself. */
1749 || (REFERENCE_CLASS_P (op
)
1750 && get_base_address (op
)))))
1757 /* If dependence analysis will give up due to the limit on the
1758 number of datarefs stop here and fail fatally. */
1759 if (datarefs
->length ()
1760 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS
))
1761 return opt_result::failure_at (stmt
, "exceeded param "
1762 "loop-max-datarefs-for-datadeps\n");
1764 return opt_result::success ();
/* NOTE(review): mangled extraction of GCC's tree-vect-loop.c; the loop
   advancing 'vinfo = next' and the closing braces are missing from this
   chunk.  Comments only added.

   After SLP analysis is final: any interleaving group that was marked as
   usable only under SLP (STMT_VINFO_SLP_VECT_ONLY) but whose statements did
   not end up SLP-vectorized is broken apart -- every member becomes its own
   single-element group (its own FIRST_ELEMENT, no NEXT_ELEMENT, size 1,
   gap group_size - 1) so loop-based vectorization can handle the accesses
   individually.  */
1767 /* Look for SLP-only access groups and turn each individual access into its own
1770 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo
)
1773 struct data_reference
*dr
;
1775 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1777 vec
<data_reference_p
> datarefs
= loop_vinfo
->shared
->datarefs
;
1778 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1780 gcc_assert (DR_REF (dr
));
1781 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (DR_STMT (dr
));
1783 /* Check if the load is a part of an interleaving chain. */
1784 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1786 stmt_vec_info first_element
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1787 unsigned int group_size
= DR_GROUP_SIZE (first_element
);
1789 /* Check if SLP-only groups. */
1790 if (!STMT_SLP_TYPE (stmt_info
)
1791 && STMT_VINFO_SLP_VECT_ONLY (first_element
))
1793 /* Dissolve the group. */
1794 STMT_VINFO_SLP_VECT_ONLY (first_element
) = false;
1796 stmt_vec_info vinfo
= first_element
;
/* Walk the chain, turning each member into a standalone group.  */
1799 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (vinfo
);
1800 DR_GROUP_FIRST_ELEMENT (vinfo
) = vinfo
;
1801 DR_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
1802 DR_GROUP_SIZE (vinfo
) = 1;
/* The gap preserves the original stride so addressing stays correct.  */
1803 DR_GROUP_GAP (vinfo
) = group_size
- 1;
/* NOTE(review): mangled extraction of GCC's tree-vect-loop.c; some original
   lines are missing between the numbered fragments.  Comments only added.

   Decides whether an epilogue loop is needed for leftover scalar
   iterations and records the answer in LOOP_VINFO_PEELING_FOR_NITER:
   - fully-masked loops never need one (the masked main loop covers all
     iterations);
   - with a known trip count and non-negative alignment peeling, peeling is
     needed exactly when (niters - peel_niter) is not a multiple of VF;
   - otherwise peeling is needed for alignment/gap peeling, a non-constant
     VF, or a trip count not provably divisible by VF -- except when loop
     versioning already guarantees the threshold makes an epilogue
     redundant.  */
1812 /* Decides whether we need to create an epilogue loop to handle
1813 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1816 determine_peel_for_niter (loop_vec_info loop_vinfo
)
1818 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
1820 unsigned HOST_WIDE_INT const_vf
;
1821 HOST_WIDE_INT max_niter
1822 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
/* For epilogue loops, inherit the threshold from the main loop's info.  */
1824 unsigned th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
1825 if (!th
&& LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
))
1826 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1829 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
1830 /* The main loop handles all iterations. */
1831 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
1832 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1833 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
1835 /* Work out the (constant) number of iterations that need to be
1836 peeled for reasons other than niters. */
1837 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
1838 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
1840 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
1841 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1842 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1844 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1845 /* ??? When peeling for gaps but not alignment, we could
1846 try to check whether the (variable) niters is known to be
1847 VF * N + 1. That's something of a niche case though. */
1848 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1849 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
1850 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1851 < (unsigned) exact_log2 (const_vf
))
1852 /* In case of versioning, check if the maximum number of
1853 iterations is greater than th. If they are identical,
1854 the epilogue is unnecessary. */
1855 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1856 || ((unsigned HOST_WIDE_INT
) max_niter
1857 > (th
/ const_vf
) * const_vf
))))
1858 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
1862 /* Function vect_analyze_loop_2.
1864 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1865 for it. The different analyses will record information in the
1866 loop_vec_info struct. */
1868 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
, unsigned *n_stmts
)
1870 opt_result ok
= opt_result::success ();
1872 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
1873 poly_uint64 min_vf
= 2;
1875 /* The first group of checks is independent of the vector size. */
1878 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
1879 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
1880 return opt_result::failure_at (vect_location
,
1881 "not vectorized: simd if(0)\n");
1883 /* Find all data references in the loop (which correspond to vdefs/vuses)
1884 and analyze their evolution in the loop. */
1886 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1888 /* Gather the data references and count stmts in the loop. */
1889 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
1892 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
1893 &LOOP_VINFO_DATAREFS (loop_vinfo
),
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1899 "not vectorized: loop contains function "
1900 "calls or data references that cannot "
1904 loop_vinfo
->shared
->save_datarefs ();
1907 loop_vinfo
->shared
->check_datarefs ();
1909 /* Analyze the data references and also adjust the minimal
1910 vectorization factor according to the loads and stores. */
1912 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
, &fatal
);
1915 if (dump_enabled_p ())
1916 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1917 "bad data references.\n");
1921 /* Classify all cross-iteration scalar data-flow cycles.
1922 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1923 vect_analyze_scalar_cycles (loop_vinfo
);
1925 vect_pattern_recog (loop_vinfo
);
1927 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
1929 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1930 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1932 ok
= vect_analyze_data_ref_accesses (loop_vinfo
);
1935 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1937 "bad data access.\n");
1941 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1943 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
, &fatal
);
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1948 "unexpected pattern.\n");
1952 /* While the rest of the analysis below depends on it in some way. */
1955 /* Analyze data dependences between the data-refs in the loop
1956 and adjust the maximum vectorization factor according to
1958 FORNOW: fail at the first data dependence that we encounter. */
1960 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1965 "bad data dependence.\n");
1968 if (max_vf
!= MAX_VECTORIZATION_FACTOR
1969 && maybe_lt (max_vf
, min_vf
))
1970 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
1971 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
1973 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1976 if (dump_enabled_p ())
1977 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1978 "can't determine vectorization factor.\n");
1981 if (max_vf
!= MAX_VECTORIZATION_FACTOR
1982 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1983 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
1985 /* Compute the scalar iteration cost. */
1986 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
1988 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1990 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1991 ok
= vect_analyze_slp (loop_vinfo
, *n_stmts
);
1995 /* If there are any SLP instances mark them as pure_slp. */
1996 bool slp
= vect_make_slp_decision (loop_vinfo
);
1999 /* Find stmts that need to be both vectorized and SLPed. */
2000 vect_detect_hybrid_slp (loop_vinfo
);
2002 /* Update the vectorization factor based on the SLP decision. */
2003 vect_update_vf_for_slp (loop_vinfo
);
2006 bool saved_can_fully_mask_p
= LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
);
2008 /* We don't expect to have to roll back to anything other than an empty
2010 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
2012 /* This is the point where we can re-start analysis with SLP forced off. */
2015 /* Now the vectorization factor is final. */
2016 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2017 gcc_assert (known_ne (vectorization_factor
, 0U));
2019 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE
, vect_location
,
2022 "vectorization_factor = ");
2023 dump_dec (MSG_NOTE
, vectorization_factor
);
2024 dump_printf (MSG_NOTE
, ", niters = %wd\n",
2025 LOOP_VINFO_INT_NITERS (loop_vinfo
));
2028 /* Analyze the alignment of the data-refs in the loop.
2029 Fail if a data reference is found that cannot be vectorized. */
2031 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2036 "bad data alignment.\n");
2040 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2041 It is important to call pruning after vect_analyze_data_ref_accesses,
2042 since we use grouping information gathered by interleaving analysis. */
2043 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
2047 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2048 vectorization, since we do not want to add extra peeling or
2049 add versioning for alignment. */
2050 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
2051 /* This pass will decide on using loop versioning and/or loop peeling in
2052 order to enhance the alignment of data references in the loop. */
2053 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
2055 ok
= vect_verify_datarefs_alignment (loop_vinfo
);
2061 /* Analyze operations in the SLP instances. Note this may
2062 remove unsupported SLP instances which makes the above
2063 SLP kind detection invalid. */
2064 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
2065 vect_slp_analyze_operations (loop_vinfo
);
2066 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
2068 ok
= opt_result::failure_at (vect_location
,
2069 "unsupported SLP instances\n");
2074 /* Dissolve SLP-only groups. */
2075 vect_dissolve_slp_only_groups (loop_vinfo
);
2077 /* Scan all the remaining operations in the loop that are not subject
2078 to SLP and make sure they are vectorizable. */
2079 ok
= vect_analyze_loop_operations (loop_vinfo
);
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2084 "bad operation or unsupported loop bound.\n");
2088 /* Decide whether to use a fully-masked loop for this vectorization
2090 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
2091 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
)
2092 && vect_verify_full_masking (loop_vinfo
));
2093 if (dump_enabled_p ())
2095 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2096 dump_printf_loc (MSG_NOTE
, vect_location
,
2097 "using a fully-masked loop.\n");
2099 dump_printf_loc (MSG_NOTE
, vect_location
,
2100 "not using a fully-masked loop.\n");
2103 /* If epilog loop is required because of data accesses with gaps,
2104 one additional iteration needs to be peeled. Check if there is
2105 enough iterations for vectorization. */
2106 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2107 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2108 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2110 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2111 tree scalar_niters
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
2113 if (known_lt (wi::to_widest (scalar_niters
), vf
))
2114 return opt_result::failure_at (vect_location
,
2115 "loop has no enough iterations to"
2116 " support peeling for gaps.\n");
2119 /* Check the costings of the loop make vectorizing worthwhile. */
2120 res
= vect_analyze_loop_costing (loop_vinfo
);
2123 ok
= opt_result::failure_at (vect_location
,
2124 "Loop costings may not be worthwhile.\n");
2128 return opt_result::failure_at (vect_location
,
2129 "Loop costings not worthwhile.\n");
2131 determine_peel_for_niter (loop_vinfo
);
2132 /* If an epilogue loop is required make sure we can create one. */
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2134 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2138 if (!vect_can_advance_ivs_p (loop_vinfo
)
2139 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2140 single_exit (LOOP_VINFO_LOOP
2143 ok
= opt_result::failure_at (vect_location
,
2144 "not vectorized: can't create required "
2150 /* During peeling, we need to check if number of loop iterations is
2151 enough for both peeled prolog loop and vector loop. This check
2152 can be merged along with threshold check of loop versioning, so
2153 increase threshold for this case if necessary. */
2154 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
2156 poly_uint64 niters_th
= 0;
2157 unsigned int th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2159 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2161 /* Niters for peeled prolog loop. */
2162 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2164 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2165 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2166 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
2169 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2172 /* Niters for at least one iteration of vectorized loop. */
2173 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2174 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2175 /* One additional iteration because of peeling for gap. */
2176 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2179 /* Use the same condition as vect_transform_loop to decide when to use
2180 the cost to determine a versioning threshold. */
2181 if (th
>= vect_vf_for_cost (loop_vinfo
)
2182 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2183 && ordered_p (th
, niters_th
))
2184 niters_th
= ordered_max (poly_uint64 (th
), niters_th
);
2186 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2189 gcc_assert (known_eq (vectorization_factor
,
2190 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2192 /* Ok to vectorize! */
2193 return opt_result::success ();
2196 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2199 /* Try again with SLP forced off but if we didn't do any SLP there is
2200 no point in re-trying. */
2204 /* If there are reduction chains re-trying will fail anyway. */
2205 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2208 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2209 via interleaving or lane instructions. */
2210 slp_instance instance
;
2213 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2215 stmt_vec_info vinfo
;
2216 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2217 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2219 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2220 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2221 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2222 if (! vect_store_lanes_supported (vectype
, size
, false)
2223 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2224 && ! vect_grouped_store_supported (vectype
, size
))
2225 return opt_result::failure_at (vinfo
->stmt
,
2226 "unsupported grouped store\n");
2227 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2229 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2230 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2231 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2232 size
= DR_GROUP_SIZE (vinfo
);
2233 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2234 if (! vect_load_lanes_supported (vectype
, size
, false)
2235 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2237 return opt_result::failure_at (vinfo
->stmt
,
2238 "unsupported grouped load\n");
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_NOTE
, vect_location
,
2244 "re-trying with SLP disabled\n");
2246 /* Roll back state appropriately. No SLP this time. */
2248 /* Restore vectorization factor as it were without SLP. */
2249 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2250 /* Free the SLP instances. */
2251 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2252 vect_free_slp_instance (instance
, false);
2253 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2254 /* Reset SLP type to loop_vect on all stmts. */
2255 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2257 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2258 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2259 !gsi_end_p (si
); gsi_next (&si
))
2261 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2262 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2264 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2265 !gsi_end_p (si
); gsi_next (&si
))
2267 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2268 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2269 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2271 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2272 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
2273 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2274 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2275 !gsi_end_p (pi
); gsi_next (&pi
))
2276 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2281 /* Free optimized alias test DDRS. */
2282 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2283 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2284 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2285 /* Reset target cost data. */
2286 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
2287 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
2288 = init_cost (LOOP_VINFO_LOOP (loop_vinfo
));
2289 /* Reset accumulated rgroup information. */
2290 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo
));
2291 /* Reset assorted flags. */
2292 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2293 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2294 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2295 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2296 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = saved_can_fully_mask_p
;
/* NOTE(review): mangled extraction -- each original line is split across
   several physical lines and some original lines (per the embedded
   numbering, e.g. 2302, 2306-2307, 2342-2343, 2348-2352) are missing.
   Only comments are added here; code tokens are left byte-identical.  */
2301 /* Function vect_analyze_loop.
2303 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2304 for it. The different analyses will record information in the
2305 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2308 vect_analyze_loop (class loop
*loop
, loop_vec_info orig_loop_vinfo
,
2309 vec_info_shared
*shared
)
2311 auto_vector_sizes vector_sizes
;
/* Start from the autodetected vector size (0) and collect the sizes the
   target advertises; analysis below is re-tried once per size.  */
2313 /* Autodetect first vector size we try. */
2314 current_vector_size
= 0;
2315 targetm
.vectorize
.autovectorize_vector_sizes (&vector_sizes
,
2316 loop
->simdlen
!= 0);
2317 unsigned int next_size
= 0;
2319 DUMP_VECT_SCOPE ("analyze_loop_nest");
/* If an enclosing outer loop was already vectorized, this loop was
   vectorized as part of it -- fail early.  */
2321 if (loop_outer (loop
)
2322 && loop_vec_info_for_loop (loop_outer (loop
))
2323 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2324 return opt_loop_vec_info::failure_at (vect_location
,
2325 "outer-loop already vectorized.\n");
2327 if (!find_loop_nest (loop
, &shared
->loop_nest
))
2328 return opt_loop_vec_info::failure_at
2330 "not vectorized: loop nest containing two or more consecutive inner"
2331 " loops cannot be vectorized\n");
/* State carried across the per-vector-size retry loop: the first viable
   loop_vec_info and the vector size it was analyzed with.  */
2333 unsigned n_stmts
= 0;
2334 poly_uint64 autodetected_vector_size
= 0;
2335 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
2336 poly_uint64 first_vector_size
= 0;
2339 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2340 opt_loop_vec_info loop_vinfo
2341 = vect_analyze_loop_form (loop
, shared
);
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2346 "bad loop form.\n");
2347 gcc_checking_assert (first_loop_vinfo
== NULL
);
2353 if (orig_loop_vinfo
)
2354 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = orig_loop_vinfo
;
/* Run the main analysis for the current vector size.  */
2356 opt_result res
= vect_analyze_loop_2 (loop_vinfo
, fatal
, &n_stmts
);
2359 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
/* Presumably: when loop->simdlen is set, a vectorization factor that
   does not match it keeps the search going -- TODO confirm, the guard
   lines (orig. 2360-2361) are missing from this extraction.  */
2362 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
2363 (unsigned HOST_WIDE_INT
) loop
->simdlen
))
2365 if (first_loop_vinfo
== NULL
)
2367 first_loop_vinfo
= loop_vinfo
;
2368 first_vector_size
= current_vector_size
;
2376 delete first_loop_vinfo
;
2384 autodetected_vector_size
= current_vector_size
;
/* Skip the advertised size that equals the autodetected one, since it
   was already tried first.  */
2386 if (next_size
< vector_sizes
.length ()
2387 && known_eq (vector_sizes
[next_size
], autodetected_vector_size
))
2392 gcc_checking_assert (first_loop_vinfo
== NULL
);
2393 return opt_loop_vec_info::propagate_failure (res
);
/* All sizes exhausted: commit the first viable size (attaching the
   loop_vec_info to loop->aux) or propagate the failure.  */
2396 if (next_size
== vector_sizes
.length ()
2397 || known_eq (current_vector_size
, 0U))
2399 if (first_loop_vinfo
)
2401 current_vector_size
= first_vector_size
;
2402 loop
->aux
= (loop_vec_info
) first_loop_vinfo
;
2403 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_NOTE
, vect_location
,
2406 "***** Choosing vector size ");
2407 dump_dec (MSG_NOTE
, current_vector_size
);
2408 dump_printf (MSG_NOTE
, "\n");
2410 return first_loop_vinfo
;
2413 return opt_loop_vec_info::propagate_failure (res
);
2416 /* Try the next biggest vector size. */
2417 current_vector_size
= vector_sizes
[next_size
++];
2418 if (dump_enabled_p ())
2420 dump_printf_loc (MSG_NOTE
, vect_location
,
2421 "***** Re-trying analysis with "
2423 dump_dec (MSG_NOTE
, current_vector_size
);
2424 dump_printf (MSG_NOTE
, "\n");
2429 /* Return true if there is an in-order reduction function for CODE, storing
2430 it in *REDUC_FN if so. */
/* NOTE(review): the dispatch on CODE (orig. lines 2434-2437) is missing
   from this extraction; only the visible assignment remains.  Presumably
   PLUS-like codes map to IFN_FOLD_LEFT_PLUS and others return false --
   TODO confirm against the full source.  */
2433 fold_left_reduction_fn (tree_code code
, internal_fn
*reduc_fn
)
2438 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
2446 /* Function reduction_fn_for_scalar_code
2449 CODE - tree_code of a reduction operations.
2452 REDUC_FN - the corresponding internal function to be used to reduce the
2453 vector of partial results into a single scalar result, or IFN_LAST
2454 if the operation is a supported reduction operation, but does not have
2455 such an internal function.
2457 Return FALSE if CODE currently cannot be vectorized as reduction. */
/* NOTE(review): the switch's case labels (and the return statements
   between cases) are missing from this extraction; only the *reduc_fn
   assignments survive.  From the IFN names, the mapping is presumably
   MAX->REDUC_MAX, MIN->REDUC_MIN, PLUS->REDUC_PLUS, AND->REDUC_AND,
   IOR->REDUC_IOR, XOR->REDUC_XOR, with IFN_LAST for supported codes that
   lack an internal function -- TODO confirm against the full source.  */
2460 reduction_fn_for_scalar_code (enum tree_code code
, internal_fn
*reduc_fn
)
2465 *reduc_fn
= IFN_REDUC_MAX
;
2469 *reduc_fn
= IFN_REDUC_MIN
;
2473 *reduc_fn
= IFN_REDUC_PLUS
;
2477 *reduc_fn
= IFN_REDUC_AND
;
2481 *reduc_fn
= IFN_REDUC_IOR
;
2485 *reduc_fn
= IFN_REDUC_XOR
;
2490 *reduc_fn
= IFN_LAST
;
2498 /* If there is a neutral value X such that SLP reduction NODE would not
2499 be affected by the introduction of additional X elements, return that X,
2500 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2501 is true if the SLP statements perform a single reduction, false if each
2502 statement performs an independent reduction. */
2505 neutral_op_for_slp_reduction (slp_tree slp_node
, tree_code code
,
/* Derive the scalar element type from the vector type of the first SLP
   scalar statement; LOOP is the loop containing that statement.  */
2508 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
2509 stmt_vec_info stmt_vinfo
= stmts
[0];
2510 tree vector_type
= STMT_VINFO_VECTYPE (stmt_vinfo
);
2511 tree scalar_type
= TREE_TYPE (vector_type
);
2512 class loop
*loop
= gimple_bb (stmt_vinfo
->stmt
)->loop_father
;
/* NOTE(review): the switch head and most case labels are missing from
   this extraction.  Visible: WIDEN_SUM-like codes return zero, some code
   (presumably MULT) returns one, some (presumably BIT_AND) returns
   all-ones, and MIN/MAX (per the surviving comment) return the PHI's
   preheader value -- TODO confirm the exact case labels.  */
2517 case WIDEN_SUM_EXPR
:
2524 return build_zero_cst (scalar_type
);
2527 return build_one_cst (scalar_type
);
2530 return build_all_ones_cst (scalar_type
);
2534 /* For MIN/MAX the initial values are neutral. A reduction chain
2535 has only a single initial value, so that value is neutral for
2538 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
,
2539 loop_preheader_edge (loop
));
2547 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2548 STMT is printed with a message MSG. */
/* Thin wrapper: forwards to dump_printf_loc with the current
   vect_location, printing MSG followed by the statement (%G).  */
2551 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
2553 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
2556 /* Return true if we need an in-order reduction for operation CODE
2557 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2558 overflow must wrap. */
/* NOTE(review): the switch body (orig. 2565-2571) and the returns after
   the integral/fixed-point tests (orig. 2578-2581, 2583-2586) are missing
   from this extraction.  Visible logic: floating point needs in-order
   reduction unless -fassociative-math; integral types are keyed on
   whether the operation can trap on overflow; SAT fixed-point is also
   tested -- presumably returning true.  TODO confirm.  */
2561 needs_fold_left_reduction_p (tree type
, tree_code code
)
2563 /* CHECKME: check for !flag_finite_math_only too? */
2564 if (SCALAR_FLOAT_TYPE_P (type
))
2572 return !flag_associative_math
;
2575 if (INTEGRAL_TYPE_P (type
))
2577 if (!operation_no_trapping_overflow (type
, code
))
2582 if (SAT_FIXED_POINT_TYPE_P (type
))
2588 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2589 reduction operation CODE has a handled computation expression. */
/* NOTE(review): mangled extraction; the do/while scaffolding and several
   brace lines are missing.  The function performs a DFS over use-def
   chains from PHI's latch-edge use back to the PHI result, recording the
   path in PATH, then validates it.  Comments below annotate only the
   visible logic.  */
2592 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
2593 tree loop_arg
, enum tree_code code
,
2594 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
/* VISITED guards against revisiting SSA names; LOOKFOR is the PHI result
   the walk must reach.  */
2596 auto_bitmap visited
;
2597 tree lookfor
= PHI_RESULT (phi
);
/* Position the PHI-use iterator on the latch argument LOOP_ARG.  */
2599 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
2600 while (USE_FROM_PTR (curr
) != loop_arg
)
2601 curr
= op_iter_next_use (&curri
);
2602 curri
.i
= curri
.numops
;
2605 path
.safe_push (std::make_pair (curri
, curr
));
2606 tree use
= USE_FROM_PTR (curr
);
/* Defining statements outside LOOP (or no-op defs) terminate descent;
   then backtrack by popping the path and resuming its iterator.  */
2609 gimple
*def
= SSA_NAME_DEF_STMT (use
);
2610 if (gimple_nop_p (def
)
2611 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
2616 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
2620 curr
= op_iter_next_use (&curri
);
2621 /* Skip already visited or non-SSA operands (from iterating
2623 while (curr
!= NULL_USE_OPERAND_P
2624 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
2625 || ! bitmap_set_bit (visited
,
2627 (USE_FROM_PTR (curr
)))));
2629 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
2630 if (curr
== NULL_USE_OPERAND_P
)
/* Descend into DEF: PHIs use the phi-use iterator, other statements the
   plain use iterator, again skipping visited/non-SSA operands.  */
2635 if (gimple_code (def
) == GIMPLE_PHI
)
2636 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
2638 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
2639 while (curr
!= NULL_USE_OPERAND_P
2640 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
2641 || ! bitmap_set_bit (visited
,
2643 (USE_FROM_PTR (curr
)))))
2644 curr
= op_iter_next_use (&curri
);
2645 if (curr
== NULL_USE_OPERAND_P
)
/* Optional dump of the discovered path.  */
2650 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2652 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ")
2654 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
2655 FOR_EACH_VEC_ELT (path
, i
, x
)
2656 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
2657 dump_printf (MSG_NOTE
, "\n");
2660 /* Check whether the reduction path detected is valid. */
2661 bool fail
= path
.length () == 0;
/* Each intermediate value must be single-use, produced by an assignment
   of the expected CODE, and appear as rhs1/rhs2/rhs3 so its operand
   index is computable.  A MINUS inside a PLUS reduction is tolerated but
   tracked as a per-iteration negation, which invalidates the path.  */
2663 for (unsigned i
= 1; i
< path
.length (); ++i
)
2665 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
2666 tree op
= USE_FROM_PTR (path
[i
].second
);
2667 if (! has_single_use (op
)
2668 || ! is_gimple_assign (use_stmt
)
2669 /* The following make sure we can compute the operand index
2670 easily plus it mostly disallows chaining via COND_EXPR condition
2672 || (gimple_assign_rhs1 (use_stmt
) != op
2673 && gimple_assign_rhs2 (use_stmt
) != op
2674 && gimple_assign_rhs3 (use_stmt
) != op
))
2679 if (gimple_assign_rhs_code (use_stmt
) != code
)
2681 if (code
== PLUS_EXPR
2682 && gimple_assign_rhs_code (use_stmt
) == MINUS_EXPR
)
2684 /* Track whether we negate the reduction value each iteration. */
2685 if (gimple_assign_rhs2 (use_stmt
) == op
)
2695 return ! fail
&& ! neg
;
/* Convenience overload: same as the six-argument check_reduction_path
   but with a local scratch PATH vector the caller does not need.  */
2699 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
2700 tree loop_arg
, enum tree_code code
)
2702 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
2703 return check_reduction_path (loc
, loop
, phi
, loop_arg
, code
, path
);
2708 /* Function vect_is_simple_reduction
2710 (1) Detect a cross-iteration def-use cycle that represents a simple
2711 reduction computation. We look for the following pattern:
2716 a2 = operation (a3, a1)
2723 a2 = operation (a3, a1)
2726 1. operation is commutative and associative and it is safe to
2727 change the order of the computation
2728 2. no uses for a2 in the loop (a2 is used out of the loop)
2729 3. no uses of a1 in the loop besides the reduction operation
2730 4. no uses of a1 outside the loop.
2732 Conditions 1,4 are tested here.
2733 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2735 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2738 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2742 inner loop (def of a3)
2745 (4) Detect condition expressions, ie:
2746 for (int i = 0; i < N; i++)
/* NOTE(review): mangled extraction -- braces, some conditions and
   several original lines are missing; comments below annotate only the
   visible logic.  Returns the stmt_vec_info of the reduction statement
   on success; the failure returns are not visible here.  */
2752 static stmt_vec_info
2753 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
2756 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
2757 gimple
*phi_use_stmt
= NULL
;
2758 imm_use_iterator imm_iter
;
2759 use_operand_p use_p
;
2761 *double_reduc
= false;
2762 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
2764 tree phi_name
= PHI_RESULT (phi
);
2765 /* ??? If there are no uses of the PHI result the inner loop reduction
2766 won't be detected as possibly double-reduction by vectorizable_reduction
2767 because that tries to walk the PHI arg from the preheader edge which
2768 can be constant. See PR60382. */
2769 if (has_zero_uses (phi_name
))
/* Count in-loop (non-debug) uses of the PHI result; uses outside the
   loop are rejected as "intermediate value used outside loop".  */
2771 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
2772 unsigned nphi_def_loop_uses
= 0;
2773 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
2775 gimple
*use_stmt
= USE_STMT (use_p
);
2776 if (is_gimple_debug (use_stmt
))
2779 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2783 "intermediate value used outside loop.\n");
2788 nphi_def_loop_uses
++;
2789 phi_use_stmt
= use_stmt
;
/* The latch-edge argument of the PHI must be an SSA name defined by a
   statement inside the loop.  */
2792 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
2793 if (TREE_CODE (latch_def
) != SSA_NAME
)
2795 if (dump_enabled_p ())
2796 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2797 "reduction: not ssa_name: %T\n", latch_def
);
2801 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
2803 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
/* Scan uses of the latch def: count in-loop uses, collect loop-closed
   PHIs, and detect whether this is the inner loop of a double
   reduction.  */
2806 bool nested_in_vect_loop
2807 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
2808 unsigned nlatch_def_loop_uses
= 0;
2809 auto_vec
<gphi
*, 3> lcphis
;
2810 bool inner_loop_of_double_reduc
= false;
2811 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
2813 gimple
*use_stmt
= USE_STMT (use_p
);
2814 if (is_gimple_debug (use_stmt
))
2816 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2817 nlatch_def_loop_uses
++;
2820 /* We can have more than one loop-closed PHI. */
2821 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
2822 if (nested_in_vect_loop
2823 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
2824 == vect_double_reduction_def
))
2825 inner_loop_of_double_reduc
= true;
2829 /* If we are vectorizing an inner reduction we are executing that
2830 in the original order only in case we are not dealing with a
2831 double reduction. */
2832 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
2834 if (dump_enabled_p ())
2835 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
2836 "detected nested cycle: ");
2837 return def_stmt_info
;
2840 /* If this isn't a nested cycle or if the nested cycle reduction value
2841 is used ouside of the inner loop we cannot handle uses of the reduction
2843 if (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1)
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2847 "reduction used in loop.\n");
2851 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2852 defined in the inner loop. */
2853 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
2855 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
2856 if (gimple_phi_num_args (def_stmt
) != 1
2857 || TREE_CODE (op1
) != SSA_NAME
)
2859 if (dump_enabled_p ())
2860 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2861 "unsupported phi node definition.\n");
/* Double-reduction pattern: the PHI's single argument is defined by an
   assignment in the inner loop and the outer PHI result is used by a
   PHI inside the inner loop.  */
2866 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
2867 if (gimple_bb (def1
)
2868 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2870 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
2871 && is_gimple_assign (def1
)
2872 && is_a
<gphi
*> (phi_use_stmt
)
2873 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
2875 if (dump_enabled_p ())
2876 report_vect_op (MSG_NOTE
, def_stmt
,
2877 "detected double reduction: ");
2879 *double_reduc
= true;
2880 return def_stmt_info
;
/* From here on the defining statement must be a plain assignment.  */
2886 gassign
*def_stmt
= dyn_cast
<gassign
*> (def_stmt_info
->stmt
);
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2891 "reduction: unhandled reduction operation: %G",
2892 def_stmt_info
->stmt
);
2895 enum tree_code code
= gimple_assign_rhs_code (def_stmt
);
2897 /* We can handle "res -= x[i]", which is non-associative by
2898 simply rewriting this into "res += -x[i]". Avoid changing
2899 gimple instruction for the first simple tests and only do this
2900 if we're allowed to change code at all. */
2901 if (code
== MINUS_EXPR
&& gimple_assign_rhs2 (def_stmt
) != phi_name
)
/* Extract the two reduction operands: rhs2/rhs3 for COND_EXPR (which
   also marks a COND_REDUCTION unless nested), rhs1/rhs2 for binary
   operations; anything else is rejected.  */
2905 if (code
== COND_EXPR
)
2907 if (! nested_in_vect_loop
)
2908 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
2909 op1
= gimple_assign_rhs2 (def_stmt
);
2910 op2
= gimple_assign_rhs3 (def_stmt
);
2912 else if (get_gimple_rhs_class (code
) == GIMPLE_BINARY_RHS
)
2914 op1
= gimple_assign_rhs1 (def_stmt
);
2915 op2
= gimple_assign_rhs2 (def_stmt
);
2919 if (dump_enabled_p ())
2920 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2921 "reduction: not handled operation: ");
2925 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
2927 if (dump_enabled_p ())
2928 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
2929 "reduction: both uses not ssa_names: ");
2934 /* Reduction is safe. We're dealing with one of the following:
2935 1) integer arithmetic and no trapv
2936 2) floating point arithmetic, and special flags permit this optimization
2937 3) nested cycle (i.e., outer loop vectorization). */
2939 /* Check for the simple case that one def is the reduction def,
2940 defined by the PHI node. */
2941 stmt_vec_info def1_info
= loop_info
->lookup_def (op1
);
2942 stmt_vec_info def2_info
= loop_info
->lookup_def (op2
);
2943 if (def2_info
&& def2_info
->stmt
== phi
)
/* REDUC_IDX records which operand is the reduction input; COND_EXPR
   shifts it by one because rhs1 is the condition.  */
2945 STMT_VINFO_REDUC_IDX (def_stmt_info
) = 1 + (code
== COND_EXPR
? 1 : 0);
2946 if (dump_enabled_p ())
2947 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2948 return def_stmt_info
;
2950 else if (def1_info
&& def1_info
->stmt
== phi
)
2952 STMT_VINFO_REDUC_IDX (def_stmt_info
) = 0 + (code
== COND_EXPR
? 1 : 0);
2953 if (dump_enabled_p ())
2954 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
2955 return def_stmt_info
;
2958 /* Look for the expression computing latch_def from then loop PHI result
2959 in a way involving more than one stmt. */
2960 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
2961 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, code
,
2964 /* Try building an SLP reduction chain for which the additional
2965 restriction is that all operations in the chain are the same. */
2966 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
2968 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
/* Walk the path backwards, recording each statement's reduction operand
   index and collecting the chain.  */
2969 for (i
= path
.length () - 1; i
>= 1; --i
)
2971 gimple
*stmt
= USE_STMT (path
[i
].second
);
2972 if (gimple_assign_rhs_code (stmt
) != code
)
2973 is_slp_reduc
= false;
2974 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
2975 STMT_VINFO_REDUC_IDX (stmt_info
)
2976 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (stmt
);
2977 reduc_chain
.safe_push (stmt_info
);
/* Link the chain elements as a reduction group headed by the first
   statement and record it for SLP detection.  */
2981 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
2983 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
2984 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
2986 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
2987 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
2989 /* Save the chain for further analysis in SLP detection. */
2990 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
2991 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
2993 if (dump_enabled_p ())
2994 report_vect_op (MSG_NOTE
, def_stmt
,
2995 "reduction: detected reduction chain: ");
2998 return def_stmt_info
;
3001 if (dump_enabled_p ())
3003 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3004 "reduction: unknown pattern: ");
3010 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
/* NOTE(review): mangled extraction; braces and the trailing return of
   the accumulated RETVAL (orig. lines past 3060) are missing.  Computes
   *PEEL_ITERS_EPILOGUE and accumulates prologue/epilogue statement costs
   into the two cost vectors via record_stmt_cost.  */
3012 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
3013 int *peel_iters_epilogue
,
3014 stmt_vector_for_cost
*scalar_cost_vec
,
3015 stmt_vector_for_cost
*prologue_cost_vec
,
3016 stmt_vector_for_cost
*epilogue_cost_vec
)
3019 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
/* Unknown iteration count: assume vf/2 epilogue iterations and charge a
   taken branch for each peeled loop.  */
3021 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
3023 *peel_iters_epilogue
= assumed_vf
/ 2;
3024 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_NOTE
, vect_location
,
3026 "cost model: epilogue peel iters set to vf/2 "
3027 "because loop iterations are unknown .\n");
3029 /* If peeled iterations are known but number of scalar loop
3030 iterations are unknown, count a taken branch per peeled loop. */
3031 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3032 NULL
, 0, vect_prologue
);
3033 retval
+= record_stmt_cost (epilogue_cost_vec
, 1, cond_branch_taken
,
3034 NULL
, 0, vect_epilogue
);
/* Known iteration count: clamp the prologue peel to NITERS and derive
   the epilogue peel from the remainder modulo the vectorization
   factor.  */
3038 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
3039 peel_iters_prologue
= niters
< peel_iters_prologue
?
3040 niters
: peel_iters_prologue
;
3041 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
3042 /* If we need to peel for gaps, but no peeling is required, we have to
3043 peel VF iterations. */
3044 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
3045 *peel_iters_epilogue
= assumed_vf
;
/* Charge each scalar statement's cost once per peeled prologue and
   epilogue iteration.  */
3048 stmt_info_for_cost
*si
;
3050 if (peel_iters_prologue
)
3051 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3052 retval
+= record_stmt_cost (prologue_cost_vec
,
3053 si
->count
* peel_iters_prologue
,
3054 si
->kind
, si
->stmt_info
, si
->misalign
,
3056 if (*peel_iters_epilogue
)
3057 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3058 retval
+= record_stmt_cost (epilogue_cost_vec
,
3059 si
->count
* *peel_iters_epilogue
,
3060 si
->kind
, si
->stmt_info
, si
->misalign
,
3066 /* Function vect_estimate_min_profitable_iters
3068 Return the number of iterations required for the vector version of the
3069 loop to be profitable relative to the cost of the scalar version of the
3072 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3073 of iterations for vectorization. -1 value means loop vectorization
3074 is not profitable. This returned value may be used for dynamic
3075 profitability check.
3077 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3078 for static check against estimated number of iterations. */
3081 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
3082 int *ret_min_profitable_niters
,
3083 int *ret_min_profitable_estimate
)
3085 int min_profitable_iters
;
3086 int min_profitable_estimate
;
3087 int peel_iters_prologue
;
3088 int peel_iters_epilogue
;
3089 unsigned vec_inside_cost
= 0;
3090 int vec_outside_cost
= 0;
3091 unsigned vec_prologue_cost
= 0;
3092 unsigned vec_epilogue_cost
= 0;
3093 int scalar_single_iter_cost
= 0;
3094 int scalar_outside_cost
= 0;
3095 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3096 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
3097 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3099 /* Cost model disabled. */
3100 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
3102 if (dump_enabled_p ())
3103 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
3104 *ret_min_profitable_niters
= 0;
3105 *ret_min_profitable_estimate
= 0;
3109 /* Requires loop versioning tests to handle misalignment. */
3110 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
3112 /* FIXME: Make cost depend on complexity of individual check. */
3113 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
3114 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
3116 if (dump_enabled_p ())
3117 dump_printf (MSG_NOTE
,
3118 "cost model: Adding cost of checks for loop "
3119 "versioning to treat misalignment.\n");
3122 /* Requires loop versioning with alias checks. */
3123 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3125 /* FIXME: Make cost depend on complexity of individual check. */
3126 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
3127 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
3129 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
3131 /* Count LEN - 1 ANDs and LEN comparisons. */
3132 (void) add_stmt_cost (target_cost_data
, len
* 2 - 1, scalar_stmt
,
3133 NULL
, 0, vect_prologue
);
3134 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
3137 /* Count LEN - 1 ANDs and LEN comparisons. */
3138 unsigned int nstmts
= len
* 2 - 1;
3139 /* +1 for each bias that needs adding. */
3140 for (unsigned int i
= 0; i
< len
; ++i
)
3141 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
3143 (void) add_stmt_cost (target_cost_data
, nstmts
, scalar_stmt
,
3144 NULL
, 0, vect_prologue
);
3146 if (dump_enabled_p ())
3147 dump_printf (MSG_NOTE
,
3148 "cost model: Adding cost of checks for loop "
3149 "versioning aliasing.\n");
3152 /* Requires loop versioning with niter checks. */
3153 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
3155 /* FIXME: Make cost depend on complexity of individual check. */
3156 (void) add_stmt_cost (target_cost_data
, 1, vector_stmt
, NULL
, 0,
3158 if (dump_enabled_p ())
3159 dump_printf (MSG_NOTE
,
3160 "cost model: Adding cost of checks for loop "
3161 "versioning niters.\n");
3164 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3165 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
, NULL
, 0,
3168 /* Count statements in scalar loop. Using this as scalar cost for a single
3171 TODO: Add outer loop support.
3173 TODO: Consider assigning different costs to different scalar
3176 scalar_single_iter_cost
3177 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
);
3179 /* Add additional cost for the peeled instructions in prologue and epilogue
3180 loop. (For fully-masked loops there will be no peeling.)
3182 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3183 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3185 TODO: Build an expression that represents peel_iters for prologue and
3186 epilogue to be used in a run-time test. */
3188 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3190 peel_iters_prologue
= 0;
3191 peel_iters_epilogue
= 0;
3193 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
3195 /* We need to peel exactly one iteration. */
3196 peel_iters_epilogue
+= 1;
3197 stmt_info_for_cost
*si
;
3199 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
3201 (void) add_stmt_cost (target_cost_data
, si
->count
,
3202 si
->kind
, si
->stmt_info
, si
->misalign
,
3208 peel_iters_prologue
= assumed_vf
/ 2;
3209 if (dump_enabled_p ())
3210 dump_printf (MSG_NOTE
, "cost model: "
3211 "prologue peel iters set to vf/2.\n");
3213 /* If peeling for alignment is unknown, loop bound of main loop becomes
3215 peel_iters_epilogue
= assumed_vf
/ 2;
3216 if (dump_enabled_p ())
3217 dump_printf (MSG_NOTE
, "cost model: "
3218 "epilogue peel iters set to vf/2 because "
3219 "peeling for alignment is unknown.\n");
3221 /* If peeled iterations are unknown, count a taken branch and a not taken
3222 branch per peeled loop. Even if scalar loop iterations are known,
3223 vector iterations are not known since peeled prologue iterations are
3224 not known. Hence guards remain the same. */
3225 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
3226 NULL
, 0, vect_prologue
);
3227 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
3228 NULL
, 0, vect_prologue
);
3229 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
3230 NULL
, 0, vect_epilogue
);
3231 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
3232 NULL
, 0, vect_epilogue
);
3233 stmt_info_for_cost
*si
;
3235 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
3237 (void) add_stmt_cost (target_cost_data
,
3238 si
->count
* peel_iters_prologue
,
3239 si
->kind
, si
->stmt_info
, si
->misalign
,
3241 (void) add_stmt_cost (target_cost_data
,
3242 si
->count
* peel_iters_epilogue
,
3243 si
->kind
, si
->stmt_info
, si
->misalign
,
3249 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
3250 stmt_info_for_cost
*si
;
3252 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3254 prologue_cost_vec
.create (2);
3255 epilogue_cost_vec
.create (2);
3256 peel_iters_prologue
= npeel
;
3258 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
3259 &peel_iters_epilogue
,
3260 &LOOP_VINFO_SCALAR_ITERATION_COST
3263 &epilogue_cost_vec
);
3265 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
3266 (void) add_stmt_cost (data
, si
->count
, si
->kind
, si
->stmt_info
,
3267 si
->misalign
, vect_prologue
);
3269 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
3270 (void) add_stmt_cost (data
, si
->count
, si
->kind
, si
->stmt_info
,
3271 si
->misalign
, vect_epilogue
);
3273 prologue_cost_vec
.release ();
3274 epilogue_cost_vec
.release ();
3277 /* FORNOW: The scalar outside cost is incremented in one of the
3280 1. The vectorizer checks for alignment and aliasing and generates
3281 a condition that allows dynamic vectorization. A cost model
3282 check is ANDED with the versioning condition. Hence scalar code
3283 path now has the added cost of the versioning check.
3285 if (cost > th & versioning_check)
3288 Hence run-time scalar is incremented by not-taken branch cost.
3290 2. The vectorizer then checks if a prologue is required. If the
3291 cost model check was not done before during versioning, it has to
3292 be done before the prologue check.
3295 prologue = scalar_iters
3300 if (prologue == num_iters)
3303 Hence the run-time scalar cost is incremented by a taken branch,
3304 plus a not-taken branch, plus a taken branch cost.
3306 3. The vectorizer then checks if an epilogue is required. If the
3307 cost model check was not done before during prologue check, it
3308 has to be done with the epilogue check.
3314 if (prologue == num_iters)
3317 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3320 Hence the run-time scalar cost should be incremented by 2 taken
3323 TODO: The back end may reorder the BBS's differently and reverse
3324 conditions/branch directions. Change the estimates below to
3325 something more reasonable. */
3327 /* If the number of iterations is known and we do not do versioning, we can
3328 decide whether to vectorize at compile time. Hence the scalar version
3329 do not carry cost model guard costs. */
3330 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
3331 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3333 /* Cost model check occurs at versioning. */
3334 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3335 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
3338 /* Cost model check occurs at prologue generation. */
3339 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
3340 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
3341 + vect_get_stmt_cost (cond_branch_not_taken
);
3342 /* Cost model check occurs at epilogue generation. */
3344 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
3348 /* Complete the target-specific cost calculations. */
3349 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
3350 &vec_inside_cost
, &vec_epilogue_cost
);
3352 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
3354 if (dump_enabled_p ())
3356 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3357 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
3359 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
3361 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
3363 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
3364 scalar_single_iter_cost
);
3365 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
3366 scalar_outside_cost
);
3367 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
3369 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
3370 peel_iters_prologue
);
3371 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
3372 peel_iters_epilogue
);
3375 /* Calculate number of iterations required to make the vector version
3376 profitable, relative to the loop bodies only. The following condition
3378 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3380 SIC = scalar iteration cost, VIC = vector iteration cost,
3381 VOC = vector outside cost, VF = vectorization factor,
3382 NPEEL = prologue iterations + epilogue iterations,
3383 SOC = scalar outside cost for run time cost model check. */
3385 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
3387 if (saving_per_viter
<= 0)
3389 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
3390 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
3391 "vectorization did not happen for a simd loop");
3393 if (dump_enabled_p ())
3394 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3395 "cost model: the vector iteration cost = %d "
3396 "divided by the scalar iteration cost = %d "
3397 "is greater or equal to the vectorization factor = %d"
3399 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
3400 *ret_min_profitable_niters
= -1;
3401 *ret_min_profitable_estimate
= -1;
3405 /* ??? The "if" arm is written to handle all cases; see below for what
3406 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3407 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3409 /* Rewriting the condition above in terms of the number of
3410 vector iterations (vniters) rather than the number of
3411 scalar iterations (niters) gives:
3413 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3415 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3417 For integer N, X and Y when X > 0:
3419 N * X > Y <==> N >= (Y /[floor] X) + 1. */
3420 int outside_overhead
= (vec_outside_cost
3421 - scalar_single_iter_cost
* peel_iters_prologue
3422 - scalar_single_iter_cost
* peel_iters_epilogue
3423 - scalar_outside_cost
);
3424 /* We're only interested in cases that require at least one
3425 vector iteration. */
3426 int min_vec_niters
= 1;
3427 if (outside_overhead
> 0)
3428 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
3430 if (dump_enabled_p ())
3431 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
3434 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3436 /* Now that we know the minimum number of vector iterations,
3437 find the minimum niters for which the scalar cost is larger:
3439 SIC * niters > VIC * vniters + VOC - SOC
3441 We know that the minimum niters is no more than
3442 vniters * VF + NPEEL, but it might be (and often is) less
3443 than that if a partial vector iteration is cheaper than the
3444 equivalent scalar code. */
3445 int threshold
= (vec_inside_cost
* min_vec_niters
3447 - scalar_outside_cost
);
3449 min_profitable_iters
= 1;
3451 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
3454 /* Convert the number of vector iterations into a number of
3455 scalar iterations. */
3456 min_profitable_iters
= (min_vec_niters
* assumed_vf
3457 + peel_iters_prologue
3458 + peel_iters_epilogue
);
3462 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
3464 - vec_inside_cost
* peel_iters_prologue
3465 - vec_inside_cost
* peel_iters_epilogue
);
3466 if (min_profitable_iters
<= 0)
3467 min_profitable_iters
= 0;
3470 min_profitable_iters
/= saving_per_viter
;
3472 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
3473 <= (((int) vec_inside_cost
* min_profitable_iters
)
3474 + (((int) vec_outside_cost
- scalar_outside_cost
)
3476 min_profitable_iters
++;
3480 if (dump_enabled_p ())
3481 dump_printf (MSG_NOTE
,
3482 " Calculated minimum iters for profitability: %d\n",
3483 min_profitable_iters
);
3485 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
3486 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
3487 /* We want the vectorized loop to execute at least once. */
3488 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
3490 if (dump_enabled_p ())
3491 dump_printf_loc (MSG_NOTE
, vect_location
,
3492 " Runtime profitability threshold = %d\n",
3493 min_profitable_iters
);
3495 *ret_min_profitable_niters
= min_profitable_iters
;
3497 /* Calculate number of iterations required to make the vector version
3498 profitable, relative to the loop bodies only.
3500 Non-vectorized variant is SIC * niters and it must win over vector
3501 variant on the expected loop trip count. The following condition must hold true:
3502 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3504 if (vec_outside_cost
<= 0)
3505 min_profitable_estimate
= 0;
3506 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3508 /* This is a repeat of the code above, but with + SOC rather
3510 int outside_overhead
= (vec_outside_cost
3511 - scalar_single_iter_cost
* peel_iters_prologue
3512 - scalar_single_iter_cost
* peel_iters_epilogue
3513 + scalar_outside_cost
);
3514 int min_vec_niters
= 1;
3515 if (outside_overhead
> 0)
3516 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
3518 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3520 int threshold
= (vec_inside_cost
* min_vec_niters
3522 + scalar_outside_cost
);
3523 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
3526 min_profitable_estimate
= (min_vec_niters
* assumed_vf
3527 + peel_iters_prologue
3528 + peel_iters_epilogue
);
3532 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
3534 - vec_inside_cost
* peel_iters_prologue
3535 - vec_inside_cost
* peel_iters_epilogue
)
3536 / ((scalar_single_iter_cost
* assumed_vf
)
3539 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE
, vect_location
,
3542 " Static estimate profitability threshold = %d\n",
3543 min_profitable_estimate
);
3545 *ret_min_profitable_estimate
= min_profitable_estimate
;
3548 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3549 vector elements (not bits) for a vector with NELT elements. */
3551 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
3552 vec_perm_builder
*sel
)
3554 /* The encoding is a single stepped pattern. Any wrap-around is handled
3555 by vec_perm_indices. */
3556 sel
->new_vector (nelt
, 1, 3);
3557 for (unsigned int i
= 0; i
< 3; i
++)
3558 sel
->quick_push (i
+ offset
);
3561 /* Checks whether the target supports whole-vector shifts for vectors of mode
3562 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3563 it supports vec_perm_const with masks for all necessary shift amounts. */
3565 have_whole_vector_shift (machine_mode mode
)
3567 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
3570 /* Variable-length vectors should be handled via the optab. */
3572 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
3575 vec_perm_builder sel
;
3576 vec_perm_indices indices
;
3577 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
3579 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
3580 indices
.new_vector (sel
, 2, nelt
);
3581 if (!can_vec_perm_const_p (mode
, indices
, false))
3587 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3588 functions. Design better to avoid maintenance issues. */
3590 /* Function vect_model_reduction_cost.
3592 Models cost for a reduction operation, including the vector ops
3593 generated within the strip-mine loop, the initial definition before
3594 the loop, and the epilogue code that must be generated. */
3597 vect_model_reduction_cost (stmt_vec_info stmt_info
, internal_fn reduc_fn
,
3598 vect_reduction_type reduction_type
,
3599 int ncopies
, stmt_vector_for_cost
*cost_vec
)
3601 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
;
3602 enum tree_code code
;
3606 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3607 class loop
*loop
= NULL
;
3610 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3612 /* Condition reductions generate two reductions in the loop. */
3613 if (reduction_type
== COND_REDUCTION
)
3616 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3617 mode
= TYPE_MODE (vectype
);
3618 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3620 code
= gimple_assign_rhs_code (orig_stmt_info
->stmt
);
3622 if (reduction_type
== EXTRACT_LAST_REDUCTION
3623 || reduction_type
== FOLD_LEFT_REDUCTION
)
3625 /* No extra instructions needed in the prologue. */
3628 if (reduction_type
== EXTRACT_LAST_REDUCTION
|| reduc_fn
!= IFN_LAST
)
3629 /* Count one reduction-like operation per vector. */
3630 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
3631 stmt_info
, 0, vect_body
);
3634 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3635 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
3636 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
3637 vec_to_scalar
, stmt_info
, 0,
3639 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
3640 scalar_stmt
, stmt_info
, 0,
3646 /* Add in cost for initial definition.
3647 For cond reduction we have four vectors: initial index, step,
3648 initial result of the data reduction, initial value of the index
3650 int prologue_stmts
= reduction_type
== COND_REDUCTION
? 4 : 1;
3651 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
3652 scalar_to_vec
, stmt_info
, 0,
3655 /* Cost of reduction op inside loop. */
3656 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
3657 stmt_info
, 0, vect_body
);
3660 /* Determine cost of epilogue code.
3662 We have a reduction operator that will reduce the vector in one statement.
3663 Also requires scalar extract. */
3665 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
3667 if (reduc_fn
!= IFN_LAST
)
3669 if (reduction_type
== COND_REDUCTION
)
3671 /* An EQ stmt and an COND_EXPR stmt. */
3672 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
3673 vector_stmt
, stmt_info
, 0,
3675 /* Reduction of the max index and a reduction of the found
3677 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
3678 vec_to_scalar
, stmt_info
, 0,
3680 /* A broadcast of the max value. */
3681 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
3682 scalar_to_vec
, stmt_info
, 0,
3687 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
3688 stmt_info
, 0, vect_epilogue
);
3689 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
3690 vec_to_scalar
, stmt_info
, 0,
3694 else if (reduction_type
== COND_REDUCTION
)
3696 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
3697 /* Extraction of scalar elements. */
3698 epilogue_cost
+= record_stmt_cost (cost_vec
,
3699 2 * estimated_nunits
,
3700 vec_to_scalar
, stmt_info
, 0,
3702 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3703 epilogue_cost
+= record_stmt_cost (cost_vec
,
3704 2 * estimated_nunits
- 3,
3705 scalar_stmt
, stmt_info
, 0,
3708 else if (reduction_type
== EXTRACT_LAST_REDUCTION
3709 || reduction_type
== FOLD_LEFT_REDUCTION
)
3710 /* No extra instructions need in the epilogue. */
3714 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
3716 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info
->stmt
)));
3717 int element_bitsize
= tree_to_uhwi (bitsize
);
3718 int nelements
= vec_size_in_bits
/ element_bitsize
;
3720 if (code
== COND_EXPR
)
3723 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
3725 /* We have a whole vector shift available. */
3726 if (optab
!= unknown_optab
3727 && VECTOR_MODE_P (mode
)
3728 && optab_handler (optab
, mode
) != CODE_FOR_nothing
3729 && have_whole_vector_shift (mode
))
3731 /* Final reduction via vector shifts and the reduction operator.
3732 Also requires scalar extract. */
3733 epilogue_cost
+= record_stmt_cost (cost_vec
,
3734 exact_log2 (nelements
) * 2,
3735 vector_stmt
, stmt_info
, 0,
3737 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
3738 vec_to_scalar
, stmt_info
, 0,
3742 /* Use extracts and reduction op for final reduction. For N
3743 elements, we have N extracts and N-1 reduction ops. */
3744 epilogue_cost
+= record_stmt_cost (cost_vec
,
3745 nelements
+ nelements
- 1,
3746 vector_stmt
, stmt_info
, 0,
3751 if (dump_enabled_p ())
3752 dump_printf (MSG_NOTE
,
3753 "vect_model_reduction_cost: inside_cost = %d, "
3754 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
3755 prologue_cost
, epilogue_cost
);
3759 /* Function vect_model_induction_cost.
3761 Models cost for induction operations. */
3764 vect_model_induction_cost (stmt_vec_info stmt_info
, int ncopies
,
3765 stmt_vector_for_cost
*cost_vec
)
3767 unsigned inside_cost
, prologue_cost
;
3769 if (PURE_SLP_STMT (stmt_info
))
3772 /* loop cost for vec_loop. */
3773 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
3774 stmt_info
, 0, vect_body
);
3776 /* prologue cost for vec_init and vec_step. */
3777 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
3778 stmt_info
, 0, vect_prologue
);
3780 if (dump_enabled_p ())
3781 dump_printf_loc (MSG_NOTE
, vect_location
,
3782 "vect_model_induction_cost: inside_cost = %d, "
3783 "prologue_cost = %d .\n", inside_cost
, prologue_cost
);
3788 /* Function get_initial_def_for_reduction
3791 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3792 INIT_VAL - the initial value of the reduction variable
3795 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3796 of the reduction (used for adjusting the epilog - see below).
3797 Return a vector variable, initialized according to the operation that
3798 STMT_VINFO performs. This vector will be used as the initial value
3799 of the vector of partial results.
3801 Option1 (adjust in epilog): Initialize the vector as follows:
3802 add/bit or/xor: [0,0,...,0,0]
3803 mult/bit and: [1,1,...,1,1]
3804 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3805 and when necessary (e.g. add/mult case) let the caller know
3806 that it needs to adjust the result by init_val.
3808 Option2: Initialize the vector as follows:
3809 add/bit or/xor: [init_val,0,0,...,0]
3810 mult/bit and: [init_val,1,1,...,1]
3811 min/max/cond_expr: [init_val,init_val,...,init_val]
3812 and no adjustments are needed.
3814 For example, for the following code:
3820 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3821 For a vector of 4 units, we want to return either [0,0,0,init_val],
3822 or [0,0,0,0] and let the caller know that it needs to adjust
3823 the result at the end by 'init_val'.
3825 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3826 initialization vector is simpler (same element in all entries), if
3827 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3829 A cost model should help decide between these two schemes. */
3832 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo
,
3833 enum tree_code code
, tree init_val
,
3834 tree
*adjustment_def
)
3836 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
3837 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3838 tree scalar_type
= TREE_TYPE (init_val
);
3839 tree vectype
= get_vectype_for_scalar_type (scalar_type
);
3842 REAL_VALUE_TYPE real_init_val
= dconst0
;
3843 int int_init_val
= 0;
3844 gimple_seq stmts
= NULL
;
3846 gcc_assert (vectype
);
3848 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
3849 || SCALAR_FLOAT_TYPE_P (scalar_type
));
3851 gcc_assert (nested_in_vect_loop_p (loop
, stmt_vinfo
)
3852 || loop
== (gimple_bb (stmt_vinfo
->stmt
))->loop_father
);
3854 /* ADJUSTMENT_DEF is NULL when called from
3855 vect_create_epilog_for_reduction to vectorize double reduction. */
3857 *adjustment_def
= NULL
;
3861 case WIDEN_SUM_EXPR
:
3871 if (code
== MULT_EXPR
)
3873 real_init_val
= dconst1
;
3877 if (code
== BIT_AND_EXPR
)
3880 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
3881 def_for_init
= build_real (scalar_type
, real_init_val
);
3883 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
3885 if (adjustment_def
|| operand_equal_p (def_for_init
, init_val
, 0))
3887 /* Option1: the first element is '0' or '1' as well. */
3888 if (!operand_equal_p (def_for_init
, init_val
, 0))
3889 *adjustment_def
= init_val
;
3890 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
3893 else if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant ())
3895 /* Option2 (variable length): the first element is INIT_VAL. */
3896 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
3898 init_def
= gimple_build (&stmts
, CFN_VEC_SHL_INSERT
,
3899 vectype
, init_def
, init_val
);
3903 /* Option2: the first element is INIT_VAL. */
3904 tree_vector_builder
elts (vectype
, 1, 2);
3905 elts
.quick_push (init_val
);
3906 elts
.quick_push (def_for_init
);
3907 init_def
= gimple_build_vector (&stmts
, &elts
);
3916 init_val
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_val
);
3917 init_def
= gimple_build_vector_from_val (&stmts
, vectype
, init_val
);
3926 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
3930 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
3931 NUMBER_OF_VECTORS is the number of vector defs to create.
3932 If NEUTRAL_OP is nonnull, introducing extra elements of that
3933 value will not change the result. */
3936 get_initial_defs_for_reduction (slp_tree slp_node
,
3937 vec
<tree
> *vec_oprnds
,
3938 unsigned int number_of_vectors
,
3939 bool reduc_chain
, tree neutral_op
)
3941 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
3942 stmt_vec_info stmt_vinfo
= stmts
[0];
3943 unsigned HOST_WIDE_INT nunits
;
3944 unsigned j
, number_of_places_left_in_vector
;
3946 unsigned int group_size
= stmts
.length ();
3950 vector_type
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3952 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_reduction_def
);
3954 loop
= (gimple_bb (stmt_vinfo
->stmt
))->loop_father
;
3956 edge pe
= loop_preheader_edge (loop
);
3958 gcc_assert (!reduc_chain
|| neutral_op
);
3960 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3961 created vectors. It is greater than 1 if unrolling is performed.
3963 For example, we have two scalar operands, s1 and s2 (e.g., group of
3964 strided accesses of size two), while NUNITS is four (i.e., four scalars
3965 of this type can be packed in a vector). The output vector will contain
3966 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3969 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
3970 vectors containing the operands.
3972 For example, NUNITS is four as before, and the group size is 8
3973 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3974 {s5, s6, s7, s8}. */
3976 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
3977 nunits
= group_size
;
3979 number_of_places_left_in_vector
= nunits
;
3980 bool constant_p
= true;
3981 tree_vector_builder
elts (vector_type
, nunits
, 1);
3982 elts
.quick_grow (nunits
);
3983 gimple_seq ctor_seq
= NULL
;
3984 for (j
= 0; j
< nunits
* number_of_vectors
; ++j
)
3988 stmt_vinfo
= stmts
[i
];
3990 /* Get the def before the loop. In reduction chain we have only
3991 one initial value. Else we have as many as PHIs in the group. */
3993 op
= j
!= 0 ? neutral_op
: PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
, pe
);
3994 else if (((vec_oprnds
->length () + 1) * nunits
3995 - number_of_places_left_in_vector
>= group_size
)
3999 op
= PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
, pe
);
4001 /* Create 'vect_ = {op0,op1,...,opn}'. */
4002 number_of_places_left_in_vector
--;
4003 elts
[nunits
- number_of_places_left_in_vector
- 1] = op
;
4004 if (!CONSTANT_CLASS_P (op
))
4007 if (number_of_places_left_in_vector
== 0)
4010 if (constant_p
&& !neutral_op
4011 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4012 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4013 /* Build the vector directly from ELTS. */
4014 init
= gimple_build_vector (&ctor_seq
, &elts
);
4015 else if (neutral_op
)
4017 /* Build a vector of the neutral value and shift the
4018 other elements into place. */
4019 init
= gimple_build_vector_from_val (&ctor_seq
, vector_type
,
4022 while (k
> 0 && elts
[k
- 1] == neutral_op
)
4027 init
= gimple_build (&ctor_seq
, CFN_VEC_SHL_INSERT
,
4028 vector_type
, init
, elts
[k
]);
4033 /* First time round, duplicate ELTS to fill the
4034 required number of vectors. */
4035 duplicate_and_interleave (&ctor_seq
, vector_type
, elts
,
4036 number_of_vectors
, *vec_oprnds
);
4039 vec_oprnds
->quick_push (init
);
4041 number_of_places_left_in_vector
= nunits
;
4042 elts
.new_vector (vector_type
, nunits
, 1);
4043 elts
.quick_grow (nunits
);
4047 if (ctor_seq
!= NULL
)
4048 gsi_insert_seq_on_edge_immediate (pe
, ctor_seq
);
4051 /* For a statement STMT_INFO taking part in a reduction operation return
4052 the stmt_vec_info the meta information is stored on. */
4055 info_for_reduction (stmt_vec_info stmt_info
)
4057 stmt_info
= vect_orig_stmt (stmt_info
);
4058 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info
));
4059 if (!is_a
<gphi
*> (stmt_info
->stmt
))
4060 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
4061 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
4062 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
4064 if (gimple_phi_num_args (phi
) == 1)
4065 stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
4067 else if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
4069 edge pe
= loop_preheader_edge (gimple_bb (phi
)->loop_father
);
4071 = stmt_info
->vinfo
->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi
, pe
));
4072 if (info
&& STMT_VINFO_DEF_TYPE (info
) == vect_double_reduction_def
)
4078 /* Function vect_create_epilog_for_reduction
4080 Create code at the loop-epilog to finalize the result of a reduction
4083 STMT_INFO is the scalar reduction stmt that is being vectorized.
4084 SLP_NODE is an SLP node containing a group of reduction statements. The
4085 first one in this group is STMT_INFO.
4086 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4087 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4091 1. Completes the reduction def-use cycles.
4092 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4093 by calling the function specified by REDUC_FN if available, or by
4094 other means (whole-vector shifts or a scalar loop).
4095 The function also creates a new phi node at the loop exit to preserve
4096 loop-closed form, as illustrated below.
4098 The flow at the entry to this function:
4101 vec_def = phi <vec_init, null> # REDUCTION_PHI
4102 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4103 s_loop = scalar_stmt # (scalar) STMT_INFO
4105 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4109 The above is transformed by this function into:
4112 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4113 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4114 s_loop = scalar_stmt # (scalar) STMT_INFO
4116 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4117 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4118 v_out2 = reduce <v_out1>
4119 s_out3 = extract_field <v_out2, 0>
4120 s_out4 = adjust_result <s_out3>
4126 vect_create_epilog_for_reduction (stmt_vec_info stmt_info
,
4128 slp_instance slp_node_instance
)
4130 stmt_vec_info reduc_info
= info_for_reduction (stmt_info
);
4131 gcc_assert (reduc_info
->is_reduc_info
);
4132 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4133 /* For double reductions we need to get at the inner loop reduction
4134 stmt which has the meta info attached. Our stmt_info is that of the
4135 loop-closed PHI of the inner loop which we remember as
4136 def for the reduction PHI generation. */
4137 bool double_reduc
= false;
4138 stmt_vec_info rdef_info
= stmt_info
;
4139 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
4141 gcc_assert (!slp_node
);
4142 double_reduc
= true;
4143 stmt_info
= loop_vinfo
->lookup_def (gimple_phi_arg_def
4144 (stmt_info
->stmt
, 0));
4145 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
4147 gphi
*reduc_def_stmt
4148 = as_a
<gphi
*> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))->stmt
);
4149 enum tree_code code
= STMT_VINFO_REDUC_CODE (reduc_info
);
4150 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
4151 tree neutral_op
= NULL_TREE
;
4154 = neutral_op_for_slp_reduction (slp_node_instance
->reduc_phis
, code
,
4155 REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
4156 stmt_vec_info prev_phi_info
;
4159 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
4160 basic_block exit_bb
;
4163 gimple
*new_phi
= NULL
, *phi
;
4164 stmt_vec_info phi_info
;
4165 gimple_stmt_iterator exit_gsi
;
4167 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
4168 gimple
*epilog_stmt
= NULL
;
4172 tree orig_name
, scalar_result
;
4173 imm_use_iterator imm_iter
, phi_imm_iter
;
4174 use_operand_p use_p
, phi_use_p
;
4176 bool nested_in_vect_loop
= false;
4177 auto_vec
<gimple
*> new_phis
;
4179 auto_vec
<tree
> scalar_results
;
4180 unsigned int group_size
= 1, k
;
4181 auto_vec
<gimple
*> phis
;
4182 bool slp_reduc
= false;
4183 bool direct_slp_reduc
;
4184 tree new_phi_result
;
4185 tree induction_index
= NULL_TREE
;
4188 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
4190 if (nested_in_vect_loop_p (loop
, stmt_info
))
4194 nested_in_vect_loop
= true;
4195 gcc_assert (!slp_node
);
4197 gcc_assert (!nested_in_vect_loop
|| double_reduc
);
4199 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4200 gcc_assert (vectype
);
4201 mode
= TYPE_MODE (vectype
);
4203 tree initial_def
= NULL
;
4204 tree induc_val
= NULL_TREE
;
4205 tree adjustment_def
= NULL
;
4210 /* Get at the scalar def before the loop, that defines the initial value
4211 of the reduction variable. */
4212 initial_def
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
4213 loop_preheader_edge (loop
));
4214 /* Optimize: for induction condition reduction, if we can't use zero
4215 for induc_val, use initial_def. */
4216 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
4217 induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
4218 else if (double_reduc
)
4220 else if (nested_in_vect_loop
)
4223 adjustment_def
= STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
);
4230 vec_num
= SLP_TREE_VEC_STMTS (slp_node_instance
->reduc_phis
).length ();
4237 phi_info
= STMT_VINFO_VEC_STMT (loop_vinfo
->lookup_stmt (reduc_def_stmt
));
4241 phi_info
= STMT_VINFO_RELATED_STMT (phi_info
);
4246 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4247 which is updated with the current index of the loop for every match of
4248 the original loop's cond_expr (VEC_STMT). This results in a vector
4249 containing the last time the condition passed for that vector lane.
4250 The first match will be a 1 to allow 0 to be used for non-matching
4251 indexes. If there are no matches at all then the vector will be all
4253 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
4255 tree indx_before_incr
, indx_after_incr
;
4256 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype
);
4258 gimple
*vec_stmt
= STMT_VINFO_VEC_STMT (stmt_info
)->stmt
;
4259 gcc_assert (gimple_assign_rhs_code (vec_stmt
) == VEC_COND_EXPR
);
4261 int scalar_precision
4262 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype
)));
4263 tree cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
4264 tree cr_index_vector_type
= build_vector_type
4265 (cr_index_scalar_type
, TYPE_VECTOR_SUBPARTS (vectype
));
4267 /* First we create a simple vector induction variable which starts
4268 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4269 vector size (STEP). */
4271 /* Create a {1,2,3,...} vector. */
4272 tree series_vect
= build_index_vector (cr_index_vector_type
, 1, 1);
4274 /* Create a vector of the step value. */
4275 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
4276 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
4278 /* Create an induction variable. */
4279 gimple_stmt_iterator incr_gsi
;
4281 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4282 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
4283 insert_after
, &indx_before_incr
, &indx_after_incr
);
4285 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4286 filled with zeros (VEC_ZERO). */
4288 /* Create a vector of 0s. */
4289 tree zero
= build_zero_cst (cr_index_scalar_type
);
4290 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
4292 /* Create a vector phi node. */
4293 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
4294 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
4295 loop_vinfo
->add_stmt (new_phi
);
4296 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
4297 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4299 /* Now take the condition from the loops original cond_expr
4300 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4301 every match uses values from the induction variable
4302 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4304 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4305 the new cond_expr (INDEX_COND_EXPR). */
4307 /* Duplicate the condition from vec_stmt. */
4308 tree ccompare
= unshare_expr (gimple_assign_rhs1 (vec_stmt
));
4310 /* Create a conditional, where the condition is taken from vec_stmt
4311 (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR:
4312 the reduction phi corresponds to NEW_PHI_TREE and the new values
4313 correspond to INDEX_BEFORE_INCR. */
4314 gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info
) >= 1);
4315 tree index_cond_expr
;
4316 if (STMT_VINFO_REDUC_IDX (stmt_info
) == 2)
4317 index_cond_expr
= build3 (VEC_COND_EXPR
, cr_index_vector_type
,
4318 ccompare
, indx_before_incr
, new_phi_tree
);
4320 index_cond_expr
= build3 (VEC_COND_EXPR
, cr_index_vector_type
,
4321 ccompare
, new_phi_tree
, indx_before_incr
);
4322 induction_index
= make_ssa_name (cr_index_vector_type
);
4323 gimple
*index_condition
= gimple_build_assign (induction_index
,
4325 gsi_insert_before (&incr_gsi
, index_condition
, GSI_SAME_STMT
);
4326 stmt_vec_info index_vec_info
= loop_vinfo
->add_stmt (index_condition
);
4327 STMT_VINFO_VECTYPE (index_vec_info
) = cr_index_vector_type
;
4329 /* Update the phi with the vec cond. */
4330 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
4331 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
4334 /* 2. Create epilog code.
4335 The reduction epilog code operates across the elements of the vector
4336 of partial results computed by the vectorized loop.
4337 The reduction epilog code consists of:
4339 step 1: compute the scalar result in a vector (v_out2)
4340 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4341 step 3: adjust the scalar result (s_out3) if needed.
4343 Step 1 can be accomplished using one of the following three schemes:
4344 (scheme 1) using reduc_fn, if available.
4345 (scheme 2) using whole-vector shifts, if available.
4346 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4349 The overall epilog code looks like this:
4351 s_out0 = phi <s_loop> # original EXIT_PHI
4352 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4353 v_out2 = reduce <v_out1> # step 1
4354 s_out3 = extract_field <v_out2, 0> # step 2
4355 s_out4 = adjust_result <s_out3> # step 3
4357 (step 3 is optional, and steps 1 and 2 may be combined).
4358 Lastly, the uses of s_out0 are replaced by s_out4. */
4361 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4362 v_out1 = phi <VECT_DEF>
4363 Store them in NEW_PHIS. */
4366 exit_bb
= single_exit (loop
)->dest
;
4367 prev_phi_info
= NULL
;
4368 new_phis
.create (slp_node
? vec_num
: ncopies
);
4369 for (unsigned i
= 0; i
< vec_num
; i
++)
4372 def
= gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]->stmt
);
4374 def
= gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info
)->stmt
);
4375 for (j
= 0; j
< ncopies
; j
++)
4377 tree new_def
= copy_ssa_name (def
);
4378 phi
= create_phi_node (new_def
, exit_bb
);
4379 stmt_vec_info phi_info
= loop_vinfo
->add_stmt (phi
);
4381 new_phis
.quick_push (phi
);
4384 def
= vect_get_vec_def_for_stmt_copy (loop_vinfo
, def
);
4385 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi_info
;
4388 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
4389 prev_phi_info
= phi_info
;
4393 exit_gsi
= gsi_after_labels (exit_bb
);
4395 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4396 (i.e. when reduc_fn is not available) and in the final adjustment
4397 code (if needed). Also get the original scalar reduction variable as
4398 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4399 represents a reduction pattern), the tree-code and scalar-def are
4400 taken from the original stmt that the pattern-stmt (STMT) replaces.
4401 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4402 are taken from STMT. */
4404 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4405 if (orig_stmt_info
!= stmt_info
)
4407 /* Reduction pattern */
4408 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
4409 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info
) == stmt_info
);
4412 scalar_dest
= gimple_assign_lhs (orig_stmt_info
->stmt
);
4413 scalar_type
= TREE_TYPE (scalar_dest
);
4414 scalar_results
.create (group_size
);
4415 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
4416 bitsize
= TYPE_SIZE (scalar_type
);
4418 /* SLP reduction without reduction chain, e.g.,
4422 b2 = operation (b1) */
4423 slp_reduc
= (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
4425 /* True if we should implement SLP_REDUC using native reduction operations
4426 instead of scalar operations. */
4427 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
4429 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
4431 /* In case of reduction chain, e.g.,
4434 a3 = operation (a2),
4436 we may end up with more than one vector result. Here we reduce them to
4438 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) || direct_slp_reduc
)
4440 tree first_vect
= PHI_RESULT (new_phis
[0]);
4441 gassign
*new_vec_stmt
= NULL
;
4442 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4443 for (k
= 1; k
< new_phis
.length (); k
++)
4445 gimple
*next_phi
= new_phis
[k
];
4446 tree second_vect
= PHI_RESULT (next_phi
);
4447 tree tem
= make_ssa_name (vec_dest
, new_vec_stmt
);
4448 new_vec_stmt
= gimple_build_assign (tem
, code
,
4449 first_vect
, second_vect
);
4450 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
4454 new_phi_result
= first_vect
;
4457 new_phis
.truncate (0);
4458 new_phis
.safe_push (new_vec_stmt
);
4461 /* Likewise if we couldn't use a single defuse cycle. */
4462 else if (ncopies
> 1)
4464 gcc_assert (new_phis
.length () == 1);
4465 tree first_vect
= PHI_RESULT (new_phis
[0]);
4466 gassign
*new_vec_stmt
= NULL
;
4467 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4468 stmt_vec_info next_phi_info
= loop_vinfo
->lookup_stmt (new_phis
[0]);
4469 for (int k
= 1; k
< ncopies
; ++k
)
4471 next_phi_info
= STMT_VINFO_RELATED_STMT (next_phi_info
);
4472 tree second_vect
= PHI_RESULT (next_phi_info
->stmt
);
4473 tree tem
= make_ssa_name (vec_dest
, new_vec_stmt
);
4474 new_vec_stmt
= gimple_build_assign (tem
, code
,
4475 first_vect
, second_vect
);
4476 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
4479 new_phi_result
= first_vect
;
4480 new_phis
.truncate (0);
4481 new_phis
.safe_push (new_vec_stmt
);
4484 new_phi_result
= PHI_RESULT (new_phis
[0]);
4486 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
4487 && reduc_fn
!= IFN_LAST
)
4489 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4490 various data values where the condition matched and another vector
4491 (INDUCTION_INDEX) containing all the indexes of those matches. We
4492 need to extract the last matching index (which will be the index with
4493 highest value) and use this to index into the data vector.
4494 For the case where there were no matches, the data vector will contain
4495 all default values and the index vector will be all zeros. */
4497 /* Get various versions of the type of the vector of indexes. */
4498 tree index_vec_type
= TREE_TYPE (induction_index
);
4499 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
4500 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
4501 tree index_vec_cmp_type
= build_same_sized_truth_vector_type
4504 /* Get an unsigned integer version of the type of the data vector. */
4505 int scalar_precision
4506 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
4507 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
4508 tree vectype_unsigned
= build_vector_type
4509 (scalar_type_unsigned
, TYPE_VECTOR_SUBPARTS (vectype
));
4511 /* First we need to create a vector (ZERO_VEC) of zeros and another
4512 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4513 can create using a MAX reduction and then expanding.
4514 In the case where the loop never made any matches, the max index will
4517 /* Vector of {0, 0, 0,...}. */
4518 tree zero_vec
= make_ssa_name (vectype
);
4519 tree zero_vec_rhs
= build_zero_cst (vectype
);
4520 gimple
*zero_vec_stmt
= gimple_build_assign (zero_vec
, zero_vec_rhs
);
4521 gsi_insert_before (&exit_gsi
, zero_vec_stmt
, GSI_SAME_STMT
);
4523 /* Find maximum value from the vector of found indexes. */
4524 tree max_index
= make_ssa_name (index_scalar_type
);
4525 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
4526 1, induction_index
);
4527 gimple_call_set_lhs (max_index_stmt
, max_index
);
4528 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
4530 /* Vector of {max_index, max_index, max_index,...}. */
4531 tree max_index_vec
= make_ssa_name (index_vec_type
);
4532 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
4534 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
4536 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
4538 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4539 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4540 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4541 otherwise. Only one value should match, resulting in a vector
4542 (VEC_COND) with one data value and the rest zeros.
4543 In the case where the loop never made any matches, every index will
4544 match, resulting in a vector with all data values (which will all be
4545 the default value). */
4547 /* Compare the max index vector to the vector of found indexes to find
4548 the position of the max value. */
4549 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
4550 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
4553 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
4555 /* Use the compare to choose either values from the data vector or
4557 tree vec_cond
= make_ssa_name (vectype
);
4558 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
4559 vec_compare
, new_phi_result
,
4561 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
4563 /* Finally we need to extract the data value from the vector (VEC_COND)
4564 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4565 reduction, but because this doesn't exist, we can use a MAX reduction
4566 instead. The data value might be signed or a float so we need to cast
4568 In the case where the loop never made any matches, the data values are
4569 all identical, and so will reduce down correctly. */
4571 /* Make the matched data values unsigned. */
4572 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
4573 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
4575 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
4578 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
4580 /* Reduce down to a scalar value. */
4581 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
4582 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
4584 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
4585 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
4587 /* Convert the reduced value back to the result type and set as the
4589 gimple_seq stmts
= NULL
;
4590 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
4592 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
4593 scalar_results
.safe_push (new_temp
);
4595 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
4596 && reduc_fn
== IFN_LAST
)
4598 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4600 idx_val = induction_index[0];
4601 val = data_reduc[0];
4602 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4603 if (induction_index[i] > idx_val)
4604 val = data_reduc[i], idx_val = induction_index[i];
4607 tree data_eltype
= TREE_TYPE (TREE_TYPE (new_phi_result
));
4608 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
4609 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
4610 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
4611 /* Enforced by vectorizable_reduction, which ensures we have target
4612 support before allowing a conditional reduction on variable-length
4614 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
4615 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
4616 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
4618 tree old_idx_val
= idx_val
;
4620 idx_val
= make_ssa_name (idx_eltype
);
4621 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
4622 build3 (BIT_FIELD_REF
, idx_eltype
,
4624 bitsize_int (el_size
),
4625 bitsize_int (off
)));
4626 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4627 val
= make_ssa_name (data_eltype
);
4628 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
4629 build3 (BIT_FIELD_REF
,
4632 bitsize_int (el_size
),
4633 bitsize_int (off
)));
4634 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4637 tree new_idx_val
= idx_val
;
4638 if (off
!= v_size
- el_size
)
4640 new_idx_val
= make_ssa_name (idx_eltype
);
4641 epilog_stmt
= gimple_build_assign (new_idx_val
,
4644 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4646 tree new_val
= make_ssa_name (data_eltype
);
4647 epilog_stmt
= gimple_build_assign (new_val
,
4654 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4655 idx_val
= new_idx_val
;
4659 /* Convert the reduced value back to the result type and set as the
4661 gimple_seq stmts
= NULL
;
4662 val
= gimple_convert (&stmts
, scalar_type
, val
);
4663 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
4664 scalar_results
.safe_push (val
);
4667 /* 2.3 Create the reduction code, using one of the three schemes described
4668 above. In SLP we simply need to extract all the elements from the
4669 vector (without reducing them), so we use scalar shifts. */
4670 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
4676 v_out2 = reduc_expr <v_out1> */
4678 if (dump_enabled_p ())
4679 dump_printf_loc (MSG_NOTE
, vect_location
,
4680 "Reduce using direct vector reduction.\n");
4682 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
4683 if (!useless_type_conversion_p (scalar_type
, vec_elem_type
))
4686 = vect_create_destination_var (scalar_dest
, vec_elem_type
);
4687 epilog_stmt
= gimple_build_call_internal (reduc_fn
, 1,
4689 gimple_set_lhs (epilog_stmt
, tmp_dest
);
4690 new_temp
= make_ssa_name (tmp_dest
, epilog_stmt
);
4691 gimple_set_lhs (epilog_stmt
, new_temp
);
4692 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4694 epilog_stmt
= gimple_build_assign (new_scalar_dest
, NOP_EXPR
,
4699 epilog_stmt
= gimple_build_call_internal (reduc_fn
, 1,
4701 gimple_set_lhs (epilog_stmt
, new_scalar_dest
);
4704 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4705 gimple_set_lhs (epilog_stmt
, new_temp
);
4706 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4708 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
4711 /* Earlier we set the initial value to be a vector if induc_val
4712 values. Check the result and if it is induc_val then replace
4713 with the original initial value, unless induc_val is
4714 the same as initial_def already. */
4715 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
4718 tmp
= make_ssa_name (new_scalar_dest
);
4719 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
4720 initial_def
, new_temp
);
4721 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4725 scalar_results
.safe_push (new_temp
);
4727 else if (direct_slp_reduc
)
4729 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4730 with the elements for other SLP statements replaced with the
4731 neutral value. We can then do a normal reduction on each vector. */
4733 /* Enforced by vectorizable_reduction. */
4734 gcc_assert (new_phis
.length () == 1);
4735 gcc_assert (pow2p_hwi (group_size
));
4737 slp_tree orig_phis_slp_node
= slp_node_instance
->reduc_phis
;
4738 vec
<stmt_vec_info
> orig_phis
4739 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node
);
4740 gimple_seq seq
= NULL
;
4742 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4743 and the same element size as VECTYPE. */
4744 tree index
= build_index_vector (vectype
, 0, 1);
4745 tree index_type
= TREE_TYPE (index
);
4746 tree index_elt_type
= TREE_TYPE (index_type
);
4747 tree mask_type
= build_same_sized_truth_vector_type (index_type
);
4749 /* Create a vector that, for each element, identifies which of
4750 the REDUC_GROUP_SIZE results should use it. */
4751 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
4752 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
4753 build_vector_from_val (index_type
, index_mask
));
4755 /* Get a neutral vector value. This is simply a splat of the neutral
4756 scalar value if we have one, otherwise the initial scalar value
4757 is itself a neutral value. */
4758 tree vector_identity
= NULL_TREE
;
4760 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
4762 for (unsigned int i
= 0; i
< group_size
; ++i
)
4764 /* If there's no universal neutral value, we can use the
4765 initial scalar value from the original PHI. This is used
4766 for MIN and MAX reduction, for example. */
4770 = PHI_ARG_DEF_FROM_EDGE (orig_phis
[i
]->stmt
,
4771 loop_preheader_edge (loop
));
4772 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
4776 /* Calculate the equivalent of:
4778 sel[j] = (index[j] == i);
4780 which selects the elements of NEW_PHI_RESULT that should
4781 be included in the result. */
4782 tree compare_val
= build_int_cst (index_elt_type
, i
);
4783 compare_val
= build_vector_from_val (index_type
, compare_val
);
4784 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
4785 index
, compare_val
);
4787 /* Calculate the equivalent of:
4789 vec = seq ? new_phi_result : vector_identity;
4791 VEC is now suitable for a full vector reduction. */
4792 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
4793 sel
, new_phi_result
, vector_identity
);
4795 /* Do the reduction and convert it to the appropriate type. */
4796 tree scalar
= gimple_build (&seq
, as_combined_fn (reduc_fn
),
4797 TREE_TYPE (vectype
), vec
);
4798 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
4799 scalar_results
.safe_push (scalar
);
4801 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
4805 bool reduce_with_shift
;
4808 /* See if the target wants to do the final (shift) reduction
4809 in a vector mode of smaller size and first reduce upper/lower
4810 halves against each other. */
4811 enum machine_mode mode1
= mode
;
4812 unsigned sz
= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
));
4815 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
4816 sz1
= GET_MODE_SIZE (mode1
).to_constant ();
4818 tree vectype1
= get_vectype_for_scalar_type_and_size (scalar_type
, sz1
);
4819 reduce_with_shift
= have_whole_vector_shift (mode1
);
4820 if (!VECTOR_MODE_P (mode1
))
4821 reduce_with_shift
= false;
4824 optab optab
= optab_for_tree_code (code
, vectype1
, optab_default
);
4825 if (optab_handler (optab
, mode1
) == CODE_FOR_nothing
)
4826 reduce_with_shift
= false;
4829 /* First reduce the vector to the desired vector size we should
4830 do shift reduction on by combining upper and lower halves. */
4831 new_temp
= new_phi_result
;
4834 gcc_assert (!slp_reduc
);
4836 vectype1
= get_vectype_for_scalar_type_and_size (scalar_type
, sz
);
4838 /* The target has to make sure we support lowpart/highpart
4839 extraction, either via direct vector extract or through
4840 an integer mode punning. */
4842 if (convert_optab_handler (vec_extract_optab
,
4843 TYPE_MODE (TREE_TYPE (new_temp
)),
4844 TYPE_MODE (vectype1
))
4845 != CODE_FOR_nothing
)
4847 /* Extract sub-vectors directly once vec_extract becomes
4848 a conversion optab. */
4849 dst1
= make_ssa_name (vectype1
);
4851 = gimple_build_assign (dst1
, BIT_FIELD_REF
,
4852 build3 (BIT_FIELD_REF
, vectype1
,
4853 new_temp
, TYPE_SIZE (vectype1
),
4855 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4856 dst2
= make_ssa_name (vectype1
);
4858 = gimple_build_assign (dst2
, BIT_FIELD_REF
,
4859 build3 (BIT_FIELD_REF
, vectype1
,
4860 new_temp
, TYPE_SIZE (vectype1
),
4861 bitsize_int (sz
* BITS_PER_UNIT
)));
4862 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4866 /* Extract via punning to appropriately sized integer mode
4868 tree eltype
= build_nonstandard_integer_type (sz
* BITS_PER_UNIT
,
4870 tree etype
= build_vector_type (eltype
, 2);
4871 gcc_assert (convert_optab_handler (vec_extract_optab
,
4874 != CODE_FOR_nothing
);
4875 tree tem
= make_ssa_name (etype
);
4876 epilog_stmt
= gimple_build_assign (tem
, VIEW_CONVERT_EXPR
,
4877 build1 (VIEW_CONVERT_EXPR
,
4879 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4881 tem
= make_ssa_name (eltype
);
4883 = gimple_build_assign (tem
, BIT_FIELD_REF
,
4884 build3 (BIT_FIELD_REF
, eltype
,
4885 new_temp
, TYPE_SIZE (eltype
),
4887 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4888 dst1
= make_ssa_name (vectype1
);
4889 epilog_stmt
= gimple_build_assign (dst1
, VIEW_CONVERT_EXPR
,
4890 build1 (VIEW_CONVERT_EXPR
,
4892 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4893 tem
= make_ssa_name (eltype
);
4895 = gimple_build_assign (tem
, BIT_FIELD_REF
,
4896 build3 (BIT_FIELD_REF
, eltype
,
4897 new_temp
, TYPE_SIZE (eltype
),
4898 bitsize_int (sz
* BITS_PER_UNIT
)));
4899 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4900 dst2
= make_ssa_name (vectype1
);
4901 epilog_stmt
= gimple_build_assign (dst2
, VIEW_CONVERT_EXPR
,
4902 build1 (VIEW_CONVERT_EXPR
,
4904 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4907 new_temp
= make_ssa_name (vectype1
);
4908 epilog_stmt
= gimple_build_assign (new_temp
, code
, dst1
, dst2
);
4909 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4912 if (reduce_with_shift
&& !slp_reduc
)
4914 int element_bitsize
= tree_to_uhwi (bitsize
);
4915 /* Enforced by vectorizable_reduction, which disallows SLP reductions
4916 for variable-length vectors and also requires direct target support
4917 for loop reductions. */
4918 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
4919 int nelements
= vec_size_in_bits
/ element_bitsize
;
4920 vec_perm_builder sel
;
4921 vec_perm_indices indices
;
4925 tree zero_vec
= build_zero_cst (vectype1
);
4927 for (offset = nelements/2; offset >= 1; offset/=2)
4929 Create: va' = vec_shift <va, offset>
4930 Create: va = vop <va, va'>
4935 if (dump_enabled_p ())
4936 dump_printf_loc (MSG_NOTE
, vect_location
,
4937 "Reduce using vector shifts\n");
4939 vec_dest
= vect_create_destination_var (scalar_dest
, vectype1
);
4940 for (elt_offset
= nelements
/ 2;
4944 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
4945 indices
.new_vector (sel
, 2, nelements
);
4946 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
4947 epilog_stmt
= gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
4948 new_temp
, zero_vec
, mask
);
4949 new_name
= make_ssa_name (vec_dest
, epilog_stmt
);
4950 gimple_assign_set_lhs (epilog_stmt
, new_name
);
4951 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4953 epilog_stmt
= gimple_build_assign (vec_dest
, code
, new_name
,
4955 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
4956 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4957 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4960 /* 2.4 Extract the final scalar result. Create:
4961 s_out3 = extract_field <v_out2, bitpos> */
4963 if (dump_enabled_p ())
4964 dump_printf_loc (MSG_NOTE
, vect_location
,
4965 "extract scalar result\n");
4967 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
4968 bitsize
, bitsize_zero_node
);
4969 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
4970 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
4971 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
4972 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4973 scalar_results
.safe_push (new_temp
);
4978 s = extract_field <v_out2, 0>
4979 for (offset = element_size;
4980 offset < vector_size;
4981 offset += element_size;)
4983 Create: s' = extract_field <v_out2, offset>
4984 Create: s = op <s, s'> // For non SLP cases
4987 if (dump_enabled_p ())
4988 dump_printf_loc (MSG_NOTE
, vect_location
,
4989 "Reduce using scalar code.\n");
4991 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
4992 int element_bitsize
= tree_to_uhwi (bitsize
);
4993 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
4996 if (gimple_code (new_phi
) == GIMPLE_PHI
)
4997 vec_temp
= PHI_RESULT (new_phi
);
4999 vec_temp
= gimple_assign_lhs (new_phi
);
5000 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
, bitsize
,
5002 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5003 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5004 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5005 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5007 /* In SLP we don't need to apply reduction operation, so we just
5008 collect s' values in SCALAR_RESULTS. */
5010 scalar_results
.safe_push (new_temp
);
5012 for (bit_offset
= element_bitsize
;
5013 bit_offset
< vec_size_in_bits
;
5014 bit_offset
+= element_bitsize
)
5016 tree bitpos
= bitsize_int (bit_offset
);
5017 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
,
5020 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5021 new_name
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5022 gimple_assign_set_lhs (epilog_stmt
, new_name
);
5023 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5027 /* In SLP we don't need to apply reduction operation, so
5028 we just collect s' values in SCALAR_RESULTS. */
5029 new_temp
= new_name
;
5030 scalar_results
.safe_push (new_name
);
5034 epilog_stmt
= gimple_build_assign (new_scalar_dest
, code
,
5035 new_name
, new_temp
);
5036 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5037 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5038 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5043 /* The only case where we need to reduce scalar results in SLP, is
5044 unrolling. If the size of SCALAR_RESULTS is greater than
5045 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5046 REDUC_GROUP_SIZE. */
5049 tree res
, first_res
, new_res
;
5052 /* Reduce multiple scalar results in case of SLP unrolling. */
5053 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
5056 first_res
= scalar_results
[j
% group_size
];
5057 new_stmt
= gimple_build_assign (new_scalar_dest
, code
,
5059 new_res
= make_ssa_name (new_scalar_dest
, new_stmt
);
5060 gimple_assign_set_lhs (new_stmt
, new_res
);
5061 gsi_insert_before (&exit_gsi
, new_stmt
, GSI_SAME_STMT
);
5062 scalar_results
[j
% group_size
] = new_res
;
5066 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5067 scalar_results
.safe_push (new_temp
);
5070 if ((STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
5073 /* Earlier we set the initial value to be a vector if induc_val
5074 values. Check the result and if it is induc_val then replace
5075 with the original initial value, unless induc_val is
5076 the same as initial_def already. */
5077 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5080 tree tmp
= make_ssa_name (new_scalar_dest
);
5081 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5082 initial_def
, new_temp
);
5083 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5084 scalar_results
[0] = tmp
;
5088 /* 2.5 Adjust the final result by the initial value of the reduction
5089 variable. (When such adjustment is not needed, then
5090 'adjustment_def' is zero). For example, if code is PLUS we create:
5091 new_temp = loop_exit_def + adjustment_def */
5095 gcc_assert (!slp_reduc
);
5096 if (nested_in_vect_loop
)
5098 new_phi
= new_phis
[0];
5099 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) == VECTOR_TYPE
);
5100 expr
= build2 (code
, vectype
, PHI_RESULT (new_phi
), adjustment_def
);
5101 new_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5105 new_temp
= scalar_results
[0];
5106 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
5107 expr
= build2 (code
, scalar_type
, new_temp
, adjustment_def
);
5108 new_dest
= vect_create_destination_var (scalar_dest
, scalar_type
);
5111 epilog_stmt
= gimple_build_assign (new_dest
, expr
);
5112 new_temp
= make_ssa_name (new_dest
, epilog_stmt
);
5113 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5114 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5115 if (nested_in_vect_loop
)
5117 stmt_vec_info epilog_stmt_info
= loop_vinfo
->add_stmt (epilog_stmt
);
5118 STMT_VINFO_RELATED_STMT (epilog_stmt_info
)
5119 = STMT_VINFO_RELATED_STMT (loop_vinfo
->lookup_stmt (new_phi
));
5122 scalar_results
.quick_push (new_temp
);
5124 scalar_results
[0] = new_temp
;
5127 scalar_results
[0] = new_temp
;
5129 new_phis
[0] = epilog_stmt
;
5135 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5136 phis with new adjusted scalar results, i.e., replace use <s_out0>
5141 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5142 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5143 v_out2 = reduce <v_out1>
5144 s_out3 = extract_field <v_out2, 0>
5145 s_out4 = adjust_result <s_out3>
5152 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5153 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5154 v_out2 = reduce <v_out1>
5155 s_out3 = extract_field <v_out2, 0>
5156 s_out4 = adjust_result <s_out3>
5161 /* In SLP reduction chain we reduce vector results into one vector if
5162 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5163 LHS of the last stmt in the reduction chain, since we are looking for
5164 the loop exit phi node. */
5165 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5167 stmt_vec_info dest_stmt_info
5168 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1]);
5169 scalar_dest
= gimple_assign_lhs (dest_stmt_info
->stmt
);
5173 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5174 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5175 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5176 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5177 correspond to the first vector stmt, etc.
5178 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5179 if (group_size
> new_phis
.length ())
5180 gcc_assert (!(group_size
% new_phis
.length ()));
5182 for (k
= 0; k
< group_size
; k
++)
5186 stmt_vec_info scalar_stmt_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
5188 orig_stmt_info
= STMT_VINFO_RELATED_STMT (scalar_stmt_info
);
5189 /* SLP statements can't participate in patterns. */
5190 gcc_assert (!orig_stmt_info
);
5191 scalar_dest
= gimple_assign_lhs (scalar_stmt_info
->stmt
);
5194 if (nested_in_vect_loop
)
5203 /* Find the loop-closed-use at the loop exit of the original scalar
5204 result. (The reduction result is expected to have two immediate uses,
5205 one at the latch block, and one at the loop exit). For double
5206 reductions we are looking for exit phis of the outer loop. */
5207 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
5209 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
5211 if (!is_gimple_debug (USE_STMT (use_p
)))
5212 phis
.safe_push (USE_STMT (use_p
));
5216 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
5218 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
5220 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
5222 if (!flow_bb_inside_loop_p (loop
,
5223 gimple_bb (USE_STMT (phi_use_p
)))
5224 && !is_gimple_debug (USE_STMT (phi_use_p
)))
5225 phis
.safe_push (USE_STMT (phi_use_p
));
5231 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
5233 /* Replace the uses: */
5234 orig_name
= PHI_RESULT (exit_phi
);
5235 scalar_result
= scalar_results
[k
];
5236 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
5237 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
5238 SET_USE (use_p
, scalar_result
);
5245 /* Return a vector of type VECTYPE that is equal to the vector select
5246 operation "MASK ? VEC : IDENTITY". Insert the select statements
5250 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
5251 tree vec
, tree identity
)
5253 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
5254 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
5255 mask
, vec
, identity
);
5256 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5260 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5261 order, starting with LHS. Insert the extraction statements before GSI and
5262 associate the new scalar SSA names with variable SCALAR_DEST.
5263 Return the SSA name for the result. */
5266 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
5267 tree_code code
, tree lhs
, tree vector_rhs
)
5269 tree vectype
= TREE_TYPE (vector_rhs
);
5270 tree scalar_type
= TREE_TYPE (vectype
);
5271 tree bitsize
= TYPE_SIZE (scalar_type
);
5272 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
5273 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
5275 for (unsigned HOST_WIDE_INT bit_offset
= 0;
5276 bit_offset
< vec_size_in_bits
;
5277 bit_offset
+= element_bitsize
)
5279 tree bitpos
= bitsize_int (bit_offset
);
5280 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
5283 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
);
5284 rhs
= make_ssa_name (scalar_dest
, stmt
);
5285 gimple_assign_set_lhs (stmt
, rhs
);
5286 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
5288 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
5289 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
5290 gimple_assign_set_lhs (stmt
, new_name
);
5291 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
5297 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5298 type of the vector input. */
5301 get_masked_reduction_fn (internal_fn reduc_fn
, tree vectype_in
)
5303 internal_fn mask_reduc_fn
;
5307 case IFN_FOLD_LEFT_PLUS
:
5308 mask_reduc_fn
= IFN_MASK_FOLD_LEFT_PLUS
;
5315 if (direct_internal_fn_supported_p (mask_reduc_fn
, vectype_in
,
5316 OPTIMIZE_FOR_SPEED
))
5317 return mask_reduc_fn
;
5321 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5322 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5323 statement. CODE is the operation performed by STMT_INFO and OPS are
5324 its scalar operands. REDUC_INDEX is the index of the operand in
5325 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5326 implements in-order reduction, or IFN_LAST if we should open-code it.
5327 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5328 that should be used to control the operation in a fully-masked loop. */
5331 vectorize_fold_left_reduction (stmt_vec_info stmt_info
,
5332 gimple_stmt_iterator
*gsi
,
5333 stmt_vec_info
*vec_stmt
, slp_tree slp_node
,
5334 gimple
*reduc_def_stmt
,
5335 tree_code code
, internal_fn reduc_fn
,
5336 tree ops
[3], tree vectype_in
,
5337 int reduc_index
, vec_loop_masks
*masks
)
5339 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5340 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5341 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
5342 stmt_vec_info new_stmt_info
= NULL
;
5343 internal_fn mask_reduc_fn
= get_masked_reduction_fn (reduc_fn
, vectype_in
);
5349 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
5351 gcc_assert (!nested_in_vect_loop_p (loop
, stmt_info
));
5352 gcc_assert (ncopies
== 1);
5353 gcc_assert (TREE_CODE_LENGTH (code
) == binary_op
);
5356 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
5357 TYPE_VECTOR_SUBPARTS (vectype_in
)));
5359 tree op0
= ops
[1 - reduc_index
];
5362 stmt_vec_info scalar_dest_def_info
;
5363 auto_vec
<tree
> vec_oprnds0
;
5366 auto_vec
<vec
<tree
> > vec_defs (2);
5367 auto_vec
<tree
> sops(2);
5368 sops
.quick_push (ops
[0]);
5369 sops
.quick_push (ops
[1]);
5370 vect_get_slp_defs (sops
, slp_node
, &vec_defs
);
5371 vec_oprnds0
.safe_splice (vec_defs
[1 - reduc_index
]);
5372 vec_defs
[0].release ();
5373 vec_defs
[1].release ();
5374 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
5375 scalar_dest_def_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
5379 tree loop_vec_def0
= vect_get_vec_def_for_operand (op0
, stmt_info
);
5380 vec_oprnds0
.create (1);
5381 vec_oprnds0
.quick_push (loop_vec_def0
);
5382 scalar_dest_def_info
= stmt_info
;
5385 tree scalar_dest
= gimple_assign_lhs (scalar_dest_def_info
->stmt
);
5386 tree scalar_type
= TREE_TYPE (scalar_dest
);
5387 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
5389 int vec_num
= vec_oprnds0
.length ();
5390 gcc_assert (vec_num
== 1 || slp_node
);
5391 tree vec_elem_type
= TREE_TYPE (vectype_out
);
5392 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
5394 tree vector_identity
= NULL_TREE
;
5395 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
5396 vector_identity
= build_zero_cst (vectype_out
);
5398 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
5401 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
5404 tree mask
= NULL_TREE
;
5405 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
5406 mask
= vect_get_loop_mask (gsi
, masks
, vec_num
, vectype_in
, i
);
5408 /* Handle MINUS by adding the negative. */
5409 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
5411 tree negated
= make_ssa_name (vectype_out
);
5412 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
5413 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5417 if (mask
&& mask_reduc_fn
== IFN_LAST
)
5418 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
5421 /* On the first iteration the input is simply the scalar phi
5422 result, and for subsequent iterations it is the output of
5423 the preceding operation. */
5424 if (reduc_fn
!= IFN_LAST
|| (mask
&& mask_reduc_fn
!= IFN_LAST
))
5426 if (mask
&& mask_reduc_fn
!= IFN_LAST
)
5427 new_stmt
= gimple_build_call_internal (mask_reduc_fn
, 3, reduc_var
,
5430 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
,
5432 /* For chained SLP reductions the output of the previous reduction
5433 operation serves as the input of the next. For the final statement
5434 the output cannot be a temporary - we reuse the original
5435 scalar destination of the last statement. */
5436 if (i
!= vec_num
- 1)
5438 gimple_set_lhs (new_stmt
, scalar_dest_var
);
5439 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
5440 gimple_set_lhs (new_stmt
, reduc_var
);
5445 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
, code
,
5447 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
5448 /* Remove the statement, so that we can use the same code paths
5449 as for statements that we've just created. */
5450 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
5451 gsi_remove (&tmp_gsi
, true);
5454 if (i
== vec_num
- 1)
5456 gimple_set_lhs (new_stmt
, scalar_dest
);
5457 new_stmt_info
= vect_finish_replace_stmt (scalar_dest_def_info
,
5461 new_stmt_info
= vect_finish_stmt_generation (scalar_dest_def_info
,
5465 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt_info
);
5469 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt_info
;
5474 /* Function is_nonwrapping_integer_induction.
5476 Check if STMT_VINO (which is part of loop LOOP) both increments and
5477 does not cause overflow. */
5480 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo
, class loop
*loop
)
5482 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
5483 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
5484 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
5485 tree lhs_type
= TREE_TYPE (gimple_phi_result (phi
));
5486 widest_int ni
, max_loop_value
, lhs_max
;
5487 wi::overflow_type overflow
= wi::OVF_NONE
;
5489 /* Make sure the loop is integer based. */
5490 if (TREE_CODE (base
) != INTEGER_CST
5491 || TREE_CODE (step
) != INTEGER_CST
)
5494 /* Check that the max size of the loop will not wrap. */
5496 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
5499 if (! max_stmt_executions (loop
, &ni
))
5502 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
5507 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
5508 TYPE_SIGN (lhs_type
), &overflow
);
5512 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
5513 <= TYPE_PRECISION (lhs_type
));
5516 /* Check if masking can be supported by inserting a conditional expression.
5517 CODE is the code for the operation. COND_FN is the conditional internal
5518 function, if it exists. VECTYPE_IN is the type of the vector input. */
5520 use_mask_by_cond_expr_p (enum tree_code code
, internal_fn cond_fn
,
5523 if (cond_fn
!= IFN_LAST
5524 && direct_internal_fn_supported_p (cond_fn
, vectype_in
,
5525 OPTIMIZE_FOR_SPEED
))
5539 /* Insert a conditional expression to enable masked vectorization. CODE is the
5540 code for the operation. VOP is the array of operands. MASK is the loop
5541 mask. GSI is a statement iterator used to place the new conditional
5544 build_vect_cond_expr (enum tree_code code
, tree vop
[3], tree mask
,
5545 gimple_stmt_iterator
*gsi
)
5551 tree vectype
= TREE_TYPE (vop
[1]);
5552 tree zero
= build_zero_cst (vectype
);
5553 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
5554 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
5555 mask
, vop
[1], zero
);
5556 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
5557 vop
[1] = masked_op1
;
5563 tree vectype
= TREE_TYPE (vop
[1]);
5564 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
5565 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
5566 mask
, vop
[1], vop
[0]);
5567 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
5568 vop
[1] = masked_op1
;
5577 /* Function vectorizable_reduction.
5579 Check if STMT_INFO performs a reduction operation that can be vectorized.
5580 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5581 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5582 Return true if STMT_INFO is vectorizable in this way.
5584 This function also handles reduction idioms (patterns) that have been
5585 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5586 may be of this form:
5587 X = pattern_expr (arg0, arg1, ..., X)
5588 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5589 sequence that had been detected and replaced by the pattern-stmt
5592 This function also handles reduction of condition expressions, for example:
5593 for (int i = 0; i < N; i++)
5596 This is handled by vectorising the loop and creating an additional vector
5597 containing the loop indexes for which "a[i] < value" was true. In the
5598 function epilogue this is reduced to a single max value and then used to
5599 index into the vector of results.
5601 In some cases of reduction patterns, the type of the reduction variable X is
5602 different than the type of the other arguments of STMT_INFO.
5603 In such cases, the vectype that is used when transforming STMT_INFO into
5604 a vector stmt is different than the vectype that is used to determine the
5605 vectorization factor, because it consists of a different number of elements
5606 than the actual number of elements that are being operated upon in parallel.
5608 For example, consider an accumulation of shorts into an int accumulator.
5609 On some targets it's possible to vectorize this pattern operating on 8
5610 shorts at a time (hence, the vectype for purposes of determining the
5611 vectorization factor should be V8HI); on the other hand, the vectype that
5612 is used to create the vector form is actually V4SI (the type of the result).
5614 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5615 indicates what is the actual level of parallelism (V8HI in the example), so
5616 that the right vectorization factor would be derived. This vectype
5617 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5618 be used to create the vectorized stmt. The right vectype for the vectorized
5619 stmt is obtained from the type of the result X:
5620 get_vectype_for_scalar_type (TREE_TYPE (X))
5622 This means that, contrary to "regular" reductions (or "regular" stmts in
5623 general), the following equation:
5624 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5625 does *NOT* necessarily hold for reduction patterns. */
5628 vectorizable_reduction (stmt_vec_info stmt_info
, slp_tree slp_node
,
5629 slp_instance slp_node_instance
,
5630 stmt_vector_for_cost
*cost_vec
)
5633 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
5634 tree vectype_in
= NULL_TREE
;
5635 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5636 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5637 enum tree_code code
;
5639 enum vect_def_type dt
, cond_reduc_dt
= vect_unknown_def_type
;
5640 stmt_vec_info cond_stmt_vinfo
= NULL
;
5644 bool single_defuse_cycle
= false;
5646 enum vect_def_type dts
[3];
5647 bool nested_cycle
= false, found_nested_cycle_def
= false;
5648 bool double_reduc
= false;
5651 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
5652 tree cond_reduc_val
= NULL_TREE
;
5654 /* Make sure it was already recognized as a reduction computation. */
5655 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
5656 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
5657 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
5660 /* The stmt we store reduction analysis meta on. */
5661 stmt_vec_info reduc_info
= info_for_reduction (stmt_info
);
5662 reduc_info
->is_reduc_info
= true;
5664 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
5666 if (is_a
<gphi
*> (stmt_info
->stmt
))
5667 /* Analysis for double-reduction is done on the outer
5668 loop PHI, nested cycles have no further restrictions. */
5669 STMT_VINFO_TYPE (stmt_info
) = cycle_phi_info_type
;
5671 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
5675 stmt_vec_info orig_stmt_of_analysis
= stmt_info
;
5676 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
5677 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
5679 if (!is_a
<gphi
*> (stmt_info
->stmt
))
5681 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
5686 slp_node_instance
->reduc_phis
= slp_node
;
5687 /* ??? We're leaving slp_node to point to the PHIs, we only
5688 need it to get at the number of vector stmts which wasn't
5689 yet initialized for the instance root. */
5691 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
5692 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info
));
5693 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5695 use_operand_p use_p
;
5697 bool res
= single_imm_use (gimple_phi_result (stmt_info
->stmt
),
5700 stmt_info
= loop_vinfo
->lookup_stmt (use_stmt
);
5701 stmt_info
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info
));
5703 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
5705 if (slp_node
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5707 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info
));
5708 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
5712 if (nested_in_vect_loop_p (loop
, stmt_info
))
5715 nested_cycle
= true;
5718 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5719 gcc_assert (slp_node
5720 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
5722 /* 1. Is vectorizable reduction? */
5723 /* Not supportable if the reduction variable is used in the loop, unless
5724 it's a reduction chain. */
5725 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
5726 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5729 /* Reductions that are not used even in an enclosing outer-loop,
5730 are expected to be "live" (used out of the loop). */
5731 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
5732 && !STMT_VINFO_LIVE_P (stmt_info
))
5735 /* 2. Has this been recognized as a reduction pattern?
5737 Check if STMT represents a pattern that has been recognized
5738 in earlier analysis stages. For stmts that represent a pattern,
5739 the STMT_VINFO_RELATED_STMT field records the last stmt in
5740 the original sequence that constitutes the pattern. */
5742 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
5745 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
5746 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
5749 /* 3. Check the operands of the operation. The first operands are defined
5750 inside the loop body. The last operand is the reduction variable,
5751 which is defined by the loop-header-phi. */
5753 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5756 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
5758 case GIMPLE_BINARY_RHS
:
5759 code
= gimple_assign_rhs_code (stmt
);
5760 op_type
= TREE_CODE_LENGTH (code
);
5761 gcc_assert (op_type
== binary_op
);
5762 ops
[0] = gimple_assign_rhs1 (stmt
);
5763 ops
[1] = gimple_assign_rhs2 (stmt
);
5766 case GIMPLE_TERNARY_RHS
:
5767 code
= gimple_assign_rhs_code (stmt
);
5768 op_type
= TREE_CODE_LENGTH (code
);
5769 gcc_assert (op_type
== ternary_op
);
5770 ops
[0] = gimple_assign_rhs1 (stmt
);
5771 ops
[1] = gimple_assign_rhs2 (stmt
);
5772 ops
[2] = gimple_assign_rhs3 (stmt
);
5775 case GIMPLE_UNARY_RHS
:
5776 case GIMPLE_SINGLE_RHS
:
5783 if (code
== COND_EXPR
&& slp_node
)
5786 scalar_dest
= gimple_assign_lhs (stmt
);
5787 scalar_type
= TREE_TYPE (scalar_dest
);
5788 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
5789 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
5792 /* Do not try to vectorize bit-precision reductions. */
5793 if (!type_has_mode_precision_p (scalar_type
))
5796 /* All uses but the last are expected to be defined in the loop.
5797 The last use is the reduction variable. In case of nested cycle this
5798 assumption is not true: we use reduc_index to record the index of the
5799 reduction variable. */
5800 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
5801 /* PHIs should not participate in patterns. */
5802 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info
));
5803 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
5804 tree reduc_def
= PHI_RESULT (reduc_def_phi
);
5805 int reduc_index
= -1;
5806 for (i
= 0; i
< op_type
; i
++)
5808 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5809 if (i
== 0 && code
== COND_EXPR
)
5812 stmt_vec_info def_stmt_info
;
5813 if (!vect_is_simple_use (ops
[i
], loop_vinfo
, &dts
[i
], &tem
,
5816 if (dump_enabled_p ())
5817 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5818 "use not simple.\n");
5822 if (dt
== vect_reduction_def
5823 && ops
[i
] == reduc_def
)
5830 /* To properly compute ncopies we are interested in the widest
5831 input type in case we're looking at a widening accumulation. */
5833 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
5834 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
)))))
5838 if (dt
!= vect_internal_def
5839 && dt
!= vect_external_def
5840 && dt
!= vect_constant_def
5841 && dt
!= vect_induction_def
5842 && !(dt
== vect_nested_cycle
&& nested_cycle
))
5845 if (dt
== vect_nested_cycle
5846 && ops
[i
] == reduc_def
)
5848 found_nested_cycle_def
= true;
5852 if (code
== COND_EXPR
)
5854 /* Record how the non-reduction-def value of COND_EXPR is defined. */
5855 if (dt
== vect_constant_def
)
5858 cond_reduc_val
= ops
[i
];
5860 if (dt
== vect_induction_def
5862 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
5865 cond_stmt_vinfo
= def_stmt_info
;
5870 vectype_in
= vectype_out
;
5871 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
) = vectype_in
;
5872 /* For the SSA cycle we store on each participating stmt the operand index
5873 where the cycle continues. Store the one relevant for the actual
5874 operation in the reduction meta. */
5875 STMT_VINFO_REDUC_IDX (reduc_info
) = reduc_index
;
5877 if (!(reduc_index
== -1
5878 || dts
[reduc_index
] == vect_reduction_def
5879 || dts
[reduc_index
] == vect_nested_cycle
5880 || ((dts
[reduc_index
] == vect_internal_def
5881 || dts
[reduc_index
] == vect_external_def
5882 || dts
[reduc_index
] == vect_constant_def
5883 || dts
[reduc_index
] == vect_induction_def
)
5884 && nested_cycle
&& found_nested_cycle_def
)))
5886 /* For pattern recognized stmts, orig_stmt might be a reduction,
5887 but some helper statements for the pattern might not, or
5888 might be COND_EXPRs with reduction uses in the condition. */
5889 gcc_assert (orig_stmt_info
);
5893 enum vect_reduction_type v_reduc_type
= STMT_VINFO_REDUC_TYPE (phi_info
);
5894 STMT_VINFO_REDUC_TYPE (reduc_info
) = v_reduc_type
;
5895 /* If we have a condition reduction, see if we can simplify it further. */
5896 if (v_reduc_type
== COND_REDUCTION
)
5898 /* TODO: We can't yet handle reduction chains, since we need to treat
5899 each COND_EXPR in the chain specially, not just the last one.
5902 x_1 = PHI <x_3, ...>
5903 x_2 = a_2 ? ... : x_1;
5904 x_3 = a_3 ? ... : x_2;
5906 we're interested in the last element in x_3 for which a_2 || a_3
5907 is true, whereas the current reduction chain handling would
5908 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
5909 as a reduction operation. */
5910 if (reduc_index
== -1)
5912 if (dump_enabled_p ())
5913 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5914 "conditional reduction chains not supported\n");
5918 /* When the condition uses the reduction value in the condition, fail. */
5919 if (reduc_index
== 0)
5921 if (dump_enabled_p ())
5922 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5923 "condition depends on previous iteration\n");
5927 if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
5928 vectype_in
, OPTIMIZE_FOR_SPEED
))
5930 if (dump_enabled_p ())
5931 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5932 "optimizing condition reduction with"
5933 " FOLD_EXTRACT_LAST.\n");
5934 STMT_VINFO_REDUC_TYPE (reduc_info
) = EXTRACT_LAST_REDUCTION
;
5936 else if (cond_reduc_dt
== vect_induction_def
)
5939 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
5940 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
5942 gcc_assert (TREE_CODE (base
) == INTEGER_CST
5943 && TREE_CODE (step
) == INTEGER_CST
);
5944 cond_reduc_val
= NULL_TREE
;
5945 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
5946 tree res
= PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo
));
5947 if (!types_compatible_p (TREE_TYPE (res
), TREE_TYPE (base
)))
5949 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
5950 above base; punt if base is the minimum value of the type for
5951 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
5952 else if (tree_int_cst_sgn (step
) == -1)
5954 cond_reduc_op_code
= MIN_EXPR
;
5955 if (tree_int_cst_sgn (base
) == -1)
5956 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
5957 else if (tree_int_cst_lt (base
,
5958 TYPE_MAX_VALUE (TREE_TYPE (base
))))
5960 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
5964 cond_reduc_op_code
= MAX_EXPR
;
5965 if (tree_int_cst_sgn (base
) == 1)
5966 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
5967 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
5970 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
5974 if (dump_enabled_p ())
5975 dump_printf_loc (MSG_NOTE
, vect_location
,
5976 "condition expression based on "
5977 "integer induction.\n");
5978 STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info
) = cond_reduc_op_code
;
5979 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
)
5981 STMT_VINFO_REDUC_TYPE (reduc_info
) = INTEGER_INDUC_COND_REDUCTION
;
5984 else if (cond_reduc_dt
== vect_constant_def
)
5986 enum vect_def_type cond_initial_dt
;
5987 gimple
*def_stmt
= SSA_NAME_DEF_STMT (ops
[reduc_index
]);
5988 tree cond_initial_val
5989 = PHI_ARG_DEF_FROM_EDGE (def_stmt
, loop_preheader_edge (loop
));
5991 gcc_assert (cond_reduc_val
!= NULL_TREE
);
5992 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
5993 if (cond_initial_dt
== vect_constant_def
5994 && types_compatible_p (TREE_TYPE (cond_initial_val
),
5995 TREE_TYPE (cond_reduc_val
)))
5997 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
5998 cond_initial_val
, cond_reduc_val
);
5999 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
6001 if (dump_enabled_p ())
6002 dump_printf_loc (MSG_NOTE
, vect_location
,
6003 "condition expression based on "
6004 "compile time constant.\n");
6005 /* Record reduction code at analysis stage. */
6006 STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info
)
6007 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
6008 STMT_VINFO_REDUC_TYPE (reduc_info
) = CONST_COND_REDUCTION
;
6014 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6015 /* We changed STMT to be the first stmt in reduction chain, hence we
6016 check that in this case the first element in the chain is STMT. */
6017 gcc_assert (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (phi_info
))
6018 == vect_orig_stmt (stmt_info
));
6020 if (STMT_VINFO_LIVE_P (phi_info
))
6026 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6028 gcc_assert (ncopies
>= 1);
6030 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6034 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
)
6035 == vect_double_reduction_def
);
6036 double_reduc
= true;
6039 /* 4.2. Check support for the epilog operation.
6041 If STMT represents a reduction pattern, then the type of the
6042 reduction variable may be different than the type of the rest
6043 of the arguments. For example, consider the case of accumulation
6044 of shorts into an int accumulator; The original code:
6045 S1: int_a = (int) short_a;
6046 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6049 STMT: int_acc = widen_sum <short_a, int_acc>
6052 1. The tree-code that is used to create the vector operation in the
6053 epilog code (that reduces the partial results) is not the
6054 tree-code of STMT, but is rather the tree-code of the original
6055 stmt from the pattern that STMT is replacing. I.e, in the example
6056 above we want to use 'widen_sum' in the loop, but 'plus' in the
6058 2. The type (mode) we use to check available target support
6059 for the vector operation to be created in the *epilog*, is
6060 determined by the type of the reduction variable (in the example
6061 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6062 However the type (mode) we use to check available target support
6063 for the vector operation to be created *inside the loop*, is
6064 determined by the type of the other arguments to STMT (in the
6065 example we'd check this: optab_handler (widen_sum_optab,
6068 This is contrary to "regular" reductions, in which the types of all
6069 the arguments are the same as the type of the reduction variable.
6070 For "regular" reductions we can therefore use the same vector type
6071 (and also the same tree-code) when generating the epilog code and
6072 when generating the code inside the loop. */
6074 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
6075 enum tree_code orig_code
= ERROR_MARK
;
6076 if (reduction_type
== CONST_COND_REDUCTION
6077 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
)
6079 /* For simple condition reductions, replace with the actual expression
6080 we want to base our reduction around. */
6081 orig_code
= STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info
);
6082 gcc_assert (orig_code
== MAX_EXPR
|| orig_code
== MIN_EXPR
);
6084 else if (reduction_type
== COND_REDUCTION
)
6085 orig_code
= COND_EXPR
;
6086 else if (reduction_type
== TREE_CODE_REDUCTION
6087 || reduction_type
== FOLD_LEFT_REDUCTION
)
6090 orig_code
= gimple_assign_rhs_code (orig_stmt_info
->stmt
);
6093 gcc_assert (vectype_out
);
6094 if (orig_code
== MINUS_EXPR
)
6095 orig_code
= PLUS_EXPR
;
6097 STMT_VINFO_REDUC_CODE (reduc_info
) = orig_code
;
6099 if (reduction_type
== TREE_CODE_REDUCTION
)
6101 /* Check whether it's ok to change the order of the computation.
6102 Generally, when vectorizing a reduction we change the order of the
6103 computation. This may change the behavior of the program in some
6104 cases, so we need to check that this is ok. One exception is when
6105 vectorizing an outer-loop: the inner-loop is executed sequentially,
6106 and therefore vectorizing reductions in the inner-loop during
6107 outer-loop vectorization is safe. */
6108 if (needs_fold_left_reduction_p (scalar_type
, orig_code
))
6110 STMT_VINFO_REDUC_TYPE (reduc_info
)
6111 = reduction_type
= FOLD_LEFT_REDUCTION
;
6112 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6113 directy used in stmt. */
6114 if (reduc_index
== -1)
6116 if (dump_enabled_p ())
6117 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6118 "in-order reduction chain without SLP.\n");
6122 else if (!commutative_tree_code (orig_code
)
6123 || !associative_tree_code (orig_code
))
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6127 "reduction: not commutative/associative");
6132 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6137 "multiple types in double reduction or condition "
6138 "reduction or fold-left reduction.\n");
6142 internal_fn reduc_fn
= IFN_LAST
;
6143 if (reduction_type
== TREE_CODE_REDUCTION
6144 || reduction_type
== FOLD_LEFT_REDUCTION
6145 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
6146 || reduction_type
== CONST_COND_REDUCTION
)
6148 if (reduction_type
== FOLD_LEFT_REDUCTION
6149 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
6150 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
6152 if (reduc_fn
!= IFN_LAST
6153 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
6154 OPTIMIZE_FOR_SPEED
))
6156 if (dump_enabled_p ())
6157 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6158 "reduc op not supported by target.\n");
6160 reduc_fn
= IFN_LAST
;
6165 if (!nested_cycle
|| double_reduc
)
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6169 "no reduc code for scalar code.\n");
6175 else if (reduction_type
== COND_REDUCTION
)
6177 int scalar_precision
6178 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
6179 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
6180 cr_index_vector_type
= build_vector_type (cr_index_scalar_type
,
6183 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
6184 OPTIMIZE_FOR_SPEED
))
6185 reduc_fn
= IFN_REDUC_MAX
;
6187 STMT_VINFO_REDUC_FN (reduc_info
) = reduc_fn
;
6189 if (reduction_type
!= EXTRACT_LAST_REDUCTION
6190 && (!nested_cycle
|| double_reduc
)
6191 && reduc_fn
== IFN_LAST
6192 && !nunits_out
.is_constant ())
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6196 "missing target support for reduction on"
6197 " variable-length vectors.\n");
6201 /* For SLP reductions, see if there is a neutral value we can use. */
6202 tree neutral_op
= NULL_TREE
;
6204 neutral_op
= neutral_op_for_slp_reduction
6205 (slp_node_instance
->reduc_phis
, code
,
6206 REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
);
6208 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
6210 /* We can't support in-order reductions of code such as this:
6212 for (int i = 0; i < n1; ++i)
6213 for (int j = 0; j < n2; ++j)
6216 since GCC effectively transforms the loop when vectorizing:
6218 for (int i = 0; i < n1 / VF; ++i)
6219 for (int j = 0; j < n2; ++j)
6220 for (int k = 0; k < VF; ++k)
6223 which is a reassociation of the original operation. */
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6226 "in-order double reduction not supported.\n");
6231 if (reduction_type
== FOLD_LEFT_REDUCTION
6233 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6235 /* We cannot use in-order reductions in this case because there is
6236 an implicit reassociation of the operations involved. */
6237 if (dump_enabled_p ())
6238 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6239 "in-order unchained SLP reductions not supported.\n");
6243 /* For double reductions, and for SLP reductions with a neutral value,
6244 we construct a variable-length initial vector by loading a vector
6245 full of the neutral value and then shift-and-inserting the start
6246 values into the low-numbered elements. */
6247 if ((double_reduc
|| neutral_op
)
6248 && !nunits_out
.is_constant ()
6249 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
6250 vectype_out
, OPTIMIZE_FOR_SPEED
))
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6254 "reduction on variable-length vectors requires"
6255 " target support for a vector-shift-and-insert"
6260 /* Check extra constraints for variable-length unchained SLP reductions. */
6261 if (STMT_SLP_TYPE (stmt_info
)
6262 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
6263 && !nunits_out
.is_constant ())
6265 /* We checked above that we could build the initial vector when
6266 there's a neutral element value. Check here for the case in
6267 which each SLP statement has its own initial value and in which
6268 that value needs to be repeated for every instance of the
6269 statement within the initial vector. */
6270 unsigned int group_size
= SLP_INSTANCE_GROUP_SIZE (slp_node_instance
);
6271 scalar_mode elt_mode
= SCALAR_TYPE_MODE (TREE_TYPE (vectype_out
));
6273 && !can_duplicate_and_interleave_p (group_size
, elt_mode
))
6275 if (dump_enabled_p ())
6276 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6277 "unsupported form of SLP reduction for"
6278 " variable-length vectors: cannot build"
6279 " initial vector.\n");
6282 /* The epilogue code relies on the number of elements being a multiple
6283 of the group size. The duplicate-and-interleave approach to setting
6284 up the the initial vector does too. */
6285 if (!multiple_p (nunits_out
, group_size
))
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6289 "unsupported form of SLP reduction for"
6290 " variable-length vectors: the vector size"
6291 " is not a multiple of the number of results.\n");
6296 /* In case of widenning multiplication by a constant, we update the type
6297 of the constant to be the type of the other operand. We check that the
6298 constant fits the type in the pattern recognition pass. */
6299 if (code
== DOT_PROD_EXPR
6300 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
6301 /* No testcase for this. PR49478. */
6304 if (reduction_type
== COND_REDUCTION
)
6308 if (! max_loop_iterations (loop
, &ni
))
6310 if (dump_enabled_p ())
6311 dump_printf_loc (MSG_NOTE
, vect_location
,
6312 "loop count not known, cannot create cond "
6316 /* Convert backedges to iterations. */
6319 /* The additional index will be the same type as the condition. Check
6320 that the loop can fit into this less one (because we'll use up the
6321 zero slot for when there are no matches). */
6322 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
6323 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
6325 if (dump_enabled_p ())
6326 dump_printf_loc (MSG_NOTE
, vect_location
,
6327 "loop size is greater than data size.\n");
6332 /* In case the vectorization factor (VF) is bigger than the number
6333 of elements that we can fit in a vectype (nunits), we have to generate
6334 more than one vector stmt - i.e - we need to "unroll" the
6335 vector stmt by a factor VF/nunits. For more details see documentation
6336 in vectorizable_operation. */
6338 /* If the reduction is used in an outer loop we need to generate
6339 VF intermediate results, like so (e.g. for ncopies=2):
6344 (i.e. we generate VF results in 2 registers).
6345 In this case we have a separate def-use cycle for each copy, and therefore
6346 for each copy we get the vector def for the reduction variable from the
6347 respective phi node created for this copy.
6349 Otherwise (the reduction is unused in the loop nest), we can combine
6350 together intermediate results, like so (e.g. for ncopies=2):
6354 (i.e. we generate VF/2 results in a single register).
6355 In this case for each copy we get the vector def for the reduction variable
6356 from the vectorized reduction operation generated in the previous iteration.
6358 This only works when we see both the reduction PHI and its only consumer
6359 in vectorizable_reduction and there are no intermediate stmts
6361 stmt_vec_info use_stmt_info
;
6362 tree reduc_phi_result
= gimple_phi_result (reduc_def_phi
);
6364 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
6365 && (use_stmt_info
= loop_vinfo
->lookup_single_use (reduc_phi_result
))
6366 && (!STMT_VINFO_IN_PATTERN_P (use_stmt_info
)
6367 || !STMT_VINFO_PATTERN_DEF_SEQ (use_stmt_info
))
6368 && vect_stmt_to_vectorize (use_stmt_info
) == stmt_info
)
6369 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
) = single_defuse_cycle
= true;
6371 if (single_defuse_cycle
6372 || code
== DOT_PROD_EXPR
6373 || code
== WIDEN_SUM_EXPR
6374 || code
== SAD_EXPR
)
6376 gcc_assert (code
!= COND_EXPR
);
6378 /* 4. Supportable by target? */
6380 /* 4.1. check support for the operation in the loop */
6381 optab optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
6384 if (dump_enabled_p ())
6385 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6391 machine_mode vec_mode
= TYPE_MODE (vectype_in
);
6392 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
6394 if (dump_enabled_p ())
6395 dump_printf (MSG_NOTE
, "op not supported by target.\n");
6397 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
6398 || !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
6401 if (dump_enabled_p ())
6402 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
6405 /* Worthwhile without SIMD support? */
6406 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
6407 && !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
6409 if (dump_enabled_p ())
6410 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6411 "not worthwhile without SIMD support.\n");
6417 /* If the reduction stmt is one of the patterns that have lane
6418 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6420 && ! single_defuse_cycle
)
6421 && (code
== DOT_PROD_EXPR
6422 || code
== WIDEN_SUM_EXPR
6423 || code
== SAD_EXPR
))
6425 if (dump_enabled_p ())
6426 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6427 "multi def-use cycle not possible for lane-reducing "
6428 "reduction operation\n");
6433 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
6437 internal_fn cond_fn
= get_conditional_internal_fn (code
);
6438 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
6439 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
6441 vect_model_reduction_cost (stmt_info
, reduc_fn
, reduction_type
, ncopies
,
6443 if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
))
6445 if (reduction_type
!= FOLD_LEFT_REDUCTION
6446 && !mask_by_cond_expr
6447 && (cond_fn
== IFN_LAST
6448 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
6449 OPTIMIZE_FOR_SPEED
)))
6451 if (dump_enabled_p ())
6452 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6453 "can't use a fully-masked loop because no"
6454 " conditional operation is available.\n");
6455 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
6457 else if (reduc_index
== -1)
6459 if (dump_enabled_p ())
6460 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6461 "can't use a fully-masked loop for chained"
6463 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
6466 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
6469 if (dump_enabled_p ()
6470 && reduction_type
== FOLD_LEFT_REDUCTION
)
6471 dump_printf_loc (MSG_NOTE
, vect_location
,
6472 "using an in-order (fold-left) reduction.\n");
6473 STMT_VINFO_TYPE (orig_stmt_of_analysis
) = cycle_phi_info_type
;
6474 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6475 reductions go through their own vectorizable_* routines. */
6476 if (!single_defuse_cycle
6477 && code
!= DOT_PROD_EXPR
6478 && code
!= WIDEN_SUM_EXPR
6480 && reduction_type
!= FOLD_LEFT_REDUCTION
)
6482 STMT_VINFO_DEF_TYPE (stmt_info
) = vect_internal_def
;
6483 STMT_VINFO_DEF_TYPE (vect_orig_stmt (stmt_info
)) = vect_internal_def
;
6488 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6492 vect_transform_reduction (stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
6493 stmt_vec_info
*vec_stmt
, slp_tree slp_node
)
6495 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6496 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6497 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6503 stmt_vec_info reduc_info
= info_for_reduction (stmt_info
);
6504 gcc_assert (reduc_info
->is_reduc_info
);
6506 if (nested_in_vect_loop_p (loop
, stmt_info
))
6509 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info
) == vect_double_reduction_def
);
6512 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
6513 enum tree_code code
= gimple_assign_rhs_code (stmt
);
6514 int op_type
= TREE_CODE_LENGTH (code
);
6518 switch (get_gimple_rhs_class (code
))
6520 case GIMPLE_TERNARY_RHS
:
6521 ops
[2] = gimple_assign_rhs3 (stmt
);
6523 case GIMPLE_BINARY_RHS
:
6524 ops
[0] = gimple_assign_rhs1 (stmt
);
6525 ops
[1] = gimple_assign_rhs2 (stmt
);
6531 /* All uses but the last are expected to be defined in the loop.
6532 The last use is the reduction variable. In case of nested cycle this
6533 assumption is not true: we use reduc_index to record the index of the
6534 reduction variable. */
6535 stmt_vec_info phi_info
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
));
6536 gphi
*reduc_def_phi
= as_a
<gphi
*> (phi_info
->stmt
);
6537 int reduc_index
= STMT_VINFO_REDUC_IDX (reduc_info
);
6538 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
6543 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
6547 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6551 internal_fn cond_fn
= get_conditional_internal_fn (code
);
6552 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
6553 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
6556 stmt_vec_info new_stmt_info
= NULL
;
6557 stmt_vec_info prev_stmt_info
;
6558 tree new_temp
= NULL_TREE
;
6559 auto_vec
<tree
> vec_oprnds0
;
6560 auto_vec
<tree
> vec_oprnds1
;
6561 auto_vec
<tree
> vec_oprnds2
;
6564 if (dump_enabled_p ())
6565 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
6567 /* FORNOW: Multiple types are not supported for condition. */
6568 if (code
== COND_EXPR
)
6569 gcc_assert (ncopies
== 1);
6571 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
6573 vect_reduction_type reduction_type
= STMT_VINFO_REDUC_TYPE (reduc_info
);
6574 if (reduction_type
== FOLD_LEFT_REDUCTION
)
6576 internal_fn reduc_fn
= STMT_VINFO_REDUC_FN (reduc_info
);
6577 return vectorize_fold_left_reduction
6578 (stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
, code
,
6579 reduc_fn
, ops
, vectype_in
, reduc_index
, masks
);
6582 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
6583 gcc_assert (single_defuse_cycle
6584 || code
== DOT_PROD_EXPR
6585 || code
== WIDEN_SUM_EXPR
6586 || code
== SAD_EXPR
);
6588 /* Create the destination vector */
6589 tree scalar_dest
= gimple_assign_lhs (stmt
);
6590 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
6592 prev_stmt_info
= NULL
;
6595 vec_oprnds0
.create (1);
6596 vec_oprnds1
.create (1);
6597 if (op_type
== ternary_op
)
6598 vec_oprnds2
.create (1);
6601 for (j
= 0; j
< ncopies
; j
++)
6608 /* Get vec defs for all the operands except the reduction index,
6609 ensuring the ordering of the ops in the vector is kept. */
6610 auto_vec
<tree
, 3> slp_ops
;
6611 auto_vec
<vec
<tree
>, 3> vec_defs
;
6613 slp_ops
.quick_push (ops
[0]);
6614 slp_ops
.quick_push (ops
[1]);
6615 if (op_type
== ternary_op
)
6616 slp_ops
.quick_push (ops
[2]);
6618 vect_get_slp_defs (slp_ops
, slp_node
, &vec_defs
);
6620 vec_oprnds0
.safe_splice (vec_defs
[0]);
6621 vec_defs
[0].release ();
6622 vec_oprnds1
.safe_splice (vec_defs
[1]);
6623 vec_defs
[1].release ();
6624 if (op_type
== ternary_op
)
6626 vec_oprnds2
.safe_splice (vec_defs
[2]);
6627 vec_defs
[2].release ();
6632 vec_oprnds0
.quick_push
6633 (vect_get_vec_def_for_operand (ops
[0], stmt_info
));
6634 vec_oprnds1
.quick_push
6635 (vect_get_vec_def_for_operand (ops
[1], stmt_info
));
6636 if (op_type
== ternary_op
)
6637 vec_oprnds2
.quick_push
6638 (vect_get_vec_def_for_operand (ops
[2], stmt_info
));
6645 gcc_assert (reduc_index
!= -1 || ! single_defuse_cycle
);
6647 if (single_defuse_cycle
&& reduc_index
== 0)
6648 vec_oprnds0
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
6651 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
6653 if (single_defuse_cycle
&& reduc_index
== 1)
6654 vec_oprnds1
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
6657 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
6659 if (op_type
== ternary_op
)
6661 if (single_defuse_cycle
&& reduc_index
== 2)
6662 vec_oprnds2
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
6665 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
6671 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
6673 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
6674 if (masked_loop_p
&& !mask_by_cond_expr
)
6676 /* Make sure that the reduction accumulator is vop[0]. */
6677 if (reduc_index
== 1)
6679 gcc_assert (commutative_tree_code (code
));
6680 std::swap (vop
[0], vop
[1]);
6682 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
6683 vectype_in
, i
* ncopies
+ j
);
6684 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
6687 new_temp
= make_ssa_name (vec_dest
, call
);
6688 gimple_call_set_lhs (call
, new_temp
);
6689 gimple_call_set_nothrow (call
, true);
6691 = vect_finish_stmt_generation (stmt_info
, call
, gsi
);
6695 if (op_type
== ternary_op
)
6696 vop
[2] = vec_oprnds2
[i
];
6698 if (masked_loop_p
&& mask_by_cond_expr
)
6700 tree mask
= vect_get_loop_mask (gsi
, masks
,
6702 vectype_in
, i
* ncopies
+ j
);
6703 build_vect_cond_expr (code
, vop
, mask
, gsi
);
6706 gassign
*new_stmt
= gimple_build_assign (vec_dest
, code
,
6707 vop
[0], vop
[1], vop
[2]);
6708 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
6709 gimple_assign_set_lhs (new_stmt
, new_temp
);
6711 = vect_finish_stmt_generation (stmt_info
, new_stmt
, gsi
);
6715 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt_info
);
6718 if (slp_node
|| single_defuse_cycle
)
6722 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt_info
;
6724 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt_info
;
6726 prev_stmt_info
= new_stmt_info
;
6729 if (single_defuse_cycle
&& !slp_node
)
6730 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt_info
;
6735 /* Transform phase of a cycle PHI. */
6738 vect_transform_cycle_phi (stmt_vec_info stmt_info
, stmt_vec_info
*vec_stmt
,
6739 slp_tree slp_node
, slp_instance slp_node_instance
)
6741 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6742 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6743 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6746 stmt_vec_info prev_phi_info
;
6748 bool nested_cycle
= false;
6751 if (nested_in_vect_loop_p (loop
, stmt_info
))
6754 nested_cycle
= true;
6757 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
6758 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
6759 stmt_vec_info reduc_info
= info_for_reduction (stmt_info
);
6760 gcc_assert (reduc_info
->is_reduc_info
);
6762 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
6763 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
6764 /* Leave the scalar phi in place. */
6767 tree vectype_in
= STMT_VINFO_REDUC_VECTYPE_IN (reduc_info
);
6768 /* For a nested cycle we do not fill the above. */
6770 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
6771 gcc_assert (vectype_in
);
6775 /* The size vect_schedule_slp_instance computes is off for us. */
6776 vec_num
= vect_get_num_vectors
6777 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6778 * SLP_TREE_SCALAR_STMTS (slp_node
).length (), vectype_in
);
6784 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6787 /* Check whether we should use a single PHI node and accumulate
6788 vectors to one before the backedge. */
6789 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
6792 /* Create the destination vector */
6793 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
6794 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
6797 /* Get the loop-entry arguments. */
6798 tree vec_initial_def
;
6799 auto_vec
<tree
> vec_initial_defs
;
6802 vec_initial_defs
.reserve (vec_num
);
6803 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
6804 stmt_vec_info first
= REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
);
6806 = neutral_op_for_slp_reduction (slp_node
,
6807 STMT_VINFO_REDUC_CODE (reduc_info
),
6809 get_initial_defs_for_reduction (slp_node_instance
->reduc_phis
,
6810 &vec_initial_defs
, vec_num
,
6811 first
!= NULL
, neutral_op
);
6815 /* Get at the scalar def before the loop, that defines the initial
6816 value of the reduction variable. */
6817 tree initial_def
= PHI_ARG_DEF_FROM_EDGE (phi
,
6818 loop_preheader_edge (loop
));
6819 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
6820 and we can't use zero for induc_val, use initial_def. Similarly
6821 for REDUC_MIN and initial_def larger than the base. */
6822 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
6824 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
6825 if (TREE_CODE (initial_def
) == INTEGER_CST
6826 && !integer_zerop (induc_val
)
6827 && (((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info
) == MAX_EXPR
)
6828 && tree_int_cst_lt (initial_def
, induc_val
))
6829 || ((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info
) == MIN_EXPR
)
6830 && tree_int_cst_lt (induc_val
, initial_def
))))
6832 induc_val
= initial_def
;
6833 /* Communicate we used the initial_def to epilouge
6835 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
6837 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
6839 else if (nested_cycle
)
6841 /* Do not use an adjustment def as that case is not supported
6842 correctly if ncopies is not one. */
6843 vec_initial_def
= vect_get_vec_def_for_operand (initial_def
,
6848 tree adjustment_def
= NULL_TREE
;
6849 tree
*adjustment_defp
= &adjustment_def
;
6850 enum tree_code code
= STMT_VINFO_REDUC_CODE (reduc_info
);
6851 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
)
6852 adjustment_defp
= NULL
;
6854 = get_initial_def_for_reduction (reduc_stmt_info
, code
,
6855 initial_def
, adjustment_defp
);
6856 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
) = adjustment_def
;
6858 vec_initial_defs
.create (1);
6859 vec_initial_defs
.quick_push (vec_initial_def
);
6862 /* Generate the reduction PHIs upfront. */
6863 prev_phi_info
= NULL
;
6864 for (i
= 0; i
< vec_num
; i
++)
6866 tree vec_init_def
= vec_initial_defs
[i
];
6867 for (j
= 0; j
< ncopies
; j
++)
6869 /* Create the reduction-phi that defines the reduction
6871 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
6872 stmt_vec_info new_phi_info
= loop_vinfo
->add_stmt (new_phi
);
6874 /* Set the loop-entry arg of the reduction-phi. */
6875 if (j
!= 0 && nested_cycle
)
6876 vec_init_def
= vect_get_vec_def_for_stmt_copy (loop_vinfo
,
6878 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
6881 /* The loop-latch arg is set in epilogue processing. */
6884 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi_info
);
6888 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_phi_info
;
6890 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi_info
;
6891 prev_phi_info
= new_phi_info
;
6899 /* Vectorizes LC PHIs. */
6902 vectorizable_lc_phi (stmt_vec_info stmt_info
, stmt_vec_info
*vec_stmt
,
6905 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6907 || !is_a
<gphi
*> (stmt_info
->stmt
)
6908 || gimple_phi_num_args (stmt_info
->stmt
) != 1)
6911 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_internal_def
6912 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
6915 if (!vec_stmt
) /* transformation not required. */
6917 STMT_VINFO_TYPE (stmt_info
) = lc_phi_info_type
;
6921 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6922 tree scalar_dest
= gimple_phi_result (stmt_info
->stmt
);
6923 basic_block bb
= gimple_bb (stmt_info
->stmt
);
6924 edge e
= single_pred_edge (bb
);
6925 tree vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
6926 vec
<tree
> vec_oprnds
= vNULL
;
6927 vect_get_vec_defs (gimple_phi_arg_def (stmt_info
->stmt
, 0), NULL_TREE
,
6928 stmt_info
, &vec_oprnds
, NULL
, slp_node
);
6931 unsigned vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
6932 gcc_assert (vec_oprnds
.length () == vec_num
);
6933 for (unsigned i
= 0; i
< vec_num
; i
++)
6935 /* Create the vectorized LC PHI node. */
6936 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
6937 add_phi_arg (new_phi
, vec_oprnds
[i
], e
, UNKNOWN_LOCATION
);
6938 stmt_vec_info new_phi_info
= loop_vinfo
->add_stmt (new_phi
);
6939 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi_info
);
6944 unsigned ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
6945 stmt_vec_info prev_phi_info
= NULL
;
6946 for (unsigned i
= 0; i
< ncopies
; i
++)
6949 vect_get_vec_defs_for_stmt_copy (loop_vinfo
, &vec_oprnds
, NULL
);
6950 /* Create the vectorized LC PHI node. */
6951 gphi
*new_phi
= create_phi_node (vec_dest
, bb
);
6952 add_phi_arg (new_phi
, vec_oprnds
[0], e
, UNKNOWN_LOCATION
);
6953 stmt_vec_info new_phi_info
= loop_vinfo
->add_stmt (new_phi
);
6955 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_phi_info
;
6957 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi_info
;
6958 prev_phi_info
= new_phi_info
;
6961 vec_oprnds
.release ();
6967 /* Function vect_min_worthwhile_factor.
6969 For a loop where we could vectorize the operation indicated by CODE,
6970 return the minimum vectorization factor that makes it worthwhile
6971 to use generic vectors. */
6973 vect_min_worthwhile_factor (enum tree_code code
)
6993 /* Return true if VINFO indicates we are doing loop vectorization and if
6994 it is worth decomposing CODE operations into scalar operations for
6995 that loop's vectorization factor. */
6998 vect_worthwhile_without_simd_p (vec_info
*vinfo
, tree_code code
)
7000 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
7001 unsigned HOST_WIDE_INT value
;
7003 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&value
)
7004 && value
>= vect_min_worthwhile_factor (code
));
7007 /* Function vectorizable_induction
7009 Check if STMT_INFO performs an induction computation that can be vectorized.
7010 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7011 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7012 Return true if STMT_INFO is vectorizable in this way. */
7015 vectorizable_induction (stmt_vec_info stmt_info
,
7016 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
7017 stmt_vec_info
*vec_stmt
, slp_tree slp_node
,
7018 stmt_vector_for_cost
*cost_vec
)
7020 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
7021 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7023 bool nested_in_vect_loop
= false;
7024 class loop
*iv_loop
;
7026 edge pe
= loop_preheader_edge (loop
);
7028 tree new_vec
, vec_init
, vec_step
, t
;
7031 gphi
*induction_phi
;
7032 tree induc_def
, vec_dest
;
7033 tree init_expr
, step_expr
;
7034 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
7038 imm_use_iterator imm_iter
;
7039 use_operand_p use_p
;
7043 gimple_stmt_iterator si
;
7045 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
7049 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7052 /* Make sure it was recognized as induction computation. */
7053 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
7056 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7057 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7062 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7063 gcc_assert (ncopies
>= 1);
7065 /* FORNOW. These restrictions should be relaxed. */
7066 if (nested_in_vect_loop_p (loop
, stmt_info
))
7068 imm_use_iterator imm_iter
;
7069 use_operand_p use_p
;
7076 if (dump_enabled_p ())
7077 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7078 "multiple types in nested loop.\n");
7082 /* FORNOW: outer loop induction with SLP not supported. */
7083 if (STMT_SLP_TYPE (stmt_info
))
7087 latch_e
= loop_latch_edge (loop
->inner
);
7088 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7089 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7091 gimple
*use_stmt
= USE_STMT (use_p
);
7092 if (is_gimple_debug (use_stmt
))
7095 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
7097 exit_phi
= use_stmt
;
7103 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
7104 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
7105 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7109 "inner-loop induction only used outside "
7110 "of the outer vectorized loop.\n");
7115 nested_in_vect_loop
= true;
7116 iv_loop
= loop
->inner
;
7120 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
7122 if (slp_node
&& !nunits
.is_constant ())
7124 /* The current SLP code creates the initial value element-by-element. */
7125 if (dump_enabled_p ())
7126 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7127 "SLP induction not supported for variable-length"
7132 if (!vec_stmt
) /* transformation not required. */
7134 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
7135 DUMP_VECT_SCOPE ("vectorizable_induction");
7136 vect_model_induction_cost (stmt_info
, ncopies
, cost_vec
);
7142 /* Compute a vector variable, initialized with the first VF values of
7143 the induction variable. E.g., for an iv with IV_PHI='X' and
7144 evolution S, for a vector of 4 units, we want to compute:
7145 [X, X + S, X + 2*S, X + 3*S]. */
7147 if (dump_enabled_p ())
7148 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
7150 latch_e
= loop_latch_edge (iv_loop
);
7151 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7153 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
7154 gcc_assert (step_expr
!= NULL_TREE
);
7155 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
7157 pe
= loop_preheader_edge (iv_loop
);
7158 init_expr
= PHI_ARG_DEF_FROM_EDGE (phi
,
7159 loop_preheader_edge (iv_loop
));
7162 if (!nested_in_vect_loop
)
7164 /* Convert the initial value to the IV update type. */
7165 tree new_type
= TREE_TYPE (step_expr
);
7166 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
7168 /* If we are using the loop mask to "peel" for alignment then we need
7169 to adjust the start value here. */
7170 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
7171 if (skip_niters
!= NULL_TREE
)
7173 if (FLOAT_TYPE_P (vectype
))
7174 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
7177 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
7178 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
7179 skip_niters
, step_expr
);
7180 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
7181 init_expr
, skip_step
);
7187 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7188 gcc_assert (!new_bb
);
7191 /* Find the first insertion point in the BB. */
7192 basic_block bb
= gimple_bb (phi
);
7193 si
= gsi_after_labels (bb
);
7195 /* For SLP induction we have to generate several IVs as for example
7196 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7197 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7198 [VF*S, VF*S, VF*S, VF*S] for all. */
7201 /* Enforced above. */
7202 unsigned int const_nunits
= nunits
.to_constant ();
7204 /* Generate [VF*S, VF*S, ... ]. */
7205 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7207 expr
= build_int_cst (integer_type_node
, vf
);
7208 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7211 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
7212 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7214 if (! CONSTANT_CLASS_P (new_name
))
7215 new_name
= vect_init_vector (stmt_info
, new_name
,
7216 TREE_TYPE (step_expr
), NULL
);
7217 new_vec
= build_vector_from_val (step_vectype
, new_name
);
7218 vec_step
= vect_init_vector (stmt_info
, new_vec
, step_vectype
, NULL
);
7220 /* Now generate the IVs. */
7221 unsigned group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7222 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7223 unsigned elts
= const_nunits
* nvects
;
7224 unsigned nivs
= least_common_multiple (group_size
,
7225 const_nunits
) / const_nunits
;
7226 gcc_assert (elts
% group_size
== 0);
7227 tree elt
= init_expr
;
7229 for (ivn
= 0; ivn
< nivs
; ++ivn
)
7231 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
7233 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
7235 if (ivn
*const_nunits
+ eltn
>= group_size
7236 && (ivn
* const_nunits
+ eltn
) % group_size
== 0)
7237 elt
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (elt
),
7239 elts
.quick_push (elt
);
7241 vec_init
= gimple_build_vector (&stmts
, &elts
);
7242 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
7245 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7246 gcc_assert (!new_bb
);
7249 /* Create the induction-phi that defines the induction-operand. */
7250 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7251 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7252 stmt_vec_info induction_phi_info
7253 = loop_vinfo
->add_stmt (induction_phi
);
7254 induc_def
= PHI_RESULT (induction_phi
);
7256 /* Create the iv update inside the loop */
7257 gimple_seq stmts
= NULL
;
7258 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
7259 vec_def
= gimple_build (&stmts
,
7260 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
7261 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
7262 loop_vinfo
->add_stmt (SSA_NAME_DEF_STMT (vec_def
));
7263 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
7265 /* Set the arguments of the phi node: */
7266 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
7267 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
7270 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi_info
);
7273 /* Re-use IVs when we can. */
7277 = least_common_multiple (group_size
, const_nunits
) / group_size
;
7278 /* Generate [VF'*S, VF'*S, ... ]. */
7279 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7281 expr
= build_int_cst (integer_type_node
, vfp
);
7282 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7285 expr
= build_int_cst (TREE_TYPE (step_expr
), vfp
);
7286 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7288 if (! CONSTANT_CLASS_P (new_name
))
7289 new_name
= vect_init_vector (stmt_info
, new_name
,
7290 TREE_TYPE (step_expr
), NULL
);
7291 new_vec
= build_vector_from_val (step_vectype
, new_name
);
7292 vec_step
= vect_init_vector (stmt_info
, new_vec
, step_vectype
, NULL
);
7293 for (; ivn
< nvects
; ++ivn
)
7295 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
]->stmt
;
7297 if (gimple_code (iv
) == GIMPLE_PHI
)
7298 def
= gimple_phi_result (iv
);
7300 def
= gimple_assign_lhs (iv
);
7301 gimple_seq stmts
= NULL
;
7302 def
= gimple_convert (&stmts
, step_vectype
, def
);
7303 def
= gimple_build (&stmts
,
7304 PLUS_EXPR
, step_vectype
, def
, vec_step
);
7305 def
= gimple_convert (&stmts
, vectype
, def
);
7306 if (gimple_code (iv
) == GIMPLE_PHI
)
7307 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
7310 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
7311 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
7313 SLP_TREE_VEC_STMTS (slp_node
).quick_push
7314 (loop_vinfo
->add_stmt (SSA_NAME_DEF_STMT (def
)));
7321 /* Create the vector that holds the initial_value of the induction. */
7322 if (nested_in_vect_loop
)
7324 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7325 been created during vectorization of previous stmts. We obtain it
7326 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7327 vec_init
= vect_get_vec_def_for_operand (init_expr
, stmt_info
);
7328 /* If the initial value is not of proper type, convert it. */
7329 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
7332 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
7336 build1 (VIEW_CONVERT_EXPR
, vectype
,
7338 vec_init
= gimple_assign_lhs (new_stmt
);
7339 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
7341 gcc_assert (!new_bb
);
7342 loop_vinfo
->add_stmt (new_stmt
);
7347 /* iv_loop is the loop to be vectorized. Create:
7348 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7350 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
7352 unsigned HOST_WIDE_INT const_nunits
;
7353 if (nunits
.is_constant (&const_nunits
))
7355 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
7356 elts
.quick_push (new_name
);
7357 for (i
= 1; i
< const_nunits
; i
++)
7359 /* Create: new_name_i = new_name + step_expr */
7360 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
7361 new_name
, step_expr
);
7362 elts
.quick_push (new_name
);
7364 /* Create a vector from [new_name_0, new_name_1, ...,
7365 new_name_nunits-1] */
7366 vec_init
= gimple_build_vector (&stmts
, &elts
);
7368 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
7369 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7370 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
7371 new_name
, step_expr
);
7375 [base, base, base, ...]
7376 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7377 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
7378 gcc_assert (flag_associative_math
);
7379 tree index
= build_index_vector (step_vectype
, 0, 1);
7380 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
7382 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
7384 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
7385 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
7386 vec_init
, step_vec
);
7387 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
7388 vec_init
, base_vec
);
7390 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
7394 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7395 gcc_assert (!new_bb
);
7400 /* Create the vector that holds the step of the induction. */
7401 if (nested_in_vect_loop
)
7402 /* iv_loop is nested in the loop to be vectorized. Generate:
7403 vec_step = [S, S, S, S] */
7404 new_name
= step_expr
;
7407 /* iv_loop is the loop to be vectorized. Generate:
7408 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7409 gimple_seq seq
= NULL
;
7410 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7412 expr
= build_int_cst (integer_type_node
, vf
);
7413 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
7416 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
7417 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
7421 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
7422 gcc_assert (!new_bb
);
7426 t
= unshare_expr (new_name
);
7427 gcc_assert (CONSTANT_CLASS_P (new_name
)
7428 || TREE_CODE (new_name
) == SSA_NAME
);
7429 new_vec
= build_vector_from_val (step_vectype
, t
);
7430 vec_step
= vect_init_vector (stmt_info
, new_vec
, step_vectype
, NULL
);
7433 /* Create the following def-use cycle:
7438 vec_iv = PHI <vec_init, vec_loop>
7442 vec_loop = vec_iv + vec_step; */
7444 /* Create the induction-phi that defines the induction-operand. */
7445 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7446 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7447 stmt_vec_info induction_phi_info
= loop_vinfo
->add_stmt (induction_phi
);
7448 induc_def
= PHI_RESULT (induction_phi
);
7450 /* Create the iv update inside the loop */
7452 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
7453 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
7454 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
7455 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
7456 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
7457 stmt_vec_info new_stmt_info
= loop_vinfo
->add_stmt (new_stmt
);
7459 /* Set the arguments of the phi node: */
7460 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
7461 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
7464 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= induction_phi_info
;
7466 /* In case that vectorization factor (VF) is bigger than the number
7467 of elements that we can fit in a vectype (nunits), we have to generate
7468 more than one vector stmt - i.e - we need to "unroll" the
7469 vector stmt by a factor VF/nunits. For more details see documentation
7470 in vectorizable_operation. */
7474 gimple_seq seq
= NULL
;
7475 stmt_vec_info prev_stmt_vinfo
;
7476 /* FORNOW. This restriction should be relaxed. */
7477 gcc_assert (!nested_in_vect_loop
);
7479 /* Create the vector that holds the step of the induction. */
7480 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7482 expr
= build_int_cst (integer_type_node
, nunits
);
7483 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
7486 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
7487 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
7491 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
7492 gcc_assert (!new_bb
);
7495 t
= unshare_expr (new_name
);
7496 gcc_assert (CONSTANT_CLASS_P (new_name
)
7497 || TREE_CODE (new_name
) == SSA_NAME
);
7498 new_vec
= build_vector_from_val (step_vectype
, t
);
7499 vec_step
= vect_init_vector (stmt_info
, new_vec
, step_vectype
, NULL
);
7501 vec_def
= induc_def
;
7502 prev_stmt_vinfo
= induction_phi_info
;
7503 for (i
= 1; i
< ncopies
; i
++)
7505 /* vec_i = vec_prev + vec_step */
7506 gimple_seq stmts
= NULL
;
7507 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
7508 vec_def
= gimple_build (&stmts
,
7509 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
7510 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
7512 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
7513 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
7514 new_stmt_info
= loop_vinfo
->add_stmt (new_stmt
);
7515 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt_info
;
7516 prev_stmt_vinfo
= new_stmt_info
;
7520 if (nested_in_vect_loop
)
7522 /* Find the loop-closed exit-phi of the induction, and record
7523 the final vector of induction results: */
7525 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7527 gimple
*use_stmt
= USE_STMT (use_p
);
7528 if (is_gimple_debug (use_stmt
))
7531 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (use_stmt
)))
7533 exit_phi
= use_stmt
;
7539 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
7540 /* FORNOW. Currently not supporting the case that an inner-loop induction
7541 is not used in the outer-loop (i.e. only outside the outer-loop). */
7542 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
7543 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
7545 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt_info
;
7546 if (dump_enabled_p ())
7547 dump_printf_loc (MSG_NOTE
, vect_location
,
7548 "vector of inductions after inner-loop:%G",
7554 if (dump_enabled_p ())
7555 dump_printf_loc (MSG_NOTE
, vect_location
,
7556 "transform induction: created def-use cycle: %G%G",
7557 induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
7562 /* Function vectorizable_live_operation.
7564 STMT_INFO computes a value that is used outside the loop. Check if
7565 it can be supported. */
/* NOTE(review): this region is a lossy extraction -- statements are split
   across physical lines, the original file's line numbers are fused into
   the text, and several lines (braces, returns, else-arms) are missing
   entirely.  The comments below annotate only the logic that is visible;
   restore the block from the upstream file before attempting to compile.  */
7568 vectorizable_live_operation (stmt_vec_info stmt_info
,
7569 gimple_stmt_iterator
*gsi
,
7570 slp_tree slp_node
, slp_instance slp_node_instance
,
7571 int slp_index
, bool vec_stmt_p
,
7572 stmt_vector_for_cost
*)
7574 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
7575 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7576 imm_use_iterator imm_iter
;
7577 tree lhs
, lhs_type
, bitsize
, vec_bitsize
;
7578 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7579 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7582 auto_vec
<tree
> vec_oprnds
;
7584 poly_uint64 vec_index
= 0;
/* Only statements marked live (used after the loop) reach here.  */
7586 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
7588 /* The last stmt of a reduction is live and vectorized via
7589 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7590 validity so just trigger the transform here. */
7591 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
7597 /* For reduction chains the meta-info is attached to
7598 the group leader. */
7599 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
7600 stmt_info
= REDUC_GROUP_FIRST_ELEMENT (stmt_info
);
7601 /* For SLP reductions we vectorize the epilogue for
7602 all involved stmts together. */
7603 else if (slp_index
!= 0)
7606 stmt_vec_info reduc_info
= info_for_reduction (stmt_info
);
7607 gcc_assert (reduc_info
->is_reduc_info
);
7608 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
7609 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
7611 vect_create_epilog_for_reduction (stmt_info
, slp_node
,
7616 /* FORNOW. CHECKME. */
7617 if (nested_in_vect_loop_p (loop
, stmt_info
))
7620 /* If STMT is not relevant and it is a simple assignment and its inputs are
7621 invariant then it can remain in place, unvectorized. The original last
7622 scalar value that it computes will be used. */
7623 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7625 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
7626 if (dump_enabled_p ())
7627 dump_printf_loc (MSG_NOTE
, vect_location
,
7628 "statement is simple and uses invariant. Leaving in "
/* Non-SLP path: number of vector copies needed per scalar iteration.  */
7636 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7640 gcc_assert (slp_index
>= 0);
7642 int num_scalar
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7643 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7645 /* Get the last occurrence of the scalar index from the concatenation of
7646 all the slp vectors. Calculate which slp vector it is and the index
7648 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
7650 /* Calculate which vector contains the result, and which lane of
7651 that vector we need. */
7652 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
7654 if (dump_enabled_p ())
7655 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7656 "Cannot determine which vector holds the"
7657 " final result.\n");
7664 /* No transformation required. */
7665 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
))
/* A fully-masked loop extracts the live value via IFN_EXTRACT_LAST;
   that needs target support, no SLP and a single copy, otherwise
   full masking is abandoned for this loop.  */
7667 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
7668 OPTIMIZE_FOR_SPEED
))
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7672 "can't use a fully-masked loop because "
7673 "the target doesn't support extract last "
7675 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7679 if (dump_enabled_p ())
7680 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7681 "can't use a fully-masked loop because an "
7682 "SLP statement is live after the loop.\n")
;
7683 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7685 else if (ncopies
> 1)
7687 if (dump_enabled_p ())
7688 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7689 "can't use a fully-masked loop because"
7690 " ncopies is greater than 1.\n");
7691 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7695 gcc_assert (ncopies
== 1 && !slp_node
);
7696 vect_record_loop_mask (loop_vinfo
,
7697 &LOOP_VINFO_MASKS (loop_vinfo
),
/* Transform phase from here on.  */
7704 /* Use the lhs of the original scalar statement. */
7705 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
7707 lhs
= (is_a
<gphi
*> (stmt
)) ? gimple_phi_result (stmt
)
7708 : gimple_get_lhs (stmt
);
7709 lhs_type
= TREE_TYPE (lhs
);
/* For boolean vectors the lane width is the mask precision, not the
   TYPE_SIZE of the (possibly padded) element type.  */
7711 bitsize
= (VECTOR_BOOLEAN_TYPE_P (vectype
)
7712 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype
)))
7713 : TYPE_SIZE (TREE_TYPE (vectype
)));
7714 vec_bitsize
= TYPE_SIZE (vectype
);
7716 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7717 tree vec_lhs
, bitstart
;
7720 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
7722 /* Get the correct slp vectorized stmt. */
7723 gimple
*vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
]->stmt
;
7724 if (gphi
*phi
= dyn_cast
<gphi
*> (vec_stmt
))
7725 vec_lhs
= gimple_phi_result (phi
);
7727 vec_lhs
= gimple_get_lhs (vec_stmt
);
7729 /* Get entry to use. */
7730 bitstart
= bitsize_int (vec_index
);
7731 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
/* Non-SLP: take the last copy's def and extract its last lane.  */
7735 enum vect_def_type dt
= STMT_VINFO_DEF_TYPE (stmt_info
);
7736 vec_lhs
= vect_get_vec_def_for_operand_1 (stmt_info
, dt
);
7737 gcc_checking_assert (ncopies
== 1
7738 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
7740 /* For multiple copies, get the last copy. */
7741 for (int i
= 1; i
< ncopies
; ++i
)
7742 vec_lhs
= vect_get_vec_def_for_stmt_copy (loop_vinfo
, vec_lhs
);
7744 /* Get the last lane in the vector. */
7745 bitstart
= int_const_binop (MINUS_EXPR
, vec_bitsize
, bitsize
);
7748 gimple_seq stmts
= NULL
;
7750 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
7754 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7756 where VEC_LHS is the vectorized live-out result and MASK is
7757 the loop mask for the final iteration. */
7758 gcc_assert (ncopies
== 1 && !slp_node
);
7759 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
7760 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
7762 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
,
7763 scalar_type
, mask
, vec_lhs
);
7765 /* Convert the extracted vector element to the required scalar type. */
7766 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
/* Not fully masked: extract the chosen lane with a BIT_FIELD_REF.  */
7770 tree bftype
= TREE_TYPE (vectype
);
7771 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
7772 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
7773 new_tree
= build3 (BIT_FIELD_REF
, bftype
, vec_lhs
, bitsize
, bitstart
);
7774 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
7775 &stmts
, true, NULL_TREE
);
/* Materialize the extraction on the loop's single exit edge.  */
7779 gsi_insert_seq_on_edge_immediate (single_exit (loop
), stmts
);
7781 /* Replace use of lhs with newly computed result. If the use stmt is a
7782 single arg PHI, just replace all uses of PHI result. It's necessary
7783 because lcssa PHI defining lhs may be before newly inserted stmt. */
7784 use_operand_p use_p
;
7785 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
7786 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
7787 && !is_gimple_debug (use_stmt
))
7789 if (gimple_code (use_stmt
) == GIMPLE_PHI
7790 && gimple_phi_num_args (use_stmt
) == 1)
7792 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
7796 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
7797 SET_USE (use_p
, new_tree
);
7799 update_stmt (use_stmt
);
7805 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7808 vect_loop_kill_debug_uses (class loop
*loop
, stmt_vec_info stmt_info
)
7810 ssa_op_iter op_iter
;
7811 imm_use_iterator imm_iter
;
7812 def_operand_p def_p
;
7815 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt_info
->stmt
, op_iter
, SSA_OP_DEF
)
7817 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
7821 if (!is_gimple_debug (ustmt
))
7824 bb
= gimple_bb (ustmt
);
7826 if (!flow_bb_inside_loop_p (loop
, bb
))
7828 if (gimple_debug_bind_p (ustmt
))
7830 if (dump_enabled_p ())
7831 dump_printf_loc (MSG_NOTE
, vect_location
,
7832 "killing debug use\n");
7834 gimple_debug_bind_reset_value (ustmt
);
7835 update_stmt (ustmt
);
7844 /* Given loop represented by LOOP_VINFO, return true if computation of
7845 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7849 loop_niters_no_overflow (loop_vec_info loop_vinfo
)
7851 /* Constant case. */
7852 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
7854 tree cst_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
7855 tree cst_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
7857 gcc_assert (TREE_CODE (cst_niters
) == INTEGER_CST
);
7858 gcc_assert (TREE_CODE (cst_nitersm1
) == INTEGER_CST
);
7859 if (wi::to_widest (cst_nitersm1
) < wi::to_widest (cst_niters
))
7864 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7865 /* Check the upper bound of loop niters. */
7866 if (get_max_loop_iterations (loop
, &max
))
7868 tree type
= TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
));
7869 signop sgn
= TYPE_SIGN (type
);
7870 widest_int type_max
= widest_int::from (wi::max_value (type
), sgn
);
7877 /* Return a mask type with half the number of elements as TYPE. */
7880 vect_halve_mask_nunits (tree type
)
7882 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (type
), 2);
7883 return build_truth_vector_type (nunits
, current_vector_size
);
7886 /* Return a mask type with twice as many elements as TYPE. */
7889 vect_double_mask_nunits (tree type
)
7891 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (type
) * 2;
7892 return build_truth_vector_type (nunits
, current_vector_size
);
7895 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7896 contain a sequence of NVECTORS masks that each control a vector of type
7900 vect_record_loop_mask (loop_vec_info loop_vinfo
, vec_loop_masks
*masks
,
7901 unsigned int nvectors
, tree vectype
)
7903 gcc_assert (nvectors
!= 0);
7904 if (masks
->length () < nvectors
)
7905 masks
->safe_grow_cleared (nvectors
);
7906 rgroup_masks
*rgm
= &(*masks
)[nvectors
- 1];
7907 /* The number of scalars per iteration and the number of vectors are
7908 both compile-time constants. */
7909 unsigned int nscalars_per_iter
7910 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
7911 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
7912 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
7914 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
7915 rgm
->mask_type
= build_same_sized_truth_vector_type (vectype
);
7919 /* Given a complete set of masks MASKS, extract mask number INDEX
7920 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
7921 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
7923 See the comment above vec_loop_masks for more details about the mask
7927 vect_get_loop_mask (gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
7928 unsigned int nvectors
, tree vectype
, unsigned int index
)
7930 rgroup_masks
*rgm
= &(*masks
)[nvectors
- 1];
7931 tree mask_type
= rgm
->mask_type
;
7933 /* Populate the rgroup's mask array, if this is the first time we've
7935 if (rgm
->masks
.is_empty ())
7937 rgm
->masks
.safe_grow_cleared (nvectors
);
7938 for (unsigned int i
= 0; i
< nvectors
; ++i
)
7940 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
7941 /* Provide a dummy definition until the real one is available. */
7942 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
7943 rgm
->masks
[i
] = mask
;
7947 tree mask
= rgm
->masks
[index
];
7948 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
7949 TYPE_VECTOR_SUBPARTS (vectype
)))
7951 /* A loop mask for data type X can be reused for data type Y
7952 if X has N times more elements than Y and if Y's elements
7953 are N times bigger than X's. In this case each sequence
7954 of N elements in the loop mask will be all-zero or all-one.
7955 We can then view-convert the mask so that each sequence of
7956 N elements is replaced by a single element. */
7957 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
7958 TYPE_VECTOR_SUBPARTS (vectype
)));
7959 gimple_seq seq
= NULL
;
7960 mask_type
= build_same_sized_truth_vector_type (vectype
);
7961 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
7963 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
7968 /* Scale profiling counters by estimation for LOOP which is vectorized
7972 scale_profile_for_vect_loop (class loop
*loop
, unsigned vf
)
7974 edge preheader
= loop_preheader_edge (loop
);
7975 /* Reduce loop iterations by the vectorization factor. */
7976 gcov_type new_est_niter
= niter_for_unrolled_loop (loop
, vf
);
7977 profile_count freq_h
= loop
->header
->count
, freq_e
= preheader
->count ();
7979 if (freq_h
.nonzero_p ())
7981 profile_probability p
;
7983 /* Avoid dropping loop body profile counter to 0 because of zero count
7984 in loop's preheader. */
7985 if (!(freq_e
== profile_count::zero ()))
7986 freq_e
= freq_e
.force_nonzero ();
7987 p
= freq_e
.apply_scale (new_est_niter
+ 1, 1).probability_in (freq_h
);
7988 scale_loop_frequencies (loop
, p
);
7991 edge exit_e
= single_exit (loop
);
7992 exit_e
->probability
= profile_probability::always ()
7993 .apply_scale (1, new_est_niter
+ 1);
7995 edge exit_l
= single_pred_edge (loop
->latch
);
7996 profile_probability prob
= exit_l
->probability
;
7997 exit_l
->probability
= exit_e
->probability
.invert ();
7998 if (prob
.initialized_p () && exit_l
->probability
.initialized_p ())
7999 scale_bbs_frequencies (&loop
->latch
, 1, exit_l
->probability
/ prob
);
8002 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8003 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8007 vect_transform_loop_stmt (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
8008 gimple_stmt_iterator
*gsi
, stmt_vec_info
*seen_store
)
8010 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8011 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8013 if (dump_enabled_p ())
8014 dump_printf_loc (MSG_NOTE
, vect_location
,
8015 "------>vectorizing statement: %G", stmt_info
->stmt
);
8017 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
8018 vect_loop_kill_debug_uses (loop
, stmt_info
);
8020 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
8021 && !STMT_VINFO_LIVE_P (stmt_info
))
8024 if (STMT_VINFO_VECTYPE (stmt_info
))
8027 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
8028 if (!STMT_SLP_TYPE (stmt_info
)
8029 && maybe_ne (nunits
, vf
)
8030 && dump_enabled_p ())
8031 /* For SLP VF is set according to unrolling factor, and not
8032 to vector size, hence for SLP this print is not valid. */
8033 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
8036 /* Pure SLP statements have already been vectorized. We still need
8037 to apply loop vectorization to hybrid SLP statements. */
8038 if (PURE_SLP_STMT (stmt_info
))
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
8044 if (vect_transform_stmt (stmt_info
, gsi
, NULL
, NULL
))
8045 *seen_store
= stmt_info
;
8048 /* Function vect_transform_loop.
8050 The analysis phase has determined that the loop is vectorizable.
8051 Vectorize the loop - created vectorized stmts to replace the scalar
8052 stmts in the loop, and update the loop exit condition.
8053 Returns scalar epilogue loop if any. */
8056 vect_transform_loop (loop_vec_info loop_vinfo
)
8058 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8059 class loop
*epilogue
= NULL
;
8060 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
8061 int nbbs
= loop
->num_nodes
;
8063 tree niters_vector
= NULL_TREE
;
8064 tree step_vector
= NULL_TREE
;
8065 tree niters_vector_mult_vf
= NULL_TREE
;
8066 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8067 unsigned int lowest_vf
= constant_lower_bound (vf
);
8069 bool check_profitability
= false;
8072 DUMP_VECT_SCOPE ("vec_transform_loop");
8074 loop_vinfo
->shared
->check_datarefs ();
8076 /* Use the more conservative vectorization threshold. If the number
8077 of iterations is constant assume the cost check has been performed
8078 by our caller. If the threshold makes all loops profitable that
8079 run at least the (estimated) vectorization factor number of times
8080 checking is pointless, too. */
8081 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
8082 if (th
>= vect_vf_for_cost (loop_vinfo
)
8083 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
8085 if (dump_enabled_p ())
8086 dump_printf_loc (MSG_NOTE
, vect_location
,
8087 "Profitability threshold is %d loop iterations.\n",
8089 check_profitability
= true;
8092 /* Make sure there exists a single-predecessor exit bb. Do this before
8094 edge e
= single_exit (loop
);
8095 if (! single_pred_p (e
->dest
))
8097 split_loop_exit_edge (e
, true);
8098 if (dump_enabled_p ())
8099 dump_printf (MSG_NOTE
, "split exit edge\n");
8102 /* Version the loop first, if required, so the profitability check
8105 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
8108 = vect_loop_versioning (loop_vinfo
);
8109 sloop
->force_vectorize
= false;
8110 check_profitability
= false;
8113 /* Make sure there exists a single-predecessor exit bb also on the
8114 scalar loop copy. Do this after versioning but before peeling
8115 so CFG structure is fine for both scalar and if-converted loop
8116 to make slpeel_duplicate_current_defs_from_edges face matched
8117 loop closed PHI nodes on the exit. */
8118 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
8120 e
= single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
));
8121 if (! single_pred_p (e
->dest
))
8123 split_loop_exit_edge (e
, true);
8124 if (dump_enabled_p ())
8125 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
8129 tree niters
= vect_build_loop_niters (loop_vinfo
);
8130 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
8131 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
8132 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
8133 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
8134 &step_vector
, &niters_vector_mult_vf
, th
,
8135 check_profitability
, niters_no_overflow
);
8136 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
)
8137 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
).initialized_p ())
8138 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
),
8139 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo
));
8141 if (niters_vector
== NULL_TREE
)
8143 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
8144 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
8145 && known_eq (lowest_vf
, vf
))
8148 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
8149 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
8150 step_vector
= build_one_cst (TREE_TYPE (niters
));
8153 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
8154 &step_vector
, niters_no_overflow
);
8157 /* 1) Make sure the loop header has exactly two entries
8158 2) Make sure we have a preheader basic block. */
8160 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
8162 split_edge (loop_preheader_edge (loop
));
8164 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
8165 && vect_use_loop_mask_for_alignment_p (loop_vinfo
))
8166 /* This will deal with any possible peeling. */
8167 vect_prepare_for_masked_peels (loop_vinfo
);
8169 /* Schedule the SLP instances first, then handle loop vectorization
8171 if (!loop_vinfo
->slp_instances
.is_empty ())
8173 DUMP_VECT_SCOPE ("scheduling SLP instances");
8174 vect_schedule_slp (loop_vinfo
);
8177 /* FORNOW: the vectorizer supports only loops which body consist
8178 of one basic block (header + empty latch). When the vectorizer will
8179 support more involved loop forms, the order by which the BBs are
8180 traversed need to be reconsidered. */
8182 for (i
= 0; i
< nbbs
; i
++)
8184 basic_block bb
= bbs
[i
];
8185 stmt_vec_info stmt_info
;
8187 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
8190 gphi
*phi
= si
.phi ();
8191 if (dump_enabled_p ())
8192 dump_printf_loc (MSG_NOTE
, vect_location
,
8193 "------>vectorizing phi: %G", phi
);
8194 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
8198 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
8199 vect_loop_kill_debug_uses (loop
, stmt_info
);
8201 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
8202 && !STMT_VINFO_LIVE_P (stmt_info
))
8205 if (STMT_VINFO_VECTYPE (stmt_info
)
8207 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
8208 && dump_enabled_p ())
8209 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
8211 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
8212 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
8213 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_double_reduction_def
8214 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
8215 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
)
8216 && ! PURE_SLP_STMT (stmt_info
))
8218 if (dump_enabled_p ())
8219 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
8220 vect_transform_stmt (stmt_info
, NULL
, NULL
, NULL
);
8224 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
8227 stmt
= gsi_stmt (si
);
8228 /* During vectorization remove existing clobber stmts. */
8229 if (gimple_clobber_p (stmt
))
8231 unlink_stmt_vdef (stmt
);
8232 gsi_remove (&si
, true);
8233 release_defs (stmt
);
8237 stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
8239 /* vector stmts created in the outer-loop during vectorization of
8240 stmts in an inner-loop may not have a stmt_info, and do not
8241 need to be vectorized. */
8242 stmt_vec_info seen_store
= NULL
;
8245 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
8247 gimple
*def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
8248 for (gimple_stmt_iterator subsi
= gsi_start (def_seq
);
8249 !gsi_end_p (subsi
); gsi_next (&subsi
))
8251 stmt_vec_info pat_stmt_info
8252 = loop_vinfo
->lookup_stmt (gsi_stmt (subsi
));
8253 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
8256 stmt_vec_info pat_stmt_info
8257 = STMT_VINFO_RELATED_STMT (stmt_info
);
8258 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
, &si
,
8261 vect_transform_loop_stmt (loop_vinfo
, stmt_info
, &si
,
8267 if (STMT_VINFO_GROUPED_ACCESS (seen_store
))
8268 /* Interleaving. If IS_STORE is TRUE, the
8269 vectorization of the interleaving chain was
8270 completed - free all the stores in the chain. */
8271 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store
));
8273 /* Free the attached stmt_vec_info and remove the stmt. */
8274 loop_vinfo
->remove_stmt (stmt_info
);
8279 /* Stub out scalar statements that must not survive vectorization.
8280 Doing this here helps with grouped statements, or statements that
8281 are involved in patterns. */
8282 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
8283 !gsi_end_p (gsi
); gsi_next (&gsi
))
8285 gcall
*call
= dyn_cast
<gcall
*> (gsi_stmt (gsi
));
8286 if (call
&& gimple_call_internal_p (call
, IFN_MASK_LOAD
))
8288 tree lhs
= gimple_get_lhs (call
);
8289 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
8291 tree zero
= build_zero_cst (TREE_TYPE (lhs
));
8292 gimple
*new_stmt
= gimple_build_assign (lhs
, zero
);
8293 gsi_replace (&gsi
, new_stmt
, true);
8299 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8300 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8301 if (integer_onep (step_vector
))
8302 niters_no_overflow
= true;
8303 vect_set_loop_condition (loop
, loop_vinfo
, niters_vector
, step_vector
,
8304 niters_vector_mult_vf
, !niters_no_overflow
);
8306 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
8307 scale_profile_for_vect_loop (loop
, assumed_vf
);
8309 /* True if the final iteration might not handle a full vector's
8310 worth of scalar iterations. */
8311 bool final_iter_may_be_partial
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
8312 /* The minimum number of iterations performed by the epilogue. This
8313 is 1 when peeling for gaps because we always need a final scalar
8315 int min_epilogue_iters
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
8316 /* +1 to convert latch counts to loop iteration counts,
8317 -min_epilogue_iters to remove iterations that cannot be performed
8318 by the vector code. */
8319 int bias_for_lowest
= 1 - min_epilogue_iters
;
8320 int bias_for_assumed
= bias_for_lowest
;
8321 int alignment_npeels
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
8322 if (alignment_npeels
&& LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8324 /* When the amount of peeling is known at compile time, the first
8325 iteration will have exactly alignment_npeels active elements.
8326 In the worst case it will have at least one. */
8327 int min_first_active
= (alignment_npeels
> 0 ? alignment_npeels
: 1);
8328 bias_for_lowest
+= lowest_vf
- min_first_active
;
8329 bias_for_assumed
+= assumed_vf
- min_first_active
;
8331 /* In these calculations the "- 1" converts loop iteration counts
8332 back to latch counts. */
8333 if (loop
->any_upper_bound
)
8334 loop
->nb_iterations_upper_bound
8335 = (final_iter_may_be_partial
8336 ? wi::udiv_ceil (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
8338 : wi::udiv_floor (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
8340 if (loop
->any_likely_upper_bound
)
8341 loop
->nb_iterations_likely_upper_bound
8342 = (final_iter_may_be_partial
8343 ? wi::udiv_ceil (loop
->nb_iterations_likely_upper_bound
8344 + bias_for_lowest
, lowest_vf
) - 1
8345 : wi::udiv_floor (loop
->nb_iterations_likely_upper_bound
8346 + bias_for_lowest
, lowest_vf
) - 1);
8347 if (loop
->any_estimate
)
8348 loop
->nb_iterations_estimate
8349 = (final_iter_may_be_partial
8350 ? wi::udiv_ceil (loop
->nb_iterations_estimate
+ bias_for_assumed
,
8352 : wi::udiv_floor (loop
->nb_iterations_estimate
+ bias_for_assumed
,
8355 if (dump_enabled_p ())
8357 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
8359 dump_printf_loc (MSG_NOTE
, vect_location
,
8360 "LOOP VECTORIZED\n");
8362 dump_printf_loc (MSG_NOTE
, vect_location
,
8363 "OUTER LOOP VECTORIZED\n");
8364 dump_printf (MSG_NOTE
, "\n");
8368 dump_printf_loc (MSG_NOTE
, vect_location
,
8369 "LOOP EPILOGUE VECTORIZED (VS=");
8370 dump_dec (MSG_NOTE
, current_vector_size
);
8371 dump_printf (MSG_NOTE
, ")\n");
8375 /* Loops vectorized with a variable factor won't benefit from
8376 unrolling/peeling. */
8377 if (!vf
.is_constant ())
8380 if (dump_enabled_p ())
8381 dump_printf_loc (MSG_NOTE
, vect_location
, "Disabling unrolling due to"
8382 " variable-length vectorization factor\n");
8384 /* Free SLP instances here because otherwise stmt reference counting
8386 slp_instance instance
;
8387 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
8388 vect_free_slp_instance (instance
, true);
8389 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
8390 /* Clear-up safelen field since its value is invalid after vectorization
8391 since vectorized loop can have loop-carried dependencies. */
8394 /* Don't vectorize epilogue for epilogue. */
8395 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
8398 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK
))
8403 auto_vector_sizes vector_sizes
;
8404 targetm
.vectorize
.autovectorize_vector_sizes (&vector_sizes
, false);
8405 unsigned int next_size
= 0;
8407 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8408 on niters already ajusted for the iterations of the prologue. */
8409 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
8410 && known_eq (vf
, lowest_vf
))
8412 unsigned HOST_WIDE_INT eiters
8413 = (LOOP_VINFO_INT_NITERS (loop_vinfo
)
8414 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
));
8416 = eiters
% lowest_vf
+ LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
);
8417 epilogue
->nb_iterations_upper_bound
= eiters
- 1;
8418 epilogue
->any_upper_bound
= true;
8421 while (next_size
< vector_sizes
.length ()
8422 && !(constant_multiple_p (current_vector_size
,
8423 vector_sizes
[next_size
], &ratio
)
8424 && eiters
>= lowest_vf
/ ratio
))
8428 while (next_size
< vector_sizes
.length ()
8429 && maybe_lt (current_vector_size
, vector_sizes
[next_size
]))
8432 if (next_size
== vector_sizes
.length ())
8438 epilogue
->force_vectorize
= loop
->force_vectorize
;
8439 epilogue
->safelen
= loop
->safelen
;
8440 epilogue
->dont_vectorize
= false;
8442 /* We may need to if-convert epilogue to vectorize it. */
8443 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
8444 tree_if_conversion (epilogue
);
/* The code below is trying to perform simple optimization - revert
   if-conversion for masked stores, i.e. if the mask of a store is zero
   do not perform it and all stored value producers also if possible.
   For example,
     for (i=0; i<n; i++)
       if (c[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] + 2;
	}
   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/
8474 optimize_mask_stores (class loop
*loop
)
8476 basic_block
*bbs
= get_loop_body (loop
);
8477 unsigned nbbs
= loop
->num_nodes
;
8480 class loop
*bb_loop
;
8481 gimple_stmt_iterator gsi
;
8483 auto_vec
<gimple
*> worklist
;
8484 auto_purge_vect_location sentinel
;
8486 vect_location
= find_loop_location (loop
);
8487 /* Pick up all masked stores in loop if any. */
8488 for (i
= 0; i
< nbbs
; i
++)
8491 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
8494 stmt
= gsi_stmt (gsi
);
8495 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
8496 worklist
.safe_push (stmt
);
8501 if (worklist
.is_empty ())
8504 /* Loop has masked stores. */
8505 while (!worklist
.is_empty ())
8507 gimple
*last
, *last_store
;
8510 basic_block store_bb
, join_bb
;
8511 gimple_stmt_iterator gsi_to
;
8512 tree vdef
, new_vdef
;
8517 last
= worklist
.pop ();
8518 mask
= gimple_call_arg (last
, 2);
8519 bb
= gimple_bb (last
);
8520 /* Create then_bb and if-then structure in CFG, then_bb belongs to
8521 the same loop as if_bb. It could be different to LOOP when two
8522 level loop-nest is vectorized and mask_store belongs to the inner
8524 e
= split_block (bb
, last
);
8525 bb_loop
= bb
->loop_father
;
8526 gcc_assert (loop
== bb_loop
|| flow_loop_nested_p (loop
, bb_loop
));
8528 store_bb
= create_empty_bb (bb
);
8529 add_bb_to_loop (store_bb
, bb_loop
);
8530 e
->flags
= EDGE_TRUE_VALUE
;
8531 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
8532 /* Put STORE_BB to likely part. */
8533 efalse
->probability
= profile_probability::unlikely ();
8534 store_bb
->count
= efalse
->count ();
8535 make_single_succ_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
8536 if (dom_info_available_p (CDI_DOMINATORS
))
8537 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
8538 if (dump_enabled_p ())
8539 dump_printf_loc (MSG_NOTE
, vect_location
,
8540 "Create new block %d to sink mask stores.",
8542 /* Create vector comparison with boolean result. */
8543 vectype
= TREE_TYPE (mask
);
8544 zero
= build_zero_cst (vectype
);
8545 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
8546 gsi
= gsi_last_bb (bb
);
8547 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
8548 /* Create new PHI node for vdef of the last masked store:
8549 .MEM_2 = VDEF <.MEM_1>
8550 will be converted to
8551 .MEM.3 = VDEF <.MEM_1>
8552 and new PHI node will be created in join bb
8553 .MEM_2 = PHI <.MEM_1, .MEM_3>
8555 vdef
= gimple_vdef (last
);
8556 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
8557 gimple_set_vdef (last
, new_vdef
);
8558 phi
= create_phi_node (vdef
, join_bb
);
8559 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
8561 /* Put all masked stores with the same mask to STORE_BB if possible. */
8564 gimple_stmt_iterator gsi_from
;
8565 gimple
*stmt1
= NULL
;
8567 /* Move masked store to STORE_BB. */
8569 gsi
= gsi_for_stmt (last
);
8571 /* Shift GSI to the previous stmt for further traversal. */
8573 gsi_to
= gsi_start_bb (store_bb
);
8574 gsi_move_before (&gsi_from
, &gsi_to
);
8575 /* Setup GSI_TO to the non-empty block start. */
8576 gsi_to
= gsi_start_bb (store_bb
);
8577 if (dump_enabled_p ())
8578 dump_printf_loc (MSG_NOTE
, vect_location
,
8579 "Move stmt to created bb\n%G", last
);
8580 /* Move all stored value producers if possible. */
8581 while (!gsi_end_p (gsi
))
8584 imm_use_iterator imm_iter
;
8585 use_operand_p use_p
;
8588 /* Skip debug statements. */
8589 if (is_gimple_debug (gsi_stmt (gsi
)))
8594 stmt1
= gsi_stmt (gsi
);
8595 /* Do not consider statements writing to memory or having
8596 volatile operand. */
8597 if (gimple_vdef (stmt1
)
8598 || gimple_has_volatile_ops (stmt1
))
8602 lhs
= gimple_get_lhs (stmt1
);
8606 /* LHS of vectorized stmt must be SSA_NAME. */
8607 if (TREE_CODE (lhs
) != SSA_NAME
)
8610 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
8612 /* Remove dead scalar statement. */
8613 if (has_zero_uses (lhs
))
8615 gsi_remove (&gsi_from
, true);
8620 /* Check that LHS does not have uses outside of STORE_BB. */
8622 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
8625 use_stmt
= USE_STMT (use_p
);
8626 if (is_gimple_debug (use_stmt
))
8628 if (gimple_bb (use_stmt
) != store_bb
)
8637 if (gimple_vuse (stmt1
)
8638 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
8641 /* Can move STMT1 to STORE_BB. */
8642 if (dump_enabled_p ())
8643 dump_printf_loc (MSG_NOTE
, vect_location
,
8644 "Move stmt to created bb\n%G", stmt1
);
8645 gsi_move_before (&gsi_from
, &gsi_to
);
8646 /* Shift GSI_TO for further insertion. */
8649 /* Put other masked stores with the same mask to STORE_BB. */
8650 if (worklist
.is_empty ()
8651 || gimple_call_arg (worklist
.last (), 2) != mask
8652 || worklist
.last () != stmt1
)
8654 last
= worklist
.pop ();
8656 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);
8660 /* Decide whether it is possible to use a zero-based induction variable
8661 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
8662 return the value that the induction variable must be able to hold
8663 in order to ensure that the loop ends with an all-false mask.
8664 Return -1 otherwise. */
8666 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo
)
8668 tree niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
8669 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8670 unsigned HOST_WIDE_INT max_vf
= vect_max_vf (loop_vinfo
);
8672 /* Calculate the value that the induction variable must be able
8673 to hit in order to ensure that we end the loop with an all-false mask.
8674 This involves adding the maximum number of inactive trailing scalar
8676 widest_int iv_limit
= -1;
8677 if (max_loop_iterations (loop
, &iv_limit
))
8681 /* Add the maximum number of skipped iterations to the
8682 maximum iteration count. */
8683 if (TREE_CODE (niters_skip
) == INTEGER_CST
)
8684 iv_limit
+= wi::to_widest (niters_skip
);
8686 iv_limit
+= max_vf
- 1;
8688 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
))
8689 /* Make a conservatively-correct assumption. */
8690 iv_limit
+= max_vf
- 1;
8692 /* IV_LIMIT is the maximum number of latch iterations, which is also
8693 the maximum in-range IV value. Round this value down to the previous
8694 vector alignment boundary and then add an extra full iteration. */
8695 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8696 iv_limit
= (iv_limit
& -(int) known_alignment (vf
)) + max_vf
;