2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
31 #include "tree-pass.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 for (i=0; i<N/8; i++){
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info
,
164 bool vectype_maybe_set_p
,
166 vec
<stmt_vec_info
> *mask_producers
)
168 gimple
*stmt
= stmt_info
->stmt
;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
171 && !STMT_VINFO_LIVE_P (stmt_info
))
172 || gimple_clobber_p (stmt
))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype
, nunits_vectype
;
180 opt_result res
= vect_get_vector_types_for_stmt (stmt_info
, &stmt_vectype
,
187 if (STMT_VINFO_VECTYPE (stmt_info
))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
192 || vectype_maybe_set_p
)
193 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
194 else if (stmt_vectype
== boolean_type_node
)
195 mask_producers
->safe_push (stmt_info
);
197 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
201 vect_update_max_nunits (vf
, nunits_vectype
);
203 return opt_result::success ();
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info
, poly_uint64
*vf
,
215 vec
<stmt_vec_info
> *mask_producers
)
217 vec_info
*vinfo
= stmt_info
->vinfo
;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
222 = vect_determine_vf_for_stmt_1 (stmt_info
, false, vf
, mask_producers
);
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
227 && STMT_VINFO_RELATED_STMT (stmt_info
))
229 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
230 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
234 !gsi_end_p (si
); gsi_next (&si
))
236 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE
, vect_location
,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info
->stmt
);
241 if (!vect_determine_vf_for_stmt_1 (def_stmt_info
, true,
243 res
= vect_determine_vf_for_stmt_1 (def_stmt_info
, true,
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE
, vect_location
,
251 "==> examining pattern statement: %G",
253 res
= vect_determine_vf_for_stmt_1 (stmt_info
, true, vf
, mask_producers
);
258 return opt_result::success ();
261 /* Function vect_determine_vectorization_factor
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
289 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
290 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
291 unsigned nbbs
= loop
->num_nodes
;
292 poly_uint64 vectorization_factor
= 1;
293 tree scalar_type
= NULL_TREE
;
296 stmt_vec_info stmt_info
;
298 auto_vec
<stmt_vec_info
> mask_producers
;
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
302 for (i
= 0; i
< nbbs
; i
++)
304 basic_block bb
= bbs
[i
];
306 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
310 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
315 gcc_assert (stmt_info
);
317 if (STMT_VINFO_RELEVANT_P (stmt_info
)
318 || STMT_VINFO_LIVE_P (stmt_info
))
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
321 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE
, vect_location
,
325 "get vectype for scalar type: %T\n",
328 vectype
= get_vectype_for_scalar_type (scalar_type
);
330 return opt_result::failure_at (phi
,
331 "not vectorized: unsupported "
334 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
340 if (dump_enabled_p ())
342 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
343 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
344 dump_printf (MSG_NOTE
, "\n");
347 vect_update_max_nunits (&vectorization_factor
, vectype
);
351 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
354 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
356 = vect_determine_vf_for_stmt (stmt_info
, &vectorization_factor
,
363 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
367 dump_dec (MSG_NOTE
, vectorization_factor
);
368 dump_printf (MSG_NOTE
, "\n");
371 if (known_le (vectorization_factor
, 1U))
372 return opt_result::failure_at (vect_location
,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
376 for (i
= 0; i
< mask_producers
.length (); i
++)
378 stmt_info
= mask_producers
[i
];
379 opt_tree mask_type
= vect_get_mask_type_for_stmt (stmt_info
);
381 return opt_result::propagate_failure (mask_type
);
382 STMT_VINFO_VECTYPE (stmt_info
) = mask_type
;
385 return opt_result::success ();
389 /* Function vect_is_simple_iv_evolution.
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
395 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
400 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
403 /* When there is no evolution in this loop, the evolution function
405 if (evolution_part
== NULL_TREE
)
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part
))
413 step_expr
= evolution_part
;
414 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
418 step_expr
, init_expr
);
423 if (TREE_CODE (step_expr
) != INTEGER_CST
424 && (TREE_CODE (step_expr
) != SSA_NAME
425 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
426 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
429 || !flag_associative_math
)))
430 && (TREE_CODE (step_expr
) != REAL_CST
431 || !flag_associative_math
))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
447 x_1 = PHI <x_4(outer2), ...>;
451 x_2 = PHI <x_1(outer1), ...>;
457 x_4 = PHI <x_3(inner)>;
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info
, gphi
*phi
)
466 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
469 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
470 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
471 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
476 /* Function vect_analyze_scalar_cycles_1.
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, struct loop
*loop
)
486 basic_block bb
= loop
->header
;
488 auto_vec
<stmt_vec_info
, 64> worklist
;
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
497 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
499 gphi
*phi
= gsi
.phi ();
500 tree access_fn
= NULL
;
501 tree def
= PHI_RESULT (phi
);
502 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def
))
512 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
514 /* Analyze the evolution function. */
515 access_fn
= analyze_scalar_evolution (loop
, def
);
518 STRIP_NOPS (access_fn
);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE
, vect_location
,
521 "Access function of PHI: %T\n", access_fn
);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
523 = initial_condition_in_loop_num (access_fn
, loop
->num
);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
525 = evolution_part_in_loop_num (access_fn
, loop
->num
);
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo
, phi
)
530 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
, &init
, &step
)
531 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
532 && TREE_CODE (step
) != INTEGER_CST
))
534 worklist
.safe_push (stmt_vinfo
);
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist
.length () > 0)
551 stmt_vec_info stmt_vinfo
= worklist
.pop ();
552 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
553 tree def
= PHI_RESULT (phi
);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G", phi
);
558 gcc_assert (!virtual_operand_p (def
)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo
, stmt_vinfo
,
563 &double_reduc
, false);
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE
, vect_location
,
570 "Detected double reduction.\n");
572 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info
)
574 = vect_double_reduction_def
;
578 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE
, vect_location
,
582 "Detected vectorizable nested cycle.\n");
584 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_nested_cycle
;
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE
, vect_location
,
591 "Detected reduction.\n");
593 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
607 "Unknown def-use cycle pattern.\n");
612 /* Function vect_analyze_scalar_cycles.
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also to its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
)
636 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
638 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
);
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
650 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
);
653 /* Transfer group and reduction information from STMT_INFO to its
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
659 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
663 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
666 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
668 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
670 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
671 = STMT_VINFO_RELATED_STMT (stmt_info
);
674 STMT_VINFO_DEF_TYPE (stmtp
) = vect_reduction_def
;
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
686 if (STMT_VINFO_IN_PATTERN_P (first
))
688 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
691 if (! STMT_VINFO_IN_PATTERN_P (next
))
693 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
699 vect_fixup_reduc_chain (first
);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
701 = STMT_VINFO_RELATED_STMT (first
);
706 /* Function vect_get_loop_niters.
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
713 Return the loop exit condition. */
717 vect_get_loop_niters (struct loop
*loop
, tree
*assumptions
,
718 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
720 edge exit
= single_exit (loop
);
721 struct tree_niter_desc niter_desc
;
722 tree niter_assumptions
, niter
, may_be_zero
;
723 gcond
*cond
= get_loop_exit_condition (loop
);
725 *assumptions
= boolean_true_node
;
726 *number_of_iterationsm1
= chrec_dont_know
;
727 *number_of_iterations
= chrec_dont_know
;
728 DUMP_VECT_SCOPE ("get_loop_niters");
733 niter
= chrec_dont_know
;
734 may_be_zero
= NULL_TREE
;
735 niter_assumptions
= boolean_true_node
;
736 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
737 || chrec_contains_undetermined (niter_desc
.niter
))
740 niter_assumptions
= niter_desc
.assumptions
;
741 may_be_zero
= niter_desc
.may_be_zero
;
742 niter
= niter_desc
.niter
;
744 if (may_be_zero
&& integer_zerop (may_be_zero
))
745 may_be_zero
= NULL_TREE
;
749 if (COMPARISON_CLASS_P (may_be_zero
))
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
754 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
756 fold_build1 (TRUTH_NOT_EXPR
,
760 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
761 build_int_cst (TREE_TYPE (niter
), 0),
762 rewrite_to_non_trapping_overflow (niter
));
764 may_be_zero
= NULL_TREE
;
766 else if (integer_nonzerop (may_be_zero
))
768 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
769 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
776 *assumptions
= niter_assumptions
;
777 *number_of_iterationsm1
= niter
;
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter
&& !chrec_contains_undetermined (niter
))
784 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), unshare_expr (niter
),
785 build_int_cst (TREE_TYPE (niter
), 1));
786 *number_of_iterations
= niter
;
791 /* Function bb_in_loop_p
793 Used as predicate for dfs order traversal of the loop bbs. */
796 bb_in_loop_p (const_basic_block bb
, const void *data
)
798 const struct loop
*const loop
= (const struct loop
*)data
;
799 if (flow_bb_inside_loop_p (loop
, bb
))
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
808 _loop_vec_info::_loop_vec_info (struct loop
*loop_in
, vec_info_shared
*shared
)
809 : vec_info (vec_info::loop
, init_cost (loop_in
), shared
),
811 bbs (XCNEWVEC (basic_block
, loop
->num_nodes
)),
812 num_itersm1 (NULL_TREE
),
813 num_iters (NULL_TREE
),
814 num_iters_unchanged (NULL_TREE
),
815 num_iters_assumptions (NULL_TREE
),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE
),
821 mask_compare_type (NULL_TREE
),
822 simd_if_cond (NULL_TREE
),
824 peeling_for_alignment (0),
827 slp_unrolling_factor (1),
828 single_scalar_iteration_cost (0),
829 vectorizable (false),
830 can_fully_mask_p (true),
831 fully_masked_p (false),
832 peeling_for_gaps (false),
833 peeling_for_niter (false),
834 operands_swapped (false),
835 no_data_dependencies (false),
836 has_mask_store (false),
838 orig_loop_info (NULL
)
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would be the same
843 as reversed postorder traversal, so we are safe. */
845 unsigned int nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
,
846 bbs
, loop
->num_nodes
, loop
);
847 gcc_assert (nbbs
== loop
->num_nodes
);
849 for (unsigned int i
= 0; i
< nbbs
; i
++)
851 basic_block bb
= bbs
[i
];
852 gimple_stmt_iterator si
;
854 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
856 gimple
*phi
= gsi_stmt (si
);
857 gimple_set_uid (phi
, 0);
861 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
863 gimple
*stmt
= gsi_stmt (si
);
864 gimple_set_uid (stmt
, 0);
866 /* If .GOMP_SIMD_LANE call for the current loop has 2 arguments, the
867 second argument is the #pragma omp simd if (x) condition, when 0,
868 loop shouldn't be vectorized, when non-zero constant, it should
869 be vectorized normally, otherwise versioned with vectorized loop
870 done if the condition is non-zero at runtime. */
872 && is_gimple_call (stmt
)
873 && gimple_call_internal_p (stmt
)
874 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt
) >= 2
876 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
878 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
880 tree arg
= gimple_call_arg (stmt
, 1);
881 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
884 gcc_assert (integer_nonzerop (arg
));
890 /* Free all levels of MASKS. */
893 release_vec_loop_masks (vec_loop_masks
*masks
)
897 FOR_EACH_VEC_ELT (*masks
, i
, rgm
)
898 rgm
->masks
.release ();
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
905 _loop_vec_info::~_loop_vec_info ()
908 gimple_stmt_iterator si
;
911 nbbs
= loop
->num_nodes
;
912 for (j
= 0; j
< nbbs
; j
++)
914 basic_block bb
= bbs
[j
];
915 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); )
917 gimple
*stmt
= gsi_stmt (si
);
919 /* We may have broken canonical form by moving a constant
920 into RHS1 of a commutative op. Fix such occurrences. */
921 if (operands_swapped
&& is_gimple_assign (stmt
))
923 enum tree_code code
= gimple_assign_rhs_code (stmt
);
925 if ((code
== PLUS_EXPR
926 || code
== POINTER_PLUS_EXPR
927 || code
== MULT_EXPR
)
928 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt
)))
929 swap_ssa_operands (stmt
,
930 gimple_assign_rhs1_ptr (stmt
),
931 gimple_assign_rhs2_ptr (stmt
));
932 else if (code
== COND_EXPR
933 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt
)))
935 tree cond_expr
= gimple_assign_rhs1 (stmt
);
936 enum tree_code cond_code
= TREE_CODE (cond_expr
);
938 if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
940 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
,
942 cond_code
= invert_tree_comparison (cond_code
,
944 if (cond_code
!= ERROR_MARK
)
946 TREE_SET_CODE (cond_expr
, cond_code
);
947 swap_ssa_operands (stmt
,
948 gimple_assign_rhs2_ptr (stmt
),
949 gimple_assign_rhs3_ptr (stmt
));
960 release_vec_loop_masks (&masks
);
966 /* Return an invariant or register for EXPR and emit necessary
967 computations in the LOOP_VINFO loop preheader. */
970 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
972 if (is_gimple_reg (expr
)
973 || is_gimple_min_invariant (expr
))
976 if (! loop_vinfo
->ivexpr_map
)
977 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
978 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
981 gimple_seq stmts
= NULL
;
982 cached
= force_gimple_operand (unshare_expr (expr
),
983 &stmts
, true, NULL_TREE
);
986 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
987 gsi_insert_seq_on_edge_immediate (e
, stmts
);
993 /* Return true if we can use CMP_TYPE as the comparison type to produce
994 all masks required to mask LOOP_VINFO. */
997 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
1002 if (rgm
->mask_type
!= NULL_TREE
1003 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
1004 cmp_type
, rgm
->mask_type
,
1005 OPTIMIZE_FOR_SPEED
))
1010 /* Calculate the maximum number of scalars per iteration for every
1011 rgroup in LOOP_VINFO. */
1014 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
1016 unsigned int res
= 1;
1019 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
), i
, rgm
)
1020 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
1024 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1025 whether we can actually generate the masks required. Return true if so,
1026 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1029 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1031 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1032 unsigned int min_ni_width
;
1034 /* Use a normal loop if there are no statements that need masking.
1035 This only happens in rare degenerate cases: it means that the loop
1036 has no loads, no stores, and no live-out values. */
1037 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1040 /* Get the maximum number of iterations that is representable
1041 in the counter type. */
1042 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1043 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1045 /* Get a more refined estimate for the number of iterations. */
1046 widest_int max_back_edges
;
1047 if (max_loop_iterations (loop
, &max_back_edges
))
1048 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1050 /* Account for rgroup masks, in which each bit is replicated N times. */
1051 max_ni
*= vect_get_max_nscalars_per_iter (loop_vinfo
);
1053 /* Work out how many bits we need to represent the limit. */
1054 min_ni_width
= wi::min_precision (max_ni
, UNSIGNED
);
1056 /* Find a scalar mode for which WHILE_ULT is supported. */
1057 opt_scalar_int_mode cmp_mode_iter
;
1058 tree cmp_type
= NULL_TREE
;
1059 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1061 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1062 if (cmp_bits
>= min_ni_width
1063 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1065 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1067 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1069 /* Although we could stop as soon as we find a valid mode,
1070 it's often better to continue until we hit Pmode, since the
1071 operands to the WHILE are more likely to be reusable in
1072 address calculations. */
1073 cmp_type
= this_type
;
1074 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
1083 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1087 /* Calculate the cost of one scalar iteration of the loop. */
/* Walks the statements of every basic block in LOOP_VINFO_BBS, records one
   entry per costed statement in LOOP_VINFO_SCALAR_ITERATION_COST, then
   replays those entries into target cost data obtained from init_cost and
   stores the resulting body cost in
   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  */
1089 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1091 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1092 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1093 int nbbs
= loop
->num_nodes
, factor
;
1094 int innerloop_iters
, i
;
1096 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1098 /* Gather costs for statements in the scalar loop. */
1101 innerloop_iters
= 1;
/* NOTE(review): the condition guarding this second assignment is not
   visible in this excerpt; 50 is a hard-coded weight that the original
   author marked FIXME.  */
1103 innerloop_iters
= 50; /* FIXME */
1105 for (i
= 0; i
< nbbs
; i
++)
1107 gimple_stmt_iterator si
;
1108 basic_block bb
= bbs
[i
];
/* Blocks that belong to the inner loop use innerloop_iters as the count
   that is later passed to record_stmt_cost.  */
1110 if (bb
->loop_father
== loop
->inner
)
1111 factor
= innerloop_iters
;
1115 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1117 gimple
*stmt
= gsi_stmt (si
);
1118 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
/* Test for statements that are neither assignments nor calls.
   NOTE(review): the branch taken is not visible here; presumably such
   statements are skipped.  */
1120 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1123 /* Skip stmts that are not vectorized inside the loop. */
1124 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1125 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1126 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1127 || !VECTORIZABLE_CYCLE_DEF
1128 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
/* Pick the cost kind.  Statements with a data reference are further split
   by DR_IS_READ; scalar_store is the one kind visible in this excerpt.  */
1131 vect_cost_for_stmt kind
;
1132 if (STMT_VINFO_DATA_REF (stmt_info
))
1134 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1137 kind
= scalar_store
;
1142 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1143 factor
, kind
, stmt_info
, 0, vect_prologue
);
1147 /* Now accumulate cost. */
1148 void *target_cost_data
= init_cost (loop
);
1149 stmt_info_for_cost
*si
;
1151 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1153 (void) add_stmt_cost (target_cost_data
, si
->count
,
1154 si
->kind
, si
->stmt_info
, si
->misalign
,
/* Only the body cost is kept; the first and third outputs of finish_cost
   are both written to the throwaway variable dummy.  */
1156 unsigned dummy
, body_cost
= 0;
1157 finish_cost (target_cost_data
, &dummy
, &body_cost
, &dummy
);
1158 destroy_cost_data (target_cost_data
);
1159 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
) = body_cost
;
1163 /* Function vect_analyze_loop_form_1.
1165 Verify that certain CFG restrictions hold, including:
1166 - the loop has a pre-header
1167 - the loop has a single entry and exit
1168 - the loop exit condition is simple enough
1169 - the number of iterations can be analyzed, i.e, a countable loop. The
1170 niter could be analyzed under some assumptions. */
/* The exit condition returned by vect_get_loop_niters is stored through
   LOOP_COND; that call also receives ASSUMPTIONS, NUMBER_OF_ITERATIONS and
   NUMBER_OF_ITERATIONSM1.  For a nested loop the function calls itself on
   LOOP->inner, passing INNER_LOOP_COND as the slot for the inner exit
   condition and NULL for the innermost INNER_LOOP_COND argument.  Every
   rejection is returned as opt_result::failure_at; the final return is
   opt_result::success ().  */
1173 vect_analyze_loop_form_1 (struct loop
*loop
, gcond
**loop_cond
,
1174 tree
*assumptions
, tree
*number_of_iterationsm1
,
1175 tree
*number_of_iterations
, gcond
**inner_loop_cond
)
1177 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1179 /* Different restrictions apply when we are considering an inner-most loop,
1180 vs. an outer (nested) loop.
1181 (FORNOW. May want to relax some of these restrictions in the future). */
1185 /* Inner-most loop. We currently require that the number of BBs is
1186 exactly 2 (the header and latch). Vectorizable inner-most loops
1197 if (loop
->num_nodes
!= 2)
1198 return opt_result::failure_at (vect_location
,
1200 " control flow in loop.\n");
1202 if (empty_block_p (loop
->header
))
1203 return opt_result::failure_at (vect_location
,
1204 "not vectorized: empty loop.\n");
1208 struct loop
*innerloop
= loop
->inner
;
1211 /* Nested loop. We currently require that the loop is doubly-nested,
1212 contains a single inner loop, and the number of BBs is exactly 5.
1213 Vectorizable outer-loops look like this:
1225 The inner-loop has the properties expected of inner-most loops
1226 as described above. */
/* Reject an inner loop that itself contains a loop or that has a sibling
   loop.  */
1228 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1229 return opt_result::failure_at (vect_location
,
1231 " multiple nested loops.\n");
1233 if (loop
->num_nodes
!= 5)
1234 return opt_result::failure_at (vect_location
,
1236 " control flow in loop.\n");
/* The inner loop must be entered from the outer header, must have a single
   exit, and that exit must lead to the source block of the first
   predecessor edge of the outer latch.  */
1238 entryedge
= loop_preheader_edge (innerloop
);
1239 if (entryedge
->src
!= loop
->header
1240 || !single_exit (innerloop
)
1241 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1242 return opt_result::failure_at (vect_location
,
1244 " unsupported outerloop form.\n");
1246 /* Analyze the inner-loop. */
1247 tree inner_niterm1
, inner_niter
, inner_assumptions
;
1249 = vect_analyze_loop_form_1 (loop
->inner
, inner_loop_cond
,
1250 &inner_assumptions
, &inner_niterm1
,
1251 &inner_niter
, NULL
);
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1256 "not vectorized: Bad inner loop.\n");
1260 /* Don't support analyzing niter under assumptions for inner
1262 if (!integer_onep (inner_assumptions
))
1263 return opt_result::failure_at (vect_location
,
1264 "not vectorized: Bad inner loop.\n");
1266 if (!expr_invariant_in_loop_p (loop
, inner_niter
))
1267 return opt_result::failure_at (vect_location
,
1268 "not vectorized: inner-loop count not"
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_NOTE
, vect_location
,
1273 "Considering outer-loop vectorization.\n");
1276 if (!single_exit (loop
))
1277 return opt_result::failure_at (vect_location
,
1278 "not vectorized: multiple exits.\n");
1279 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1280 return opt_result::failure_at (vect_location
,
1282 " too many incoming edges.\n");
1284 /* We assume that the loop exit condition is at the end of the loop. i.e,
1285 that the loop is represented as a do-while (with a proper if-guard
1286 before the loop if needed), where the loop header contains all the
1287 executable statements, and the latch is empty. */
1288 if (!empty_block_p (loop
->latch
)
1289 || !gimple_seq_empty_p (phi_nodes (loop
->latch
)))
1290 return opt_result::failure_at (vect_location
,
1291 "not vectorized: latch block not empty.\n");
1293 /* Make sure the exit is not abnormal. */
1294 edge e
= single_exit (loop
);
1295 if (e
->flags
& EDGE_ABNORMAL
)
1296 return opt_result::failure_at (vect_location
,
1298 " abnormal loop exit edge.\n");
1300 *loop_cond
= vect_get_loop_niters (loop
, assumptions
, number_of_iterations
,
1301 number_of_iterationsm1
);
1303 return opt_result::failure_at
1305 "not vectorized: complicated exit condition.\n");
/* Give up when *ASSUMPTIONS is integer zero, or when the iteration count
   is missing or contains an undetermined chrec.  */
1307 if (integer_zerop (*assumptions
)
1308 || !*number_of_iterations
1309 || chrec_contains_undetermined (*number_of_iterations
))
1310 return opt_result::failure_at
1312 "not vectorized: number of iterations cannot be computed.\n");
/* A loop whose iteration count is integer zero is rejected as well.  */
1314 if (integer_zerop (*number_of_iterations
))
1315 return opt_result::failure_at
1317 "not vectorized: number of iterations = 0.\n");
1319 return opt_result::success ();
1322 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
/* Runs vect_analyze_loop_form_1 on LOOP; its result can be handed back
   through opt_loop_vec_info::propagate_failure.  Otherwise a new
   _loop_vec_info is built from LOOP and SHARED, its iteration-count fields
   are filled in, the exit conditions are tagged as
   loop_exit_ctrl_vec_info_type, and the object is attached to LOOP->aux
   before being returned as a success value.  */
1325 vect_analyze_loop_form (struct loop
*loop
, vec_info_shared
*shared
)
1327 tree assumptions
, number_of_iterations
, number_of_iterationsm1
;
1328 gcond
*loop_cond
, *inner_loop_cond
= NULL
;
1331 = vect_analyze_loop_form_1 (loop
, &loop_cond
,
1332 &assumptions
, &number_of_iterationsm1
,
1333 &number_of_iterations
, &inner_loop_cond
);
1335 return opt_loop_vec_info::propagate_failure (res
);
1337 loop_vec_info loop_vinfo
= new _loop_vec_info (loop
, shared
);
/* The same iteration count seeds both NITERS and NITERS_UNCHANGED.  */
1338 LOOP_VINFO_NITERSM1 (loop_vinfo
) = number_of_iterationsm1
;
1339 LOOP_VINFO_NITERS (loop_vinfo
) = number_of_iterations
;
1340 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = number_of_iterations
;
1341 if (!integer_onep (assumptions
))
1343 /* We consider to vectorize this loop by versioning it under
1344 some assumptions. In order to do this, we need to clear
1345 existing information computed by scev and niter analyzer. */
1347 free_numbers_of_iterations_estimates (loop
);
1348 /* Also set flag for this loop so that following scev and niter
1349 analysis are done under the assumptions. */
1350 loop_constraint_set (loop
, LOOP_C_FINITE
);
1351 /* Also record the assumptions for versioning. */
1352 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo
) = assumptions
;
/* When the iteration count is not a known constant, the symbolic
   expression is written to the dump if dumping is enabled.  */
1355 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1357 if (dump_enabled_p ())
1359 dump_printf_loc (MSG_NOTE
, vect_location
,
1360 "Symbolic number of iterations is ");
1361 dump_generic_expr (MSG_NOTE
, TDF_DETAILS
, number_of_iterations
);
1362 dump_printf (MSG_NOTE
, "\n");
/* Mark the exit condition, and the inner exit condition when one was
   recorded, as loop-exit control statements.  */
1366 stmt_vec_info loop_cond_info
= loop_vinfo
->lookup_stmt (loop_cond
);
1367 STMT_VINFO_TYPE (loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
1368 if (inner_loop_cond
)
1370 stmt_vec_info inner_loop_cond_info
1371 = loop_vinfo
->lookup_stmt (inner_loop_cond
);
1372 STMT_VINFO_TYPE (inner_loop_cond_info
) = loop_exit_ctrl_vec_info_type
;
/* LOOP->aux must be unused before it is pointed at the new
   loop_vec_info.  */
1375 gcc_assert (!loop
->aux
);
1376 loop
->aux
= loop_vinfo
;
1377 return opt_loop_vec_info::success (loop_vinfo
);
1382 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1383 statements update the vectorization factor. */
/* Reads the current LOOP_VINFO_VECT_FACTOR, then replaces it either with
   LOOP_VINFO_SLP_UNROLLING_FACTOR (the path that dumps that the loop
   contains only SLP stmts) or with force_common_multiple of the two
   factors, and writes the result back to LOOP_VINFO_VECT_FACTOR.  */
1386 vect_update_vf_for_slp (loop_vec_info loop_vinfo
)
1388 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1389 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1390 int nbbs
= loop
->num_nodes
;
1391 poly_uint64 vectorization_factor
;
1394 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1396 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
/* The incoming vectorization factor is asserted to be nonzero.  */
1397 gcc_assert (known_ne (vectorization_factor
, 0U));
1399 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1400 vectorization factor of the loop is the unrolling factor required by
1401 the SLP instances. If that unrolling factor is 1, we say, that we
1402 perform pure SLP on loop - cross iteration parallelism is not
1404 bool only_slp_in_loop
= true;
1405 for (i
= 0; i
< nbbs
; i
++)
1407 basic_block bb
= bbs
[i
];
1408 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1411 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
1412 stmt_info
= vect_stmt_to_vectorize (stmt_info
);
1413 if ((STMT_VINFO_RELEVANT_P (stmt_info
)
1414 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info
)))
1415 && !PURE_SLP_STMT (stmt_info
))
1416 /* STMT needs both SLP and loop-based vectorization. */
1417 only_slp_in_loop
= false;
1421 if (only_slp_in_loop
)
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_NOTE
, vect_location
,
1425 "Loop contains only SLP stmts\n");
1426 vectorization_factor
= LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
);
1430 if (dump_enabled_p ())
1431 dump_printf_loc (MSG_NOTE
, vect_location
,
1432 "Loop contains SLP and non-SLP stmts\n");
1433 /* Both the vectorization factor and unroll factor have the form
1434 current_vector_size * X for some rational X, so they must have
1435 a common multiple. */
1436 vectorization_factor
1437 = force_common_multiple (vectorization_factor
,
1438 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
));
/* Store the updated factor and, when dumping is enabled, report it.  */
1441 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
1442 if (dump_enabled_p ())
1444 dump_printf_loc (MSG_NOTE
, vect_location
,
1445 "Updating vectorization factor to ");
1446 dump_dec (MSG_NOTE
, vectorization_factor
);
1447 dump_printf (MSG_NOTE
, ".\n");
1451 /* Return true if STMT_INFO describes a double reduction phi and if
1452 the other phi in the reduction is also relevant for vectorization.
1453 This rejects cases such as:
1456 x_1 = PHI <x_3(outer2), ...>;
1464 x_3 = PHI <x_2(inner)>;
1466 if nothing in x_2 or elsewhere makes x_1 relevant. */
1469 vect_active_double_reduction_p (stmt_vec_info stmt_info
)
/* First test whether STMT_INFO is anything other than a double reduction
   def.  NOTE(review): the value returned on that path is not visible in
   this excerpt.  */
1471 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
/* Otherwise the answer is the relevance of the statement recorded in
   STMT_VINFO_REDUC_DEF of STMT_INFO.  */
1474 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info
));
1477 /* Function vect_analyze_loop_operations.
1479 Scan the loop stmts and make sure they are all vectorizable. */
/* Each basic block of the loop is scanned in two passes: first its phi
   nodes, then its ordinary statements, which go through vect_analyze_stmt.
   Costs gathered along the way are collected in a local cost_vec that is
   handed to add_stmt_costs together with the target cost data of
   LOOP_VINFO.  The result is opt_result::success () or a failure built by
   opt_result::failure_at.  */
1482 vect_analyze_loop_operations (loop_vec_info loop_vinfo
)
1484 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1485 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1486 int nbbs
= loop
->num_nodes
;
1488 stmt_vec_info stmt_info
;
1489 bool need_to_vectorize
= false;
1492 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1494 auto_vec
<stmt_info_for_cost
> cost_vec
;
1496 for (i
= 0; i
< nbbs
; i
++)
1498 basic_block bb
= bbs
[i
];
1500 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
1503 gphi
*phi
= si
.phi ();
1506 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE
, vect_location
, "examining phi: %G", phi
);
/* Test for phis whose result is a virtual operand.  NOTE(review): the
   branch taken is not visible here; presumably such phis are skipped.  */
1509 if (virtual_operand_p (gimple_phi_result (phi
)))
1512 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1513 (i.e., a phi in the tail of the outer-loop). */
1514 if (! is_loop_header_bb_p (bb
))
1516 /* FORNOW: we currently don't support the case that these phis
1517 are not used in the outerloop (unless it is double reduction,
1518 i.e., this phi is vect_reduction_def), cause this case
1519 requires to actually do something here. */
1520 if (STMT_VINFO_LIVE_P (stmt_info
)
1521 && !vect_active_double_reduction_p (stmt_info
))
1522 return opt_result::failure_at (phi
,
1523 "Unsupported loop-closed phi"
1524 " in outer-loop.\n");
1526 /* If PHI is used in the outer loop, we check that its operand
1527 is defined in the inner loop. */
1528 if (STMT_VINFO_RELEVANT_P (stmt_info
))
/* A relevant loop-closed phi must have exactly one argument.  */
1532 if (gimple_phi_num_args (phi
) != 1)
1533 return opt_result::failure_at (phi
, "unsupported phi");
1535 phi_op
= PHI_ARG_DEF (phi
, 0);
1536 stmt_vec_info op_def_info
= loop_vinfo
->lookup_def (phi_op
);
1538 return opt_result::failure_at (phi
, "unsupported phi");
/* The definition of that argument must be marked vect_used_in_outer or
   vect_used_in_outer_by_reduction.  */
1540 if (STMT_VINFO_RELEVANT (op_def_info
) != vect_used_in_outer
1541 && (STMT_VINFO_RELEVANT (op_def_info
)
1542 != vect_used_in_outer_by_reduction
))
1543 return opt_result::failure_at (phi
, "unsupported phi");
1549 gcc_assert (stmt_info
);
1551 if ((STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
1552 || STMT_VINFO_LIVE_P (stmt_info
))
1553 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
1554 /* A scalar-dependence cycle that we don't support. */
1555 return opt_result::failure_at (phi
,
1557 " scalar dependence cycle.\n");
/* A relevant phi is enough to set need_to_vectorize.  Inductions go to
   vectorizable_induction and reductions or nested cycles to
   vectorizable_reduction, in both cases only when the phi is not a pure
   SLP statement.  */
1559 if (STMT_VINFO_RELEVANT_P (stmt_info
))
1561 need_to_vectorize
= true;
1562 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
1563 && ! PURE_SLP_STMT (stmt_info
))
1564 ok
= vectorizable_induction (stmt_info
, NULL
, NULL
, NULL
,
1566 else if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
1567 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
1568 && ! PURE_SLP_STMT (stmt_info
))
1569 ok
= vectorizable_reduction (stmt_info
, NULL
, NULL
, NULL
, NULL
,
1573 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1575 && STMT_VINFO_LIVE_P (stmt_info
)
1576 && !PURE_SLP_STMT (stmt_info
))
1577 ok
= vectorizable_live_operation (stmt_info
, NULL
, NULL
, -1, NULL
,
1581 return opt_result::failure_at (phi
,
1582 "not vectorized: relevant phi not "
1584 static_cast <gimple
*> (phi
));
1587 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
1590 gimple
*stmt
= gsi_stmt (si
);
/* Clobber statements are not passed to vect_analyze_stmt.  */
1591 if (!gimple_clobber_p (stmt
))
1594 = vect_analyze_stmt (loop_vinfo
->lookup_stmt (stmt
),
1596 NULL
, NULL
, &cost_vec
);
1603 add_stmt_costs (loop_vinfo
->target_cost_data
, &cost_vec
);
1605 /* All operations in the loop are either irrelevant (deal with loop
1606 control, or dead), or only used outside the loop and can be moved
1607 out of the loop (e.g. invariants, inductions). The loop can be
1608 optimized away by scalar optimizations. We're better off not
1609 touching this loop. */
1610 if (!need_to_vectorize
)
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_NOTE
, vect_location
,
1614 "All the computation can be taken out of the loop.\n");
1615 return opt_result::failure_at
1617 "not vectorized: redundant loop. no profit to vectorize.\n");
1620 return opt_result::success ();
1623 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1624 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1625 definitely no, or -1 if it's worth retrying. */
/* The checks run in this order: the maximum iteration count against the
   assumed vectorization factor (only for loops that are not fully
   masked), the sign of min_profitable_iters as computed by
   vect_estimate_min_profitable_iters, a known iteration count against the
   cost-model threshold, and finally the estimated iteration count against
   the larger of that threshold and min_profitable_estimate.  The threshold
   is also stored in LOOP_VINFO_COST_MODEL_THRESHOLD.  */
1628 vect_analyze_loop_costing (loop_vec_info loop_vinfo
)
1630 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1631 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
1633 /* Only fully-masked loops can have iteration counts less than the
1634 vectorization factor. */
1635 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
1637 HOST_WIDE_INT max_niter
;
/* Use the exact iteration count when it is known; max_stmt_executions_int
   supplies the other value assigned to max_niter.  */
1639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1640 max_niter
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
1642 max_niter
= max_stmt_executions_int (loop
);
1645 && (unsigned HOST_WIDE_INT
) max_niter
< assumed_vf
)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1649 "not vectorized: iteration count smaller than "
1650 "vectorization factor.\n");
1655 int min_profitable_iters
, min_profitable_estimate
;
1656 vect_estimate_min_profitable_iters (loop_vinfo
, &min_profitable_iters
,
1657 &min_profitable_estimate
);
/* A negative min_profitable_iters is reported as not profitable.  */
1659 if (min_profitable_iters
< 0)
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1663 "not vectorized: vectorization not profitable.\n");
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1666 "not vectorized: vector version will never be "
1671 int min_scalar_loop_bound
= (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND
)
1674 /* Use the cost model only if it is more conservative than user specified
1676 unsigned int th
= (unsigned) MAX (min_scalar_loop_bound
,
1677 min_profitable_iters
);
1679 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = th
;
1681 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1682 && LOOP_VINFO_INT_NITERS (loop_vinfo
) < th
)
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1686 "not vectorized: vectorization not profitable.\n");
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE
, vect_location
,
1689 "not vectorized: iteration count smaller than user "
1690 "specified loop bound parameter or minimum profitable "
1691 "iterations (whichever is more conservative).\n");
1695 HOST_WIDE_INT estimated_niter
= estimated_stmt_executions_int (loop
);
1696 if (estimated_niter
== -1)
1697 estimated_niter
= likely_max_stmt_executions_int (loop
);
1698 if (estimated_niter
!= -1
1699 && ((unsigned HOST_WIDE_INT
) estimated_niter
1700 < MAX (th
, (unsigned) min_profitable_estimate
)))
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1704 "not vectorized: estimated iteration count too "
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE
, vect_location
,
1708 "not vectorized: estimated iteration count smaller "
1709 "than specified loop bound parameter or minimum "
1710 "profitable iterations (whichever is more "
1711 "conservative).\n");
/* Collect the data references of LOOP into DATAREFS by calling
   vect_find_stmt_data_reference on the statements of the first
   LOOP->num_nodes blocks in BBS.  Calls receive an extra check for simd
   clones when LOOP->safelen is nonzero.  The walk fails with
   opt_result::failure_at once DATAREFS holds more entries than
   PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS, and otherwise ends in
   opt_result::success ().
   NOTE(review): no visible line touches N_STMTS; presumably it counts the
   statements visited - confirm against the full source.  */
1719 vect_get_datarefs_in_loop (loop_p loop
, basic_block
*bbs
,
1720 vec
<data_reference_p
> *datarefs
,
1721 unsigned int *n_stmts
)
1724 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1725 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
1726 !gsi_end_p (gsi
); gsi_next (&gsi
))
1728 gimple
*stmt
= gsi_stmt (gsi
);
/* Test for debug statements.  NOTE(review): the branch taken is not
   visible here; presumably they are skipped.  */
1729 if (is_gimple_debug (stmt
))
1732 opt_result res
= vect_find_stmt_data_reference (loop
, stmt
, datarefs
);
/* For a call in a loop with nonzero safelen, look up the cgraph node of
   the callee and check whether it has simd clones; the arguments and the
   lhs of such a call are then inspected for memory references.  */
1735 if (is_gimple_call (stmt
) && loop
->safelen
)
1737 tree fndecl
= gimple_call_fndecl (stmt
), op
;
1738 if (fndecl
!= NULL_TREE
)
1740 cgraph_node
*node
= cgraph_node::get (fndecl
);
1741 if (node
!= NULL
&& node
->simd_clones
!= NULL
)
1743 unsigned int j
, n
= gimple_call_num_args (stmt
);
1744 for (j
= 0; j
< n
; j
++)
1746 op
= gimple_call_arg (stmt
, j
);
1748 || (REFERENCE_CLASS_P (op
)
1749 && get_base_address (op
)))
1752 op
= gimple_call_lhs (stmt
);
1753 /* Ignore #pragma omp declare simd functions
1754 if they don't have data references in the
1755 call stmt itself. */
1759 || (REFERENCE_CLASS_P (op
)
1760 && get_base_address (op
)))))
1767 /* If dependence analysis will give up due to the limit on the
1768 number of datarefs stop here and fail fatally. */
1769 if (datarefs
->length ()
1770 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS
))
1771 return opt_result::failure_at (stmt
, "exceeded param "
1772 "loop-max-datarefs-for-datadeps\n");
1774 return opt_result::success ();
1777 /* Function vect_analyze_loop_2.
1779 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1780 for it. The different analyses will record information in the
1781 loop_vec_info struct. */
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo
, bool &fatal
, unsigned *n_stmts
)
1785 opt_result ok
= opt_result::success ();
1787 unsigned int max_vf
= MAX_VECTORIZATION_FACTOR
;
1788 poly_uint64 min_vf
= 2;
1790 /* The first group of checks is independent of the vector size. */
1793 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)
1794 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo
)))
1795 return opt_result::failure_at (vect_location
,
1796 "not vectorized: simd if(0)\n");
1798 /* Find all data references in the loop (which correspond to vdefs/vuses)
1799 and analyze their evolution in the loop. */
1801 loop_p loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1803 /* Gather the data references and count stmts in the loop. */
1804 if (!LOOP_VINFO_DATAREFS (loop_vinfo
).exists ())
1807 = vect_get_datarefs_in_loop (loop
, LOOP_VINFO_BBS (loop_vinfo
),
1808 &LOOP_VINFO_DATAREFS (loop_vinfo
),
1812 if (dump_enabled_p ())
1813 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1814 "not vectorized: loop contains function "
1815 "calls or data references that cannot "
1819 loop_vinfo
->shared
->save_datarefs ();
1822 loop_vinfo
->shared
->check_datarefs ();
1824 /* Analyze the data references and also adjust the minimal
1825 vectorization factor according to the loads and stores. */
1827 ok
= vect_analyze_data_refs (loop_vinfo
, &min_vf
);
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1832 "bad data references.\n");
1836 /* Classify all cross-iteration scalar data-flow cycles.
1837 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1838 vect_analyze_scalar_cycles (loop_vinfo
);
1840 vect_pattern_recog (loop_vinfo
);
1842 vect_fixup_scalar_cycles_with_patterns (loop_vinfo
);
1844 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1845 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1847 ok
= vect_analyze_data_ref_accesses (loop_vinfo
);
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1852 "bad data access.\n");
1856 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1858 ok
= vect_mark_stmts_to_be_vectorized (loop_vinfo
);
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1863 "unexpected pattern.\n");
1867 /* While the rest of the analysis below depends on it in some way. */
1870 /* Analyze data dependences between the data-refs in the loop
1871 and adjust the maximum vectorization factor according to
1873 FORNOW: fail at the first data dependence that we encounter. */
1875 ok
= vect_analyze_data_ref_dependences (loop_vinfo
, &max_vf
);
1878 if (dump_enabled_p ())
1879 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1880 "bad data dependence.\n");
1883 if (max_vf
!= MAX_VECTORIZATION_FACTOR
1884 && maybe_lt (max_vf
, min_vf
))
1885 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
1886 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) = max_vf
;
1888 ok
= vect_determine_vectorization_factor (loop_vinfo
);
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1893 "can't determine vectorization factor.\n");
1896 if (max_vf
!= MAX_VECTORIZATION_FACTOR
1897 && maybe_lt (max_vf
, LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1898 return opt_result::failure_at (vect_location
, "bad data dependence.\n");
1900 /* Compute the scalar iteration cost. */
1901 vect_compute_single_scalar_iteration_cost (loop_vinfo
);
1903 poly_uint64 saved_vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1906 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1907 ok
= vect_analyze_slp (loop_vinfo
, *n_stmts
);
1911 /* If there are any SLP instances mark them as pure_slp. */
1912 bool slp
= vect_make_slp_decision (loop_vinfo
);
1915 /* Find stmts that need to be both vectorized and SLPed. */
1916 vect_detect_hybrid_slp (loop_vinfo
);
1918 /* Update the vectorization factor based on the SLP decision. */
1919 vect_update_vf_for_slp (loop_vinfo
);
1922 bool saved_can_fully_mask_p
= LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
);
1924 /* We don't expect to have to roll back to anything other than an empty
1926 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ());
1928 /* This is the point where we can re-start analysis with SLP forced off. */
1931 /* Now the vectorization factor is final. */
1932 poly_uint64 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1933 gcc_assert (known_ne (vectorization_factor
, 0U));
1935 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && dump_enabled_p ())
1937 dump_printf_loc (MSG_NOTE
, vect_location
,
1938 "vectorization_factor = ");
1939 dump_dec (MSG_NOTE
, vectorization_factor
);
1940 dump_printf (MSG_NOTE
, ", niters = %wd\n",
1941 LOOP_VINFO_INT_NITERS (loop_vinfo
));
1944 HOST_WIDE_INT max_niter
1945 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1947 /* Analyze the alignment of the data-refs in the loop.
1948 Fail if a data reference is found that cannot be vectorized. */
1950 ok
= vect_analyze_data_refs_alignment (loop_vinfo
);
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1955 "bad data alignment.\n");
1959 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1960 It is important to call pruning after vect_analyze_data_ref_accesses,
1961 since we use grouping information gathered by interleaving analysis. */
1962 ok
= vect_prune_runtime_alias_test_list (loop_vinfo
);
1966 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1967 vectorization, since we do not want to add extra peeling or
1968 add versioning for alignment. */
1969 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
1970 /* This pass will decide on using loop versioning and/or loop peeling in
1971 order to enhance the alignment of data references in the loop. */
1972 ok
= vect_enhance_data_refs_alignment (loop_vinfo
);
1974 ok
= vect_verify_datarefs_alignment (loop_vinfo
);
1980 /* Analyze operations in the SLP instances. Note this may
1981 remove unsupported SLP instances which makes the above
1982 SLP kind detection invalid. */
1983 unsigned old_size
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length ();
1984 vect_slp_analyze_operations (loop_vinfo
);
1985 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).length () != old_size
)
1987 ok
= opt_result::failure_at (vect_location
,
1988 "unsupported SLP instances\n");
1993 /* Scan all the remaining operations in the loop that are not subject
1994 to SLP and make sure they are vectorizable. */
1995 ok
= vect_analyze_loop_operations (loop_vinfo
);
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2000 "bad operation or unsupported loop bound.\n");
2004 /* Decide whether to use a fully-masked loop for this vectorization
2006 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
2007 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
)
2008 && vect_verify_full_masking (loop_vinfo
));
2009 if (dump_enabled_p ())
2011 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2012 dump_printf_loc (MSG_NOTE
, vect_location
,
2013 "using a fully-masked loop.\n");
2015 dump_printf_loc (MSG_NOTE
, vect_location
,
2016 "not using a fully-masked loop.\n");
2019 /* If epilog loop is required because of data accesses with gaps,
2020 one additional iteration needs to be peeled. Check if there is
2021 enough iterations for vectorization. */
2022 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2023 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2024 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2026 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2027 tree scalar_niters
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
2029 if (known_lt (wi::to_widest (scalar_niters
), vf
))
2030 return opt_result::failure_at (vect_location
,
2031 "loop has no enough iterations to"
2032 " support peeling for gaps.\n");
2035 /* Check the costings of the loop make vectorizing worthwhile. */
2036 res
= vect_analyze_loop_costing (loop_vinfo
);
2039 ok
= opt_result::failure_at (vect_location
,
2040 "Loop costings may not be worthwhile.\n");
2044 return opt_result::failure_at (vect_location
,
2045 "Loop costings not worthwhile.\n");
2047 /* Decide whether we need to create an epilogue loop to handle
2048 remaining scalar iterations. */
2049 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
2051 unsigned HOST_WIDE_INT const_vf
;
2052 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2053 /* The main loop handles all iterations. */
2054 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2055 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
2056 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
2058 /* Work out the (constant) number of iterations that need to be
2059 peeled for reasons other than niters. */
2060 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2061 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2063 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
2064 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
2065 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
2067 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2068 /* ??? When peeling for gaps but not alignment, we could
2069 try to check whether the (variable) niters is known to be
2070 VF * N + 1. That's something of a niche case though. */
2071 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2072 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
2073 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
2074 < (unsigned) exact_log2 (const_vf
))
2075 /* In case of versioning, check if the maximum number of
2076 iterations is greater than th. If they are identical,
2077 the epilogue is unnecessary. */
2078 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
2079 || ((unsigned HOST_WIDE_INT
) max_niter
2080 > (th
/ const_vf
) * const_vf
))))
2081 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = true;
2083 /* If an epilogue loop is required make sure we can create one. */
2084 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
2085 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
))
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE
, vect_location
, "epilog loop required\n");
2089 if (!vect_can_advance_ivs_p (loop_vinfo
)
2090 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo
),
2091 single_exit (LOOP_VINFO_LOOP
2094 ok
= opt_result::failure_at (vect_location
,
2095 "not vectorized: can't create required "
2101 /* During peeling, we need to check if number of loop iterations is
2102 enough for both peeled prolog loop and vector loop. This check
2103 can be merged along with threshold check of loop versioning, so
2104 increase threshold for this case if necessary. */
2105 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
2107 poly_uint64 niters_th
= 0;
2109 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo
))
2111 /* Niters for peeled prolog loop. */
2112 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
2114 dr_vec_info
*dr_info
= LOOP_VINFO_UNALIGNED_DR (loop_vinfo
);
2115 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
2116 niters_th
+= TYPE_VECTOR_SUBPARTS (vectype
) - 1;
2119 niters_th
+= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
2122 /* Niters for at least one iteration of vectorized loop. */
2123 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
2124 niters_th
+= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2125 /* One additional iteration because of peeling for gap. */
2126 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
2128 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = niters_th
;
2131 gcc_assert (known_eq (vectorization_factor
,
2132 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)));
2134 /* Ok to vectorize! */
2135 return opt_result::success ();
2138 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2141 /* Try again with SLP forced off but if we didn't do any SLP there is
2142 no point in re-trying. */
2146 /* If there are reduction chains re-trying will fail anyway. */
2147 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).is_empty ())
2150 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2151 via interleaving or lane instructions. */
2152 slp_instance instance
;
2155 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
2157 stmt_vec_info vinfo
;
2158 vinfo
= SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance
))[0];
2159 if (! STMT_VINFO_GROUPED_ACCESS (vinfo
))
2161 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2162 unsigned int size
= DR_GROUP_SIZE (vinfo
);
2163 tree vectype
= STMT_VINFO_VECTYPE (vinfo
);
2164 if (! vect_store_lanes_supported (vectype
, size
, false)
2165 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1U)
2166 && ! vect_grouped_store_supported (vectype
, size
))
2167 return opt_result::failure_at (vinfo
->stmt
,
2168 "unsupported grouped store\n");
2169 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, node
)
2171 vinfo
= SLP_TREE_SCALAR_STMTS (node
)[0];
2172 vinfo
= DR_GROUP_FIRST_ELEMENT (vinfo
);
2173 bool single_element_p
= !DR_GROUP_NEXT_ELEMENT (vinfo
);
2174 size
= DR_GROUP_SIZE (vinfo
);
2175 vectype
= STMT_VINFO_VECTYPE (vinfo
);
2176 if (! vect_load_lanes_supported (vectype
, size
, false)
2177 && ! vect_grouped_load_supported (vectype
, single_element_p
,
2179 return opt_result::failure_at (vinfo
->stmt
,
2180 "unsupported grouped load\n");
2184 if (dump_enabled_p ())
2185 dump_printf_loc (MSG_NOTE
, vect_location
,
2186 "re-trying with SLP disabled\n");
2188 /* Roll back state appropriately. No SLP this time. */
2190 /* Restore vectorization factor as it was without SLP. */
2191 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = saved_vectorization_factor
;
2192 /* Free the SLP instances. */
2193 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), j
, instance
)
2194 vect_free_slp_instance (instance
, false);
2195 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
2196 /* Reset SLP type to loop_vect on all stmts. */
2197 for (i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2199 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2200 for (gimple_stmt_iterator si
= gsi_start_phis (bb
);
2201 !gsi_end_p (si
); gsi_next (&si
))
2203 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2204 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2206 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
2207 !gsi_end_p (si
); gsi_next (&si
))
2209 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
2210 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2211 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2213 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
2214 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
2215 STMT_SLP_TYPE (stmt_info
) = loop_vect
;
2216 for (gimple_stmt_iterator pi
= gsi_start (pattern_def_seq
);
2217 !gsi_end_p (pi
); gsi_next (&pi
))
2218 STMT_SLP_TYPE (loop_vinfo
->lookup_stmt (gsi_stmt (pi
)))
2223 /* Free optimized alias test DDRS. */
2224 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).truncate (0);
2225 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).release ();
2226 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).release ();
2227 /* Reset target cost data. */
2228 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
));
2229 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
2230 = init_cost (LOOP_VINFO_LOOP (loop_vinfo
));
2231 /* Reset accumulated rgroup information. */
2232 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo
));
2233 /* Reset assorted flags. */
2234 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo
) = false;
2235 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = false;
2236 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
) = 0;
2237 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
) = 0;
2238 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = saved_can_fully_mask_p
;
/* Overview of vect_analyze_loop as far as the visible statements show:
   - reset current_vector_size to 0 and ask the target hook
     autovectorize_vector_sizes for the candidate sizes;
   - fail early when the enclosing loop already has a loop_vec_info that
     is marked vectorizable, or when find_loop_nest fails;
   - build a loop_vec_info with vect_analyze_loop_form and run
     vect_analyze_loop_2 on it;
   - on failure, either propagate the failure or retry with the next
     entry of VECTOR_SIZES.
   LOOP is the loop to analyze, ORIG_LOOP_VINFO (when non-null) is stored
   in LOOP_VINFO_ORIG_LOOP_INFO, and SHARED supplies the loop_nest vector
   filled by find_loop_nest.
   NOTE(review): the return type, the retry loop header and several
   guarding conditions are not visible here; confirm against the full
   definition.  */
2243 /* Function vect_analyze_loop.
2245 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2246 for it. The different analyses will record information in the
2247 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2250 vect_analyze_loop (struct loop
*loop
, loop_vec_info orig_loop_vinfo
,
2251 vec_info_shared
*shared
)
2253 auto_vector_sizes vector_sizes
;
2255 /* Autodetect first vector size we try. */
2256 current_vector_size
= 0;
/* Ask the target for the list of candidate vector sizes. */
2257 targetm
.vectorize
.autovectorize_vector_sizes (&vector_sizes
);
2258 unsigned int next_size
= 0;
2260 DUMP_VECT_SCOPE ("analyze_loop_nest");
/* Refuse to analyze a loop whose outer loop already carries a
   loop_vec_info flagged as vectorizable. */
2262 if (loop_outer (loop
)
2263 && loop_vec_info_for_loop (loop_outer (loop
))
2264 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
2265 return opt_loop_vec_info::failure_at (vect_location
,
2266 "outer-loop already vectorized.\n");
/* Collect the loop nest into SHARED->loop_nest; failure is reported
   with the message below. */
2268 if (!find_loop_nest (loop
, &shared
->loop_nest
))
2269 return opt_loop_vec_info::failure_at
2271 "not vectorized: loop nest containing two or more consecutive inner"
2272 " loops cannot be vectorized\n");
2274 unsigned n_stmts
= 0;
2275 poly_uint64 autodetected_vector_size
= 0;
2278 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2279 opt_loop_vec_info loop_vinfo
2280 = vect_analyze_loop_form (loop
, shared
);
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2285 "bad loop form.\n");
2291 if (orig_loop_vinfo
)
2292 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
) = orig_loop_vinfo
;
/* Run the main analysis.  RES holds its success or failure; FATAL and
   N_STMTS are passed through (FATAL's declaration is not visible
   here). */
2294 opt_result res
= vect_analyze_loop_2 (loop_vinfo
, fatal
, &n_stmts
);
2297 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo
) = 1;
/* Record the vector size currently in effect as the autodetected one.
   NOTE(review): any condition guarding this assignment is not visible
   here. */
2305 autodetected_vector_size
= current_vector_size
;
/* Test whether the next candidate size equals the autodetected size.
   NOTE(review): the statement controlled by this test is not visible;
   presumably it skips that candidate -- confirm. */
2307 if (next_size
< vector_sizes
.length ()
2308 && known_eq (vector_sizes
[next_size
], autodetected_vector_size
))
/* Stop retrying and hand RES's failure to the caller when the candidate
   list is exhausted or no vector size is in effect (the leading operand
   of this condition is not visible here). */
2312 || next_size
== vector_sizes
.length ()
2313 || known_eq (current_vector_size
, 0U))
2314 return opt_loop_vec_info::propagate_failure (res
);
2316 /* Try the next biggest vector size. */
2317 current_vector_size
= vector_sizes
[next_size
++];
2318 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE
, vect_location
,
2321 "***** Re-trying analysis with "
2323 dump_dec (MSG_NOTE
, current_vector_size
);
2324 dump_printf (MSG_NOTE
, "\n");
/* fold_left_reduction_fn: map the scalar operation CODE to the internal
   function that performs the same reduction in order.
   CODE is the tree code of the reduction; REDUC_FN is an output pointer
   that receives the internal function.
   NOTE(review): only the assignment of IFN_FOLD_LEFT_PLUS is visible
   here; the test on CODE and the return statements are not shown. */
2329 /* Return true if there is an in-order reduction function for CODE, storing
2330 it in *REDUC_FN if so. */
2333 fold_left_reduction_fn (tree_code code
, internal_fn
*reduc_fn
)
2338 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
/* reduction_fn_for_scalar_code stores one of the following in *REDUC_FN,
   in the order the assignments appear below: IFN_REDUC_MAX,
   IFN_REDUC_MIN, IFN_REDUC_PLUS, IFN_REDUC_AND, IFN_REDUC_IOR,
   IFN_REDUC_XOR, or IFN_LAST.
   NOTE(review): the case labels that select each assignment are not
   visible here, so the exact tree code tied to each internal function
   should be confirmed against the full definition. */
2346 /* Function reduction_fn_for_scalar_code
2349 CODE - tree_code of a reduction operations.
2352 REDUC_FN - the corresponding internal function to be used to reduce the
2353 vector of partial results into a single scalar result, or IFN_LAST
2354 if the operation is a supported reduction operation, but does not have
2355 such an internal function.
2357 Return FALSE if CODE currently cannot be vectorized as reduction. */
2360 reduction_fn_for_scalar_code (enum tree_code code
, internal_fn
*reduc_fn
)
2365 *reduc_fn
= IFN_REDUC_MAX
;
2369 *reduc_fn
= IFN_REDUC_MIN
;
2373 *reduc_fn
= IFN_REDUC_PLUS
;
2377 *reduc_fn
= IFN_REDUC_AND
;
2381 *reduc_fn
= IFN_REDUC_IOR
;
2385 *reduc_fn
= IFN_REDUC_XOR
;
2390 *reduc_fn
= IFN_LAST
;
/* neutral_op_for_slp_reduction: pick the value that can pad an SLP
   reduction without changing its result.
   The visible code returns, depending on CODE, a zero constant, a one
   constant or an all-ones constant of the scalar element type, or the
   value the first statement's PHI receives on the loop preheader edge.
   NOTE(review): apart from WIDEN_SUM_EXPR the case labels are not
   visible here, nor is the remainder of the parameter list. */
2398 /* If there is a neutral value X such that SLP reduction NODE would not
2399 be affected by the introduction of additional X elements, return that X,
2400 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2401 is true if the SLP statements perform a single reduction, false if each
2402 statement performs an independent reduction. */
2405 neutral_op_for_slp_reduction (slp_tree slp_node
, tree_code code
,
/* Work from the first scalar statement of the SLP node: take the element
   type of its vector type and the loop that contains the statement. */
2408 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
2409 stmt_vec_info stmt_vinfo
= stmts
[0];
2410 tree vector_type
= STMT_VINFO_VECTYPE (stmt_vinfo
);
2411 tree scalar_type
= TREE_TYPE (vector_type
);
2412 struct loop
*loop
= gimple_bb (stmt_vinfo
->stmt
)->loop_father
;
2417 case WIDEN_SUM_EXPR
:
2424 return build_zero_cst (scalar_type
);
2427 return build_one_cst (scalar_type
);
2430 return build_all_ones_cst (scalar_type
);
2434 /* For MIN/MAX the initial values are neutral. A reduction chain
2435 has only a single initial value, so that value is neutral for
2438 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
,
2439 loop_preheader_edge (loop
));
/* report_vect_op: emit MSG followed by the statement STMT to the dump at
   vect_location.
   MSG_TYPE is the dump kind handed to dump_printf_loc (callers in this
   file pass MSG_NOTE or MSG_MISSED_OPTIMIZATION).  The "%s%G" format
   prints the message string and then the gimple statement. */
2447 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2448 STMT is printed with a message MSG. */
2451 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
2453 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
/* vect_valid_reduction_input_p accepts DEF_STMT_INFO when any of the
   following holds for its statement:
   - it is a gimple assignment;
   - it is a gimple call;
   - its def type is vect_induction_def;
   - it is a PHI whose def type is vect_internal_def and whose block is
     not a loop header. */
2456 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2457 operation. Return true if the results of DEF_STMT_INFO are something
2458 that can be accumulated by such a reduction. */
2461 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info
)
2463 return (is_gimple_assign (def_stmt_info
->stmt
)
2464 || is_gimple_call (def_stmt_info
->stmt
)
2465 || STMT_VINFO_DEF_TYPE (def_stmt_info
) == vect_induction_def
2466 || (gimple_code (def_stmt_info
->stmt
) == GIMPLE_PHI
2467 && STMT_VINFO_DEF_TYPE (def_stmt_info
) == vect_internal_def
2468 && !is_loop_header_bb_p (gimple_bb (def_stmt_info
->stmt
))));
/* Overview of vect_is_slp_reduction as far as the visible statements
   show:
   1. Starting from the PHI result, follow the in-loop use of each
      intermediate value, requiring every statement on the way to be an
      assignment inside LOOP with the same rhs code as FIRST_STMT, and
      push each one onto REDUC_CHAIN.
   2. Reject the candidate unless the walk ended back at PHI and SIZE is
      at least 2.
   3. Walk the chain again and, where needed, swap the two operands so
      the reduction operand is the second one; a swap that moves a
      constant into the first operand sets LOOP_VINFO_OPERANDS_SWAPPED.
   4. Link the chain through REDUC_GROUP_FIRST_ELEMENT and
      REDUC_GROUP_NEXT_ELEMENT, push its head onto
      LOOP_VINFO_REDUCTION_CHAINS and store SIZE in REDUC_GROUP_SIZE.
   NOTE(review): the loop headers, the early returns and the statements
   updating SIZE, FOUND and NLOOP_USES are not visible here. */
2471 /* Detect SLP reduction of the form:
2481 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2482 FIRST_STMT is the first reduction stmt in the chain
2483 (a2 = operation (a1)).
2485 Return TRUE if a reduction chain was detected. */
2488 vect_is_slp_reduction (loop_vec_info loop_info
, gimple
*phi
,
2491 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2492 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2493 enum tree_code code
;
2494 gimple
*loop_use_stmt
= NULL
;
2495 stmt_vec_info use_stmt_info
;
2497 imm_use_iterator imm_iter
;
2498 use_operand_p use_p
;
2499 int nloop_uses
, size
= 0, n_out_of_loop_uses
;
2502 if (loop
!= vect_loop
)
2505 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
2506 lhs
= PHI_RESULT (phi
);
2507 code
= gimple_assign_rhs_code (first_stmt
);
/* Classify each non-debug use of LHS as the reduction PHI itself, a use
   inside LOOP, or a use outside LOOP. */
2511 n_out_of_loop_uses
= 0;
2512 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
2514 gimple
*use_stmt
= USE_STMT (use_p
);
2515 if (is_gimple_debug (use_stmt
))
2518 /* Check if we got back to the reduction phi. */
2519 if (use_stmt
== phi
)
2521 loop_use_stmt
= use_stmt
;
2526 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2528 loop_use_stmt
= use_stmt
;
2532 n_out_of_loop_uses
++;
2534 /* There can be either a single use in the loop or two uses in
2536 if (nloop_uses
> 1 || (n_out_of_loop_uses
&& nloop_uses
))
2543 /* We reached a statement with no loop uses. */
2544 if (nloop_uses
== 0)
2547 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2548 if (gimple_code (loop_use_stmt
) == GIMPLE_PHI
)
2551 if (!is_gimple_assign (loop_use_stmt
)
2552 || code
!= gimple_assign_rhs_code (loop_use_stmt
)
2553 || !flow_bb_inside_loop_p (loop
, gimple_bb (loop_use_stmt
)))
2556 /* Insert USE_STMT into reduction chain. */
2557 use_stmt_info
= loop_info
->lookup_stmt (loop_use_stmt
);
2558 reduc_chain
.safe_push (use_stmt_info
);
2560 lhs
= gimple_assign_lhs (loop_use_stmt
);
2564 if (!found
|| loop_use_stmt
!= phi
|| size
< 2)
2567 /* Swap the operands, if needed, to make the reduction operand be the second
2569 lhs
= PHI_RESULT (phi
);
2570 for (unsigned i
= 0; i
< reduc_chain
.length (); ++i
)
2572 gassign
*next_stmt
= as_a
<gassign
*> (reduc_chain
[i
]->stmt
);
2573 if (gimple_assign_rhs2 (next_stmt
) == lhs
)
2575 tree op
= gimple_assign_rhs1 (next_stmt
);
2576 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (op
);
2578 /* Check that the other def is either defined in the loop
2579 ("vect_internal_def"), or it's an induction (defined by a
2580 loop-header phi-node). */
2582 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
))
2583 && vect_valid_reduction_input_p (def_stmt_info
))
2585 lhs
= gimple_assign_lhs (next_stmt
);
2593 tree op
= gimple_assign_rhs2 (next_stmt
);
2594 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (op
);
2596 /* Check that the other def is either defined in the loop
2597 ("vect_internal_def"), or it's an induction (defined by a
2598 loop-header phi-node). */
2600 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
))
2601 && vect_valid_reduction_input_p (def_stmt_info
))
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE
, vect_location
, "swapping oprnds: %G",
2607 swap_ssa_operands (next_stmt
,
2608 gimple_assign_rhs1_ptr (next_stmt
),
2609 gimple_assign_rhs2_ptr (next_stmt
));
2610 update_stmt (next_stmt
);
2612 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt
)))
2613 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
2619 lhs
= gimple_assign_lhs (next_stmt
);
2622 /* Build up the actual chain. */
2623 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
2625 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
2626 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
2628 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
2629 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
2631 /* Save the chain for further analysis in SLP detection. */
2632 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
2633 REDUC_GROUP_SIZE (reduc_chain
[0]) = size
;
/* needs_fold_left_reduction_p examines TYPE in three steps:
   - scalar floating-point types: the visible result is
     !flag_associative_math (statements between the type test and that
     return are not visible here);
   - integral types: tests operation_no_trapping_overflow, and, when
     NEED_WRAPPING_INTEGRAL_OVERFLOW is set, whether TYPE lacks wrapping
     overflow while CODE can overflow;
   - saturating fixed-point types: tested last.
   NOTE(review): the values returned for the integral and fixed-point
   tests are not visible here. */
2638 /* Return true if we need an in-order reduction for operation CODE
2639 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2640 overflow must wrap. */
2643 needs_fold_left_reduction_p (tree type
, tree_code code
,
2644 bool need_wrapping_integral_overflow
)
2646 /* CHECKME: check for !flag_finite_math_only too? */
2647 if (SCALAR_FLOAT_TYPE_P (type
))
2655 return !flag_associative_math
;
2658 if (INTEGRAL_TYPE_P (type
))
2660 if (!operation_no_trapping_overflow (type
, code
))
2662 if (need_wrapping_integral_overflow
2663 && !TYPE_OVERFLOW_WRAPS (type
)
2664 && operation_can_overflow (code
))
2669 if (SAT_FIXED_POINT_TYPE_P (type
))
/* Overview of check_reduction_path as far as the visible statements
   show:
   - PATH is a stack of (operand iterator, use) pairs and VISITED is a
     bitmap used to avoid following the same SSA name twice;
   - the walk starts at the PHI argument equal to LOOP_ARG and follows
     SSA definitions inside LOOP, backtracking by popping PATH when a
     definition is a nop or lies outside LOOP;
   - the collected path is then validated: each operand on it must have
     a single use in an assignment, and a statement whose rhs code
     differs from CODE is examined for the PLUS_EXPR/MINUS_EXPR case,
     where NEG tracks negation of the reduction value;
   - the result is !FAIL && !NEG.
   LOC is only used for the "reduction path: " dump output.
   NOTE(review): the loop headers and the statements that set FAIL and
   NEG are not visible here. */
2675 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2676 reduction operation CODE has a handled computation expression. */
2679 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
2680 tree loop_arg
, enum tree_code code
)
2682 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
2683 auto_bitmap visited
;
2684 tree lookfor
= PHI_RESULT (phi
);
/* Advance the operand iterator until it points at the PHI argument that
   equals LOOP_ARG, then mark the iterator as exhausted by setting its
   index to the operand count. */
2686 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
2687 while (USE_FROM_PTR (curr
) != loop_arg
)
2688 curr
= op_iter_next_use (&curri
);
2689 curri
.i
= curri
.numops
;
2692 path
.safe_push (std::make_pair (curri
, curr
));
2693 tree use
= USE_FROM_PTR (curr
);
2696 gimple
*def
= SSA_NAME_DEF_STMT (use
);
2697 if (gimple_nop_p (def
)
2698 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
2703 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
2707 curr
= op_iter_next_use (&curri
);
2708 /* Skip already visited or non-SSA operands (from iterating
2710 while (curr
!= NULL_USE_OPERAND_P
2711 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
2712 || ! bitmap_set_bit (visited
,
2714 (USE_FROM_PTR (curr
)))));
2716 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
2717 if (curr
== NULL_USE_OPERAND_P
)
2722 if (gimple_code (def
) == GIMPLE_PHI
)
2723 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
2725 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
2726 while (curr
!= NULL_USE_OPERAND_P
2727 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
2728 || ! bitmap_set_bit (visited
,
2730 (USE_FROM_PTR (curr
)))))
2731 curr
= op_iter_next_use (&curri
);
2732 if (curr
== NULL_USE_OPERAND_P
)
2737 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2739 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
2741 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
2742 FOR_EACH_VEC_ELT (path
, i
, x
)
2743 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
2744 dump_printf (MSG_NOTE
, "\n");
2747 /* Check whether the reduction path detected is valid. */
2748 bool fail
= path
.length () == 0;
2750 for (unsigned i
= 1; i
< path
.length (); ++i
)
2752 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
2753 tree op
= USE_FROM_PTR (path
[i
].second
);
2754 if (! has_single_use (op
)
2755 || ! is_gimple_assign (use_stmt
))
2760 if (gimple_assign_rhs_code (use_stmt
) != code
)
2762 if (code
== PLUS_EXPR
2763 && gimple_assign_rhs_code (use_stmt
) == MINUS_EXPR
)
2765 /* Track whether we negate the reduction value each iteration. */
2766 if (gimple_assign_rhs2 (use_stmt
) == op
)
2776 return ! fail
&& ! neg
;
/* Overview of vect_is_simple_reduction as far as the visible statements
   show.  PHI_INFO describes the candidate reduction PHI.  On entry
   *DOUBLE_REDUC is cleared and *V_REDUC_TYPE is set to
   TREE_CODE_REDUCTION; later code may set *DOUBLE_REDUC to true and
   *V_REDUC_TYPE to COND_REDUCTION or FOLD_LEFT_REDUCTION.  The statement
   returned on success is the stmt_vec_info of the definition of the PHI's
   latch argument.  The checks run in this order:
   - count the in-loop uses of the PHI result and reject uses outside the
     loop;
   - require the latch argument to be an SSA name defined inside the loop
     by an assignment or a PHI;
   - count in-loop uses of that definition and collect the PHIs using it;
   - recognize double reductions and nested cycles;
   - classify the operation (COND_EXPR, commutative/associative binary
     operation) and check operand types;
   - decide whether an in-order (fold-left) reduction is required;
   - match the operand pattern, swapping operands where permitted;
   - fall back to SLP reduction chains and to check_reduction_path.
   NOTE(review): most early-return statements and some declarations
   (e.g. NAME, TYPE, the DOUBLE_REDUC parameter) are not visible here. */
2780 /* Function vect_is_simple_reduction
2782 (1) Detect a cross-iteration def-use cycle that represents a simple
2783 reduction computation. We look for the following pattern:
2788 a2 = operation (a3, a1)
2795 a2 = operation (a3, a1)
2798 1. operation is commutative and associative and it is safe to
2799 change the order of the computation
2800 2. no uses for a2 in the loop (a2 is used out of the loop)
2801 3. no uses of a1 in the loop besides the reduction operation
2802 4. no uses of a1 outside the loop.
2804 Conditions 1,4 are tested here.
2805 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2807 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2810 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2814 inner loop (def of a3)
2817 (4) Detect condition expressions, ie:
2818 for (int i = 0; i < N; i++)
2824 static stmt_vec_info
2825 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
2827 bool need_wrapping_integral_overflow
,
2828 enum vect_reduction_type
*v_reduc_type
)
2830 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
2831 struct loop
*loop
= (gimple_bb (phi
))->loop_father
;
2832 struct loop
*vect_loop
= LOOP_VINFO_LOOP (loop_info
);
2833 bool nested_in_vect_loop
= flow_loop_nested_p (vect_loop
, loop
);
2834 gimple
*phi_use_stmt
= NULL
;
2835 enum tree_code orig_code
, code
;
2836 tree op1
, op2
, op3
= NULL_TREE
, op4
= NULL_TREE
;
2839 imm_use_iterator imm_iter
;
2840 use_operand_p use_p
;
2843 *double_reduc
= false;
2844 *v_reduc_type
= TREE_CODE_REDUCTION
;
2846 tree phi_name
= PHI_RESULT (phi
);
2847 /* ??? If there are no uses of the PHI result the inner loop reduction
2848 won't be detected as possibly double-reduction by vectorizable_reduction
2849 because that tries to walk the PHI arg from the preheader edge which
2850 can be constant. See PR60382. */
2851 if (has_zero_uses (phi_name
))
/* Count the non-debug uses of the PHI result inside LOOP, remembering
   the most recent one in PHI_USE_STMT; a use outside LOOP is reported
   as "intermediate value used outside loop". */
2853 unsigned nphi_def_loop_uses
= 0;
2854 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
2856 gimple
*use_stmt
= USE_STMT (use_p
);
2857 if (is_gimple_debug (use_stmt
))
2860 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2862 if (dump_enabled_p ())
2863 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2864 "intermediate value used outside loop.\n");
2869 nphi_def_loop_uses
++;
2870 phi_use_stmt
= use_stmt
;
/* The value the PHI receives along the latch edge must be an SSA
   name. */
2873 edge latch_e
= loop_latch_edge (loop
);
2874 tree loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
2875 if (TREE_CODE (loop_arg
) != SSA_NAME
)
2877 if (dump_enabled_p ())
2878 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2879 "reduction: not ssa_name: %T\n", loop_arg
);
2883 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (loop_arg
);
2885 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
2888 if (gassign
*def_stmt
= dyn_cast
<gassign
*> (def_stmt_info
->stmt
))
2890 name
= gimple_assign_lhs (def_stmt
);
2893 else if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
2895 name
= PHI_RESULT (def_stmt
);
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2902 "reduction: unhandled reduction operation: %G",
2903 def_stmt_info
->stmt
);
/* Count the non-debug uses of NAME inside LOOP and collect uses, as
   PHIs, into LCPHIS.  NOTE(review): the branch separating the two cases
   is not visible here; presumably the PHIs are the uses outside
   LOOP. */
2907 unsigned nlatch_def_loop_uses
= 0;
2908 auto_vec
<gphi
*, 3> lcphis
;
2909 bool inner_loop_of_double_reduc
= false;
2910 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, name
)
2912 gimple
*use_stmt
= USE_STMT (use_p
);
2913 if (is_gimple_debug (use_stmt
))
2915 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
2916 nlatch_def_loop_uses
++;
2919 /* We can have more than one loop-closed PHI. */
2920 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
2921 if (nested_in_vect_loop
2922 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
2923 == vect_double_reduction_def
))
2924 inner_loop_of_double_reduc
= true;
2928 /* If this isn't a nested cycle or if the nested cycle reduction value
2929 is used outside of the inner loop we cannot handle uses of the reduction
2931 if ((!nested_in_vect_loop
|| inner_loop_of_double_reduc
)
2932 && (nlatch_def_loop_uses
> 1 || nphi_def_loop_uses
> 1))
2934 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2936 "reduction used in loop.\n");
2940 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2941 defined in the inner loop. */
2944 gphi
*def_stmt
= as_a
<gphi
*> (def_stmt_info
->stmt
);
2945 op1
= PHI_ARG_DEF (def_stmt
, 0);
2947 if (gimple_phi_num_args (def_stmt
) != 1
2948 || TREE_CODE (op1
) != SSA_NAME
)
2950 if (dump_enabled_p ())
2951 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2952 "unsupported phi node definition.\n");
2957 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
2958 if (gimple_bb (def1
)
2959 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
2961 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
2962 && is_gimple_assign (def1
)
2963 && is_a
<gphi
*> (phi_use_stmt
)
2964 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
)))
2966 if (dump_enabled_p ())
2967 report_vect_op (MSG_NOTE
, def_stmt
,
2968 "detected double reduction: ");
2970 *double_reduc
= true;
2971 return def_stmt_info
;
2977 /* If we are vectorizing an inner reduction we are executing that
2978 in the original order only in case we are not dealing with a
2979 double reduction. */
2980 bool check_reduction
= true;
2981 if (flow_loop_nested_p (vect_loop
, loop
))
2985 check_reduction
= false;
2986 FOR_EACH_VEC_ELT (lcphis
, i
, lcphi
)
2987 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, gimple_phi_result (lcphi
))
2989 gimple
*use_stmt
= USE_STMT (use_p
);
2990 if (is_gimple_debug (use_stmt
))
2992 if (! flow_bb_inside_loop_p (vect_loop
, gimple_bb (use_stmt
)))
2993 check_reduction
= true;
/* From here on the latch definition is handled as an assignment
   (as_a checks the kind of the statement). */
2997 gassign
*def_stmt
= as_a
<gassign
*> (def_stmt_info
->stmt
);
2998 code
= orig_code
= gimple_assign_rhs_code (def_stmt
);
3000 if (nested_in_vect_loop
&& !check_reduction
)
3002 /* FIXME: Even for non-reductions code generation is funneled
3003 through vectorizable_reduction for the stmt defining the
3004 PHI latch value. So we have to artificially restrict ourselves
3005 for the supported operations. */
3006 switch (get_gimple_rhs_class (code
))
3008 case GIMPLE_BINARY_RHS
:
3009 case GIMPLE_TERNARY_RHS
:
3012 /* Not supported by vectorizable_reduction. */
3013 if (dump_enabled_p ())
3014 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3015 "nested cycle: not handled operation: ");
3018 if (dump_enabled_p ())
3019 report_vect_op (MSG_NOTE
, def_stmt
, "detected nested cycle: ");
3020 return def_stmt_info
;
3023 /* We can handle "res -= x[i]", which is non-associative by
3024 simply rewriting this into "res += -x[i]". Avoid changing
3025 gimple instruction for the first simple tests and only do this
3026 if we're allowed to change code at all. */
3027 if (code
== MINUS_EXPR
&& gimple_assign_rhs2 (def_stmt
) != phi_name
)
3030 if (code
== COND_EXPR
)
3032 if (! nested_in_vect_loop
)
3033 *v_reduc_type
= COND_REDUCTION
;
3035 op3
= gimple_assign_rhs1 (def_stmt
);
3036 if (COMPARISON_CLASS_P (op3
))
3038 op4
= TREE_OPERAND (op3
, 1);
3039 op3
= TREE_OPERAND (op3
, 0);
3041 if (op3
== phi_name
|| op4
== phi_name
)
3043 if (dump_enabled_p ())
3044 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3045 "reduction: condition depends on previous"
3050 op1
= gimple_assign_rhs2 (def_stmt
);
3051 op2
= gimple_assign_rhs3 (def_stmt
);
3053 else if (!commutative_tree_code (code
) || !associative_tree_code (code
))
3055 if (dump_enabled_p ())
3056 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3057 "reduction: not commutative/associative: ");
3060 else if (get_gimple_rhs_class (code
) == GIMPLE_BINARY_RHS
)
3062 op1
= gimple_assign_rhs1 (def_stmt
);
3063 op2
= gimple_assign_rhs2 (def_stmt
);
3067 if (dump_enabled_p ())
3068 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3069 "reduction: not handled operation: ");
3073 if (TREE_CODE (op1
) != SSA_NAME
&& TREE_CODE (op2
) != SSA_NAME
)
3075 if (dump_enabled_p ())
3076 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3077 "reduction: both uses not ssa_names: ");
/* Every operand that is an SSA name must have a type compatible with
   the type of the assignment's result. */
3082 type
= TREE_TYPE (gimple_assign_lhs (def_stmt
));
3083 if ((TREE_CODE (op1
) == SSA_NAME
3084 && !types_compatible_p (type
,TREE_TYPE (op1
)))
3085 || (TREE_CODE (op2
) == SSA_NAME
3086 && !types_compatible_p (type
, TREE_TYPE (op2
)))
3087 || (op3
&& TREE_CODE (op3
) == SSA_NAME
3088 && !types_compatible_p (type
, TREE_TYPE (op3
)))
3089 || (op4
&& TREE_CODE (op4
) == SSA_NAME
3090 && !types_compatible_p (type
, TREE_TYPE (op4
))))
3092 if (dump_enabled_p ())
3094 dump_printf_loc (MSG_NOTE
, vect_location
,
3095 "reduction: multiple types: operation type: "
3096 "%T, operands types: %T,%T",
3097 type
, TREE_TYPE (op1
), TREE_TYPE (op2
));
3099 dump_printf (MSG_NOTE
, ",%T", TREE_TYPE (op3
));
3102 dump_printf (MSG_NOTE
, ",%T", TREE_TYPE (op4
));
3103 dump_printf (MSG_NOTE
, "\n");
3109 /* Check whether it's ok to change the order of the computation.
3110 Generally, when vectorizing a reduction we change the order of the
3111 computation. This may change the behavior of the program in some
3112 cases, so we need to check that this is ok. One exception is when
3113 vectorizing an outer-loop: the inner-loop is executed sequentially,
3114 and therefore vectorizing reductions in the inner-loop during
3115 outer-loop vectorization is safe. */
3117 && *v_reduc_type
== TREE_CODE_REDUCTION
3118 && needs_fold_left_reduction_p (type
, code
,
3119 need_wrapping_integral_overflow
))
3120 *v_reduc_type
= FOLD_LEFT_REDUCTION
;
3122 /* Reduction is safe. We're dealing with one of the following:
3123 1) integer arithmetic and no trapv
3124 2) floating point arithmetic, and special flags permit this optimization
3125 3) nested cycle (i.e., outer loop vectorization). */
3126 stmt_vec_info def1_info
= loop_info
->lookup_def (op1
);
3127 stmt_vec_info def2_info
= loop_info
->lookup_def (op2
);
3128 if (code
!= COND_EXPR
&& !def1_info
&& !def2_info
)
3130 if (dump_enabled_p ())
3131 report_vect_op (MSG_NOTE
, def_stmt
, "reduction: no defs for operands: ");
3135 /* Check that one def is the reduction def, defined by PHI,
3136 the other def is either defined in the loop ("vect_internal_def"),
3137 or it's an induction (defined by a loop-header phi-node). */
3140 && def2_info
->stmt
== phi
3141 && (code
== COND_EXPR
3143 || !flow_bb_inside_loop_p (loop
, gimple_bb (def1_info
->stmt
))
3144 || vect_valid_reduction_input_p (def1_info
)))
3146 if (dump_enabled_p ())
3147 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
3148 return def_stmt_info
;
3152 && def1_info
->stmt
== phi
3153 && (code
== COND_EXPR
3155 || !flow_bb_inside_loop_p (loop
, gimple_bb (def2_info
->stmt
))
3156 || vect_valid_reduction_input_p (def2_info
)))
3158 if (! nested_in_vect_loop
&& orig_code
!= MINUS_EXPR
)
3160 /* Check if we can swap operands (just for simplicity - so that
3161 the rest of the code can assume that the reduction variable
3162 is always the last (second) argument). */
3163 if (code
== COND_EXPR
)
3165 /* Swap cond_expr by inverting the condition. */
3166 tree cond_expr
= gimple_assign_rhs1 (def_stmt
);
3167 enum tree_code invert_code
= ERROR_MARK
;
3168 enum tree_code cond_code
= TREE_CODE (cond_expr
);
3170 if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
3172 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
3173 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
3175 if (invert_code
!= ERROR_MARK
)
3177 TREE_SET_CODE (cond_expr
, invert_code
);
3178 swap_ssa_operands (def_stmt
,
3179 gimple_assign_rhs2_ptr (def_stmt
),
3180 gimple_assign_rhs3_ptr (def_stmt
));
3184 if (dump_enabled_p ())
3185 report_vect_op (MSG_NOTE
, def_stmt
,
3186 "detected reduction: cannot swap operands "
3192 swap_ssa_operands (def_stmt
, gimple_assign_rhs1_ptr (def_stmt
),
3193 gimple_assign_rhs2_ptr (def_stmt
));
3195 if (dump_enabled_p ())
3196 report_vect_op (MSG_NOTE
, def_stmt
,
3197 "detected reduction: need to swap operands: ");
3199 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt
)))
3200 LOOP_VINFO_OPERANDS_SWAPPED (loop_info
) = true;
3204 if (dump_enabled_p ())
3205 report_vect_op (MSG_NOTE
, def_stmt
, "detected reduction: ");
3208 return def_stmt_info
;
3211 /* Try to find SLP reduction chain. */
3212 if (! nested_in_vect_loop
3213 && code
!= COND_EXPR
3214 && orig_code
!= MINUS_EXPR
3215 && vect_is_slp_reduction (loop_info
, phi
, def_stmt
))
3217 if (dump_enabled_p ())
3218 report_vect_op (MSG_NOTE
, def_stmt
,
3219 "reduction: detected reduction chain: ");
3221 return def_stmt_info
;
3224 /* Look for the expression computing loop_arg from loop PHI result. */
3225 if (check_reduction_path (vect_location
, loop
, phi
, loop_arg
, code
))
3226 return def_stmt_info
;
3228 if (dump_enabled_p ())
3230 report_vect_op (MSG_MISSED_OPTIMIZATION
, def_stmt
,
3231 "reduction: unknown pattern: ");
/* vect_force_simple_reduction calls vect_is_simple_reduction and then
   records the outcome on both statements: the reduction type reported
   through V_REDUC_TYPE is stored in STMT_VINFO_REDUC_TYPE of PHI_INFO
   and of the returned DEF_INFO, and the two are cross-linked through
   STMT_VINFO_REDUC_DEF.
   NOTE(review): whether these stores are guarded by a check that
   DEF_INFO is non-null, and what the function returns, is not visible
   here. */
3237 /* Wrapper around vect_is_simple_reduction, which will modify code
3238 in-place if it enables detection of more reductions. Arguments
3242 vect_force_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
3244 bool need_wrapping_integral_overflow
)
3246 enum vect_reduction_type v_reduc_type
;
3247 stmt_vec_info def_info
3248 = vect_is_simple_reduction (loop_info
, phi_info
, double_reduc
,
3249 need_wrapping_integral_overflow
,
3253 STMT_VINFO_REDUC_TYPE (phi_info
) = v_reduc_type
;
3254 STMT_VINFO_REDUC_DEF (phi_info
) = def_info
;
3255 STMT_VINFO_REDUC_TYPE (def_info
) = v_reduc_type
;
3256 STMT_VINFO_REDUC_DEF (def_info
) = phi_info
;
/* Overview of vect_get_known_peeling_cost as far as the visible
   statements show:
   - when the iteration count is not known, *PEEL_ITERS_EPILOGUE is set
     to half the assumed vectorization factor and taken-branch costs are
     recorded;
   - otherwise PEEL_ITERS_PROLOGUE is clamped to the iteration count and
     *PEEL_ITERS_EPILOGUE is (niters - prologue) % assumed_vf, bumped to
     assumed_vf when peeling for gaps is required and the remainder is
     zero;
   - each entry of SCALAR_COST_VEC is then scaled by the number of peeled
     iterations and recorded into PROLOGUE_COST_VEC and
     EPILOGUE_COST_VEC, with the results accumulated in RETVAL.
   NOTE(review): the return type, the declaration of RETVAL and the final
   return statement are not visible here. */
3261 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3263 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
3264 int *peel_iters_epilogue
,
3265 stmt_vector_for_cost
*scalar_cost_vec
,
3266 stmt_vector_for_cost
*prologue_cost_vec
,
3267 stmt_vector_for_cost
*epilogue_cost_vec
)
3270 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3272 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
3274 *peel_iters_epilogue
= assumed_vf
/ 2;
3275 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_NOTE
, vect_location
,
3277 "cost model: epilogue peel iters set to vf/2 "
3278 "because loop iterations are unknown .\n");
3280 /* If peeled iterations are known but number of scalar loop
3281 iterations are unknown, count a taken branch per peeled loop. */
3282 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3283 NULL
, 0, vect_prologue
);
/* NOTE(review): this second call assigns to RETVAL, discarding the value
   from the call above, and records a vect_epilogue cost into
   PROLOGUE_COST_VEC.  Confirm whether `retval +=` with EPILOGUE_COST_VEC
   was intended. */
3284 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
3285 NULL
, 0, vect_epilogue
);
3289 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
/* Never peel more prologue iterations than the loop executes. */
3290 peel_iters_prologue
= niters
< peel_iters_prologue
?
3291 niters
: peel_iters_prologue
;
3292 *peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
3293 /* If we need to peel for gaps, but no peeling is required, we have to
3294 peel VF iterations. */
3295 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !*peel_iters_epilogue
)
3296 *peel_iters_epilogue
= assumed_vf
;
3299 stmt_info_for_cost
*si
;
/* Charge every scalar statement cost once per peeled prologue
   iteration. */
3301 if (peel_iters_prologue
)
3302 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3303 retval
+= record_stmt_cost (prologue_cost_vec
,
3304 si
->count
* peel_iters_prologue
,
3305 si
->kind
, si
->stmt_info
, si
->misalign
,
/* Likewise for the peeled epilogue iterations. */
3307 if (*peel_iters_epilogue
)
3308 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
3309 retval
+= record_stmt_cost (epilogue_cost_vec
,
3310 si
->count
* *peel_iters_epilogue
,
3311 si
->kind
, si
->stmt_info
, si
->misalign
,
/* NOTE(review): every line of this listing still carries its original
   line number, and the numbering has gaps: the lines in those gaps
   (braces, else and return lines, some statements and arguments) are
   not visible here.  Overview of what is visible:

   - With an unlimited cost model both outputs are set to 0.
   - add_stmt_cost records, in the target cost data, the run-time checks
     needed for versioning (misalignment, aliasing, niters) and the
     scalar statements of the peeled prologue and epilogue iterations.
   - finish_cost then yields the vector prologue, body and epilogue
     costs; the prologue + epilogue sum is VEC_OUTSIDE_COST.
   - If SAVING_PER_VITER is not positive both outputs are set to -1.
   - Otherwise MIN_PROFITABLE_ITERS and MIN_PROFITABLE_ESTIMATE are
     computed from those costs and stored through the two pointers.  */
3317 /* Function vect_estimate_min_profitable_iters
3319 Return the number of iterations required for the vector version of the
3320 loop to be profitable relative to the cost of the scalar version of the loop.
3323 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3324 of iterations for vectorization. -1 value means loop vectorization
3325 is not profitable. This returned value may be used for dynamic
3326 profitability check.
3328 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3329 for static check against estimated number of iterations. */
3332 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
3333 int *ret_min_profitable_niters
,
3334 int *ret_min_profitable_estimate
)
3336 int min_profitable_iters
;
3337 int min_profitable_estimate
;
3338 int peel_iters_prologue
;
3339 int peel_iters_epilogue
;
3340 unsigned vec_inside_cost
= 0;
3341 int vec_outside_cost
= 0;
3342 unsigned vec_prologue_cost
= 0;
3343 unsigned vec_epilogue_cost
= 0;
3344 int scalar_single_iter_cost
= 0;
3345 int scalar_outside_cost
= 0;
3346 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
3347 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
3348 void *target_cost_data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3350 /* Cost model disabled. */
3351 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
3353 if (dump_enabled_p ())
3354 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
3355 *ret_min_profitable_niters
= 0;
3356 *ret_min_profitable_estimate
= 0;
3360 /* Requires loop versioning tests to handle misalignment. */
3361 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
3363 /* FIXME: Make cost depend on complexity of individual check. */
3364 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
3365 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
3367 if (dump_enabled_p ())
3368 dump_printf (MSG_NOTE
,
3369 "cost model: Adding cost of checks for loop "
3370 "versioning to treat misalignment.\n");
3373 /* Requires loop versioning with alias checks. */
3374 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
3376 /* FIXME: Make cost depend on complexity of individual check. */
3377 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
3378 (void) add_stmt_cost (target_cost_data
, len
, vector_stmt
, NULL
, 0,
3380 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
3382 /* Count LEN - 1 ANDs and LEN comparisons. */
3383 (void) add_stmt_cost (target_cost_data
, len
* 2 - 1, scalar_stmt
,
3384 NULL
, 0, vect_prologue
);
3385 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
3388 /* Count LEN - 1 ANDs and LEN comparisons. */
3389 unsigned int nstmts
= len
* 2 - 1;
3390 /* +1 for each bias that needs adding. */
3391 for (unsigned int i
= 0; i
< len
; ++i
)
3392 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
3394 (void) add_stmt_cost (target_cost_data
, nstmts
, scalar_stmt
,
3395 NULL
, 0, vect_prologue
);
3397 if (dump_enabled_p ())
3398 dump_printf (MSG_NOTE
,
3399 "cost model: Adding cost of checks for loop "
3400 "versioning aliasing.\n");
3403 /* Requires loop versioning with niter checks. */
3404 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
3406 /* FIXME: Make cost depend on complexity of individual check. */
3407 (void) add_stmt_cost (target_cost_data
, 1, vector_stmt
, NULL
, 0,
3409 if (dump_enabled_p ())
3410 dump_printf (MSG_NOTE
,
3411 "cost model: Adding cost of checks for loop "
3412 "versioning niters.\n");
/* Every form of versioning also records one taken-branch cost.  */
3415 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3416 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
, NULL
, 0,
3419 /* Count statements in scalar loop. Using this as scalar cost for a single
3422 TODO: Add outer loop support.
3424 TODO: Consider assigning different costs to different scalar
3427 scalar_single_iter_cost
3428 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo
);
3430 /* Add additional cost for the peeled instructions in prologue and epilogue
3431 loop. (For fully-masked loops there will be no peeling.)
3433 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3434 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3436 TODO: Build an expression that represents peel_iters for prologue and
3437 epilogue to be used in a run-time test. */
/* A fully-masked loop starts from zero peeled iterations.  */
3439 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3441 peel_iters_prologue
= 0;
3442 peel_iters_epilogue
= 0;
3444 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
3446 /* We need to peel exactly one iteration. */
3447 peel_iters_epilogue
+= 1;
3448 stmt_info_for_cost
*si
;
3450 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
3452 (void) add_stmt_cost (target_cost_data
, si
->count
,
3453 si
->kind
, si
->stmt_info
, si
->misalign
,
/* NOTE(review): the condition guarding this vf/2 estimate falls in a
   gap of the listing (original lines 3454-3458).  */
3459 peel_iters_prologue
= assumed_vf
/ 2;
3460 if (dump_enabled_p ())
3461 dump_printf (MSG_NOTE
, "cost model: "
3462 "prologue peel iters set to vf/2.\n");
3464 /* If peeling for alignment is unknown, loop bound of main loop becomes
3466 peel_iters_epilogue
= assumed_vf
/ 2;
3467 if (dump_enabled_p ())
3468 dump_printf (MSG_NOTE
, "cost model: "
3469 "epilogue peel iters set to vf/2 because "
3470 "peeling for alignment is unknown.\n");
3472 /* If peeled iterations are unknown, count a taken branch and a not taken
3473 branch per peeled loop. Even if scalar loop iterations are known,
3474 vector iterations are not known since peeled prologue iterations are
3475 not known. Hence guards remain the same. */
3476 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
3477 NULL
, 0, vect_prologue
);
3478 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
3479 NULL
, 0, vect_prologue
);
3480 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
3481 NULL
, 0, vect_epilogue
);
3482 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_not_taken
,
3483 NULL
, 0, vect_epilogue
);
3484 stmt_info_for_cost
*si
;
3486 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
3488 (void) add_stmt_cost (target_cost_data
,
3489 si
->count
* peel_iters_prologue
,
3490 si
->kind
, si
->stmt_info
, si
->misalign
,
3492 (void) add_stmt_cost (target_cost_data
,
3493 si
->count
* peel_iters_epilogue
,
3494 si
->kind
, si
->stmt_info
, si
->misalign
,
/* Here the prologue count is taken from NPEEL and
   vect_get_known_peeling_cost fills two temporary cost vectors, which
   are then replayed into the target cost data as vect_prologue and
   vect_epilogue costs and released.  */
3500 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
3501 stmt_info_for_cost
*si
;
3503 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
3505 prologue_cost_vec
.create (2);
3506 epilogue_cost_vec
.create (2);
3507 peel_iters_prologue
= npeel
;
3509 (void) vect_get_known_peeling_cost (loop_vinfo
, peel_iters_prologue
,
3510 &peel_iters_epilogue
,
3511 &LOOP_VINFO_SCALAR_ITERATION_COST
3514 &epilogue_cost_vec
);
3516 FOR_EACH_VEC_ELT (prologue_cost_vec
, j
, si
)
3517 (void) add_stmt_cost (data
, si
->count
, si
->kind
, si
->stmt_info
,
3518 si
->misalign
, vect_prologue
);
3520 FOR_EACH_VEC_ELT (epilogue_cost_vec
, j
, si
)
3521 (void) add_stmt_cost (data
, si
->count
, si
->kind
, si
->stmt_info
,
3522 si
->misalign
, vect_epilogue
);
3524 prologue_cost_vec
.release ();
3525 epilogue_cost_vec
.release ();
3528 /* FORNOW: The scalar outside cost is incremented in one of the
3531 1. The vectorizer checks for alignment and aliasing and generates
3532 a condition that allows dynamic vectorization. A cost model
3533 check is ANDED with the versioning condition. Hence scalar code
3534 path now has the added cost of the versioning check.
3536 if (cost > th & versioning_check)
3539 Hence run-time scalar is incremented by not-taken branch cost.
3541 2. The vectorizer then checks if a prologue is required. If the
3542 cost model check was not done before during versioning, it has to
3543 be done before the prologue check.
3546 prologue = scalar_iters
3551 if (prologue == num_iters)
3554 Hence the run-time scalar cost is incremented by a taken branch,
3555 plus a not-taken branch, plus a taken branch cost.
3557 3. The vectorizer then checks if an epilogue is required. If the
3558 cost model check was not done before during prologue check, it
3559 has to be done with the epilogue check.
3565 if (prologue == num_iters)
3568 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3571 Hence the run-time scalar cost should be incremented by 2 taken
3574 TODO: The back end may reorder the BBS's differently and reverse
3575 conditions/branch directions. Change the estimates below to
3576 something more reasonable. */
3578 /* If the number of iterations is known and we do not do versioning, we can
3579 decide whether to vectorize at compile time. Hence the scalar version
3580 does not carry cost model guard costs. */
3581 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
3582 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3584 /* Cost model check occurs at versioning. */
3585 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
3586 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
3589 /* Cost model check occurs at prologue generation. */
3590 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
3591 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
3592 + vect_get_stmt_cost (cond_branch_not_taken
);
3593 /* Cost model check occurs at epilogue generation. */
3595 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
3599 /* Complete the target-specific cost calculations. */
3600 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
), &vec_prologue_cost
,
3601 &vec_inside_cost
, &vec_epilogue_cost
);
/* Everything outside the vector loop body: prologue plus epilogue.  */
3603 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
3605 if (dump_enabled_p ())
3607 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3608 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
3610 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
3612 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
3614 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
3615 scalar_single_iter_cost
);
3616 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
3617 scalar_outside_cost
);
3618 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
3620 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
3621 peel_iters_prologue
);
3622 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
3623 peel_iters_epilogue
);
3626 /* Calculate number of iterations required to make the vector version
3627 profitable, relative to the loop bodies only. The following condition
3629 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3631 SIC = scalar iteration cost, VIC = vector iteration cost,
3632 VOC = vector outside cost, VF = vectorization factor,
3633 NPEEL = prologue iterations + epilogue iterations,
3634 SOC = scalar outside cost for run time cost model check. */
/* NOTE(review): the tail of this initializer (original line 3637) is
   not visible; per the formulas in this function it is presumably
   the SIC * VF - VIC term.  */
3636 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
3638 if (saving_per_viter
<= 0)
3640 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
3641 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
3642 "vectorization did not happen for a simd loop");
3644 if (dump_enabled_p ())
3645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3646 "cost model: the vector iteration cost = %d "
3647 "divided by the scalar iteration cost = %d "
3648 "is greater or equal to the vectorization factor = %d"
3650 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
3651 *ret_min_profitable_niters
= -1;
3652 *ret_min_profitable_estimate
= -1;
3656 /* ??? The "if" arm is written to handle all cases; see below for what
3657 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3658 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3660 /* Rewriting the condition above in terms of the number of
3661 vector iterations (vniters) rather than the number of
3662 scalar iterations (niters) gives:
3664 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3666 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3668 For integer N, X and Y when X > 0:
3670 N * X > Y <==> N >= (Y /[floor] X) + 1. */
3671 int outside_overhead
= (vec_outside_cost
3672 - scalar_single_iter_cost
* peel_iters_prologue
3673 - scalar_single_iter_cost
* peel_iters_epilogue
3674 - scalar_outside_cost
);
3675 /* We're only interested in cases that require at least one
3676 vector iteration. */
3677 int min_vec_niters
= 1;
3678 if (outside_overhead
> 0)
3679 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
3681 if (dump_enabled_p ())
3682 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
3685 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3687 /* Now that we know the minimum number of vector iterations,
3688 find the minimum niters for which the scalar cost is larger:
3690 SIC * niters > VIC * vniters + VOC - SOC
3692 We know that the minimum niters is no more than
3693 vniters * VF + NPEEL, but it might be (and often is) less
3694 than that if a partial vector iteration is cheaper than the
3695 equivalent scalar code. */
3696 int threshold
= (vec_inside_cost
* min_vec_niters
3698 - scalar_outside_cost
);
3700 min_profitable_iters
= 1;
3702 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
3705 /* Convert the number of vector iterations into a number of
3706 scalar iterations. */
3707 min_profitable_iters
= (min_vec_niters
* assumed_vf
3708 + peel_iters_prologue
3709 + peel_iters_epilogue
);
3713 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
3715 - vec_inside_cost
* peel_iters_prologue
3716 - vec_inside_cost
* peel_iters_epilogue
);
3717 if (min_profitable_iters
<= 0)
3718 min_profitable_iters
= 0;
3721 min_profitable_iters
/= saving_per_viter
;
3723 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
3724 <= (((int) vec_inside_cost
* min_profitable_iters
)
3725 + (((int) vec_outside_cost
- scalar_outside_cost
)
3727 min_profitable_iters
++;
3731 if (dump_enabled_p ())
3732 dump_printf (MSG_NOTE
,
3733 " Calculated minimum iters for profitability: %d\n",
3734 min_profitable_iters
);
3736 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
3737 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
3738 /* We want the vectorized loop to execute at least once. */
3739 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
3741 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_NOTE
, vect_location
,
3743 " Runtime profitability threshold = %d\n",
3744 min_profitable_iters
);
/* Hand the run-time threshold back to the caller.  */
3746 *ret_min_profitable_niters
= min_profitable_iters
;
3748 /* Calculate number of iterations required to make the vector version
3749 profitable, relative to the loop bodies only.
3751 Non-vectorized variant is SIC * niters and it must win over vector
3752 variant on the expected loop trip count. The following condition must hold true:
3753 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3755 if (vec_outside_cost
<= 0)
3756 min_profitable_estimate
= 0;
3757 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3759 /* This is a repeat of the code above, but with + SOC rather
3761 int outside_overhead
= (vec_outside_cost
3762 - scalar_single_iter_cost
* peel_iters_prologue
3763 - scalar_single_iter_cost
* peel_iters_epilogue
3764 + scalar_outside_cost
);
3765 int min_vec_niters
= 1;
3766 if (outside_overhead
> 0)
3767 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
3769 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
3771 int threshold
= (vec_inside_cost
* min_vec_niters
3773 + scalar_outside_cost
);
3774 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
3777 min_profitable_estimate
= (min_vec_niters
* assumed_vf
3778 + peel_iters_prologue
3779 + peel_iters_epilogue
);
3783 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
3785 - vec_inside_cost
* peel_iters_prologue
3786 - vec_inside_cost
* peel_iters_epilogue
)
3787 / ((scalar_single_iter_cost
* assumed_vf
)
3790 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
3791 if (dump_enabled_p ())
3792 dump_printf_loc (MSG_NOTE
, vect_location
,
3793 " Static estimate profitability threshold = %d\n",
3794 min_profitable_estimate
);
3796 *ret_min_profitable_estimate
= min_profitable_estimate
;
3799 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3800 vector elements (not bits) for a vector with NELT elements. */
3802 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
3803 vec_perm_builder
*sel
)
3805 /* The encoding is a single stepped pattern. Any wrap-around is handled
3806 by vec_perm_indices. */
/* NOTE(review): the arguments are presumably (full length NELT,
   1 pattern, 3 elements per pattern), matching the single stepped
   pattern mentioned above - confirm against vec_perm_builder.  */
3807 sel
->new_vector (nelt
, 1, 3);
/* Only three indices are pushed explicitly: OFFSET, OFFSET + 1 and
   OFFSET + 2.  */
3808 for (unsigned int i
= 0; i
< 3; i
++)
3809 sel
->quick_push (i
+ offset
);
3812 /* Checks whether the target supports whole-vector shifts for vectors of mode
3813 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3814 it supports vec_perm_const with masks for all necessary shift amounts. */
3816 have_whole_vector_shift (machine_mode mode
)
/* First alternative: the target has a vec_shr pattern for MODE.
   NOTE(review): the statements guarded by the conditions in this
   function (presumably the returns) are in gaps of the listing.  */
3818 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
3821 /* Variable-length vectors should be handled via the optab. */
3823 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
3826 vec_perm_builder sel
;
3827 vec_perm_indices indices
;
/* Second alternative: try the shift amounts NELT / 2, NELT / 4, ..., 1.
   For each one build the permutation that shifts by I elements and ask
   can_vec_perm_const_p whether MODE supports it.  */
3828 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
3830 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
3831 indices
.new_vector (sel
, 2, nelt
);
3832 if (!can_vec_perm_const_p (mode
, indices
, false))
3838 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3839 functions. Design better to avoid maintenance issues. */
/* NOTE(review): every line of this listing still carries its original
   line number, and the lines in the numbering gaps (braces, else lines,
   some declarations and trailing arguments) are not visible here.
   Overview of what is visible:

   - All costs are recorded into COST_VEC through record_stmt_cost and
     accumulated in INSIDE_COST, PROLOGUE_COST and EPILOGUE_COST, which
     are only used for the dump at the end.
   - For EXTRACT_LAST_REDUCTION and FOLD_LEFT_REDUCTION the body cost is
     either NCOPIES vec_to_scalar operations or NELEMENTS extracts plus
     NELEMENTS scalar statements.
   - The other path records 4 (COND_REDUCTION) or 1 scalar_to_vec
     statements for the initial definition and NCOPIES vector_stmt
     operations for the body.
   - Epilogue costs are added only when the statement is not nested in
     the loop being vectorized (or there is no loop).  */
3841 /* Function vect_model_reduction_cost.
3843 Models cost for a reduction operation, including the vector ops
3844 generated within the strip-mine loop, the initial definition before
3845 the loop, and the epilogue code that must be generated. */
3848 vect_model_reduction_cost (stmt_vec_info stmt_info
, internal_fn reduc_fn
,
3849 int ncopies
, stmt_vector_for_cost
*cost_vec
)
3851 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
;
3852 enum tree_code code
;
3856 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
3857 struct loop
*loop
= NULL
;
3860 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3862 /* Condition reductions generate two reductions in the loop. */
3863 vect_reduction_type reduction_type
3864 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
);
3865 if (reduction_type
== COND_REDUCTION
)
3868 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3869 mode
= TYPE_MODE (vectype
);
3870 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3872 code
= gimple_assign_rhs_code (orig_stmt_info
->stmt
);
3874 if (reduction_type
== EXTRACT_LAST_REDUCTION
3875 || reduction_type
== FOLD_LEFT_REDUCTION
)
3877 /* No extra instructions needed in the prologue. */
3880 if (reduction_type
== EXTRACT_LAST_REDUCTION
|| reduc_fn
!= IFN_LAST
)
3881 /* Count one reduction-like operation per vector. */
3882 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
3883 stmt_info
, 0, vect_body
);
3886 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3887 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
3888 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
3889 vec_to_scalar
, stmt_info
, 0,
3891 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
3892 scalar_stmt
, stmt_info
, 0,
3898 /* Add in cost for initial definition.
3899 For cond reduction we have four vectors: initial index, step,
3900 initial result of the data reduction, initial value of the index
3902 int prologue_stmts
= reduction_type
== COND_REDUCTION
? 4 : 1;
3903 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
3904 scalar_to_vec
, stmt_info
, 0,
3907 /* Cost of reduction op inside loop. */
3908 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
3909 stmt_info
, 0, vect_body
);
3912 /* Determine cost of epilogue code.
3914 We have a reduction operator that will reduce the vector in one statement.
3915 Also requires scalar extract. */
3917 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
3919 if (reduc_fn
!= IFN_LAST
)
3921 if (reduction_type
== COND_REDUCTION
)
3923 /* An EQ stmt and a COND_EXPR stmt. */
3924 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
3925 vector_stmt
, stmt_info
, 0,
3927 /* Reduction of the max index and a reduction of the found
3929 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
3930 vec_to_scalar
, stmt_info
, 0,
3932 /* A broadcast of the max value. */
3933 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
3934 scalar_to_vec
, stmt_info
, 0,
3939 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
3940 stmt_info
, 0, vect_epilogue
);
3941 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
3942 vec_to_scalar
, stmt_info
, 0,
3946 else if (reduction_type
== COND_REDUCTION
)
3948 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
3949 /* Extraction of scalar elements. */
3950 epilogue_cost
+= record_stmt_cost (cost_vec
,
3951 2 * estimated_nunits
,
3952 vec_to_scalar
, stmt_info
, 0,
3954 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3955 epilogue_cost
+= record_stmt_cost (cost_vec
,
3956 2 * estimated_nunits
- 3,
3957 scalar_stmt
, stmt_info
, 0,
3960 else if (reduction_type
== EXTRACT_LAST_REDUCTION
3961 || reduction_type
== FOLD_LEFT_REDUCTION
)
3962 /* No extra instructions needed in the epilogue. */
/* The element count is the vector size divided by the element size.
   NOTE(review): BITSIZE is presumably the TYPE_SIZE of the type of the
   original statement's lhs; its declaration is in a gap.  */
3966 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
3968 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info
->stmt
)));
3969 int element_bitsize
= tree_to_uhwi (bitsize
);
3970 int nelements
= vec_size_in_bits
/ element_bitsize
;
3972 if (code
== COND_EXPR
)
3975 optab
= optab_for_tree_code (code
, vectype
, optab_default
);
3977 /* We have a whole vector shift available. */
3978 if (optab
!= unknown_optab
3979 && VECTOR_MODE_P (mode
)
3980 && optab_handler (optab
, mode
) != CODE_FOR_nothing
3981 && have_whole_vector_shift (mode
))
3983 /* Final reduction via vector shifts and the reduction operator.
3984 Also requires scalar extract. */
3985 epilogue_cost
+= record_stmt_cost (cost_vec
,
3986 exact_log2 (nelements
) * 2,
3987 vector_stmt
, stmt_info
, 0,
3989 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
3990 vec_to_scalar
, stmt_info
, 0,
3994 /* Use extracts and reduction op for final reduction. For N
3995 elements, we have N extracts and N-1 reduction ops. */
3996 epilogue_cost
+= record_stmt_cost (cost_vec
,
3997 nelements
+ nelements
- 1,
3998 vector_stmt
, stmt_info
, 0,
4003 if (dump_enabled_p ())
4004 dump_printf (MSG_NOTE
,
4005 "vect_model_reduction_cost: inside_cost = %d, "
4006 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
4007 prologue_cost
, epilogue_cost
);
4011 /* Function vect_model_induction_cost.
4013 Models cost for induction operations. */
/* Records NCOPIES vector_stmt operations in the loop body and two
   scalar_to_vec operations in the prologue into COST_VEC; the two
   returned costs are only used for the dump below.  */
4016 vect_model_induction_cost (stmt_vec_info stmt_info
, int ncopies
,
4017 stmt_vector_for_cost
*cost_vec
)
4019 unsigned inside_cost
, prologue_cost
;
/* NOTE(review): the statement guarded by this test is in a gap of the
   listing (original lines 4022-4023).  */
4021 if (PURE_SLP_STMT (stmt_info
))
4024 /* loop cost for vec_loop. */
4025 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
4026 stmt_info
, 0, vect_body
);
4028 /* prologue cost for vec_init and vec_step. */
4029 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
4030 stmt_info
, 0, vect_prologue
);
4032 if (dump_enabled_p ())
4033 dump_printf_loc (MSG_NOTE
, vect_location
,
4034 "vect_model_induction_cost: inside_cost = %d, "
4035 "prologue_cost = %d .\n", inside_cost
, prologue_cost
);
4040 /* Function get_initial_def_for_reduction
4043 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4044 INIT_VAL - the initial value of the reduction variable
4047 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4048 of the reduction (used for adjusting the epilog - see below).
4049 Return a vector variable, initialized according to the operation that
4050 STMT_VINFO performs. This vector will be used as the initial value
4051 of the vector of partial results.
4053 Option1 (adjust in epilog): Initialize the vector as follows:
4054 add/bit or/xor: [0,0,...,0,0]
4055 mult/bit and: [1,1,...,1,1]
4056 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4057 and when necessary (e.g. add/mult case) let the caller know
4058 that it needs to adjust the result by init_val.
4060 Option2: Initialize the vector as follows:
4061 add/bit or/xor: [init_val,0,0,...,0]
4062 mult/bit and: [init_val,1,1,...,1]
4063 min/max/cond_expr: [init_val,init_val,...,init_val]
4064 and no adjustments are needed.
4066 For example, for the following code:
4072 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4073 For a vector of 4 units, we want to return either [0,0,0,init_val],
4074 or [0,0,0,0] and let the caller know that it needs to adjust
4075 the result at the end by 'init_val'.
4077 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4078 initialization vector is simpler (same element in all entries), if
4079 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4081 A cost model should help decide between these two schemes. */
/* NOTE(review): every line of this listing still carries its original
   line number; the switch header, most case labels, the else lines and
   several declarations fall in numbering gaps and are not visible.  */
4084 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo
, tree init_val
,
4085 tree
*adjustment_def
)
4087 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
4088 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4089 tree scalar_type
= TREE_TYPE (init_val
);
4090 tree vectype
= get_vectype_for_scalar_type (scalar_type
);
4091 enum tree_code code
= gimple_assign_rhs_code (stmt_vinfo
->stmt
);
/* Defaults for the neutral element: 0.0 for floats and 0 for integers.
   The MULT_EXPR test below switches the float default to 1.0.  */
4094 REAL_VALUE_TYPE real_init_val
= dconst0
;
4095 int int_init_val
= 0;
4096 gimple_seq stmts
= NULL
;
4098 gcc_assert (vectype
);
4100 gcc_assert (POINTER_TYPE_P (scalar_type
) || INTEGRAL_TYPE_P (scalar_type
)
4101 || SCALAR_FLOAT_TYPE_P (scalar_type
));
4103 gcc_assert (nested_in_vect_loop_p (loop
, stmt_vinfo
)
4104 || loop
== (gimple_bb (stmt_vinfo
->stmt
))->loop_father
);
4106 vect_reduction_type reduction_type
4107 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo
);
4111 case WIDEN_SUM_EXPR
:
4121 /* ADJUSTMENT_DEF is NULL when called from
4122 vect_create_epilog_for_reduction to vectorize double reduction. */
4124 *adjustment_def
= init_val
;
4126 if (code
== MULT_EXPR
)
4128 real_init_val
= dconst1
;
4132 if (code
== BIT_AND_EXPR
)
/* DEF_FOR_INIT is the neutral element as a constant of SCALAR_TYPE.  */
4135 if (SCALAR_FLOAT_TYPE_P (scalar_type
))
4136 def_for_init
= build_real (scalar_type
, real_init_val
);
4138 def_for_init
= build_int_cst (scalar_type
, int_init_val
);
4141 /* Option1: the first element is '0' or '1' as well. */
4142 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
4144 else if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant ())
4146 /* Option2 (variable length): the first element is INIT_VAL. */
4147 init_def
= gimple_build_vector_from_val (&stmts
, vectype
,
4149 init_def
= gimple_build (&stmts
, CFN_VEC_SHL_INSERT
,
4150 vectype
, init_def
, init_val
);
4154 /* Option2: the first element is INIT_VAL. */
4155 tree_vector_builder
elts (vectype
, 1, 2);
4156 elts
.quick_push (init_val
);
4157 elts
.quick_push (def_for_init
);
4158 init_def
= gimple_build_vector (&stmts
, &elts
);
4169 *adjustment_def
= NULL_TREE
;
4170 if (reduction_type
!= COND_REDUCTION
4171 && reduction_type
!= EXTRACT_LAST_REDUCTION
)
4173 init_def
= vect_get_vec_def_for_operand (init_val
, stmt_vinfo
);
4177 init_val
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_val
);
4178 init_def
= gimple_build_vector_from_val (&stmts
, vectype
, init_val
);
/* The statements collected in STMTS are emitted on the loop preheader
   edge.  NOTE(review): any guard on this call is not visible here.  */
4187 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
4191 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4192 NUMBER_OF_VECTORS is the number of vector defs to create.
4193 If NEUTRAL_OP is nonnull, introducing extra elements of that
4194 value will not change the result. */
/* NOTE(review): every line of this listing still carries its original
   line number; braces, else lines, several declarations and a few
   statements fall in numbering gaps and are not visible.  The created
   vectors are pushed onto VEC_OPRNDS and the statements that build them
   are inserted on the preheader edge of the loop containing the first
   statement of SLP_NODE.  */
4197 get_initial_defs_for_reduction (slp_tree slp_node
,
4198 vec
<tree
> *vec_oprnds
,
4199 unsigned int number_of_vectors
,
4200 bool reduc_chain
, tree neutral_op
)
4202 vec
<stmt_vec_info
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
4203 stmt_vec_info stmt_vinfo
= stmts
[0];
4204 unsigned HOST_WIDE_INT nunits
;
4205 unsigned j
, number_of_places_left_in_vector
;
4207 unsigned int group_size
= stmts
.length ();
4211 vector_type
= STMT_VINFO_VECTYPE (stmt_vinfo
);
4213 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_reduction_def
);
4215 loop
= (gimple_bb (stmt_vinfo
->stmt
))->loop_father
;
4217 edge pe
= loop_preheader_edge (loop
);
/* A reduction chain must come with a neutral value.  */
4219 gcc_assert (!reduc_chain
|| neutral_op
);
4221 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4222 created vectors. It is greater than 1 if unrolling is performed.
4224 For example, we have two scalar operands, s1 and s2 (e.g., group of
4225 strided accesses of size two), while NUNITS is four (i.e., four scalars
4226 of this type can be packed in a vector). The output vector will contain
4227 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4230 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4231 vectors containing the operands.
4233 For example, NUNITS is four as before, and the group size is 8
4234 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4235 {s5, s6, s7, s8}. */
/* When the number of vector elements is not a compile-time constant,
   GROUP_SIZE is used as the number of elements to build.  */
4237 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
4238 nunits
= group_size
;
4240 number_of_places_left_in_vector
= nunits
;
4241 bool constant_p
= true;
4242 tree_vector_builder
elts (vector_type
, nunits
, 1);
4243 elts
.quick_grow (nunits
);
4244 gimple_seq ctor_seq
= NULL
;
4245 for (j
= 0; j
< nunits
* number_of_vectors
; ++j
)
4249 stmt_vinfo
= stmts
[i
];
4251 /* Get the def before the loop. In reduction chain we have only
4252 one initial value. Else we have as many as PHIs in the group. */
4254 op
= j
!= 0 ? neutral_op
: PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
, pe
);
4255 else if (((vec_oprnds
->length () + 1) * nunits
4256 - number_of_places_left_in_vector
>= group_size
)
4260 op
= PHI_ARG_DEF_FROM_EDGE (stmt_vinfo
->stmt
, pe
);
4262 /* Create 'vect_ = {op0,op1,...,opn}'. */
4263 number_of_places_left_in_vector
--;
4264 elts
[nunits
- number_of_places_left_in_vector
- 1] = op
;
4265 if (!CONSTANT_CLASS_P (op
))
/* A full vector's worth of elements has been collected in ELTS.  */
4268 if (number_of_places_left_in_vector
== 0)
4271 if (constant_p
&& !neutral_op
4272 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4273 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4274 /* Build the vector directly from ELTS. */
4275 init
= gimple_build_vector (&ctor_seq
, &elts
);
4276 else if (neutral_op
)
4278 /* Build a vector of the neutral value and shift the
4279 other elements into place. */
4280 init
= gimple_build_vector_from_val (&ctor_seq
, vector_type
,
4283 while (k
> 0 && elts
[k
- 1] == neutral_op
)
4288 init
= gimple_build (&ctor_seq
, CFN_VEC_SHL_INSERT
,
4289 vector_type
, init
, elts
[k
]);
4294 /* First time round, duplicate ELTS to fill the
4295 required number of vectors. */
4296 duplicate_and_interleave (&ctor_seq
, vector_type
, elts
,
4297 number_of_vectors
, *vec_oprnds
);
4300 vec_oprnds
->quick_push (init
);
/* Start collecting the next vector.  */
4302 number_of_places_left_in_vector
= nunits
;
4303 elts
.new_vector (vector_type
, nunits
, 1);
4304 elts
.quick_grow (nunits
);
/* Emit the statements that build the vectors, if there are any, on the
   preheader edge.  */
4308 if (ctor_seq
!= NULL
)
4309 gsi_insert_seq_on_edge_immediate (pe
, ctor_seq
);
4313 /* Function vect_create_epilog_for_reduction
4315 Create code at the loop-epilog to finalize the result of a reduction
4318 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4319 reduction statements.
4320 STMT_INFO is the scalar reduction stmt that is being vectorized.
4321 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4322 number of elements that we can fit in a vectype (nunits). In this case
4323 we have to generate more than one vector stmt - i.e - we need to "unroll"
4324 the vector stmt by a factor VF/nunits. For more details see documentation
4325 in vectorizable_operation.
4326 REDUC_FN is the internal function for the epilog reduction.
4327 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4329 REDUC_INDEX is the index of the operand in the right hand side of the
4330 statement that is defined by REDUCTION_PHI.
4331 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4332 SLP_NODE is an SLP node containing a group of reduction statements. The
4333 first one in this group is STMT_INFO.
4334 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4335 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4336 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4337 any value of the IV in the loop.
4338 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4339 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4340 null if this is not an SLP reduction
4343 1. Creates the reduction def-use cycles: sets the arguments for
4345 The loop-entry argument is the vectorized initial-value of the reduction.
4346 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4348 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4349 by calling the function specified by REDUC_FN if available, or by
4350 other means (whole-vector shifts or a scalar loop).
4351 The function also creates a new phi node at the loop exit to preserve
4352 loop-closed form, as illustrated below.
4354 The flow at the entry to this function:
4357 vec_def = phi <null, null> # REDUCTION_PHI
4358 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4359 s_loop = scalar_stmt # (scalar) STMT_INFO
4361 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4365 The above is transformed by this function into:
4368 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4369 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4370 s_loop = scalar_stmt # (scalar) STMT_INFO
4372 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4373 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4374 v_out2 = reduce <v_out1>
4375 s_out3 = extract_field <v_out2, 0>
4376 s_out4 = adjust_result <s_out3>
4382 vect_create_epilog_for_reduction (vec
<tree
> vect_defs
,
4383 stmt_vec_info stmt_info
,
4384 gimple
*reduc_def_stmt
,
4385 int ncopies
, internal_fn reduc_fn
,
4386 vec
<stmt_vec_info
> reduction_phis
,
4389 slp_instance slp_node_instance
,
4390 tree induc_val
, enum tree_code induc_code
,
4393 stmt_vec_info prev_phi_info
;
4396 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4397 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
), *outer_loop
= NULL
;
4398 basic_block exit_bb
;
4401 gimple
*new_phi
= NULL
, *phi
;
4402 stmt_vec_info phi_info
;
4403 gimple_stmt_iterator exit_gsi
;
4405 tree new_temp
= NULL_TREE
, new_dest
, new_name
, new_scalar_dest
;
4406 gimple
*epilog_stmt
= NULL
;
4407 enum tree_code code
= gimple_assign_rhs_code (stmt_info
->stmt
);
4410 tree adjustment_def
= NULL
;
4411 tree vec_initial_def
= NULL
;
4412 tree expr
, def
, initial_def
= NULL
;
4413 tree orig_name
, scalar_result
;
4414 imm_use_iterator imm_iter
, phi_imm_iter
;
4415 use_operand_p use_p
, phi_use_p
;
4417 stmt_vec_info reduction_phi_info
= NULL
;
4418 bool nested_in_vect_loop
= false;
4419 auto_vec
<gimple
*> new_phis
;
4420 auto_vec
<stmt_vec_info
> inner_phis
;
4422 auto_vec
<tree
> scalar_results
;
4423 unsigned int group_size
= 1, k
, ratio
;
4424 auto_vec
<tree
> vec_initial_defs
;
4425 auto_vec
<gimple
*> phis
;
4426 bool slp_reduc
= false;
4427 bool direct_slp_reduc
;
4428 tree new_phi_result
;
4429 stmt_vec_info inner_phi
= NULL
;
4430 tree induction_index
= NULL_TREE
;
4433 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
4435 if (nested_in_vect_loop_p (loop
, stmt_info
))
4439 nested_in_vect_loop
= true;
4440 gcc_assert (!slp_node
);
4443 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4444 gcc_assert (vectype
);
4445 mode
= TYPE_MODE (vectype
);
4447 /* 1. Create the reduction def-use cycle:
4448 Set the arguments of REDUCTION_PHIS, i.e., transform
4451 vec_def = phi <null, null> # REDUCTION_PHI
4452 VECT_DEF = vector_stmt # vectorized form of STMT
4458 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4459 VECT_DEF = vector_stmt # vectorized form of STMT
4462 (in case of SLP, do it for all the phis). */
4464 /* Get the loop-entry arguments. */
4465 enum vect_def_type initial_def_dt
= vect_unknown_def_type
;
4468 unsigned vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
4469 vec_initial_defs
.reserve (vec_num
);
4470 get_initial_defs_for_reduction (slp_node_instance
->reduc_phis
,
4471 &vec_initial_defs
, vec_num
,
4472 REDUC_GROUP_FIRST_ELEMENT (stmt_info
),
4477 /* Get at the scalar def before the loop, that defines the initial value
4478 of the reduction variable. */
4479 initial_def
= PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt
,
4480 loop_preheader_edge (loop
));
4481 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4482 and we can't use zero for induc_val, use initial_def. Similarly
4483 for REDUC_MIN and initial_def larger than the base. */
4484 if (TREE_CODE (initial_def
) == INTEGER_CST
4485 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
4486 == INTEGER_INDUC_COND_REDUCTION
)
4487 && !integer_zerop (induc_val
)
4488 && ((induc_code
== MAX_EXPR
4489 && tree_int_cst_lt (initial_def
, induc_val
))
4490 || (induc_code
== MIN_EXPR
4491 && tree_int_cst_lt (induc_val
, initial_def
))))
4492 induc_val
= initial_def
;
4495 /* In case of double reduction we only create a vector variable
4496 to be put in the reduction phi node. The actual statement
4497 creation is done later in this function. */
4498 vec_initial_def
= vect_create_destination_var (initial_def
, vectype
);
4499 else if (nested_in_vect_loop
)
4501 /* Do not use an adjustment def as that case is not supported
4502 correctly if ncopies is not one. */
4503 vect_is_simple_use (initial_def
, loop_vinfo
, &initial_def_dt
);
4504 vec_initial_def
= vect_get_vec_def_for_operand (initial_def
,
4509 = get_initial_def_for_reduction (stmt_info
, initial_def
,
4511 vec_initial_defs
.create (1);
4512 vec_initial_defs
.quick_push (vec_initial_def
);
4515 /* Set phi nodes arguments. */
4516 FOR_EACH_VEC_ELT (reduction_phis
, i
, phi_info
)
4518 tree vec_init_def
= vec_initial_defs
[i
];
4519 tree def
= vect_defs
[i
];
4520 for (j
= 0; j
< ncopies
; j
++)
4524 phi_info
= STMT_VINFO_RELATED_STMT (phi_info
);
4525 if (nested_in_vect_loop
)
4527 = vect_get_vec_def_for_stmt_copy (loop_vinfo
, vec_init_def
);
4530 /* Set the loop-entry arg of the reduction-phi. */
4532 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
4533 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
4534 == INTEGER_INDUC_COND_REDUCTION
)
4536 /* Initialise the reduction phi to zero. This prevents initial
4537 values of non-zero interfering with the reduction op. */
4538 gcc_assert (ncopies
== 1);
4539 gcc_assert (i
== 0);
4541 tree vec_init_def_type
= TREE_TYPE (vec_init_def
);
4543 = build_vector_from_val (vec_init_def_type
, induc_val
);
4545 add_phi_arg (phi
, induc_val_vec
, loop_preheader_edge (loop
),
4549 add_phi_arg (phi
, vec_init_def
, loop_preheader_edge (loop
),
4552 /* Set the loop-latch arg for the reduction-phi. */
4554 def
= vect_get_vec_def_for_stmt_copy (loop_vinfo
, def
);
4556 add_phi_arg (phi
, def
, loop_latch_edge (loop
), UNKNOWN_LOCATION
);
4558 if (dump_enabled_p ())
4559 dump_printf_loc (MSG_NOTE
, vect_location
,
4560 "transform reduction: created def-use cycle: %G%G",
4561 phi
, SSA_NAME_DEF_STMT (def
));
4565 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4566 which is updated with the current index of the loop for every match of
4567 the original loop's cond_expr (VEC_STMT). This results in a vector
4568 containing the last time the condition passed for that vector lane.
4569 The first match will be a 1 to allow 0 to be used for non-matching
4570 indexes. If there are no matches at all then the vector will be all
4572 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
)
4574 tree indx_before_incr
, indx_after_incr
;
4575 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype
);
4577 gimple
*vec_stmt
= STMT_VINFO_VEC_STMT (stmt_info
)->stmt
;
4578 gcc_assert (gimple_assign_rhs_code (vec_stmt
) == VEC_COND_EXPR
);
4580 int scalar_precision
4581 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype
)));
4582 tree cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
4583 tree cr_index_vector_type
= build_vector_type
4584 (cr_index_scalar_type
, TYPE_VECTOR_SUBPARTS (vectype
));
4586 /* First we create a simple vector induction variable which starts
4587 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4588 vector size (STEP). */
4590 /* Create a {1,2,3,...} vector. */
4591 tree series_vect
= build_index_vector (cr_index_vector_type
, 1, 1);
4593 /* Create a vector of the step value. */
4594 tree step
= build_int_cst (cr_index_scalar_type
, nunits_out
);
4595 tree vec_step
= build_vector_from_val (cr_index_vector_type
, step
);
4597 /* Create an induction variable. */
4598 gimple_stmt_iterator incr_gsi
;
4600 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4601 create_iv (series_vect
, vec_step
, NULL_TREE
, loop
, &incr_gsi
,
4602 insert_after
, &indx_before_incr
, &indx_after_incr
);
4604 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4605 filled with zeros (VEC_ZERO). */
4607 /* Create a vector of 0s. */
4608 tree zero
= build_zero_cst (cr_index_scalar_type
);
4609 tree vec_zero
= build_vector_from_val (cr_index_vector_type
, zero
);
4611 /* Create a vector phi node. */
4612 tree new_phi_tree
= make_ssa_name (cr_index_vector_type
);
4613 new_phi
= create_phi_node (new_phi_tree
, loop
->header
);
4614 loop_vinfo
->add_stmt (new_phi
);
4615 add_phi_arg (as_a
<gphi
*> (new_phi
), vec_zero
,
4616 loop_preheader_edge (loop
), UNKNOWN_LOCATION
);
4618 /* Now take the condition from the loop's original cond_expr
4619 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4620 every match uses values from the induction variable
4621 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4623 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4624 the new cond_expr (INDEX_COND_EXPR). */
4626 /* Duplicate the condition from vec_stmt. */
4627 tree ccompare
= unshare_expr (gimple_assign_rhs1 (vec_stmt
));
4629 /* Create a conditional, where the condition is taken from vec_stmt
4630 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4631 else is the phi (NEW_PHI_TREE). */
4632 tree index_cond_expr
= build3 (VEC_COND_EXPR
, cr_index_vector_type
,
4633 ccompare
, indx_before_incr
,
4635 induction_index
= make_ssa_name (cr_index_vector_type
);
4636 gimple
*index_condition
= gimple_build_assign (induction_index
,
4638 gsi_insert_before (&incr_gsi
, index_condition
, GSI_SAME_STMT
);
4639 stmt_vec_info index_vec_info
= loop_vinfo
->add_stmt (index_condition
);
4640 STMT_VINFO_VECTYPE (index_vec_info
) = cr_index_vector_type
;
4642 /* Update the phi with the vec cond. */
4643 add_phi_arg (as_a
<gphi
*> (new_phi
), induction_index
,
4644 loop_latch_edge (loop
), UNKNOWN_LOCATION
);
4647 /* 2. Create epilog code.
4648 The reduction epilog code operates across the elements of the vector
4649 of partial results computed by the vectorized loop.
4650 The reduction epilog code consists of:
4652 step 1: compute the scalar result in a vector (v_out2)
4653 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4654 step 3: adjust the scalar result (s_out3) if needed.
4656 Step 1 can be accomplished using one of the following three schemes:
4657 (scheme 1) using reduc_fn, if available.
4658 (scheme 2) using whole-vector shifts, if available.
4659 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4662 The overall epilog code looks like this:
4664 s_out0 = phi <s_loop> # original EXIT_PHI
4665 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4666 v_out2 = reduce <v_out1> # step 1
4667 s_out3 = extract_field <v_out2, 0> # step 2
4668 s_out4 = adjust_result <s_out3> # step 3
4670 (step 3 is optional, and steps 1 and 2 may be combined).
4671 Lastly, the uses of s_out0 are replaced by s_out4. */
4674 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4675 v_out1 = phi <VECT_DEF>
4676 Store them in NEW_PHIS. */
4678 exit_bb
= single_exit (loop
)->dest
;
4679 prev_phi_info
= NULL
;
4680 new_phis
.create (vect_defs
.length ());
4681 FOR_EACH_VEC_ELT (vect_defs
, i
, def
)
4683 for (j
= 0; j
< ncopies
; j
++)
4685 tree new_def
= copy_ssa_name (def
);
4686 phi
= create_phi_node (new_def
, exit_bb
);
4687 stmt_vec_info phi_info
= loop_vinfo
->add_stmt (phi
);
4689 new_phis
.quick_push (phi
);
4692 def
= vect_get_vec_def_for_stmt_copy (loop_vinfo
, def
);
4693 STMT_VINFO_RELATED_STMT (prev_phi_info
) = phi_info
;
4696 SET_PHI_ARG_DEF (phi
, single_exit (loop
)->dest_idx
, def
);
4697 prev_phi_info
= phi_info
;
4701 /* The epilogue is created for the outer-loop, i.e., for the loop being
4702 vectorized. Create exit phis for the outer loop. */
4706 exit_bb
= single_exit (loop
)->dest
;
4707 inner_phis
.create (vect_defs
.length ());
4708 FOR_EACH_VEC_ELT (new_phis
, i
, phi
)
4710 stmt_vec_info phi_info
= loop_vinfo
->lookup_stmt (phi
);
4711 tree new_result
= copy_ssa_name (PHI_RESULT (phi
));
4712 gphi
*outer_phi
= create_phi_node (new_result
, exit_bb
);
4713 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4715 prev_phi_info
= loop_vinfo
->add_stmt (outer_phi
);
4716 inner_phis
.quick_push (phi_info
);
4717 new_phis
[i
] = outer_phi
;
4718 while (STMT_VINFO_RELATED_STMT (phi_info
))
4720 phi_info
= STMT_VINFO_RELATED_STMT (phi_info
);
4721 new_result
= copy_ssa_name (PHI_RESULT (phi_info
->stmt
));
4722 outer_phi
= create_phi_node (new_result
, exit_bb
);
4723 SET_PHI_ARG_DEF (outer_phi
, single_exit (loop
)->dest_idx
,
4724 PHI_RESULT (phi_info
->stmt
));
4725 stmt_vec_info outer_phi_info
= loop_vinfo
->add_stmt (outer_phi
);
4726 STMT_VINFO_RELATED_STMT (prev_phi_info
) = outer_phi_info
;
4727 prev_phi_info
= outer_phi_info
;
4732 exit_gsi
= gsi_after_labels (exit_bb
);
4734 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4735 (i.e. when reduc_fn is not available) and in the final adjustment
4736 code (if needed). Also get the original scalar reduction variable as
4737 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4738 represents a reduction pattern), the tree-code and scalar-def are
4739 taken from the original stmt that the pattern-stmt (STMT) replaces.
4740 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4741 are taken from STMT. */
4743 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4744 if (orig_stmt_info
!= stmt_info
)
4746 /* Reduction pattern */
4747 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
4748 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info
) == stmt_info
);
4751 code
= gimple_assign_rhs_code (orig_stmt_info
->stmt
);
4752 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4753 partial results are added and not subtracted. */
4754 if (code
== MINUS_EXPR
)
4757 scalar_dest
= gimple_assign_lhs (orig_stmt_info
->stmt
);
4758 scalar_type
= TREE_TYPE (scalar_dest
);
4759 scalar_results
.create (group_size
);
4760 new_scalar_dest
= vect_create_destination_var (scalar_dest
, NULL
);
4761 bitsize
= TYPE_SIZE (scalar_type
);
4763 /* In case this is a reduction in an inner-loop while vectorizing an outer
4764 loop - we don't need to extract a single scalar result at the end of the
4765 inner-loop (unless it is double reduction, i.e., the use of reduction is
4766 outside the outer-loop). The final vector of partial results will be used
4767 in the vectorized outer-loop, or reduced to a scalar result at the end of
4769 if (nested_in_vect_loop
&& !double_reduc
)
4770 goto vect_finalize_reduction
;
4772 /* SLP reduction without reduction chain, e.g.,
4776 b2 = operation (b1) */
4777 slp_reduc
= (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
4779 /* True if we should implement SLP_REDUC using native reduction operations
4780 instead of scalar operations. */
4781 direct_slp_reduc
= (reduc_fn
!= IFN_LAST
4783 && !TYPE_VECTOR_SUBPARTS (vectype
).is_constant ());
4785 /* In case of reduction chain, e.g.,
4788 a3 = operation (a2),
4790 we may end up with more than one vector result. Here we reduce them to
4792 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
) || direct_slp_reduc
)
4794 tree first_vect
= PHI_RESULT (new_phis
[0]);
4795 gassign
*new_vec_stmt
= NULL
;
4796 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4797 for (k
= 1; k
< new_phis
.length (); k
++)
4799 gimple
*next_phi
= new_phis
[k
];
4800 tree second_vect
= PHI_RESULT (next_phi
);
4801 tree tem
= make_ssa_name (vec_dest
, new_vec_stmt
);
4802 new_vec_stmt
= gimple_build_assign (tem
, code
,
4803 first_vect
, second_vect
);
4804 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
4808 new_phi_result
= first_vect
;
4811 new_phis
.truncate (0);
4812 new_phis
.safe_push (new_vec_stmt
);
4815 /* Likewise if we couldn't use a single defuse cycle. */
4816 else if (ncopies
> 1)
4818 gcc_assert (new_phis
.length () == 1);
4819 tree first_vect
= PHI_RESULT (new_phis
[0]);
4820 gassign
*new_vec_stmt
= NULL
;
4821 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4822 stmt_vec_info next_phi_info
= loop_vinfo
->lookup_stmt (new_phis
[0]);
4823 for (int k
= 1; k
< ncopies
; ++k
)
4825 next_phi_info
= STMT_VINFO_RELATED_STMT (next_phi_info
);
4826 tree second_vect
= PHI_RESULT (next_phi_info
->stmt
);
4827 tree tem
= make_ssa_name (vec_dest
, new_vec_stmt
);
4828 new_vec_stmt
= gimple_build_assign (tem
, code
,
4829 first_vect
, second_vect
);
4830 gsi_insert_before (&exit_gsi
, new_vec_stmt
, GSI_SAME_STMT
);
4833 new_phi_result
= first_vect
;
4834 new_phis
.truncate (0);
4835 new_phis
.safe_push (new_vec_stmt
);
4838 new_phi_result
= PHI_RESULT (new_phis
[0]);
4840 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
4841 && reduc_fn
!= IFN_LAST
)
4843 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4844 various data values where the condition matched and another vector
4845 (INDUCTION_INDEX) containing all the indexes of those matches. We
4846 need to extract the last matching index (which will be the index with
4847 highest value) and use this to index into the data vector.
4848 For the case where there were no matches, the data vector will contain
4849 all default values and the index vector will be all zeros. */
4851 /* Get various versions of the type of the vector of indexes. */
4852 tree index_vec_type
= TREE_TYPE (induction_index
);
4853 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type
));
4854 tree index_scalar_type
= TREE_TYPE (index_vec_type
);
4855 tree index_vec_cmp_type
= build_same_sized_truth_vector_type
4858 /* Get an unsigned integer version of the type of the data vector. */
4859 int scalar_precision
4860 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
4861 tree scalar_type_unsigned
= make_unsigned_type (scalar_precision
);
4862 tree vectype_unsigned
= build_vector_type
4863 (scalar_type_unsigned
, TYPE_VECTOR_SUBPARTS (vectype
));
4865 /* First we need to create a vector (ZERO_VEC) of zeros and another
4866 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4867 can create using a MAX reduction and then expanding.
4868 In the case where the loop never made any matches, the max index will
4871 /* Vector of {0, 0, 0,...}. */
4872 tree zero_vec
= make_ssa_name (vectype
);
4873 tree zero_vec_rhs
= build_zero_cst (vectype
);
4874 gimple
*zero_vec_stmt
= gimple_build_assign (zero_vec
, zero_vec_rhs
);
4875 gsi_insert_before (&exit_gsi
, zero_vec_stmt
, GSI_SAME_STMT
);
4877 /* Find maximum value from the vector of found indexes. */
4878 tree max_index
= make_ssa_name (index_scalar_type
);
4879 gcall
*max_index_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
4880 1, induction_index
);
4881 gimple_call_set_lhs (max_index_stmt
, max_index
);
4882 gsi_insert_before (&exit_gsi
, max_index_stmt
, GSI_SAME_STMT
);
4884 /* Vector of {max_index, max_index, max_index,...}. */
4885 tree max_index_vec
= make_ssa_name (index_vec_type
);
4886 tree max_index_vec_rhs
= build_vector_from_val (index_vec_type
,
4888 gimple
*max_index_vec_stmt
= gimple_build_assign (max_index_vec
,
4890 gsi_insert_before (&exit_gsi
, max_index_vec_stmt
, GSI_SAME_STMT
);
4892 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4893 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4894 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4895 otherwise. Only one value should match, resulting in a vector
4896 (VEC_COND) with one data value and the rest zeros.
4897 In the case where the loop never made any matches, every index will
4898 match, resulting in a vector with all data values (which will all be
4899 the default value). */
4901 /* Compare the max index vector to the vector of found indexes to find
4902 the position of the max value. */
4903 tree vec_compare
= make_ssa_name (index_vec_cmp_type
);
4904 gimple
*vec_compare_stmt
= gimple_build_assign (vec_compare
, EQ_EXPR
,
4907 gsi_insert_before (&exit_gsi
, vec_compare_stmt
, GSI_SAME_STMT
);
4909 /* Use the compare to choose either values from the data vector or
4911 tree vec_cond
= make_ssa_name (vectype
);
4912 gimple
*vec_cond_stmt
= gimple_build_assign (vec_cond
, VEC_COND_EXPR
,
4913 vec_compare
, new_phi_result
,
4915 gsi_insert_before (&exit_gsi
, vec_cond_stmt
, GSI_SAME_STMT
);
4917 /* Finally we need to extract the data value from the vector (VEC_COND)
4918 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4919 reduction, but because this doesn't exist, we can use a MAX reduction
4920 instead. The data value might be signed or a float so we need to cast
4922 In the case where the loop never made any matches, the data values are
4923 all identical, and so will reduce down correctly. */
4925 /* Make the matched data values unsigned. */
4926 tree vec_cond_cast
= make_ssa_name (vectype_unsigned
);
4927 tree vec_cond_cast_rhs
= build1 (VIEW_CONVERT_EXPR
, vectype_unsigned
,
4929 gimple
*vec_cond_cast_stmt
= gimple_build_assign (vec_cond_cast
,
4932 gsi_insert_before (&exit_gsi
, vec_cond_cast_stmt
, GSI_SAME_STMT
);
4934 /* Reduce down to a scalar value. */
4935 tree data_reduc
= make_ssa_name (scalar_type_unsigned
);
4936 gcall
*data_reduc_stmt
= gimple_build_call_internal (IFN_REDUC_MAX
,
4938 gimple_call_set_lhs (data_reduc_stmt
, data_reduc
);
4939 gsi_insert_before (&exit_gsi
, data_reduc_stmt
, GSI_SAME_STMT
);
4941 /* Convert the reduced value back to the result type and set as the
4943 gimple_seq stmts
= NULL
;
4944 new_temp
= gimple_build (&stmts
, VIEW_CONVERT_EXPR
, scalar_type
,
4946 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
4947 scalar_results
.safe_push (new_temp
);
4949 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) == COND_REDUCTION
4950 && reduc_fn
== IFN_LAST
)
4952 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4954 idx_val = induction_index[0];
4955 val = data_reduc[0];
4956 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4957 if (induction_index[i] > idx_val)
4958 val = data_reduc[i], idx_val = induction_index[i];
4961 tree data_eltype
= TREE_TYPE (TREE_TYPE (new_phi_result
));
4962 tree idx_eltype
= TREE_TYPE (TREE_TYPE (induction_index
));
4963 unsigned HOST_WIDE_INT el_size
= tree_to_uhwi (TYPE_SIZE (idx_eltype
));
4964 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index
));
4965 /* Enforced by vectorizable_reduction, which ensures we have target
4966 support before allowing a conditional reduction on variable-length
4968 unsigned HOST_WIDE_INT v_size
= el_size
* nunits
.to_constant ();
4969 tree idx_val
= NULL_TREE
, val
= NULL_TREE
;
4970 for (unsigned HOST_WIDE_INT off
= 0; off
< v_size
; off
+= el_size
)
4972 tree old_idx_val
= idx_val
;
4974 idx_val
= make_ssa_name (idx_eltype
);
4975 epilog_stmt
= gimple_build_assign (idx_val
, BIT_FIELD_REF
,
4976 build3 (BIT_FIELD_REF
, idx_eltype
,
4978 bitsize_int (el_size
),
4979 bitsize_int (off
)));
4980 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4981 val
= make_ssa_name (data_eltype
);
4982 epilog_stmt
= gimple_build_assign (val
, BIT_FIELD_REF
,
4983 build3 (BIT_FIELD_REF
,
4986 bitsize_int (el_size
),
4987 bitsize_int (off
)));
4988 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
4991 tree new_idx_val
= idx_val
;
4993 if (off
!= v_size
- el_size
)
4995 new_idx_val
= make_ssa_name (idx_eltype
);
4996 epilog_stmt
= gimple_build_assign (new_idx_val
,
4999 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5001 new_val
= make_ssa_name (data_eltype
);
5002 epilog_stmt
= gimple_build_assign (new_val
,
5009 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5010 idx_val
= new_idx_val
;
5014 /* Convert the reduced value back to the result type and set as the
5016 gimple_seq stmts
= NULL
;
5017 val
= gimple_convert (&stmts
, scalar_type
, val
);
5018 gsi_insert_seq_before (&exit_gsi
, stmts
, GSI_SAME_STMT
);
5019 scalar_results
.safe_push (val
);
5022 /* 2.3 Create the reduction code, using one of the three schemes described
5023 above. In SLP we simply need to extract all the elements from the
5024 vector (without reducing them), so we use scalar shifts. */
5025 else if (reduc_fn
!= IFN_LAST
&& !slp_reduc
)
5031 v_out2 = reduc_expr <v_out1> */
5033 if (dump_enabled_p ())
5034 dump_printf_loc (MSG_NOTE
, vect_location
,
5035 "Reduce using direct vector reduction.\n");
5037 vec_elem_type
= TREE_TYPE (TREE_TYPE (new_phi_result
));
5038 if (!useless_type_conversion_p (scalar_type
, vec_elem_type
))
5041 = vect_create_destination_var (scalar_dest
, vec_elem_type
);
5042 epilog_stmt
= gimple_build_call_internal (reduc_fn
, 1,
5044 gimple_set_lhs (epilog_stmt
, tmp_dest
);
5045 new_temp
= make_ssa_name (tmp_dest
, epilog_stmt
);
5046 gimple_set_lhs (epilog_stmt
, new_temp
);
5047 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5049 epilog_stmt
= gimple_build_assign (new_scalar_dest
, NOP_EXPR
,
5054 epilog_stmt
= gimple_build_call_internal (reduc_fn
, 1,
5056 gimple_set_lhs (epilog_stmt
, new_scalar_dest
);
5059 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5060 gimple_set_lhs (epilog_stmt
, new_temp
);
5061 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5063 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5064 == INTEGER_INDUC_COND_REDUCTION
)
5065 && !operand_equal_p (initial_def
, induc_val
, 0))
5067 /* Earlier we set the initial value to be a vector of induc_val
5068 values. Check the result and if it is induc_val then replace
5069 with the original initial value, unless induc_val is
5070 the same as initial_def already. */
5071 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5074 tmp
= make_ssa_name (new_scalar_dest
);
5075 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5076 initial_def
, new_temp
);
5077 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5081 scalar_results
.safe_push (new_temp
);
5083 else if (direct_slp_reduc
)
5085 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5086 with the elements for other SLP statements replaced with the
5087 neutral value. We can then do a normal reduction on each vector. */
5089 /* Enforced by vectorizable_reduction. */
5090 gcc_assert (new_phis
.length () == 1);
5091 gcc_assert (pow2p_hwi (group_size
));
5093 slp_tree orig_phis_slp_node
= slp_node_instance
->reduc_phis
;
5094 vec
<stmt_vec_info
> orig_phis
5095 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node
);
5096 gimple_seq seq
= NULL
;
5098 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5099 and the same element size as VECTYPE. */
5100 tree index
= build_index_vector (vectype
, 0, 1);
5101 tree index_type
= TREE_TYPE (index
);
5102 tree index_elt_type
= TREE_TYPE (index_type
);
5103 tree mask_type
= build_same_sized_truth_vector_type (index_type
);
5105 /* Create a vector that, for each element, identifies which of
5106 the REDUC_GROUP_SIZE results should use it. */
5107 tree index_mask
= build_int_cst (index_elt_type
, group_size
- 1);
5108 index
= gimple_build (&seq
, BIT_AND_EXPR
, index_type
, index
,
5109 build_vector_from_val (index_type
, index_mask
));
5111 /* Get a neutral vector value. This is simply a splat of the neutral
5112 scalar value if we have one, otherwise the initial scalar value
5113 is itself a neutral value. */
5114 tree vector_identity
= NULL_TREE
;
5116 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5118 for (unsigned int i
= 0; i
< group_size
; ++i
)
5120 /* If there's no univeral neutral value, we can use the
5121 initial scalar value from the original PHI. This is used
5122 for MIN and MAX reduction, for example. */
5126 = PHI_ARG_DEF_FROM_EDGE (orig_phis
[i
]->stmt
,
5127 loop_preheader_edge (loop
));
5128 vector_identity
= gimple_build_vector_from_val (&seq
, vectype
,
5132 /* Calculate the equivalent of:
5134 sel[j] = (index[j] == i);
5136 which selects the elements of NEW_PHI_RESULT that should
5137 be included in the result. */
5138 tree compare_val
= build_int_cst (index_elt_type
, i
);
5139 compare_val
= build_vector_from_val (index_type
, compare_val
);
5140 tree sel
= gimple_build (&seq
, EQ_EXPR
, mask_type
,
5141 index
, compare_val
);
5143 /* Calculate the equivalent of:
5145 vec = sel ? new_phi_result : vector_identity;
5147 VEC is now suitable for a full vector reduction. */
5148 tree vec
= gimple_build (&seq
, VEC_COND_EXPR
, vectype
,
5149 sel
, new_phi_result
, vector_identity
);
5151 /* Do the reduction and convert it to the appropriate type. */
5152 tree scalar
= gimple_build (&seq
, as_combined_fn (reduc_fn
),
5153 TREE_TYPE (vectype
), vec
);
5154 scalar
= gimple_convert (&seq
, scalar_type
, scalar
);
5155 scalar_results
.safe_push (scalar
);
5157 gsi_insert_seq_before (&exit_gsi
, seq
, GSI_SAME_STMT
);
5161 bool reduce_with_shift
;
5164 /* COND reductions all do the final reduction with MAX_EXPR
5166 if (code
== COND_EXPR
)
5168 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5169 == INTEGER_INDUC_COND_REDUCTION
)
5171 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5172 == CONST_COND_REDUCTION
)
5173 code
= STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
);
5178 /* See if the target wants to do the final (shift) reduction
5179 in a vector mode of smaller size and first reduce upper/lower
5180 halves against each other. */
5181 enum machine_mode mode1
= mode
;
5182 tree vectype1
= vectype
;
5183 unsigned sz
= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
));
5186 && (mode1
= targetm
.vectorize
.split_reduction (mode
)) != mode
)
5187 sz1
= GET_MODE_SIZE (mode1
).to_constant ();
5189 vectype1
= get_vectype_for_scalar_type_and_size (scalar_type
, sz1
);
5190 reduce_with_shift
= have_whole_vector_shift (mode1
);
5191 if (!VECTOR_MODE_P (mode1
))
5192 reduce_with_shift
= false;
5195 optab optab
= optab_for_tree_code (code
, vectype1
, optab_default
);
5196 if (optab_handler (optab
, mode1
) == CODE_FOR_nothing
)
5197 reduce_with_shift
= false;
5200 /* First reduce the vector to the desired vector size we should
5201 do shift reduction on by combining upper and lower halves. */
5202 new_temp
= new_phi_result
;
5205 gcc_assert (!slp_reduc
);
5207 vectype1
= get_vectype_for_scalar_type_and_size (scalar_type
, sz
);
5209 /* The target has to make sure we support lowpart/highpart
5210 extraction, either via direct vector extract or through
5211 an integer mode punning. */
5213 if (convert_optab_handler (vec_extract_optab
,
5214 TYPE_MODE (TREE_TYPE (new_temp
)),
5215 TYPE_MODE (vectype1
))
5216 != CODE_FOR_nothing
)
5218 /* Extract sub-vectors directly once vec_extract becomes
5219 a conversion optab. */
5220 dst1
= make_ssa_name (vectype1
);
5222 = gimple_build_assign (dst1
, BIT_FIELD_REF
,
5223 build3 (BIT_FIELD_REF
, vectype1
,
5224 new_temp
, TYPE_SIZE (vectype1
),
5226 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5227 dst2
= make_ssa_name (vectype1
);
5229 = gimple_build_assign (dst2
, BIT_FIELD_REF
,
5230 build3 (BIT_FIELD_REF
, vectype1
,
5231 new_temp
, TYPE_SIZE (vectype1
),
5232 bitsize_int (sz
* BITS_PER_UNIT
)));
5233 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5237 /* Extract via punning to appropriately sized integer mode
5239 tree eltype
= build_nonstandard_integer_type (sz
* BITS_PER_UNIT
,
5241 tree etype
= build_vector_type (eltype
, 2);
5242 gcc_assert (convert_optab_handler (vec_extract_optab
,
5245 != CODE_FOR_nothing
);
5246 tree tem
= make_ssa_name (etype
);
5247 epilog_stmt
= gimple_build_assign (tem
, VIEW_CONVERT_EXPR
,
5248 build1 (VIEW_CONVERT_EXPR
,
5250 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5252 tem
= make_ssa_name (eltype
);
5254 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5255 build3 (BIT_FIELD_REF
, eltype
,
5256 new_temp
, TYPE_SIZE (eltype
),
5258 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5259 dst1
= make_ssa_name (vectype1
);
5260 epilog_stmt
= gimple_build_assign (dst1
, VIEW_CONVERT_EXPR
,
5261 build1 (VIEW_CONVERT_EXPR
,
5263 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5264 tem
= make_ssa_name (eltype
);
5266 = gimple_build_assign (tem
, BIT_FIELD_REF
,
5267 build3 (BIT_FIELD_REF
, eltype
,
5268 new_temp
, TYPE_SIZE (eltype
),
5269 bitsize_int (sz
* BITS_PER_UNIT
)));
5270 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5271 dst2
= make_ssa_name (vectype1
);
5272 epilog_stmt
= gimple_build_assign (dst2
, VIEW_CONVERT_EXPR
,
5273 build1 (VIEW_CONVERT_EXPR
,
5275 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5278 new_temp
= make_ssa_name (vectype1
);
5279 epilog_stmt
= gimple_build_assign (new_temp
, code
, dst1
, dst2
);
5280 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5283 if (reduce_with_shift
&& !slp_reduc
)
5285 int element_bitsize
= tree_to_uhwi (bitsize
);
5286 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5287 for variable-length vectors and also requires direct target support
5288 for loop reductions. */
5289 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5290 int nelements
= vec_size_in_bits
/ element_bitsize
;
5291 vec_perm_builder sel
;
5292 vec_perm_indices indices
;
5296 tree zero_vec
= build_zero_cst (vectype1
);
5298 for (offset = nelements/2; offset >= 1; offset/=2)
5300 Create: va' = vec_shift <va, offset>
5301 Create: va = vop <va, va'>
5306 if (dump_enabled_p ())
5307 dump_printf_loc (MSG_NOTE
, vect_location
,
5308 "Reduce using vector shifts\n");
5310 mode1
= TYPE_MODE (vectype1
);
5311 vec_dest
= vect_create_destination_var (scalar_dest
, vectype1
);
5312 for (elt_offset
= nelements
/ 2;
5316 calc_vec_perm_mask_for_shift (elt_offset
, nelements
, &sel
);
5317 indices
.new_vector (sel
, 2, nelements
);
5318 tree mask
= vect_gen_perm_mask_any (vectype1
, indices
);
5319 epilog_stmt
= gimple_build_assign (vec_dest
, VEC_PERM_EXPR
,
5320 new_temp
, zero_vec
, mask
);
5321 new_name
= make_ssa_name (vec_dest
, epilog_stmt
);
5322 gimple_assign_set_lhs (epilog_stmt
, new_name
);
5323 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5325 epilog_stmt
= gimple_build_assign (vec_dest
, code
, new_name
,
5327 new_temp
= make_ssa_name (vec_dest
, epilog_stmt
);
5328 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5329 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5332 /* 2.4 Extract the final scalar result. Create:
5333 s_out3 = extract_field <v_out2, bitpos> */
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_NOTE
, vect_location
,
5337 "extract scalar result\n");
5339 rhs
= build3 (BIT_FIELD_REF
, scalar_type
, new_temp
,
5340 bitsize
, bitsize_zero_node
);
5341 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5342 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5343 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5344 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5345 scalar_results
.safe_push (new_temp
);
5350 s = extract_field <v_out2, 0>
5351 for (offset = element_size;
5352 offset < vector_size;
5353 offset += element_size;)
5355 Create: s' = extract_field <v_out2, offset>
5356 Create: s = op <s, s'> // For non SLP cases
5359 if (dump_enabled_p ())
5360 dump_printf_loc (MSG_NOTE
, vect_location
,
5361 "Reduce using scalar code.\n");
5363 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype1
));
5364 int element_bitsize
= tree_to_uhwi (bitsize
);
5365 FOR_EACH_VEC_ELT (new_phis
, i
, new_phi
)
5368 if (gimple_code (new_phi
) == GIMPLE_PHI
)
5369 vec_temp
= PHI_RESULT (new_phi
);
5371 vec_temp
= gimple_assign_lhs (new_phi
);
5372 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
, bitsize
,
5374 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5375 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5376 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5377 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5379 /* In SLP we don't need to apply reduction operation, so we just
5380 collect s' values in SCALAR_RESULTS. */
5382 scalar_results
.safe_push (new_temp
);
5384 for (bit_offset
= element_bitsize
;
5385 bit_offset
< vec_size_in_bits
;
5386 bit_offset
+= element_bitsize
)
5388 tree bitpos
= bitsize_int (bit_offset
);
5389 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vec_temp
,
5392 epilog_stmt
= gimple_build_assign (new_scalar_dest
, rhs
);
5393 new_name
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5394 gimple_assign_set_lhs (epilog_stmt
, new_name
);
5395 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5399 /* In SLP we don't need to apply reduction operation, so
5400 we just collect s' values in SCALAR_RESULTS. */
5401 new_temp
= new_name
;
5402 scalar_results
.safe_push (new_name
);
5406 epilog_stmt
= gimple_build_assign (new_scalar_dest
, code
,
5407 new_name
, new_temp
);
5408 new_temp
= make_ssa_name (new_scalar_dest
, epilog_stmt
);
5409 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5410 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5415 /* The only case where we need to reduce scalar results in SLP, is
5416 unrolling. If the size of SCALAR_RESULTS is greater than
5417 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5418 REDUC_GROUP_SIZE. */
5421 tree res
, first_res
, new_res
;
5424 /* Reduce multiple scalar results in case of SLP unrolling. */
5425 for (j
= group_size
; scalar_results
.iterate (j
, &res
);
5428 first_res
= scalar_results
[j
% group_size
];
5429 new_stmt
= gimple_build_assign (new_scalar_dest
, code
,
5431 new_res
= make_ssa_name (new_scalar_dest
, new_stmt
);
5432 gimple_assign_set_lhs (new_stmt
, new_res
);
5433 gsi_insert_before (&exit_gsi
, new_stmt
, GSI_SAME_STMT
);
5434 scalar_results
[j
% group_size
] = new_res
;
5438 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5439 scalar_results
.safe_push (new_temp
);
5442 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5443 == INTEGER_INDUC_COND_REDUCTION
)
5444 && !operand_equal_p (initial_def
, induc_val
, 0))
5446 /* Earlier we set the initial value to be a vector of induc_val
5447 values. Check the result and if it is induc_val then replace
5448 with the original initial value, unless induc_val is
5449 the same as initial_def already. */
5450 tree zcompare
= build2 (EQ_EXPR
, boolean_type_node
, new_temp
,
5453 tree tmp
= make_ssa_name (new_scalar_dest
);
5454 epilog_stmt
= gimple_build_assign (tmp
, COND_EXPR
, zcompare
,
5455 initial_def
, new_temp
);
5456 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5457 scalar_results
[0] = tmp
;
5461 vect_finalize_reduction
:
5466 /* 2.5 Adjust the final result by the initial value of the reduction
5467 variable. (When such adjustment is not needed, then
5468 'adjustment_def' is zero). For example, if code is PLUS we create:
5469 new_temp = loop_exit_def + adjustment_def */
5473 gcc_assert (!slp_reduc
);
5474 if (nested_in_vect_loop
)
5476 new_phi
= new_phis
[0];
5477 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) == VECTOR_TYPE
);
5478 expr
= build2 (code
, vectype
, PHI_RESULT (new_phi
), adjustment_def
);
5479 new_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5483 new_temp
= scalar_results
[0];
5484 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def
)) != VECTOR_TYPE
);
5485 expr
= build2 (code
, scalar_type
, new_temp
, adjustment_def
);
5486 new_dest
= vect_create_destination_var (scalar_dest
, scalar_type
);
5489 epilog_stmt
= gimple_build_assign (new_dest
, expr
);
5490 new_temp
= make_ssa_name (new_dest
, epilog_stmt
);
5491 gimple_assign_set_lhs (epilog_stmt
, new_temp
);
5492 gsi_insert_before (&exit_gsi
, epilog_stmt
, GSI_SAME_STMT
);
5493 if (nested_in_vect_loop
)
5495 stmt_vec_info epilog_stmt_info
= loop_vinfo
->add_stmt (epilog_stmt
);
5496 STMT_VINFO_RELATED_STMT (epilog_stmt_info
)
5497 = STMT_VINFO_RELATED_STMT (loop_vinfo
->lookup_stmt (new_phi
));
5500 scalar_results
.quick_push (new_temp
);
5502 scalar_results
[0] = new_temp
;
5505 scalar_results
[0] = new_temp
;
5507 new_phis
[0] = epilog_stmt
;
5510 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5511 phis with new adjusted scalar results, i.e., replace use <s_out0>
5516 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5517 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5518 v_out2 = reduce <v_out1>
5519 s_out3 = extract_field <v_out2, 0>
5520 s_out4 = adjust_result <s_out3>
5527 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5528 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5529 v_out2 = reduce <v_out1>
5530 s_out3 = extract_field <v_out2, 0>
5531 s_out4 = adjust_result <s_out3>
5536 /* In SLP reduction chain we reduce vector results into one vector if
5537 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5538 LHS of the last stmt in the reduction chain, since we are looking for
5539 the loop exit phi node. */
5540 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
5542 stmt_vec_info dest_stmt_info
5543 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1]);
5544 scalar_dest
= gimple_assign_lhs (dest_stmt_info
->stmt
);
5548 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5549 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5550 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5551 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5552 correspond to the first vector stmt, etc.
5553 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5554 if (group_size
> new_phis
.length ())
5556 ratio
= group_size
/ new_phis
.length ();
5557 gcc_assert (!(group_size
% new_phis
.length ()));
5562 stmt_vec_info epilog_stmt_info
= NULL
;
5563 for (k
= 0; k
< group_size
; k
++)
5567 epilog_stmt_info
= loop_vinfo
->lookup_stmt (new_phis
[k
/ ratio
]);
5568 reduction_phi_info
= reduction_phis
[k
/ ratio
];
5570 inner_phi
= inner_phis
[k
/ ratio
];
5575 stmt_vec_info scalar_stmt_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[k
];
5577 orig_stmt_info
= STMT_VINFO_RELATED_STMT (scalar_stmt_info
);
5578 /* SLP statements can't participate in patterns. */
5579 gcc_assert (!orig_stmt_info
);
5580 scalar_dest
= gimple_assign_lhs (scalar_stmt_info
->stmt
);
5584 /* Find the loop-closed-use at the loop exit of the original scalar
5585 result. (The reduction result is expected to have two immediate uses -
5586 one at the latch block, and one at the loop exit). */
5587 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
5588 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
)))
5589 && !is_gimple_debug (USE_STMT (use_p
)))
5590 phis
.safe_push (USE_STMT (use_p
));
5592 /* While we expect to have found an exit_phi because of loop-closed-ssa
5593 form we can end up without one if the scalar cycle is dead. */
5595 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
5599 stmt_vec_info exit_phi_vinfo
5600 = loop_vinfo
->lookup_stmt (exit_phi
);
5604 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = inner_phi
;
5606 STMT_VINFO_VEC_STMT (exit_phi_vinfo
) = epilog_stmt_info
;
5608 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo
)
5609 != vect_double_reduction_def
)
5612 /* Handle double reduction:
5614 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5615 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5616 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5617 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5619 At that point the regular reduction (stmt2 and stmt3) is
5620 already vectorized, as well as the exit phi node, stmt4.
5621 Here we vectorize the phi node of double reduction, stmt1, and
5622 update all relevant statements. */
5624 /* Go through all the uses of s2 to find double reduction phi
5625 node, i.e., stmt1 above. */
5626 orig_name
= PHI_RESULT (exit_phi
);
5627 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
5629 stmt_vec_info use_stmt_vinfo
;
5630 tree vect_phi_init
, preheader_arg
, vect_phi_res
;
5631 basic_block bb
= gimple_bb (use_stmt
);
5633 /* Check that USE_STMT is really double reduction phi
5635 if (gimple_code (use_stmt
) != GIMPLE_PHI
5636 || gimple_phi_num_args (use_stmt
) != 2
5637 || bb
->loop_father
!= outer_loop
)
5639 use_stmt_vinfo
= loop_vinfo
->lookup_stmt (use_stmt
);
5641 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo
)
5642 != vect_double_reduction_def
)
5645 /* Create vector phi node for double reduction:
5646 vs1 = phi <vs0, vs2>
5647 vs1 was created previously in this function by a call to
5648 vect_get_vec_def_for_operand and is stored in
5650 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5651 vs0 is created here. */
5653 /* Create vector phi node. */
5654 vect_phi
= create_phi_node (vec_initial_def
, bb
);
5655 loop_vec_info_for_loop (outer_loop
)->add_stmt (vect_phi
);
5657 /* Create vs0 - initial def of the double reduction phi. */
5658 preheader_arg
= PHI_ARG_DEF_FROM_EDGE (use_stmt
,
5659 loop_preheader_edge (outer_loop
));
5660 vect_phi_init
= get_initial_def_for_reduction
5661 (stmt_info
, preheader_arg
, NULL
);
5663 /* Update phi node arguments with vs0 and vs2. */
5664 add_phi_arg (vect_phi
, vect_phi_init
,
5665 loop_preheader_edge (outer_loop
),
5667 add_phi_arg (vect_phi
, PHI_RESULT (inner_phi
->stmt
),
5668 loop_latch_edge (outer_loop
), UNKNOWN_LOCATION
);
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_NOTE
, vect_location
,
5671 "created double reduction phi node: %G",
5674 vect_phi_res
= PHI_RESULT (vect_phi
);
5676 /* Replace the use, i.e., set the correct vs1 in the regular
5677 reduction phi node. FORNOW, NCOPIES is always 1, so the
5678 loop is redundant. */
5679 stmt_vec_info use_info
= reduction_phi_info
;
5680 for (j
= 0; j
< ncopies
; j
++)
5682 edge pr_edge
= loop_preheader_edge (loop
);
5683 SET_PHI_ARG_DEF (as_a
<gphi
*> (use_info
->stmt
),
5684 pr_edge
->dest_idx
, vect_phi_res
);
5685 use_info
= STMT_VINFO_RELATED_STMT (use_info
);
5692 if (nested_in_vect_loop
)
5701 /* Find the loop-closed-use at the loop exit of the original scalar
5702 result. (The reduction result is expected to have two immediate uses,
5703 one at the latch block, and one at the loop exit). For double
5704 reductions we are looking for exit phis of the outer loop. */
5705 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, scalar_dest
)
5707 if (!flow_bb_inside_loop_p (loop
, gimple_bb (USE_STMT (use_p
))))
5709 if (!is_gimple_debug (USE_STMT (use_p
)))
5710 phis
.safe_push (USE_STMT (use_p
));
5714 if (double_reduc
&& gimple_code (USE_STMT (use_p
)) == GIMPLE_PHI
)
5716 tree phi_res
= PHI_RESULT (USE_STMT (use_p
));
5718 FOR_EACH_IMM_USE_FAST (phi_use_p
, phi_imm_iter
, phi_res
)
5720 if (!flow_bb_inside_loop_p (loop
,
5721 gimple_bb (USE_STMT (phi_use_p
)))
5722 && !is_gimple_debug (USE_STMT (phi_use_p
)))
5723 phis
.safe_push (USE_STMT (phi_use_p
));
5729 FOR_EACH_VEC_ELT (phis
, i
, exit_phi
)
5731 /* Replace the uses: */
5732 orig_name
= PHI_RESULT (exit_phi
);
5733 scalar_result
= scalar_results
[k
];
5734 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, orig_name
)
5735 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
5736 SET_USE (use_p
, scalar_result
);
5743 /* Return a vector of type VECTYPE that is equal to the vector select
5744 operation "MASK ? VEC : IDENTITY". Insert the select statements
5748 merge_with_identity (gimple_stmt_iterator
*gsi
, tree mask
, tree vectype
,
5749 tree vec
, tree identity
)
5751 tree cond
= make_temp_ssa_name (vectype
, NULL
, "cond");
5752 gimple
*new_stmt
= gimple_build_assign (cond
, VEC_COND_EXPR
,
5753 mask
, vec
, identity
);
5754 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5758 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5759 order, starting with LHS. Insert the extraction statements before GSI and
5760 associate the new scalar SSA names with variable SCALAR_DEST.
5761 Return the SSA name for the result. */
5764 vect_expand_fold_left (gimple_stmt_iterator
*gsi
, tree scalar_dest
,
5765 tree_code code
, tree lhs
, tree vector_rhs
)
5767 tree vectype
= TREE_TYPE (vector_rhs
);
5768 tree scalar_type
= TREE_TYPE (vectype
);
5769 tree bitsize
= TYPE_SIZE (scalar_type
);
5770 unsigned HOST_WIDE_INT vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
5771 unsigned HOST_WIDE_INT element_bitsize
= tree_to_uhwi (bitsize
);
5773 for (unsigned HOST_WIDE_INT bit_offset
= 0;
5774 bit_offset
< vec_size_in_bits
;
5775 bit_offset
+= element_bitsize
)
5777 tree bitpos
= bitsize_int (bit_offset
);
5778 tree rhs
= build3 (BIT_FIELD_REF
, scalar_type
, vector_rhs
,
5781 gassign
*stmt
= gimple_build_assign (scalar_dest
, rhs
);
5782 rhs
= make_ssa_name (scalar_dest
, stmt
);
5783 gimple_assign_set_lhs (stmt
, rhs
);
5784 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
5786 stmt
= gimple_build_assign (scalar_dest
, code
, lhs
, rhs
);
5787 tree new_name
= make_ssa_name (scalar_dest
, stmt
);
5788 gimple_assign_set_lhs (stmt
, new_name
);
5789 gsi_insert_before (gsi
, stmt
, GSI_SAME_STMT
);
5795 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5796 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5797 statement. CODE is the operation performed by STMT_INFO and OPS are
5798 its scalar operands. REDUC_INDEX is the index of the operand in
5799 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5800 implements in-order reduction, or IFN_LAST if we should open-code it.
5801 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5802 that should be used to control the operation in a fully-masked loop.

GSI is the position before which helper statements are inserted.
SLP_NODE, when used, supplies the scalar statements and the vector
operand defs.  The generated statements are pushed onto
SLP_TREE_VEC_STMTS (SLP_NODE) or recorded in STMT_VINFO_VEC_STMT
(STMT_INFO) and *VEC_STMT.

NOTE(review): the conditions that select between the SLP and non-SLP
paths, and the function's return statement, are not visible in this
listing - confirm against the full source. */
5805 vectorize_fold_left_reduction (stmt_vec_info stmt_info
,
5806 gimple_stmt_iterator
*gsi
,
5807 stmt_vec_info
*vec_stmt
, slp_tree slp_node
,
5808 gimple
*reduc_def_stmt
,
5809 tree_code code
, internal_fn reduc_fn
,
5810 tree ops
[3], tree vectype_in
,
5811 int reduc_index
, vec_loop_masks
*masks
)
5813 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5814 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5815 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
5816 stmt_vec_info new_stmt_info
= NULL
;
5822 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
/* Preconditions: STMT_INFO is not nested in an outer vectorized loop, a
   single copy is generated, CODE is a binary operation, the reduction
   operand is operand 0 for MINUS_EXPR and operand 1 otherwise, and the
   statement was classified as a FOLD_LEFT_REDUCTION. */
5824 gcc_assert (!nested_in_vect_loop_p (loop
, stmt_info
));
5825 gcc_assert (ncopies
== 1);
5826 gcc_assert (TREE_CODE_LENGTH (code
) == binary_op
);
5827 gcc_assert (reduc_index
== (code
== MINUS_EXPR
? 0 : 1));
5828 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
5829 == FOLD_LEFT_REDUCTION
);
/* The output and input vector types must have the same number of
   elements. */
5832 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out
),
5833 TYPE_VECTOR_SUBPARTS (vectype_in
)));
/* OP0 is the operand of the statement that is not the reduction
   variable. */
5835 tree op0
= ops
[1 - reduc_index
];
5838 stmt_vec_info scalar_dest_def_info
;
5839 auto_vec
<tree
> vec_oprnds0
;
/* Collect the SLP vector defs of both scalar operands, keep those of the
   non-reduction operand in VEC_OPRNDS0 and release the temporary
   vectors.  The last scalar statement of SLP_NODE provides the scalar
   destination. */
5842 auto_vec
<vec
<tree
> > vec_defs (2);
5843 auto_vec
<tree
> sops(2);
5844 sops
.quick_push (ops
[0]);
5845 sops
.quick_push (ops
[1]);
5846 vect_get_slp_defs (sops
, slp_node
, &vec_defs
);
5847 vec_oprnds0
.safe_splice (vec_defs
[1 - reduc_index
]);
5848 vec_defs
[0].release ();
5849 vec_defs
[1].release ();
5850 group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
5851 scalar_dest_def_info
= SLP_TREE_SCALAR_STMTS (slp_node
)[group_size
- 1];
/* Single vector def of OP0 for STMT_INFO, which then also provides the
   scalar destination. */
5855 tree loop_vec_def0
= vect_get_vec_def_for_operand (op0
, stmt_info
);
5856 vec_oprnds0
.create (1);
5857 vec_oprnds0
.quick_push (loop_vec_def0
);
5858 scalar_dest_def_info
= stmt_info
;
5861 tree scalar_dest
= gimple_assign_lhs (scalar_dest_def_info
->stmt
);
5862 tree scalar_type
= TREE_TYPE (scalar_dest
);
/* The running scalar value starts as the result of the reduction phi. */
5863 tree reduc_var
= gimple_phi_result (reduc_def_stmt
);
/* More than one vector operand is only expected with an SLP node. */
5865 int vec_num
= vec_oprnds0
.length ();
5866 gcc_assert (vec_num
== 1 || slp_node
);
5867 tree vec_elem_type
= TREE_TYPE (vectype_out
);
5868 gcc_checking_assert (useless_type_conversion_p (scalar_type
, vec_elem_type
));
/* A zero vector is built for fully-masked loops.  NOTE(review): presumably
   the IDENTITY argument of the merge_with_identity call below, whose final
   argument is not visible in this listing - confirm. */
5870 tree vector_identity
= NULL_TREE
;
5871 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
5872 vector_identity
= build_zero_cst (vectype_out
);
5874 tree scalar_dest_var
= vect_create_destination_var (scalar_dest
, NULL
);
/* Process the vector operands in order, chaining the running scalar
   result through REDUC_VAR. */
5877 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
/* Loop mask for vector I; only fetched in fully-masked loops. */
5880 tree mask
= NULL_TREE
;
5881 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
5882 mask
= vect_get_loop_mask (gsi
, masks
, vec_num
, vectype_in
, i
);
5884 /* Handle MINUS by adding the negative. */
5885 if (reduc_fn
!= IFN_LAST
&& code
== MINUS_EXPR
)
5887 tree negated
= make_ssa_name (vectype_out
);
5888 new_stmt
= gimple_build_assign (negated
, NEGATE_EXPR
, def0
);
5889 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
/* Replace DEF0 by the result of a vector select on MASK.  NOTE(review): the
   guarding condition and the final argument are not visible here. */
5894 def0
= merge_with_identity (gsi
, mask
, vectype_out
, def0
,
5897 /* On the first iteration the input is simply the scalar phi
5898 result, and for subsequent iterations it is the output of
5899 the preceding operation. */
5900 if (reduc_fn
!= IFN_LAST
)
5902 new_stmt
= gimple_build_call_internal (reduc_fn
, 2, reduc_var
, def0
);
5903 /* For chained SLP reductions the output of the previous reduction
5904 operation serves as the input of the next. For the final statement
5905 the output cannot be a temporary - we reuse the original
5906 scalar destination of the last statement. */
5907 if (i
!= vec_num
- 1)
5909 gimple_set_lhs (new_stmt
, scalar_dest_var
);
5910 reduc_var
= make_ssa_name (scalar_dest_var
, new_stmt
);
5911 gimple_set_lhs (new_stmt
, reduc_var
);
/* Expand the reduction with vect_expand_fold_left and pick up the
   statement that defines its result.  NOTE(review): presumably the
   open-coded path taken when REDUC_FN is IFN_LAST; the branch keyword is
   not visible here - confirm. */
5916 reduc_var
= vect_expand_fold_left (gsi
, scalar_dest_var
, code
,
5918 new_stmt
= SSA_NAME_DEF_STMT (reduc_var
);
5919 /* Remove the statement, so that we can use the same code paths
5920 as for statements that we've just created. */
5921 gimple_stmt_iterator tmp_gsi
= gsi_for_stmt (new_stmt
);
5922 gsi_remove (&tmp_gsi
, true);
/* The statement for the last vector defines SCALAR_DEST itself and is
   finished with vect_finish_replace_stmt.  NOTE(review): the other
   statements presumably go through vect_finish_stmt_generation; the
   branch structure is not visible here. */
5925 if (i
== vec_num
- 1)
5927 gimple_set_lhs (new_stmt
, scalar_dest
);
5928 new_stmt_info
= vect_finish_replace_stmt (scalar_dest_def_info
,
5932 new_stmt_info
= vect_finish_stmt_generation (scalar_dest_def_info
,
/* Record the generated statement on the SLP node. */
5936 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt_info
);
/* Record the final statement as the vectorized form of STMT_INFO and hand
   it back through VEC_STMT. */
5940 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt_info
;
5945 /* Function is_nonwrapping_integer_induction.
5947 Check if STMT_VINO (which is part of loop LOOP) both increments and
5948 does not cause overflow. */
5951 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo
, struct loop
*loop
)
5953 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
5954 tree base
= STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
);
5955 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
);
5956 tree lhs_type
= TREE_TYPE (gimple_phi_result (phi
));
5957 widest_int ni
, max_loop_value
, lhs_max
;
5958 wi::overflow_type overflow
= wi::OVF_NONE
;
5960 /* Make sure the loop is integer based. */
5961 if (TREE_CODE (base
) != INTEGER_CST
5962 || TREE_CODE (step
) != INTEGER_CST
)
5965 /* Check that the max size of the loop will not wrap. */
5967 if (TYPE_OVERFLOW_UNDEFINED (lhs_type
))
5970 if (! max_stmt_executions (loop
, &ni
))
5973 max_loop_value
= wi::mul (wi::to_widest (step
), ni
, TYPE_SIGN (lhs_type
),
5978 max_loop_value
= wi::add (wi::to_widest (base
), max_loop_value
,
5979 TYPE_SIGN (lhs_type
), &overflow
);
5983 return (wi::min_precision (max_loop_value
, TYPE_SIGN (lhs_type
))
5984 <= TYPE_PRECISION (lhs_type
));
5987 /* Check if masking can be supported by inserting a conditional expression.
5988 CODE is the code for the operation. COND_FN is the conditional internal
5989 function, if it exists. VECTYPE_IN is the type of the vector input.

NOTE(review): only the first test of the body is visible in this listing;
the value returned for it and any later checks on CODE must be confirmed
against the full source. */
5991 use_mask_by_cond_expr_p (enum tree_code code
, internal_fn cond_fn
,
/* Test whether COND_FN exists (is not IFN_LAST) and is directly supported
   for VECTYPE_IN when optimizing for speed. */
5994 if (cond_fn
!= IFN_LAST
5995 && direct_internal_fn_supported_p (cond_fn
, vectype_in
,
5996 OPTIMIZE_FOR_SPEED
))
6010 /* Insert a conditional expression to enable masked vectorization. CODE is the
6011 code for the operation. VOP is the array of operands. MASK is the loop
6012 mask. GSI is a statement iterator used to place the new conditional
expression.

Both visible variants replace VOP[1] with a new "masked_op1" SSA name
defined by a VEC_COND_EXPR on MASK that is inserted before GSI; they
differ only in the value selected where MASK is false.

NOTE(review): the dispatch on CODE that chooses between the two variants
is not visible in this listing - confirm against the full source. */
6015 build_vect_cond_expr (enum tree_code code
, tree vop
[3], tree mask
,
6016 gimple_stmt_iterator
*gsi
)
/* Variant 1: masked_op1 = MASK ? VOP[1] : zero vector. */
6022 tree vectype
= TREE_TYPE (vop
[1]);
6023 tree zero
= build_zero_cst (vectype
);
6024 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6025 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6026 mask
, vop
[1], zero
);
6027 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6028 vop
[1] = masked_op1
;
/* Variant 2: masked_op1 = MASK ? VOP[1] : VOP[0]. */
6034 tree vectype
= TREE_TYPE (vop
[1]);
6035 tree masked_op1
= make_temp_ssa_name (vectype
, NULL
, "masked_op1");
6036 gassign
*select
= gimple_build_assign (masked_op1
, VEC_COND_EXPR
,
6037 mask
, vop
[1], vop
[0]);
6038 gsi_insert_before (gsi
, select
, GSI_SAME_STMT
);
6039 vop
[1] = masked_op1
;
6048 /* Function vectorizable_reduction.
6050 Check if STMT_INFO performs a reduction operation that can be vectorized.
6051 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6052 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6053 Return true if STMT_INFO is vectorizable in this way.
6055 This function also handles reduction idioms (patterns) that have been
6056 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6057 may be of this form:
6058 X = pattern_expr (arg0, arg1, ..., X)
6059 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6060 sequence that had been detected and replaced by the pattern-stmt
6063 This function also handles reduction of condition expressions, for example:
6064 for (int i = 0; i < N; i++)
6067 This is handled by vectorising the loop and creating an additional vector
6068 containing the loop indexes for which "a[i] < value" was true. In the
6069 function epilogue this is reduced to a single max value and then used to
6070 index into the vector of results.
6072 In some cases of reduction patterns, the type of the reduction variable X is
6073 different than the type of the other arguments of STMT_INFO.
6074 In such cases, the vectype that is used when transforming STMT_INFO into
6075 a vector stmt is different than the vectype that is used to determine the
6076 vectorization factor, because it consists of a different number of elements
6077 than the actual number of elements that are being operated upon in parallel.
6079 For example, consider an accumulation of shorts into an int accumulator.
6080 On some targets it's possible to vectorize this pattern operating on 8
6081 shorts at a time (hence, the vectype for purposes of determining the
6082 vectorization factor should be V8HI); on the other hand, the vectype that
6083 is used to create the vector form is actually V4SI (the type of the result).
6085 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6086 indicates what is the actual level of parallelism (V8HI in the example), so
6087 that the right vectorization factor would be derived. This vectype
6088 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6089 be used to create the vectorized stmt. The right vectype for the vectorized
6090 stmt is obtained from the type of the result X:
6091 get_vectype_for_scalar_type (TREE_TYPE (X))
6093 This means that, contrary to "regular" reductions (or "regular" stmts in
6094 general), the following equation:
6095 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6096 does *NOT* necessarily hold for reduction patterns. */
6099 vectorizable_reduction (stmt_vec_info stmt_info
, gimple_stmt_iterator
*gsi
,
6100 stmt_vec_info
*vec_stmt
, slp_tree slp_node
,
6101 slp_instance slp_node_instance
,
6102 stmt_vector_for_cost
*cost_vec
)
6106 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
6107 tree vectype_in
= NULL_TREE
;
6108 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6109 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6110 enum tree_code code
, orig_code
;
6111 internal_fn reduc_fn
;
6112 machine_mode vec_mode
;
6115 tree new_temp
= NULL_TREE
;
6116 enum vect_def_type dt
, cond_reduc_dt
= vect_unknown_def_type
;
6117 stmt_vec_info cond_stmt_vinfo
= NULL
;
6118 enum tree_code cond_reduc_op_code
= ERROR_MARK
;
6124 stmt_vec_info prev_stmt_info
, prev_phi_info
;
6125 bool single_defuse_cycle
= false;
6126 stmt_vec_info new_stmt_info
= NULL
;
6129 enum vect_def_type dts
[3];
6130 bool nested_cycle
= false, found_nested_cycle_def
= false;
6131 bool double_reduc
= false;
6133 struct loop
* def_stmt_loop
;
6135 auto_vec
<tree
> vec_oprnds0
;
6136 auto_vec
<tree
> vec_oprnds1
;
6137 auto_vec
<tree
> vec_oprnds2
;
6138 auto_vec
<tree
> vect_defs
;
6139 auto_vec
<stmt_vec_info
> phis
;
6142 tree cr_index_scalar_type
= NULL_TREE
, cr_index_vector_type
= NULL_TREE
;
6143 tree cond_reduc_val
= NULL_TREE
;
6145 /* Make sure it was already recognized as a reduction computation. */
6146 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
6147 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_nested_cycle
)
6150 if (nested_in_vect_loop_p (loop
, stmt_info
))
6153 nested_cycle
= true;
6156 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6157 gcc_assert (slp_node
6158 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
);
6160 if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
6162 tree phi_result
= gimple_phi_result (phi
);
6163 /* Analysis is fully done on the reduction stmt invocation. */
6167 slp_node_instance
->reduc_phis
= slp_node
;
6169 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
6173 if (STMT_VINFO_REDUC_TYPE (stmt_info
) == FOLD_LEFT_REDUCTION
)
6174 /* Leave the scalar phi in place. Note that checking
6175 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6176 for reductions involving a single statement. */
6179 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
6180 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
6182 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info
)
6183 == EXTRACT_LAST_REDUCTION
)
6184 /* Leave the scalar phi in place. */
6187 gassign
*reduc_stmt
= as_a
<gassign
*> (reduc_stmt_info
->stmt
);
6188 code
= gimple_assign_rhs_code (reduc_stmt
);
6189 for (unsigned k
= 1; k
< gimple_num_ops (reduc_stmt
); ++k
)
6191 tree op
= gimple_op (reduc_stmt
, k
);
6192 if (op
== phi_result
)
6194 if (k
== 1 && code
== COND_EXPR
)
6196 bool is_simple_use
= vect_is_simple_use (op
, loop_vinfo
, &dt
);
6197 gcc_assert (is_simple_use
);
6198 if (dt
== vect_constant_def
|| dt
== vect_external_def
)
6201 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6202 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op
)))))
6203 vectype_in
= get_vectype_for_scalar_type (TREE_TYPE (op
));
6206 /* For a nested cycle we might end up with an operation like
6207 phi_result * phi_result. */
6209 vectype_in
= STMT_VINFO_VECTYPE (stmt_info
);
6210 gcc_assert (vectype_in
);
6215 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6217 stmt_vec_info use_stmt_info
;
6219 && STMT_VINFO_RELEVANT (reduc_stmt_info
) <= vect_used_only_live
6220 && (use_stmt_info
= loop_vinfo
->lookup_single_use (phi_result
))
6221 && vect_stmt_to_vectorize (use_stmt_info
) == reduc_stmt_info
)
6222 single_defuse_cycle
= true;
6224 /* Create the destination vector */
6225 scalar_dest
= gimple_assign_lhs (reduc_stmt
);
6226 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
6229 /* The size vect_schedule_slp_instance computes is off for us. */
6230 vec_num
= vect_get_num_vectors
6231 (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6232 * SLP_TREE_SCALAR_STMTS (slp_node
).length (),
6237 /* Generate the reduction PHIs upfront. */
6238 prev_phi_info
= NULL
;
6239 for (j
= 0; j
< ncopies
; j
++)
6241 if (j
== 0 || !single_defuse_cycle
)
6243 for (i
= 0; i
< vec_num
; i
++)
6245 /* Create the reduction-phi that defines the reduction
6247 gimple
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
6248 stmt_vec_info new_phi_info
= loop_vinfo
->add_stmt (new_phi
);
6251 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_phi_info
);
6255 STMT_VINFO_VEC_STMT (stmt_info
)
6256 = *vec_stmt
= new_phi_info
;
6258 STMT_VINFO_RELATED_STMT (prev_phi_info
) = new_phi_info
;
6259 prev_phi_info
= new_phi_info
;
6268 /* 1. Is vectorizable reduction? */
6269 /* Not supportable if the reduction variable is used in the loop, unless
6270 it's a reduction chain. */
6271 if (STMT_VINFO_RELEVANT (stmt_info
) > vect_used_in_outer
6272 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6275 /* Reductions that are not used even in an enclosing outer-loop,
6276 are expected to be "live" (used out of the loop). */
6277 if (STMT_VINFO_RELEVANT (stmt_info
) == vect_unused_in_scope
6278 && !STMT_VINFO_LIVE_P (stmt_info
))
6281 /* 2. Has this been recognized as a reduction pattern?
6283 Check if STMT represents a pattern that has been recognized
6284 in earlier analysis stages. For stmts that represent a pattern,
6285 the STMT_VINFO_RELATED_STMT field records the last stmt in
6286 the original sequence that constitutes the pattern. */
6288 stmt_vec_info orig_stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
6291 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
));
6292 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info
));
6295 /* 3. Check the operands of the operation. The first operands are defined
6296 inside the loop body. The last operand is the reduction variable,
6297 which is defined by the loop-header-phi. */
6299 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
6302 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt
)))
6304 case GIMPLE_BINARY_RHS
:
6305 code
= gimple_assign_rhs_code (stmt
);
6306 op_type
= TREE_CODE_LENGTH (code
);
6307 gcc_assert (op_type
== binary_op
);
6308 ops
[0] = gimple_assign_rhs1 (stmt
);
6309 ops
[1] = gimple_assign_rhs2 (stmt
);
6312 case GIMPLE_TERNARY_RHS
:
6313 code
= gimple_assign_rhs_code (stmt
);
6314 op_type
= TREE_CODE_LENGTH (code
);
6315 gcc_assert (op_type
== ternary_op
);
6316 ops
[0] = gimple_assign_rhs1 (stmt
);
6317 ops
[1] = gimple_assign_rhs2 (stmt
);
6318 ops
[2] = gimple_assign_rhs3 (stmt
);
6321 case GIMPLE_UNARY_RHS
:
6328 if (code
== COND_EXPR
&& slp_node
)
6331 scalar_dest
= gimple_assign_lhs (stmt
);
6332 scalar_type
= TREE_TYPE (scalar_dest
);
6333 if (!POINTER_TYPE_P (scalar_type
) && !INTEGRAL_TYPE_P (scalar_type
)
6334 && !SCALAR_FLOAT_TYPE_P (scalar_type
))
6337 /* Do not try to vectorize bit-precision reductions. */
6338 if (!type_has_mode_precision_p (scalar_type
))
6341 /* All uses but the last are expected to be defined in the loop.
6342 The last use is the reduction variable. In case of nested cycle this
6343 assumption is not true: we use reduc_index to record the index of the
6344 reduction variable. */
6345 stmt_vec_info reduc_def_info
;
6347 reduc_def_info
= STMT_VINFO_REDUC_DEF (orig_stmt_info
);
6349 reduc_def_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
6350 gcc_assert (reduc_def_info
);
6351 gphi
*reduc_def_phi
= as_a
<gphi
*> (reduc_def_info
->stmt
);
6352 tree reduc_def
= PHI_RESULT (reduc_def_phi
);
6353 int reduc_index
= -1;
6354 for (i
= 0; i
< op_type
; i
++)
6356 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6357 if (i
== 0 && code
== COND_EXPR
)
6360 stmt_vec_info def_stmt_info
;
6361 is_simple_use
= vect_is_simple_use (ops
[i
], loop_vinfo
, &dts
[i
], &tem
,
6364 gcc_assert (is_simple_use
);
6365 if (dt
== vect_reduction_def
6366 && ops
[i
] == reduc_def
)
6373 /* To properly compute ncopies we are interested in the widest
6374 input type in case we're looking at a widening accumulation. */
6376 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in
)))
6377 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem
)))))
6381 if (dt
!= vect_internal_def
6382 && dt
!= vect_external_def
6383 && dt
!= vect_constant_def
6384 && dt
!= vect_induction_def
6385 && !(dt
== vect_nested_cycle
&& nested_cycle
))
6388 if (dt
== vect_nested_cycle
6389 && ops
[i
] == reduc_def
)
6391 found_nested_cycle_def
= true;
6395 if (i
== 1 && code
== COND_EXPR
)
6397 /* Record how value of COND_EXPR is defined. */
6398 if (dt
== vect_constant_def
)
6401 cond_reduc_val
= ops
[i
];
6403 if (dt
== vect_induction_def
6405 && is_nonwrapping_integer_induction (def_stmt_info
, loop
))
6408 cond_stmt_vinfo
= def_stmt_info
;
6414 vectype_in
= vectype_out
;
6416 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6417 directly used in stmt. */
6418 if (reduc_index
== -1)
6420 if (STMT_VINFO_REDUC_TYPE (stmt_info
) == FOLD_LEFT_REDUCTION
)
6422 if (dump_enabled_p ())
6423 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6424 "in-order reduction chain without SLP.\n");
6429 if (!(reduc_index
== -1
6430 || dts
[reduc_index
] == vect_reduction_def
6431 || dts
[reduc_index
] == vect_nested_cycle
6432 || ((dts
[reduc_index
] == vect_internal_def
6433 || dts
[reduc_index
] == vect_external_def
6434 || dts
[reduc_index
] == vect_constant_def
6435 || dts
[reduc_index
] == vect_induction_def
)
6436 && nested_cycle
&& found_nested_cycle_def
)))
6438 /* For pattern recognized stmts, orig_stmt might be a reduction,
6439 but some helper statements for the pattern might not, or
6440 might be COND_EXPRs with reduction uses in the condition. */
6441 gcc_assert (orig_stmt_info
);
6445 /* PHIs should not participate in patterns. */
6446 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info
));
6447 enum vect_reduction_type v_reduc_type
6448 = STMT_VINFO_REDUC_TYPE (reduc_def_info
);
6449 stmt_vec_info tmp
= STMT_VINFO_REDUC_DEF (reduc_def_info
);
6451 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) = v_reduc_type
;
6452 /* If we have a condition reduction, see if we can simplify it further. */
6453 if (v_reduc_type
== COND_REDUCTION
)
6455 /* TODO: We can't yet handle reduction chains, since we need to treat
6456 each COND_EXPR in the chain specially, not just the last one.
6459 x_1 = PHI <x_3, ...>
6460 x_2 = a_2 ? ... : x_1;
6461 x_3 = a_3 ? ... : x_2;
6463 we're interested in the last element in x_3 for which a_2 || a_3
6464 is true, whereas the current reduction chain handling would
6465 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6466 as a reduction operation. */
6467 if (reduc_index
== -1)
6469 if (dump_enabled_p ())
6470 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6471 "conditional reduction chains not supported\n");
6475 /* vect_is_simple_reduction ensured that operand 2 is the
6476 loop-carried operand. */
6477 gcc_assert (reduc_index
== 2);
6479 /* Loop peeling modifies initial value of reduction PHI, which
6480 makes the reduction stmt to be transformed different to the
6481 original stmt analyzed. We need to record reduction code for
6482 CONST_COND_REDUCTION type reduction at analyzing stage, thus
6483 it can be used directly at transform stage. */
6484 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
) == MAX_EXPR
6485 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
) == MIN_EXPR
)
6487 /* Also set the reduction type to CONST_COND_REDUCTION. */
6488 gcc_assert (cond_reduc_dt
== vect_constant_def
);
6489 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) = CONST_COND_REDUCTION
;
6491 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST
,
6492 vectype_in
, OPTIMIZE_FOR_SPEED
))
6494 if (dump_enabled_p ())
6495 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6496 "optimizing condition reduction with"
6497 " FOLD_EXTRACT_LAST.\n");
6498 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
) = EXTRACT_LAST_REDUCTION
;
6500 else if (cond_reduc_dt
== vect_induction_def
)
6503 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo
);
6504 tree step
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo
);
6506 gcc_assert (TREE_CODE (base
) == INTEGER_CST
6507 && TREE_CODE (step
) == INTEGER_CST
);
6508 cond_reduc_val
= NULL_TREE
;
6509 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6510 above base; punt if base is the minimum value of the type for
6511 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6512 if (tree_int_cst_sgn (step
) == -1)
6514 cond_reduc_op_code
= MIN_EXPR
;
6515 if (tree_int_cst_sgn (base
) == -1)
6516 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6517 else if (tree_int_cst_lt (base
,
6518 TYPE_MAX_VALUE (TREE_TYPE (base
))))
6520 = int_const_binop (PLUS_EXPR
, base
, integer_one_node
);
6524 cond_reduc_op_code
= MAX_EXPR
;
6525 if (tree_int_cst_sgn (base
) == 1)
6526 cond_reduc_val
= build_int_cst (TREE_TYPE (base
), 0);
6527 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base
)),
6530 = int_const_binop (MINUS_EXPR
, base
, integer_one_node
);
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_NOTE
, vect_location
,
6536 "condition expression based on "
6537 "integer induction.\n");
6538 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
6539 = INTEGER_INDUC_COND_REDUCTION
;
6542 else if (cond_reduc_dt
== vect_constant_def
)
6544 enum vect_def_type cond_initial_dt
;
6545 gimple
*def_stmt
= SSA_NAME_DEF_STMT (ops
[reduc_index
]);
6546 tree cond_initial_val
6547 = PHI_ARG_DEF_FROM_EDGE (def_stmt
, loop_preheader_edge (loop
));
6549 gcc_assert (cond_reduc_val
!= NULL_TREE
);
6550 vect_is_simple_use (cond_initial_val
, loop_vinfo
, &cond_initial_dt
);
6551 if (cond_initial_dt
== vect_constant_def
6552 && types_compatible_p (TREE_TYPE (cond_initial_val
),
6553 TREE_TYPE (cond_reduc_val
)))
6555 tree e
= fold_binary (LE_EXPR
, boolean_type_node
,
6556 cond_initial_val
, cond_reduc_val
);
6557 if (e
&& (integer_onep (e
) || integer_zerop (e
)))
6559 if (dump_enabled_p ())
6560 dump_printf_loc (MSG_NOTE
, vect_location
,
6561 "condition expression based on "
6562 "compile time constant.\n");
6563 /* Record reduction code at analysis stage. */
6564 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
)
6565 = integer_onep (e
) ? MAX_EXPR
: MIN_EXPR
;
6566 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
)
6567 = CONST_COND_REDUCTION
;
6574 gcc_assert (tmp
== orig_stmt_info
6575 || REDUC_GROUP_FIRST_ELEMENT (tmp
) == orig_stmt_info
);
6577 /* We changed STMT to be the first stmt in reduction chain, hence we
6578 check that in this case the first element in the chain is STMT. */
6579 gcc_assert (tmp
== stmt_info
6580 || REDUC_GROUP_FIRST_ELEMENT (tmp
) == stmt_info
);
6582 if (STMT_VINFO_LIVE_P (reduc_def_info
))
6588 ncopies
= vect_get_num_copies (loop_vinfo
, vectype_in
);
6590 gcc_assert (ncopies
>= 1);
6592 vec_mode
= TYPE_MODE (vectype_in
);
6593 poly_uint64 nunits_out
= TYPE_VECTOR_SUBPARTS (vectype_out
);
6597 def_bb
= gimple_bb (reduc_def_phi
);
6598 def_stmt_loop
= def_bb
->loop_father
;
6599 def_arg
= PHI_ARG_DEF_FROM_EDGE (reduc_def_phi
,
6600 loop_preheader_edge (def_stmt_loop
));
6601 stmt_vec_info def_arg_stmt_info
= loop_vinfo
->lookup_def (def_arg
);
6602 if (def_arg_stmt_info
6603 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info
)
6604 == vect_double_reduction_def
))
6605 double_reduc
= true;
6608 vect_reduction_type reduction_type
6609 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info
);
6610 if ((double_reduc
|| reduction_type
!= TREE_CODE_REDUCTION
)
6613 if (dump_enabled_p ())
6614 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6615 "multiple types in double reduction or condition "
6620 if (code
== COND_EXPR
)
6622 /* Only call during the analysis stage, otherwise we'll lose
6624 if (!vec_stmt
&& !vectorizable_condition (stmt_info
, gsi
, NULL
,
6625 true, NULL
, cost_vec
))
6627 if (dump_enabled_p ())
6628 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6629 "unsupported condition in reduction\n");
6633 else if (code
== LSHIFT_EXPR
|| code
== RSHIFT_EXPR
6634 || code
== LROTATE_EXPR
|| code
== RROTATE_EXPR
)
6636 /* Only call during the analysis stage, otherwise we'll lose
6637 STMT_VINFO_TYPE. We only support this for nested cycles
6638 without double reductions at the moment. */
6641 || (!vec_stmt
&& !vectorizable_shift (stmt_info
, gsi
, NULL
,
6644 if (dump_enabled_p ())
6645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6646 "unsupported shift or rotation in reduction\n");
6652 /* 4. Supportable by target? */
6654 /* 4.1. check support for the operation in the loop */
6655 optab
= optab_for_tree_code (code
, vectype_in
, optab_default
);
6658 if (dump_enabled_p ())
6659 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6665 if (optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
6667 if (dump_enabled_p ())
6668 dump_printf (MSG_NOTE
, "op not supported by target.\n");
6670 if (maybe_ne (GET_MODE_SIZE (vec_mode
), UNITS_PER_WORD
)
6671 || !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
6674 if (dump_enabled_p ())
6675 dump_printf (MSG_NOTE
, "proceeding using word mode.\n");
6678 /* Worthwhile without SIMD support? */
6679 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in
))
6680 && !vect_worthwhile_without_simd_p (loop_vinfo
, code
))
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6684 "not worthwhile without SIMD support.\n");
6690 /* 4.2. Check support for the epilog operation.
6692 If STMT represents a reduction pattern, then the type of the
6693 reduction variable may be different than the type of the rest
6694 of the arguments. For example, consider the case of accumulation
6695 of shorts into an int accumulator; The original code:
6696 S1: int_a = (int) short_a;
6697 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6700 STMT: int_acc = widen_sum <short_a, int_acc>
6703 1. The tree-code that is used to create the vector operation in the
6704 epilog code (that reduces the partial results) is not the
6705 tree-code of STMT, but is rather the tree-code of the original
6706 stmt from the pattern that STMT is replacing. I.e, in the example
6707 above we want to use 'widen_sum' in the loop, but 'plus' in the
6709 2. The type (mode) we use to check available target support
6710 for the vector operation to be created in the *epilog*, is
6711 determined by the type of the reduction variable (in the example
6712 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6713 However the type (mode) we use to check available target support
6714 for the vector operation to be created *inside the loop*, is
6715 determined by the type of the other arguments to STMT (in the
6716 example we'd check this: optab_handler (widen_sum_optab,
6719 This is contrary to "regular" reductions, in which the types of all
6720 the arguments are the same as the type of the reduction variable.
6721 For "regular" reductions we can therefore use the same vector type
6722 (and also the same tree-code) when generating the epilog code and
6723 when generating the code inside the loop. */
6726 && (reduction_type
== TREE_CODE_REDUCTION
6727 || reduction_type
== FOLD_LEFT_REDUCTION
))
6729 /* This is a reduction pattern: get the vectype from the type of the
6730 reduction variable, and get the tree-code from orig_stmt. */
6731 orig_code
= gimple_assign_rhs_code (orig_stmt_info
->stmt
);
6732 gcc_assert (vectype_out
);
6733 vec_mode
= TYPE_MODE (vectype_out
);
6737 /* Regular reduction: the same vectype and tree-code as used for
6738 the vector code inside the loop can be used for the epilog code. */
6741 if (code
== MINUS_EXPR
)
6742 orig_code
= PLUS_EXPR
;
6744 /* For simple condition reductions, replace with the actual expression
6745 we want to base our reduction around. */
6746 if (reduction_type
== CONST_COND_REDUCTION
)
6748 orig_code
= STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info
);
6749 gcc_assert (orig_code
== MAX_EXPR
|| orig_code
== MIN_EXPR
);
6751 else if (reduction_type
== INTEGER_INDUC_COND_REDUCTION
)
6752 orig_code
= cond_reduc_op_code
;
6755 reduc_fn
= IFN_LAST
;
6757 if (reduction_type
== TREE_CODE_REDUCTION
6758 || reduction_type
== FOLD_LEFT_REDUCTION
6759 || reduction_type
== INTEGER_INDUC_COND_REDUCTION
6760 || reduction_type
== CONST_COND_REDUCTION
)
6762 if (reduction_type
== FOLD_LEFT_REDUCTION
6763 ? fold_left_reduction_fn (orig_code
, &reduc_fn
)
6764 : reduction_fn_for_scalar_code (orig_code
, &reduc_fn
))
6766 if (reduc_fn
!= IFN_LAST
6767 && !direct_internal_fn_supported_p (reduc_fn
, vectype_out
,
6768 OPTIMIZE_FOR_SPEED
))
6770 if (dump_enabled_p ())
6771 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6772 "reduc op not supported by target.\n");
6774 reduc_fn
= IFN_LAST
;
6779 if (!nested_cycle
|| double_reduc
)
6781 if (dump_enabled_p ())
6782 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6783 "no reduc code for scalar code.\n");
6789 else if (reduction_type
== COND_REDUCTION
)
6791 int scalar_precision
6792 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type
));
6793 cr_index_scalar_type
= make_unsigned_type (scalar_precision
);
6794 cr_index_vector_type
= build_vector_type (cr_index_scalar_type
,
6797 if (direct_internal_fn_supported_p (IFN_REDUC_MAX
, cr_index_vector_type
,
6798 OPTIMIZE_FOR_SPEED
))
6799 reduc_fn
= IFN_REDUC_MAX
;
6802 if (reduction_type
!= EXTRACT_LAST_REDUCTION
6803 && (!nested_cycle
|| double_reduc
)
6804 && reduc_fn
== IFN_LAST
6805 && !nunits_out
.is_constant ())
6807 if (dump_enabled_p ())
6808 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6809 "missing target support for reduction on"
6810 " variable-length vectors.\n");
6814 /* For SLP reductions, see if there is a neutral value we can use. */
6815 tree neutral_op
= NULL_TREE
;
6817 neutral_op
= neutral_op_for_slp_reduction
6818 (slp_node_instance
->reduc_phis
, code
,
6819 REDUC_GROUP_FIRST_ELEMENT (stmt_info
) != NULL
);
6821 if (double_reduc
&& reduction_type
== FOLD_LEFT_REDUCTION
)
6823 /* We can't support in-order reductions of code such as this:
6825 for (int i = 0; i < n1; ++i)
6826 for (int j = 0; j < n2; ++j)
6829 since GCC effectively transforms the loop when vectorizing:
6831 for (int i = 0; i < n1 / VF; ++i)
6832 for (int j = 0; j < n2; ++j)
6833 for (int k = 0; k < VF; ++k)
6836 which is a reassociation of the original operation. */
6837 if (dump_enabled_p ())
6838 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6839 "in-order double reduction not supported.\n");
6844 if (reduction_type
== FOLD_LEFT_REDUCTION
6846 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
6848 /* We cannot use in-order reductions in this case because there is
6849 an implicit reassociation of the operations involved. */
6850 if (dump_enabled_p ())
6851 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6852 "in-order unchained SLP reductions not supported.\n");
6856 /* For double reductions, and for SLP reductions with a neutral value,
6857 we construct a variable-length initial vector by loading a vector
6858 full of the neutral value and then shift-and-inserting the start
6859 values into the low-numbered elements. */
6860 if ((double_reduc
|| neutral_op
)
6861 && !nunits_out
.is_constant ()
6862 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT
,
6863 vectype_out
, OPTIMIZE_FOR_SPEED
))
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6867 "reduction on variable-length vectors requires"
6868 " target support for a vector-shift-and-insert"
6873 /* Check extra constraints for variable-length unchained SLP reductions. */
6874 if (STMT_SLP_TYPE (stmt_info
)
6875 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
6876 && !nunits_out
.is_constant ())
6878 /* We checked above that we could build the initial vector when
6879 there's a neutral element value. Check here for the case in
6880 which each SLP statement has its own initial value and in which
6881 that value needs to be repeated for every instance of the
6882 statement within the initial vector. */
6883 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
6884 scalar_mode elt_mode
= SCALAR_TYPE_MODE (TREE_TYPE (vectype_out
));
6886 && !can_duplicate_and_interleave_p (group_size
, elt_mode
))
6888 if (dump_enabled_p ())
6889 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6890 "unsupported form of SLP reduction for"
6891 " variable-length vectors: cannot build"
6892 " initial vector.\n");
6895 /* The epilogue code relies on the number of elements being a multiple
6896 of the group size. The duplicate-and-interleave approach to setting
6897 up the initial vector does too. */
6898 if (!multiple_p (nunits_out
, group_size
))
6900 if (dump_enabled_p ())
6901 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6902 "unsupported form of SLP reduction for"
6903 " variable-length vectors: the vector size"
6904 " is not a multiple of the number of results.\n");
6909 /* In case of widening multiplication by a constant, we update the type
6910 of the constant to be the type of the other operand. We check that the
6911 constant fits the type in the pattern recognition pass. */
6912 if (code
== DOT_PROD_EXPR
6913 && !types_compatible_p (TREE_TYPE (ops
[0]), TREE_TYPE (ops
[1])))
6915 if (TREE_CODE (ops
[0]) == INTEGER_CST
)
6916 ops
[0] = fold_convert (TREE_TYPE (ops
[1]), ops
[0]);
6917 else if (TREE_CODE (ops
[1]) == INTEGER_CST
)
6918 ops
[1] = fold_convert (TREE_TYPE (ops
[0]), ops
[1]);
6921 if (dump_enabled_p ())
6922 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6923 "invalid types in dot-prod\n");
6929 if (reduction_type
== COND_REDUCTION
)
6933 if (! max_loop_iterations (loop
, &ni
))
6935 if (dump_enabled_p ())
6936 dump_printf_loc (MSG_NOTE
, vect_location
,
6937 "loop count not known, cannot create cond "
6941 /* Convert backedges to iterations. */
6944 /* The additional index will be the same type as the condition. Check
6945 that the loop can fit into this less one (because we'll use up the
6946 zero slot for when there are no matches). */
6947 tree max_index
= TYPE_MAX_VALUE (cr_index_scalar_type
);
6948 if (wi::geu_p (ni
, wi::to_widest (max_index
)))
6950 if (dump_enabled_p ())
6951 dump_printf_loc (MSG_NOTE
, vect_location
,
6952 "loop size is greater than data size.\n");
6957 /* In case the vectorization factor (VF) is bigger than the number
6958 of elements that we can fit in a vectype (nunits), we have to generate
6959 more than one vector stmt - i.e - we need to "unroll" the
6960 vector stmt by a factor VF/nunits. For more details see documentation
6961 in vectorizable_operation. */
6963 /* If the reduction is used in an outer loop we need to generate
6964 VF intermediate results, like so (e.g. for ncopies=2):
6969 (i.e. we generate VF results in 2 registers).
6970 In this case we have a separate def-use cycle for each copy, and therefore
6971 for each copy we get the vector def for the reduction variable from the
6972 respective phi node created for this copy.
6974 Otherwise (the reduction is unused in the loop nest), we can combine
6975 together intermediate results, like so (e.g. for ncopies=2):
6979 (i.e. we generate VF/2 results in a single register).
6980 In this case for each copy we get the vector def for the reduction variable
6981 from the vectorized reduction operation generated in the previous iteration.
6983 This only works when we see both the reduction PHI and its only consumer
6984 in vectorizable_reduction and there are no intermediate stmts
6986 stmt_vec_info use_stmt_info
;
6987 tree reduc_phi_result
= gimple_phi_result (reduc_def_phi
);
6989 && (STMT_VINFO_RELEVANT (stmt_info
) <= vect_used_only_live
)
6990 && (use_stmt_info
= loop_vinfo
->lookup_single_use (reduc_phi_result
))
6991 && vect_stmt_to_vectorize (use_stmt_info
) == stmt_info
)
6993 single_defuse_cycle
= true;
6997 epilog_copies
= ncopies
;
6999 /* If the reduction stmt is one of the patterns that have lane
7000 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7002 && ! single_defuse_cycle
)
7003 && (code
== DOT_PROD_EXPR
7004 || code
== WIDEN_SUM_EXPR
7005 || code
== SAD_EXPR
))
7007 if (dump_enabled_p ())
7008 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7009 "multi def-use cycle not possible for lane-reducing "
7010 "reduction operation\n");
7015 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7019 internal_fn cond_fn
= get_conditional_internal_fn (code
);
7020 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
7021 bool mask_by_cond_expr
= use_mask_by_cond_expr_p (code
, cond_fn
, vectype_in
);
7023 if (!vec_stmt
) /* transformation not required. */
7025 vect_model_reduction_cost (stmt_info
, reduc_fn
, ncopies
, cost_vec
);
7026 if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
))
7028 if (reduction_type
!= FOLD_LEFT_REDUCTION
7029 && !mask_by_cond_expr
7030 && (cond_fn
== IFN_LAST
7031 || !direct_internal_fn_supported_p (cond_fn
, vectype_in
,
7032 OPTIMIZE_FOR_SPEED
)))
7034 if (dump_enabled_p ())
7035 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7036 "can't use a fully-masked loop because no"
7037 " conditional operation is available.\n");
7038 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7040 else if (reduc_index
== -1)
7042 if (dump_enabled_p ())
7043 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7044 "can't use a fully-masked loop for chained"
7046 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7049 vect_record_loop_mask (loop_vinfo
, masks
, ncopies
* vec_num
,
7052 if (dump_enabled_p ()
7053 && reduction_type
== FOLD_LEFT_REDUCTION
)
7054 dump_printf_loc (MSG_NOTE
, vect_location
,
7055 "using an in-order (fold-left) reduction.\n");
7056 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
7062 if (dump_enabled_p ())
7063 dump_printf_loc (MSG_NOTE
, vect_location
, "transform reduction.\n");
7065 /* FORNOW: Multiple types are not supported for condition. */
7066 if (code
== COND_EXPR
)
7067 gcc_assert (ncopies
== 1);
7069 bool masked_loop_p
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
7071 if (reduction_type
== FOLD_LEFT_REDUCTION
)
7072 return vectorize_fold_left_reduction
7073 (stmt_info
, gsi
, vec_stmt
, slp_node
, reduc_def_phi
, code
,
7074 reduc_fn
, ops
, vectype_in
, reduc_index
, masks
);
7076 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
7078 gcc_assert (!slp_node
);
7079 return vectorizable_condition (stmt_info
, gsi
, vec_stmt
,
7083 /* Create the destination vector */
7084 vec_dest
= vect_create_destination_var (scalar_dest
, vectype_out
);
7086 prev_stmt_info
= NULL
;
7087 prev_phi_info
= NULL
;
7090 vec_oprnds0
.create (1);
7091 vec_oprnds1
.create (1);
7092 if (op_type
== ternary_op
)
7093 vec_oprnds2
.create (1);
7096 phis
.create (vec_num
);
7097 vect_defs
.create (vec_num
);
7099 vect_defs
.quick_push (NULL_TREE
);
7102 phis
.splice (SLP_TREE_VEC_STMTS (slp_node_instance
->reduc_phis
));
7104 phis
.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info
));
7106 for (j
= 0; j
< ncopies
; j
++)
7108 if (code
== COND_EXPR
)
7110 gcc_assert (!slp_node
);
7111 vectorizable_condition (stmt_info
, gsi
, vec_stmt
,
7115 if (code
== LSHIFT_EXPR
7116 || code
== RSHIFT_EXPR
)
7118 vectorizable_shift (stmt_info
, gsi
, vec_stmt
, slp_node
, NULL
);
7127 /* Get vec defs for all the operands except the reduction index,
7128 ensuring the ordering of the ops in the vector is kept. */
7129 auto_vec
<tree
, 3> slp_ops
;
7130 auto_vec
<vec
<tree
>, 3> vec_defs
;
7132 slp_ops
.quick_push (ops
[0]);
7133 slp_ops
.quick_push (ops
[1]);
7134 if (op_type
== ternary_op
)
7135 slp_ops
.quick_push (ops
[2]);
7137 vect_get_slp_defs (slp_ops
, slp_node
, &vec_defs
);
7139 vec_oprnds0
.safe_splice (vec_defs
[0]);
7140 vec_defs
[0].release ();
7141 vec_oprnds1
.safe_splice (vec_defs
[1]);
7142 vec_defs
[1].release ();
7143 if (op_type
== ternary_op
)
7145 vec_oprnds2
.safe_splice (vec_defs
[2]);
7146 vec_defs
[2].release ();
7151 vec_oprnds0
.quick_push
7152 (vect_get_vec_def_for_operand (ops
[0], stmt_info
));
7153 vec_oprnds1
.quick_push
7154 (vect_get_vec_def_for_operand (ops
[1], stmt_info
));
7155 if (op_type
== ternary_op
)
7156 vec_oprnds2
.quick_push
7157 (vect_get_vec_def_for_operand (ops
[2], stmt_info
));
7164 gcc_assert (reduc_index
!= -1 || ! single_defuse_cycle
);
7166 if (single_defuse_cycle
&& reduc_index
== 0)
7167 vec_oprnds0
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
7170 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
7172 if (single_defuse_cycle
&& reduc_index
== 1)
7173 vec_oprnds1
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
7176 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
7178 if (op_type
== ternary_op
)
7180 if (single_defuse_cycle
&& reduc_index
== 2)
7181 vec_oprnds2
[0] = gimple_get_lhs (new_stmt_info
->stmt
);
7184 = vect_get_vec_def_for_stmt_copy (loop_vinfo
,
7190 FOR_EACH_VEC_ELT (vec_oprnds0
, i
, def0
)
7192 tree vop
[3] = { def0
, vec_oprnds1
[i
], NULL_TREE
};
7193 if (masked_loop_p
&& !mask_by_cond_expr
)
7195 /* Make sure that the reduction accumulator is vop[0]. */
7196 if (reduc_index
== 1)
7198 gcc_assert (commutative_tree_code (code
));
7199 std::swap (vop
[0], vop
[1]);
7201 tree mask
= vect_get_loop_mask (gsi
, masks
, vec_num
* ncopies
,
7202 vectype_in
, i
* ncopies
+ j
);
7203 gcall
*call
= gimple_build_call_internal (cond_fn
, 4, mask
,
7206 new_temp
= make_ssa_name (vec_dest
, call
);
7207 gimple_call_set_lhs (call
, new_temp
);
7208 gimple_call_set_nothrow (call
, true);
7210 = vect_finish_stmt_generation (stmt_info
, call
, gsi
);
7214 if (op_type
== ternary_op
)
7215 vop
[2] = vec_oprnds2
[i
];
7217 if (masked_loop_p
&& mask_by_cond_expr
)
7219 tree mask
= vect_get_loop_mask (gsi
, masks
,
7221 vectype_in
, i
* ncopies
+ j
);
7222 build_vect_cond_expr (code
, vop
, mask
, gsi
);
7225 gassign
*new_stmt
= gimple_build_assign (vec_dest
, code
,
7226 vop
[0], vop
[1], vop
[2]);
7227 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
7228 gimple_assign_set_lhs (new_stmt
, new_temp
);
7230 = vect_finish_stmt_generation (stmt_info
, new_stmt
, gsi
);
7235 SLP_TREE_VEC_STMTS (slp_node
).quick_push (new_stmt_info
);
7236 vect_defs
.quick_push (new_temp
);
7239 vect_defs
[0] = new_temp
;
7246 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= new_stmt_info
;
7248 STMT_VINFO_RELATED_STMT (prev_stmt_info
) = new_stmt_info
;
7250 prev_stmt_info
= new_stmt_info
;
7253 /* Finalize the reduction-phi (set its arguments) and create the
7254 epilog reduction code. */
7255 if ((!single_defuse_cycle
|| code
== COND_EXPR
) && !slp_node
)
7256 vect_defs
[0] = gimple_get_lhs ((*vec_stmt
)->stmt
);
7258 vect_create_epilog_for_reduction (vect_defs
, stmt_info
, reduc_def_phi
,
7259 epilog_copies
, reduc_fn
, phis
,
7260 double_reduc
, slp_node
, slp_node_instance
,
7261 cond_reduc_val
, cond_reduc_op_code
,
7267 /* Function vect_min_worthwhile_factor.
7269 For a loop where we could vectorize the operation indicated by CODE,
7270 return the minimum vectorization factor that makes it worthwhile
7271 to use generic vectors. */
7273 vect_min_worthwhile_factor (enum tree_code code
)
7293 /* Return true if VINFO indicates we are doing loop vectorization and if
7294 it is worth decomposing CODE operations into scalar operations for
7295 that loop's vectorization factor. */
/* VINFO is the vectorization context being queried and CODE is the tree
   code of the operation under consideration.  The visible part of the
   test accepts only when the loop's vectorization factor can be read as
   a single constant and that constant is at least
   vect_min_worthwhile_factor (CODE).
   NOTE(review): this excerpt lacks the line that opens the returned
   conjunction; presumably it first tests LOOP_VINFO for null -- confirm
   against the full file.  */
7298 vect_worthwhile_without_simd_p (vec_info
*vinfo
, tree_code code
)
/* Convert the generic VINFO to a loop_vec_info.  Presumably dyn_cast
   yields null when VINFO does not describe a loop -- TODO confirm.  */
7300 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
/* Out-parameter handed to is_constant () below; it is what gets compared
   against the minimum worthwhile factor.  */
7301 unsigned HOST_WIDE_INT value
;
/* The comparison with VALUE is only reached if is_constant () succeeded,
   because the operands are joined with &&.  */
7303 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&value
)
7304 && value
>= vect_min_worthwhile_factor (code
));
/* NOTE(review): this listing is a damaged extraction of the function.
   Tokens are split across lines, every statement starts with a stale
   line number, and the gaps in that numbering show that lines are
   absent.  Several locals used below (ncopies, latch_e, loop_arg,
   exit_phi, stmts, new_bb, expr, new_name, vec_def, new_stmt, ivn, vfp,
   def, i) have no visible declaration, and no brace, `else' or `return'
   is visible.  Presumably those lines were dropped by the extraction;
   restore the function from a pristine copy before relying on it.  */
7307 /* Function vectorizable_induction
7309 Check if STMT_INFO performs an induction computation that can be vectorized.
7310 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7311 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7312 Return true if STMT_INFO is vectorizable in this way. */
7315 vectorizable_induction (stmt_vec_info stmt_info
,
7316 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
7317 stmt_vec_info
*vec_stmt
, slp_tree slp_node
,
7318 stmt_vector_for_cost
*cost_vec
)
7320 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
7321 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7323 bool nested_in_vect_loop
= false;
7324 struct loop
*iv_loop
;
7326 edge pe
= loop_preheader_edge (loop
);
7328 tree new_vec
, vec_init
, vec_step
, t
;
7331 gphi
*induction_phi
;
7332 tree induc_def
, vec_dest
;
7333 tree init_expr
, step_expr
;
7334 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
7338 imm_use_iterator imm_iter
;
7339 use_operand_p use_p
;
7343 gimple_stmt_iterator si
;
7345 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
/* Applicability checks: the statement has to be relevant and has to
   have been classified as vect_induction_def.  NOTE(review): the
   statements guarded by these two conditions are not visible here;
   presumably they reject the statement -- confirm.  */
7349 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7352 /* Make sure it was recognized as induction computation. */
7353 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
7356 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7357 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7362 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7363 gcc_assert (ncopies
>= 1);
/* Nested case: the code guarded by nested_in_vect_loop_p works on
   loop->inner, looks for a use of the inner latch value outside the
   inner loop (the exit phi), and finally selects loop->inner as
   IV_LOOP.  */
7365 /* FORNOW. These restrictions should be relaxed. */
7366 if (nested_in_vect_loop_p (loop
, stmt_info
))
7368 imm_use_iterator imm_iter
;
7369 use_operand_p use_p
;
7376 if (dump_enabled_p ())
7377 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7378 "multiple types in nested loop.\n");
7382 /* FORNOW: outer loop induction with SLP not supported. */
7383 if (STMT_SLP_TYPE (stmt_info
))
7387 latch_e
= loop_latch_edge (loop
->inner
);
7388 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7389 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7391 gimple
*use_stmt
= USE_STMT (use_p
);
7392 if (is_gimple_debug (use_stmt
))
7395 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
7397 exit_phi
= use_stmt
;
7403 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
7404 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
7405 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
7407 if (dump_enabled_p ())
7408 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7409 "inner-loop induction only used outside "
7410 "of the outer vectorized loop.\n");
7415 nested_in_vect_loop
= true;
7416 iv_loop
= loop
->inner
;
7420 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
7422 if (slp_node
&& !nunits
.is_constant ())
7424 /* The current SLP code creates the initial value element-by-element. */
7425 if (dump_enabled_p ())
7426 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7427 "SLP induction not supported for variable-length"
/* Analysis-only call (VEC_STMT is null): record the statement kind and
   account for its cost in COST_VEC.  */
7432 if (!vec_stmt
) /* transformation not required. */
7434 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
7435 DUMP_VECT_SCOPE ("vectorizable_induction");
7436 vect_model_induction_cost (stmt_info
, ncopies
, cost_vec
);
7442 /* Compute a vector variable, initialized with the first VF values of
7443 the induction variable. E.g., for an iv with IV_PHI='X' and
7444 evolution S, for a vector of 4 units, we want to compute:
7445 [X, X + S, X + 2*S, X + 3*S]. */
7447 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
7450 latch_e
= loop_latch_edge (iv_loop
);
7451 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
7453 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
7454 gcc_assert (step_expr
!= NULL_TREE
);
7456 pe
= loop_preheader_edge (iv_loop
);
7457 init_expr
= PHI_ARG_DEF_FROM_EDGE (phi
,
7458 loop_preheader_edge (iv_loop
));
7461 if (!nested_in_vect_loop
)
7463 /* Convert the initial value to the desired type. */
7464 tree new_type
= TREE_TYPE (vectype
);
7465 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
7467 /* If we are using the loop mask to "peel" for alignment then we need
7468 to adjust the start value here. */
7469 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
7470 if (skip_niters
!= NULL_TREE
)
7472 if (FLOAT_TYPE_P (vectype
))
7473 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
7476 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
7477 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
7478 skip_niters
, step_expr
);
7479 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
7480 init_expr
, skip_step
);
7484 /* Convert the step to the desired type. */
7485 step_expr
= gimple_convert (&stmts
, TREE_TYPE (vectype
), step_expr
);
7489 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7490 gcc_assert (!new_bb
);
7493 /* Find the first insertion point in the BB. */
7494 basic_block bb
= gimple_bb (phi
);
7495 si
= gsi_after_labels (bb
);
7497 /* For SLP induction we have to generate several IVs as for example
7498 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7499 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7500 [VF*S, VF*S, VF*S, VF*S] for all. */
7503 /* Enforced above. */
7504 unsigned int const_nunits
= nunits
.to_constant ();
7506 /* Generate [VF*S, VF*S, ... ]. */
7507 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7509 expr
= build_int_cst (integer_type_node
, vf
);
7510 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7513 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
7514 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7516 if (! CONSTANT_CLASS_P (new_name
))
7517 new_name
= vect_init_vector (stmt_info
, new_name
,
7518 TREE_TYPE (step_expr
), NULL
);
7519 new_vec
= build_vector_from_val (vectype
, new_name
);
7520 vec_step
= vect_init_vector (stmt_info
, new_vec
, vectype
, NULL
);
7522 /* Now generate the IVs. */
7523 unsigned group_size
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7524 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7525 unsigned elts
= const_nunits
* nvects
;
7526 unsigned nivs
= least_common_multiple (group_size
,
7527 const_nunits
) / const_nunits
;
7528 gcc_assert (elts
% group_size
== 0);
7529 tree elt
= init_expr
;
7531 for (ivn
= 0; ivn
< nivs
; ++ivn
)
7533 tree_vector_builder
elts (vectype
, const_nunits
, 1);
7535 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
7537 if (ivn
*const_nunits
+ eltn
>= group_size
7538 && (ivn
* const_nunits
+ eltn
) % group_size
== 0)
7539 elt
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (elt
),
7541 elts
.quick_push (elt
);
7543 vec_init
= gimple_build_vector (&stmts
, &elts
);
7546 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7547 gcc_assert (!new_bb
);
7550 /* Create the induction-phi that defines the induction-operand. */
7551 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7552 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7553 stmt_vec_info induction_phi_info
7554 = loop_vinfo
->add_stmt (induction_phi
);
7555 induc_def
= PHI_RESULT (induction_phi
);
7557 /* Create the iv update inside the loop */
7558 vec_def
= make_ssa_name (vec_dest
);
7559 new_stmt
= gimple_build_assign (vec_def
, PLUS_EXPR
, induc_def
, vec_step
);
7560 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
7561 loop_vinfo
->add_stmt (new_stmt
);
7563 /* Set the arguments of the phi node: */
7564 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
7565 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
7568 SLP_TREE_VEC_STMTS (slp_node
).quick_push (induction_phi_info
);
/* The remaining SLP vector statements (index IVN up to NVECTS - 1) do
   not get a PHI of their own: each is built from the earlier entry
   SLP_TREE_VEC_STMTS[IVN - NIVS] and placed next to it.
   NOTE(review): the operands of the statement built for them are not
   visible in this listing -- confirm against a pristine copy.  */
7571 /* Re-use IVs when we can. */
7575 = least_common_multiple (group_size
, const_nunits
) / group_size
;
7576 /* Generate [VF'*S, VF'*S, ... ]. */
7577 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7579 expr
= build_int_cst (integer_type_node
, vfp
);
7580 expr
= fold_convert (TREE_TYPE (step_expr
), expr
);
7583 expr
= build_int_cst (TREE_TYPE (step_expr
), vfp
);
7584 new_name
= fold_build2 (MULT_EXPR
, TREE_TYPE (step_expr
),
7586 if (! CONSTANT_CLASS_P (new_name
))
7587 new_name
= vect_init_vector (stmt_info
, new_name
,
7588 TREE_TYPE (step_expr
), NULL
);
7589 new_vec
= build_vector_from_val (vectype
, new_name
);
7590 vec_step
= vect_init_vector (stmt_info
, new_vec
, vectype
, NULL
);
7591 for (; ivn
< nvects
; ++ivn
)
7593 gimple
*iv
= SLP_TREE_VEC_STMTS (slp_node
)[ivn
- nivs
]->stmt
;
7595 if (gimple_code (iv
) == GIMPLE_PHI
)
7596 def
= gimple_phi_result (iv
);
7598 def
= gimple_assign_lhs (iv
);
7599 new_stmt
= gimple_build_assign (make_ssa_name (vectype
),
7602 if (gimple_code (iv
) == GIMPLE_PHI
)
7603 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
7606 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
7607 gsi_insert_after (&tgsi
, new_stmt
, GSI_CONTINUE_LINKING
);
7609 SLP_TREE_VEC_STMTS (slp_node
).quick_push
7610 (loop_vinfo
->add_stmt (new_stmt
));
7617 /* Create the vector that holds the initial_value of the induction. */
7618 if (nested_in_vect_loop
)
7620 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7621 been created during vectorization of previous stmts. We obtain it
7622 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7623 vec_init
= vect_get_vec_def_for_operand (init_expr
, stmt_info
);
7624 /* If the initial value is not of proper type, convert it. */
7625 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
7628 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
7632 build1 (VIEW_CONVERT_EXPR
, vectype
,
7634 vec_init
= gimple_assign_lhs (new_stmt
);
7635 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
7637 gcc_assert (!new_bb
);
7638 loop_vinfo
->add_stmt (new_stmt
);
7643 /* iv_loop is the loop to be vectorized. Create:
7644 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7646 new_name
= gimple_convert (&stmts
, TREE_TYPE (vectype
), init_expr
);
7648 unsigned HOST_WIDE_INT const_nunits
;
7649 if (nunits
.is_constant (&const_nunits
))
7651 tree_vector_builder
elts (vectype
, const_nunits
, 1);
7652 elts
.quick_push (new_name
);
7653 for (i
= 1; i
< const_nunits
; i
++)
7655 /* Create: new_name_i = new_name + step_expr */
7656 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
7657 new_name
, step_expr
);
7658 elts
.quick_push (new_name
);
7660 /* Create a vector from [new_name_0, new_name_1, ...,
7661 new_name_nunits-1] */
7662 vec_init
= gimple_build_vector (&stmts
, &elts
);
7664 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
7665 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7666 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, vectype
,
7667 new_name
, step_expr
);
7671 [base, base, base, ...]
7672 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7673 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
7674 gcc_assert (flag_associative_math
);
7675 tree index
= build_index_vector (vectype
, 0, 1);
7676 tree base_vec
= gimple_build_vector_from_val (&stmts
, vectype
,
7678 tree step_vec
= gimple_build_vector_from_val (&stmts
, vectype
,
7680 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, vectype
, index
);
7681 vec_init
= gimple_build (&stmts
, MULT_EXPR
, vectype
,
7682 vec_init
, step_vec
);
7683 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, vectype
,
7684 vec_init
, base_vec
);
7689 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
7690 gcc_assert (!new_bb
);
7695 /* Create the vector that holds the step of the induction. */
7696 if (nested_in_vect_loop
)
7697 /* iv_loop is nested in the loop to be vectorized. Generate:
7698 vec_step = [S, S, S, S] */
7699 new_name
= step_expr
;
7702 /* iv_loop is the loop to be vectorized. Generate:
7703 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7704 gimple_seq seq
= NULL
;
7705 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7707 expr
= build_int_cst (integer_type_node
, vf
);
7708 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
7711 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
7712 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
7716 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
7717 gcc_assert (!new_bb
);
7721 t
= unshare_expr (new_name
);
7722 gcc_assert (CONSTANT_CLASS_P (new_name
)
7723 || TREE_CODE (new_name
) == SSA_NAME
);
7724 new_vec
= build_vector_from_val (vectype
, t
);
7725 vec_step
= vect_init_vector (stmt_info
, new_vec
, vectype
, NULL
);
7728 /* Create the following def-use cycle:
7733 vec_iv = PHI <vec_init, vec_loop>
7737 vec_loop = vec_iv + vec_step; */
7739 /* Create the induction-phi that defines the induction-operand. */
7740 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
7741 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
7742 stmt_vec_info induction_phi_info
= loop_vinfo
->add_stmt (induction_phi
);
7743 induc_def
= PHI_RESULT (induction_phi
);
7745 /* Create the iv update inside the loop */
7746 vec_def
= make_ssa_name (vec_dest
);
7747 new_stmt
= gimple_build_assign (vec_def
, PLUS_EXPR
, induc_def
, vec_step
);
7748 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
7749 stmt_vec_info new_stmt_info
= loop_vinfo
->add_stmt (new_stmt
);
7751 /* Set the arguments of the phi node: */
7752 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
7753 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
7756 STMT_VINFO_VEC_STMT (stmt_info
) = *vec_stmt
= induction_phi_info
;
7758 /* In case that vectorization factor (VF) is bigger than the number
7759 of elements that we can fit in a vectype (nunits), we have to generate
7760 more than one vector stmt - i.e - we need to "unroll" the
7761 vector stmt by a factor VF/nunits. For more details see documentation
7762 in vectorizable_operation. */
7766 gimple_seq seq
= NULL
;
7767 stmt_vec_info prev_stmt_vinfo
;
7768 /* FORNOW. This restriction should be relaxed. */
7769 gcc_assert (!nested_in_vect_loop
);
7771 /* Create the vector that holds the step of the induction. */
7772 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
7774 expr
= build_int_cst (integer_type_node
, nunits
);
7775 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
7778 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
7779 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
7783 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
7784 gcc_assert (!new_bb
);
7787 t
= unshare_expr (new_name
);
7788 gcc_assert (CONSTANT_CLASS_P (new_name
)
7789 || TREE_CODE (new_name
) == SSA_NAME
);
7790 new_vec
= build_vector_from_val (vectype
, t
);
7791 vec_step
= vect_init_vector (stmt_info
, new_vec
, vectype
, NULL
);
7793 vec_def
= induc_def
;
7794 prev_stmt_vinfo
= induction_phi_info
;
7795 for (i
= 1; i
< ncopies
; i
++)
7797 /* vec_i = vec_prev + vec_step */
7798 new_stmt
= gimple_build_assign (vec_dest
, PLUS_EXPR
,
7800 vec_def
= make_ssa_name (vec_dest
, new_stmt
);
7801 gimple_assign_set_lhs (new_stmt
, vec_def
);
7803 gsi_insert_before (&si
, new_stmt
, GSI_SAME_STMT
);
7804 new_stmt_info
= loop_vinfo
->add_stmt (new_stmt
);
7805 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo
) = new_stmt_info
;
7806 prev_stmt_vinfo
= new_stmt_info
;
7810 if (nested_in_vect_loop
)
7812 /* Find the loop-closed exit-phi of the induction, and record
7813 the final vector of induction results: */
7815 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
7817 gimple
*use_stmt
= USE_STMT (use_p
);
7818 if (is_gimple_debug (use_stmt
))
7821 if (!flow_bb_inside_loop_p (iv_loop
, gimple_bb (use_stmt
)))
7823 exit_phi
= use_stmt
;
7829 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
7830 /* FORNOW. Currently not supporting the case that an inner-loop induction
7831 is not used in the outer-loop (i.e. only outside the outer-loop). */
7832 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo
)
7833 && !STMT_VINFO_LIVE_P (stmt_vinfo
));
7835 STMT_VINFO_VEC_STMT (stmt_vinfo
) = new_stmt_info
;
7836 if (dump_enabled_p ())
7837 dump_printf_loc (MSG_NOTE
, vect_location
,
7838 "vector of inductions after inner-loop:%G",
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_NOTE
, vect_location
,
7846 "transform induction: created def-use cycle: %G%G",
7847 induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
/* NOTE(review): this listing is a damaged extraction of the function.
   Tokens are split across lines, every statement starts with a stale
   line number, and the gaps in that numbering show that lines are
   absent.  Locals such as ncopies, vec_entry, new_tree and use_stmt
   have no visible declaration, and no brace, `else' or `return' is
   visible.  One comment below ("Get the last occurrence ...") has lost
   its terminator, so the declaration of POS is lexically inside it.
   Restore the function from a pristine copy before relying on it.  */
7852 /* Function vectorizable_live_operation.
7854 STMT_INFO computes a value that is used outside the loop. Check if
7855 it can be supported. */
7858 vectorizable_live_operation (stmt_vec_info stmt_info
,
7859 gimple_stmt_iterator
*gsi ATTRIBUTE_UNUSED
,
7860 slp_tree slp_node
, int slp_index
,
7861 stmt_vec_info
*vec_stmt
,
7862 stmt_vector_for_cost
*)
7864 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
7865 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
7866 imm_use_iterator imm_iter
;
7867 tree lhs
, lhs_type
, bitsize
, vec_bitsize
;
7868 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
7869 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
7872 auto_vec
<tree
> vec_oprnds
;
7874 poly_uint64 vec_index
= 0;
7876 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
));
7878 if (STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
)
7881 /* FORNOW. CHECKME. */
7882 if (nested_in_vect_loop_p (loop
, stmt_info
))
7885 /* If STMT is not relevant and it is a simple assignment and its inputs are
7886 invariant then it can remain in place, unvectorized. The original last
7887 scalar value that it computes will be used. */
7888 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
7890 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
7891 if (dump_enabled_p ())
7892 dump_printf_loc (MSG_NOTE
, vect_location
,
7893 "statement is simple and uses invariant. Leaving in "
7901 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
7905 gcc_assert (slp_index
>= 0);
7907 int num_scalar
= SLP_TREE_SCALAR_STMTS (slp_node
).length ();
7908 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
7910 /* Get the last occurrence of the scalar index from the concatenation of
7911 all the slp vectors. Calculate which slp vector it is and the index
7913 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
7915 /* Calculate which vector contains the result, and which lane of
7916 that vector we need. */
7917 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
7919 if (dump_enabled_p ())
7920 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7921 "Cannot determine which vector holds the"
7922 " final result.\n");
/* While the loop may still become fully masked, the code below checks
   that the live value can be obtained with IFN_EXTRACT_LAST.  Three
   situations clear LOOP_VINFO_CAN_FULLY_MASK_P, each with its own dump
   message: the target lacks the internal function, the statement is an
   SLP statement, or more than one copy is needed.  Otherwise a loop
   mask is recorded through vect_record_loop_mask.  */
7929 /* No transformation required. */
7930 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
))
7932 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
7933 OPTIMIZE_FOR_SPEED
))
7935 if (dump_enabled_p ())
7936 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7937 "can't use a fully-masked loop because "
7938 "the target doesn't support extract last "
7940 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7944 if (dump_enabled_p ())
7945 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7946 "can't use a fully-masked loop because an "
7947 "SLP statement is live after the loop.\n");
7948 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7950 else if (ncopies
> 1)
7952 if (dump_enabled_p ())
7953 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
7954 "can't use a fully-masked loop because"
7955 " ncopies is greater than 1.\n");
7956 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo
) = false;
7960 gcc_assert (ncopies
== 1 && !slp_node
);
7961 vect_record_loop_mask (loop_vinfo
,
7962 &LOOP_VINFO_MASKS (loop_vinfo
),
7969 /* Use the lhs of the original scalar statement. */
7970 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
7972 lhs
= (is_a
<gphi
*> (stmt
)) ? gimple_phi_result (stmt
)
7973 : gimple_get_lhs (stmt
);
7974 lhs_type
= TREE_TYPE (lhs
);
7976 bitsize
= (VECTOR_BOOLEAN_TYPE_P (vectype
)
7977 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype
)))
7978 : TYPE_SIZE (TREE_TYPE (vectype
)));
7979 vec_bitsize
= TYPE_SIZE (vectype
);
7981 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7982 tree vec_lhs
, bitstart
;
7985 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
7987 /* Get the correct slp vectorized stmt. */
7988 gimple
*vec_stmt
= SLP_TREE_VEC_STMTS (slp_node
)[vec_entry
]->stmt
;
7989 if (gphi
*phi
= dyn_cast
<gphi
*> (vec_stmt
))
7990 vec_lhs
= gimple_phi_result (phi
);
7992 vec_lhs
= gimple_get_lhs (vec_stmt
);
7994 /* Get entry to use. */
7995 bitstart
= bitsize_int (vec_index
);
7996 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
8000 enum vect_def_type dt
= STMT_VINFO_DEF_TYPE (stmt_info
);
8001 vec_lhs
= vect_get_vec_def_for_operand_1 (stmt_info
, dt
);
8002 gcc_checking_assert (ncopies
== 1
8003 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
));
8005 /* For multiple copies, get the last copy. */
8006 for (int i
= 1; i
< ncopies
; ++i
)
8007 vec_lhs
= vect_get_vec_def_for_stmt_copy (loop_vinfo
, vec_lhs
);
8009 /* Get the last lane in the vector. */
8010 bitstart
= int_const_binop (MINUS_EXPR
, vec_bitsize
, bitsize
);
/* The statements that compute the scalar result are collected in STMTS
   and inserted on the loop's single exit edge further down.  */
8013 gimple_seq stmts
= NULL
;
8015 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8019 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8021 where VEC_LHS is the vectorized live-out result and MASK is
8022 the loop mask for the final iteration. */
8023 gcc_assert (ncopies
== 1 && !slp_node
);
8024 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
8025 tree mask
= vect_get_loop_mask (gsi
, &LOOP_VINFO_MASKS (loop_vinfo
),
8027 tree scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
,
8028 scalar_type
, mask
, vec_lhs
);
8030 /* Convert the extracted vector element to the required scalar type. */
8031 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
/* Here the lane is read with a BIT_FIELD_REF of BITSIZE bits starting
   at BITSTART and converted to the type of the scalar lhs.
   NOTE(review): presumably this is the arm taken when the loop is not
   fully masked; the `else' is not visible in this listing.  */
8035 tree bftype
= TREE_TYPE (vectype
);
8036 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
8037 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
8038 new_tree
= build3 (BIT_FIELD_REF
, bftype
, vec_lhs
, bitsize
, bitstart
);
8039 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
8040 &stmts
, true, NULL_TREE
);
8044 gsi_insert_seq_on_edge_immediate (single_exit (loop
), stmts
);
8046 /* Replace use of lhs with newly computed result. If the use stmt is a
8047 single arg PHI, just replace all uses of PHI result. It's necessary
8048 because lcssa PHI defining lhs may be before newly inserted stmt. */
8049 use_operand_p use_p
;
8050 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
8051 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
))
8052 && !is_gimple_debug (use_stmt
))
8054 if (gimple_code (use_stmt
) == GIMPLE_PHI
8055 && gimple_phi_num_args (use_stmt
) == 1)
8057 replace_uses_by (gimple_phi_result (use_stmt
), new_tree
);
8061 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
8062 SET_USE (use_p
, new_tree
);
8064 update_stmt (use_stmt
);
8070 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8073 vect_loop_kill_debug_uses (struct loop
*loop
, stmt_vec_info stmt_info
)
8075 ssa_op_iter op_iter
;
8076 imm_use_iterator imm_iter
;
8077 def_operand_p def_p
;
8080 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt_info
->stmt
, op_iter
, SSA_OP_DEF
)
8082 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
8086 if (!is_gimple_debug (ustmt
))
8089 bb
= gimple_bb (ustmt
);
8091 if (!flow_bb_inside_loop_p (loop
, bb
))
8093 if (gimple_debug_bind_p (ustmt
))
8095 if (dump_enabled_p ())
8096 dump_printf_loc (MSG_NOTE
, vect_location
,
8097 "killing debug use\n");
8099 gimple_debug_bind_reset_value (ustmt
);
8100 update_stmt (ustmt
);
8109 /* Given loop represented by LOOP_VINFO, return true if computation of
8110 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8114 loop_niters_no_overflow (loop_vec_info loop_vinfo
)
8116 /* Constant case. */
8117 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
8119 tree cst_niters
= LOOP_VINFO_NITERS (loop_vinfo
);
8120 tree cst_nitersm1
= LOOP_VINFO_NITERSM1 (loop_vinfo
);
8122 gcc_assert (TREE_CODE (cst_niters
) == INTEGER_CST
);
8123 gcc_assert (TREE_CODE (cst_nitersm1
) == INTEGER_CST
);
8124 if (wi::to_widest (cst_nitersm1
) < wi::to_widest (cst_niters
))
8129 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8130 /* Check the upper bound of loop niters. */
8131 if (get_max_loop_iterations (loop
, &max
))
8133 tree type
= TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
));
8134 signop sgn
= TYPE_SIGN (type
);
8135 widest_int type_max
= widest_int::from (wi::max_value (type
), sgn
);
8142 /* Return a mask type with half the number of elements as TYPE. */
8145 vect_halve_mask_nunits (tree type
)
8147 poly_uint64 nunits
= exact_div (TYPE_VECTOR_SUBPARTS (type
), 2);
8148 return build_truth_vector_type (nunits
, current_vector_size
);
8151 /* Return a mask type with twice as many elements as TYPE. */
8154 vect_double_mask_nunits (tree type
)
8156 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (type
) * 2;
8157 return build_truth_vector_type (nunits
, current_vector_size
);
8160 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8161 contain a sequence of NVECTORS masks that each control a vector of type
8165 vect_record_loop_mask (loop_vec_info loop_vinfo
, vec_loop_masks
*masks
,
8166 unsigned int nvectors
, tree vectype
)
8168 gcc_assert (nvectors
!= 0);
8169 if (masks
->length () < nvectors
)
8170 masks
->safe_grow_cleared (nvectors
);
8171 rgroup_masks
*rgm
= &(*masks
)[nvectors
- 1];
8172 /* The number of scalars per iteration and the number of vectors are
8173 both compile-time constants. */
8174 unsigned int nscalars_per_iter
8175 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
8176 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
8177 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
8179 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
8180 rgm
->mask_type
= build_same_sized_truth_vector_type (vectype
);
8184 /* Given a complete set of masks MASKS, extract mask number INDEX
8185 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8186 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8188 See the comment above vec_loop_masks for more details about the mask
8192 vect_get_loop_mask (gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
8193 unsigned int nvectors
, tree vectype
, unsigned int index
)
8195 rgroup_masks
*rgm
= &(*masks
)[nvectors
- 1];
8196 tree mask_type
= rgm
->mask_type
;
8198 /* Populate the rgroup's mask array, if this is the first time we've
8200 if (rgm
->masks
.is_empty ())
8202 rgm
->masks
.safe_grow_cleared (nvectors
);
8203 for (unsigned int i
= 0; i
< nvectors
; ++i
)
8205 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
8206 /* Provide a dummy definition until the real one is available. */
8207 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
8208 rgm
->masks
[i
] = mask
;
8212 tree mask
= rgm
->masks
[index
];
8213 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
8214 TYPE_VECTOR_SUBPARTS (vectype
)))
8216 /* A loop mask for data type X can be reused for data type Y
8217 if X has N times more elements than Y and if Y's elements
8218 are N times bigger than X's. In this case each sequence
8219 of N elements in the loop mask will be all-zero or all-one.
8220 We can then view-convert the mask so that each sequence of
8221 N elements is replaced by a single element. */
8222 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
8223 TYPE_VECTOR_SUBPARTS (vectype
)));
8224 gimple_seq seq
= NULL
;
8225 mask_type
= build_same_sized_truth_vector_type (vectype
);
8226 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
8228 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
8233 /* Scale profiling counters by estimation for LOOP which is vectorized
8237 scale_profile_for_vect_loop (struct loop
*loop
, unsigned vf
)
8239 edge preheader
= loop_preheader_edge (loop
);
8240 /* Reduce loop iterations by the vectorization factor. */
8241 gcov_type new_est_niter
= niter_for_unrolled_loop (loop
, vf
);
8242 profile_count freq_h
= loop
->header
->count
, freq_e
= preheader
->count ();
8244 if (freq_h
.nonzero_p ())
8246 profile_probability p
;
8248 /* Avoid dropping loop body profile counter to 0 because of zero count
8249 in loop's preheader. */
8250 if (!(freq_e
== profile_count::zero ()))
8251 freq_e
= freq_e
.force_nonzero ();
8252 p
= freq_e
.apply_scale (new_est_niter
+ 1, 1).probability_in (freq_h
);
8253 scale_loop_frequencies (loop
, p
);
8256 edge exit_e
= single_exit (loop
);
8257 exit_e
->probability
= profile_probability::always ()
8258 .apply_scale (1, new_est_niter
+ 1);
8260 edge exit_l
= single_pred_edge (loop
->latch
);
8261 profile_probability prob
= exit_l
->probability
;
8262 exit_l
->probability
= exit_e
->probability
.invert ();
8263 if (prob
.initialized_p () && exit_l
->probability
.initialized_p ())
8264 scale_bbs_frequencies (&loop
->latch
, 1, exit_l
->probability
/ prob
);
8267 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8268 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8272 vect_transform_loop_stmt (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
8273 gimple_stmt_iterator
*gsi
, stmt_vec_info
*seen_store
)
8275 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8276 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8278 if (dump_enabled_p ())
8279 dump_printf_loc (MSG_NOTE
, vect_location
,
8280 "------>vectorizing statement: %G", stmt_info
->stmt
);
8282 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
8283 vect_loop_kill_debug_uses (loop
, stmt_info
);
8285 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
8286 && !STMT_VINFO_LIVE_P (stmt_info
))
8289 if (STMT_VINFO_VECTYPE (stmt_info
))
8292 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
8293 if (!STMT_SLP_TYPE (stmt_info
)
8294 && maybe_ne (nunits
, vf
)
8295 && dump_enabled_p ())
8296 /* For SLP VF is set according to unrolling factor, and not
8297 to vector size, hence for SLP this print is not valid. */
8298 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
8301 /* Pure SLP statements have already been vectorized. We still need
8302 to apply loop vectorization to hybrid SLP statements. */
8303 if (PURE_SLP_STMT (stmt_info
))
8306 if (dump_enabled_p ())
8307 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
8309 if (vect_transform_stmt (stmt_info
, gsi
, NULL
, NULL
))
8310 *seen_store
= stmt_info
;
8313 /* Function vect_transform_loop.
8315 The analysis phase has determined that the loop is vectorizable.
8316 Vectorize the loop - created vectorized stmts to replace the scalar
8317 stmts in the loop, and update the loop exit condition.
8318 Returns scalar epilogue loop if any. */
8321 vect_transform_loop (loop_vec_info loop_vinfo
)
8323 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
8324 struct loop
*epilogue
= NULL
;
8325 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
8326 int nbbs
= loop
->num_nodes
;
8328 tree niters_vector
= NULL_TREE
;
8329 tree step_vector
= NULL_TREE
;
8330 tree niters_vector_mult_vf
= NULL_TREE
;
8331 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
8332 unsigned int lowest_vf
= constant_lower_bound (vf
);
8334 bool check_profitability
= false;
8337 DUMP_VECT_SCOPE ("vec_transform_loop");
8339 loop_vinfo
->shared
->check_datarefs ();
8341 /* Use the more conservative vectorization threshold. If the number
8342 of iterations is constant assume the cost check has been performed
8343 by our caller. If the threshold makes all loops profitable that
8344 run at least the (estimated) vectorization factor number of times
8345 checking is pointless, too. */
8346 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
8347 if (th
>= vect_vf_for_cost (loop_vinfo
)
8348 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
8350 if (dump_enabled_p ())
8351 dump_printf_loc (MSG_NOTE
, vect_location
,
8352 "Profitability threshold is %d loop iterations.\n",
8354 check_profitability
= true;
8357 /* Make sure there exists a single-predecessor exit bb. Do this before
8359 edge e
= single_exit (loop
);
8360 if (! single_pred_p (e
->dest
))
8362 split_loop_exit_edge (e
, true);
8363 if (dump_enabled_p ())
8364 dump_printf (MSG_NOTE
, "split exit edge\n");
8367 /* Version the loop first, if required, so the profitability check
8370 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
8372 poly_uint64 versioning_threshold
8373 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
8374 if (check_profitability
8375 && ordered_p (poly_uint64 (th
), versioning_threshold
))
8377 versioning_threshold
= ordered_max (poly_uint64 (th
),
8378 versioning_threshold
);
8379 check_profitability
= false;
8382 = vect_loop_versioning (loop_vinfo
, th
, check_profitability
,
8383 versioning_threshold
);
8384 sloop
->force_vectorize
= false;
8385 check_profitability
= false;
8388 /* Make sure there exists a single-predecessor exit bb also on the
8389 scalar loop copy. Do this after versioning but before peeling
8390 so CFG structure is fine for both scalar and if-converted loop
8391 to make slpeel_duplicate_current_defs_from_edges face matched
8392 loop closed PHI nodes on the exit. */
8393 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
8395 e
= single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
));
8396 if (! single_pred_p (e
->dest
))
8398 split_loop_exit_edge (e
, true);
8399 if (dump_enabled_p ())
8400 dump_printf (MSG_NOTE
, "split exit edge of scalar loop\n");
8404 tree niters
= vect_build_loop_niters (loop_vinfo
);
8405 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo
) = niters
;
8406 tree nitersm1
= unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo
));
8407 bool niters_no_overflow
= loop_niters_no_overflow (loop_vinfo
);
8408 epilogue
= vect_do_peeling (loop_vinfo
, niters
, nitersm1
, &niters_vector
,
8409 &step_vector
, &niters_vector_mult_vf
, th
,
8410 check_profitability
, niters_no_overflow
);
8412 if (niters_vector
== NULL_TREE
)
8414 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
8415 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
8416 && known_eq (lowest_vf
, vf
))
8419 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)),
8420 LOOP_VINFO_INT_NITERS (loop_vinfo
) / lowest_vf
);
8421 step_vector
= build_one_cst (TREE_TYPE (niters
));
8424 vect_gen_vector_loop_niters (loop_vinfo
, niters
, &niters_vector
,
8425 &step_vector
, niters_no_overflow
);
8428 /* 1) Make sure the loop header has exactly two entries
8429 2) Make sure we have a preheader basic block. */
8431 gcc_assert (EDGE_COUNT (loop
->header
->preds
) == 2);
8433 split_edge (loop_preheader_edge (loop
));
8435 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
8436 && vect_use_loop_mask_for_alignment_p (loop_vinfo
))
8437 /* This will deal with any possible peeling. */
8438 vect_prepare_for_masked_peels (loop_vinfo
);
8440 /* Schedule the SLP instances first, then handle loop vectorization
8442 if (!loop_vinfo
->slp_instances
.is_empty ())
8444 DUMP_VECT_SCOPE ("scheduling SLP instances");
8445 vect_schedule_slp (loop_vinfo
);
8448 /* FORNOW: the vectorizer supports only loops whose body consists
8449 of one basic block (header + empty latch). When the vectorizer
8450 supports more involved loop forms, the order in which the BBs are
8451 traversed needs to be reconsidered. */
8453 for (i
= 0; i
< nbbs
; i
++)
8455 basic_block bb
= bbs
[i
];
8456 stmt_vec_info stmt_info
;
8458 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
8461 gphi
*phi
= si
.phi ();
8462 if (dump_enabled_p ())
8463 dump_printf_loc (MSG_NOTE
, vect_location
,
8464 "------>vectorizing phi: %G", phi
);
8465 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
8469 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
8470 vect_loop_kill_debug_uses (loop
, stmt_info
);
8472 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
8473 && !STMT_VINFO_LIVE_P (stmt_info
))
8476 if (STMT_VINFO_VECTYPE (stmt_info
)
8478 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
)), vf
))
8479 && dump_enabled_p ())
8480 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
8482 if ((STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
8483 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
8484 || STMT_VINFO_DEF_TYPE (stmt_info
) == vect_nested_cycle
)
8485 && ! PURE_SLP_STMT (stmt_info
))
8487 if (dump_enabled_p ())
8488 dump_printf_loc (MSG_NOTE
, vect_location
, "transform phi.\n");
8489 vect_transform_stmt (stmt_info
, NULL
, NULL
, NULL
);
8493 for (gimple_stmt_iterator si
= gsi_start_bb (bb
);
8496 stmt
= gsi_stmt (si
);
8497 /* During vectorization remove existing clobber stmts. */
8498 if (gimple_clobber_p (stmt
))
8500 unlink_stmt_vdef (stmt
);
8501 gsi_remove (&si
, true);
8502 release_defs (stmt
);
8506 stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
8508 /* vector stmts created in the outer-loop during vectorization of
8509 stmts in an inner-loop may not have a stmt_info, and do not
8510 need to be vectorized. */
8511 stmt_vec_info seen_store
= NULL
;
8514 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
8516 gimple
*def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
8517 for (gimple_stmt_iterator subsi
= gsi_start (def_seq
);
8518 !gsi_end_p (subsi
); gsi_next (&subsi
))
8520 stmt_vec_info pat_stmt_info
8521 = loop_vinfo
->lookup_stmt (gsi_stmt (subsi
));
8522 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
,
8525 stmt_vec_info pat_stmt_info
8526 = STMT_VINFO_RELATED_STMT (stmt_info
);
8527 vect_transform_loop_stmt (loop_vinfo
, pat_stmt_info
, &si
,
8530 vect_transform_loop_stmt (loop_vinfo
, stmt_info
, &si
,
8536 if (STMT_VINFO_GROUPED_ACCESS (seen_store
))
8537 /* Interleaving. If IS_STORE is TRUE, the
8538 vectorization of the interleaving chain was
8539 completed - free all the stores in the chain. */
8540 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store
));
8542 /* Free the attached stmt_vec_info and remove the stmt. */
8543 loop_vinfo
->remove_stmt (stmt_info
);
8548 /* Stub out scalar statements that must not survive vectorization.
8549 Doing this here helps with grouped statements, or statements that
8550 are involved in patterns. */
8551 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
8552 !gsi_end_p (gsi
); gsi_next (&gsi
))
8554 gcall
*call
= dyn_cast
<gcall
*> (gsi_stmt (gsi
));
8555 if (call
&& gimple_call_internal_p (call
, IFN_MASK_LOAD
))
8557 tree lhs
= gimple_get_lhs (call
);
8558 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
8560 tree zero
= build_zero_cst (TREE_TYPE (lhs
));
8561 gimple
*new_stmt
= gimple_build_assign (lhs
, zero
);
8562 gsi_replace (&gsi
, new_stmt
, true);
8568 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8569 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8570 if (integer_onep (step_vector
))
8571 niters_no_overflow
= true;
8572 vect_set_loop_condition (loop
, loop_vinfo
, niters_vector
, step_vector
,
8573 niters_vector_mult_vf
, !niters_no_overflow
);
8575 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
8576 scale_profile_for_vect_loop (loop
, assumed_vf
);
8578 /* True if the final iteration might not handle a full vector's
8579 worth of scalar iterations. */
8580 bool final_iter_may_be_partial
= LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
);
8581 /* The minimum number of iterations performed by the epilogue. This
8582 is 1 when peeling for gaps because we always need a final scalar
8584 int min_epilogue_iters
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
8585 /* +1 to convert latch counts to loop iteration counts,
8586 -min_epilogue_iters to remove iterations that cannot be performed
8587 by the vector code. */
8588 int bias_for_lowest
= 1 - min_epilogue_iters
;
8589 int bias_for_assumed
= bias_for_lowest
;
8590 int alignment_npeels
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
8591 if (alignment_npeels
&& LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
8593 /* When the amount of peeling is known at compile time, the first
8594 iteration will have exactly alignment_npeels active elements.
8595 In the worst case it will have at least one. */
8596 int min_first_active
= (alignment_npeels
> 0 ? alignment_npeels
: 1);
8597 bias_for_lowest
+= lowest_vf
- min_first_active
;
8598 bias_for_assumed
+= assumed_vf
- min_first_active
;
8600 /* In these calculations the "- 1" converts loop iteration counts
8601 back to latch counts. */
8602 if (loop
->any_upper_bound
)
8603 loop
->nb_iterations_upper_bound
8604 = (final_iter_may_be_partial
8605 ? wi::udiv_ceil (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
8607 : wi::udiv_floor (loop
->nb_iterations_upper_bound
+ bias_for_lowest
,
8609 if (loop
->any_likely_upper_bound
)
8610 loop
->nb_iterations_likely_upper_bound
8611 = (final_iter_may_be_partial
8612 ? wi::udiv_ceil (loop
->nb_iterations_likely_upper_bound
8613 + bias_for_lowest
, lowest_vf
) - 1
8614 : wi::udiv_floor (loop
->nb_iterations_likely_upper_bound
8615 + bias_for_lowest
, lowest_vf
) - 1);
8616 if (loop
->any_estimate
)
8617 loop
->nb_iterations_estimate
8618 = (final_iter_may_be_partial
8619 ? wi::udiv_ceil (loop
->nb_iterations_estimate
+ bias_for_assumed
,
8621 : wi::udiv_floor (loop
->nb_iterations_estimate
+ bias_for_assumed
,
8624 if (dump_enabled_p ())
8626 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
8628 dump_printf_loc (MSG_NOTE
, vect_location
,
8629 "LOOP VECTORIZED\n");
8631 dump_printf_loc (MSG_NOTE
, vect_location
,
8632 "OUTER LOOP VECTORIZED\n");
8633 dump_printf (MSG_NOTE
, "\n");
8637 dump_printf_loc (MSG_NOTE
, vect_location
,
8638 "LOOP EPILOGUE VECTORIZED (VS=");
8639 dump_dec (MSG_NOTE
, current_vector_size
);
8640 dump_printf (MSG_NOTE
, ")\n");
8644 /* Loops vectorized with a variable factor won't benefit from
8645 unrolling/peeling. */
8646 if (!vf
.is_constant ())
8649 if (dump_enabled_p ())
8650 dump_printf_loc (MSG_NOTE
, vect_location
, "Disabling unrolling due to"
8651 " variable-length vectorization factor\n");
8653 /* Free SLP instances here because otherwise stmt reference counting
8655 slp_instance instance
;
8656 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo
), i
, instance
)
8657 vect_free_slp_instance (instance
, true);
8658 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).release ();
8659 /* Clear-up safelen field since its value is invalid after vectorization
8660 since vectorized loop can have loop-carried dependencies. */
8663 /* Don't vectorize epilogue for epilogue. */
8664 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
8667 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK
))
8672 auto_vector_sizes vector_sizes
;
8673 targetm
.vectorize
.autovectorize_vector_sizes (&vector_sizes
);
8674 unsigned int next_size
= 0;
8676 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8677 on niters already adjusted for the iterations of the prologue. */
8678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
8679 && known_eq (vf
, lowest_vf
))
8681 unsigned HOST_WIDE_INT eiters
8682 = (LOOP_VINFO_INT_NITERS (loop_vinfo
)
8683 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
));
8685 = eiters
% lowest_vf
+ LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
);
8686 epilogue
->nb_iterations_upper_bound
= eiters
- 1;
8687 epilogue
->any_upper_bound
= true;
8690 while (next_size
< vector_sizes
.length ()
8691 && !(constant_multiple_p (current_vector_size
,
8692 vector_sizes
[next_size
], &ratio
)
8693 && eiters
>= lowest_vf
/ ratio
))
8697 while (next_size
< vector_sizes
.length ()
8698 && maybe_lt (current_vector_size
, vector_sizes
[next_size
]))
8701 if (next_size
== vector_sizes
.length ())
8707 epilogue
->force_vectorize
= loop
->force_vectorize
;
8708 epilogue
->safelen
= loop
->safelen
;
8709 epilogue
->dont_vectorize
= false;
8711 /* We may need to if-convert epilogue to vectorize it. */
8712 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo
))
8713 tree_if_conversion (epilogue
);
8719 /* The code below is trying to perform simple optimization - revert
8720 if-conversion for masked stores, i.e. if the mask of a store is zero
8721 do not perform it and all stored value producers also if possible.
8729 this transformation will produce the following semi-hammock:
8731 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8733 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8734 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8735 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8736 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8737 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8738 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8743 optimize_mask_stores (struct loop
*loop
)
8745 basic_block
*bbs
= get_loop_body (loop
);
8746 unsigned nbbs
= loop
->num_nodes
;
8749 struct loop
*bb_loop
;
8750 gimple_stmt_iterator gsi
;
8752 auto_vec
<gimple
*> worklist
;
8753 auto_purge_vect_location sentinel
;
8755 vect_location
= find_loop_location (loop
);
8756 /* Pick up all masked stores in loop if any. */
8757 for (i
= 0; i
< nbbs
; i
++)
8760 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
8763 stmt
= gsi_stmt (gsi
);
8764 if (gimple_call_internal_p (stmt
, IFN_MASK_STORE
))
8765 worklist
.safe_push (stmt
);
8770 if (worklist
.is_empty ())
8773 /* Loop has masked stores. */
8774 while (!worklist
.is_empty ())
8776 gimple
*last
, *last_store
;
8779 basic_block store_bb
, join_bb
;
8780 gimple_stmt_iterator gsi_to
;
8781 tree vdef
, new_vdef
;
8786 last
= worklist
.pop ();
8787 mask
= gimple_call_arg (last
, 2);
8788 bb
= gimple_bb (last
);
8789 /* Create then_bb and if-then structure in CFG, then_bb belongs to
8790 the same loop as if_bb. It could be different to LOOP when two
8791 level loop-nest is vectorized and mask_store belongs to the inner
8793 e
= split_block (bb
, last
);
8794 bb_loop
= bb
->loop_father
;
8795 gcc_assert (loop
== bb_loop
|| flow_loop_nested_p (loop
, bb_loop
));
8797 store_bb
= create_empty_bb (bb
);
8798 add_bb_to_loop (store_bb
, bb_loop
);
8799 e
->flags
= EDGE_TRUE_VALUE
;
8800 efalse
= make_edge (bb
, store_bb
, EDGE_FALSE_VALUE
);
8801 /* Put STORE_BB on the unlikely path. */
8802 efalse
->probability
= profile_probability::unlikely ();
8803 store_bb
->count
= efalse
->count ();
8804 make_single_succ_edge (store_bb
, join_bb
, EDGE_FALLTHRU
);
8805 if (dom_info_available_p (CDI_DOMINATORS
))
8806 set_immediate_dominator (CDI_DOMINATORS
, store_bb
, bb
);
8807 if (dump_enabled_p ())
8808 dump_printf_loc (MSG_NOTE
, vect_location
,
8809 "Create new block %d to sink mask stores.",
8811 /* Create vector comparison with boolean result. */
8812 vectype
= TREE_TYPE (mask
);
8813 zero
= build_zero_cst (vectype
);
8814 stmt
= gimple_build_cond (EQ_EXPR
, mask
, zero
, NULL_TREE
, NULL_TREE
);
8815 gsi
= gsi_last_bb (bb
);
8816 gsi_insert_after (&gsi
, stmt
, GSI_SAME_STMT
);
8817 /* Create new PHI node for vdef of the last masked store:
8818 .MEM_2 = VDEF <.MEM_1>
8819 will be converted to
8820 .MEM.3 = VDEF <.MEM_1>
8821 and new PHI node will be created in join bb
8822 .MEM_2 = PHI <.MEM_1, .MEM_3>
8824 vdef
= gimple_vdef (last
);
8825 new_vdef
= make_ssa_name (gimple_vop (cfun
), last
);
8826 gimple_set_vdef (last
, new_vdef
);
8827 phi
= create_phi_node (vdef
, join_bb
);
8828 add_phi_arg (phi
, new_vdef
, EDGE_SUCC (store_bb
, 0), UNKNOWN_LOCATION
);
8830 /* Put all masked stores with the same mask to STORE_BB if possible. */
8833 gimple_stmt_iterator gsi_from
;
8834 gimple
*stmt1
= NULL
;
8836 /* Move masked store to STORE_BB. */
8838 gsi
= gsi_for_stmt (last
);
8840 /* Shift GSI to the previous stmt for further traversal. */
8842 gsi_to
= gsi_start_bb (store_bb
);
8843 gsi_move_before (&gsi_from
, &gsi_to
);
8844 /* Setup GSI_TO to the non-empty block start. */
8845 gsi_to
= gsi_start_bb (store_bb
);
8846 if (dump_enabled_p ())
8847 dump_printf_loc (MSG_NOTE
, vect_location
,
8848 "Move stmt to created bb\n%G", last
);
8849 /* Move all stored value producers if possible. */
8850 while (!gsi_end_p (gsi
))
8853 imm_use_iterator imm_iter
;
8854 use_operand_p use_p
;
8857 /* Skip debug statements. */
8858 if (is_gimple_debug (gsi_stmt (gsi
)))
8863 stmt1
= gsi_stmt (gsi
);
8864 /* Do not consider statements writing to memory or having
8865 volatile operand. */
8866 if (gimple_vdef (stmt1
)
8867 || gimple_has_volatile_ops (stmt1
))
8871 lhs
= gimple_get_lhs (stmt1
);
8875 /* LHS of vectorized stmt must be SSA_NAME. */
8876 if (TREE_CODE (lhs
) != SSA_NAME
)
8879 if (!VECTOR_TYPE_P (TREE_TYPE (lhs
)))
8881 /* Remove dead scalar statement. */
8882 if (has_zero_uses (lhs
))
8884 gsi_remove (&gsi_from
, true);
8889 /* Check that LHS does not have uses outside of STORE_BB. */
8891 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, lhs
)
8894 use_stmt
= USE_STMT (use_p
);
8895 if (is_gimple_debug (use_stmt
))
8897 if (gimple_bb (use_stmt
) != store_bb
)
8906 if (gimple_vuse (stmt1
)
8907 && gimple_vuse (stmt1
) != gimple_vuse (last_store
))
8910 /* Can move STMT1 to STORE_BB. */
8911 if (dump_enabled_p ())
8912 dump_printf_loc (MSG_NOTE
, vect_location
,
8913 "Move stmt to created bb\n%G", stmt1
);
8914 gsi_move_before (&gsi_from
, &gsi_to
);
8915 /* Shift GSI_TO for further insertion. */
8918 /* Put other masked stores with the same mask to STORE_BB. */
8919 if (worklist
.is_empty ()
8920 || gimple_call_arg (worklist
.last (), 2) != mask
8921 || worklist
.last () != stmt1
)
8923 last
= worklist
.pop ();
8925 add_phi_arg (phi
, gimple_vuse (last_store
), e
, UNKNOWN_LOCATION
);