gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 #include "langhooks.h"
59
60 /* Loop Vectorization Pass.
61
62 This pass tries to vectorize loops.
63
64 For example, the vectorizer transforms the following simple loop:
65
66 short a[N]; short b[N]; short c[N]; int i;
67
68 for (i=0; i<N; i++){
69 a[i] = b[i] + c[i];
70 }
71
   72    as if it had been manually vectorized by rewriting the source code into:
73
74 typedef int __attribute__((mode(V8HI))) v8hi;
75 short a[N]; short b[N]; short c[N]; int i;
76 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 v8hi va, vb, vc;
78
79 for (i=0; i<N/8; i++){
80 vb = pb[i];
81 vc = pc[i];
82 va = vb + vc;
83 pa[i] = va;
84 }
85
86 The main entry to this pass is vectorize_loops(), in which
87 the vectorizer applies a set of analyses on a given set of loops,
88 followed by the actual vectorization transformation for the loops that
89 had successfully passed the analysis phase.
90 Throughout this pass we make a distinction between two types of
91 data: scalars (which are represented by SSA_NAMES), and memory references
92 ("data-refs"). These two types of data require different handling both
93 during analysis and transformation. The types of data-refs that the
   94    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
95 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96 accesses are required to have a simple (consecutive) access pattern.
97
98 Analysis phase:
99 ===============
100 The driver for the analysis phase is vect_analyze_loop().
101 It applies a set of analyses, some of which rely on the scalar evolution
102 analyzer (scev) developed by Sebastian Pop.
103
104 During the analysis phase the vectorizer records some information
105 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106 loop, as well as general information about the loop as a whole, which is
107 recorded in a "loop_vec_info" struct attached to each loop.
108
109 Transformation phase:
110 =====================
111 The loop transformation phase scans all the stmts in the loop, and
112 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113 the loop that needs to be vectorized. It inserts the vector code sequence
114 just before the scalar stmt S, and records a pointer to the vector code
115 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116 attached to S). This pointer will be used for the vectorization of following
117 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118 otherwise, we rely on dead code elimination for removing it.
119
120 For example, say stmt S1 was vectorized into stmt VS1:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 S2: a = b;
125
126 To vectorize stmt S2, the vectorizer first finds the stmt that defines
127 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
129 resulting sequence would be:
130
131 VS1: vb = px[i];
132 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 VS2: va = vb;
134 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135
  136    Operands that are not SSA_NAMEs are data-refs that appear in
137 load/store operations (like 'x[i]' in S1), and are handled differently.
138
139 Target modeling:
140 =================
  141    Currently the only target-specific information that is used is the
  142    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
  143    Targets that can support different vector sizes will, for now, need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
145 flexibility will be added in the future.
146
  147    Since we only vectorize operations whose vector form can be
  148    expressed using existing tree codes, to verify that an operation is
  149    supported, the vectorizer checks the relevant optab at the relevant
  150    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
151 the value found is CODE_FOR_nothing, then there's no target support, and
152 we can't vectorize the stmt.
153
154 For additional information on this project see:
155 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 */
157
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
159 unsigned *);
160 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
161 bool *, bool *, bool);
162
163 /* Subroutine of vect_determine_vf_for_stmt that handles only one
164 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
165 may already be set for general statements (not just data refs). */
166
167 static opt_result
168 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
169 bool vectype_maybe_set_p,
170 poly_uint64 *vf)
171 {
172 gimple *stmt = stmt_info->stmt;
173
174 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
175 && !STMT_VINFO_LIVE_P (stmt_info))
176 || gimple_clobber_p (stmt))
177 {
178 if (dump_enabled_p ())
179 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
180 return opt_result::success ();
181 }
182
183 tree stmt_vectype, nunits_vectype;
184 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
185 &stmt_vectype,
186 &nunits_vectype);
187 if (!res)
188 return res;
189
190 if (stmt_vectype)
191 {
192 if (STMT_VINFO_VECTYPE (stmt_info))
  193	/* The only case when a vectype has already been set is for stmts
194 that contain a data ref, or for "pattern-stmts" (stmts generated
195 by the vectorizer to represent/replace a certain idiom). */
196 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
197 || vectype_maybe_set_p)
198 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
199 else
200 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
201 }
202
203 if (nunits_vectype)
204 vect_update_max_nunits (vf, nunits_vectype);
205
206 return opt_result::success ();
207 }
208
209 /* Subroutine of vect_determine_vectorization_factor. Set the vector
210 types of STMT_INFO and all attached pattern statements and update
211 the vectorization factor VF accordingly. Return true on success
212 or false if something prevented vectorization. */
213
214 static opt_result
215 vect_determine_vf_for_stmt (vec_info *vinfo,
216 stmt_vec_info stmt_info, poly_uint64 *vf)
217 {
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
222 if (!res)
223 return res;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: %G",
239 def_stmt_info->stmt);
240 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
241 if (!res)
242 return res;
243 }
244
245 if (dump_enabled_p ())
246 dump_printf_loc (MSG_NOTE, vect_location,
247 "==> examining pattern statement: %G",
248 stmt_info->stmt);
249 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
250 if (!res)
251 return res;
252 }
253
254 return opt_result::success ();
255 }
256
257 /* Function vect_determine_vectorization_factor
258
259 Determine the vectorization factor (VF). VF is the number of data elements
260 that are operated upon in parallel in a single iteration of the vectorized
  261    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
  262    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
263 elements can fit in a single vector register.
264
265 We currently support vectorization of loops in which all types operated upon
266 are of the same size. Therefore this function currently sets VF according to
267 the size of the types operated upon, and fails if there are multiple sizes
268 in the loop.
269
270 VF is also the factor by which the loop iterations are strip-mined, e.g.:
271 original loop:
272 for (i=0; i<N; i++){
273 a[i] = b[i] + c[i];
274 }
275
276 vectorized loop:
277 for (i=0; i<N; i+=VF){
278 a[i:VF] = b[i:VF] + c[i:VF];
279 }
280 */
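/* A worked example of the above (added for illustration, using the same
   16-byte vector size as the v8hi example in the file-header comment):
   a loop whose statements all operate on 2-byte "short" elements gets
   VF = 8, since one V8HI vector holds 8 such elements; with 8-byte
   "double" elements the VF would instead be 2.  */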
281
282 static opt_result
283 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 {
285 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
286 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
287 unsigned nbbs = loop->num_nodes;
288 poly_uint64 vectorization_factor = 1;
289 tree scalar_type = NULL_TREE;
290 gphi *phi;
291 tree vectype;
292 stmt_vec_info stmt_info;
293 unsigned i;
294
295 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296
297 for (i = 0; i < nbbs; i++)
298 {
299 basic_block bb = bbs[i];
300
301 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
302 gsi_next (&si))
303 {
304 phi = si.phi ();
305 stmt_info = loop_vinfo->lookup_stmt (phi);
306 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
308 (gimple *) phi);
309
310 gcc_assert (stmt_info);
311
312 if (STMT_VINFO_RELEVANT_P (stmt_info)
313 || STMT_VINFO_LIVE_P (stmt_info))
314 {
315 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
316 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317
318 if (dump_enabled_p ())
319 dump_printf_loc (MSG_NOTE, vect_location,
320 "get vectype for scalar type: %T\n",
321 scalar_type);
322
323 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
324 if (!vectype)
325 return opt_result::failure_at (phi,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330
331 if (dump_enabled_p ())
332 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
333 vectype);
334
335 if (dump_enabled_p ())
336 {
337 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
338 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
339 dump_printf (MSG_NOTE, "\n");
340 }
341
342 vect_update_max_nunits (&vectorization_factor, vectype);
343 }
344 }
345
346 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
347 gsi_next (&si))
348 {
349 if (is_gimple_debug (gsi_stmt (si)))
350 continue;
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 opt_result res
353 = vect_determine_vf_for_stmt (loop_vinfo,
354 stmt_info, &vectorization_factor);
355 if (!res)
356 return res;
357 }
358 }
359
360 /* TODO: Analyze cost. Decide if worth while to vectorize. */
361 if (dump_enabled_p ())
362 {
363 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
364 dump_dec (MSG_NOTE, vectorization_factor);
365 dump_printf (MSG_NOTE, "\n");
366 }
367
368 if (known_le (vectorization_factor, 1U))
369 return opt_result::failure_at (vect_location,
370 "not vectorized: unsupported data-type\n");
371 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
372 return opt_result::success ();
373 }
374
375
376 /* Function vect_is_simple_iv_evolution.
377
  378    FORNOW: A simple evolution of an induction variable in the loop is
379 considered a polynomial evolution. */
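/* For illustration (an example added here, with arbitrary names): for the
   classic induction

     for (i = 0; i < n; i++)
       p = p + 4;

   scev computes the access function of p as the chrec {p_0, +, 4}_loop,
   so the function below returns *INIT = p_0 and *STEP = 4.  */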
380
381 static bool
382 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
383 tree * step)
384 {
385 tree init_expr;
386 tree step_expr;
387 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
388 basic_block bb;
389
390 /* When there is no evolution in this loop, the evolution function
391 is not "simple". */
392 if (evolution_part == NULL_TREE)
393 return false;
394
395 /* When the evolution is a polynomial of degree >= 2
396 the evolution function is not "simple". */
397 if (tree_is_chrec (evolution_part))
398 return false;
399
400 step_expr = evolution_part;
401 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
405 step_expr, init_expr);
406
407 *init = init_expr;
408 *step = step_expr;
409
410 if (TREE_CODE (step_expr) != INTEGER_CST
411 && (TREE_CODE (step_expr) != SSA_NAME
412 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
413 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
414 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
415 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
416 || !flag_associative_math)))
417 && (TREE_CODE (step_expr) != REAL_CST
418 || !flag_associative_math))
419 {
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "step unknown.\n");
423 return false;
424 }
425
426 return true;
427 }
428
429 /* Function vect_is_nonlinear_iv_evolution
430
  431    Only support nonlinear induction for integer types:
432 1. neg
433 2. mul by constant
434 3. lshift/rshift by constant.
435
436 For neg induction, return a fake step as integer -1. */
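/* For illustration (an example added here, with arbitrary names), a loop
   such as

     for (i = 0; i < n; i++)
       {
         a[i] = x;
         x = x << 1;
       }

   has a nonlinear induction for x; the function below classifies it as
   vect_step_op_shl, with *INIT the initial value of x and *STEP the
   shift amount 1.  */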
437 static bool
438 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
439 gphi* loop_phi_node, tree *init, tree *step)
440 {
441 tree init_expr, ev_expr, result, op1, op2;
442 gimple* def;
443
444 if (gimple_phi_num_args (loop_phi_node) != 2)
445 return false;
446
447 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
448 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
449
450 /* Support nonlinear induction only for integer type. */
451 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
452 return false;
453
454 *init = init_expr;
455 result = PHI_RESULT (loop_phi_node);
456
457 if (TREE_CODE (ev_expr) != SSA_NAME
458 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
459 || !is_gimple_assign (def))
460 return false;
461
462 enum tree_code t_code = gimple_assign_rhs_code (def);
463 switch (t_code)
464 {
465 case NEGATE_EXPR:
466 if (gimple_assign_rhs1 (def) != result)
467 return false;
468 *step = build_int_cst (TREE_TYPE (init_expr), -1);
469 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
470 break;
471
472 case RSHIFT_EXPR:
473 case LSHIFT_EXPR:
474 case MULT_EXPR:
475 op1 = gimple_assign_rhs1 (def);
476 op2 = gimple_assign_rhs2 (def);
477 if (TREE_CODE (op2) != INTEGER_CST
478 || op1 != result)
479 return false;
480 *step = op2;
481 if (t_code == LSHIFT_EXPR)
482 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
483 else if (t_code == RSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
485 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
486 else
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
488 break;
489
490 default:
491 return false;
492 }
493
494 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
495 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
496
497 return true;
498 }
499
500 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
501 what we are assuming is a double reduction. For example, given
502 a structure like this:
503
504 outer1:
505 x_1 = PHI <x_4(outer2), ...>;
506 ...
507
508 inner:
509 x_2 = PHI <x_1(outer1), ...>;
510 ...
511 x_3 = ...;
512 ...
513
514 outer2:
515 x_4 = PHI <x_3(inner)>;
516 ...
517
518 outer loop analysis would treat x_1 as a double reduction phi and
519 this function would then return true for x_2. */
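/* At the source level such a double reduction typically comes from an
   accumulation carried across both loops of a nest, e.g. (illustrative
   example, arbitrary names):

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         sum += a[i][j];

   where, when the outer loop is analyzed, the outer PHI for sum plays
   the role of x_1 above and the inner-loop PHI for sum plays the role
   of x_2.  */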
520
521 static bool
522 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
523 {
524 use_operand_p use_p;
525 ssa_op_iter op_iter;
526 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
527 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
528 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
529 return true;
530 return false;
531 }
532
  533 /* Return true if PHI is a first-order recurrence.  A first-order
534 recurrence is a non-reduction recurrence relation in which the value of
535 the recurrence in the current loop iteration equals a value defined in
536 the previous iteration. */
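/* For illustration (an example added here, with arbitrary names), the PHI
   for t in

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;
         t = a[i];
       }

   is such a first-order recurrence: each iteration uses the value of
   a[i] that was stored into t by the previous iteration.  */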
537
538 static bool
539 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
540 gphi *phi)
541 {
542 /* A nested cycle isn't vectorizable as first order recurrence. */
543 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
544 return false;
545
546 /* Ensure the loop latch definition is from within the loop. */
547 edge latch = loop_latch_edge (loop);
548 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
549 if (TREE_CODE (ldef) != SSA_NAME
550 || SSA_NAME_IS_DEFAULT_DEF (ldef)
551 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
552 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
553 return false;
554
555 tree def = gimple_phi_result (phi);
556
557 /* Ensure every use_stmt of the phi node is dominated by the latch
558 definition. */
559 imm_use_iterator imm_iter;
560 use_operand_p use_p;
561 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
562 if (!is_gimple_debug (USE_STMT (use_p))
563 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
564 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
565 USE_STMT (use_p))))
566 return false;
567
568 /* First-order recurrence autovectorization needs shuffle vector. */
569 tree scalar_type = TREE_TYPE (def);
570 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
571 if (!vectype)
572 return false;
573
574 return true;
575 }
576
577 /* Function vect_analyze_scalar_cycles_1.
578
579 Examine the cross iteration def-use cycles of scalar variables
580 in LOOP. LOOP_VINFO represents the loop that is now being
581 considered for vectorization (can be LOOP, or an outer-loop
  582    enclosing LOOP).  SLP indicates whether there will be any subsequent
  583    SLP analyses.  */
584
585 static void
586 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
587 bool slp)
588 {
589 basic_block bb = loop->header;
590 tree init, step;
591 auto_vec<stmt_vec_info, 64> worklist;
592 gphi_iterator gsi;
593 bool double_reduc, reduc_chain;
594
595 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
596
597 /* First - identify all inductions. Reduction detection assumes that all the
598 inductions have been identified, therefore, this order must not be
599 changed. */
600 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
601 {
602 gphi *phi = gsi.phi ();
603 tree access_fn = NULL;
604 tree def = PHI_RESULT (phi);
605 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
606
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
609 (gimple *) phi);
610
  611	  /* Skip virtual PHIs.  The data dependences that are associated with
612 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
613 if (virtual_operand_p (def))
614 continue;
615
616 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
617
618 /* Analyze the evolution function. */
619 access_fn = analyze_scalar_evolution (loop, def);
620 if (access_fn)
621 {
622 STRIP_NOPS (access_fn);
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_NOTE, vect_location,
625 "Access function of PHI: %T\n", access_fn);
626 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
627 = initial_condition_in_loop_num (access_fn, loop->num);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
629 = evolution_part_in_loop_num (access_fn, loop->num);
630 }
631
632 if ((!access_fn
633 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
634 || !vect_is_simple_iv_evolution (loop->num, access_fn,
635 &init, &step)
636 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
637 && TREE_CODE (step) != INTEGER_CST))
  638	       /* Only handle nonlinear IVs for the same loop.  */
639 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
640 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
641 phi, &init, &step)))
642 {
643 worklist.safe_push (stmt_vinfo);
644 continue;
645 }
646
647 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
648 != NULL_TREE);
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
650
651 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
653 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
654 }
655
656
657 /* Second - identify all reductions and nested cycles. */
658 while (worklist.length () > 0)
659 {
660 stmt_vec_info stmt_vinfo = worklist.pop ();
661 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
662 tree def = PHI_RESULT (phi);
663
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
666 (gimple *) phi);
667
668 gcc_assert (!virtual_operand_p (def)
669 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
670
671 stmt_vec_info reduc_stmt_info
672 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
673 &reduc_chain, slp);
674 if (reduc_stmt_info)
675 {
676 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
677 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
678 if (double_reduc)
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "Detected double reduction.\n");
683
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
685 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
686 }
687 else
688 {
689 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
690 {
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE, vect_location,
693 "Detected vectorizable nested cycle.\n");
694
695 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
696 }
697 else
698 {
699 if (dump_enabled_p ())
700 dump_printf_loc (MSG_NOTE, vect_location,
701 "Detected reduction.\n");
702
703 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
704 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
705 /* Store the reduction cycles for possible vectorization in
706 loop-aware SLP if it was not detected as reduction
707 chain. */
708 if (! reduc_chain)
709 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
710 (reduc_stmt_info);
711 }
712 }
713 }
714 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
715 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
716 else
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "Unknown def-use cycle pattern.\n");
720 }
721 }
722
723
724 /* Function vect_analyze_scalar_cycles.
725
726 Examine the cross iteration def-use cycles of scalar variables, by
727 analyzing the loop-header PHIs of scalar variables. Classify each
728 cycle as one of the following: invariant, induction, reduction, unknown.
  729    We do that for the loop represented by LOOP_VINFO, and also for its
  730    inner-loop, if it exists.
731 Examples for scalar cycles:
732
733 Example1: reduction:
734
735 loop1:
736 for (i=0; i<N; i++)
737 sum += a[i];
738
739 Example2: induction:
740
741 loop2:
742 for (i=0; i<N; i++)
743 a[i] = i; */
744
745 static void
746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
747 {
748 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
749
750 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
751
752 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
  753      Reductions in such an inner-loop therefore have different properties than
754 the reductions in the nest that gets vectorized:
755 1. When vectorized, they are executed in the same order as in the original
756 scalar loop, so we can't change the order of computation when
757 vectorizing them.
758 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
759 current checks are too strict. */
760
761 if (loop->inner)
762 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
763 }
764
765 /* Transfer group and reduction information from STMT_INFO to its
766 pattern stmt. */
767
768 static void
769 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
770 {
771 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
772 stmt_vec_info stmtp;
773 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
774 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
775 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
776 do
777 {
778 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
779 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
780 == STMT_VINFO_DEF_TYPE (stmt_info));
781 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
782 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
783 if (stmt_info)
784 REDUC_GROUP_NEXT_ELEMENT (stmtp)
785 = STMT_VINFO_RELATED_STMT (stmt_info);
786 }
787 while (stmt_info);
788 }
789
790 /* Fixup scalar cycles that now have their stmts detected as patterns. */
791
792 static void
793 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
794 {
795 stmt_vec_info first;
796 unsigned i;
797
798 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
799 {
800 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
801 while (next)
802 {
803 if ((STMT_VINFO_IN_PATTERN_P (next)
804 != STMT_VINFO_IN_PATTERN_P (first))
805 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
806 break;
807 next = REDUC_GROUP_NEXT_ELEMENT (next);
808 }
  809       /* If all reduction chain members are well-formed patterns, adjust
810 the group to group the pattern stmts instead. */
811 if (! next
812 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
813 {
814 if (STMT_VINFO_IN_PATTERN_P (first))
815 {
816 vect_fixup_reduc_chain (first);
817 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
818 = STMT_VINFO_RELATED_STMT (first);
819 }
820 }
  821       /* If not all stmts in the chain are patterns, or if we failed
  822	     to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
  823	     it as a regular reduction instead.  */
824 else
825 {
826 stmt_vec_info vinfo = first;
827 stmt_vec_info last = NULL;
828 while (vinfo)
829 {
830 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
831 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
832 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
833 last = vinfo;
834 vinfo = next;
835 }
836 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
837 = vect_internal_def;
838 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
839 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
840 --i;
841 }
842 }
843 }
844
845 /* Function vect_get_loop_niters.
846
  847    Determine how many iterations the loop executes and place the count
848 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
849 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
850 niter information holds in ASSUMPTIONS.
851
852 Return the loop exit condition. */
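/* For example (illustrative): for a simple counted loop

     for (i = 0; i < n; i++)
       ...

   with n > 0, the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is
   n - 1 and NUMBER_OF_ITERATIONS, the number of header executions, is n.  */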
853
854
855 static gcond *
856 vect_get_loop_niters (class loop *loop, tree *assumptions,
857 tree *number_of_iterations, tree *number_of_iterationsm1)
858 {
859 edge exit = single_exit (loop);
860 class tree_niter_desc niter_desc;
861 tree niter_assumptions, niter, may_be_zero;
862 gcond *cond = get_loop_exit_condition (loop);
863
864 *assumptions = boolean_true_node;
865 *number_of_iterationsm1 = chrec_dont_know;
866 *number_of_iterations = chrec_dont_know;
867 DUMP_VECT_SCOPE ("get_loop_niters");
868
869 if (!exit)
870 return cond;
871
872 may_be_zero = NULL_TREE;
873 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
874 || chrec_contains_undetermined (niter_desc.niter))
875 return cond;
876
877 niter_assumptions = niter_desc.assumptions;
878 may_be_zero = niter_desc.may_be_zero;
879 niter = niter_desc.niter;
880
881 if (may_be_zero && integer_zerop (may_be_zero))
882 may_be_zero = NULL_TREE;
883
884 if (may_be_zero)
885 {
886 if (COMPARISON_CLASS_P (may_be_zero))
887 {
888 /* Try to combine may_be_zero with assumptions, this can simplify
889 computation of niter expression. */
890 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
891 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
892 niter_assumptions,
893 fold_build1 (TRUTH_NOT_EXPR,
894 boolean_type_node,
895 may_be_zero));
896 else
897 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
898 build_int_cst (TREE_TYPE (niter), 0),
899 rewrite_to_non_trapping_overflow (niter));
900
901 may_be_zero = NULL_TREE;
902 }
903 else if (integer_nonzerop (may_be_zero))
904 {
905 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
906 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
907 return cond;
908 }
909 else
910 return cond;
911 }
912
913 *assumptions = niter_assumptions;
914 *number_of_iterationsm1 = niter;
915
916 /* We want the number of loop header executions which is the number
917 of latch executions plus one.
918 ??? For UINT_MAX latch executions this number overflows to zero
919 for loops like do { n++; } while (n != 0); */
920 if (niter && !chrec_contains_undetermined (niter))
921 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
922 build_int_cst (TREE_TYPE (niter), 1));
923 *number_of_iterations = niter;
924
925 return cond;
926 }
927
928 /* Function bb_in_loop_p
929
930 Used as predicate for dfs order traversal of the loop bbs. */
931
932 static bool
933 bb_in_loop_p (const_basic_block bb, const void *data)
934 {
935 const class loop *const loop = (const class loop *)data;
936 if (flow_bb_inside_loop_p (loop, bb))
937 return true;
938 return false;
939 }
940
941
942 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
943 stmt_vec_info structs for all the stmts in LOOP_IN. */
944
945 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
946 : vec_info (vec_info::loop, shared),
947 loop (loop_in),
948 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
949 num_itersm1 (NULL_TREE),
950 num_iters (NULL_TREE),
951 num_iters_unchanged (NULL_TREE),
952 num_iters_assumptions (NULL_TREE),
953 vector_costs (nullptr),
954 scalar_costs (nullptr),
955 th (0),
956 versioning_threshold (0),
957 vectorization_factor (0),
958 main_loop_edge (nullptr),
959 skip_main_loop_edge (nullptr),
960 skip_this_loop_edge (nullptr),
961 reusable_accumulators (),
962 suggested_unroll_factor (1),
963 max_vectorization_factor (0),
964 mask_skip_niters (NULL_TREE),
965 rgroup_compare_type (NULL_TREE),
966 simd_if_cond (NULL_TREE),
967 partial_vector_style (vect_partial_vectors_none),
968 unaligned_dr (NULL),
969 peeling_for_alignment (0),
970 ptr_mask (0),
971 ivexpr_map (NULL),
972 scan_map (NULL),
973 slp_unrolling_factor (1),
974 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
975 vectorizable (false),
976 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
977 using_partial_vectors_p (false),
978 using_decrementing_iv_p (false),
979 using_select_vl_p (false),
980 epil_using_partial_vectors_p (false),
981 partial_load_store_bias (0),
982 peeling_for_gaps (false),
983 peeling_for_niter (false),
984 no_data_dependencies (false),
985 has_mask_store (false),
986 scalar_loop_scaling (profile_probability::uninitialized ()),
987 scalar_loop (NULL),
988 orig_loop_info (NULL)
989 {
990 /* CHECKME: We want to visit all BBs before their successors (except for
991 latch blocks, for which this assertion wouldn't hold). In the simple
  992      case of the loop forms we allow, a dfs order of the BBs would be the same
993 as reversed postorder traversal, so we are safe. */
994
995 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
996 bbs, loop->num_nodes, loop);
997 gcc_assert (nbbs == loop->num_nodes);
998
999 for (unsigned int i = 0; i < nbbs; i++)
1000 {
1001 basic_block bb = bbs[i];
1002 gimple_stmt_iterator si;
1003
1004 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1005 {
1006 gimple *phi = gsi_stmt (si);
1007 gimple_set_uid (phi, 0);
1008 add_stmt (phi);
1009 }
1010
1011 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1012 {
1013 gimple *stmt = gsi_stmt (si);
1014 gimple_set_uid (stmt, 0);
1015 if (is_gimple_debug (stmt))
1016 continue;
1017 add_stmt (stmt);
 1018	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
 1019	     the third argument is the #pragma omp simd if (x) condition: when 0,
 1020	     the loop shouldn't be vectorized; when a non-zero constant, it should
 1021	     be vectorized normally; otherwise the loop is versioned, with the
 1022	     vectorized copy executed only if the condition is non-zero at runtime. */
1023 if (loop_in->simduid
1024 && is_gimple_call (stmt)
1025 && gimple_call_internal_p (stmt)
1026 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1027 && gimple_call_num_args (stmt) >= 3
1028 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1029 && (loop_in->simduid
1030 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1031 {
1032 tree arg = gimple_call_arg (stmt, 2);
1033 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1034 simd_if_cond = arg;
1035 else
1036 gcc_assert (integer_nonzerop (arg));
1037 }
1038 }
1039 }
1040
1041 epilogue_vinfos.create (6);
1042 }
1043
1044 /* Free all levels of rgroup CONTROLS. */
1045
1046 void
1047 release_vec_loop_controls (vec<rgroup_controls> *controls)
1048 {
1049 rgroup_controls *rgc;
1050 unsigned int i;
1051 FOR_EACH_VEC_ELT (*controls, i, rgc)
1052 rgc->controls.release ();
1053 controls->release ();
1054 }
1055
1056 /* Free all memory used by the _loop_vec_info, as well as all the
1057 stmt_vec_info structs of all the stmts in the loop. */
1058
1059 _loop_vec_info::~_loop_vec_info ()
1060 {
1061 free (bbs);
1062
1063 release_vec_loop_controls (&masks.rgc_vec);
1064 release_vec_loop_controls (&lens);
1065 delete ivexpr_map;
1066 delete scan_map;
1067 epilogue_vinfos.release ();
1068 delete scalar_costs;
1069 delete vector_costs;
1070
 1071   /* When we release an epilogue vinfo that we do not intend to use,
 1072      avoid clearing AUX of the main loop, which should continue to
 1073      point to the main loop vinfo, since otherwise we'll leak that. */
1074 if (loop->aux == this)
1075 loop->aux = NULL;
1076 }
1077
1078 /* Return an invariant or register for EXPR and emit necessary
1079 computations in the LOOP_VINFO loop preheader. */
1080
1081 tree
1082 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1083 {
1084 if (is_gimple_reg (expr)
1085 || is_gimple_min_invariant (expr))
1086 return expr;
1087
1088 if (! loop_vinfo->ivexpr_map)
1089 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1090 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1091 if (! cached)
1092 {
1093 gimple_seq stmts = NULL;
1094 cached = force_gimple_operand (unshare_expr (expr),
1095 &stmts, true, NULL_TREE);
1096 if (stmts)
1097 {
1098 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1099 gsi_insert_seq_on_edge_immediate (e, stmts);
1100 }
1101 }
1102 return cached;
1103 }
1104
1105 /* Return true if we can use CMP_TYPE as the comparison type to produce
1106 all masks required to mask LOOP_VINFO. */
1107
1108 static bool
1109 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1110 {
1111 rgroup_controls *rgm;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1114 if (rgm->type != NULL_TREE
1115 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1116 cmp_type, rgm->type,
1117 OPTIMIZE_FOR_SPEED))
1118 return false;
1119 return true;
1120 }
1121
1122 /* Calculate the maximum number of scalars per iteration for every
1123 rgroup in LOOP_VINFO. */
1124
1125 static unsigned int
1126 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1127 {
1128 unsigned int res = 1;
1129 unsigned int i;
1130 rgroup_controls *rgm;
1131 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1132 res = MAX (res, rgm->max_nscalars_per_iter);
1133 return res;
1134 }
1135
1136 /* Calculate the minimum precision necessary to represent:
1137
1138 MAX_NITERS * FACTOR
1139
1140 as an unsigned integer, where MAX_NITERS is the maximum number of
1141 loop header iterations for the original scalar form of LOOP_VINFO. */
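/* For example (a worked calculation added for illustration): if the scalar
   loop runs at most 1000 iterations and FACTOR is 4, the product 4000
   requires 12 bits as an unsigned value, since 2^11 <= 4000 < 2^12.  */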
1142
1143 static unsigned
1144 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1145 {
1146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147
1148 /* Get the maximum number of iterations that is representable
1149 in the counter type. */
1150 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1151 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1152
1153 /* Get a more refined estimate for the number of iterations. */
1154 widest_int max_back_edges;
1155 if (max_loop_iterations (loop, &max_back_edges))
1156 max_ni = wi::smin (max_ni, max_back_edges + 1);
1157
1158 /* Work out how many bits we need to represent the limit. */
1159 return wi::min_precision (max_ni * factor, UNSIGNED);
1160 }
1161
1162 /* True if the loop needs peeling or partial vectors when vectorized. */
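/* For example (illustrative numbers): with a known iteration count of 100,
   a vectorization factor of 8 and no peeling for alignment or gaps,
   100 is not a multiple of 8, so the function below returns true: the
   remaining 4 iterations need an epilogue or partial vectors.  */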
1163
1164 static bool
1165 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1166 {
1167 unsigned HOST_WIDE_INT const_vf;
1168 HOST_WIDE_INT max_niter
1169 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1170
1171 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1172 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1173 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1174 (loop_vinfo));
1175
1176 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1177 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1178 {
1179 /* Work out the (constant) number of iterations that need to be
1180 peeled for reasons other than niters. */
1181 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1182 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1183 peel_niter += 1;
1184 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1185 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1186 return true;
1187 }
1188 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1189 /* ??? When peeling for gaps but not alignment, we could
1190 try to check whether the (variable) niters is known to be
1191 VF * N + 1. That's something of a niche case though. */
1192 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1193 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1194 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1195 < (unsigned) exact_log2 (const_vf))
1196 /* In case of versioning, check if the maximum number of
1197 iterations is greater than th. If they are identical,
1198 the epilogue is unnecessary. */
1199 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1200 || ((unsigned HOST_WIDE_INT) max_niter
1201 > (th / const_vf) * const_vf))))
1202 return true;
1203
1204 return false;
1205 }
1206
1207 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1208 whether we can actually generate the masks required. Return true if so,
1209 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
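/* As an illustration of the mask generation being checked for (assuming a
   target with WHILE_ULT support): with a scalar IV i counting processed
   elements, the mask for one vector is conceptually WHILE_ULT (i, niters),
   i.e. lane j is active iff i + j < niters, so the final, partial
   iteration is handled by deactivating the excess lanes.  */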
1210
1211 static bool
1212 vect_verify_full_masking (loop_vec_info loop_vinfo)
1213 {
1214 unsigned int min_ni_width;
1215
1216 /* Use a normal loop if there are no statements that need masking.
1217 This only happens in rare degenerate cases: it means that the loop
1218 has no loads, no stores, and no live-out values. */
1219 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1220 return false;
1221
1222 /* Produce the rgroup controls. */
1223 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1224 {
1225 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1226 tree vectype = mask.first;
1227 unsigned nvectors = mask.second;
1228
1229 if (masks->rgc_vec.length () < nvectors)
1230 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1231 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1232 /* The number of scalars per iteration and the number of vectors are
1233 both compile-time constants. */
1234 unsigned int nscalars_per_iter
1235 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1236 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1237
1238 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1239 {
1240 rgm->max_nscalars_per_iter = nscalars_per_iter;
1241 rgm->type = truth_type_for (vectype);
1242 rgm->factor = 1;
1243 }
1244 }
1245
1246 unsigned int max_nscalars_per_iter
1247 = vect_get_max_nscalars_per_iter (loop_vinfo);
1248
1249 /* Work out how many bits we need to represent the limit. */
1250 min_ni_width
1251 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1252
1253 /* Find a scalar mode for which WHILE_ULT is supported. */
1254 opt_scalar_int_mode cmp_mode_iter;
1255 tree cmp_type = NULL_TREE;
1256 tree iv_type = NULL_TREE;
1257 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1258 unsigned int iv_precision = UINT_MAX;
1259
1260 if (iv_limit != -1)
1261 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1262 UNSIGNED);
1263
1264 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1265 {
1266 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1267 if (cmp_bits >= min_ni_width
1268 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1269 {
1270 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1271 if (this_type
1272 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1273 {
1274 /* Although we could stop as soon as we find a valid mode,
1275 there are at least two reasons why that's not always the
1276 best choice:
1277
1278 - An IV that's Pmode or wider is more likely to be reusable
1279 in address calculations than an IV that's narrower than
1280 Pmode.
1281
1282 - Doing the comparison in IV_PRECISION or wider allows
1283 a natural 0-based IV, whereas using a narrower comparison
1284 type requires mitigations against wrap-around.
1285
1286 Conversely, if the IV limit is variable, doing the comparison
1287 in a wider type than the original type can introduce
1288 unnecessary extensions, so picking the widest valid mode
1289 is not always a good choice either.
1290
1291 Here we prefer the first IV type that's Pmode or wider,
1292 and the first comparison type that's IV_PRECISION or wider.
1293 (The comparison type must be no wider than the IV type,
1294 to avoid extensions in the vector loop.)
1295
1296 ??? We might want to try continuing beyond Pmode for ILP32
1297 targets if CMP_BITS < IV_PRECISION. */
1298 iv_type = this_type;
1299 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1300 cmp_type = this_type;
1301 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1302 break;
1303 }
1304 }
1305 }
1306
1307 if (!cmp_type)
1308 {
1309 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1310 return false;
1311 }
1312
1313 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1314 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1315 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1316 return true;
1317 }
1318
1319 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1320 whether we can actually generate AVX512 style masks. Return true if so,
1321 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1322
1323 static bool
1324 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1325 {
 1326   /* Produce a differently organized rgc_vec and check differently
 1327      whether we can produce the masks.  */
1328
1329 /* Use a normal loop if there are no statements that need masking.
1330 This only happens in rare degenerate cases: it means that the loop
1331 has no loads, no stores, and no live-out values. */
1332 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1333 return false;
1334
1335 /* For the decrementing IV we need to represent all values in
 1336      [0, niter + niter_skip] where niter_skip is the number of elements we
1337 skip in the first iteration for prologue peeling. */
1338 tree iv_type = NULL_TREE;
1339 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1340 unsigned int iv_precision = UINT_MAX;
1341 if (iv_limit != -1)
1342 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1343
1344 /* First compute the type for the IV we use to track the remaining
1345 scalar iterations. */
1346 opt_scalar_int_mode cmp_mode_iter;
1347 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1348 {
1349 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1350 if (cmp_bits >= iv_precision
1351 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1352 {
1353 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1354 if (iv_type)
1355 break;
1356 }
1357 }
1358 if (!iv_type)
1359 return false;
1360
1361 /* Produce the rgroup controls. */
1362 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1363 {
1364 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1365 tree vectype = mask.first;
1366 unsigned nvectors = mask.second;
1367
1368 /* The number of scalars per iteration and the number of vectors are
1369 both compile-time constants. */
1370 unsigned int nscalars_per_iter
1371 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1372 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1373
 1374	/* We index the rgroup_controls vector with nscalars_per_iter,
 1375	   which we keep constant, and instead have a varying nvectors,
 1376	   remembering the vector mask with the fewest nvectors (nV). */
1377 if (masks->rgc_vec.length () < nscalars_per_iter)
1378 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1379 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1380
1381 if (!rgm->type || rgm->factor > nvectors)
1382 {
1383 rgm->type = truth_type_for (vectype);
1384 rgm->compare_type = NULL_TREE;
1385 rgm->max_nscalars_per_iter = nscalars_per_iter;
1386 rgm->factor = nvectors;
1387 rgm->bias_adjusted_ctrl = NULL_TREE;
1388 }
1389 }
1390
1391 /* There is no fixed compare type we are going to use but we have to
1392 be able to get at one for each mask group. */
1393 unsigned int min_ni_width
1394 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1395
1396 bool ok = true;
1397 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1398 {
1399 tree mask_type = rgc.type;
1400 if (!mask_type)
1401 continue;
1402
1403 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1404 {
1405 ok = false;
1406 break;
1407 }
1408
 1409       /* If iv_type is usable as the compare type, use that - we can
 1410	  elide the saturation in that case. */
1411 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1412 {
1413 tree cmp_vectype
1414 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1415 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1416 rgc.compare_type = cmp_vectype;
1417 }
1418 if (!rgc.compare_type)
1419 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1420 {
1421 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1422 if (cmp_bits >= min_ni_width
1423 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1424 {
1425 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1426 if (!cmp_type)
1427 continue;
1428
1429 /* Check whether we can produce the mask with cmp_type. */
1430 tree cmp_vectype
1431 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1432 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1433 {
1434 rgc.compare_type = cmp_vectype;
1435 break;
1436 }
1437 }
1438 }
1439 if (!rgc.compare_type)
1440 {
1441 ok = false;
1442 break;
1443 }
1444 }
1445 if (!ok)
1446 {
1447 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1448 return false;
1449 }
1450
1451 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1452 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1453 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1454 return true;
1455 }
1456
 1457 /* Check whether we can use vector access with length based on precision
 1458    comparison.  So far, to keep it simple, we only allow the case that the
 1459    precision of the target-supported length is larger than the precision
1460 required by loop niters. */
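/* For example (illustrative, ignoring any load/store length bias): with
   VF = 4 and niters = 10, a length-controlled loop executes three vector
   iterations with lengths 4, 4 and 2, the final length covering only the
   remaining scalar iterations.  */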
1461
1462 static bool
1463 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1464 {
1465 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1466 return false;
1467
1468 machine_mode len_load_mode = get_len_load_store_mode
1469 (loop_vinfo->vector_mode, true).require ();
1470 machine_mode len_store_mode = get_len_load_store_mode
1471 (loop_vinfo->vector_mode, false).require ();
1472
1473 signed char partial_load_bias = internal_len_load_store_bias
1474 (IFN_LEN_LOAD, len_load_mode);
1475
1476 signed char partial_store_bias = internal_len_load_store_bias
1477 (IFN_LEN_STORE, len_store_mode);
1478
1479 gcc_assert (partial_load_bias == partial_store_bias);
1480
1481 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1482 return false;
1483
1484 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1485 len_loads with a length of zero. In order to avoid that we prohibit
1486 more than one loop length here. */
1487 if (partial_load_bias == -1
1488 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1489 return false;
1490
1491 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1492
1493 unsigned int max_nitems_per_iter = 1;
1494 unsigned int i;
1495 rgroup_controls *rgl;
1496 /* Find the maximum number of items per iteration for every rgroup. */
1497 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1498 {
1499 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1500 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1501 }
1502
1503 /* Work out how many bits we need to represent the length limit. */
1504 unsigned int min_ni_prec
1505 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1506
 1507   /* Now use the maximum of the precisions below for one suitable IV type:
1508 - the IV's natural precision
1509 - the precision needed to hold: the maximum number of scalar
1510 iterations multiplied by the scale factor (min_ni_prec above)
1511 - the Pmode precision
1512
1513 If min_ni_prec is less than the precision of the current niters,
 1514      we prefer to still use the niters type.  Prefer to use Pmode and
 1515      a wider IV to avoid narrow conversions.  */
1516
1517 unsigned int ni_prec
1518 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1519 min_ni_prec = MAX (min_ni_prec, ni_prec);
1520 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1521
1522 tree iv_type = NULL_TREE;
1523 opt_scalar_int_mode tmode_iter;
1524 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1525 {
1526 scalar_mode tmode = tmode_iter.require ();
1527 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1528
1529 /* ??? Do we really want to construct one IV whose precision exceeds
1530 BITS_PER_WORD? */
1531 if (tbits > BITS_PER_WORD)
1532 break;
1533
1534 /* Find the first available standard integral type. */
1535 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1536 {
1537 iv_type = build_nonstandard_integer_type (tbits, true);
1538 break;
1539 }
1540 }
1541
1542 if (!iv_type)
1543 {
1544 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546 "can't vectorize with length-based partial vectors"
1547 " because there is no suitable iv type.\n");
1548 return false;
1549 }
1550
1551 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1552 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1553 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1554
1555 return true;
1556 }
1557
1558 /* Calculate the cost of one scalar iteration of the loop. */
1559 static void
1560 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1561 {
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564 int nbbs = loop->num_nodes, factor;
1565 int innerloop_iters, i;
1566
1567 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1568
1569 /* Gather costs for statements in the scalar loop. */
1570
1571 /* FORNOW. */
1572 innerloop_iters = 1;
1573 if (loop->inner)
1574 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1575
1576 for (i = 0; i < nbbs; i++)
1577 {
1578 gimple_stmt_iterator si;
1579 basic_block bb = bbs[i];
1580
1581 if (bb->loop_father == loop->inner)
1582 factor = innerloop_iters;
1583 else
1584 factor = 1;
1585
1586 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1587 {
1588 gimple *stmt = gsi_stmt (si);
1589 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1590
1591 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1592 continue;
1593
1594 /* Skip stmts that are not vectorized inside the loop. */
1595 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1596 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1597 && (!STMT_VINFO_LIVE_P (vstmt_info)
1598 || !VECTORIZABLE_CYCLE_DEF
1599 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1600 continue;
1601
1602 vect_cost_for_stmt kind;
1603 if (STMT_VINFO_DATA_REF (stmt_info))
1604 {
1605 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1606 kind = scalar_load;
1607 else
1608 kind = scalar_store;
1609 }
1610 else if (vect_nop_conversion_p (stmt_info))
1611 continue;
1612 else
1613 kind = scalar_stmt;
1614
1615 /* We are using vect_prologue here to avoid scaling twice
1616 by the inner loop factor. */
1617 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1618 factor, kind, stmt_info, 0, vect_prologue);
1619 }
1620 }
1621
1622 /* Now accumulate cost. */
1623 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1624 add_stmt_costs (loop_vinfo->scalar_costs,
1625 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1626 loop_vinfo->scalar_costs->finish_cost (nullptr);
1627 }
1628
1629
1630 /* Function vect_analyze_loop_form.
1631
1632 Verify that certain CFG restrictions hold, including:
1633 - the loop has a pre-header
1634 - the loop has a single entry and exit
1635 - the loop exit condition is simple enough
 1636    - the number of iterations can be analyzed, i.e., a countable loop.  The
1637 niter could be analyzed under some assumptions. */
1638
1639 opt_result
1640 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1641 {
1642 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1643
1644 /* Different restrictions apply when we are considering an inner-most loop,
1645 vs. an outer (nested) loop.
1646 (FORNOW. May want to relax some of these restrictions in the future). */
1647
1648 info->inner_loop_cond = NULL;
1649 if (!loop->inner)
1650 {
1651 /* Inner-most loop. We currently require that the number of BBs is
1652 exactly 2 (the header and latch). Vectorizable inner-most loops
1653 look like this:
1654
1655 (pre-header)
1656 |
1657 header <--------+
1658 | | |
1659 | +--> latch --+
1660 |
1661 (exit-bb) */
1662
1663 if (loop->num_nodes != 2)
1664 return opt_result::failure_at (vect_location,
1665 "not vectorized:"
1666 " control flow in loop.\n");
1667
1668 if (empty_block_p (loop->header))
1669 return opt_result::failure_at (vect_location,
1670 "not vectorized: empty loop.\n");
1671 }
1672 else
1673 {
1674 class loop *innerloop = loop->inner;
1675 edge entryedge;
1676
1677 /* Nested loop. We currently require that the loop is doubly-nested,
1678 contains a single inner loop, and the number of BBs is exactly 5.
1679 Vectorizable outer-loops look like this:
1680
1681 (pre-header)
1682 |
1683 header <---+
1684 | |
1685 inner-loop |
1686 | |
1687 tail ------+
1688 |
1689 (exit-bb)
1690
1691 The inner-loop has the properties expected of inner-most loops
1692 as described above. */
1693
1694 if ((loop->inner)->inner || (loop->inner)->next)
1695 return opt_result::failure_at (vect_location,
1696 "not vectorized:"
1697 " multiple nested loops.\n");
1698
1699 if (loop->num_nodes != 5)
1700 return opt_result::failure_at (vect_location,
1701 "not vectorized:"
1702 " control flow in loop.\n");
1703
1704 entryedge = loop_preheader_edge (innerloop);
1705 if (entryedge->src != loop->header
1706 || !single_exit (innerloop)
1707 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1708 return opt_result::failure_at (vect_location,
1709 "not vectorized:"
1710 " unsupported outerloop form.\n");
1711
1712 /* Analyze the inner-loop. */
1713 vect_loop_form_info inner;
1714 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1715 if (!res)
1716 {
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: Bad inner loop.\n");
1720 return res;
1721 }
1722
1723 /* Don't support analyzing niter under assumptions for inner
1724 loop. */
1725 if (!integer_onep (inner.assumptions))
1726 return opt_result::failure_at (vect_location,
1727 "not vectorized: Bad inner loop.\n");
1728
1729 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1730 return opt_result::failure_at (vect_location,
1731 "not vectorized: inner-loop count not"
1732 " invariant.\n");
1733
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Considering outer-loop vectorization.\n");
1737 info->inner_loop_cond = inner.loop_cond;
1738 }
1739
1740 if (!single_exit (loop))
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized: multiple exits.\n");
1743 if (EDGE_COUNT (loop->header->preds) != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " too many incoming edges.\n");
1747
1748 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1749 that the loop is represented as a do-while (with a proper if-guard
1750 before the loop if needed), where the loop header contains all the
1751 executable statements, and the latch is empty. */
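/* Illustrative only: a source loop "for (i = 0; i < n; i++) body;" is
   expected to have been rewritten by earlier passes into the guarded
   do-while form

     if (n > 0)
       do { body; i++; } while (i < n);

   so that the exit test is the last statement of the header and the latch
   carries no code.  */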
1752 if (!empty_block_p (loop->latch)
1753 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1754 return opt_result::failure_at (vect_location,
1755 "not vectorized: latch block not empty.\n");
1756
1757 /* Make sure the exit is not abnormal. */
1758 edge e = single_exit (loop);
1759 if (e->flags & EDGE_ABNORMAL)
1760 return opt_result::failure_at (vect_location,
1761 "not vectorized:"
1762 " abnormal loop exit edge.\n");
1763
1764 info->loop_cond
1765 = vect_get_loop_niters (loop, &info->assumptions,
1766 &info->number_of_iterations,
1767 &info->number_of_iterationsm1);
1768 if (!info->loop_cond)
1769 return opt_result::failure_at
1770 (vect_location,
1771 "not vectorized: complicated exit condition.\n");
1772
1773 if (integer_zerop (info->assumptions)
1774 || !info->number_of_iterations
1775 || chrec_contains_undetermined (info->number_of_iterations))
1776 return opt_result::failure_at
1777 (info->loop_cond,
1778 "not vectorized: number of iterations cannot be computed.\n");
1779
1780 if (integer_zerop (info->number_of_iterations))
1781 return opt_result::failure_at
1782 (info->loop_cond,
1783 "not vectorized: number of iterations = 0.\n");
1784
1785 if (!(tree_fits_shwi_p (info->number_of_iterations)
1786 && tree_to_shwi (info->number_of_iterations) > 0))
1787 {
1788 if (dump_enabled_p ())
1789 {
1790 dump_printf_loc (MSG_NOTE, vect_location,
1791 "Symbolic number of iterations is ");
1792 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1793 dump_printf (MSG_NOTE, "\n");
1794 }
1795 }
1796
1797 return opt_result::success ();
1798 }
1799
1800 /* Create a loop_vec_info for LOOP with SHARED and the
1801 vect_analyze_loop_form result. */
1802
1803 loop_vec_info
1804 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1805 const vect_loop_form_info *info,
1806 loop_vec_info main_loop_info)
1807 {
1808 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1809 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1810 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1811 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1812 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1813 /* Also record the assumptions for versioning. */
1814 if (!integer_onep (info->assumptions) && !main_loop_info)
1815 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1816
1817 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1818 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1819 if (info->inner_loop_cond)
1820 {
1821 stmt_vec_info inner_loop_cond_info
1822 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1823 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824 /* If we have an estimate of the number of iterations of the inner
1825 loop, use that to limit the scale for costing; otherwise use
1826 --param vect-inner-loop-cost-factor literally. */
1827 widest_int nit;
1828 if (estimated_stmt_executions (loop->inner, &nit))
1829 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1830 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
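/* Hypothetical numbers: if the statements of the inner loop are estimated
   to execute 8 times while --param vect-inner-loop-cost-factor is 50, the
   factor used for costing is clamped to 8.  */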
1831 }
1832
1833 return loop_vinfo;
1834 }
1835
1836
1837
1838 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1839 statements update the vectorization factor. */
1840
1841 static void
1842 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1843 {
1844 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1845 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1846 int nbbs = loop->num_nodes;
1847 poly_uint64 vectorization_factor;
1848 int i;
1849
1850 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1851
1852 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1853 gcc_assert (known_ne (vectorization_factor, 0U));
1854
1855 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1856 vectorization factor of the loop is the unrolling factor required by
1857 the SLP instances. If that unrolling factor is 1, we say that we
1858 perform pure SLP on the loop - cross-iteration parallelism is not
1859 exploited. */
1860 bool only_slp_in_loop = true;
1861 for (i = 0; i < nbbs; i++)
1862 {
1863 basic_block bb = bbs[i];
1864 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1865 gsi_next (&si))
1866 {
1867 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1868 if (!stmt_info)
1869 continue;
1870 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1871 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1872 && !PURE_SLP_STMT (stmt_info))
1873 /* STMT needs both SLP and loop-based vectorization. */
1874 only_slp_in_loop = false;
1875 }
1876 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1877 gsi_next (&si))
1878 {
1879 if (is_gimple_debug (gsi_stmt (si)))
1880 continue;
1881 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1882 stmt_info = vect_stmt_to_vectorize (stmt_info);
1883 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1884 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1885 && !PURE_SLP_STMT (stmt_info))
1886 /* STMT needs both SLP and loop-based vectorization. */
1887 only_slp_in_loop = false;
1888 }
1889 }
1890
1891 if (only_slp_in_loop)
1892 {
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_NOTE, vect_location,
1895 "Loop contains only SLP stmts\n");
1896 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1897 }
1898 else
1899 {
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Loop contains SLP and non-SLP stmts\n");
1903 /* Both the vectorization factor and unroll factor have the form
1904 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1905 so they must have a common multiple. */
1906 vectorization_factor
1907 = force_common_multiple (vectorization_factor,
1908 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1909 }
1910
1911 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1912 if (dump_enabled_p ())
1913 {
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "Updating vectorization factor to ");
1916 dump_dec (MSG_NOTE, vectorization_factor);
1917 dump_printf (MSG_NOTE, ".\n");
1918 }
1919 }
1920
1921 /* Return true if STMT_INFO describes a double reduction phi and if
1922 the other phi in the reduction is also relevant for vectorization.
1923 This rejects cases such as:
1924
1925 outer1:
1926 x_1 = PHI <x_3(outer2), ...>;
1927 ...
1928
1929 inner:
1930 x_2 = ...;
1931 ...
1932
1933 outer2:
1934 x_3 = PHI <x_2(inner)>;
1935
1936 if nothing in x_2 or elsewhere makes x_1 relevant. */
1937
1938 static bool
1939 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1940 {
1941 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1942 return false;
1943
1944 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1945 }
1946
1947 /* Function vect_analyze_loop_operations.
1948
1949 Scan the loop stmts and make sure they are all vectorizable. */
1950
1951 static opt_result
1952 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1953 {
1954 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1955 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1956 int nbbs = loop->num_nodes;
1957 int i;
1958 stmt_vec_info stmt_info;
1959 bool need_to_vectorize = false;
1960 bool ok;
1961
1962 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1963
1964 auto_vec<stmt_info_for_cost> cost_vec;
1965
1966 for (i = 0; i < nbbs; i++)
1967 {
1968 basic_block bb = bbs[i];
1969
1970 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1971 gsi_next (&si))
1972 {
1973 gphi *phi = si.phi ();
1974 ok = true;
1975
1976 stmt_info = loop_vinfo->lookup_stmt (phi);
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1979 (gimple *) phi);
1980 if (virtual_operand_p (gimple_phi_result (phi)))
1981 continue;
1982
1983 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1984 (i.e., a phi in the tail of the outer-loop). */
1985 if (! is_loop_header_bb_p (bb))
1986 {
1987 /* FORNOW: we currently don't support the case that these phis
1988 are not used in the outer loop (unless it is a double reduction,
1989 i.e., this phi is a vect_reduction_def), because that case
1990 would require us to actually do something here. */
1991 if (STMT_VINFO_LIVE_P (stmt_info)
1992 && !vect_active_double_reduction_p (stmt_info))
1993 return opt_result::failure_at (phi,
1994 "Unsupported loop-closed phi"
1995 " in outer-loop.\n");
1996
1997 /* If PHI is used in the outer loop, we check that its operand
1998 is defined in the inner loop. */
1999 if (STMT_VINFO_RELEVANT_P (stmt_info))
2000 {
2001 tree phi_op;
2002
2003 if (gimple_phi_num_args (phi) != 1)
2004 return opt_result::failure_at (phi, "unsupported phi");
2005
2006 phi_op = PHI_ARG_DEF (phi, 0);
2007 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2008 if (!op_def_info)
2009 return opt_result::failure_at (phi, "unsupported phi\n");
2010
2011 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2012 && (STMT_VINFO_RELEVANT (op_def_info)
2013 != vect_used_in_outer_by_reduction))
2014 return opt_result::failure_at (phi, "unsupported phi\n");
2015
2016 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2017 || (STMT_VINFO_DEF_TYPE (stmt_info)
2018 == vect_double_reduction_def))
2019 && !vectorizable_lc_phi (loop_vinfo,
2020 stmt_info, NULL, NULL))
2021 return opt_result::failure_at (phi, "unsupported phi\n");
2022 }
2023
2024 continue;
2025 }
2026
2027 gcc_assert (stmt_info);
2028
2029 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2030 || STMT_VINFO_LIVE_P (stmt_info))
2031 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2032 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2033 /* A scalar-dependence cycle that we don't support. */
2034 return opt_result::failure_at (phi,
2035 "not vectorized:"
2036 " scalar dependence cycle.\n");
2037
2038 if (STMT_VINFO_RELEVANT_P (stmt_info))
2039 {
2040 need_to_vectorize = true;
2041 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2042 && ! PURE_SLP_STMT (stmt_info))
2043 ok = vectorizable_induction (loop_vinfo,
2044 stmt_info, NULL, NULL,
2045 &cost_vec);
2046 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2047 || (STMT_VINFO_DEF_TYPE (stmt_info)
2048 == vect_double_reduction_def)
2049 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2050 && ! PURE_SLP_STMT (stmt_info))
2051 ok = vectorizable_reduction (loop_vinfo,
2052 stmt_info, NULL, NULL, &cost_vec);
2053 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2054 == vect_first_order_recurrence)
2055 && ! PURE_SLP_STMT (stmt_info))
2056 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2057 &cost_vec);
2058 }
2059
2060 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2061 if (ok
2062 && STMT_VINFO_LIVE_P (stmt_info)
2063 && !PURE_SLP_STMT (stmt_info))
2064 ok = vectorizable_live_operation (loop_vinfo,
2065 stmt_info, NULL, NULL, NULL,
2066 -1, false, &cost_vec);
2067
2068 if (!ok)
2069 return opt_result::failure_at (phi,
2070 "not vectorized: relevant phi not "
2071 "supported: %G",
2072 static_cast <gimple *> (phi));
2073 }
2074
2075 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2076 gsi_next (&si))
2077 {
2078 gimple *stmt = gsi_stmt (si);
2079 if (!gimple_clobber_p (stmt)
2080 && !is_gimple_debug (stmt))
2081 {
2082 opt_result res
2083 = vect_analyze_stmt (loop_vinfo,
2084 loop_vinfo->lookup_stmt (stmt),
2085 &need_to_vectorize,
2086 NULL, NULL, &cost_vec);
2087 if (!res)
2088 return res;
2089 }
2090 }
2091 } /* bbs */
2092
2093 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2094
2095 /* All operations in the loop are either irrelevant (deal with loop
2096 control, or dead), or only used outside the loop and can be moved
2097 out of the loop (e.g. invariants, inductions). The loop can be
2098 optimized away by scalar optimizations. We're better off not
2099 touching this loop. */
2100 if (!need_to_vectorize)
2101 {
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "All the computation can be taken out of the loop.\n");
2105 return opt_result::failure_at
2106 (vect_location,
2107 "not vectorized: redundant loop. no profit to vectorize.\n");
2108 }
2109
2110 return opt_result::success ();
2111 }
2112
2113 /* Return true if we know that the iteration count is smaller than the
2114 vectorization factor. Return false if it isn't, or if we can't be sure
2115 either way. */
2116
2117 static bool
2118 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2119 {
2120 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2121
2122 HOST_WIDE_INT max_niter;
2123 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2124 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2125 else
2126 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2127
2128 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2129 return true;
2130
2131 return false;
2132 }
2133
2134 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2135 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2136 definitely no, or -1 if it's worth retrying. */
2137
2138 static int
2139 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2140 unsigned *suggested_unroll_factor)
2141 {
2142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2143 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2144
2145 /* Only loops that can handle partially-populated vectors can have iteration
2146 counts less than the vectorization factor. */
2147 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2148 && vect_known_niters_smaller_than_vf (loop_vinfo))
2149 {
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2152 "not vectorized: iteration count smaller than "
2153 "vectorization factor.\n");
2154 return 0;
2155 }
2156
2157 /* If we know the number of iterations we can do better: for the
2158 epilogue we can also decide whether the main loop leaves us
2159 with enough iterations, preferring a smaller vector epilogue that is
2160 then also possibly used for the case in which we skip the vector loop. */
2161 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2162 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2163 {
2164 widest_int scalar_niters
2165 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2166 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2167 {
2168 loop_vec_info orig_loop_vinfo
2169 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2170 unsigned lowest_vf
2171 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2172 int prolog_peeling = 0;
2173 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2174 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2175 if (prolog_peeling >= 0
2176 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2177 lowest_vf))
2178 {
2179 unsigned gap
2180 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2181 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2182 % lowest_vf + gap);
2183 }
2184 }
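/* A worked example with hypothetical numbers: for 100 scalar iterations,
   a main-loop VF of 16, prologue peeling of 4 and no peeling for gaps,
   the epilogue is left with (100 - 4) % 16 = 0 iterations, so it is
   rejected by the check just below.  */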
2185
2186 /* Check that the loop processes at least one full vector. */
2187 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2188 if (known_lt (scalar_niters, vf))
2189 {
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 "loop does not have enough iterations "
2193 "to support vectorization.\n");
2194 return 0;
2195 }
2196
2197 /* If we need to peel an extra epilogue iteration to handle data
2198 accesses with gaps, check that there are enough scalar iterations
2199 available.
2200
2201 The check above is redundant with this one when peeling for gaps,
2202 but the distinction is useful for diagnostics. */
2203 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2204 && known_le (scalar_niters, vf))
2205 {
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "loop does not have enough iterations "
2209 "to support peeling for gaps.\n");
2210 return 0;
2211 }
2212 }
2213
2214 /* If using the "very cheap" model, reject cases in which we'd keep
2215 a copy of the scalar code (even if we might be able to vectorize it). */
2216 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2217 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2218 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2219 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2220 {
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2223 "some scalar iterations would need to be peeled\n");
2224 return 0;
2225 }
2226
2227 int min_profitable_iters, min_profitable_estimate;
2228 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2229 &min_profitable_estimate,
2230 suggested_unroll_factor);
2231
2232 if (min_profitable_iters < 0)
2233 {
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "not vectorized: vectorization not profitable.\n");
2237 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 "not vectorized: vector version will never be "
2240 "profitable.\n");
2241 return -1;
2242 }
2243
2244 int min_scalar_loop_bound = (param_min_vect_loop_bound
2245 * assumed_vf);
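/* Hypothetical example: with --param min-vect-loop-bound=2 and an assumed
   VF of 8, MIN_SCALAR_LOOP_BOUND is 16, so the threshold TH computed below
   is at least 16 and a loop with a known iteration count below TH is
   rejected as not profitable.  */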
2246
2247 /* Use the cost model only if it is more conservative than the
2248 user-specified threshold. */
2249 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2250 min_profitable_iters);
2251
2252 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2253
2254 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2255 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2256 {
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "not vectorized: vectorization not profitable.\n");
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_NOTE, vect_location,
2262 "not vectorized: iteration count smaller than user "
2263 "specified loop bound parameter or minimum profitable "
2264 "iterations (whichever is more conservative).\n");
2265 return 0;
2266 }
2267
2268 /* The static profitability threshold min_profitable_estimate includes
2269 the cost of having to check at runtime whether the scalar loop
2270 should be used instead. If it turns out that we don't need or want
2271 such a check, the threshold we should use for the static estimate
2272 is simply the point at which the vector loop becomes more profitable
2273 than the scalar loop. */
2274 if (min_profitable_estimate > min_profitable_iters
2275 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2276 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2277 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2278 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2279 {
2280 if (dump_enabled_p ())
2281 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2282 " choice between the scalar and vector loops\n");
2283 min_profitable_estimate = min_profitable_iters;
2284 }
2285
2286 /* If the vector loop needs multiple iterations to be beneficial then
2287 things are probably too close to call, and the conservative thing
2288 would be to stick with the scalar code. */
2289 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2290 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2291 {
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 "one iteration of the vector loop would be"
2295 " more expensive than the equivalent number of"
2296 " iterations of the scalar loop\n");
2297 return 0;
2298 }
2299
2300 HOST_WIDE_INT estimated_niter;
2301
2302 /* If we are vectorizing an epilogue then we know the maximum number of
2303 scalar iterations it will cover is at least one lower than the
2304 vectorization factor of the main loop. */
2305 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2306 estimated_niter
2307 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2308 else
2309 {
2310 estimated_niter = estimated_stmt_executions_int (loop);
2311 if (estimated_niter == -1)
2312 estimated_niter = likely_max_stmt_executions_int (loop);
2313 }
2314 if (estimated_niter != -1
2315 && ((unsigned HOST_WIDE_INT) estimated_niter
2316 < MAX (th, (unsigned) min_profitable_estimate)))
2317 {
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 "not vectorized: estimated iteration count too "
2321 "small.\n");
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_NOTE, vect_location,
2324 "not vectorized: estimated iteration count smaller "
2325 "than specified loop bound parameter or minimum "
2326 "profitable iterations (whichever is more "
2327 "conservative).\n");
2328 return -1;
2329 }
2330
2331 return 1;
2332 }
2333
2334 static opt_result
2335 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2336 vec<data_reference_p> *datarefs,
2337 unsigned int *n_stmts)
2338 {
2339 *n_stmts = 0;
2340 for (unsigned i = 0; i < loop->num_nodes; i++)
2341 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2342 !gsi_end_p (gsi); gsi_next (&gsi))
2343 {
2344 gimple *stmt = gsi_stmt (gsi);
2345 if (is_gimple_debug (stmt))
2346 continue;
2347 ++(*n_stmts);
2348 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2349 NULL, 0);
2350 if (!res)
2351 {
2352 if (is_gimple_call (stmt) && loop->safelen)
2353 {
2354 tree fndecl = gimple_call_fndecl (stmt), op;
2355 if (fndecl == NULL_TREE
2356 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2357 {
2358 fndecl = gimple_call_arg (stmt, 0);
2359 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2360 fndecl = TREE_OPERAND (fndecl, 0);
2361 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2362 }
2363 if (fndecl != NULL_TREE)
2364 {
2365 cgraph_node *node = cgraph_node::get (fndecl);
2366 if (node != NULL && node->simd_clones != NULL)
2367 {
2368 unsigned int j, n = gimple_call_num_args (stmt);
2369 for (j = 0; j < n; j++)
2370 {
2371 op = gimple_call_arg (stmt, j);
2372 if (DECL_P (op)
2373 || (REFERENCE_CLASS_P (op)
2374 && get_base_address (op)))
2375 break;
2376 }
2377 op = gimple_call_lhs (stmt);
2378 /* Ignore #pragma omp declare simd functions
2379 if they don't have data references in the
2380 call stmt itself. */
2381 if (j == n
2382 && !(op
2383 && (DECL_P (op)
2384 || (REFERENCE_CLASS_P (op)
2385 && get_base_address (op)))))
2386 continue;
2387 }
2388 }
2389 }
2390 return res;
2391 }
2392 /* If dependence analysis will give up due to the limit on the
2393 number of datarefs, stop here and fail fatally. */
2394 if (datarefs->length ()
2395 > (unsigned)param_loop_max_datarefs_for_datadeps)
2396 return opt_result::failure_at (stmt, "exceeded param "
2397 "loop-max-datarefs-for-datadeps\n");
2398 }
2399 return opt_result::success ();
2400 }
2401
2402 /* Look for SLP-only access groups and turn each individual access into its own
2403 group. */
2404 static void
2405 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2406 {
2407 unsigned int i;
2408 struct data_reference *dr;
2409
2410 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2411
2412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2413 FOR_EACH_VEC_ELT (datarefs, i, dr)
2414 {
2415 gcc_assert (DR_REF (dr));
2416 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2417
2418 /* Check if the load is a part of an interleaving chain. */
2419 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2420 {
2421 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2422 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2423 unsigned int group_size = DR_GROUP_SIZE (first_element);
2424
2425 /* Check if this is an SLP-only group. */
2426 if (!STMT_SLP_TYPE (stmt_info)
2427 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2428 {
2429 /* Dissolve the group. */
2430 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2431
2432 stmt_vec_info vinfo = first_element;
2433 while (vinfo)
2434 {
2435 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2436 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2437 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2438 DR_GROUP_SIZE (vinfo) = 1;
2439 if (STMT_VINFO_STRIDED_P (first_element))
2440 DR_GROUP_GAP (vinfo) = 0;
2441 else
2442 DR_GROUP_GAP (vinfo) = group_size - 1;
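/* For instance, when a group of 4 accesses a[4*i] .. a[4*i+3] is
   dissolved, each new single-element group keeps a gap of 3 so that the
   distance to the next access of the same dataref is still described
   correctly.  */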
2443 /* Duplicate and adjust alignment info, it needs to
2444 be present on each group leader, see dr_misalignment. */
2445 if (vinfo != first_element)
2446 {
2447 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2448 dr_info2->target_alignment = dr_info->target_alignment;
2449 int misalignment = dr_info->misalignment;
2450 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2451 {
2452 HOST_WIDE_INT diff
2453 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2454 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2455 unsigned HOST_WIDE_INT align_c
2456 = dr_info->target_alignment.to_constant ();
2457 misalignment = (misalignment + diff) % align_c;
2458 }
2459 dr_info2->misalignment = misalignment;
2460 }
2461 vinfo = next;
2462 }
2463 }
2464 }
2465 }
2466 }
2467
2468 /* Determine if operating on full vectors for LOOP_VINFO might leave
2469 some scalar iterations still to do. If so, decide how we should
2470 handle those scalar iterations. The possibilities are:
2471
2472 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2473 In this case:
2474
2475 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2476 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2477 LOOP_VINFO_PEELING_FOR_NITER == false
2478
2479 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2480 to handle the remaining scalar iterations. In this case:
2481
2482 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2483 LOOP_VINFO_PEELING_FOR_NITER == true
2484
2485 There are two choices:
2486
2487 (2a) Consider vectorizing the epilogue loop at the same VF as the
2488 main loop, but using partial vectors instead of full vectors.
2489 In this case:
2490
2491 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2492
2493 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2494 In this case:
2495
2496 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2497 */
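/* An illustrative example, not tied to any particular target: for a loop
   over N int elements with VF 4, a target with predicated (masked) vector
   operations can take route (1) and execute CEIL (N / 4) partially-masked
   vector iterations with no scalar leftovers, whereas a target without
   predication takes route (2) and handles the N % 4 remaining iterations in
   an epilogue loop, which may itself be vectorized either at the same VF
   with partial vectors (2a) or at a lower VF (2b).  */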
2498
2499 opt_result
2500 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2501 {
2502 /* Determine whether there would be any scalar iterations left over. */
2503 bool need_peeling_or_partial_vectors_p
2504 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2505
2506 /* Decide whether to vectorize the loop with partial vectors. */
2507 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2508 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2509 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2510 && need_peeling_or_partial_vectors_p)
2511 {
2512 /* For partial-vector-usage=1, try to push the handling of partial
2513 vectors to the epilogue, with the main loop continuing to operate
2514 on full vectors.
2515
2516 If we are unrolling we also do not want to use partial vectors. This
2517 is to avoid the overhead of generating multiple masks and also to
2518 avoid having to execute entire iterations of FALSE-masked instructions
2519 when dealing with one or fewer full iterations.
2520
2521 ??? We could then end up failing to use partial vectors if we
2522 decide to peel iterations into a prologue, and if the main loop
2523 then ends up processing fewer than VF iterations. */
2524 if ((param_vect_partial_vector_usage == 1
2525 || loop_vinfo->suggested_unroll_factor > 1)
2526 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2527 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2528 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2529 else
2530 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2531 }
2532
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_NOTE, vect_location,
2535 "operating on %s vectors%s.\n",
2536 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2537 ? "partial" : "full",
2538 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2539 ? " for epilogue loop" : "");
2540
2541 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2542 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2543 && need_peeling_or_partial_vectors_p);
2544
2545 return opt_result::success ();
2546 }
2547
2548 /* Function vect_analyze_loop_2.
2549
2550 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2551 analyses record information in some members of LOOP_VINFO. FATAL
2552 indicates whether some analysis hits a fatal error. If the non-NULL
2553 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled
2554 with the worked-out suggested unroll factor, while a NULL pointer means
2555 we are going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2556 holds the SLP decision made when the suggested unroll factor was worked
2557 out. */
2558 static opt_result
2559 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2560 unsigned *suggested_unroll_factor,
2561 bool& slp_done_for_suggested_uf)
2562 {
2563 opt_result ok = opt_result::success ();
2564 int res;
2565 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2566 poly_uint64 min_vf = 2;
2567 loop_vec_info orig_loop_vinfo = NULL;
2568
2569 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2570 loop_vec_info of the first vectorized loop. */
2571 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2572 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2573 else
2574 orig_loop_vinfo = loop_vinfo;
2575 gcc_assert (orig_loop_vinfo);
2576
2577 /* The first group of checks is independent of the vector size. */
2578 fatal = true;
2579
2580 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2581 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2582 return opt_result::failure_at (vect_location,
2583 "not vectorized: simd if(0)\n");
2584
2585 /* Find all data references in the loop (which correspond to vdefs/vuses)
2586 and analyze their evolution in the loop. */
2587
2588 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2589
2590 /* Gather the data references and count stmts in the loop. */
2591 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2592 {
2593 opt_result res
2594 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2595 &LOOP_VINFO_DATAREFS (loop_vinfo),
2596 &LOOP_VINFO_N_STMTS (loop_vinfo));
2597 if (!res)
2598 {
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2601 "not vectorized: loop contains function "
2602 "calls or data references that cannot "
2603 "be analyzed\n");
2604 return res;
2605 }
2606 loop_vinfo->shared->save_datarefs ();
2607 }
2608 else
2609 loop_vinfo->shared->check_datarefs ();
2610
2611 /* Analyze the data references and also adjust the minimal
2612 vectorization factor according to the loads and stores. */
2613
2614 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2615 if (!ok)
2616 {
2617 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2619 "bad data references.\n");
2620 return ok;
2621 }
2622
2623 /* Check if we are applying unroll factor now. */
2624 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2625 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2626
2627 /* If the SLP decision was false when the suggested unroll factor was
2628 worked out, and we are now applying that suggested unroll factor, we
2629 can simply skip all SLP-related analyses this time. */
2630 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2631
2632 /* Classify all cross-iteration scalar data-flow cycles.
2633 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2634 vect_analyze_scalar_cycles (loop_vinfo, slp);
2635
2636 vect_pattern_recog (loop_vinfo);
2637
2638 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2639
2640 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2641 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2642
2643 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2644 if (!ok)
2645 {
2646 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2648 "bad data access.\n");
2649 return ok;
2650 }
2651
2652 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2653
2654 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2655 if (!ok)
2656 {
2657 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2659 "unexpected pattern.\n");
2660 return ok;
2661 }
2662
2663 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2664 fatal = false;
2665
2666 /* Analyze data dependences between the data-refs in the loop
2667 and adjust the maximum vectorization factor according to
2668 the dependences.
2669 FORNOW: fail at the first data dependence that we encounter. */
2670
2671 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2672 if (!ok)
2673 {
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2676 "bad data dependence.\n");
2677 return ok;
2678 }
2679 if (max_vf != MAX_VECTORIZATION_FACTOR
2680 && maybe_lt (max_vf, min_vf))
2681 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2682 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2683
2684 ok = vect_determine_vectorization_factor (loop_vinfo);
2685 if (!ok)
2686 {
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2689 "can't determine vectorization factor.\n");
2690 return ok;
2691 }
2692 if (max_vf != MAX_VECTORIZATION_FACTOR
2693 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2694 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2695
2696 /* Compute the scalar iteration cost. */
2697 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2698
2699 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2700
2701 if (slp)
2702 {
2703 /* Check the SLP opportunities in the loop, analyze and build
2704 SLP trees. */
2705 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2706 if (!ok)
2707 return ok;
2708
2709 /* If there are any SLP instances mark them as pure_slp. */
2710 slp = vect_make_slp_decision (loop_vinfo);
2711 if (slp)
2712 {
2713 /* Find stmts that need to be both vectorized and SLPed. */
2714 vect_detect_hybrid_slp (loop_vinfo);
2715
2716 /* Update the vectorization factor based on the SLP decision. */
2717 vect_update_vf_for_slp (loop_vinfo);
2718
2719 /* Optimize the SLP graph with the vectorization factor fixed. */
2720 vect_optimize_slp (loop_vinfo);
2721
2722 /* Gather the loads reachable from the SLP graph entries. */
2723 vect_gather_slp_loads (loop_vinfo);
2724 }
2725 }
2726
2727 bool saved_can_use_partial_vectors_p
2728 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2729
2730 /* We don't expect to have to roll back to anything other than an empty
2731 set of rgroups. */
2732 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2733
2734 /* This is the point where we can re-start analysis with SLP forced off. */
2735 start_over:
2736
2737 /* Apply the suggested unrolling factor; this was determined by the backend
2738 during finish_cost the first time we ran the analysis for this
2739 vector mode. */
2740 if (applying_suggested_uf)
2741 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2742
2743 /* Now the vectorization factor is final. */
2744 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2745 gcc_assert (known_ne (vectorization_factor, 0U));
2746
2747 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2748 {
2749 dump_printf_loc (MSG_NOTE, vect_location,
2750 "vectorization_factor = ");
2751 dump_dec (MSG_NOTE, vectorization_factor);
2752 dump_printf (MSG_NOTE, ", niters = %wd\n",
2753 LOOP_VINFO_INT_NITERS (loop_vinfo));
2754 }
2755
2756 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2757
2758 /* Analyze the alignment of the data-refs in the loop.
2759 Fail if a data reference is found that cannot be vectorized. */
2760
2761 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2762 if (!ok)
2763 {
2764 if (dump_enabled_p ())
2765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2766 "bad data alignment.\n");
2767 return ok;
2768 }
2769
2770 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2771 It is important to call pruning after vect_analyze_data_ref_accesses,
2772 since we use grouping information gathered by interleaving analysis. */
2773 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2774 if (!ok)
2775 return ok;
2776
2777 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2778 vectorization, since we do not want to add extra peeling or
2779 add versioning for alignment. */
2780 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2781 /* This pass will decide on using loop versioning and/or loop peeling in
2782 order to enhance the alignment of data references in the loop. */
2783 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2784 if (!ok)
2785 return ok;
2786
2787 if (slp)
2788 {
2789 /* Analyze operations in the SLP instances. Note this may
2790 remove unsupported SLP instances which makes the above
2791 SLP kind detection invalid. */
2792 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2793 vect_slp_analyze_operations (loop_vinfo);
2794 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2795 {
2796 ok = opt_result::failure_at (vect_location,
2797 "unsupported SLP instances\n");
2798 goto again;
2799 }
2800
2801 /* Check whether any load in ALL SLP instances is possibly permuted. */
2802 slp_tree load_node, slp_root;
2803 unsigned i, x;
2804 slp_instance instance;
2805 bool can_use_lanes = true;
2806 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2807 {
2808 slp_root = SLP_INSTANCE_TREE (instance);
2809 int group_size = SLP_TREE_LANES (slp_root);
2810 tree vectype = SLP_TREE_VECTYPE (slp_root);
2811 bool loads_permuted = false;
2812 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2813 {
2814 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2815 continue;
2816 unsigned j;
2817 stmt_vec_info load_info;
2818 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2819 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2820 {
2821 loads_permuted = true;
2822 break;
2823 }
2824 }
2825
2826 /* If the loads and stores can be handled with load/store-lane
2827 instructions record it and move on to the next instance. */
2828 if (loads_permuted
2829 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2830 && vect_store_lanes_supported (vectype, group_size, false))
2831 {
2832 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2833 {
2834 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2835 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2836 /* Use SLP for strided accesses (or if we can't use
2837 load-lanes). */
2838 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2839 || ! vect_load_lanes_supported
2840 (STMT_VINFO_VECTYPE (stmt_vinfo),
2841 DR_GROUP_SIZE (stmt_vinfo), false))
2842 break;
2843 }
2844
2845 can_use_lanes
2846 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2847
2848 if (can_use_lanes && dump_enabled_p ())
2849 dump_printf_loc (MSG_NOTE, vect_location,
2850 "SLP instance %p can use load/store-lanes\n",
2851 (void *) instance);
2852 }
2853 else
2854 {
2855 can_use_lanes = false;
2856 break;
2857 }
2858 }
2859
2860 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2861 with SLP disabled. */
2862 if (can_use_lanes)
2863 {
2864 ok = opt_result::failure_at (vect_location,
2865 "Built SLP cancelled: can use "
2866 "load/store-lanes\n");
2867 if (dump_enabled_p ())
2868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2869 "Built SLP cancelled: all SLP instances support "
2870 "load/store-lanes\n");
2871 goto again;
2872 }
2873 }
2874
2875 /* Dissolve SLP-only groups. */
2876 vect_dissolve_slp_only_groups (loop_vinfo);
2877
2878 /* Scan all the remaining operations in the loop that are not subject
2879 to SLP and make sure they are vectorizable. */
2880 ok = vect_analyze_loop_operations (loop_vinfo);
2881 if (!ok)
2882 {
2883 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885 "bad operation or unsupported loop bound.\n");
2886 return ok;
2887 }
2888
2889 /* For now, we don't expect to mix both the masking and the length approach
2890 for one loop, so disable the use of partial vectors if both are recorded. */
2891 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2892 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2893 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2894 {
2895 if (dump_enabled_p ())
2896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2897 "can't vectorize a loop with partial vectors"
2898 " because we don't expect to mix different"
2899 " approaches with partial vectors for the"
2900 " same loop.\n");
2901 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2902 }
2903
2904 /* If we still have the option of using partial vectors,
2905 check whether we can generate the necessary loop controls. */
2906 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2907 {
2908 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2909 {
2910 if (!vect_verify_full_masking (loop_vinfo)
2911 && !vect_verify_full_masking_avx512 (loop_vinfo))
2912 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2913 }
2914 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2915 if (!vect_verify_loop_lens (loop_vinfo))
2916 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2917 }
2918
2919 /* If we're vectorizing a loop that uses length "controls" and
2920 can iterate more than once, we apply the decrementing IV approach
2921 to the loop control. */
2922 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2923 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2924 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2925 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2926 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2927 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2928 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2929
2930 /* If a loop uses length controls and has a decrementing loop control IV,
2931 we will normally pass that IV through a MIN_EXPR to calculate the
2932 basis for the length controls. E.g. in a loop that processes one
2933 element per scalar iteration, the number of elements would be
2934 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2935
2936 This MIN_EXPR approach allows us to use pointer IVs with an invariant
2937 step, since only the final iteration of the vector loop can have
2938 inactive lanes.
2939
2940 However, some targets have a dedicated instruction for calculating the
2941 preferred length, given the total number of elements that still need to
2942 be processed. This is encapsulated in the SELECT_VL internal function.
2943
2944 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2945 to determine the basis for the length controls. However, unlike the
2946 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2947 lanes inactive in any iteration of the vector loop, not just the last
2948 iteration. This SELECT_VL approach therefore requires us to use pointer
2949 IVs with variable steps.
2950
2951 Once we've decided how many elements should be processed by one
2952 iteration of the vector loop, we need to populate the rgroup controls.
2953 If a loop has multiple rgroups, we need to make sure that those rgroups
2954 "line up" (that is, they must be consistent about which elements are
2955 active and which aren't). This is done by vect_adjust_loop_lens_control.
2956
2957 In principle, it would be possible to use vect_adjust_loop_lens_control
2958 on either the result of a MIN_EXPR or the result of a SELECT_VL.
2959 However:
2960
2961 (1) In practice, it only makes sense to use SELECT_VL when a vector
2962 operation will be controlled directly by the result. It is not
2963 worth using SELECT_VL if it would only be the input to other
2964 calculations.
2965
2966 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2967 pointer IV will need N updates by a variable amount (N-1 updates
2968 within the iteration and 1 update to move to the next iteration).
2969
2970 Because of this, we prefer to use the MIN_EXPR approach whenever there
2971 is more than one length control.
2972
2973 In addition, SELECT_VL always operates to a granularity of 1 unit.
2974 If we wanted to use it to control an SLP operation on N consecutive
2975 elements, we would need to make the SELECT_VL inputs measure scalar
2976 iterations (rather than elements) and then multiply the SELECT_VL
2977 result by N. But using SELECT_VL this way is inefficient because
2978 of (1) above.
2979
2980 Furthermore, we don't apply SELECT_VL to a single-rgroup loop when both
2981 (1) and (2) are satisfied:
2982
2983 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2984 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2985
2986 This is because SELECT_VL (with its variable step) makes SCEV analysis
2987 fail, and we would then lose the benefit of subsequent unroll
2988 optimizations. We prefer using the MIN_EXPR approach in this situation. */
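/* A rough sketch of the two styles for a single rgroup; the names REMAIN,
   PTR and SIZE are invented for illustration.  With MIN_EXPR:

     len = MIN_EXPR <remain, VF>;
     ... len-controlled vector operation on *ptr ...
     ptr = ptr + VF * SIZE;
     remain = remain - len;

   and with SELECT_VL:

     len = SELECT_VL (remain, VF);
     ... len-controlled vector operation on *ptr ...
     ptr = ptr + len * SIZE;
     remain = remain - len;

   With MIN_EXPR only the last vector iteration can have inactive lanes, so
   pointer IVs can use the invariant step VF * SIZE; with SELECT_VL any
   iteration may process fewer than VF elements, so pointer IVs have to be
   advanced by the variable LEN.  */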
2989 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2990 {
2991 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2992 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
2993 OPTIMIZE_FOR_SPEED)
2994 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
2995 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
2996 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2997 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2998 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2999 }
3000
3001 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3002 assuming that the loop will be used as a main loop. We will redo
3003 this analysis later if we instead decide to use the loop as an
3004 epilogue loop. */
3005 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3006 if (!ok)
3007 return ok;
3008
3009 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3010 to be able to handle fewer than VF scalars, or needs to have a lower VF
3011 than the main loop. */
3012 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3013 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3014 {
3015 poly_uint64 unscaled_vf
3016 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3017 orig_loop_vinfo->suggested_unroll_factor);
3018 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3019 return opt_result::failure_at (vect_location,
3020 "Vectorization factor too high for"
3021 " epilogue loop.\n");
3022 }
3023
3024 /* Check the costings of the loop make vectorizing worthwhile. */
3025 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3026 if (res < 0)
3027 {
3028 ok = opt_result::failure_at (vect_location,
3029 "Loop costings may not be worthwhile.\n");
3030 goto again;
3031 }
3032 if (!res)
3033 return opt_result::failure_at (vect_location,
3034 "Loop costings not worthwhile.\n");
3035
3036 /* If an epilogue loop is required, make sure we can create one. */
3037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3038 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3039 {
3040 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3042 if (!vect_can_advance_ivs_p (loop_vinfo)
3043 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3044 single_exit (LOOP_VINFO_LOOP
3045 (loop_vinfo))))
3046 {
3047 ok = opt_result::failure_at (vect_location,
3048 "not vectorized: can't create required "
3049 "epilog loop\n");
3050 goto again;
3051 }
3052 }
3053
3054 /* During peeling, we need to check if the number of loop iterations is
3055 enough for both the peeled prolog loop and the vector loop. This check
3056 can be merged along with threshold check of loop versioning, so
3057 increase threshold for this case if necessary.
3058
3059 If we are analyzing an epilogue we still want to check what its
3060 versioning threshold would be. If we decide to vectorize the epilogues we
3061 will want to use the lowest versioning threshold of all epilogues and main
3062 loop. This will enable us to enter a vectorized epilogue even when
3063 versioning the loop. We can't simply check whether the epilogue requires
3064 versioning though since we may have skipped some versioning checks when
3065 analyzing the epilogue. For instance, checks for alias versioning will be
3066 skipped when dealing with epilogues as we assume we already checked them
3067 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3068 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3069 {
3070 poly_uint64 niters_th = 0;
3071 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3072
3073 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3074 {
3075 /* Niters for peeled prolog loop. */
3076 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3077 {
3078 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3079 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3080 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3081 }
3082 else
3083 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3084 }
3085
3086 /* Niters for at least one iteration of vectorized loop. */
3087 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3088 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3089 /* One additional iteration because of peeling for gap. */
3090 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3091 niters_th += 1;
3092
3093 /* Use the same condition as vect_transform_loop to decide when to use
3094 the cost to determine a versioning threshold. */
3095 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3096 && ordered_p (th, niters_th))
3097 niters_th = ordered_max (poly_uint64 (th), niters_th);
3098
3099 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
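/* Hypothetical numbers: with prologue peeling of 3, VF 8, no peeling for
   gaps, full (not partial) vectors and a cost-model threshold of 20,
   NITERS_TH starts as 3 + 8 = 11 and is then raised to MAX (20, 11) = 20
   when the runtime profitability check is applied.  */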
3100 }
3101
3102 gcc_assert (known_eq (vectorization_factor,
3103 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3104
3105 slp_done_for_suggested_uf = slp;
3106
3107 /* Ok to vectorize! */
3108 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3109 return opt_result::success ();
3110
3111 again:
3112 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3113 gcc_assert (!ok);
3114
3115 /* Try again with SLP forced off but if we didn't do any SLP there is
3116 no point in re-trying. */
3117 if (!slp)
3118 return ok;
3119
3120 /* If the SLP decision was true when the suggested unroll factor was
3121 worked out, and we are now applying that suggested unroll factor, we
3122 don't need to re-try any more. */
3123 if (applying_suggested_uf && slp_done_for_suggested_uf)
3124 return ok;
3125
3126 /* If there are reduction chains re-trying will fail anyway. */
3127 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3128 return ok;
3129
3130 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3131 via interleaving or lane instructions. */
3132 slp_instance instance;
3133 slp_tree node;
3134 unsigned i, j;
3135 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3136 {
3137 stmt_vec_info vinfo;
3138 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3139 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3140 continue;
3141 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3142 unsigned int size = DR_GROUP_SIZE (vinfo);
3143 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3144 if (! vect_store_lanes_supported (vectype, size, false)
3145 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3146 && ! vect_grouped_store_supported (vectype, size))
3147 return opt_result::failure_at (vinfo->stmt,
3148 "unsupported grouped store\n");
3149 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3150 {
3151 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3152 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3153 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3154 size = DR_GROUP_SIZE (vinfo);
3155 vectype = STMT_VINFO_VECTYPE (vinfo);
3156 if (! vect_load_lanes_supported (vectype, size, false)
3157 && ! vect_grouped_load_supported (vectype, single_element_p,
3158 size))
3159 return opt_result::failure_at (vinfo->stmt,
3160 "unsupported grouped load\n");
3161 }
3162 }
3163
3164 if (dump_enabled_p ())
3165 dump_printf_loc (MSG_NOTE, vect_location,
3166 "re-trying with SLP disabled\n");
3167
3168 /* Roll back state appropriately. No SLP this time. */
3169 slp = false;
3170 /* Restore the vectorization factor as it was without SLP. */
3171 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3172 /* Free the SLP instances. */
3173 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3174 vect_free_slp_instance (instance);
3175 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3176 /* Reset SLP type to loop_vect on all stmts. */
3177 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3178 {
3179 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3180 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3181 !gsi_end_p (si); gsi_next (&si))
3182 {
3183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3184 STMT_SLP_TYPE (stmt_info) = loop_vect;
3185 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3186 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3187 {
3188 /* vectorizable_reduction adjusts reduction stmt def-types;
3189 restore them to that of the PHI. */
3190 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3191 = STMT_VINFO_DEF_TYPE (stmt_info);
3192 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3193 (STMT_VINFO_REDUC_DEF (stmt_info)))
3194 = STMT_VINFO_DEF_TYPE (stmt_info);
3195 }
3196 }
3197 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3198 !gsi_end_p (si); gsi_next (&si))
3199 {
3200 if (is_gimple_debug (gsi_stmt (si)))
3201 continue;
3202 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3203 STMT_SLP_TYPE (stmt_info) = loop_vect;
3204 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3205 {
3206 stmt_vec_info pattern_stmt_info
3207 = STMT_VINFO_RELATED_STMT (stmt_info);
3208 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3209 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3210
3211 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3212 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3213 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3214 !gsi_end_p (pi); gsi_next (&pi))
3215 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3216 = loop_vect;
3217 }
3218 }
3219 }
3220 /* Free optimized alias test DDRS. */
3221 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3222 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3223 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3224 /* Reset target cost data. */
3225 delete loop_vinfo->vector_costs;
3226 loop_vinfo->vector_costs = nullptr;
3227 /* Reset accumulated rgroup information. */
3228 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3229 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3230 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3231 /* Reset assorted flags. */
3232 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3233 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3234 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3235 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3236 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3237 = saved_can_use_partial_vectors_p;
3238
3239 goto start_over;
3240 }
3241
3242 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3243 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3244 OLD_LOOP_VINFO is better unless something specifically indicates
3245 otherwise.
3246
3247 Note that this deliberately isn't a partial order. */
3248
3249 static bool
3250 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3251 loop_vec_info old_loop_vinfo)
3252 {
3253 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3254 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3255
3256 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3257 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3258
3259 /* Always prefer a VF of loop->simdlen over any other VF. */
3260 if (loop->simdlen)
3261 {
3262 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3263 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3264 if (new_simdlen_p != old_simdlen_p)
3265 return new_simdlen_p;
3266 }
3267
3268 const auto *old_costs = old_loop_vinfo->vector_costs;
3269 const auto *new_costs = new_loop_vinfo->vector_costs;
3270 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3271 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3272
3273 return new_costs->better_main_loop_than_p (old_costs);
3274 }
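
/* A minimal illustration of the simdlen preference above, assuming a
   hypothetical loop annotated with an OpenMP simdlen clause:

     #pragma omp simd simdlen(8)
     for (int i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   Here loop->simdlen is 8, so a candidate loop_vinfo whose VF is known to
   equal 8 is preferred over any other candidate before the cost comparison
   is consulted at all.  */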
3275
3276 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3277 true if we should. */
3278
3279 static bool
3280 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3281 loop_vec_info old_loop_vinfo)
3282 {
3283 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3284 return false;
3285
3286 if (dump_enabled_p ())
3287 dump_printf_loc (MSG_NOTE, vect_location,
3288 "***** Preferring vector mode %s to vector mode %s\n",
3289 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3290 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3291 return true;
3292 }
3293
3294 /* Analyze LOOP with VECTOR_MODES[MODE_I], and as an epilogue loop if
3295 MAIN_LOOP_VINFO is not NULL. Set AUTODETECTED_VECTOR_MODE if the current mode is VOIDmode and advance
3296 MODE_I to the next mode useful to analyze.
3297 Return the loop_vinfo on success and wrapped null on failure. */
3298
3299 static opt_loop_vec_info
3300 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3301 const vect_loop_form_info *loop_form_info,
3302 loop_vec_info main_loop_vinfo,
3303 const vector_modes &vector_modes, unsigned &mode_i,
3304 machine_mode &autodetected_vector_mode,
3305 bool &fatal)
3306 {
3307 loop_vec_info loop_vinfo
3308 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3309
3310 machine_mode vector_mode = vector_modes[mode_i];
3311 loop_vinfo->vector_mode = vector_mode;
3312 unsigned int suggested_unroll_factor = 1;
3313 bool slp_done_for_suggested_uf = false;
3314
3315 /* Run the main analysis. */
3316 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3317 &suggested_unroll_factor,
3318 slp_done_for_suggested_uf);
3319 if (dump_enabled_p ())
3320 dump_printf_loc (MSG_NOTE, vect_location,
3321 "***** Analysis %s with vector mode %s\n",
3322 res ? "succeeded" : " failed",
3323 GET_MODE_NAME (loop_vinfo->vector_mode));
3324
3325 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3326 {
3327 if (dump_enabled_p ())
3328 dump_printf_loc (MSG_NOTE, vect_location,
3329 "***** Re-trying analysis for unrolling"
3330 " with unroll factor %d and slp %s.\n",
3331 suggested_unroll_factor,
3332 slp_done_for_suggested_uf ? "on" : "off");
3333 loop_vec_info unroll_vinfo
3334 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3335 unroll_vinfo->vector_mode = vector_mode;
3336 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3337 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3338 slp_done_for_suggested_uf);
3339 if (new_res)
3340 {
3341 delete loop_vinfo;
3342 loop_vinfo = unroll_vinfo;
3343 }
3344 else
3345 delete unroll_vinfo;
3346 }
3347
3348 /* Remember the autodetected vector mode. */
3349 if (vector_mode == VOIDmode)
3350 autodetected_vector_mode = loop_vinfo->vector_mode;
3351
3352 /* Advance mode_i, first skipping modes that would result in the
3353 same analysis result. */
3354 while (mode_i + 1 < vector_modes.length ()
3355 && vect_chooses_same_modes_p (loop_vinfo,
3356 vector_modes[mode_i + 1]))
3357 {
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "***** The result for vector mode %s would"
3361 " be the same\n",
3362 GET_MODE_NAME (vector_modes[mode_i + 1]));
3363 mode_i += 1;
3364 }
3365 if (mode_i + 1 < vector_modes.length ()
3366 && VECTOR_MODE_P (autodetected_vector_mode)
3367 && (related_vector_mode (vector_modes[mode_i + 1],
3368 GET_MODE_INNER (autodetected_vector_mode))
3369 == autodetected_vector_mode)
3370 && (related_vector_mode (autodetected_vector_mode,
3371 GET_MODE_INNER (vector_modes[mode_i + 1]))
3372 == vector_modes[mode_i + 1]))
3373 {
3374 if (dump_enabled_p ())
3375 dump_printf_loc (MSG_NOTE, vect_location,
3376 "***** Skipping vector mode %s, which would"
3377 " repeat the analysis for %s\n",
3378 GET_MODE_NAME (vector_modes[mode_i + 1]),
3379 GET_MODE_NAME (autodetected_vector_mode));
3380 mode_i += 1;
3381 }
3382 mode_i++;
3383
3384 if (!res)
3385 {
3386 delete loop_vinfo;
3387 if (fatal)
3388 gcc_checking_assert (main_loop_vinfo == NULL);
3389 return opt_loop_vec_info::propagate_failure (res);
3390 }
3391
3392 return opt_loop_vec_info::success (loop_vinfo);
3393 }
3394
3395 /* Function vect_analyze_loop.
3396
3397 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3398 for it. The different analyses will record information in the
3399 loop_vec_info struct. */
3400 opt_loop_vec_info
3401 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3402 {
3403 DUMP_VECT_SCOPE ("analyze_loop_nest");
3404
3405 if (loop_outer (loop)
3406 && loop_vec_info_for_loop (loop_outer (loop))
3407 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3408 return opt_loop_vec_info::failure_at (vect_location,
3409 "outer-loop already vectorized.\n");
3410
3411 if (!find_loop_nest (loop, &shared->loop_nest))
3412 return opt_loop_vec_info::failure_at
3413 (vect_location,
3414 "not vectorized: loop nest containing two or more consecutive inner"
3415 " loops cannot be vectorized\n");
3416
3417 /* Analyze the loop form. */
3418 vect_loop_form_info loop_form_info;
3419 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3420 if (!res)
3421 {
3422 if (dump_enabled_p ())
3423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3424 "bad loop form.\n");
3425 return opt_loop_vec_info::propagate_failure (res);
3426 }
3427 if (!integer_onep (loop_form_info.assumptions))
3428 {
3429 /* We consider vectorizing this loop by versioning it under
3430 some assumptions. In order to do this, we need to clear
3431 existing information computed by the scev and niter analyzers. */
3432 scev_reset_htab ();
3433 free_numbers_of_iterations_estimates (loop);
3434 /* Also set a flag for this loop so that the following scev and niter
3435 analyses are done under the assumptions. */
3436 loop_constraint_set (loop, LOOP_C_FINITE);
3437 }
3438
3439 auto_vector_modes vector_modes;
3440 /* Autodetect first vector size we try. */
3441 vector_modes.safe_push (VOIDmode);
3442 unsigned int autovec_flags
3443 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3444 loop->simdlen != 0);
3445 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3446 && !unlimited_cost_model (loop));
3447 machine_mode autodetected_vector_mode = VOIDmode;
3448 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3449 unsigned int mode_i = 0;
3450 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3451
3452 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3453 a mode has not been analyzed. */
3454 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3455 for (unsigned i = 0; i < vector_modes.length (); ++i)
3456 cached_vf_per_mode.safe_push (0);
3457
3458 /* First determine the main loop vectorization mode, either the first
3459 one that works, starting with auto-detecting the vector mode and then
3460 following the target's order of preference, or the one with the
3461 lowest cost if pick_lowest_cost_p. */
3462 while (1)
3463 {
3464 bool fatal;
3465 unsigned int last_mode_i = mode_i;
3466 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3467 failed. */
3468 cached_vf_per_mode[last_mode_i] = -1;
3469 opt_loop_vec_info loop_vinfo
3470 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3471 NULL, vector_modes, mode_i,
3472 autodetected_vector_mode, fatal);
3473 if (fatal)
3474 break;
3475
3476 if (loop_vinfo)
3477 {
3478 /* Analysis has been successful so update the VF value. The
3479 VF should always be a multiple of unroll_factor and we want to
3480 capture the original VF here. */
3481 cached_vf_per_mode[last_mode_i]
3482 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3483 loop_vinfo->suggested_unroll_factor);
3484 /* Once we hit the desired simdlen for the first time,
3485 discard any previous attempts. */
3486 if (simdlen
3487 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3488 {
3489 delete first_loop_vinfo;
3490 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3491 simdlen = 0;
3492 }
3493 else if (pick_lowest_cost_p
3494 && first_loop_vinfo
3495 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3496 {
3497 /* Pick loop_vinfo over first_loop_vinfo. */
3498 delete first_loop_vinfo;
3499 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3500 }
3501 if (first_loop_vinfo == NULL)
3502 first_loop_vinfo = loop_vinfo;
3503 else
3504 {
3505 delete loop_vinfo;
3506 loop_vinfo = opt_loop_vec_info::success (NULL);
3507 }
3508
3509 /* Commit to first_loop_vinfo if we have no reason to try
3510 alternatives. */
3511 if (!simdlen && !pick_lowest_cost_p)
3512 break;
3513 }
3514 if (mode_i == vector_modes.length ()
3515 || autodetected_vector_mode == VOIDmode)
3516 break;
3517
3518 /* Try the next biggest vector size. */
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "***** Re-trying analysis with vector mode %s\n",
3522 GET_MODE_NAME (vector_modes[mode_i]));
3523 }
3524 if (!first_loop_vinfo)
3525 return opt_loop_vec_info::propagate_failure (res);
3526
3527 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_NOTE, vect_location,
3529 "***** Choosing vector mode %s\n",
3530 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3531
3532 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3533 enabled, SIMDUID is not set, it is the innermost loop and we have
3534 either already found the loop's SIMDLEN or there was no SIMDLEN to
3535 begin with.
3536 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3537 bool vect_epilogues = (!simdlen
3538 && loop->inner == NULL
3539 && param_vect_epilogues_nomask
3540 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3541 && !loop->simduid);
3542 if (!vect_epilogues)
3543 return first_loop_vinfo;
3544
3545 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3546 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3547
3548 /* For epilogues start the analysis from the first mode. The motivation
3549 behind starting from the beginning comes from cases where the VECTOR_MODES
3550 array may contain length-agnostic and length-specific modes. Their
3551 ordering is not guaranteed, so we could end up picking a mode for the main
3552 loop that is after the epilogue's optimal mode. */
3553 vector_modes[0] = autodetected_vector_mode;
3554 mode_i = 0;
3555
3556 bool supports_partial_vectors =
3557 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3558 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3559
3560 while (1)
3561 {
3562 /* If the target does not support partial vectors we can shorten the
3563 number of modes to analyze for the epilogue as we know we can't pick a
3564 mode that would lead to a VF at least as big as the
3565 FIRST_VINFO_VF. */
3566 if (!supports_partial_vectors
3567 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3568 {
3569 mode_i++;
3570 if (mode_i == vector_modes.length ())
3571 break;
3572 continue;
3573 }
3574
3575 if (dump_enabled_p ())
3576 dump_printf_loc (MSG_NOTE, vect_location,
3577 "***** Re-trying epilogue analysis with vector "
3578 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3579
3580 bool fatal;
3581 opt_loop_vec_info loop_vinfo
3582 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3583 first_loop_vinfo,
3584 vector_modes, mode_i,
3585 autodetected_vector_mode, fatal);
3586 if (fatal)
3587 break;
3588
3589 if (loop_vinfo)
3590 {
3591 if (pick_lowest_cost_p)
3592 {
3593 /* Keep trying to roll back vectorization attempts while the
3594 loop_vec_infos they produced were worse than this one. */
3595 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3596 while (!vinfos.is_empty ()
3597 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3598 {
3599 gcc_assert (vect_epilogues);
3600 delete vinfos.pop ();
3601 }
3602 }
3603 /* For now only allow one epilogue loop. */
3604 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3605 {
3606 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3607 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3608 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3609 || maybe_ne (lowest_th, 0U));
3610 /* Keep track of the known smallest versioning
3611 threshold. */
3612 if (ordered_p (lowest_th, th))
3613 lowest_th = ordered_min (lowest_th, th);
3614 }
3615 else
3616 {
3617 delete loop_vinfo;
3618 loop_vinfo = opt_loop_vec_info::success (NULL);
3619 }
3620
3621 /* For now only allow one epilogue loop, but allow
3622 pick_lowest_cost_p to replace it, so commit to the
3623 first epilogue if we have no reason to try alternatives. */
3624 if (!pick_lowest_cost_p)
3625 break;
3626 }
3627
3628 if (mode_i == vector_modes.length ())
3629 break;
3630
3631 }
3632
3633 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3634 {
3635 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3636 if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "***** Choosing epilogue vector mode %s\n",
3639 GET_MODE_NAME
3640 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3641 }
3642
3643 return first_loop_vinfo;
3644 }
3645
3646 /* Return true if there is an in-order reduction function for CODE, storing
3647 it in *REDUC_FN if so. */
3648
3649 static bool
3650 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3651 {
3652 if (code == PLUS_EXPR)
3653 {
3654 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3655 return true;
3656 }
3657 return false;
3658 }
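
/* Illustration only: a floating-point accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   must preserve the original evaluation order unless -fassociative-math
   (implied by -ffast-math) is in effect, so it can only be vectorized
   using the in-order IFN_FOLD_LEFT_PLUS reduction selected above.  */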
3659
3660 /* Function reduction_fn_for_scalar_code
3661
3662 Input:
3663 CODE - tree_code of a reduction operation.
3664
3665 Output:
3666 REDUC_FN - the corresponding internal function to be used to reduce the
3667 vector of partial results into a single scalar result, or IFN_LAST
3668 if the operation is a supported reduction operation, but does not have
3669 such an internal function.
3670
3671 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3672
3673 bool
3674 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3675 {
3676 if (code.is_tree_code ())
3677 switch (tree_code (code))
3678 {
3679 case MAX_EXPR:
3680 *reduc_fn = IFN_REDUC_MAX;
3681 return true;
3682
3683 case MIN_EXPR:
3684 *reduc_fn = IFN_REDUC_MIN;
3685 return true;
3686
3687 case PLUS_EXPR:
3688 *reduc_fn = IFN_REDUC_PLUS;
3689 return true;
3690
3691 case BIT_AND_EXPR:
3692 *reduc_fn = IFN_REDUC_AND;
3693 return true;
3694
3695 case BIT_IOR_EXPR:
3696 *reduc_fn = IFN_REDUC_IOR;
3697 return true;
3698
3699 case BIT_XOR_EXPR:
3700 *reduc_fn = IFN_REDUC_XOR;
3701 return true;
3702
3703 case MULT_EXPR:
3704 case MINUS_EXPR:
3705 *reduc_fn = IFN_LAST;
3706 return true;
3707
3708 default:
3709 return false;
3710 }
3711 else
3712 switch (combined_fn (code))
3713 {
3714 CASE_CFN_FMAX:
3715 *reduc_fn = IFN_REDUC_FMAX;
3716 return true;
3717
3718 CASE_CFN_FMIN:
3719 *reduc_fn = IFN_REDUC_FMIN;
3720 return true;
3721
3722 default:
3723 return false;
3724 }
3725 }
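
/* Sketch of how the mapping above is used, assuming a simple max
   reduction written in C as

     int m = INT_MIN;
     for (int i = 0; i < n; i++)
       m = m > a[i] ? m : a[i];

   The detected reduction code is MAX_EXPR, so the vector of partial
   maxima is reduced to a single scalar with IFN_REDUC_MAX.  For
   MULT_EXPR and MINUS_EXPR the function returns IFN_LAST above, meaning
   the epilogue has to reduce the partial results without a dedicated
   internal function.  */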
3726
3727 /* If there is a neutral value X such that a reduction would not be affected
3728 by the introduction of additional X elements, return that X, otherwise
3729 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3730 of the scalar elements. If the reduction has just a single initial value
3731 then INITIAL_VALUE is that value, otherwise it is null. */
3732
3733 tree
3734 neutral_op_for_reduction (tree scalar_type, code_helper code,
3735 tree initial_value)
3736 {
3737 if (code.is_tree_code ())
3738 switch (tree_code (code))
3739 {
3740 case WIDEN_SUM_EXPR:
3741 case DOT_PROD_EXPR:
3742 case SAD_EXPR:
3743 case PLUS_EXPR:
3744 case MINUS_EXPR:
3745 case BIT_IOR_EXPR:
3746 case BIT_XOR_EXPR:
3747 return build_zero_cst (scalar_type);
3748
3749 case MULT_EXPR:
3750 return build_one_cst (scalar_type);
3751
3752 case BIT_AND_EXPR:
3753 return build_all_ones_cst (scalar_type);
3754
3755 case MAX_EXPR:
3756 case MIN_EXPR:
3757 return initial_value;
3758
3759 default:
3760 return NULL_TREE;
3761 }
3762 else
3763 switch (combined_fn (code))
3764 {
3765 CASE_CFN_FMIN:
3766 CASE_CFN_FMAX:
3767 return initial_value;
3768
3769 default:
3770 return NULL_TREE;
3771 }
3772 }
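
/* Example of why the neutral value matters, assuming a hypothetical
   product reduction:

     int p = 1;
     for (int i = 0; i < n; i++)
       p *= a[i];

   Padding the final vector with the MULT_EXPR neutral value 1 leaves
   the product unchanged, just as padding a sum with 0 or a bitwise AND
   with all-ones does.  For MIN/MAX the initial value itself is the only
   safe filler, which is why INITIAL_VALUE is returned above.  */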
3773
3774 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3775 STMT is printed with a message MSG. */
3776
3777 static void
3778 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3779 {
3780 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3781 }
3782
3783 /* Return true if we need an in-order reduction for operation CODE
3784 on type TYPE. */
3786
3787 bool
3788 needs_fold_left_reduction_p (tree type, code_helper code)
3789 {
3790 /* CHECKME: check for !flag_finite_math_only too? */
3791 if (SCALAR_FLOAT_TYPE_P (type))
3792 {
3793 if (code.is_tree_code ())
3794 switch (tree_code (code))
3795 {
3796 case MIN_EXPR:
3797 case MAX_EXPR:
3798 return false;
3799
3800 default:
3801 return !flag_associative_math;
3802 }
3803 else
3804 switch (combined_fn (code))
3805 {
3806 CASE_CFN_FMIN:
3807 CASE_CFN_FMAX:
3808 return false;
3809
3810 default:
3811 return !flag_associative_math;
3812 }
3813 }
3814
3815 if (INTEGRAL_TYPE_P (type))
3816 return (!code.is_tree_code ()
3817 || !operation_no_trapping_overflow (type, tree_code (code)));
3818
3819 if (SAT_FIXED_POINT_TYPE_P (type))
3820 return true;
3821
3822 return false;
3823 }
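
/* Illustrative cases for the checks above, not an exhaustive list: a
   float sum needs an in-order reduction unless -fassociative-math is
   given; an unsigned integer sum

     unsigned int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   never does, because unsigned overflow wraps; and a saturating
   fixed-point accumulation always does, since reassociating it can
   change where saturation happens.  */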
3824
3825 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3826 has a handled computation expression. Store the main reduction
3827 operation in *CODE. */
3828
3829 static bool
3830 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3831 tree loop_arg, code_helper *code,
3832 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3833 {
3834 auto_bitmap visited;
3835 tree lookfor = PHI_RESULT (phi);
3836 ssa_op_iter curri;
3837 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3838 while (USE_FROM_PTR (curr) != loop_arg)
3839 curr = op_iter_next_use (&curri);
3840 curri.i = curri.numops;
3841 do
3842 {
3843 path.safe_push (std::make_pair (curri, curr));
3844 tree use = USE_FROM_PTR (curr);
3845 if (use == lookfor)
3846 break;
3847 gimple *def = SSA_NAME_DEF_STMT (use);
3848 if (gimple_nop_p (def)
3849 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3850 {
3851 pop:
3852 do
3853 {
3854 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3855 curri = x.first;
3856 curr = x.second;
3857 do
3858 curr = op_iter_next_use (&curri);
3859 /* Skip already visited or non-SSA operands (from iterating
3860 over PHI args). */
3861 while (curr != NULL_USE_OPERAND_P
3862 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3863 || ! bitmap_set_bit (visited,
3864 SSA_NAME_VERSION
3865 (USE_FROM_PTR (curr)))));
3866 }
3867 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3868 if (curr == NULL_USE_OPERAND_P)
3869 break;
3870 }
3871 else
3872 {
3873 if (gimple_code (def) == GIMPLE_PHI)
3874 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3875 else
3876 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3877 while (curr != NULL_USE_OPERAND_P
3878 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3879 || ! bitmap_set_bit (visited,
3880 SSA_NAME_VERSION
3881 (USE_FROM_PTR (curr)))))
3882 curr = op_iter_next_use (&curri);
3883 if (curr == NULL_USE_OPERAND_P)
3884 goto pop;
3885 }
3886 }
3887 while (1);
3888 if (dump_file && (dump_flags & TDF_DETAILS))
3889 {
3890 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3891 unsigned i;
3892 std::pair<ssa_op_iter, use_operand_p> *x;
3893 FOR_EACH_VEC_ELT (path, i, x)
3894 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3895 dump_printf (MSG_NOTE, "\n");
3896 }
3897
3898 /* Check whether the reduction path detected is valid. */
3899 bool fail = path.length () == 0;
3900 bool neg = false;
3901 int sign = -1;
3902 *code = ERROR_MARK;
3903 for (unsigned i = 1; i < path.length (); ++i)
3904 {
3905 gimple *use_stmt = USE_STMT (path[i].second);
3906 gimple_match_op op;
3907 if (!gimple_extract_op (use_stmt, &op))
3908 {
3909 fail = true;
3910 break;
3911 }
3912 unsigned int opi = op.num_ops;
3913 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3914 {
3915 /* The following makes sure we can compute the operand index
3916 easily, plus it mostly disallows chaining via COND_EXPR condition
3917 operands. */
3918 for (opi = 0; opi < op.num_ops; ++opi)
3919 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3920 break;
3921 }
3922 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3923 {
3924 for (opi = 0; opi < op.num_ops; ++opi)
3925 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3926 break;
3927 }
3928 if (opi == op.num_ops)
3929 {
3930 fail = true;
3931 break;
3932 }
3933 op.code = canonicalize_code (op.code, op.type);
3934 if (op.code == MINUS_EXPR)
3935 {
3936 op.code = PLUS_EXPR;
3937 /* Track whether we negate the reduction value each iteration. */
3938 if (op.ops[1] == op.ops[opi])
3939 neg = ! neg;
3940 }
3941 if (CONVERT_EXPR_CODE_P (op.code)
3942 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3943 ;
3944 else if (*code == ERROR_MARK)
3945 {
3946 *code = op.code;
3947 sign = TYPE_SIGN (op.type);
3948 }
3949 else if (op.code != *code)
3950 {
3951 fail = true;
3952 break;
3953 }
3954 else if ((op.code == MIN_EXPR
3955 || op.code == MAX_EXPR)
3956 && sign != TYPE_SIGN (op.type))
3957 {
3958 fail = true;
3959 break;
3960 }
3961 /* Check there's only a single stmt the op is used on. For the
3962 non-value-changing tail and the last stmt allow out-of-loop uses.
3963 ??? We could relax this and handle arbitrary live stmts by
3964 forcing a scalar epilogue for example. */
3965 imm_use_iterator imm_iter;
3966 gimple *op_use_stmt;
3967 unsigned cnt = 0;
3968 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3969 if (!is_gimple_debug (op_use_stmt)
3970 && (*code != ERROR_MARK
3971 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3972 {
3973 /* We want to allow x + x but not x < 1 ? x : 2. */
3974 if (is_gimple_assign (op_use_stmt)
3975 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3976 {
3977 use_operand_p use_p;
3978 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3979 cnt++;
3980 }
3981 else
3982 cnt++;
3983 }
3984 if (cnt != 1)
3985 {
3986 fail = true;
3987 break;
3988 }
3989 }
3990 return ! fail && ! neg && *code != ERROR_MARK;
3991 }
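
/* A sketch of what the path check above accepts and rejects, using
   hypothetical loops:

     for (int i = 0; i < n; i++)
       s = s - a[i];    // MINUS_EXPR is canonicalized to PLUS_EXPR; OK

     for (int i = 0; i < n; i++)
       s = a[i] - s;    // negates the reduction value each iteration,
                        // so NEG stays set and the path is rejected

   Mixing different operations along the path (say an add in one stmt
   and a max in another) also fails, because *CODE must stay
   consistent.  */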
3992
3993 bool
3994 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3995 tree loop_arg, enum tree_code code)
3996 {
3997 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3998 code_helper code_;
3999 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4000 && code_ == code);
4001 }
4002
4003
4004
4005 /* Function vect_is_simple_reduction
4006
4007 (1) Detect a cross-iteration def-use cycle that represents a simple
4008 reduction computation. We look for the following pattern:
4009
4010 loop_header:
4011 a1 = phi < a0, a2 >
4012 a3 = ...
4013 a2 = operation (a3, a1)
4014
4015 or
4016
4017 a3 = ...
4018 loop_header:
4019 a1 = phi < a0, a2 >
4020 a2 = operation (a3, a1)
4021
4022 such that:
4023 1. operation is commutative and associative and it is safe to
4024 change the order of the computation
4025 2. no uses for a2 in the loop (a2 is used out of the loop)
4026 3. no uses of a1 in the loop besides the reduction operation
4027 4. no uses of a1 outside the loop.
4028
4029 Conditions 1,4 are tested here.
4030 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4031
4032 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4033 nested cycles.
4034
4035 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4036 reductions:
4037
4038 a1 = phi < a0, a2 >
4039 inner loop (def of a3)
4040 a2 = phi < a3 >
4041
4042 (4) Detect condition expressions, ie:
4043 for (int i = 0; i < N; i++)
4044 if (a[i] < val)
4045 ret_val = a[i];
4046
4047 */
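
/* As a concrete illustration of pattern (1) above, the C loop

     int sum = 0;
     for (int i = 0; i < n; i++)
       sum += a[i];

   yields a header PHI sum_1 = PHI <sum_0, sum_2> and a latch definition
   sum_2 = sum_1 + a[i], which is exactly the cross-iteration def-use
   cycle the function below looks for.  (This is an example only, not
   part of the detection logic.)  */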
4048
4049 static stmt_vec_info
4050 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4051 bool *double_reduc, bool *reduc_chain_p, bool slp)
4052 {
4053 gphi *phi = as_a <gphi *> (phi_info->stmt);
4054 gimple *phi_use_stmt = NULL;
4055 imm_use_iterator imm_iter;
4056 use_operand_p use_p;
4057
4058 *double_reduc = false;
4059 *reduc_chain_p = false;
4060 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4061
4062 tree phi_name = PHI_RESULT (phi);
4063 /* ??? If there are no uses of the PHI result the inner loop reduction
4064 won't be detected as a possible double reduction by vectorizable_reduction
4065 because that tries to walk the PHI arg from the preheader edge which
4066 can be constant. See PR60382. */
4067 if (has_zero_uses (phi_name))
4068 return NULL;
4069 class loop *loop = (gimple_bb (phi))->loop_father;
4070 unsigned nphi_def_loop_uses = 0;
4071 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4072 {
4073 gimple *use_stmt = USE_STMT (use_p);
4074 if (is_gimple_debug (use_stmt))
4075 continue;
4076
4077 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4078 {
4079 if (dump_enabled_p ())
4080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4081 "intermediate value used outside loop.\n");
4082
4083 return NULL;
4084 }
4085
4086 nphi_def_loop_uses++;
4087 phi_use_stmt = use_stmt;
4088 }
4089
4090 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4091 if (TREE_CODE (latch_def) != SSA_NAME)
4092 {
4093 if (dump_enabled_p ())
4094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4095 "reduction: not ssa_name: %T\n", latch_def);
4096 return NULL;
4097 }
4098
4099 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4100 if (!def_stmt_info
4101 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4102 return NULL;
4103
4104 bool nested_in_vect_loop
4105 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4106 unsigned nlatch_def_loop_uses = 0;
4107 auto_vec<gphi *, 3> lcphis;
4108 bool inner_loop_of_double_reduc = false;
4109 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4110 {
4111 gimple *use_stmt = USE_STMT (use_p);
4112 if (is_gimple_debug (use_stmt))
4113 continue;
4114 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4115 nlatch_def_loop_uses++;
4116 else
4117 {
4118 /* We can have more than one loop-closed PHI. */
4119 lcphis.safe_push (as_a <gphi *> (use_stmt));
4120 if (nested_in_vect_loop
4121 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4122 == vect_double_reduction_def))
4123 inner_loop_of_double_reduc = true;
4124 }
4125 }
4126
4127 /* If we are vectorizing an inner reduction, we execute it in the
4128 original order only in the case that we are not dealing with a
4129 double reduction. */
4130 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4131 {
4132 if (dump_enabled_p ())
4133 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4134 "detected nested cycle: ");
4135 return def_stmt_info;
4136 }
4137
4138 /* When the inner loop of a double reduction ends up with more than
4139 one loop-closed PHI we have failed to classify alternate such
4140 PHIs as double reduction, leading to wrong code. See PR103237. */
4141 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4142 {
4143 if (dump_enabled_p ())
4144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4145 "unhandle double reduction\n");
4146 return NULL;
4147 }
4148
4149 /* If this isn't a nested cycle or if the nested cycle reduction value
4150 is used outside of the inner loop we cannot handle uses of the reduction
4151 value. */
4152 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4153 {
4154 if (dump_enabled_p ())
4155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4156 "reduction used in loop.\n");
4157 return NULL;
4158 }
4159
4160 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4161 defined in the inner loop. */
4162 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4163 {
4164 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4165 if (gimple_phi_num_args (def_stmt) != 1
4166 || TREE_CODE (op1) != SSA_NAME)
4167 {
4168 if (dump_enabled_p ())
4169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4170 "unsupported phi node definition.\n");
4171
4172 return NULL;
4173 }
4174
4175 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4176 and the latch definition op1. */
4177 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4178 if (gimple_bb (def1)
4179 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4180 && loop->inner
4181 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4182 && (is_gimple_assign (def1) || is_gimple_call (def1))
4183 && is_a <gphi *> (phi_use_stmt)
4184 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4185 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4186 loop_latch_edge (loop->inner))))
4187 {
4188 if (dump_enabled_p ())
4189 report_vect_op (MSG_NOTE, def_stmt,
4190 "detected double reduction: ");
4191
4192 *double_reduc = true;
4193 return def_stmt_info;
4194 }
4195
4196 return NULL;
4197 }
4198
4199 /* Look for the expression computing latch_def from the loop PHI result. */
4200 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4201 code_helper code;
4202 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4203 path))
4204 {
4205 STMT_VINFO_REDUC_CODE (phi_info) = code;
4206 if (code == COND_EXPR && !nested_in_vect_loop)
4207 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4208
4209 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4210 reduction chain for which the additional restriction is that
4211 all operations in the chain are the same. */
4212 auto_vec<stmt_vec_info, 8> reduc_chain;
4213 unsigned i;
4214 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4215 for (i = path.length () - 1; i >= 1; --i)
4216 {
4217 gimple *stmt = USE_STMT (path[i].second);
4218 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4219 gimple_match_op op;
4220 if (!gimple_extract_op (stmt, &op))
4221 gcc_unreachable ();
4222 if (gassign *assign = dyn_cast<gassign *> (stmt))
4223 STMT_VINFO_REDUC_IDX (stmt_info)
4224 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4225 else
4226 {
4227 gcall *call = as_a<gcall *> (stmt);
4228 STMT_VINFO_REDUC_IDX (stmt_info)
4229 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4230 }
4231 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4232 && (i == 1 || i == path.length () - 1));
4233 if ((op.code != code && !leading_conversion)
4234 /* We can only handle the final value in epilogue
4235 generation for reduction chains. */
4236 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4237 is_slp_reduc = false;
4238 /* For reduction chains we support trailing/leading
4239 conversions. We do not store those in the actual chain. */
4240 if (leading_conversion)
4241 continue;
4242 reduc_chain.safe_push (stmt_info);
4243 }
4244 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4245 {
4246 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4247 {
4248 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4249 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4250 }
4251 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4252 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4253
4254 /* Save the chain for further analysis in SLP detection. */
4255 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4256 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4257
4258 *reduc_chain_p = true;
4259 if (dump_enabled_p ())
4260 dump_printf_loc (MSG_NOTE, vect_location,
4261 "reduction: detected reduction chain\n");
4262 }
4263 else if (dump_enabled_p ())
4264 dump_printf_loc (MSG_NOTE, vect_location,
4265 "reduction: detected reduction\n");
4266
4267 return def_stmt_info;
4268 }
4269
4270 if (dump_enabled_p ())
4271 dump_printf_loc (MSG_NOTE, vect_location,
4272 "reduction: unknown pattern\n");
4273
4274 return NULL;
4275 }
4276
4277 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4278 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4279 or -1 if not known. */
4280
4281 static int
4282 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4283 {
4284 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4285 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4286 {
4287 if (dump_enabled_p ())
4288 dump_printf_loc (MSG_NOTE, vect_location,
4289 "cost model: epilogue peel iters set to vf/2 "
4290 "because loop iterations are unknown .\n");
4291 return assumed_vf / 2;
4292 }
4293 else
4294 {
4295 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4296 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4297 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4298 /* If we need to peel for gaps but no peeling is otherwise required,
4299 we have to peel VF iterations. */
4300 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4301 peel_iters_epilogue = assumed_vf;
4302 return peel_iters_epilogue;
4303 }
4304 }
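
/* Worked example with made-up numbers: for a known NITERS of 100, an
   assumed VF of 8 and a prologue peel of 3, the epilogue peels
   (100 - 3) % 8 == 1 iteration; if the loop additionally needed peeling
   for gaps and that remainder had been 0, a full VF (8 iterations)
   would be peeled instead.  */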
4305
4306 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4307 int
4308 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4309 int *peel_iters_epilogue,
4310 stmt_vector_for_cost *scalar_cost_vec,
4311 stmt_vector_for_cost *prologue_cost_vec,
4312 stmt_vector_for_cost *epilogue_cost_vec)
4313 {
4314 int retval = 0;
4315
4316 *peel_iters_epilogue
4317 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4318
4319 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4320 {
4321 /* If peeled iterations are known but the number of scalar loop
4322 iterations is unknown, count a taken branch per peeled loop. */
4323 if (peel_iters_prologue > 0)
4324 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4325 vect_prologue);
4326 if (*peel_iters_epilogue > 0)
4327 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4328 vect_epilogue);
4329 }
4330
4331 stmt_info_for_cost *si;
4332 int j;
4333 if (peel_iters_prologue)
4334 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4335 retval += record_stmt_cost (prologue_cost_vec,
4336 si->count * peel_iters_prologue,
4337 si->kind, si->stmt_info, si->misalign,
4338 vect_prologue);
4339 if (*peel_iters_epilogue)
4340 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4341 retval += record_stmt_cost (epilogue_cost_vec,
4342 si->count * *peel_iters_epilogue,
4343 si->kind, si->stmt_info, si->misalign,
4344 vect_epilogue);
4345
4346 return retval;
4347 }
4348
4349 /* Function vect_estimate_min_profitable_iters
4350
4351 Return the number of iterations required for the vector version of the
4352 loop to be profitable relative to the cost of the scalar version of the
4353 loop.
4354
4355 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4356 of iterations for vectorization. -1 value means loop vectorization
4357 is not profitable. This returned value may be used for dynamic
4358 profitability check.
4359
4360 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4361 for static check against estimated number of iterations. */
4362
4363 static void
4364 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4365 int *ret_min_profitable_niters,
4366 int *ret_min_profitable_estimate,
4367 unsigned *suggested_unroll_factor)
4368 {
4369 int min_profitable_iters;
4370 int min_profitable_estimate;
4371 int peel_iters_prologue;
4372 int peel_iters_epilogue;
4373 unsigned vec_inside_cost = 0;
4374 int vec_outside_cost = 0;
4375 unsigned vec_prologue_cost = 0;
4376 unsigned vec_epilogue_cost = 0;
4377 int scalar_single_iter_cost = 0;
4378 int scalar_outside_cost = 0;
4379 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4380 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4381 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4382
4383 /* Cost model disabled. */
4384 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4385 {
4386 if (dump_enabled_p ())
4387 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4388 *ret_min_profitable_niters = 0;
4389 *ret_min_profitable_estimate = 0;
4390 return;
4391 }
4392
4393 /* Requires loop versioning tests to handle misalignment. */
4394 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4395 {
4396 /* FIXME: Make cost depend on complexity of individual check. */
4397 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4398 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4399 if (dump_enabled_p ())
4400 dump_printf (MSG_NOTE,
4401 "cost model: Adding cost of checks for loop "
4402 "versioning to treat misalignment.\n");
4403 }
4404
4405 /* Requires loop versioning with alias checks. */
4406 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4407 {
4408 /* FIXME: Make cost depend on complexity of individual check. */
4409 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4410 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4411 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4412 if (len)
4413 /* Count LEN - 1 ANDs and LEN comparisons. */
4414 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4415 scalar_stmt, vect_prologue);
4416 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4417 if (len)
4418 {
4419 /* Count LEN - 1 ANDs and LEN comparisons. */
4420 unsigned int nstmts = len * 2 - 1;
4421 /* +1 for each bias that needs adding. */
4422 for (unsigned int i = 0; i < len; ++i)
4423 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4424 nstmts += 1;
4425 (void) add_stmt_cost (target_cost_data, nstmts,
4426 scalar_stmt, vect_prologue);
4427 }
4428 if (dump_enabled_p ())
4429 dump_printf (MSG_NOTE,
4430 "cost model: Adding cost of checks for loop "
4431 "versioning aliasing.\n");
4432 }
4433
4434 /* Requires loop versioning with niter checks. */
4435 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4436 {
4437 /* FIXME: Make cost depend on complexity of individual check. */
4438 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4439 NULL, NULL, NULL_TREE, 0, vect_prologue);
4440 if (dump_enabled_p ())
4441 dump_printf (MSG_NOTE,
4442 "cost model: Adding cost of checks for loop "
4443 "versioning niters.\n");
4444 }
4445
4446 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4447 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4448 vect_prologue);
4449
4450 /* Count statements in the scalar loop. Use this as the scalar cost for a
4451 single iteration for now.
4452
4453 TODO: Add outer loop support.
4454
4455 TODO: Consider assigning different costs to different scalar
4456 statements. */
4457
4458 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4459
4460 /* Add additional cost for the peeled instructions in prologue and epilogue
4461 loop. (For fully-masked loops there will be no peeling.)
4462
4463 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4464 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4465
4466 TODO: Build an expression that represents peel_iters for prologue and
4467 epilogue to be used in a run-time test. */
4468
4469 bool prologue_need_br_taken_cost = false;
4470 bool prologue_need_br_not_taken_cost = false;
4471
4472 /* Calculate peel_iters_prologue. */
4473 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4474 peel_iters_prologue = 0;
4475 else if (npeel < 0)
4476 {
4477 peel_iters_prologue = assumed_vf / 2;
4478 if (dump_enabled_p ())
4479 dump_printf (MSG_NOTE, "cost model: "
4480 "prologue peel iters set to vf/2.\n");
4481
4482 /* If peeled iterations are unknown, count a taken branch and a not taken
4483 branch per peeled loop. Even if scalar loop iterations are known,
4484 vector iterations are not known since peeled prologue iterations are
4485 not known. Hence guards remain the same. */
4486 prologue_need_br_taken_cost = true;
4487 prologue_need_br_not_taken_cost = true;
4488 }
4489 else
4490 {
4491 peel_iters_prologue = npeel;
4492 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4493 /* If peeled iterations are known but the number of scalar loop
4494 iterations is unknown, count a taken branch per peeled loop. */
4495 prologue_need_br_taken_cost = true;
4496 }
4497
4498 bool epilogue_need_br_taken_cost = false;
4499 bool epilogue_need_br_not_taken_cost = false;
4500
4501 /* Calculate peel_iters_epilogue. */
4502 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4503 /* We need to peel exactly one iteration for gaps. */
4504 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4505 else if (npeel < 0)
4506 {
4507 /* If peeling for alignment is unknown, the loop bound of the main loop
4508 becomes unknown. */
4509 peel_iters_epilogue = assumed_vf / 2;
4510 if (dump_enabled_p ())
4511 dump_printf (MSG_NOTE, "cost model: "
4512 "epilogue peel iters set to vf/2 because "
4513 "peeling for alignment is unknown.\n");
4514
4515 /* See the same reason above in peel_iters_prologue calculation. */
4516 epilogue_need_br_taken_cost = true;
4517 epilogue_need_br_not_taken_cost = true;
4518 }
4519 else
4520 {
4521 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4522 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4523 /* If peeled iterations are known but the number of scalar loop
4524 iterations is unknown, count a taken branch per peeled loop. */
4525 epilogue_need_br_taken_cost = true;
4526 }
4527
4528 stmt_info_for_cost *si;
4529 int j;
4530 /* Add costs associated with peel_iters_prologue. */
4531 if (peel_iters_prologue)
4532 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4533 {
4534 (void) add_stmt_cost (target_cost_data,
4535 si->count * peel_iters_prologue, si->kind,
4536 si->stmt_info, si->node, si->vectype,
4537 si->misalign, vect_prologue);
4538 }
4539
4540 /* Add costs associated with peel_iters_epilogue. */
4541 if (peel_iters_epilogue)
4542 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4543 {
4544 (void) add_stmt_cost (target_cost_data,
4545 si->count * peel_iters_epilogue, si->kind,
4546 si->stmt_info, si->node, si->vectype,
4547 si->misalign, vect_epilogue);
4548 }
4549
4550 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4551
4552 if (prologue_need_br_taken_cost)
4553 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4554 vect_prologue);
4555
4556 if (prologue_need_br_not_taken_cost)
4557 (void) add_stmt_cost (target_cost_data, 1,
4558 cond_branch_not_taken, vect_prologue);
4559
4560 if (epilogue_need_br_taken_cost)
4561 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4562 vect_epilogue);
4563
4564 if (epilogue_need_br_not_taken_cost)
4565 (void) add_stmt_cost (target_cost_data, 1,
4566 cond_branch_not_taken, vect_epilogue);
4567
4568 /* Take care of special costs for rgroup controls of partial vectors. */
4569 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4570 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4571 == vect_partial_vectors_avx512))
4572 {
4573 /* Calculate how many masks we need to generate. */
4574 unsigned int num_masks = 0;
4575 bool need_saturation = false;
4576 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4577 if (rgm.type)
4578 {
4579 unsigned nvectors = rgm.factor;
4580 num_masks += nvectors;
4581 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4582 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4583 need_saturation = true;
4584 }
4585
4586 /* ??? The target isn't able to identify the costs below as
4587 producing masks so it cannot penalize cases where we'd run
4588 out of mask registers, for example. */
4589
4590 /* ??? We are also failing to account for smaller vector masks
4591 we generate by splitting larger masks in vect_get_loop_mask. */
4592
4593 /* In the worst case, we need to generate each mask in the prologue
4594 and in the loop body. We need one splat per group and one
4595 compare per mask.
4596
4597 Sometimes the prologue mask will fold to a constant,
4598 so the actual prologue cost might be smaller. However, it's
4599 simpler and safer to use the worst-case cost; if this ends up
4600 being the tie-breaker between vectorizing or not, then it's
4601 probably better not to vectorize. */
4602 (void) add_stmt_cost (target_cost_data,
4603 num_masks
4604 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4605 vector_stmt, NULL, NULL, NULL_TREE, 0,
4606 vect_prologue);
4607 (void) add_stmt_cost (target_cost_data,
4608 num_masks
4609 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4610 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4611
4612 /* When we need saturation we need it both in the prologue and
4613 the epilogue. */
4614 if (need_saturation)
4615 {
4616 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4617 NULL, NULL, NULL_TREE, 0, vect_prologue);
4618 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4619 NULL, NULL, NULL_TREE, 0, vect_body);
4620 }
4621 }
4622 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4623 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4624 == vect_partial_vectors_while_ult))
4625 {
4626 /* Calculate how many masks we need to generate. */
4627 unsigned int num_masks = 0;
4628 rgroup_controls *rgm;
4629 unsigned int num_vectors_m1;
4630 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4631 num_vectors_m1, rgm)
4632 if (rgm->type)
4633 num_masks += num_vectors_m1 + 1;
4634 gcc_assert (num_masks > 0);
4635
4636 /* In the worst case, we need to generate each mask in the prologue
4637 and in the loop body. One of the loop body mask instructions
4638 replaces the comparison in the scalar loop, and since we don't
4639 count the scalar comparison against the scalar body, we shouldn't
4640 count that vector instruction against the vector body either.
4641
4642 Sometimes we can use unpacks instead of generating prologue
4643 masks and sometimes the prologue mask will fold to a constant,
4644 so the actual prologue cost might be smaller. However, it's
4645 simpler and safer to use the worst-case cost; if this ends up
4646 being the tie-breaker between vectorizing or not, then it's
4647 probably better not to vectorize. */
4648 (void) add_stmt_cost (target_cost_data, num_masks,
4649 vector_stmt, NULL, NULL, NULL_TREE, 0,
4650 vect_prologue);
4651 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4652 vector_stmt, NULL, NULL, NULL_TREE, 0,
4653 vect_body);
4654 }
4655 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4656 {
4657 /* Referring to the functions vect_set_loop_condition_partial_vectors
4658 and vect_set_loop_controls_directly, we need to generate each
4659 length in the prologue and in the loop body if required. Although
4660 there are some possible optimizations, we consider the worst case
4661 here. */
4662
4663 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4664 signed char partial_load_store_bias
4665 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4666 bool need_iterate_p
4667 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4668 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4669
4670 /* Calculate how many statements to be added. */
4671 unsigned int prologue_stmts = 0;
4672 unsigned int body_stmts = 0;
4673
4674 rgroup_controls *rgc;
4675 unsigned int num_vectors_m1;
4676 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4677 if (rgc->type)
4678 {
4679 /* May need one SHIFT for nitems_total computation. */
4680 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4681 if (nitems != 1 && !niters_known_p)
4682 prologue_stmts += 1;
4683
4684 /* May need one MAX and one MINUS for wrap around. */
4685 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4686 prologue_stmts += 2;
4687
4688 /* Need one MAX and one MINUS for each batch limit except for
4689 the first one. */
4690 prologue_stmts += num_vectors_m1 * 2;
4691
4692 unsigned int num_vectors = num_vectors_m1 + 1;
4693
4694 /* Need to set up lengths in prologue, only one MIN required
4695 for each since start index is zero. */
4696 prologue_stmts += num_vectors;
4697
4698 /* If we have a non-zero partial load bias, we need one PLUS
4699 to adjust the load length. */
4700 if (partial_load_store_bias != 0)
4701 body_stmts += 1;
4702
4703 /* Each may need two MINs and one MINUS to update lengths in body
4704 for next iteration. */
4705 if (need_iterate_p)
4706 body_stmts += 3 * num_vectors;
4707 }
4708
4709 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4710 scalar_stmt, vect_prologue);
4711 (void) add_stmt_cost (target_cost_data, body_stmts,
4712 scalar_stmt, vect_body);
4713 }
4714
4715 /* FORNOW: The scalar outside cost is incremented in one of the
4716 following ways:
4717
4718 1. The vectorizer checks for alignment and aliasing and generates
4719 a condition that allows dynamic vectorization. A cost model
4720 check is ANDED with the versioning condition. Hence scalar code
4721 path now has the added cost of the versioning check.
4722
4723 if (cost > th & versioning_check)
4724 jmp to vector code
4725
4726 Hence run-time scalar is incremented by not-taken branch cost.
4727
4728 2. The vectorizer then checks if a prologue is required. If the
4729 cost model check was not done before during versioning, it has to
4730 be done before the prologue check.
4731
4732 if (cost <= th)
4733 prologue = scalar_iters
4734 if (prologue == 0)
4735 jmp to vector code
4736 else
4737 execute prologue
4738 if (prologue == num_iters)
4739 go to exit
4740
4741 Hence the run-time scalar cost is incremented by a taken branch,
4742 plus a not-taken branch, plus a taken branch cost.
4743
4744 3. The vectorizer then checks if an epilogue is required. If the
4745 cost model check was not done before during prologue check, it
4746 has to be done with the epilogue check.
4747
4748 if (prologue == 0)
4749 jmp to vector code
4750 else
4751 execute prologue
4752 if (prologue == num_iters)
4753 go to exit
4754 vector code:
4755 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4756 jmp to epilogue
4757
4758 Hence the run-time scalar cost should be incremented by 2 taken
4759 branches.
4760
4761 TODO: The back end may reorder the BBs differently and reverse
4762 conditions/branch directions. Change the estimates below to
4763 something more reasonable. */
4764
4765 /* If the number of iterations is known and we do not do versioning, we can
4766 decide whether to vectorize at compile time. Hence the scalar version
4767 does not carry cost model guard costs. */
4768 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4769 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4770 {
4771 /* Cost model check occurs at versioning. */
4772 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4773 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4774 else
4775 {
4776 /* Cost model check occurs at prologue generation. */
4777 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4778 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4779 + vect_get_stmt_cost (cond_branch_not_taken);
4780 /* Cost model check occurs at epilogue generation. */
4781 else
4782 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4783 }
4784 }
4785
4786 /* Complete the target-specific cost calculations. */
4787 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4788 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4789 suggested_unroll_factor);
4790
4791 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4792 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4793 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4794 *suggested_unroll_factor,
4795 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4796 {
4797 if (dump_enabled_p ())
4798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4799 "can't unroll as unrolled vectorization factor larger"
4800 " than maximum vectorization factor: "
4801 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4802 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4803 *suggested_unroll_factor = 1;
4804 }
4805
4806 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4807
4808 if (dump_enabled_p ())
4809 {
4810 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4811 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4812 vec_inside_cost);
4813 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4814 vec_prologue_cost);
4815 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4816 vec_epilogue_cost);
4817 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4818 scalar_single_iter_cost);
4819 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4820 scalar_outside_cost);
4821 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4822 vec_outside_cost);
4823 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4824 peel_iters_prologue);
4825 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4826 peel_iters_epilogue);
4827 }
4828
4829 /* Calculate number of iterations required to make the vector version
4830 profitable, relative to the loop bodies only. The following condition
4831 must hold true:
4832 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4833 where
4834 SIC = scalar iteration cost, VIC = vector iteration cost,
4835 VOC = vector outside cost, VF = vectorization factor,
4836 NPEEL = prologue iterations + epilogue iterations,
4837 SOC = scalar outside cost for run time cost model check. */
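
   /* Worked example with purely hypothetical costs, treating the division
      as exact for simplicity: SIC = 4, VIC = 12, VF = 4, NPEEL = 2,
      VOC = 30, SOC = 6.  The condition becomes

	4 * niters + 6 > 12 * ((niters - 2) / 4) + 30
		       = 3 * niters + 24

      which holds for niters > 18, so under this model the vector version
      only pays off once the loop runs at least 19 scalar iterations.  */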
4838
4839 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4840 - vec_inside_cost);
4841 if (saving_per_viter <= 0)
4842 {
4843 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4844 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4845 "vectorization did not happen for a simd loop");
4846
4847 if (dump_enabled_p ())
4848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4849 "cost model: the vector iteration cost = %d "
4850 "divided by the scalar iteration cost = %d "
4851 "is greater or equal to the vectorization factor = %d"
4852 ".\n",
4853 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4854 *ret_min_profitable_niters = -1;
4855 *ret_min_profitable_estimate = -1;
4856 return;
4857 }
4858
4859 /* ??? The "if" arm is written to handle all cases; see below for what
4860 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4861 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4862 {
4863 /* Rewriting the condition above in terms of the number of
4864 vector iterations (vniters) rather than the number of
4865 scalar iterations (niters) gives:
4866
4867 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4868
4869 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4870
4871 For integer N, X and Y when X > 0:
4872
4873 N * X > Y <==> N >= (Y /[floor] X) + 1. */
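/* Small worked instance with invented numbers: if SIC = 1, VIC = 2 and
   VF = 4 then X = SIC * VF - VIC = 2, and with VOC = 10, NPEEL = 0 and
   SOC = 0 the right-hand side is Y = 10.  The identity above gives
   vniters >= 10 / 2 + 1 = 6, matching the computation of
   min_vec_niters below: 6 * 2 = 12 > 10, while 5 * 2 = 10 is not.  */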
4874 int outside_overhead = (vec_outside_cost
4875 - scalar_single_iter_cost * peel_iters_prologue
4876 - scalar_single_iter_cost * peel_iters_epilogue
4877 - scalar_outside_cost);
4878 /* We're only interested in cases that require at least one
4879 vector iteration. */
4880 int min_vec_niters = 1;
4881 if (outside_overhead > 0)
4882 min_vec_niters = outside_overhead / saving_per_viter + 1;
4883
4884 if (dump_enabled_p ())
4885 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4886 min_vec_niters);
4887
4888 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4889 {
4890 /* Now that we know the minimum number of vector iterations,
4891 find the minimum niters for which the scalar cost is larger:
4892
4893 SIC * niters > VIC * vniters + VOC - SOC
4894
4895 We know that the minimum niters is no more than
4896 vniters * VF + NPEEL, but it might be (and often is) less
4897 than that if a partial vector iteration is cheaper than the
4898 equivalent scalar code. */
4899 int threshold = (vec_inside_cost * min_vec_niters
4900 + vec_outside_cost
4901 - scalar_outside_cost);
4902 if (threshold <= 0)
4903 min_profitable_iters = 1;
4904 else
4905 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4906 }
4907 else
4908 /* Convert the number of vector iterations into a number of
4909 scalar iterations. */
4910 min_profitable_iters = (min_vec_niters * assumed_vf
4911 + peel_iters_prologue
4912 + peel_iters_epilogue);
4913 }
4914 else
4915 {
4916 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4917 * assumed_vf
4918 - vec_inside_cost * peel_iters_prologue
4919 - vec_inside_cost * peel_iters_epilogue);
4920 if (min_profitable_iters <= 0)
4921 min_profitable_iters = 0;
4922 else
4923 {
4924 min_profitable_iters /= saving_per_viter;
4925
4926 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4927 <= (((int) vec_inside_cost * min_profitable_iters)
4928 + (((int) vec_outside_cost - scalar_outside_cost)
4929 * assumed_vf)))
4930 min_profitable_iters++;
4931 }
4932 }
4933
4934 if (dump_enabled_p ())
4935 dump_printf (MSG_NOTE,
4936 " Calculated minimum iters for profitability: %d\n",
4937 min_profitable_iters);
4938
4939 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4940 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4941 /* We want the vectorized loop to execute at least once. */
4942 min_profitable_iters = assumed_vf + peel_iters_prologue;
4943 else if (min_profitable_iters < peel_iters_prologue)
4944 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4945 vectorized loop executes at least once. */
4946 min_profitable_iters = peel_iters_prologue;
4947
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_NOTE, vect_location,
4950 " Runtime profitability threshold = %d\n",
4951 min_profitable_iters);
4952
4953 *ret_min_profitable_niters = min_profitable_iters;
4954
4955 /* Calculate number of iterations required to make the vector version
4956 profitable, relative to the loop bodies only.
4957
4958 The non-vectorized variant costs SIC * niters and it must win over the vector
4959 variant for the expected loop trip count. The following condition must hold true:
4960 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4961
4962 if (vec_outside_cost <= 0)
4963 min_profitable_estimate = 0;
4964 /* ??? This "else if" arm is written to handle all cases; see below for
4965 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4966 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4967 {
4968 /* This is a repeat of the code above, but with + SOC rather
4969 than - SOC. */
4970 int outside_overhead = (vec_outside_cost
4971 - scalar_single_iter_cost * peel_iters_prologue
4972 - scalar_single_iter_cost * peel_iters_epilogue
4973 + scalar_outside_cost);
4974 int min_vec_niters = 1;
4975 if (outside_overhead > 0)
4976 min_vec_niters = outside_overhead / saving_per_viter + 1;
4977
4978 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4979 {
4980 int threshold = (vec_inside_cost * min_vec_niters
4981 + vec_outside_cost
4982 + scalar_outside_cost);
4983 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4984 }
4985 else
4986 min_profitable_estimate = (min_vec_niters * assumed_vf
4987 + peel_iters_prologue
4988 + peel_iters_epilogue);
4989 }
4990 else
4991 {
4992 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4993 * assumed_vf
4994 - vec_inside_cost * peel_iters_prologue
4995 - vec_inside_cost * peel_iters_epilogue)
4996 / ((scalar_single_iter_cost * assumed_vf)
4997 - vec_inside_cost);
4998 }
4999 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5000 if (dump_enabled_p ())
5001 dump_printf_loc (MSG_NOTE, vect_location,
5002 " Static estimate profitability threshold = %d\n",
5003 min_profitable_estimate);
5004
5005 *ret_min_profitable_estimate = min_profitable_estimate;
5006 }
5007
5008 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5009 vector elements (not bits) for a vector with NELT elements. */
5010 static void
5011 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5012 vec_perm_builder *sel)
5013 {
5014 /* The encoding is a single stepped pattern. Any wrap-around is handled
5015 by vec_perm_indices. */
5016 sel->new_vector (nelt, 1, 3);
5017 for (unsigned int i = 0; i < 3; i++)
5018 sel->quick_push (i + offset);
5019 }
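/* Illustrative example: OFFSET = 2 and NELT = 8 encode the stepped
   series {2, 3, 4}, which vec_perm_indices expands to the full
   selector {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 refer to the
   second vector operand of the permutation, so the net effect is a
   shift down by two elements.  */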
5020
5021 /* Checks whether the target supports whole-vector shifts for vectors of mode
5022 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5023 it supports vec_perm_const with masks for all necessary shift amounts. */
5024 static bool
5025 have_whole_vector_shift (machine_mode mode)
5026 {
5027 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5028 return true;
5029
5030 /* Variable-length vectors should be handled via the optab. */
5031 unsigned int nelt;
5032 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5033 return false;
5034
5035 vec_perm_builder sel;
5036 vec_perm_indices indices;
5037 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5038 {
5039 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5040 indices.new_vector (sel, 2, nelt);
5041 if (!can_vec_perm_const_p (mode, mode, indices, false))
5042 return false;
5043 }
5044 return true;
5045 }
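/* For example, for a fixed-width mode with eight elements the loop
   above checks shift amounts 4, 2 and 1, i.e. the halving offsets a
   shift-based final reduction would need.  */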
5046
5047 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5048 multiplication operands have differing signs and (b) we intend
5049 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5050 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5051
5052 static bool
5053 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5054 stmt_vec_info stmt_info)
5055 {
5056 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5057 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5058 return false;
5059
5060 tree rhs1 = gimple_assign_rhs1 (assign);
5061 tree rhs2 = gimple_assign_rhs2 (assign);
5062 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5063 return false;
5064
5065 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5066 gcc_assert (reduc_info->is_reduc_info);
5067 return !directly_supported_p (DOT_PROD_EXPR,
5068 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5069 optab_vector_mixed_sign);
5070 }
5071
5072 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5073 functions. Design better to avoid maintenance issues. */
5074
5075 /* Function vect_model_reduction_cost.
5076
5077 Models cost for a reduction operation, including the vector ops
5078 generated within the strip-mine loop in some cases, the initial
5079 definition before the loop, and the epilogue code that must be generated. */
5080
5081 static void
5082 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5083 stmt_vec_info stmt_info, internal_fn reduc_fn,
5084 vect_reduction_type reduction_type,
5085 int ncopies, stmt_vector_for_cost *cost_vec)
5086 {
5087 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5088 tree vectype;
5089 machine_mode mode;
5090 class loop *loop = NULL;
5091
5092 if (loop_vinfo)
5093 loop = LOOP_VINFO_LOOP (loop_vinfo);
5094
5095 /* Condition reductions generate two reductions in the loop. */
5096 if (reduction_type == COND_REDUCTION)
5097 ncopies *= 2;
5098
5099 vectype = STMT_VINFO_VECTYPE (stmt_info);
5100 mode = TYPE_MODE (vectype);
5101 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5102
5103 gimple_match_op op;
5104 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5105 gcc_unreachable ();
5106
5107 bool emulated_mixed_dot_prod
5108 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5109 if (reduction_type == EXTRACT_LAST_REDUCTION)
5110 /* No extra instructions are needed in the prologue. The loop body
5111 operations are costed in vectorizable_condition. */
5112 inside_cost = 0;
5113 else if (reduction_type == FOLD_LEFT_REDUCTION)
5114 {
5115 /* No extra instructions needed in the prologue. */
5116 prologue_cost = 0;
5117
5118 if (reduc_fn != IFN_LAST)
5119 /* Count one reduction-like operation per vector. */
5120 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5121 stmt_info, 0, vect_body);
5122 else
5123 {
5124 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5125 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5126 inside_cost = record_stmt_cost (cost_vec, nelements,
5127 vec_to_scalar, stmt_info, 0,
5128 vect_body);
5129 inside_cost += record_stmt_cost (cost_vec, nelements,
5130 scalar_stmt, stmt_info, 0,
5131 vect_body);
5132 }
5133 }
5134 else
5135 {
5136 /* Add in the cost of the initial definitions. */
5137 int prologue_stmts;
5138 if (reduction_type == COND_REDUCTION)
5139 /* For cond reductions we have four vectors: initial index, step,
5140 initial result of the data reduction, initial value of the index
5141 reduction. */
5142 prologue_stmts = 4;
5143 else if (emulated_mixed_dot_prod)
5144 /* We need the initial reduction value and two invariants:
5145 one that contains the minimum signed value and one that
5146 contains half of its negative. */
5147 prologue_stmts = 3;
5148 else
5149 prologue_stmts = 1;
5150 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5151 scalar_to_vec, stmt_info, 0,
5152 vect_prologue);
5153 }
5154
5155 /* Determine cost of epilogue code.
5156
5157 We have a reduction operator that will reduce the vector in one statement.
5158 Also requires scalar extract. */
5159
5160 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5161 {
5162 if (reduc_fn != IFN_LAST)
5163 {
5164 if (reduction_type == COND_REDUCTION)
5165 {
5166 /* An EQ stmt and a COND_EXPR stmt. */
5167 epilogue_cost += record_stmt_cost (cost_vec, 2,
5168 vector_stmt, stmt_info, 0,
5169 vect_epilogue);
5170 /* Reduction of the max index and a reduction of the found
5171 values. */
5172 epilogue_cost += record_stmt_cost (cost_vec, 2,
5173 vec_to_scalar, stmt_info, 0,
5174 vect_epilogue);
5175 /* A broadcast of the max value. */
5176 epilogue_cost += record_stmt_cost (cost_vec, 1,
5177 scalar_to_vec, stmt_info, 0,
5178 vect_epilogue);
5179 }
5180 else
5181 {
5182 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5183 stmt_info, 0, vect_epilogue);
5184 epilogue_cost += record_stmt_cost (cost_vec, 1,
5185 vec_to_scalar, stmt_info, 0,
5186 vect_epilogue);
5187 }
5188 }
5189 else if (reduction_type == COND_REDUCTION)
5190 {
5191 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5192 /* Extraction of scalar elements. */
5193 epilogue_cost += record_stmt_cost (cost_vec,
5194 2 * estimated_nunits,
5195 vec_to_scalar, stmt_info, 0,
5196 vect_epilogue);
5197 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5198 epilogue_cost += record_stmt_cost (cost_vec,
5199 2 * estimated_nunits - 3,
5200 scalar_stmt, stmt_info, 0,
5201 vect_epilogue);
5202 }
5203 else if (reduction_type == EXTRACT_LAST_REDUCTION
5204 || reduction_type == FOLD_LEFT_REDUCTION)
5205 /* No extra instructions are needed in the epilogue. */
5206 ;
5207 else
5208 {
5209 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5210 tree bitsize = TYPE_SIZE (op.type);
5211 int element_bitsize = tree_to_uhwi (bitsize);
5212 int nelements = vec_size_in_bits / element_bitsize;
5213
5214 if (op.code == COND_EXPR)
5215 op.code = MAX_EXPR;
5216
5217 /* We have a whole vector shift available. */
5218 if (VECTOR_MODE_P (mode)
5219 && directly_supported_p (op.code, vectype)
5220 && have_whole_vector_shift (mode))
5221 {
5222 /* Final reduction via vector shifts and the reduction operator.
5223 Also requires scalar extract. */
5224 epilogue_cost += record_stmt_cost (cost_vec,
5225 exact_log2 (nelements) * 2,
5226 vector_stmt, stmt_info, 0,
5227 vect_epilogue);
5228 epilogue_cost += record_stmt_cost (cost_vec, 1,
5229 vec_to_scalar, stmt_info, 0,
5230 vect_epilogue);
5231 }
5232 else
5233 /* Use extracts and reduction op for final reduction. For N
5234 elements, we have N extracts and N-1 reduction ops. */
5235 epilogue_cost += record_stmt_cost (cost_vec,
5236 nelements + nelements - 1,
5237 vector_stmt, stmt_info, 0,
5238 vect_epilogue);
5239 }
5240 }
5241
5242 if (dump_enabled_p ())
5243 dump_printf (MSG_NOTE,
5244 "vect_model_reduction_cost: inside_cost = %d, "
5245 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5246 prologue_cost, epilogue_cost);
5247 }
5248
5249 /* SEQ is a sequence of instructions that initialize the reduction
5250 described by REDUC_INFO. Emit them in the appropriate place. */
5251
5252 static void
5253 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5254 stmt_vec_info reduc_info, gimple *seq)
5255 {
5256 if (reduc_info->reused_accumulator)
5257 {
5258 /* When reusing an accumulator from the main loop, we only need
5259 initialization instructions if the main loop can be skipped.
5260 In that case, emit the initialization instructions at the end
5261 of the guard block that does the skip. */
5262 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5263 gcc_assert (skip_edge);
5264 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5265 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5266 }
5267 else
5268 {
5269 /* The normal case: emit the initialization instructions on the
5270 preheader edge. */
5271 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5272 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5273 }
5274 }
5275
5276 /* Function get_initial_def_for_reduction
5277
5278 Input:
5279 REDUC_INFO - the info_for_reduction
5280 INIT_VAL - the initial value of the reduction variable
5281 NEUTRAL_OP - a value that has no effect on the reduction, as per
5282 neutral_op_for_reduction
5283
5284 Output:
5285 Return a vector variable, initialized according to the operation that
5286 STMT_VINFO performs. This vector will be used as the initial value
5287 of the vector of partial results.
5288
5289 The value we need is a vector in which element 0 has value INIT_VAL
5290 and every other element has value NEUTRAL_OP. */
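/* For example, for a sum reduction with INIT_VAL 5 and NEUTRAL_OP 0 on
   a four-lane vector this builds {5, 0, 0, 0}; for a MIN or MAX
   reduction the neutral value is the initial value itself, so the
   operand_equal_p case below produces a simple splat.  */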
5291
5292 static tree
5293 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5294 stmt_vec_info reduc_info,
5295 tree init_val, tree neutral_op)
5296 {
5297 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5298 tree scalar_type = TREE_TYPE (init_val);
5299 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5300 tree init_def;
5301 gimple_seq stmts = NULL;
5302
5303 gcc_assert (vectype);
5304
5305 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5306 || SCALAR_FLOAT_TYPE_P (scalar_type));
5307
5308 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5309 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5310
5311 if (operand_equal_p (init_val, neutral_op))
5312 {
5313 /* If both elements are equal then the vector described above is
5314 just a splat. */
5315 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5316 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5317 }
5318 else
5319 {
5320 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5321 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5322 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5323 {
5324 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5325 element 0. */
5326 init_def = gimple_build_vector_from_val (&stmts, vectype,
5327 neutral_op);
5328 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5329 vectype, init_def, init_val);
5330 }
5331 else
5332 {
5333 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5334 tree_vector_builder elts (vectype, 1, 2);
5335 elts.quick_push (init_val);
5336 elts.quick_push (neutral_op);
5337 init_def = gimple_build_vector (&stmts, &elts);
5338 }
5339 }
5340
5341 if (stmts)
5342 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5343 return init_def;
5344 }
5345
5346 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5347 which performs a reduction involving GROUP_SIZE scalar statements.
5348 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5349 is nonnull, introducing extra elements of that value will not change the
5350 result. */
5351
5352 static void
5353 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5354 stmt_vec_info reduc_info,
5355 vec<tree> *vec_oprnds,
5356 unsigned int number_of_vectors,
5357 unsigned int group_size, tree neutral_op)
5358 {
5359 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5360 unsigned HOST_WIDE_INT nunits;
5361 unsigned j, number_of_places_left_in_vector;
5362 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5363 unsigned int i;
5364
5365 gcc_assert (group_size == initial_values.length () || neutral_op);
5366
5367 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5368 created vectors. It is greater than 1 if unrolling is performed.
5369
5370 For example, we have two scalar operands, s1 and s2 (e.g., group of
5371 strided accesses of size two), while NUNITS is four (i.e., four scalars
5372 of this type can be packed in a vector). The output vector will contain
5373 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5374 will be 2).
5375
5376 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5377 vectors containing the operands.
5378
5379 For example, NUNITS is four as before, and the group size is 8
5380 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5381 {s5, s6, s7, s8}. */
5382
5383 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5384 nunits = group_size;
5385
5386 number_of_places_left_in_vector = nunits;
5387 bool constant_p = true;
5388 tree_vector_builder elts (vector_type, nunits, 1);
5389 elts.quick_grow (nunits);
5390 gimple_seq ctor_seq = NULL;
5391 for (j = 0; j < nunits * number_of_vectors; ++j)
5392 {
5393 tree op;
5394 i = j % group_size;
5395
5396 /* Get the def before the loop. In a reduction chain we have only
5397 one initial value; otherwise we have as many as there are PHIs in the group. */
5398 if (i >= initial_values.length () || (j > i && neutral_op))
5399 op = neutral_op;
5400 else
5401 op = initial_values[i];
5402
5403 /* Create 'vect_ = {op0,op1,...,opn}'. */
5404 number_of_places_left_in_vector--;
5405 elts[nunits - number_of_places_left_in_vector - 1] = op;
5406 if (!CONSTANT_CLASS_P (op))
5407 constant_p = false;
5408
5409 if (number_of_places_left_in_vector == 0)
5410 {
5411 tree init;
5412 if (constant_p && !neutral_op
5413 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5414 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5415 /* Build the vector directly from ELTS. */
5416 init = gimple_build_vector (&ctor_seq, &elts);
5417 else if (neutral_op)
5418 {
5419 /* Build a vector of the neutral value and shift the
5420 other elements into place. */
5421 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5422 neutral_op);
5423 int k = nunits;
5424 while (k > 0 && elts[k - 1] == neutral_op)
5425 k -= 1;
5426 while (k > 0)
5427 {
5428 k -= 1;
5429 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5430 vector_type, init, elts[k]);
5431 }
5432 }
5433 else
5434 {
5435 /* First time round, duplicate ELTS to fill the
5436 required number of vectors. */
5437 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5438 elts, number_of_vectors, *vec_oprnds);
5439 break;
5440 }
5441 vec_oprnds->quick_push (init);
5442
5443 number_of_places_left_in_vector = nunits;
5444 elts.new_vector (vector_type, nunits, 1);
5445 elts.quick_grow (nunits);
5446 constant_p = true;
5447 }
5448 }
5449 if (ctor_seq != NULL)
5450 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5451 }
5452
5453 /* For a statement STMT_INFO taking part in a reduction operation return
5454 the stmt_vec_info the meta information is stored on. */
5455
5456 stmt_vec_info
5457 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5458 {
5459 stmt_info = vect_orig_stmt (stmt_info);
5460 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5461 if (!is_a <gphi *> (stmt_info->stmt)
5462 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5463 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5464 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5465 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5466 {
5467 if (gimple_phi_num_args (phi) == 1)
5468 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5469 }
5470 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5471 {
5472 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5473 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5474 stmt_info = info;
5475 }
5476 return stmt_info;
5477 }
5478
5479 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5480 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5481 return false. */
5482
5483 static bool
5484 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5485 stmt_vec_info reduc_info)
5486 {
5487 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5488 if (!main_loop_vinfo)
5489 return false;
5490
5491 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5492 return false;
5493
5494 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5495 auto_vec<tree, 16> main_loop_results (num_phis);
5496 auto_vec<tree, 16> initial_values (num_phis);
5497 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5498 {
5499 /* The epilogue loop can be entered either from the main loop or
5500 from an earlier guard block. */
5501 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5502 for (tree incoming_value : reduc_info->reduc_initial_values)
5503 {
5504 /* Look for:
5505
5506 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5507 INITIAL_VALUE(guard block)>. */
5508 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5509
5510 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5511 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5512
5513 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5514 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5515
5516 main_loop_results.quick_push (from_main_loop);
5517 initial_values.quick_push (from_skip);
5518 }
5519 }
5520 else
5521 /* The main loop dominates the epilogue loop. */
5522 main_loop_results.splice (reduc_info->reduc_initial_values);
5523
5524 /* See if the main loop has the kind of accumulator we need. */
5525 vect_reusable_accumulator *accumulator
5526 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5527 if (!accumulator
5528 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5529 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5530 accumulator->reduc_info->reduc_scalar_results.begin ()))
5531 return false;
5532
5533 /* Handle the case where we can reduce wider vectors to narrower ones. */
5534 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5535 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5536 unsigned HOST_WIDE_INT m;
5537 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5538 TYPE_VECTOR_SUBPARTS (vectype), &m))
5539 return false;
5540 /* Check the intermediate vector types and operations are available. */
5541 tree prev_vectype = old_vectype;
5542 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5543 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5544 {
5545 intermediate_nunits = exact_div (intermediate_nunits, 2);
5546 tree intermediate_vectype = get_related_vectype_for_scalar_type
5547 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5548 if (!intermediate_vectype
5549 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5550 intermediate_vectype)
5551 || !can_vec_extract (TYPE_MODE (prev_vectype),
5552 TYPE_MODE (intermediate_vectype)))
5553 return false;
5554 prev_vectype = intermediate_vectype;
5555 }
5556
5557 /* Non-SLP reductions might apply an adjustment after the reduction
5558 operation, in order to simplify the initialization of the accumulator.
5559 If the epilogue loop carries on from where the main loop left off,
5560 it should apply the same adjustment to the final reduction result.
5561
5562 If the epilogue loop can also be entered directly (rather than via
5563 the main loop), we need to be able to handle that case in the same way,
5564 with the same adjustment. (In principle we could add a PHI node
5565 to select the correct adjustment, but in practice that shouldn't be
5566 necessary.) */
5567 tree main_adjustment
5568 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5569 if (loop_vinfo->main_loop_edge && main_adjustment)
5570 {
5571 gcc_assert (num_phis == 1);
5572 tree initial_value = initial_values[0];
5573 /* Check that we can use INITIAL_VALUE as the adjustment and
5574 initialize the accumulator with a neutral value instead. */
5575 if (!operand_equal_p (initial_value, main_adjustment))
5576 return false;
5577 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5578 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5579 code, initial_value);
5580 }
5581 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5582 reduc_info->reduc_initial_values.truncate (0);
5583 reduc_info->reduc_initial_values.splice (initial_values);
5584 reduc_info->reused_accumulator = accumulator;
5585 return true;
5586 }
5587
5588 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5589 CODE, emitting any new stmts into SEQ. Returns a vector def of VECTYPE. */
5590
5591 static tree
5592 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5593 gimple_seq *seq)
5594 {
5595 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5596 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5597 tree stype = TREE_TYPE (vectype);
5598 tree new_temp = vec_def;
5599 while (nunits > nunits1)
5600 {
5601 nunits /= 2;
5602 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5603 stype, nunits);
5604 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5605
5606 /* The target has to make sure we support lowpart/highpart
5607 extraction, either via direct vector extract or through
5608 integer mode punning. */
5609 tree dst1, dst2;
5610 gimple *epilog_stmt;
5611 if (convert_optab_handler (vec_extract_optab,
5612 TYPE_MODE (TREE_TYPE (new_temp)),
5613 TYPE_MODE (vectype1))
5614 != CODE_FOR_nothing)
5615 {
5616 /* Extract sub-vectors directly once vec_extract becomes
5617 a conversion optab. */
5618 dst1 = make_ssa_name (vectype1);
5619 epilog_stmt
5620 = gimple_build_assign (dst1, BIT_FIELD_REF,
5621 build3 (BIT_FIELD_REF, vectype1,
5622 new_temp, TYPE_SIZE (vectype1),
5623 bitsize_int (0)));
5624 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5625 dst2 = make_ssa_name (vectype1);
5626 epilog_stmt
5627 = gimple_build_assign (dst2, BIT_FIELD_REF,
5628 build3 (BIT_FIELD_REF, vectype1,
5629 new_temp, TYPE_SIZE (vectype1),
5630 bitsize_int (bitsize)));
5631 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5632 }
5633 else
5634 {
5635 /* Extract via punning to appropriately sized integer mode
5636 vector. */
5637 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5638 tree etype = build_vector_type (eltype, 2);
5639 gcc_assert (convert_optab_handler (vec_extract_optab,
5640 TYPE_MODE (etype),
5641 TYPE_MODE (eltype))
5642 != CODE_FOR_nothing);
5643 tree tem = make_ssa_name (etype);
5644 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5645 build1 (VIEW_CONVERT_EXPR,
5646 etype, new_temp));
5647 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5648 new_temp = tem;
5649 tem = make_ssa_name (eltype);
5650 epilog_stmt
5651 = gimple_build_assign (tem, BIT_FIELD_REF,
5652 build3 (BIT_FIELD_REF, eltype,
5653 new_temp, TYPE_SIZE (eltype),
5654 bitsize_int (0)));
5655 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5656 dst1 = make_ssa_name (vectype1);
5657 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5658 build1 (VIEW_CONVERT_EXPR,
5659 vectype1, tem));
5660 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5661 tem = make_ssa_name (eltype);
5662 epilog_stmt
5663 = gimple_build_assign (tem, BIT_FIELD_REF,
5664 build3 (BIT_FIELD_REF, eltype,
5665 new_temp, TYPE_SIZE (eltype),
5666 bitsize_int (bitsize)));
5667 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5668 dst2 = make_ssa_name (vectype1);
5669 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5670 build1 (VIEW_CONVERT_EXPR,
5671 vectype1, tem));
5672 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5673 }
5674
5675 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5676 }
5677
5678 return new_temp;
5679 }
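/* Hypothetical example: reducing an eight-element vector of partial
   results to a four-element VECTYPE takes one iteration of the loop
   above, which extracts the low and high halves into DST1 and DST2 and
   combines them with CODE (an addition for a sum reduction), halving
   NUNITS from 8 to 4.  */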
5680
5681 /* Function vect_create_epilog_for_reduction
5682
5683 Create code at the loop-epilog to finalize the result of a reduction
5684 computation.
5685
5686 STMT_INFO is the scalar reduction stmt that is being vectorized.
5687 SLP_NODE is an SLP node containing a group of reduction statements. The
5688 first one in this group is STMT_INFO.
5689 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5690 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5691 (counting from 0)
5692
5693 This function:
5694 1. Completes the reduction def-use cycles.
5695 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5696 by calling the function specified by REDUC_FN if available, or by
5697 other means (whole-vector shifts or a scalar loop).
5698 The function also creates a new phi node at the loop exit to preserve
5699 loop-closed form, as illustrated below.
5700
5701 The flow at the entry to this function:
5702
5703 loop:
5704 vec_def = phi <vec_init, null> # REDUCTION_PHI
5705 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5706 s_loop = scalar_stmt # (scalar) STMT_INFO
5707 loop_exit:
5708 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5709 use <s_out0>
5710 use <s_out0>
5711
5712 The above is transformed by this function into:
5713
5714 loop:
5715 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5716 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5717 s_loop = scalar_stmt # (scalar) STMT_INFO
5718 loop_exit:
5719 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5720 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5721 v_out2 = reduce <v_out1>
5722 s_out3 = extract_field <v_out2, 0>
5723 s_out4 = adjust_result <s_out3>
5724 use <s_out4>
5725 use <s_out4>
5726 */
5727
5728 static void
5729 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5730 stmt_vec_info stmt_info,
5731 slp_tree slp_node,
5732 slp_instance slp_node_instance)
5733 {
5734 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5735 gcc_assert (reduc_info->is_reduc_info);
5736 /* For double reductions we need to get at the inner loop reduction
5737 stmt which has the meta info attached. Our stmt_info is that of the
5738 loop-closed PHI of the inner loop which we remember as
5739 def for the reduction PHI generation. */
5740 bool double_reduc = false;
5741 stmt_vec_info rdef_info = stmt_info;
5742 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5743 {
5744 gcc_assert (!slp_node);
5745 double_reduc = true;
5746 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5747 (stmt_info->stmt, 0));
5748 stmt_info = vect_stmt_to_vectorize (stmt_info);
5749 }
5750 gphi *reduc_def_stmt
5751 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5752 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5753 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5754 tree vectype;
5755 machine_mode mode;
5756 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5757 basic_block exit_bb;
5758 tree scalar_dest;
5759 tree scalar_type;
5760 gimple *new_phi = NULL, *phi;
5761 gimple_stmt_iterator exit_gsi;
5762 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5763 gimple *epilog_stmt = NULL;
5764 gimple *exit_phi;
5765 tree bitsize;
5766 tree def;
5767 tree orig_name, scalar_result;
5768 imm_use_iterator imm_iter, phi_imm_iter;
5769 use_operand_p use_p, phi_use_p;
5770 gimple *use_stmt;
5771 auto_vec<tree> reduc_inputs;
5772 int j, i;
5773 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5774 unsigned int group_size = 1, k;
5775 auto_vec<gimple *> phis;
5776 /* SLP reduction without reduction chain, e.g.,
5777 # a1 = phi <a2, a0>
5778 # b1 = phi <b2, b0>
5779 a2 = operation (a1)
5780 b2 = operation (b1) */
5781 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5782 bool direct_slp_reduc;
5783 tree induction_index = NULL_TREE;
5784
5785 if (slp_node)
5786 group_size = SLP_TREE_LANES (slp_node);
5787
5788 if (nested_in_vect_loop_p (loop, stmt_info))
5789 {
5790 outer_loop = loop;
5791 loop = loop->inner;
5792 gcc_assert (!slp_node && double_reduc);
5793 }
5794
5795 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5796 gcc_assert (vectype);
5797 mode = TYPE_MODE (vectype);
5798
5799 tree induc_val = NULL_TREE;
5800 tree adjustment_def = NULL;
5801 if (slp_node)
5802 ;
5803 else
5804 {
5805 /* Optimize: for induction condition reduction, if we can't use zero
5806 for induc_val, use initial_def. */
5807 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5808 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5809 else if (double_reduc)
5810 ;
5811 else
5812 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5813 }
5814
5815 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5816 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5817 if (slp_reduc)
5818 /* All statements produce live-out values. */
5819 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5820 else if (slp_node)
5821 {
5822 /* The last statement in the reduction chain produces the live-out
5823 value. Note SLP optimization can shuffle scalar stmts to
5824 optimize permutations so we have to search for the last stmt. */
5825 for (k = 0; k < group_size; ++k)
5826 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5827 {
5828 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5829 break;
5830 }
5831 }
5832
5833 unsigned vec_num;
5834 int ncopies;
5835 if (slp_node)
5836 {
5837 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5838 ncopies = 1;
5839 }
5840 else
5841 {
5842 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5843 vec_num = 1;
5844 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5845 }
5846
5847 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5848 which is updated with the current index of the loop for every match of
5849 the original loop's cond_expr (VEC_STMT). This results in a vector
5850 containing the last time the condition passed for that vector lane.
5851 The first match will be a 1 to allow 0 to be used for non-matching
5852 indexes. If there are no matches at all then the vector will be all
5853 zeroes.
5854
5855 PR92772: This algorithm is broken for architectures that support
5856 masked vectors, but do not provide fold_extract_last. */
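/* Illustrative example with made-up values: with VF = 4, if the
   condition last held at loop indexes 5, never, 7 and 4 in the four
   lanes, INDEX_COND_EXPR ends up as {5, 0, 7, 4}; the epilogue code
   further down reduces that vector with a MAX to obtain 7 and selects
   the data value recorded in the corresponding lane.  */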
5857 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5858 {
5859 auto_vec<std::pair<tree, bool>, 2> ccompares;
5860 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5861 cond_info = vect_stmt_to_vectorize (cond_info);
5862 while (cond_info != reduc_info)
5863 {
5864 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5865 {
5866 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5867 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5868 ccompares.safe_push
5869 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5870 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5871 }
5872 cond_info
5873 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5874 1 + STMT_VINFO_REDUC_IDX
5875 (cond_info)));
5876 cond_info = vect_stmt_to_vectorize (cond_info);
5877 }
5878 gcc_assert (ccompares.length () != 0);
5879
5880 tree indx_before_incr, indx_after_incr;
5881 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5882 int scalar_precision
5883 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5884 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5885 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5886 (TYPE_MODE (vectype), cr_index_scalar_type,
5887 TYPE_VECTOR_SUBPARTS (vectype));
5888
5889 /* First we create a simple vector induction variable which starts
5890 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5891 vector size (STEP). */
5892
5893 /* Create a {1,2,3,...} vector. */
5894 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5895
5896 /* Create a vector of the step value. */
5897 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5898 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5899
5900 /* Create an induction variable. */
5901 gimple_stmt_iterator incr_gsi;
5902 bool insert_after;
5903 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5904 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5905 insert_after, &indx_before_incr, &indx_after_incr);
5906
5907 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5908 filled with zeros (VEC_ZERO). */
5909
5910 /* Create a vector of 0s. */
5911 tree zero = build_zero_cst (cr_index_scalar_type);
5912 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5913
5914 /* Create a vector phi node. */
5915 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5916 new_phi = create_phi_node (new_phi_tree, loop->header);
5917 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5918 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5919
5920 /* Now take the condition from the loop's original cond_exprs
5921 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5922 every match uses values from the induction variable
5923 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5924 (NEW_PHI_TREE).
5925 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5926 the new cond_expr (INDEX_COND_EXPR). */
5927 gimple_seq stmts = NULL;
5928 for (int i = ccompares.length () - 1; i != -1; --i)
5929 {
5930 tree ccompare = ccompares[i].first;
5931 if (ccompares[i].second)
5932 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5933 cr_index_vector_type,
5934 ccompare,
5935 indx_before_incr, new_phi_tree);
5936 else
5937 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5938 cr_index_vector_type,
5939 ccompare,
5940 new_phi_tree, indx_before_incr);
5941 }
5942 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5943
5944 /* Update the phi with the vec cond. */
5945 induction_index = new_phi_tree;
5946 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5947 loop_latch_edge (loop), UNKNOWN_LOCATION);
5948 }
5949
5950 /* 2. Create epilog code.
5951 The reduction epilog code operates across the elements of the vector
5952 of partial results computed by the vectorized loop.
5953 The reduction epilog code consists of:
5954
5955 step 1: compute the scalar result in a vector (v_out2)
5956 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5957 step 3: adjust the scalar result (s_out3) if needed.
5958
5959 Step 1 can be accomplished using one of the following three schemes:
5960 (scheme 1) using reduc_fn, if available.
5961 (scheme 2) using whole-vector shifts, if available.
5962 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5963 combined.
5964
5965 The overall epilog code looks like this:
5966
5967 s_out0 = phi <s_loop> # original EXIT_PHI
5968 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5969 v_out2 = reduce <v_out1> # step 1
5970 s_out3 = extract_field <v_out2, 0> # step 2
5971 s_out4 = adjust_result <s_out3> # step 3
5972
5973 (step 3 is optional, and steps 1 and 2 may be combined).
5974 Lastly, the uses of s_out0 are replaced by s_out4. */
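/* Sketch of scheme 2 for an invented four-element vector {a, b, c, d}
   and a sum reduction: shift by two elements and add, giving
   {a+c, b+d, ...}; shift the result by one element and add, giving
   {a+b+c+d, ...}; then extract element 0.  That is log2(4) = 2
   shift/add pairs followed by a single scalar extract, which is what
   vect_model_reduction_cost charges for this scheme.  */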
5975
5976
5977 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5978 v_out1 = phi <VECT_DEF>
5979 Store them in NEW_PHIS. */
5980 if (double_reduc)
5981 loop = outer_loop;
5982 exit_bb = single_exit (loop)->dest;
5983 exit_gsi = gsi_after_labels (exit_bb);
5984 reduc_inputs.create (slp_node ? vec_num : ncopies);
5985 for (unsigned i = 0; i < vec_num; i++)
5986 {
5987 gimple_seq stmts = NULL;
5988 if (slp_node)
5989 def = vect_get_slp_vect_def (slp_node, i);
5990 else
5991 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5992 for (j = 0; j < ncopies; j++)
5993 {
5994 tree new_def = copy_ssa_name (def);
5995 phi = create_phi_node (new_def, exit_bb);
5996 if (j)
5997 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5998 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5999 new_def = gimple_convert (&stmts, vectype, new_def);
6000 reduc_inputs.quick_push (new_def);
6001 }
6002 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6003 }
6004
6005 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6006 (i.e. when reduc_fn is not available) and in the final adjustment
6007 code (if needed). Also get the original scalar reduction variable as
6008 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6009 represents a reduction pattern), the tree-code and scalar-def are
6010 taken from the original stmt that the pattern-stmt (STMT) replaces.
6011 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6012 are taken from STMT. */
6013
6014 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6015 if (orig_stmt_info != stmt_info)
6016 {
6017 /* Reduction pattern */
6018 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6019 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6020 }
6021
6022 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6023 scalar_type = TREE_TYPE (scalar_dest);
6024 scalar_results.truncate (0);
6025 scalar_results.reserve_exact (group_size);
6026 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6027 bitsize = TYPE_SIZE (scalar_type);
6028
6029 /* True if we should implement SLP_REDUC using native reduction operations
6030 instead of scalar operations. */
6031 direct_slp_reduc = (reduc_fn != IFN_LAST
6032 && slp_reduc
6033 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6034
6035 /* In case of reduction chain, e.g.,
6036 # a1 = phi <a3, a0>
6037 a2 = operation (a1)
6038 a3 = operation (a2),
6039
6040 we may end up with more than one vector result. Here we reduce them
6041 to one vector.
6042
6043 The same is true for a SLP reduction, e.g.,
6044 # a1 = phi <a2, a0>
6045 # b1 = phi <b2, b0>
6046 a2 = operation (a1)
6047 b2 = operation (b1),
6048
6049 where we can end up with more than one vector as well. We can
6050 easily accumulate vectors when the number of vector elements is
6051 a multiple of the SLP group size.
6052
6053 The same is true if we couldn't use a single def-use cycle. */
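/* For instance, if a reduction chain produced two partial-result
   vectors v0 and v1 for a sum, the code below folds them into a single
   input, v0 + v1 (built with CODE), before the scalar epilogue is
   generated.  */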
6054 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6055 || direct_slp_reduc
6056 || (slp_reduc
6057 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6058 || ncopies > 1)
6059 {
6060 gimple_seq stmts = NULL;
6061 tree single_input = reduc_inputs[0];
6062 for (k = 1; k < reduc_inputs.length (); k++)
6063 single_input = gimple_build (&stmts, code, vectype,
6064 single_input, reduc_inputs[k]);
6065 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6066
6067 reduc_inputs.truncate (0);
6068 reduc_inputs.safe_push (single_input);
6069 }
6070
6071 tree orig_reduc_input = reduc_inputs[0];
6072
6073 /* If this loop is an epilogue loop that can be skipped after the
6074 main loop, we can only share a reduction operation between the
6075 main loop and the epilogue if we put it at the target of the
6076 skip edge.
6077
6078 We can still reuse accumulators if this check fails. Doing so has
6079 the minor(?) benefit of making the epilogue loop's scalar result
6080 independent of the main loop's scalar result. */
6081 bool unify_with_main_loop_p = false;
6082 if (reduc_info->reused_accumulator
6083 && loop_vinfo->skip_this_loop_edge
6084 && single_succ_p (exit_bb)
6085 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6086 {
6087 unify_with_main_loop_p = true;
6088
6089 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6090 reduc_inputs[0] = make_ssa_name (vectype);
6091 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6092 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6093 UNKNOWN_LOCATION);
6094 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6095 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6096 exit_gsi = gsi_after_labels (reduc_block);
6097 }
6098
6099 /* Shouldn't be used beyond this point. */
6100 exit_bb = nullptr;
6101
6102 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6103 && reduc_fn != IFN_LAST)
6104 {
6105 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6106 various data values where the condition matched and another vector
6107 (INDUCTION_INDEX) containing all the indexes of those matches. We
6108 need to extract the last matching index (which will be the index with
6109 highest value) and use this to index into the data vector.
6110 For the case where there were no matches, the data vector will contain
6111 all default values and the index vector will be all zeros. */
6112
6113 /* Get various versions of the type of the vector of indexes. */
6114 tree index_vec_type = TREE_TYPE (induction_index);
6115 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6116 tree index_scalar_type = TREE_TYPE (index_vec_type);
6117 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6118
6119 /* Get an unsigned integer version of the type of the data vector. */
6120 int scalar_precision
6121 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6122 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6123 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6124 vectype);
6125
6126 /* First we need to create a vector (ZERO_VEC) of zeros and another
6127 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6128 can create using a MAX reduction and then expanding.
6129 In the case where the loop never made any matches, the max index will
6130 be zero. */
6131
6132 /* Vector of {0, 0, 0,...}. */
6133 tree zero_vec = build_zero_cst (vectype);
6134
6135 /* Find maximum value from the vector of found indexes. */
6136 tree max_index = make_ssa_name (index_scalar_type);
6137 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6138 1, induction_index);
6139 gimple_call_set_lhs (max_index_stmt, max_index);
6140 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6141
6142 /* Vector of {max_index, max_index, max_index,...}. */
6143 tree max_index_vec = make_ssa_name (index_vec_type);
6144 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6145 max_index);
6146 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6147 max_index_vec_rhs);
6148 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6149
6150 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6151 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6152 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6153 otherwise. Only one value should match, resulting in a vector
6154 (VEC_COND) with one data value and the rest zeros.
6155 In the case where the loop never made any matches, every index will
6156 match, resulting in a vector with all data values (which will all be
6157 the default value). */
6158
6159 /* Compare the max index vector to the vector of found indexes to find
6160 the position of the max value. */
6161 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6162 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6163 induction_index,
6164 max_index_vec);
6165 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6166
6167 /* Use the compare to choose either values from the data vector or
6168 zero. */
6169 tree vec_cond = make_ssa_name (vectype);
6170 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6171 vec_compare,
6172 reduc_inputs[0],
6173 zero_vec);
6174 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6175
6176 /* Finally we need to extract the data value from the vector (VEC_COND)
6177 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6178 reduction, but because this doesn't exist, we can use a MAX reduction
6179 instead. The data value might be signed or a float so we need to cast
6180 it first.
6181 In the case where the loop never made any matches, the data values are
6182 all identical, and so will reduce down correctly. */
6183
6184 /* Make the matched data values unsigned. */
6185 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6186 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6187 vec_cond);
6188 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6189 VIEW_CONVERT_EXPR,
6190 vec_cond_cast_rhs);
6191 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6192
6193 /* Reduce down to a scalar value. */
6194 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6195 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6196 1, vec_cond_cast);
6197 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6198 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6199
6200 /* Convert the reduced value back to the result type and set as the
6201 result. */
6202 gimple_seq stmts = NULL;
6203 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6204 data_reduc);
6205 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6206 scalar_results.safe_push (new_temp);
6207 }
6208 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6209 && reduc_fn == IFN_LAST)
6210 {
6211 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6212 idx = 0;
6213 idx_val = induction_index[0];
6214 val = data_reduc[0];
6215 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6216 if (induction_index[i] > idx_val)
6217 val = data_reduc[i], idx_val = induction_index[i];
6218 return val; */
6219
6220 tree data_eltype = TREE_TYPE (vectype);
6221 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6222 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6223 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6224 /* Enforced by vectorizable_reduction, which ensures we have target
6225 support before allowing a conditional reduction on variable-length
6226 vectors. */
6227 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6228 tree idx_val = NULL_TREE, val = NULL_TREE;
6229 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6230 {
6231 tree old_idx_val = idx_val;
6232 tree old_val = val;
6233 idx_val = make_ssa_name (idx_eltype);
6234 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6235 build3 (BIT_FIELD_REF, idx_eltype,
6236 induction_index,
6237 bitsize_int (el_size),
6238 bitsize_int (off)));
6239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6240 val = make_ssa_name (data_eltype);
6241 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6242 build3 (BIT_FIELD_REF,
6243 data_eltype,
6244 reduc_inputs[0],
6245 bitsize_int (el_size),
6246 bitsize_int (off)));
6247 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6248 if (off != 0)
6249 {
6250 tree new_idx_val = idx_val;
6251 if (off != v_size - el_size)
6252 {
6253 new_idx_val = make_ssa_name (idx_eltype);
6254 epilog_stmt = gimple_build_assign (new_idx_val,
6255 MAX_EXPR, idx_val,
6256 old_idx_val);
6257 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6258 }
6259 tree cond = make_ssa_name (boolean_type_node);
6260 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6261 idx_val, old_idx_val);
6262 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6263 tree new_val = make_ssa_name (data_eltype);
6264 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6265 cond, val, old_val);
6266 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6267 idx_val = new_idx_val;
6268 val = new_val;
6269 }
6270 }
6271 /* Convert the reduced value back to the result type and set as the
6272 result. */
6273 gimple_seq stmts = NULL;
6274 val = gimple_convert (&stmts, scalar_type, val);
6275 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6276 scalar_results.safe_push (val);
6277 }
6278
6279 /* 2.3 Create the reduction code, using one of the three schemes described
6280 above. In SLP we simply need to extract all the elements from the
6281 vector (without reducing them), so we use scalar shifts. */
6282 else if (reduc_fn != IFN_LAST && !slp_reduc)
6283 {
6284 tree tmp;
6285 tree vec_elem_type;
6286
6287 /* Case 1: Create:
6288 v_out2 = reduc_expr <v_out1> */
6289
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_NOTE, vect_location,
6292 "Reduce using direct vector reduction.\n");
6293
6294 gimple_seq stmts = NULL;
6295 vec_elem_type = TREE_TYPE (vectype);
6296 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6297 vec_elem_type, reduc_inputs[0]);
6298 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6299 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6300
6301 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6302 && induc_val)
6303 {
6304 /* Earlier we set the initial value to be a vector of induc_val
6305 values. Check the result and if it is induc_val then replace
6306 with the original initial value, unless induc_val is
6307 the same as initial_def already. */
6308 tree zcompare = make_ssa_name (boolean_type_node);
6309 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6310 new_temp, induc_val);
6311 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6312 tree initial_def = reduc_info->reduc_initial_values[0];
6313 tmp = make_ssa_name (new_scalar_dest);
6314 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6315 initial_def, new_temp);
6316 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6317 new_temp = tmp;
6318 }
6319
6320 scalar_results.safe_push (new_temp);
6321 }
6322 else if (direct_slp_reduc)
6323 {
6324 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6325 with the elements for other SLP statements replaced with the
6326 neutral value. We can then do a normal reduction on each vector. */
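/* Hypothetical illustration: with GROUP_SIZE = 2 and a four-lane input
   {a0, b0, a1, b1}, the loop below builds {a0, I, a1, I} for the first
   result and {I, b0, I, b1} for the second, where I is the neutral or
   initial value, and then reduces each of those vectors normally.  */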
6327
6328 /* Enforced by vectorizable_reduction. */
6329 gcc_assert (reduc_inputs.length () == 1);
6330 gcc_assert (pow2p_hwi (group_size));
6331
6332 gimple_seq seq = NULL;
6333
6334 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6335 and the same element size as VECTYPE. */
6336 tree index = build_index_vector (vectype, 0, 1);
6337 tree index_type = TREE_TYPE (index);
6338 tree index_elt_type = TREE_TYPE (index_type);
6339 tree mask_type = truth_type_for (index_type);
6340
6341 /* Create a vector that, for each element, identifies which of
6342 the REDUC_GROUP_SIZE results should use it. */
6343 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6344 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6345 build_vector_from_val (index_type, index_mask));
6346
6347 /* Get a neutral vector value. This is simply a splat of the neutral
6348 scalar value if we have one, otherwise the initial scalar value
6349 is itself a neutral value. */
6350 tree vector_identity = NULL_TREE;
6351 tree neutral_op = NULL_TREE;
6352 if (slp_node)
6353 {
6354 tree initial_value = NULL_TREE;
6355 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6356 initial_value = reduc_info->reduc_initial_values[0];
6357 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6358 initial_value);
6359 }
6360 if (neutral_op)
6361 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6362 neutral_op);
6363 for (unsigned int i = 0; i < group_size; ++i)
6364 {
6365 /* If there's no universal neutral value, we can use the
6366 initial scalar value from the original PHI. This is used
6367 for MIN and MAX reduction, for example. */
6368 if (!neutral_op)
6369 {
6370 tree scalar_value = reduc_info->reduc_initial_values[i];
6371 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6372 scalar_value);
6373 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6374 scalar_value);
6375 }
6376
6377 /* Calculate the equivalent of:
6378
6379 sel[j] = (index[j] == i);
6380
6381 which selects the elements of REDUC_INPUTS[0] that should
6382 be included in the result. */
6383 tree compare_val = build_int_cst (index_elt_type, i);
6384 compare_val = build_vector_from_val (index_type, compare_val);
6385 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6386 index, compare_val);
6387
6388 /* Calculate the equivalent of:
6389
6390 vec = seq ? reduc_inputs[0] : vector_identity;
6391
6392 VEC is now suitable for a full vector reduction. */
6393 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6394 sel, reduc_inputs[0], vector_identity);
6395
6396 /* Do the reduction and convert it to the appropriate type. */
6397 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6398 TREE_TYPE (vectype), vec);
6399 scalar = gimple_convert (&seq, scalar_type, scalar);
6400 scalar_results.safe_push (scalar);
6401 }
6402 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6403 }
6404 else
6405 {
6406 bool reduce_with_shift;
6407 tree vec_temp;
6408
6409 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6410
6411 /* See if the target wants to do the final (shift) reduction
6412 in a vector mode of smaller size and first reduce upper/lower
6413 halves against each other. */
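 /* Purely illustrative example (the actual choice is up to the target hook):
 if REDUC_INPUTS[0] is a 256-bit V8SI and split_reduction prefers a 128-bit
 V4SI, vect_create_partial_epilog below first combines the two V4SI halves
 with CODE, so the shift reduction only has to handle four elements instead
 of eight. */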
6414 enum machine_mode mode1 = mode;
6415 tree stype = TREE_TYPE (vectype);
6416 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6417 unsigned nunits1 = nunits;
6418 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6419 && reduc_inputs.length () == 1)
6420 {
6421 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6422 /* For SLP reductions we have to make sure lanes match up, but
6423 since we're doing an individual-element final reduction, reducing
6424 the vector width here is even more important.
6425 ??? We can also separate lanes with permutes; for the common
6426 case of a power-of-two group size, odd/even extracts would work. */
6427 if (slp_reduc && nunits != nunits1)
6428 {
6429 nunits1 = least_common_multiple (nunits1, group_size);
6430 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6431 }
6432 }
6433 if (!slp_reduc
6434 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6435 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6436
6437 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6438 stype, nunits1);
6439 reduce_with_shift = have_whole_vector_shift (mode1);
6440 if (!VECTOR_MODE_P (mode1)
6441 || !directly_supported_p (code, vectype1))
6442 reduce_with_shift = false;
6443
6444 /* First reduce the vector to the desired vector size on which we
6445 should do the shift reduction, by combining upper and lower halves. */
6446 gimple_seq stmts = NULL;
6447 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6448 code, &stmts);
6449 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6450 reduc_inputs[0] = new_temp;
6451
6452 if (reduce_with_shift && !slp_reduc)
6453 {
6454 int element_bitsize = tree_to_uhwi (bitsize);
6455 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6456 for variable-length vectors and also requires direct target support
6457 for loop reductions. */
6458 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6459 int nelements = vec_size_in_bits / element_bitsize;
6460 vec_perm_builder sel;
6461 vec_perm_indices indices;
6462
6463 int elt_offset;
6464
6465 tree zero_vec = build_zero_cst (vectype1);
6466 /* Case 2: Create:
6467 for (offset = nelements/2; offset >= 1; offset/=2)
6468 {
6469 Create: va' = vec_shift <va, offset>
6470 Create: va = vop <va, va'>
6471 } */
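 /* A worked instance of the scheme above (illustrative only, shown for
 PLUS with nelements == 4 and input v): the first iteration combines v
 with a copy shifted by two elements, giving partial sums such as
 { v[0]+v[2], v[1]+v[3], ... }; the second iteration shifts by one and
 leaves the full sum v[0]+v[1]+v[2]+v[3] in element 0, which is what the
 BIT_FIELD_REF extraction below reads. */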
6472
6473 tree rhs;
6474
6475 if (dump_enabled_p ())
6476 dump_printf_loc (MSG_NOTE, vect_location,
6477 "Reduce using vector shifts\n");
6478
6479 gimple_seq stmts = NULL;
6480 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6481 for (elt_offset = nelements / 2;
6482 elt_offset >= 1;
6483 elt_offset /= 2)
6484 {
6485 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6486 indices.new_vector (sel, 2, nelements);
6487 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6488 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6489 new_temp, zero_vec, mask);
6490 new_temp = gimple_build (&stmts, code,
6491 vectype1, new_name, new_temp);
6492 }
6493 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6494
6495 /* 2.4 Extract the final scalar result. Create:
6496 s_out3 = extract_field <v_out2, bitpos> */
6497
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_NOTE, vect_location,
6500 "extract scalar result\n");
6501
6502 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6503 bitsize, bitsize_zero_node);
6504 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6505 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6506 gimple_assign_set_lhs (epilog_stmt, new_temp);
6507 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6508 scalar_results.safe_push (new_temp);
6509 }
6510 else
6511 {
6512 /* Case 3: Create:
6513 s = extract_field <v_out2, 0>
6514 for (offset = element_size;
6515 offset < vector_size;
6516 offset += element_size;)
6517 {
6518 Create: s' = extract_field <v_out2, offset>
6519 Create: s = op <s, s'> // For non SLP cases
6520 } */
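 /* For example (illustrative only), with a four-element v_out2 and a
 non-SLP PLUS reduction the loop below emits
 s = v_out2[0]; s = s + v_out2[1]; s = s + v_out2[2]; s = s + v_out2[3];
 whereas for SLP each extracted element is pushed into SCALAR_RESULTS
 without being combined. */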
6521
6522 if (dump_enabled_p ())
6523 dump_printf_loc (MSG_NOTE, vect_location,
6524 "Reduce using scalar code.\n");
6525
6526 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6527 int element_bitsize = tree_to_uhwi (bitsize);
6528 tree compute_type = TREE_TYPE (vectype);
6529 gimple_seq stmts = NULL;
6530 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6531 {
6532 int bit_offset;
6533 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6534 vec_temp, bitsize, bitsize_zero_node);
6535
6536 /* In SLP we don't need to apply the reduction operation, so we just
6537 collect the s' values in SCALAR_RESULTS. */
6538 if (slp_reduc)
6539 scalar_results.safe_push (new_temp);
6540
6541 for (bit_offset = element_bitsize;
6542 bit_offset < vec_size_in_bits;
6543 bit_offset += element_bitsize)
6544 {
6545 tree bitpos = bitsize_int (bit_offset);
6546 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6547 compute_type, vec_temp,
6548 bitsize, bitpos);
6549 if (slp_reduc)
6550 {
6551 /* In SLP we don't need to apply the reduction operation, so
6552 we just collect the s' values in SCALAR_RESULTS. */
6553 new_temp = new_name;
6554 scalar_results.safe_push (new_name);
6555 }
6556 else
6557 new_temp = gimple_build (&stmts, code, compute_type,
6558 new_name, new_temp);
6559 }
6560 }
6561
6562 /* The only case where we need to reduce scalar results in SLP is
6563 unrolling. If the size of SCALAR_RESULTS is greater than
6564 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6565 REDUC_GROUP_SIZE. */
6566 if (slp_reduc)
6567 {
6568 tree res, first_res, new_res;
6569
6570 /* Reduce multiple scalar results in case of SLP unrolling. */
6571 for (j = group_size; scalar_results.iterate (j, &res);
6572 j++)
6573 {
6574 first_res = scalar_results[j % group_size];
6575 new_res = gimple_build (&stmts, code, compute_type,
6576 first_res, res);
6577 scalar_results[j % group_size] = new_res;
6578 }
6579 scalar_results.truncate (group_size);
6580 for (k = 0; k < group_size; k++)
6581 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6582 scalar_results[k]);
6583 }
6584 else
6585 {
6586 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6587 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6588 scalar_results.safe_push (new_temp);
6589 }
6590
6591 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6592 }
6593
6594 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6595 && induc_val)
6596 {
6597 /* Earlier we set the initial value to be a vector of induc_val
6598 values. Check the result and if it is induc_val then replace
6599 it with the original initial value, unless induc_val is
6600 the same as initial_def already. */
6601 tree zcompare = make_ssa_name (boolean_type_node);
6602 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6603 induc_val);
6604 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6605 tree initial_def = reduc_info->reduc_initial_values[0];
6606 tree tmp = make_ssa_name (new_scalar_dest);
6607 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6608 initial_def, new_temp);
6609 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6610 scalar_results[0] = tmp;
6611 }
6612 }
6613
6614 /* 2.5 Adjust the final result by the initial value of the reduction
6615 variable. (When such adjustment is not needed, then
6616 'adjustment_def' is zero). For example, if code is PLUS we create:
6617 new_temp = loop_exit_def + adjustment_def */
6618
6619 if (adjustment_def)
6620 {
6621 gcc_assert (!slp_reduc);
6622 gimple_seq stmts = NULL;
6623 if (double_reduc)
6624 {
6625 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6626 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6627 new_temp = gimple_build (&stmts, code, vectype,
6628 reduc_inputs[0], adjustment_def);
6629 }
6630 else
6631 {
6632 new_temp = scalar_results[0];
6633 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6634 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6635 adjustment_def);
6636 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6637 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6638 new_temp, adjustment_def);
6639 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6640 }
6641
6642 epilog_stmt = gimple_seq_last_stmt (stmts);
6643 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6644 scalar_results[0] = new_temp;
6645 }
6646
6647 /* Record this operation if it could be reused by the epilogue loop. */
6648 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6649 && reduc_inputs.length () == 1)
6650 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6651 { orig_reduc_input, reduc_info });
6652
6653 if (double_reduc)
6654 loop = outer_loop;
6655
6656 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6657 phis with new adjusted scalar results, i.e., replace use <s_out0>
6658 with use <s_out4>.
6659
6660 Transform:
6661 loop_exit:
6662 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6663 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6664 v_out2 = reduce <v_out1>
6665 s_out3 = extract_field <v_out2, 0>
6666 s_out4 = adjust_result <s_out3>
6667 use <s_out0>
6668 use <s_out0>
6669
6670 into:
6671
6672 loop_exit:
6673 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6674 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6675 v_out2 = reduce <v_out1>
6676 s_out3 = extract_field <v_out2, 0>
6677 s_out4 = adjust_result <s_out3>
6678 use <s_out4>
6679 use <s_out4> */
6680
6681 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6682 for (k = 0; k < live_out_stmts.size (); k++)
6683 {
6684 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6685 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6686
6687 phis.create (3);
6688 /* Find the loop-closed-use at the loop exit of the original scalar
6689 result. (The reduction result is expected to have two immediate uses,
6690 one at the latch block, and one at the loop exit). For double
6691 reductions we are looking for exit phis of the outer loop. */
6692 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6693 {
6694 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6695 {
6696 if (!is_gimple_debug (USE_STMT (use_p)))
6697 phis.safe_push (USE_STMT (use_p));
6698 }
6699 else
6700 {
6701 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6702 {
6703 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6704
6705 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6706 {
6707 if (!flow_bb_inside_loop_p (loop,
6708 gimple_bb (USE_STMT (phi_use_p)))
6709 && !is_gimple_debug (USE_STMT (phi_use_p)))
6710 phis.safe_push (USE_STMT (phi_use_p));
6711 }
6712 }
6713 }
6714 }
6715
6716 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6717 {
6718 /* Replace the uses: */
6719 orig_name = PHI_RESULT (exit_phi);
6720
6721 /* Look for a single use at the target of the skip edge. */
6722 if (unify_with_main_loop_p)
6723 {
6724 use_operand_p use_p;
6725 gimple *user;
6726 if (!single_imm_use (orig_name, &use_p, &user))
6727 gcc_unreachable ();
6728 orig_name = gimple_get_lhs (user);
6729 }
6730
6731 scalar_result = scalar_results[k];
6732 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6733 {
6734 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6735 SET_USE (use_p, scalar_result);
6736 update_stmt (use_stmt);
6737 }
6738 }
6739
6740 phis.release ();
6741 }
6742 }
6743
6744 /* Return a vector of type VECTYPE that is equal to the vector select
6745 operation "MASK ? VEC : IDENTITY". Insert the select statements
6746 before GSI. */
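/* For instance (illustrative only): with MASK = { 1, 0, 1, 0 },
 VEC = { a, b, c, d } and IDENTITY = { 0, 0, 0, 0 } the returned
 SSA name holds { a, 0, c, 0 }. */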
6747
6748 static tree
6749 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6750 tree vec, tree identity)
6751 {
6752 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6753 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6754 mask, vec, identity);
6755 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6756 return cond;
6757 }
6758
6759 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6760 order, starting with LHS. Insert the extraction statements before GSI and
6761 associate the new scalar SSA names with variable SCALAR_DEST.
6762 Return the SSA name for the result. */
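/* Illustrative expansion (not from a testcase): for a four-element
 VECTOR_RHS v this emits one BIT_FIELD_REF extraction per element and
 computes
 (((LHS code v[0]) code v[1]) code v[2]) code v[3]
 preserving the original left-to-right association. */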
6763
6764 static tree
6765 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6766 tree_code code, tree lhs, tree vector_rhs)
6767 {
6768 tree vectype = TREE_TYPE (vector_rhs);
6769 tree scalar_type = TREE_TYPE (vectype);
6770 tree bitsize = TYPE_SIZE (scalar_type);
6771 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6772 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6773
6774 for (unsigned HOST_WIDE_INT bit_offset = 0;
6775 bit_offset < vec_size_in_bits;
6776 bit_offset += element_bitsize)
6777 {
6778 tree bitpos = bitsize_int (bit_offset);
6779 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6780 bitsize, bitpos);
6781
6782 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6783 rhs = make_ssa_name (scalar_dest, stmt);
6784 gimple_assign_set_lhs (stmt, rhs);
6785 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6786
6787 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6788 tree new_name = make_ssa_name (scalar_dest, stmt);
6789 gimple_assign_set_lhs (stmt, new_name);
6790 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6791 lhs = new_name;
6792 }
6793 return lhs;
6794 }
6795
6796 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6797 type of the vector input. */
6798
6799 static internal_fn
6800 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6801 {
6802 internal_fn mask_reduc_fn;
6803
6804 switch (reduc_fn)
6805 {
6806 case IFN_FOLD_LEFT_PLUS:
6807 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6808 break;
6809
6810 default:
6811 return IFN_LAST;
6812 }
6813
6814 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6815 OPTIMIZE_FOR_SPEED))
6816 return mask_reduc_fn;
6817 return IFN_LAST;
6818 }
6819
6820 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6821 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6822 statement. CODE is the operation performed by STMT_INFO and OPS are
6823 its scalar operands. REDUC_INDEX is the index of the operand in
6824 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6825 implements in-order reduction, or IFN_LAST if we should open-code it.
6826 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6827 that should be used to control the operation in a fully-masked loop. */
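/* A rough sketch of the generated code (illustrative only): for an in-order
 float addition in a fully-masked loop with a direct IFN_MASK_FOLD_LEFT_PLUS,
 each vector iteration becomes
 reduc_var = MASK_FOLD_LEFT_PLUS (reduc_var, vec_def, loop_mask);
 without a masked variant the vector operand is first merged with a zero
 identity under the mask, and when no direct reduction function exists at
 all it is expanded element by element via vect_expand_fold_left, which
 keeps the sequential order. */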
6828
6829 static bool
6830 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6831 stmt_vec_info stmt_info,
6832 gimple_stmt_iterator *gsi,
6833 gimple **vec_stmt, slp_tree slp_node,
6834 gimple *reduc_def_stmt,
6835 tree_code code, internal_fn reduc_fn,
6836 tree ops[3], tree vectype_in,
6837 int reduc_index, vec_loop_masks *masks)
6838 {
6839 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6840 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6841 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6842
6843 int ncopies;
6844 if (slp_node)
6845 ncopies = 1;
6846 else
6847 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6848
6849 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6850 gcc_assert (ncopies == 1);
6851 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6852
6853 if (slp_node)
6854 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6855 TYPE_VECTOR_SUBPARTS (vectype_in)));
6856
6857 tree op0 = ops[1 - reduc_index];
6858
6859 int group_size = 1;
6860 stmt_vec_info scalar_dest_def_info;
6861 auto_vec<tree> vec_oprnds0;
6862 if (slp_node)
6863 {
6864 auto_vec<vec<tree> > vec_defs (2);
6865 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6866 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6867 vec_defs[0].release ();
6868 vec_defs[1].release ();
6869 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6870 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6871 }
6872 else
6873 {
6874 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6875 op0, &vec_oprnds0);
6876 scalar_dest_def_info = stmt_info;
6877 }
6878
6879 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6880 tree scalar_type = TREE_TYPE (scalar_dest);
6881 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6882
6883 int vec_num = vec_oprnds0.length ();
6884 gcc_assert (vec_num == 1 || slp_node);
6885 tree vec_elem_type = TREE_TYPE (vectype_out);
6886 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6887
6888 tree vector_identity = NULL_TREE;
6889 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6890 vector_identity = build_zero_cst (vectype_out);
6891
6892 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6893 int i;
6894 tree def0;
6895 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6896 {
6897 gimple *new_stmt;
6898 tree mask = NULL_TREE;
6899 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6900 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6901
6902 /* Handle MINUS by adding the negative. */
6903 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6904 {
6905 tree negated = make_ssa_name (vectype_out);
6906 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6907 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6908 def0 = negated;
6909 }
6910
6911 if (mask && mask_reduc_fn == IFN_LAST)
6912 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6913 vector_identity);
6914
6915 /* On the first iteration the input is simply the scalar phi
6916 result, and for subsequent iterations it is the output of
6917 the preceding operation. */
6918 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6919 {
6920 if (mask && mask_reduc_fn != IFN_LAST)
6921 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6922 def0, mask);
6923 else
6924 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6925 def0);
6926 /* For chained SLP reductions the output of the previous reduction
6927 operation serves as the input of the next. For the final statement
6928 the output cannot be a temporary - we reuse the original
6929 scalar destination of the last statement. */
6930 if (i != vec_num - 1)
6931 {
6932 gimple_set_lhs (new_stmt, scalar_dest_var);
6933 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6934 gimple_set_lhs (new_stmt, reduc_var);
6935 }
6936 }
6937 else
6938 {
6939 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6940 reduc_var, def0);
6941 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6942 /* Remove the statement, so that we can use the same code paths
6943 as for statements that we've just created. */
6944 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6945 gsi_remove (&tmp_gsi, true);
6946 }
6947
6948 if (i == vec_num - 1)
6949 {
6950 gimple_set_lhs (new_stmt, scalar_dest);
6951 vect_finish_replace_stmt (loop_vinfo,
6952 scalar_dest_def_info,
6953 new_stmt);
6954 }
6955 else
6956 vect_finish_stmt_generation (loop_vinfo,
6957 scalar_dest_def_info,
6958 new_stmt, gsi);
6959
6960 if (slp_node)
6961 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6962 else
6963 {
6964 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6965 *vec_stmt = new_stmt;
6966 }
6967 }
6968
6969 return true;
6970 }
6971
6972 /* Function is_nonwrapping_integer_induction.
6973
6974 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
6975 both increments and does not cause overflow. */
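/* A worked example (illustrative only): for an unsigned 16-bit IV with
 base 0 and step 1, a loop that executes at most 60000 times yields a
 maximum value of 60000, which fits in 16 bits and is accepted; a bound
 of 70000 would need 17 bits and the function returns false. Types with
 undefined overflow are accepted without computing the bound. */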
6976
6977 static bool
6978 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6979 {
6980 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6981 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6982 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6983 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6984 widest_int ni, max_loop_value, lhs_max;
6985 wi::overflow_type overflow = wi::OVF_NONE;
6986
6987 /* Make sure the loop is integer based. */
6988 if (TREE_CODE (base) != INTEGER_CST
6989 || TREE_CODE (step) != INTEGER_CST)
6990 return false;
6991
6992 /* Check that the max size of the loop will not wrap. */
6993
6994 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6995 return true;
6996
6997 if (! max_stmt_executions (loop, &ni))
6998 return false;
6999
7000 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7001 &overflow);
7002 if (overflow)
7003 return false;
7004
7005 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7006 TYPE_SIGN (lhs_type), &overflow);
7007 if (overflow)
7008 return false;
7009
7010 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7011 <= TYPE_PRECISION (lhs_type));
7012 }
7013
7014 /* Check if masking can be supported by inserting a conditional expression.
7015 CODE is the code for the operation. COND_FN is the conditional internal
7016 function, if it exists. VECTYPE_IN is the type of the vector input. */
7017 static bool
7018 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7019 tree vectype_in)
7020 {
7021 if (cond_fn != IFN_LAST
7022 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7023 OPTIMIZE_FOR_SPEED))
7024 return false;
7025
7026 if (code.is_tree_code ())
7027 switch (tree_code (code))
7028 {
7029 case DOT_PROD_EXPR:
7030 case SAD_EXPR:
7031 return true;
7032
7033 default:
7034 break;
7035 }
7036 return false;
7037 }
7038
7039 /* Insert a conditional expression to enable masked vectorization. CODE is the
7040 code for the operation. VOP is the array of operands. MASK is the loop
7041 mask. GSI is a statement iterator used to place the new conditional
7042 expression. */
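/* For example (illustrative only): for DOT_PROD_EXPR <a, b, acc> the
 second operand becomes MASK ? b : 0, so inactive lanes add nothing to
 the accumulator; for SAD_EXPR <a, b, acc> it becomes MASK ? b : a, so
 inactive lanes contribute |a - a| == 0. */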
7043 static void
7044 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7045 gimple_stmt_iterator *gsi)
7046 {
7047 switch (tree_code (code))
7048 {
7049 case DOT_PROD_EXPR:
7050 {
7051 tree vectype = TREE_TYPE (vop[1]);
7052 tree zero = build_zero_cst (vectype);
7053 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7054 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7055 mask, vop[1], zero);
7056 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7057 vop[1] = masked_op1;
7058 break;
7059 }
7060
7061 case SAD_EXPR:
7062 {
7063 tree vectype = TREE_TYPE (vop[1]);
7064 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7065 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7066 mask, vop[1], vop[0]);
7067 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7068 vop[1] = masked_op1;
7069 break;
7070 }
7071
7072 default:
7073 gcc_unreachable ();
7074 }
7075 }
7076
7077 /* Function vectorizable_reduction.
7078
7079 Check if STMT_INFO performs a reduction operation that can be vectorized.
7080 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7081 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7082 Return true if STMT_INFO is vectorizable in this way.
7083
7084 This function also handles reduction idioms (patterns) that have been
7085 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7086 may be of this form:
7087 X = pattern_expr (arg0, arg1, ..., X)
7088 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7089 sequence that had been detected and replaced by the pattern-stmt
7090 (STMT_INFO).
7091
7092 This function also handles reduction of condition expressions, for example:
7093 for (int i = 0; i < N; i++)
7094 if (a[i] < value)
7095 last = a[i];
7096 This is handled by vectorising the loop and creating an additional vector
7097 containing the loop indexes for which "a[i] < value" was true. In the
7098 function epilogue this is reduced to a single max value and then used to
7099 index into the vector of results.
7100
7101 In some cases of reduction patterns, the type of the reduction variable X is
7102 different than the type of the other arguments of STMT_INFO.
7103 In such cases, the vectype that is used when transforming STMT_INFO into
7104 a vector stmt is different than the vectype that is used to determine the
7105 vectorization factor, because it consists of a different number of elements
7106 than the actual number of elements that are being operated upon in parallel.
7107
7108 For example, consider an accumulation of shorts into an int accumulator.
7109 On some targets it's possible to vectorize this pattern operating on 8
7110 shorts at a time (hence, the vectype for purposes of determining the
7111 vectorization factor should be V8HI); on the other hand, the vectype that
7112 is used to create the vector form is actually V4SI (the type of the result).
7113
7114 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7115 indicates what is the actual level of parallelism (V8HI in the example), so
7116 that the right vectorization factor would be derived. This vectype
7117 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7118 be used to create the vectorized stmt. The right vectype for the vectorized
7119 stmt is obtained from the type of the result X:
7120 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7121
7122 This means that, contrary to "regular" reductions (or "regular" stmts in
7123 general), the following equation:
7124 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7125 does *NOT* necessarily hold for reduction patterns. */
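/* A concrete sketch of the shorts-into-int example above (illustrative,
 not from any particular testcase):

 short a[N]; int sum = 0;
 for (int i = 0; i < N; i++)
 sum += a[i]; <-- recognized as sum = WIDEN_SUM_EXPR <a[i], sum>

 Here STMT_VINFO_VECTYPE of the pattern stmt is V8HI, since eight shorts
 per vector iteration determine the vectorization factor, while the
 vectorized statement itself is created with the V4SI type derived from
 get_vectype_for_scalar_type (vinfo, TREE_TYPE (sum)). */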
7126
7127 bool
7128 vectorizable_reduction (loop_vec_info loop_vinfo,
7129 stmt_vec_info stmt_info, slp_tree slp_node,
7130 slp_instance slp_node_instance,
7131 stmt_vector_for_cost *cost_vec)
7132 {
7133 tree vectype_in = NULL_TREE;
7134 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7135 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7136 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7137 stmt_vec_info cond_stmt_vinfo = NULL;
7138 int i;
7139 int ncopies;
7140 bool single_defuse_cycle = false;
7141 bool nested_cycle = false;
7142 bool double_reduc = false;
7143 int vec_num;
7144 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7145 tree cond_reduc_val = NULL_TREE;
7146
7147 /* Make sure it was already recognized as a reduction computation. */
7148 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7149 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7150 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7151 return false;
7152
7153 /* The stmt we store reduction analysis meta on. */
7154 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7155 reduc_info->is_reduc_info = true;
7156
7157 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7158 {
7159 if (is_a <gphi *> (stmt_info->stmt))
7160 {
7161 if (slp_node)
7162 {
7163 /* We eventually need to set a vector type on invariant
7164 arguments. */
7165 unsigned j;
7166 slp_tree child;
7167 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7168 if (!vect_maybe_update_slp_op_vectype
7169 (child, SLP_TREE_VECTYPE (slp_node)))
7170 {
7171 if (dump_enabled_p ())
7172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7173 "incompatible vector types for "
7174 "invariants\n");
7175 return false;
7176 }
7177 }
7178 /* Analysis for double-reduction is done on the outer
7179 loop PHI, nested cycles have no further restrictions. */
7180 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7181 }
7182 else
7183 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7184 return true;
7185 }
7186
7187 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7188 stmt_vec_info phi_info = stmt_info;
7189 if (!is_a <gphi *> (stmt_info->stmt))
7190 {
7191 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7192 return true;
7193 }
7194 if (slp_node)
7195 {
7196 slp_node_instance->reduc_phis = slp_node;
7197 /* ??? We're leaving slp_node to point to the PHIs; we only
7198 need it to get at the number of vector stmts, which wasn't
7199 yet initialized for the instance root. */
7200 }
7201 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7202 {
7203 use_operand_p use_p;
7204 gimple *use_stmt;
7205 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7206 &use_p, &use_stmt);
7207 gcc_assert (res);
7208 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7209 }
7210
7211 /* PHIs should not participate in patterns. */
7212 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7213 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7214
7215 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7216 and compute the reduction chain length. Discover the real
7217 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7218 tree reduc_def
7219 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7220 loop_latch_edge
7221 (gimple_bb (reduc_def_phi)->loop_father));
7222 unsigned reduc_chain_length = 0;
7223 bool only_slp_reduc_chain = true;
7224 stmt_info = NULL;
7225 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7226 while (reduc_def != PHI_RESULT (reduc_def_phi))
7227 {
7228 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7229 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7230 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7231 {
7232 if (dump_enabled_p ())
7233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7234 "reduction chain broken by patterns.\n");
7235 return false;
7236 }
7237 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7238 only_slp_reduc_chain = false;
7239 /* For epilogue generation live members of the chain need
7240 to point back to the PHI via their original stmt for
7241 info_for_reduction to work. For SLP we need to look at
7242 all lanes here - even though we only will vectorize from
7243 the SLP node with live lane zero the other live lanes also
7244 need to be identified as part of a reduction to be able
7245 to skip code generation for them. */
7246 if (slp_for_stmt_info)
7247 {
7248 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7249 if (STMT_VINFO_LIVE_P (s))
7250 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7251 }
7252 else if (STMT_VINFO_LIVE_P (vdef))
7253 STMT_VINFO_REDUC_DEF (def) = phi_info;
7254 gimple_match_op op;
7255 if (!gimple_extract_op (vdef->stmt, &op))
7256 {
7257 if (dump_enabled_p ())
7258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7259 "reduction chain includes unsupported"
7260 " statement type.\n");
7261 return false;
7262 }
7263 if (CONVERT_EXPR_CODE_P (op.code))
7264 {
7265 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7266 {
7267 if (dump_enabled_p ())
7268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7269 "conversion in the reduction chain.\n");
7270 return false;
7271 }
7272 }
7273 else if (!stmt_info)
7274 /* First non-conversion stmt. */
7275 stmt_info = vdef;
7276 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7277 reduc_chain_length++;
7278 if (!stmt_info && slp_node)
7279 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7280 }
7281 /* PHIs should not participate in patterns. */
7282 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7283
7284 if (nested_in_vect_loop_p (loop, stmt_info))
7285 {
7286 loop = loop->inner;
7287 nested_cycle = true;
7288 }
7289
7290 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7291 element. */
7292 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7293 {
7294 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7295 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7296 }
7297 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7298 gcc_assert (slp_node
7299 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7300
7301 /* 1. Is vectorizable reduction? */
7302 /* Not supportable if the reduction variable is used in the loop, unless
7303 it's a reduction chain. */
7304 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7305 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7306 return false;
7307
7308 /* Reductions that are not used even in an enclosing outer-loop,
7309 are expected to be "live" (used out of the loop). */
7310 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7311 && !STMT_VINFO_LIVE_P (stmt_info))
7312 return false;
7313
7314 /* 2. Has this been recognized as a reduction pattern?
7315
7316 Check if STMT represents a pattern that has been recognized
7317 in earlier analysis stages. For stmts that represent a pattern,
7318 the STMT_VINFO_RELATED_STMT field records the last stmt in
7319 the original sequence that constitutes the pattern. */
7320
7321 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7322 if (orig_stmt_info)
7323 {
7324 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7325 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7326 }
7327
7328 /* 3. Check the operands of the operation. The first operands are defined
7329 inside the loop body. The last operand is the reduction variable,
7330 which is defined by the loop-header-phi. */
7331
7332 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7333 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7334 gimple_match_op op;
7335 if (!gimple_extract_op (stmt_info->stmt, &op))
7336 gcc_unreachable ();
7337 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7338 || op.code == WIDEN_SUM_EXPR
7339 || op.code == SAD_EXPR);
7340
7341 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7342 && !SCALAR_FLOAT_TYPE_P (op.type))
7343 return false;
7344
7345 /* Do not try to vectorize bit-precision reductions. */
7346 if (!type_has_mode_precision_p (op.type))
7347 return false;
7348
7349 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7350 which means the only use of the PHI may be in the lane-reducing operation. */
7351 if (lane_reduc_code_p
7352 && reduc_chain_length != 1
7353 && !only_slp_reduc_chain)
7354 {
7355 if (dump_enabled_p ())
7356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7357 "lane-reducing reduction with extra stmts.\n");
7358 return false;
7359 }
7360
7361 /* All uses but the last are expected to be defined in the loop.
7362 The last use is the reduction variable. In case of nested cycle this
7363 assumption is not true: we use reduc_index to record the index of the
7364 reduction variable. */
7365 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7366 /* We need to skip an extra operand for COND_EXPRs with embedded
7367 comparison. */
7368 unsigned opno_adjust = 0;
7369 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7370 opno_adjust = 1;
7371 for (i = 0; i < (int) op.num_ops; i++)
7372 {
7373 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7374 if (i == 0 && op.code == COND_EXPR)
7375 continue;
7376
7377 stmt_vec_info def_stmt_info;
7378 enum vect_def_type dt;
7379 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7380 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7381 &vectype_op[i], &def_stmt_info))
7382 {
7383 if (dump_enabled_p ())
7384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7385 "use not simple.\n");
7386 return false;
7387 }
7388 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7389 continue;
7390
7391 /* There should be only one cycle def in the stmt, the one
7392 leading to reduc_def. */
7393 if (VECTORIZABLE_CYCLE_DEF (dt))
7394 return false;
7395
7396 if (!vectype_op[i])
7397 vectype_op[i]
7398 = get_vectype_for_scalar_type (loop_vinfo,
7399 TREE_TYPE (op.ops[i]), slp_op[i]);
7400
7401 /* To properly compute ncopies we are interested in the widest
7402 non-reduction input type in case we're looking at a widening
7403 accumulation that we later handle in vect_transform_reduction. */
7404 if (lane_reduc_code_p
7405 && vectype_op[i]
7406 && (!vectype_in
7407 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7408 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7409 vectype_in = vectype_op[i];
7410
7411 if (op.code == COND_EXPR)
7412 {
7413 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7414 if (dt == vect_constant_def)
7415 {
7416 cond_reduc_dt = dt;
7417 cond_reduc_val = op.ops[i];
7418 }
7419 if (dt == vect_induction_def
7420 && def_stmt_info
7421 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7422 {
7423 cond_reduc_dt = dt;
7424 cond_stmt_vinfo = def_stmt_info;
7425 }
7426 }
7427 }
7428 if (!vectype_in)
7429 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7430 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7431
7432 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7433 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7434 /* If we have a condition reduction, see if we can simplify it further. */
7435 if (v_reduc_type == COND_REDUCTION)
7436 {
7437 if (slp_node)
7438 return false;
7439
7440 /* When the reduction value is used in the condition itself, fail. */
7441 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7442 {
7443 if (dump_enabled_p ())
7444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7445 "condition depends on previous iteration\n");
7446 return false;
7447 }
7448
7449 if (reduc_chain_length == 1
7450 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7451 vectype_in, OPTIMIZE_FOR_SPEED))
7452 {
7453 if (dump_enabled_p ())
7454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7455 "optimizing condition reduction with"
7456 " FOLD_EXTRACT_LAST.\n");
7457 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7458 }
7459 else if (cond_reduc_dt == vect_induction_def)
7460 {
7461 tree base
7462 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7463 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7464
7465 gcc_assert (TREE_CODE (base) == INTEGER_CST
7466 && TREE_CODE (step) == INTEGER_CST);
7467 cond_reduc_val = NULL_TREE;
7468 enum tree_code cond_reduc_op_code = ERROR_MARK;
7469 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7470 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7471 ;
7472 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7473 above base; punt if base is the minimum value of the type for
7474 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7475 else if (tree_int_cst_sgn (step) == -1)
7476 {
7477 cond_reduc_op_code = MIN_EXPR;
7478 if (tree_int_cst_sgn (base) == -1)
7479 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7480 else if (tree_int_cst_lt (base,
7481 TYPE_MAX_VALUE (TREE_TYPE (base))))
7482 cond_reduc_val
7483 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7484 }
7485 else
7486 {
7487 cond_reduc_op_code = MAX_EXPR;
7488 if (tree_int_cst_sgn (base) == 1)
7489 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7490 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7491 base))
7492 cond_reduc_val
7493 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7494 }
7495 if (cond_reduc_val)
7496 {
7497 if (dump_enabled_p ())
7498 dump_printf_loc (MSG_NOTE, vect_location,
7499 "condition expression based on "
7500 "integer induction.\n");
7501 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7502 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7503 = cond_reduc_val;
7504 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7505 }
7506 }
7507 else if (cond_reduc_dt == vect_constant_def)
7508 {
7509 enum vect_def_type cond_initial_dt;
7510 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7511 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7512 if (cond_initial_dt == vect_constant_def
7513 && types_compatible_p (TREE_TYPE (cond_initial_val),
7514 TREE_TYPE (cond_reduc_val)))
7515 {
7516 tree e = fold_binary (LE_EXPR, boolean_type_node,
7517 cond_initial_val, cond_reduc_val);
7518 if (e && (integer_onep (e) || integer_zerop (e)))
7519 {
7520 if (dump_enabled_p ())
7521 dump_printf_loc (MSG_NOTE, vect_location,
7522 "condition expression based on "
7523 "compile time constant.\n");
7524 /* Record reduction code at analysis stage. */
7525 STMT_VINFO_REDUC_CODE (reduc_info)
7526 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7527 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7528 }
7529 }
7530 }
7531 }
7532
7533 if (STMT_VINFO_LIVE_P (phi_info))
7534 return false;
7535
7536 if (slp_node)
7537 ncopies = 1;
7538 else
7539 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7540
7541 gcc_assert (ncopies >= 1);
7542
7543 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7544
7545 if (nested_cycle)
7546 {
7547 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7548 == vect_double_reduction_def);
7549 double_reduc = true;
7550 }
7551
7552 /* 4.2. Check support for the epilog operation.
7553
7554 If STMT represents a reduction pattern, then the type of the
7555 reduction variable may be different than the type of the rest
7556 of the arguments. For example, consider the case of accumulation
7557 of shorts into an int accumulator; The original code:
7558 S1: int_a = (int) short_a;
7559 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7560
7561 was replaced with:
7562 STMT: int_acc = widen_sum <short_a, int_acc>
7563
7564 This means that:
7565 1. The tree-code that is used to create the vector operation in the
7566 epilog code (that reduces the partial results) is not the
7567 tree-code of STMT, but is rather the tree-code of the original
7568 stmt from the pattern that STMT is replacing. I.e, in the example
7569 above we want to use 'widen_sum' in the loop, but 'plus' in the
7570 epilog.
7571 2. The type (mode) we use to check available target support
7572 for the vector operation to be created in the *epilog*, is
7573 determined by the type of the reduction variable (in the example
7574 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7575 However the type (mode) we use to check available target support
7576 for the vector operation to be created *inside the loop*, is
7577 determined by the type of the other arguments to STMT (in the
7578 example we'd check this: optab_handler (widen_sum_optab,
7579 vect_short_mode)).
7580
7581 This is contrary to "regular" reductions, in which the types of all
7582 the arguments are the same as the type of the reduction variable.
7583 For "regular" reductions we can therefore use the same vector type
7584 (and also the same tree-code) when generating the epilog code and
7585 when generating the code inside the loop. */
7586
7587 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7588 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7589
7590 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7591 if (reduction_type == TREE_CODE_REDUCTION)
7592 {
7593 /* Check whether it's ok to change the order of the computation.
7594 Generally, when vectorizing a reduction we change the order of the
7595 computation. This may change the behavior of the program in some
7596 cases, so we need to check that this is ok. One exception is when
7597 vectorizing an outer-loop: the inner-loop is executed sequentially,
7598 and therefore vectorizing reductions in the inner-loop during
7599 outer-loop vectorization is safe. Likewise when we are vectorizing
7600 a series of reductions using SLP and the VF is one, the reductions
7601 are performed in scalar order. */
7602 if (slp_node
7603 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7604 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7605 ;
7606 else if (needs_fold_left_reduction_p (op.type, orig_code))
7607 {
7608 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7609 is not directly used in stmt. */
7610 if (!only_slp_reduc_chain
7611 && reduc_chain_length != 1)
7612 {
7613 if (dump_enabled_p ())
7614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7615 "in-order reduction chain without SLP.\n");
7616 return false;
7617 }
7618 STMT_VINFO_REDUC_TYPE (reduc_info)
7619 = reduction_type = FOLD_LEFT_REDUCTION;
7620 }
7621 else if (!commutative_binary_op_p (orig_code, op.type)
7622 || !associative_binary_op_p (orig_code, op.type))
7623 {
7624 if (dump_enabled_p ())
7625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7626 "reduction: not commutative/associative");
7627 return false;
7628 }
7629 }
7630
7631 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7632 && ncopies > 1)
7633 {
7634 if (dump_enabled_p ())
7635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7636 "multiple types in double reduction or condition "
7637 "reduction or fold-left reduction.\n");
7638 return false;
7639 }
7640
7641 internal_fn reduc_fn = IFN_LAST;
7642 if (reduction_type == TREE_CODE_REDUCTION
7643 || reduction_type == FOLD_LEFT_REDUCTION
7644 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7645 || reduction_type == CONST_COND_REDUCTION)
7646 {
7647 if (reduction_type == FOLD_LEFT_REDUCTION
7648 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7649 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7650 {
7651 if (reduc_fn != IFN_LAST
7652 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7653 OPTIMIZE_FOR_SPEED))
7654 {
7655 if (dump_enabled_p ())
7656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7657 "reduc op not supported by target.\n");
7658
7659 reduc_fn = IFN_LAST;
7660 }
7661 }
7662 else
7663 {
7664 if (!nested_cycle || double_reduc)
7665 {
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668 "no reduc code for scalar code.\n");
7669
7670 return false;
7671 }
7672 }
7673 }
7674 else if (reduction_type == COND_REDUCTION)
7675 {
7676 int scalar_precision
7677 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7678 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7679 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7680 vectype_out);
7681
7682 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7683 OPTIMIZE_FOR_SPEED))
7684 reduc_fn = IFN_REDUC_MAX;
7685 }
7686 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7687
7688 if (reduction_type != EXTRACT_LAST_REDUCTION
7689 && (!nested_cycle || double_reduc)
7690 && reduc_fn == IFN_LAST
7691 && !nunits_out.is_constant ())
7692 {
7693 if (dump_enabled_p ())
7694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7695 "missing target support for reduction on"
7696 " variable-length vectors.\n");
7697 return false;
7698 }
7699
7700 /* For SLP reductions, see if there is a neutral value we can use. */
7701 tree neutral_op = NULL_TREE;
7702 if (slp_node)
7703 {
7704 tree initial_value = NULL_TREE;
7705 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7706 initial_value = vect_phi_initial_value (reduc_def_phi);
7707 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7708 orig_code, initial_value);
7709 }
7710
7711 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7712 {
7713 /* We can't support in-order reductions of code such as this:
7714
7715 for (int i = 0; i < n1; ++i)
7716 for (int j = 0; j < n2; ++j)
7717 l += a[j];
7718
7719 since GCC effectively transforms the loop when vectorizing:
7720
7721 for (int i = 0; i < n1 / VF; ++i)
7722 for (int j = 0; j < n2; ++j)
7723 for (int k = 0; k < VF; ++k)
7724 l += a[j];
7725
7726 which is a reassociation of the original operation. */
7727 if (dump_enabled_p ())
7728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7729 "in-order double reduction not supported.\n");
7730
7731 return false;
7732 }
7733
7734 if (reduction_type == FOLD_LEFT_REDUCTION
7735 && slp_node
7736 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7737 {
7738 /* We cannot use in-order reductions in this case because there is
7739 an implicit reassociation of the operations involved. */
7740 if (dump_enabled_p ())
7741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7742 "in-order unchained SLP reductions not supported.\n");
7743 return false;
7744 }
7745
7746 /* For double reductions, and for SLP reductions with a neutral value,
7747 we construct a variable-length initial vector by loading a vector
7748 full of the neutral value and then shift-and-inserting the start
7749 values into the low-numbered elements. */
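 /* For instance (illustrative only): for a PLUS reduction with a single
 start value INIT and neutral value 0, the initial vector is a splat of 0
 with INIT shifted into the lowest element, i.e. conceptually
 { INIT, 0, 0, ... } for a vector of unknown length. */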
7750 if ((double_reduc || neutral_op)
7751 && !nunits_out.is_constant ()
7752 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7753 vectype_out, OPTIMIZE_FOR_SPEED))
7754 {
7755 if (dump_enabled_p ())
7756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7757 "reduction on variable-length vectors requires"
7758 " target support for a vector-shift-and-insert"
7759 " operation.\n");
7760 return false;
7761 }
7762
7763 /* Check extra constraints for variable-length unchained SLP reductions. */
7764 if (slp_node
7765 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7766 && !nunits_out.is_constant ())
7767 {
7768 /* We checked above that we could build the initial vector when
7769 there's a neutral element value. Check here for the case in
7770 which each SLP statement has its own initial value and in which
7771 that value needs to be repeated for every instance of the
7772 statement within the initial vector. */
7773 unsigned int group_size = SLP_TREE_LANES (slp_node);
7774 if (!neutral_op
7775 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7776 TREE_TYPE (vectype_out)))
7777 {
7778 if (dump_enabled_p ())
7779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7780 "unsupported form of SLP reduction for"
7781 " variable-length vectors: cannot build"
7782 " initial vector.\n");
7783 return false;
7784 }
7785 /* The epilogue code relies on the number of elements being a multiple
7786 of the group size. The duplicate-and-interleave approach to setting
7787 up the initial vector does too. */
7788 if (!multiple_p (nunits_out, group_size))
7789 {
7790 if (dump_enabled_p ())
7791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7792 "unsupported form of SLP reduction for"
7793 " variable-length vectors: the vector size"
7794 " is not a multiple of the number of results.\n");
7795 return false;
7796 }
7797 }
7798
7799 if (reduction_type == COND_REDUCTION)
7800 {
7801 widest_int ni;
7802
7803 if (! max_loop_iterations (loop, &ni))
7804 {
7805 if (dump_enabled_p ())
7806 dump_printf_loc (MSG_NOTE, vect_location,
7807 "loop count not known, cannot create cond "
7808 "reduction.\n");
7809 return false;
7810 }
7811 /* Convert backedges to iterations. */
7812 ni += 1;
7813
7814 /* The additional index will be the same type as the condition. Check
7815 that the loop iteration count fits into this type less one (because
7816 we'll use up the zero slot for when there are no matches). */
7817 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7818 if (wi::geu_p (ni, wi::to_widest (max_index)))
7819 {
7820 if (dump_enabled_p ())
7821 dump_printf_loc (MSG_NOTE, vect_location,
7822 "loop size is greater than data size.\n");
7823 return false;
7824 }
7825 }
7826
7827 /* In case the vectorization factor (VF) is bigger than the number
7828 of elements that we can fit in a vectype (nunits), we have to generate
7829 more than one vector stmt - i.e - we need to "unroll" the
7830 vector stmt by a factor VF/nunits. For more details see documentation
7831 in vectorizable_operation. */
7832
7833 /* If the reduction is used in an outer loop we need to generate
7834 VF intermediate results, like so (e.g. for ncopies=2):
7835 r0 = phi (init, r0)
7836 r1 = phi (init, r1)
7837 r0 = x0 + r0;
7838 r1 = x1 + r1;
7839 (i.e. we generate VF results in 2 registers).
7840 In this case we have a separate def-use cycle for each copy, and therefore
7841 for each copy we get the vector def for the reduction variable from the
7842 respective phi node created for this copy.
7843
7844 Otherwise (the reduction is unused in the loop nest), we can combine
7845 together intermediate results, like so (e.g. for ncopies=2):
7846 r = phi (init, r)
7847 r = x0 + r;
7848 r = x1 + r;
7849 (i.e. we generate VF/2 results in a single register).
7850 In this case for each copy we get the vector def for the reduction variable
7851 from the vectorized reduction operation generated in the previous iteration.
7852
7853 This only works when we see both the reduction PHI and its only consumer
7854 in vectorizable_reduction and there are no intermediate stmts
7855 participating. When unrolling we want each unrolled iteration to have its
7856 own reduction accumulator since one of the main goals of unrolling a
7857 reduction is to reduce the aggregate loop-carried latency. */
7858 if (ncopies > 1
7859 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7860 && reduc_chain_length == 1
7861 && loop_vinfo->suggested_unroll_factor == 1)
7862 single_defuse_cycle = true;
7863
7864 if (single_defuse_cycle || lane_reduc_code_p)
7865 {
7866 gcc_assert (op.code != COND_EXPR);
7867
7868 /* 4. Supportable by target? */
7869 bool ok = true;
7870
7871 /* 4.1. check support for the operation in the loop
7872
7873 This isn't necessary for the lane reduction codes, since they
7874 can only be produced by pattern matching, and it's up to the
7875 pattern matcher to test for support. The main reason for
7876 specifically skipping this step is to avoid rechecking whether
7877 mixed-sign dot-products can be implemented using signed
7878 dot-products. */
7879 machine_mode vec_mode = TYPE_MODE (vectype_in);
7880 if (!lane_reduc_code_p
7881 && !directly_supported_p (op.code, vectype_in, optab_vector))
7882 {
7883 if (dump_enabled_p ())
7884 dump_printf (MSG_NOTE, "op not supported by target.\n");
7885 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7886 || !vect_can_vectorize_without_simd_p (op.code))
7887 ok = false;
7888 else
7889 if (dump_enabled_p ())
7890 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7891 }
7892
7893 if (vect_emulated_vector_p (vectype_in)
7894 && !vect_can_vectorize_without_simd_p (op.code))
7895 {
7896 if (dump_enabled_p ())
7897 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7898 return false;
7899 }
7900
7901 /* Lane-reducing operations have to go through vect_transform_reduction.
7902 For the other cases try without the single cycle optimization. */
7903 if (!ok)
7904 {
7905 if (lane_reduc_code_p)
7906 return false;
7907 else
7908 single_defuse_cycle = false;
7909 }
7910 }
7911 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7912
7913 /* If the reduction stmt is one of the patterns that have lane
7914 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7915 if ((ncopies > 1 && ! single_defuse_cycle)
7916 && lane_reduc_code_p)
7917 {
7918 if (dump_enabled_p ())
7919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7920 "multi def-use cycle not possible for lane-reducing "
7921 "reduction operation\n");
7922 return false;
7923 }
7924
7925 if (slp_node
7926 && !(!single_defuse_cycle
7927 && !lane_reduc_code_p
7928 && reduction_type != FOLD_LEFT_REDUCTION))
7929 for (i = 0; i < (int) op.num_ops; i++)
7930 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7931 {
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7934 "incompatible vector types for invariants\n");
7935 return false;
7936 }
7937
7938 if (slp_node)
7939 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7940 else
7941 vec_num = 1;
7942
7943 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7944 reduction_type, ncopies, cost_vec);
7945 /* Cost the reduction op inside the loop if transformed via
7946 vect_transform_reduction. Otherwise this is costed by the
7947 separate vectorizable_* routines. */
7948 if (single_defuse_cycle || lane_reduc_code_p)
7949 {
7950 int factor = 1;
7951 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7952 /* Three dot-products and a subtraction. */
7953 factor = 4;
7954 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7955 stmt_info, 0, vect_body);
7956 }
7957
7958 if (dump_enabled_p ()
7959 && reduction_type == FOLD_LEFT_REDUCTION)
7960 dump_printf_loc (MSG_NOTE, vect_location,
7961 "using an in-order (fold-left) reduction.\n");
7962 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7963 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7964 reductions go through their own vectorizable_* routines. */
7965 if (!single_defuse_cycle
7966 && !lane_reduc_code_p
7967 && reduction_type != FOLD_LEFT_REDUCTION)
7968 {
7969 stmt_vec_info tem
7970 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7971 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7972 {
7973 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7974 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7975 }
7976 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7977 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7978 }
7979 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7980 {
7981 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7982 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7983
7984 if (reduction_type != FOLD_LEFT_REDUCTION
7985 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7986 && (cond_fn == IFN_LAST
7987 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7988 OPTIMIZE_FOR_SPEED)))
7989 {
7990 if (dump_enabled_p ())
7991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7992 "can't operate on partial vectors because"
7993 " no conditional operation is available.\n");
7994 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7995 }
7996 else if (reduction_type == FOLD_LEFT_REDUCTION
7997 && reduc_fn == IFN_LAST
7998 && !expand_vec_cond_expr_p (vectype_in,
7999 truth_type_for (vectype_in),
8000 SSA_NAME))
8001 {
8002 if (dump_enabled_p ())
8003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8004 "can't operate on partial vectors because"
8005 " no conditional operation is available.\n");
8006 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8007 }
8008 else
8009 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8010 vectype_in, NULL);
8011 }
8012 return true;
8013 }
8014
8015 /* STMT_INFO is a dot-product reduction whose multiplication operands
8016 have different signs. Emit a sequence to emulate the operation
8017 using a series of signed DOT_PROD_EXPRs and return the last
8018 statement generated. VEC_DEST is the result of the vector operation
8019 and VOP lists its inputs. */
8020
8021 static gassign *
8022 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8023 gimple_stmt_iterator *gsi, tree vec_dest,
8024 tree vop[3])
8025 {
8026 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8027 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8028 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8029 gimple *new_stmt;
8030
8031 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8032 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8033 std::swap (vop[0], vop[1]);
8034
8035 /* Convert all inputs to signed types. */
8036 for (int i = 0; i < 3; ++i)
8037 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8038 {
8039 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8040 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8041 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8042 vop[i] = tmp;
8043 }
8044
8045 /* In the comments below we assume 8-bit inputs for simplicity,
8046 but the approach works for any full integer type. */
8047
8048 /* Create a vector of -128. */
8049 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8050 tree min_narrow = build_vector_from_val (narrow_vectype,
8051 min_narrow_elttype);
8052
8053 /* Create a vector of 64. */
8054 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8055 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8056 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8057
8058 /* Emit: SUB_RES = VOP[0] - 128. */
8059 tree sub_res = make_ssa_name (narrow_vectype);
8060 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8061 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8062
8063 /* Emit:
8064
8065 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8066 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8067 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8068
8069 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8070 Doing the two 64 * y steps first allows more time to compute x. */
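  /* As a numeric check, for an unsigned x = 200 and a signed y = 3:
     (200 - 128) * 3 + 64 * 3 + 64 * 3 = 216 + 192 + 192 = 600 = 200 * 3.  */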
8071 tree stage1 = make_ssa_name (wide_vectype);
8072 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8073 vop[1], half_narrow, vop[2]);
8074 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8075
8076 tree stage2 = make_ssa_name (wide_vectype);
8077 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8078 vop[1], half_narrow, stage1);
8079 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8080
8081 tree stage3 = make_ssa_name (wide_vectype);
8082 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8083 sub_res, vop[1], stage2);
8084 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8085
8086 /* Convert STAGE3 to the reduction type. */
8087 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8088 }
8089
8090 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8091 value. */
8092
8093 bool
8094 vect_transform_reduction (loop_vec_info loop_vinfo,
8095 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8096 gimple **vec_stmt, slp_tree slp_node)
8097 {
8098 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8099 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8100 int i;
8101 int ncopies;
8102 int vec_num;
8103
8104 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8105 gcc_assert (reduc_info->is_reduc_info);
8106
8107 if (nested_in_vect_loop_p (loop, stmt_info))
8108 {
8109 loop = loop->inner;
8110 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8111 }
8112
8113 gimple_match_op op;
8114 if (!gimple_extract_op (stmt_info->stmt, &op))
8115 gcc_unreachable ();
8116
8117 /* All uses but the last are expected to be defined in the loop.
8118 The last use is the reduction variable. In case of nested cycle this
8119 assumption is not true: we use reduc_index to record the index of the
8120 reduction variable. */
8121 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8122 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8123 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8124 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8125
8126 if (slp_node)
8127 {
8128 ncopies = 1;
8129 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8130 }
8131 else
8132 {
8133 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8134 vec_num = 1;
8135 }
8136
8137 code_helper code = canonicalize_code (op.code, op.type);
8138 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8139 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8140 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8141
8142 /* Transform. */
8143 tree new_temp = NULL_TREE;
8144 auto_vec<tree> vec_oprnds0;
8145 auto_vec<tree> vec_oprnds1;
8146 auto_vec<tree> vec_oprnds2;
8147 tree def0;
8148
8149 if (dump_enabled_p ())
8150 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8151
8152 /* FORNOW: Multiple types are not supported for condition. */
8153 if (code == COND_EXPR)
8154 gcc_assert (ncopies == 1);
8155
8156 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8157
8158 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8159 if (reduction_type == FOLD_LEFT_REDUCTION)
8160 {
8161 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8162 gcc_assert (code.is_tree_code ());
8163 return vectorize_fold_left_reduction
8164 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8165 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
8166 }
8167
8168 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8169 gcc_assert (single_defuse_cycle
8170 || code == DOT_PROD_EXPR
8171 || code == WIDEN_SUM_EXPR
8172 || code == SAD_EXPR);
8173
8174 /* Create the destination vector */
8175 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8176 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8177
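  /* For a single def-use cycle the accumulator operand is skipped here;
     one vector def is created for it from the reduction PHI below, and the
     result of each copy is fed back as that operand of the next copy in
     the transformation loop that follows.  */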
8178 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8179 single_defuse_cycle && reduc_index == 0
8180 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8181 single_defuse_cycle && reduc_index == 1
8182 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8183 op.num_ops == 3
8184 && !(single_defuse_cycle && reduc_index == 2)
8185 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8186 if (single_defuse_cycle)
8187 {
8188 gcc_assert (!slp_node);
8189 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8190 op.ops[reduc_index],
8191 reduc_index == 0 ? &vec_oprnds0
8192 : (reduc_index == 1 ? &vec_oprnds1
8193 : &vec_oprnds2));
8194 }
8195
8196 bool emulated_mixed_dot_prod
8197 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8198 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8199 {
8200 gimple *new_stmt;
8201 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8202 if (masked_loop_p && !mask_by_cond_expr)
8203 {
8204 /* No conditional ifns have been defined for dot-product yet. */
8205 gcc_assert (code != DOT_PROD_EXPR);
8206
8207 /* Make sure that the reduction accumulator is vop[0]. */
8208 if (reduc_index == 1)
8209 {
8210 gcc_assert (commutative_binary_op_p (code, op.type));
8211 std::swap (vop[0], vop[1]);
8212 }
8213 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8214 vec_num * ncopies, vectype_in, i);
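	  /* The accumulator VOP[0] is also used as the else value, so
	     inactive lanes pass it through unchanged.  */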
8215 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8216 vop[0], vop[1], vop[0]);
8217 new_temp = make_ssa_name (vec_dest, call);
8218 gimple_call_set_lhs (call, new_temp);
8219 gimple_call_set_nothrow (call, true);
8220 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8221 new_stmt = call;
8222 }
8223 else
8224 {
8225 if (op.num_ops == 3)
8226 vop[2] = vec_oprnds2[i];
8227
8228 if (masked_loop_p && mask_by_cond_expr)
8229 {
8230 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8231 vec_num * ncopies, vectype_in, i);
8232 build_vect_cond_expr (code, vop, mask, gsi);
8233 }
8234
8235 if (emulated_mixed_dot_prod)
8236 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8237 vec_dest, vop);
8238 else if (code.is_internal_fn ())
8239 new_stmt = gimple_build_call_internal (internal_fn (code),
8240 op.num_ops,
8241 vop[0], vop[1], vop[2]);
8242 else
8243 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8244 vop[0], vop[1], vop[2]);
8245 new_temp = make_ssa_name (vec_dest, new_stmt);
8246 gimple_set_lhs (new_stmt, new_temp);
8247 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8248 }
8249
8250 if (slp_node)
8251 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
8252 else if (single_defuse_cycle
8253 && i < ncopies - 1)
8254 {
8255 if (reduc_index == 0)
8256 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8257 else if (reduc_index == 1)
8258 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8259 else if (reduc_index == 2)
8260 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8261 }
8262 else
8263 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8264 }
8265
8266 if (!slp_node)
8267 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8268
8269 return true;
8270 }
8271
8272 /* Transform phase of a cycle PHI. */
8273
8274 bool
8275 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8276 stmt_vec_info stmt_info, gimple **vec_stmt,
8277 slp_tree slp_node, slp_instance slp_node_instance)
8278 {
8279 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8280 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8281 int i;
8282 int ncopies;
8283 int j;
8284 bool nested_cycle = false;
8285 int vec_num;
8286
8287 if (nested_in_vect_loop_p (loop, stmt_info))
8288 {
8289 loop = loop->inner;
8290 nested_cycle = true;
8291 }
8292
8293 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8294 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8295 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8296 gcc_assert (reduc_info->is_reduc_info);
8297
8298 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8299 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8300 /* Leave the scalar phi in place. */
8301 return true;
8302
8303 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8304 /* For a nested cycle we do not fill the above. */
8305 if (!vectype_in)
8306 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8307 gcc_assert (vectype_in);
8308
8309 if (slp_node)
8310 {
8311 /* The size vect_schedule_slp_instance computes is off for us. */
8312 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8313 * SLP_TREE_LANES (slp_node), vectype_in);
8314 ncopies = 1;
8315 }
8316 else
8317 {
8318 vec_num = 1;
8319 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8320 }
8321
8322 /* Check whether we should use a single PHI node and accumulate
8323 vectors to one before the backedge. */
8324 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8325 ncopies = 1;
8326
8327 /* Create the destination vector */
8328 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8329 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8330 vectype_out);
8331
8332 /* Get the loop-entry arguments. */
8333 tree vec_initial_def = NULL_TREE;
8334 auto_vec<tree> vec_initial_defs;
8335 if (slp_node)
8336 {
8337 vec_initial_defs.reserve (vec_num);
8338 if (nested_cycle)
8339 {
8340 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8341 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8342 &vec_initial_defs);
8343 }
8344 else
8345 {
8346 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8347 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8348 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8349
8350 unsigned int num_phis = stmts.length ();
8351 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8352 num_phis = 1;
8353 initial_values.reserve (num_phis);
8354 for (unsigned int i = 0; i < num_phis; ++i)
8355 {
8356 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8357 initial_values.quick_push (vect_phi_initial_value (this_phi));
8358 }
8359 if (vec_num == 1)
8360 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8361 if (!initial_values.is_empty ())
8362 {
8363 tree initial_value
8364 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8365 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8366 tree neutral_op
8367 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8368 code, initial_value);
8369 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8370 &vec_initial_defs, vec_num,
8371 stmts.length (), neutral_op);
8372 }
8373 }
8374 }
8375 else
8376 {
8377 /* Get at the scalar def before the loop, that defines the initial
8378 value of the reduction variable. */
8379 tree initial_def = vect_phi_initial_value (phi);
8380 reduc_info->reduc_initial_values.safe_push (initial_def);
8381 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8382 and we can't use zero for induc_val, use initial_def. Similarly
8383 for REDUC_MIN and initial_def larger than the base. */
8384 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8385 {
8386 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8387 if (TREE_CODE (initial_def) == INTEGER_CST
8388 && !integer_zerop (induc_val)
8389 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8390 && tree_int_cst_lt (initial_def, induc_val))
8391 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8392 && tree_int_cst_lt (induc_val, initial_def))))
8393 {
8394 induc_val = initial_def;
8395 /* Communicate that we used the initial_def to epilogue
8396 generation. */
8397 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8398 }
8399 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8400 }
8401 else if (nested_cycle)
8402 {
8403 /* Do not use an adjustment def as that case is not supported
8404 correctly if ncopies is not one. */
8405 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8406 ncopies, initial_def,
8407 &vec_initial_defs);
8408 }
8409 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8410 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8411 /* Fill the initial vector with the initial scalar value. */
8412 vec_initial_def
8413 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8414 initial_def, initial_def);
8415 else
8416 {
8417 if (ncopies == 1)
8418 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8419 if (!reduc_info->reduc_initial_values.is_empty ())
8420 {
8421 initial_def = reduc_info->reduc_initial_values[0];
8422 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8423 tree neutral_op
8424 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8425 code, initial_def);
8426 gcc_assert (neutral_op);
8427 /* Try to simplify the vector initialization by applying an
8428 adjustment after the reduction has been performed. */
8429 if (!reduc_info->reused_accumulator
8430 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8431 && !operand_equal_p (neutral_op, initial_def))
8432 {
8433 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8434 = initial_def;
8435 initial_def = neutral_op;
8436 }
8437 vec_initial_def
8438 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8439 initial_def, neutral_op);
8440 }
8441 }
8442 }
8443
8444 if (vec_initial_def)
8445 {
8446 vec_initial_defs.create (ncopies);
8447 for (i = 0; i < ncopies; ++i)
8448 vec_initial_defs.quick_push (vec_initial_def);
8449 }
8450
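  /* If we are reusing an accumulator from the main loop while vectorizing
     its epilogue, adapt it to this loop's vector type: reduce it to the
     required number of lanes and convert its mode or sign as needed before
     using it as (part of) the initial value.  */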
8451 if (auto *accumulator = reduc_info->reused_accumulator)
8452 {
8453 tree def = accumulator->reduc_input;
8454 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8455 {
8456 unsigned int nreduc;
8457 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8458 (TREE_TYPE (def)),
8459 TYPE_VECTOR_SUBPARTS (vectype_out),
8460 &nreduc);
8461 gcc_assert (res);
8462 gimple_seq stmts = NULL;
8463 /* Reduce the single vector to a smaller one. */
8464 if (nreduc != 1)
8465 {
8466 /* Perform the reduction in the appropriate type. */
8467 tree rvectype = vectype_out;
8468 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8469 TREE_TYPE (TREE_TYPE (def))))
8470 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8471 TYPE_VECTOR_SUBPARTS
8472 (vectype_out));
8473 def = vect_create_partial_epilog (def, rvectype,
8474 STMT_VINFO_REDUC_CODE
8475 (reduc_info),
8476 &stmts);
8477 }
8478 /* The epilogue loop might use a different vector mode, like
8479 VNx2DI vs. V2DI. */
8480 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8481 {
8482 tree reduc_type = build_vector_type_for_mode
8483 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8484 def = gimple_convert (&stmts, reduc_type, def);
8485 }
8486 /* Adjust the input so we pick up the partially reduced value
8487 for the skip edge in vect_create_epilog_for_reduction. */
8488 accumulator->reduc_input = def;
8489 /* And the reduction could be carried out using a different sign. */
8490 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8491 def = gimple_convert (&stmts, vectype_out, def);
8492 if (loop_vinfo->main_loop_edge)
8493 {
8494 /* While we'd like to insert on the edge this will split
8495 blocks and disturb bookkeeping, we also will eventually
8496 need this on the skip edge. Rely on sinking to
8497 fixup optimal placement and insert in the pred. */
8498 gimple_stmt_iterator gsi
8499 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8500 /* Insert before a cond that eventually skips the
8501 epilogue. */
8502 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8503 gsi_prev (&gsi);
8504 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8505 }
8506 else
8507 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8508 stmts);
8509 }
8510 if (loop_vinfo->main_loop_edge)
8511 vec_initial_defs[0]
8512 = vect_get_main_loop_result (loop_vinfo, def,
8513 vec_initial_defs[0]);
8514 else
8515 vec_initial_defs.safe_push (def);
8516 }
8517
8518 /* Generate the reduction PHIs upfront. */
8519 for (i = 0; i < vec_num; i++)
8520 {
8521 tree vec_init_def = vec_initial_defs[i];
8522 for (j = 0; j < ncopies; j++)
8523 {
8524 /* Create the reduction-phi that defines the reduction
8525 operand. */
8526 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8527
8528 /* Set the loop-entry arg of the reduction-phi. */
8529 if (j != 0 && nested_cycle)
8530 vec_init_def = vec_initial_defs[j];
8531 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8532 UNKNOWN_LOCATION);
8533
8534 /* The loop-latch arg is set in epilogue processing. */
8535
8536 if (slp_node)
8537 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8538 else
8539 {
8540 if (j == 0)
8541 *vec_stmt = new_phi;
8542 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8543 }
8544 }
8545 }
8546
8547 return true;
8548 }
8549
8550 /* Vectorizes LC PHIs. */
8551
8552 bool
8553 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8554 stmt_vec_info stmt_info, gimple **vec_stmt,
8555 slp_tree slp_node)
8556 {
8557 if (!loop_vinfo
8558 || !is_a <gphi *> (stmt_info->stmt)
8559 || gimple_phi_num_args (stmt_info->stmt) != 1)
8560 return false;
8561
8562 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8563 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8564 return false;
8565
8566 if (!vec_stmt) /* transformation not required. */
8567 {
8568 /* Deal with copies from externs or constants that are disguised as
8569 loop-closed PHI nodes (PR97886). */
8570 if (slp_node
8571 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8572 SLP_TREE_VECTYPE (slp_node)))
8573 {
8574 if (dump_enabled_p ())
8575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8576 "incompatible vector types for invariants\n");
8577 return false;
8578 }
8579 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8580 return true;
8581 }
8582
8583 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8584 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8585 basic_block bb = gimple_bb (stmt_info->stmt);
8586 edge e = single_pred_edge (bb);
8587 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8588 auto_vec<tree> vec_oprnds;
8589 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8590 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8591 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8592 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8593 {
8594 /* Create the vectorized LC PHI node. */
8595 gphi *new_phi = create_phi_node (vec_dest, bb);
8596 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8597 if (slp_node)
8598 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8599 else
8600 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8601 }
8602 if (!slp_node)
8603 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8604
8605 return true;
8606 }
8607
8608 /* Vectorizes PHIs. */
8609
8610 bool
8611 vectorizable_phi (vec_info *,
8612 stmt_vec_info stmt_info, gimple **vec_stmt,
8613 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8614 {
8615 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8616 return false;
8617
8618 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8619 return false;
8620
8621 tree vectype = SLP_TREE_VECTYPE (slp_node);
8622
8623 if (!vec_stmt) /* transformation not required. */
8624 {
8625 slp_tree child;
8626 unsigned i;
8627 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8628 if (!child)
8629 {
8630 if (dump_enabled_p ())
8631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8632 "PHI node with unvectorized backedge def\n");
8633 return false;
8634 }
8635 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8636 {
8637 if (dump_enabled_p ())
8638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8639 "incompatible vector types for invariants\n");
8640 return false;
8641 }
8642 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8643 && !useless_type_conversion_p (vectype,
8644 SLP_TREE_VECTYPE (child)))
8645 {
8646 /* With bools we can have mask and non-mask precision vectors
8647 or different non-mask precisions. While pattern recog is
8648 supposed to guarantee consistency here, bugs in it can cause
8649 mismatches (PR103489 and PR103800 for example).
8650 Deal with them here instead of ICEing later. */
8651 if (dump_enabled_p ())
8652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8653 "incompatible vector type setup from "
8654 "bool pattern detection\n");
8655 return false;
8656 }
8657
8658 /* For single-argument PHIs assume coalescing which means zero cost
8659 for the scalar and the vector PHIs. This avoids artificially
8660 favoring the vector path (but may pessimize it in some cases). */
8661 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8662 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8663 vector_stmt, stmt_info, vectype, 0, vect_body);
8664 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8665 return true;
8666 }
8667
8668 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8669 basic_block bb = gimple_bb (stmt_info->stmt);
8670 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8671 auto_vec<gphi *> new_phis;
8672 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8673 {
8674 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8675
8676 /* Skip not yet vectorized defs. */
8677 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8678 && SLP_TREE_VEC_STMTS (child).is_empty ())
8679 continue;
8680
8681 auto_vec<tree> vec_oprnds;
8682 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8683 if (!new_phis.exists ())
8684 {
8685 new_phis.create (vec_oprnds.length ());
8686 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8687 {
8688 /* Create the vectorized LC PHI node. */
8689 new_phis.quick_push (create_phi_node (vec_dest, bb));
8690 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8691 }
8692 }
8693 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8694 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8695 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8696 }
8697 /* We should have at least one already vectorized child. */
8698 gcc_assert (new_phis.exists ());
8699
8700 return true;
8701 }
8702
8703 /* Vectorizes first order recurrences. An overview of the transformation
8704 is described below. Suppose we have the following loop.
8705
8706 int t = 0;
8707 for (int i = 0; i < n; ++i)
8708 {
8709 b[i] = a[i] - t;
8710 t = a[i];
8711 }
8712
8713 There is a first-order recurrence on 't'. For this loop, the scalar IR
8714 looks (simplified) like:
8715
8716 scalar.preheader:
8717 init = 0;
8718
8719 scalar.body:
8720 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8721 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8722 _1 = a[i]
8723 b[i] = _1 - _2
8724 if (i < n) goto scalar.body
8725
8726 In this example, _2 is a recurrence because its value depends on the
8727 previous iteration. We vectorize this as (VF = 4)
8728
8729 vector.preheader:
8730 vect_init = vect_cst(..., ..., ..., 0)
8731
8732 vector.body
8733 i = PHI <0(vector.preheader), i+4(vector.body)>
8734 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8735 vect_2 = a[i, i+1, i+2, i+3];
8736 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8737 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8738 if (..) goto vector.body
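
   Here vect_3 holds the 't' values for lanes i..i+3, i.e.
   [a[i-1], a[i], a[i+1], a[i+2]]; in the first vector iteration its
   first lane comes from the last element of vect_init, which is 0.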
8739
8740 In this function, vectorizable_recurr, we code generate both the
8741 vector PHI node and the permute since those together compute the
8742 vectorized value of the scalar PHI. We do not yet have the
8743 backedge value to fill in there nor into the vec_perm. Those
8744 are filled in maybe_set_vectorized_backedge_value and
8745 vect_schedule_scc.
8746
8747 TODO: Since the scalar loop does not have a use of the recurrence
8748 outside of the loop the natural way to implement peeling via
8749 vectorizing the live value doesn't work. For now peeling of loops
8750 with a recurrence is not implemented. For SLP the supported cases
8751 are restricted to those requiring a single vector recurrence PHI. */
8752
8753 bool
8754 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8755 gimple **vec_stmt, slp_tree slp_node,
8756 stmt_vector_for_cost *cost_vec)
8757 {
8758 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8759 return false;
8760
8761 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8762
8763 /* So far we only support first-order recurrence auto-vectorization. */
8764 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8765 return false;
8766
8767 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8768 unsigned ncopies;
8769 if (slp_node)
8770 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8771 else
8772 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8773 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8774 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8775 /* We need to be able to make progress with a single vector. */
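  /* E.g. with four lanes per vector a two-lane SLP group is still handled
     (2 * 2 == 4), while a three-lane group is rejected.  */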
8776 if (maybe_gt (dist * 2, nunits))
8777 {
8778 if (dump_enabled_p ())
8779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8780 "first order recurrence exceeds half of "
8781 "a vector\n");
8782 return false;
8783 }
8784
8785 /* First-order recurrence autovectorization needs to handle permutation
8786 with indices = [nunits-1, nunits, nunits+1, ...]. */
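  /* E.g. for nunits = 4 and dist = 2 (a two-lane SLP group) the selector
     is { 2, 3, 4, 5 }: the last two lanes of the previous vector followed
     by the first two lanes of the current one.  */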
8787 vec_perm_builder sel (nunits, 1, 3);
8788 for (int i = 0; i < 3; ++i)
8789 sel.quick_push (nunits - dist + i);
8790 vec_perm_indices indices (sel, 2, nunits);
8791
8792 if (!vec_stmt) /* transformation not required. */
8793 {
8794 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8795 indices))
8796 return false;
8797
8798 if (slp_node)
8799 {
8800 /* We eventually need to set a vector type on invariant
8801 arguments. */
8802 unsigned j;
8803 slp_tree child;
8804 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8805 if (!vect_maybe_update_slp_op_vectype
8806 (child, SLP_TREE_VECTYPE (slp_node)))
8807 {
8808 if (dump_enabled_p ())
8809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8810 "incompatible vector types for "
8811 "invariants\n");
8812 return false;
8813 }
8814 }
8815 /* The recurrence costs the initialization vector and one permute
8816 for each copy. */
8817 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8818 stmt_info, 0, vect_prologue);
8819 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8820 stmt_info, 0, vect_body);
8821 if (dump_enabled_p ())
8822 dump_printf_loc (MSG_NOTE, vect_location,
8823 "vectorizable_recurr: inside_cost = %d, "
8824 "prologue_cost = %d .\n", inside_cost,
8825 prologue_cost);
8826
8827 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8828 return true;
8829 }
8830
8831 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8832 basic_block bb = gimple_bb (phi);
8833 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8834 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8835 {
8836 gimple_seq stmts = NULL;
8837 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8838 gsi_insert_seq_on_edge_immediate (pe, stmts);
8839 }
8840 tree vec_init = build_vector_from_val (vectype, preheader);
8841 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8842
8843 /* Create the vectorized first-order PHI node. */
8844 tree vec_dest = vect_get_new_vect_var (vectype,
8845 vect_simple_var, "vec_recur_");
8846 gphi *new_phi = create_phi_node (vec_dest, bb);
8847 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8848
8849 /* Insert the shuffles for the first-order recurrence autovectorization:
8850 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8851 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8852
8853 /* Insert the required permute after the latch definition. The
8854 second and later operands are tentative and will be updated when we have
8855 vectorized the latch definition. */
8856 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8857 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8858 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8859 gsi_next (&gsi2);
8860
8861 for (unsigned i = 0; i < ncopies; ++i)
8862 {
8863 vec_dest = make_ssa_name (vectype);
8864 gassign *vperm
8865 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8866 i == 0 ? gimple_phi_result (new_phi) : NULL,
8867 NULL, perm);
8868 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8869
8870 if (slp_node)
8871 SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8872 else
8873 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8874 }
8875
8876 if (!slp_node)
8877 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8878 return true;
8879 }
8880
8881 /* Return true if VECTYPE represents a vector that requires lowering
8882 by the vector lowering pass. */
8883
8884 bool
8885 vect_emulated_vector_p (tree vectype)
8886 {
8887 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8888 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8889 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8890 }
8891
8892 /* Return true if we can emulate CODE on an integer mode representation
8893 of a vector. */
8894
8895 bool
8896 vect_can_vectorize_without_simd_p (tree_code code)
8897 {
8898 switch (code)
8899 {
8900 case PLUS_EXPR:
8901 case MINUS_EXPR:
8902 case NEGATE_EXPR:
8903 case BIT_AND_EXPR:
8904 case BIT_IOR_EXPR:
8905 case BIT_XOR_EXPR:
8906 case BIT_NOT_EXPR:
8907 return true;
8908
8909 default:
8910 return false;
8911 }
8912 }
8913
8914 /* Likewise, but taking a code_helper. */
8915
8916 bool
8917 vect_can_vectorize_without_simd_p (code_helper code)
8918 {
8919 return (code.is_tree_code ()
8920 && vect_can_vectorize_without_simd_p (tree_code (code)));
8921 }
8922
8923 /* Create vector init for vectorized iv. */
8924 static tree
8925 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8926 tree step_expr, poly_uint64 nunits,
8927 tree vectype,
8928 enum vect_induction_op_type induction_type)
8929 {
8930 unsigned HOST_WIDE_INT const_nunits;
8931 tree vec_shift, vec_init, new_name;
8932 unsigned i;
8933 tree itype = TREE_TYPE (vectype);
8934
8935 /* iv_loop is the loop to be vectorized. Create the vector of initial
8936 values, e.g. [X, X>>S, X>>2*S, ...] for shifts, [X, -X, X, -X, ...] for negation and [X, X*S, X*S^2, ...] for multiplication (S = step_expr, X = init_expr). */
8937 new_name = gimple_convert (stmts, itype, init_expr);
8938 switch (induction_type)
8939 {
8940 case vect_step_op_shr:
8941 case vect_step_op_shl:
8942 /* Build the initial values by shifting the splatted init by [0, S, 2*S, ...]. */
8943 vec_init = gimple_build_vector_from_val (stmts,
8944 vectype,
8945 new_name);
8946 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8947 build_zero_cst (itype), step_expr);
8948 vec_init = gimple_build (stmts,
8949 (induction_type == vect_step_op_shr
8950 ? RSHIFT_EXPR : LSHIFT_EXPR),
8951 vectype, vec_init, vec_shift);
8952 break;
8953
8954 case vect_step_op_neg:
8955 {
8956 vec_init = gimple_build_vector_from_val (stmts,
8957 vectype,
8958 new_name);
8959 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8960 vectype, vec_init);
8961 /* The encoding has 2 interleaved stepped patterns. */
8962 vec_perm_builder sel (nunits, 2, 3);
8963 sel.quick_grow (6);
8964 for (i = 0; i < 3; i++)
8965 {
8966 sel[2 * i] = i;
8967 sel[2 * i + 1] = i + nunits;
8968 }
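	/* E.g. with nunits = 4 the selector is { 0, 4, 1, 5 }, interleaving
	   vec_init and vec_neg into [X, -X, X, -X].  */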
8969 vec_perm_indices indices (sel, 2, nunits);
8970 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8971 fail when vec_init is a const vector. In that situation vec_perm is not
8972 really needed. */
8973 tree perm_mask_even
8974 = vect_gen_perm_mask_any (vectype, indices);
8975 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8976 vectype,
8977 vec_init, vec_neg,
8978 perm_mask_even);
8979 }
8980 break;
8981
8982 case vect_step_op_mul:
8983 {
8984 /* Use an unsigned multiplication to avoid undefined signed integer overflow. */
8985 gcc_assert (nunits.is_constant (&const_nunits));
8986 tree utype = unsigned_type_for (itype);
8987 tree uvectype = build_vector_type (utype,
8988 TYPE_VECTOR_SUBPARTS (vectype));
8989 new_name = gimple_convert (stmts, utype, new_name);
8990 vec_init = gimple_build_vector_from_val (stmts,
8991 uvectype,
8992 new_name);
8993 tree_vector_builder elts (uvectype, const_nunits, 1);
8994 tree elt_step = build_one_cst (utype);
8995
8996 elts.quick_push (elt_step);
8997 for (i = 1; i < const_nunits; i++)
8998 {
8999 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i). */
9000 elt_step = gimple_build (stmts, MULT_EXPR,
9001 utype, elt_step, step_expr);
9002 elts.quick_push (elt_step);
9003 }
9004 /* Create a vector from [pow (step, 0), pow (step, 1), ...,
9005 pow (step, nunits-1)]. */
9006 tree vec_mul = gimple_build_vector (stmts, &elts);
9007 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9008 vec_init, vec_mul);
9009 vec_init = gimple_convert (stmts, vectype, vec_init);
9010 }
9011 break;
9012
9013 default:
9014 gcc_unreachable ();
9015 }
9016
9017 return vec_init;
9018 }
9019
9020 /* Peel init_expr by skip_niter for induction_type. */
9021 tree
9022 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9023 tree skip_niters, tree step_expr,
9024 enum vect_induction_op_type induction_type)
9025 {
9026 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9027 tree type = TREE_TYPE (init_expr);
9028 unsigned prec = TYPE_PRECISION (type);
9029 switch (induction_type)
9030 {
9031 case vect_step_op_neg:
9032 if (TREE_INT_CST_LOW (skip_niters) % 2)
9033 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9034 /* else no change. */
9035 break;
9036
9037 case vect_step_op_shr:
9038 case vect_step_op_shl:
9039 skip_niters = gimple_convert (stmts, type, skip_niters);
9040 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9041 /* When the shift amount >= precision we need to avoid undefined behavior.
9042 The original loop has none, and according to the semantics
9043 init_expr should be 0 for lshr and ashl, and init_expr >> (prec - 1) for ashr. */
9044 if (!tree_fits_uhwi_p (step_expr)
9045 || tree_to_uhwi (step_expr) >= prec)
9046 {
9047 if (induction_type == vect_step_op_shl
9048 || TYPE_UNSIGNED (type))
9049 init_expr = build_zero_cst (type);
9050 else
9051 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9052 init_expr,
9053 wide_int_to_tree (type, prec - 1));
9054 }
9055 else
9056 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9057 ? RSHIFT_EXPR : LSHIFT_EXPR),
9058 type, init_expr, step_expr);
9059 break;
9060
9061 case vect_step_op_mul:
9062 {
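	/* Multiply init_expr by pow (step_expr, skip_niters), computed in
	   the unsigned type to avoid undefined overflow.  */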
9063 tree utype = unsigned_type_for (type);
9064 init_expr = gimple_convert (stmts, utype, init_expr);
9065 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9066 wide_int begin = wi::to_wide (step_expr);
9067 for (unsigned i = 0; i != skipn - 1; i++)
9068 begin = wi::mul (begin, wi::to_wide (step_expr));
9069 tree mult_expr = wide_int_to_tree (utype, begin);
9070 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9071 init_expr = gimple_convert (stmts, type, init_expr);
9072 }
9073 break;
9074
9075 default:
9076 gcc_unreachable ();
9077 }
9078
9079 return init_expr;
9080 }
9081
9082 /* Create vector step for vectorized iv. */
9083 static tree
9084 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9085 poly_uint64 vf,
9086 enum vect_induction_op_type induction_type)
9087 {
9088 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9089 tree new_name = NULL;
9090 /* Step should be pow (step, vf) for mult induction. */
9091 if (induction_type == vect_step_op_mul)
9092 {
9093 gcc_assert (vf.is_constant ());
9094 wide_int begin = wi::to_wide (step_expr);
9095
9096 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9097 begin = wi::mul (begin, wi::to_wide (step_expr));
9098
9099 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9100 }
9101 else if (induction_type == vect_step_op_neg)
9102 /* Do nothing. */
9103 ;
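  /* For the shift cases each vector iteration advances every lane by VF
     scalar iterations, so the combined shift amount is VF * step_expr.  */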
9104 else
9105 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9106 expr, step_expr);
9107 return new_name;
9108 }
9109
9110 static tree
9111 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9112 stmt_vec_info stmt_info,
9113 tree new_name, tree vectype,
9114 enum vect_induction_op_type induction_type)
9115 {
9116 /* No step is needed for neg induction. */
9117 if (induction_type == vect_step_op_neg)
9118 return NULL;
9119
9120 tree t = unshare_expr (new_name);
9121 gcc_assert (CONSTANT_CLASS_P (new_name)
9122 || TREE_CODE (new_name) == SSA_NAME);
9123 tree new_vec = build_vector_from_val (vectype, t);
9124 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9125 new_vec, vectype, NULL);
9126 return vec_step;
9127 }
9128
9129 /* Update the vectorized iv INDUC_DEF with VEC_STEP according to INDUCTION_TYPE. */
9130 static tree
9131 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9132 tree induc_def, tree vec_step,
9133 enum vect_induction_op_type induction_type)
9134 {
9135 tree vec_def = induc_def;
9136 switch (induction_type)
9137 {
9138 case vect_step_op_mul:
9139 {
9140 /* Use an unsigned multiplication to avoid undefined signed integer overflow. */
9141 tree uvectype
9142 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9143 TYPE_VECTOR_SUBPARTS (vectype));
9144 vec_def = gimple_convert (stmts, uvectype, vec_def);
9145 vec_step = gimple_convert (stmts, uvectype, vec_step);
9146 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9147 vec_def, vec_step);
9148 vec_def = gimple_convert (stmts, vectype, vec_def);
9149 }
9150 break;
9151
9152 case vect_step_op_shr:
9153 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9154 vec_def, vec_step);
9155 break;
9156
9157 case vect_step_op_shl:
9158 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9159 vec_def, vec_step);
9160 break;
9161 case vect_step_op_neg:
9162 vec_def = induc_def;
9163 /* Do nothing. */
9164 break;
9165 default:
9166 gcc_unreachable ();
9167 }
9168
9169 return vec_def;
9170
9171 }
9172
9173 /* Function vectorizable_nonlinear_induction
9174
9175 Check if STMT_INFO performs a nonlinear induction computation that can be
9176 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9177 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9178 basic block.
9179 Return true if STMT_INFO is vectorizable in this way. */
9180
9181 static bool
9182 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9183 stmt_vec_info stmt_info,
9184 gimple **vec_stmt, slp_tree slp_node,
9185 stmt_vector_for_cost *cost_vec)
9186 {
9187 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9188 unsigned ncopies;
9189 bool nested_in_vect_loop = false;
9190 class loop *iv_loop;
9191 tree vec_def;
9192 edge pe = loop_preheader_edge (loop);
9193 basic_block new_bb;
9194 tree vec_init, vec_step;
9195 tree new_name;
9196 gimple *new_stmt;
9197 gphi *induction_phi;
9198 tree induc_def, vec_dest;
9199 tree init_expr, step_expr;
9200 tree niters_skip;
9201 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9202 unsigned i;
9203 gimple_stmt_iterator si;
9204
9205 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9206
9207 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9208 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9209 enum vect_induction_op_type induction_type
9210 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9211
9212 gcc_assert (induction_type > vect_step_op_add);
9213
9214 if (slp_node)
9215 ncopies = 1;
9216 else
9217 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9218 gcc_assert (ncopies >= 1);
9219
9220 /* FORNOW. Only handle nonlinear induction in the same loop. */
9221 if (nested_in_vect_loop_p (loop, stmt_info))
9222 {
9223 if (dump_enabled_p ())
9224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9225 "nonlinear induction in nested loop.\n");
9226 return false;
9227 }
9228
9229 iv_loop = loop;
9230 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9231
9232 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9233 update for each iv and a permutation to generate the wanted vector iv. */
9234 if (slp_node)
9235 {
9236 if (dump_enabled_p ())
9237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9238 "SLP induction not supported for nonlinear"
9239 " induction.\n");
9240 return false;
9241 }
9242
9243 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9244 {
9245 if (dump_enabled_p ())
9246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9247 "floating point nonlinear induction vectorization"
9248 " not supported.\n");
9249 return false;
9250 }
9251
9252 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9253 init_expr = vect_phi_initial_value (phi);
9254 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9255 && TREE_CODE (step_expr) == INTEGER_CST);
9256 /* step_expr should be aligned with init_expr,
9257 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9258 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9259
9260 if (TREE_CODE (init_expr) == INTEGER_CST)
9261 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9262 else
9263 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9264 TREE_TYPE (init_expr)));
9265
9266 switch (induction_type)
9267 {
9268 case vect_step_op_neg:
9269 if (TREE_CODE (init_expr) != INTEGER_CST
9270 && TREE_CODE (init_expr) != REAL_CST)
9271 {
9272 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9273 if (!directly_supported_p (NEGATE_EXPR, vectype))
9274 return false;
9275
9276 /* The encoding has 2 interleaved stepped patterns. */
9277 vec_perm_builder sel (nunits, 2, 3);
9278 machine_mode mode = TYPE_MODE (vectype);
9279 sel.quick_grow (6);
9280 for (i = 0; i < 3; i++)
9281 {
9282 sel[i * 2] = i;
9283 sel[i * 2 + 1] = i + nunits;
9284 }
9285 vec_perm_indices indices (sel, 2, nunits);
9286 if (!can_vec_perm_const_p (mode, mode, indices))
9287 return false;
9288 }
9289 break;
9290
9291 case vect_step_op_mul:
9292 {
9293 /* Check for backend support of MULT_EXPR. */
9294 if (!directly_supported_p (MULT_EXPR, vectype))
9295 return false;
9296
9297 /* ?? How to construct the vector step for variable-length vectors:
9298 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9299 if (!vf.is_constant ())
9300 return false;
9301 }
9302 break;
9303
9304 case vect_step_op_shr:
9305 /* Check for backend support of RSHIFT_EXPR. */
9306 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9307 return false;
9308
9309 /* Don't shift by more than the type precision to avoid undefined behavior. */
9310 if (!tree_fits_uhwi_p (step_expr)
9311 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9312 TYPE_PRECISION (TREE_TYPE (init_expr))))
9313 return false;
9314 break;
9315
9316 case vect_step_op_shl:
9317 /* Check for backend support of LSHIFT_EXPR. */
9318 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9319 return false;
9320
9321 /* Don't shift by more than the type precision to avoid undefined behavior. */
9322 if (!tree_fits_uhwi_p (step_expr)
9323 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9324 TYPE_PRECISION (TREE_TYPE (init_expr))))
9325 return false;
9326
9327 break;
9328
9329 default:
9330 gcc_unreachable ();
9331 }
9332
9333 if (!vec_stmt) /* transformation not required. */
9334 {
9335 unsigned inside_cost = 0, prologue_cost = 0;
9336 /* loop cost for vec_loop. Neg induction doesn't have any
9337 inside_cost. */
9338 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9339 stmt_info, 0, vect_body);
9340
9341 /* Neg induction generates no update statement inside the loop
9342 body, so it has no inside_cost. */
9343 if (induction_type == vect_step_op_neg)
9344 inside_cost = 0;
9345
9346 /* prologue cost for vec_init and vec_step. */
9347 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9348 stmt_info, 0, vect_prologue);
9349
9350 if (dump_enabled_p ())
9351 dump_printf_loc (MSG_NOTE, vect_location,
9352 "vect_model_induction_cost: inside_cost = %d, "
9353 "prologue_cost = %d. \n", inside_cost,
9354 prologue_cost);
9355
9356 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9357 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9358 return true;
9359 }
9360
9361 /* Transform. */
9362
9363 /* Compute a vector variable, initialized with the first VF values of
9364 the induction variable. E.g., for an iv with IV_PHI='X' and
9365 evolution S, for a vector of 4 units, we want to compute:
9366 [X, X + S, X + 2*S, X + 3*S]. */
9367
9368 if (dump_enabled_p ())
9369 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9370
9371 pe = loop_preheader_edge (iv_loop);
9372 /* Find the first insertion point in the BB. */
9373 basic_block bb = gimple_bb (phi);
9374 si = gsi_after_labels (bb);
9375
9376 gimple_seq stmts = NULL;
9377
9378 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9379 /* If we are using the loop mask to "peel" for alignment then we need
9380 to adjust the start value here. */
9381 if (niters_skip != NULL_TREE)
9382 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9383 step_expr, induction_type);
9384
9385 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9386 step_expr, nunits, vectype,
9387 induction_type);
9388 if (stmts)
9389 {
9390 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9391 gcc_assert (!new_bb);
9392 }
9393
9394 stmts = NULL;
9395 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9396 vf, induction_type);
9397 if (stmts)
9398 {
9399 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9400 gcc_assert (!new_bb);
9401 }
9402
9403 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9404 new_name, vectype,
9405 induction_type);
9406 /* Create the following def-use cycle:
9407 loop prolog:
9408 vec_init = ...
9409 vec_step = ...
9410 loop:
9411 vec_iv = PHI <vec_init, vec_loop>
9412 ...
9413 STMT
9414 ...
9415 vec_loop = vec_iv + vec_step; */
9416
9417 /* Create the induction-phi that defines the induction-operand. */
9418 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9419 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9420 induc_def = PHI_RESULT (induction_phi);
9421
9422 /* Create the iv update inside the loop. */
9423 stmts = NULL;
9424 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9425 induc_def, vec_step,
9426 induction_type);
9427
9428 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9429 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9430
9431 /* Set the arguments of the phi node: */
9432 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9433 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9434 UNKNOWN_LOCATION);
9435
9436 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9437 *vec_stmt = induction_phi;
9438
9439 /* In case that vectorization factor (VF) is bigger than the number
9440 of elements that we can fit in a vectype (nunits), we have to generate
9441 more than one vector stmt, i.e. we need to "unroll" the
9442 vector stmt by a factor VF/nunits. For more details see documentation
9443 in vectorizable_operation. */
9444
9445 if (ncopies > 1)
9446 {
9447 stmts = NULL;
9448 /* FORNOW. This restriction should be relaxed. */
9449 gcc_assert (!nested_in_vect_loop);
9450
9451 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9452 nunits, induction_type);
9453
9454 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9455 new_name, vectype,
9456 induction_type);
9457 vec_def = induc_def;
9458 for (i = 1; i < ncopies; i++)
9459 {
9460 /* vec_i = vec_prev + vec_step. */
9461 stmts = NULL;
9462 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9463 vec_def, vec_step,
9464 induction_type);
9465 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9466 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9467 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9468 }
9469 }
9470
9471 if (dump_enabled_p ())
9472 dump_printf_loc (MSG_NOTE, vect_location,
9473 "transform induction: created def-use cycle: %G%G",
9474 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9475
9476 return true;
9477 }
9478
9479 /* Function vectorizable_induction
9480
9481 Check if STMT_INFO performs an induction computation that can be vectorized.
9482 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9483 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9484 Return true if STMT_INFO is vectorizable in this way. */
9485
9486 bool
9487 vectorizable_induction (loop_vec_info loop_vinfo,
9488 stmt_vec_info stmt_info,
9489 gimple **vec_stmt, slp_tree slp_node,
9490 stmt_vector_for_cost *cost_vec)
9491 {
9492 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9493 unsigned ncopies;
9494 bool nested_in_vect_loop = false;
9495 class loop *iv_loop;
9496 tree vec_def;
9497 edge pe = loop_preheader_edge (loop);
9498 basic_block new_bb;
9499 tree new_vec, vec_init, vec_step, t;
9500 tree new_name;
9501 gimple *new_stmt;
9502 gphi *induction_phi;
9503 tree induc_def, vec_dest;
9504 tree init_expr, step_expr;
9505 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9506 unsigned i;
9507 tree expr;
9508 gimple_stmt_iterator si;
9509 enum vect_induction_op_type induction_type
9510 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9511
9512 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9513 if (!phi)
9514 return false;
9515
9516 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9517 return false;
9518
9519 /* Make sure it was recognized as induction computation. */
9520 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9521 return false;
9522
9523 /* Handle nonlinear induction in a separate place. */
9524 if (induction_type != vect_step_op_add)
9525 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9526 vec_stmt, slp_node, cost_vec);
9527
9528 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9529 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9530
9531 if (slp_node)
9532 ncopies = 1;
9533 else
9534 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9535 gcc_assert (ncopies >= 1);
9536
9537 /* FORNOW. These restrictions should be relaxed. */
9538 if (nested_in_vect_loop_p (loop, stmt_info))
9539 {
9540 imm_use_iterator imm_iter;
9541 use_operand_p use_p;
9542 gimple *exit_phi;
9543 edge latch_e;
9544 tree loop_arg;
9545
9546 if (ncopies > 1)
9547 {
9548 if (dump_enabled_p ())
9549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9550 "multiple types in nested loop.\n");
9551 return false;
9552 }
9553
9554 exit_phi = NULL;
9555 latch_e = loop_latch_edge (loop->inner);
9556 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9557 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9558 {
9559 gimple *use_stmt = USE_STMT (use_p);
9560 if (is_gimple_debug (use_stmt))
9561 continue;
9562
9563 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9564 {
9565 exit_phi = use_stmt;
9566 break;
9567 }
9568 }
9569 if (exit_phi)
9570 {
9571 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9572 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9573 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9574 {
9575 if (dump_enabled_p ())
9576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9577 "inner-loop induction only used outside "
9578 "of the outer vectorized loop.\n");
9579 return false;
9580 }
9581 }
9582
9583 nested_in_vect_loop = true;
9584 iv_loop = loop->inner;
9585 }
9586 else
9587 iv_loop = loop;
9588 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9589
9590 if (slp_node && !nunits.is_constant ())
9591 {
9592 /* The current SLP code creates the step value element-by-element. */
9593 if (dump_enabled_p ())
9594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9595 "SLP induction not supported for variable-length"
9596 " vectors.\n");
9597 return false;
9598 }
9599
9600 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9601 {
9602 if (dump_enabled_p ())
9603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9604 "floating point induction vectorization disabled\n");
9605 return false;
9606 }
9607
9608 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9609 gcc_assert (step_expr != NULL_TREE);
9610 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9611
9612 /* Check for backend support of PLUS/MINUS_EXPR. */
9613 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9614 || !directly_supported_p (MINUS_EXPR, step_vectype))
9615 return false;
9616
9617 if (!vec_stmt) /* transformation not required. */
9618 {
9619 unsigned inside_cost = 0, prologue_cost = 0;
9620 if (slp_node)
9621 {
9622 /* We eventually need to set a vector type on invariant
9623 arguments. */
9624 unsigned j;
9625 slp_tree child;
9626 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9627 if (!vect_maybe_update_slp_op_vectype
9628 (child, SLP_TREE_VECTYPE (slp_node)))
9629 {
9630 if (dump_enabled_p ())
9631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9632 "incompatible vector types for "
9633 "invariants\n");
9634 return false;
9635 }
9636 /* loop cost for vec_loop. */
9637 inside_cost
9638 = record_stmt_cost (cost_vec,
9639 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9640 vector_stmt, stmt_info, 0, vect_body);
9641 /* prologue cost for vec_init (if not nested) and step. */
9642 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9643 scalar_to_vec,
9644 stmt_info, 0, vect_prologue);
9645 }
9646 else /* if (!slp_node) */
9647 {
9648 /* loop cost for vec_loop. */
9649 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9650 stmt_info, 0, vect_body);
9651 /* prologue cost for vec_init and vec_step. */
9652 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9653 stmt_info, 0, vect_prologue);
9654 }
9655 if (dump_enabled_p ())
9656 dump_printf_loc (MSG_NOTE, vect_location,
9657 "vect_model_induction_cost: inside_cost = %d, "
9658 "prologue_cost = %d .\n", inside_cost,
9659 prologue_cost);
9660
9661 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9662 DUMP_VECT_SCOPE ("vectorizable_induction");
9663 return true;
9664 }
9665
9666 /* Transform. */
9667
9668 /* Compute a vector variable, initialized with the first VF values of
9669 the induction variable. E.g., for an iv with IV_PHI='X' and
9670 evolution S, for a vector of 4 units, we want to compute:
9671 [X, X + S, X + 2*S, X + 3*S]. */
9672
9673 if (dump_enabled_p ())
9674 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9675
9676 pe = loop_preheader_edge (iv_loop);
9677 /* Find the first insertion point in the BB. */
9678 basic_block bb = gimple_bb (phi);
9679 si = gsi_after_labels (bb);
9680
9681 /* For SLP induction we have to generate several IVs as for example
9682 with group size 3 we need
9683 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9684 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9685 if (slp_node)
9686 {
9687 /* Enforced above. */
9688 unsigned int const_nunits = nunits.to_constant ();
9689
9690 /* The initial values are vectorized, but any lanes > group_size
9691 need adjustment. */
9692 slp_tree init_node
9693 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9694
9695 /* Gather steps. Since we do not vectorize inductions as
9696 cycles we have to reconstruct the step from SCEV data. */
9697 unsigned group_size = SLP_TREE_LANES (slp_node);
9698 tree *steps = XALLOCAVEC (tree, group_size);
9699 tree *inits = XALLOCAVEC (tree, group_size);
9700 stmt_vec_info phi_info;
9701 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9702 {
9703 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9704 if (!init_node)
9705 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9706 pe->dest_idx);
9707 }
9708
9709 /* Now generate the IVs. */
9710 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9711 gcc_assert ((const_nunits * nvects) % group_size == 0);
9712 unsigned nivs;
9713 if (nested_in_vect_loop)
9714 nivs = nvects;
9715 else
9716 {
9717 /* Compute the number of distinct IVs we need. First reduce
9718 group_size if it is a multiple of const_nunits so we get
9719 one IV for a group_size of 4 but const_nunits 2. */
9720 unsigned group_sizep = group_size;
9721 if (group_sizep % const_nunits == 0)
9722 group_sizep = group_sizep / const_nunits;
9723 nivs = least_common_multiple (group_sizep,
9724 const_nunits) / const_nunits;
9725 }
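/* Illustrative example: group_size 6 with const_nunits 4 gives
   nivs = least_common_multiple (6, 4) / 4 = 3 distinct IVs, while
   group_size 4 with const_nunits 2 reduces to a single IV.  */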
9726 tree stept = TREE_TYPE (step_vectype);
9727 tree lupdate_mul = NULL_TREE;
9728 if (!nested_in_vect_loop)
9729 {
9730 /* The number of iterations covered in one vector iteration. */
9731 unsigned lup_mul = (nvects * const_nunits) / group_size;
9732 lupdate_mul
9733 = build_vector_from_val (step_vectype,
9734 SCALAR_FLOAT_TYPE_P (stept)
9735 ? build_real_from_wide (stept, lup_mul,
9736 UNSIGNED)
9737 : build_int_cstu (stept, lup_mul));
9738 }
9739 tree peel_mul = NULL_TREE;
9740 gimple_seq init_stmts = NULL;
9741 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9742 {
9743 if (SCALAR_FLOAT_TYPE_P (stept))
9744 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9745 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9746 else
9747 peel_mul = gimple_convert (&init_stmts, stept,
9748 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9749 peel_mul = gimple_build_vector_from_val (&init_stmts,
9750 step_vectype, peel_mul);
9751 }
9752 unsigned ivn;
9753 auto_vec<tree> vec_steps;
9754 for (ivn = 0; ivn < nivs; ++ivn)
9755 {
9756 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9757 tree_vector_builder init_elts (vectype, const_nunits, 1);
9758 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9759 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9760 {
9761 /* The scalar steps of the IVs. */
9762 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9763 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9764 step_elts.quick_push (elt);
9765 if (!init_node)
9766 {
9767 /* The scalar inits of the IVs if not vectorized. */
9768 elt = inits[(ivn*const_nunits + eltn) % group_size];
9769 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9770 TREE_TYPE (elt)))
9771 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9772 TREE_TYPE (vectype), elt);
9773 init_elts.quick_push (elt);
9774 }
9775 /* The number of steps to add to the initial values. */
9776 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9777 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9778 ? build_real_from_wide (stept,
9779 mul_elt, UNSIGNED)
9780 : build_int_cstu (stept, mul_elt));
9781 }
9782 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9783 vec_steps.safe_push (vec_step);
9784 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9785 if (peel_mul)
9786 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9787 step_mul, peel_mul);
9788 if (!init_node)
9789 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9790
9791 /* Create the induction-phi that defines the induction-operand. */
9792 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9793 "vec_iv_");
9794 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9795 induc_def = PHI_RESULT (induction_phi);
9796
9797 /* Create the iv update inside the loop */
9798 tree up = vec_step;
9799 if (lupdate_mul)
9800 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9801 vec_step, lupdate_mul);
9802 gimple_seq stmts = NULL;
9803 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9804 vec_def = gimple_build (&stmts,
9805 PLUS_EXPR, step_vectype, vec_def, up);
9806 vec_def = gimple_convert (&stmts, vectype, vec_def);
9807 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9808 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9809 UNKNOWN_LOCATION);
9810
9811 if (init_node)
9812 vec_init = vect_get_slp_vect_def (init_node, ivn);
9813 if (!nested_in_vect_loop
9814 && !integer_zerop (step_mul))
9815 {
9816 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9817 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9818 vec_step, step_mul);
9819 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9820 vec_def, up);
9821 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9822 }
9823
9824 /* Set the arguments of the phi node: */
9825 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9826
9827 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9828 }
9829 if (!nested_in_vect_loop)
9830 {
9831 /* Fill up to the number of vectors we need for the whole group. */
9832 nivs = least_common_multiple (group_size,
9833 const_nunits) / const_nunits;
9834 vec_steps.reserve (nivs-ivn);
9835 for (; ivn < nivs; ++ivn)
9836 {
9837 SLP_TREE_VEC_STMTS (slp_node)
9838 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9839 vec_steps.quick_push (vec_steps[0]);
9840 }
9841 }
9842
9843 /* Re-use IVs when we can. We are generating further vector
9844 stmts by adding VF' * stride to the IVs generated above. */
9845 if (ivn < nvects)
9846 {
9847 unsigned vfp
9848 = least_common_multiple (group_size, const_nunits) / group_size;
9849 tree lupdate_mul
9850 = build_vector_from_val (step_vectype,
9851 SCALAR_FLOAT_TYPE_P (stept)
9852 ? build_real_from_wide (stept,
9853 vfp, UNSIGNED)
9854 : build_int_cstu (stept, vfp));
9855 for (; ivn < nvects; ++ivn)
9856 {
9857 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9858 tree def = gimple_get_lhs (iv);
9859 if (ivn < 2*nivs)
9860 vec_steps[ivn - nivs]
9861 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9862 vec_steps[ivn - nivs], lupdate_mul);
9863 gimple_seq stmts = NULL;
9864 def = gimple_convert (&stmts, step_vectype, def);
9865 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9866 def, vec_steps[ivn % nivs]);
9867 def = gimple_convert (&stmts, vectype, def);
9868 if (gimple_code (iv) == GIMPLE_PHI)
9869 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9870 else
9871 {
9872 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9873 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9874 }
9875 SLP_TREE_VEC_STMTS (slp_node)
9876 .quick_push (SSA_NAME_DEF_STMT (def));
9877 }
9878 }
9879
9880 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9881 gcc_assert (!new_bb);
9882
9883 return true;
9884 }
9885
9886 init_expr = vect_phi_initial_value (phi);
9887
9888 gimple_seq stmts = NULL;
9889 if (!nested_in_vect_loop)
9890 {
9891 /* Convert the initial value to the IV update type. */
9892 tree new_type = TREE_TYPE (step_expr);
9893 init_expr = gimple_convert (&stmts, new_type, init_expr);
9894
9895 /* If we are using the loop mask to "peel" for alignment then we need
9896 to adjust the start value here. */
9897 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9898 if (skip_niters != NULL_TREE)
9899 {
9900 if (FLOAT_TYPE_P (vectype))
9901 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9902 skip_niters);
9903 else
9904 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9905 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9906 skip_niters, step_expr);
9907 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9908 init_expr, skip_step);
9909 }
9910 }
9911
9912 if (stmts)
9913 {
9914 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9915 gcc_assert (!new_bb);
9916 }
9917
9918 /* Create the vector that holds the initial_value of the induction. */
9919 if (nested_in_vect_loop)
9920 {
9921 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9922 been created during vectorization of previous stmts. We obtain it
9923 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9924 auto_vec<tree> vec_inits;
9925 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9926 init_expr, &vec_inits);
9927 vec_init = vec_inits[0];
9928 /* If the initial value is not of proper type, convert it. */
9929 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9930 {
9931 new_stmt
9932 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9933 vect_simple_var,
9934 "vec_iv_"),
9935 VIEW_CONVERT_EXPR,
9936 build1 (VIEW_CONVERT_EXPR, vectype,
9937 vec_init));
9938 vec_init = gimple_assign_lhs (new_stmt);
9939 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9940 new_stmt);
9941 gcc_assert (!new_bb);
9942 }
9943 }
9944 else
9945 {
9946 /* iv_loop is the loop to be vectorized. Create:
9947 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9948 stmts = NULL;
9949 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9950
9951 unsigned HOST_WIDE_INT const_nunits;
9952 if (nunits.is_constant (&const_nunits))
9953 {
9954 tree_vector_builder elts (step_vectype, const_nunits, 1);
9955 elts.quick_push (new_name);
9956 for (i = 1; i < const_nunits; i++)
9957 {
9958 /* Create: new_name_i = new_name + step_expr */
9959 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9960 new_name, step_expr);
9961 elts.quick_push (new_name);
9962 }
9963 /* Create a vector from [new_name_0, new_name_1, ...,
9964 new_name_nunits-1] */
9965 vec_init = gimple_build_vector (&stmts, &elts);
9966 }
9967 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9968 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9969 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9970 new_name, step_expr);
9971 else
9972 {
9973 /* Build:
9974 [base, base, base, ...]
9975 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9976 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9977 gcc_assert (flag_associative_math);
9978 tree index = build_index_vector (step_vectype, 0, 1);
9979 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9980 new_name);
9981 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9982 step_expr);
9983 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9984 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9985 vec_init, step_vec);
9986 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9987 vec_init, base_vec);
9988 }
9989 vec_init = gimple_convert (&stmts, vectype, vec_init);
9990
9991 if (stmts)
9992 {
9993 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9994 gcc_assert (!new_bb);
9995 }
9996 }
9997
9998
9999 /* Create the vector that holds the step of the induction. */
10000 if (nested_in_vect_loop)
10001 /* iv_loop is nested in the loop to be vectorized. Generate:
10002 vec_step = [S, S, S, S] */
10003 new_name = step_expr;
10004 else
10005 {
10006 /* iv_loop is the loop to be vectorized. Generate:
10007 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10008 gimple_seq seq = NULL;
10009 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10010 {
10011 expr = build_int_cst (integer_type_node, vf);
10012 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10013 }
10014 else
10015 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10016 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10017 expr, step_expr);
10018 if (seq)
10019 {
10020 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10021 gcc_assert (!new_bb);
10022 }
10023 }
10024
10025 t = unshare_expr (new_name);
10026 gcc_assert (CONSTANT_CLASS_P (new_name)
10027 || TREE_CODE (new_name) == SSA_NAME);
10028 new_vec = build_vector_from_val (step_vectype, t);
10029 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10030 new_vec, step_vectype, NULL);
10031
10032
10033 /* Create the following def-use cycle:
10034 loop prolog:
10035 vec_init = ...
10036 vec_step = ...
10037 loop:
10038 vec_iv = PHI <vec_init, vec_loop>
10039 ...
10040 STMT
10041 ...
10042 vec_loop = vec_iv + vec_step; */
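/* As an illustration, for a scalar IV with init X = 0, step S = 1 and
   VF = 4 this yields vec_init = {0, 1, 2, 3} and vec_step = {4, 4, 4, 4},
   so each vector iteration advances every lane by VF scalar steps.  */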
10043
10044 /* Create the induction-phi that defines the induction-operand. */
10045 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10046 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10047 induc_def = PHI_RESULT (induction_phi);
10048
10049 /* Create the iv update inside the loop */
10050 stmts = NULL;
10051 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10052 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10053 vec_def = gimple_convert (&stmts, vectype, vec_def);
10054 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10055 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10056
10057 /* Set the arguments of the phi node: */
10058 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10059 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10060 UNKNOWN_LOCATION);
10061
10062 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10063 *vec_stmt = induction_phi;
10064
 10065 /* In case the vectorization factor (VF) is bigger than the number
 10066 of elements that we can fit in a vectype (nunits), we have to generate
 10067 more than one vector stmt - i.e., we need to "unroll" the
 10068 vector stmt by a factor of VF/nunits. For more details see the
 10069 documentation in vectorizable_operation. */
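/* Illustrative example: with VF = 8 and nunits = 4 we get ncopies = 2;
   the first add below produces the second copy of the IV and the final
   add produces the value fed back to the induction PHI over the latch
   edge.  */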
10070
10071 if (ncopies > 1)
10072 {
10073 gimple_seq seq = NULL;
10074 /* FORNOW. This restriction should be relaxed. */
10075 gcc_assert (!nested_in_vect_loop);
10076
10077 /* Create the vector that holds the step of the induction. */
10078 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10079 {
10080 expr = build_int_cst (integer_type_node, nunits);
10081 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10082 }
10083 else
10084 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10085 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10086 expr, step_expr);
10087 if (seq)
10088 {
10089 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10090 gcc_assert (!new_bb);
10091 }
10092
10093 t = unshare_expr (new_name);
10094 gcc_assert (CONSTANT_CLASS_P (new_name)
10095 || TREE_CODE (new_name) == SSA_NAME);
10096 new_vec = build_vector_from_val (step_vectype, t);
10097 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10098 new_vec, step_vectype, NULL);
10099
10100 vec_def = induc_def;
10101 for (i = 1; i < ncopies + 1; i++)
10102 {
10103 /* vec_i = vec_prev + vec_step */
10104 gimple_seq stmts = NULL;
10105 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10106 vec_def = gimple_build (&stmts,
10107 PLUS_EXPR, step_vectype, vec_def, vec_step);
10108 vec_def = gimple_convert (&stmts, vectype, vec_def);
10109
10110 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10111 if (i < ncopies)
10112 {
10113 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10114 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10115 }
10116 else
10117 {
10118 /* vec_1 = vec_iv + (VF/n * S)
10119 vec_2 = vec_1 + (VF/n * S)
10120 ...
10121 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10122
10123 vec_n is used as vec_loop to save the large step register and
10124 related operations. */
10125 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10126 UNKNOWN_LOCATION);
10127 }
10128 }
10129 }
10130
10131 if (dump_enabled_p ())
10132 dump_printf_loc (MSG_NOTE, vect_location,
10133 "transform induction: created def-use cycle: %G%G",
10134 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10135
10136 return true;
10137 }
10138
10139 /* Function vectorizable_live_operation.
10140
10141 STMT_INFO computes a value that is used outside the loop. Check if
10142 it can be supported. */
10143
10144 bool
10145 vectorizable_live_operation (vec_info *vinfo,
10146 stmt_vec_info stmt_info,
10147 gimple_stmt_iterator *gsi,
10148 slp_tree slp_node, slp_instance slp_node_instance,
10149 int slp_index, bool vec_stmt_p,
10150 stmt_vector_for_cost *cost_vec)
10151 {
10152 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10153 imm_use_iterator imm_iter;
10154 tree lhs, lhs_type, bitsize;
10155 tree vectype = (slp_node
10156 ? SLP_TREE_VECTYPE (slp_node)
10157 : STMT_VINFO_VECTYPE (stmt_info));
10158 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10159 int ncopies;
10160 gimple *use_stmt;
10161 auto_vec<tree> vec_oprnds;
10162 int vec_entry = 0;
10163 poly_uint64 vec_index = 0;
10164
10165 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10166
10167 /* If a stmt of a reduction is live, vectorize it via
10168 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10169 validity so just trigger the transform here. */
10170 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10171 {
10172 if (!vec_stmt_p)
10173 return true;
10174 if (slp_node)
10175 {
10176 /* For reduction chains the meta-info is attached to
10177 the group leader. */
10178 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10179 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10180 /* For SLP reductions we vectorize the epilogue for
10181 all involved stmts together. */
10182 else if (slp_index != 0)
10183 return true;
10184 }
10185 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10186 gcc_assert (reduc_info->is_reduc_info);
10187 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10188 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10189 return true;
10190 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10191 slp_node_instance);
10192 return true;
10193 }
10194
10195 /* If STMT is not relevant and it is a simple assignment and its inputs are
10196 invariant then it can remain in place, unvectorized. The original last
10197 scalar value that it computes will be used. */
10198 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10199 {
10200 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10201 if (dump_enabled_p ())
10202 dump_printf_loc (MSG_NOTE, vect_location,
10203 "statement is simple and uses invariant. Leaving in "
10204 "place.\n");
10205 return true;
10206 }
10207
10208 if (slp_node)
10209 ncopies = 1;
10210 else
10211 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10212
10213 if (slp_node)
10214 {
10215 gcc_assert (slp_index >= 0);
10216
10217 /* Get the last occurrence of the scalar index from the concatenation of
10218 all the slp vectors. Calculate which slp vector it is and the index
10219 within. */
10220 int num_scalar = SLP_TREE_LANES (slp_node);
10221 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10222 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
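/* For example (illustrative), with num_vec = 2, nunits = 4 and
   num_scalar = 3, lane slp_index 0 occurs last at pos = 8 - 3 + 0 = 5,
   i.e. in vector 1, lane 1.  */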
10223
10224 /* Calculate which vector contains the result, and which lane of
10225 that vector we need. */
10226 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10227 {
10228 if (dump_enabled_p ())
10229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10230 "Cannot determine which vector holds the"
10231 " final result.\n");
10232 return false;
10233 }
10234 }
10235
10236 if (!vec_stmt_p)
10237 {
10238 /* No transformation required. */
10239 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10240 {
10241 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10242 OPTIMIZE_FOR_SPEED))
10243 {
10244 if (dump_enabled_p ())
10245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10246 "can't operate on partial vectors "
10247 "because the target doesn't support extract "
10248 "last reduction.\n");
10249 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10250 }
10251 else if (slp_node)
10252 {
10253 if (dump_enabled_p ())
10254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10255 "can't operate on partial vectors "
10256 "because an SLP statement is live after "
10257 "the loop.\n");
10258 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10259 }
10260 else if (ncopies > 1)
10261 {
10262 if (dump_enabled_p ())
10263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10264 "can't operate on partial vectors "
10265 "because ncopies is greater than 1.\n");
10266 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10267 }
10268 else
10269 {
10270 gcc_assert (ncopies == 1 && !slp_node);
10271 vect_record_loop_mask (loop_vinfo,
10272 &LOOP_VINFO_MASKS (loop_vinfo),
10273 1, vectype, NULL);
10274 }
10275 }
10276 /* ??? Enable for loop costing as well. */
10277 if (!loop_vinfo)
10278 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10279 0, vect_epilogue);
10280 return true;
10281 }
10282
10283 /* Use the lhs of the original scalar statement. */
10284 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10285 if (dump_enabled_p ())
10286 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10287 "stmt %G", stmt);
10288
10289 lhs = gimple_get_lhs (stmt);
10290 lhs_type = TREE_TYPE (lhs);
10291
10292 bitsize = vector_element_bits_tree (vectype);
10293
10294 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10295 tree vec_lhs, bitstart;
10296 gimple *vec_stmt;
10297 if (slp_node)
10298 {
10299 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
10300
10301 /* Get the correct slp vectorized stmt. */
10302 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
10303 vec_lhs = gimple_get_lhs (vec_stmt);
10304
10305 /* Get entry to use. */
10306 bitstart = bitsize_int (vec_index);
10307 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10308 }
10309 else
10310 {
10311 /* For multiple copies, get the last copy. */
10312 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10313 vec_lhs = gimple_get_lhs (vec_stmt);
10314
10315 /* Get the last lane in the vector. */
10316 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
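/* For a V4SI vector, for example, this computes bitstart
   = 32 * 3 = 96, the bit offset of the last lane.  */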
10317 }
10318
10319 if (loop_vinfo)
10320 {
 10321 /* To ensure that VEC_LHS for the lane extraction stmts satisfies the
 10322 loop-closed PHI requirement, insert one phi node for it. It looks like:
10323 loop;
10324 BB:
10325 # lhs' = PHI <lhs>
10326 ==>
10327 loop;
10328 BB:
10329 # vec_lhs' = PHI <vec_lhs>
10330 new_tree = lane_extract <vec_lhs', ...>;
10331 lhs' = new_tree; */
10332
10333 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10334 basic_block exit_bb = single_exit (loop)->dest;
10335 gcc_assert (single_pred_p (exit_bb));
10336
10337 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10338 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10339 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10340
10341 gimple_seq stmts = NULL;
10342 tree new_tree;
10343 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10344 {
10345 /* Emit:
10346
10347 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10348
10349 where VEC_LHS is the vectorized live-out result and MASK is
10350 the loop mask for the final iteration. */
10351 gcc_assert (ncopies == 1 && !slp_node);
10352 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10353 tree mask = vect_get_loop_mask (loop_vinfo, gsi,
10354 &LOOP_VINFO_MASKS (loop_vinfo),
10355 1, vectype, 0);
10356 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10357 mask, vec_lhs_phi);
10358
10359 /* Convert the extracted vector element to the scalar type. */
10360 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10361 }
10362 else
10363 {
10364 tree bftype = TREE_TYPE (vectype);
10365 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10366 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10367 new_tree = build3 (BIT_FIELD_REF, bftype,
10368 vec_lhs_phi, bitsize, bitstart);
10369 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10370 &stmts, true, NULL_TREE);
10371 }
10372
10373 if (stmts)
10374 {
10375 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10376 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10377
10378 /* Remove existing phi from lhs and create one copy from new_tree. */
10379 tree lhs_phi = NULL_TREE;
10380 gimple_stmt_iterator gsi;
10381 for (gsi = gsi_start_phis (exit_bb);
10382 !gsi_end_p (gsi); gsi_next (&gsi))
10383 {
10384 gimple *phi = gsi_stmt (gsi);
10385 if ((gimple_phi_arg_def (phi, 0) == lhs))
10386 {
10387 remove_phi_node (&gsi, false);
10388 lhs_phi = gimple_phi_result (phi);
10389 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10390 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10391 break;
10392 }
10393 }
10394 }
10395
10396 /* Replace use of lhs with newly computed result. If the use stmt is a
 10397 single arg PHI, just replace all uses of the PHI result. This is necessary
 10398 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
10399 use_operand_p use_p;
10400 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10401 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10402 && !is_gimple_debug (use_stmt))
10403 {
10404 if (gimple_code (use_stmt) == GIMPLE_PHI
10405 && gimple_phi_num_args (use_stmt) == 1)
10406 {
10407 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10408 }
10409 else
10410 {
10411 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10412 SET_USE (use_p, new_tree);
10413 }
10414 update_stmt (use_stmt);
10415 }
10416 }
10417 else
10418 {
10419 /* For basic-block vectorization simply insert the lane-extraction. */
10420 tree bftype = TREE_TYPE (vectype);
10421 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10422 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10423 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10424 vec_lhs, bitsize, bitstart);
10425 gimple_seq stmts = NULL;
10426 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10427 &stmts, true, NULL_TREE);
10428 if (TREE_CODE (new_tree) == SSA_NAME
10429 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10430 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10431 if (is_a <gphi *> (vec_stmt))
10432 {
10433 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10434 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10435 }
10436 else
10437 {
10438 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10439 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10440 }
10441
10442 /* Replace use of lhs with newly computed result. If the use stmt is a
 10443 single arg PHI, just replace all uses of the PHI result. This is necessary
 10444 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
10445 use_operand_p use_p;
10446 stmt_vec_info use_stmt_info;
10447 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10448 if (!is_gimple_debug (use_stmt)
10449 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10450 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10451 {
10452 /* ??? This can happen when the live lane ends up being
10453 used in a vector construction code-generated by an
10454 external SLP node (and code-generation for that already
10455 happened). See gcc.dg/vect/bb-slp-47.c.
10456 Doing this is what would happen if that vector CTOR
10457 were not code-generated yet so it is not too bad.
10458 ??? In fact we'd likely want to avoid this situation
10459 in the first place. */
10460 if (TREE_CODE (new_tree) == SSA_NAME
10461 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10462 && gimple_code (use_stmt) != GIMPLE_PHI
10463 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10464 use_stmt))
10465 {
10466 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10467 gcc_checking_assert (code == SSA_NAME
10468 || code == CONSTRUCTOR
10469 || code == VIEW_CONVERT_EXPR
10470 || CONVERT_EXPR_CODE_P (code));
10471 if (dump_enabled_p ())
10472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10473 "Using original scalar computation for "
10474 "live lane because use preceeds vector "
10475 "def\n");
10476 continue;
10477 }
10478 /* ??? It can also happen that we end up pulling a def into
10479 a loop where replacing out-of-loop uses would require
10480 a new LC SSA PHI node. Retain the original scalar in
10481 those cases as well. PR98064. */
10482 if (TREE_CODE (new_tree) == SSA_NAME
10483 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10484 && (gimple_bb (use_stmt)->loop_father
10485 != gimple_bb (vec_stmt)->loop_father)
10486 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10487 gimple_bb (use_stmt)->loop_father))
10488 {
10489 if (dump_enabled_p ())
10490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10491 "Using original scalar computation for "
10492 "live lane because there is an out-of-loop "
10493 "definition for it\n");
10494 continue;
10495 }
10496 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10497 SET_USE (use_p, new_tree);
10498 update_stmt (use_stmt);
10499 }
10500 }
10501
10502 return true;
10503 }
10504
10505 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10506
10507 static void
10508 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10509 {
10510 ssa_op_iter op_iter;
10511 imm_use_iterator imm_iter;
10512 def_operand_p def_p;
10513 gimple *ustmt;
10514
10515 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10516 {
10517 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10518 {
10519 basic_block bb;
10520
10521 if (!is_gimple_debug (ustmt))
10522 continue;
10523
10524 bb = gimple_bb (ustmt);
10525
10526 if (!flow_bb_inside_loop_p (loop, bb))
10527 {
10528 if (gimple_debug_bind_p (ustmt))
10529 {
10530 if (dump_enabled_p ())
10531 dump_printf_loc (MSG_NOTE, vect_location,
10532 "killing debug use\n");
10533
10534 gimple_debug_bind_reset_value (ustmt);
10535 update_stmt (ustmt);
10536 }
10537 else
10538 gcc_unreachable ();
10539 }
10540 }
10541 }
10542 }
10543
10544 /* Given loop represented by LOOP_VINFO, return true if computation of
10545 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10546 otherwise. */
10547
10548 static bool
10549 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10550 {
10551 /* Constant case. */
10552 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10553 {
10554 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10555 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10556
10557 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10558 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10559 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10560 return true;
10561 }
10562
10563 widest_int max;
10564 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10565 /* Check the upper bound of loop niters. */
10566 if (get_max_loop_iterations (loop, &max))
10567 {
10568 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10569 signop sgn = TYPE_SIGN (type);
10570 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10571 if (max < type_max)
10572 return true;
10573 }
10574 return false;
10575 }
10576
10577 /* Return a mask type with half the number of elements as OLD_TYPE,
10578 given that it should have mode NEW_MODE. */
10579
10580 tree
10581 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10582 {
10583 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10584 return build_truth_vector_type_for_mode (nunits, new_mode);
10585 }
10586
10587 /* Return a mask type with twice as many elements as OLD_TYPE,
10588 given that it should have mode NEW_MODE. */
10589
10590 tree
10591 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10592 {
10593 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10594 return build_truth_vector_type_for_mode (nunits, new_mode);
10595 }
10596
10597 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10598 contain a sequence of NVECTORS masks that each control a vector of type
10599 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10600 these vector masks with the vector version of SCALAR_MASK. */
10601
10602 void
10603 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10604 unsigned int nvectors, tree vectype, tree scalar_mask)
10605 {
10606 gcc_assert (nvectors != 0);
10607
10608 if (scalar_mask)
10609 {
10610 scalar_cond_masked_key cond (scalar_mask, nvectors);
10611 loop_vinfo->scalar_cond_masked_set.add (cond);
10612 }
10613
10614 masks->mask_set.add (std::make_pair (vectype, nvectors));
10615 }
10616
10617 /* Given a complete set of masks MASKS, extract mask number INDEX
10618 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10619 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10620
10621 See the comment above vec_loop_masks for more details about the mask
10622 arrangement. */
10623
10624 tree
10625 vect_get_loop_mask (loop_vec_info loop_vinfo,
10626 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10627 unsigned int nvectors, tree vectype, unsigned int index)
10628 {
10629 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10630 == vect_partial_vectors_while_ult)
10631 {
10632 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10633 tree mask_type = rgm->type;
10634
10635 /* Populate the rgroup's mask array, if this is the first time we've
10636 used it. */
10637 if (rgm->controls.is_empty ())
10638 {
10639 rgm->controls.safe_grow_cleared (nvectors, true);
10640 for (unsigned int i = 0; i < nvectors; ++i)
10641 {
10642 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10643 /* Provide a dummy definition until the real one is available. */
10644 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10645 rgm->controls[i] = mask;
10646 }
10647 }
10648
10649 tree mask = rgm->controls[index];
10650 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10651 TYPE_VECTOR_SUBPARTS (vectype)))
10652 {
10653 /* A loop mask for data type X can be reused for data type Y
10654 if X has N times more elements than Y and if Y's elements
10655 are N times bigger than X's. In this case each sequence
10656 of N elements in the loop mask will be all-zero or all-one.
10657 We can then view-convert the mask so that each sequence of
10658 N elements is replaced by a single element. */
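/* For instance, a 16-element mask computed for V16QI data can be reused
   for V8HI data: each adjacent pair of mask elements is known to be
   equal, and the view-convert keeps one element per pair.  */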
10659 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10660 TYPE_VECTOR_SUBPARTS (vectype)));
10661 gimple_seq seq = NULL;
10662 mask_type = truth_type_for (vectype);
10663 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10664 if (seq)
10665 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10666 }
10667 return mask;
10668 }
10669 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10670 == vect_partial_vectors_avx512)
10671 {
10672 /* The number of scalars per iteration and the number of vectors are
10673 both compile-time constants. */
10674 unsigned int nscalars_per_iter
10675 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10676 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10677
10678 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10679
10680 /* The stored nV is dependent on the mask type produced. */
10681 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10682 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10683 == rgm->factor);
10684 nvectors = rgm->factor;
10685
10686 /* Populate the rgroup's mask array, if this is the first time we've
10687 used it. */
10688 if (rgm->controls.is_empty ())
10689 {
10690 rgm->controls.safe_grow_cleared (nvectors, true);
10691 for (unsigned int i = 0; i < nvectors; ++i)
10692 {
10693 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10694 /* Provide a dummy definition until the real one is available. */
10695 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10696 rgm->controls[i] = mask;
10697 }
10698 }
10699 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10700 TYPE_VECTOR_SUBPARTS (vectype)))
10701 return rgm->controls[index];
10702
 10703 /* Split the vector if needed. Since we are dealing with integer mode
 10704 masks with AVX512 we can operate on the integer representation,
 10705 performing the split with a whole-vector shift. */
10706 unsigned HOST_WIDE_INT factor;
10707 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10708 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10709 gcc_assert (ok);
10710 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10711 tree mask_type = truth_type_for (vectype);
10712 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10713 unsigned vi = index / factor;
10714 unsigned vpart = index % factor;
10715 tree vec = rgm->controls[vi];
10716 gimple_seq seq = NULL;
10717 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10718 lang_hooks.types.type_for_mode
10719 (TYPE_MODE (rgm->type), 1), vec);
10720 /* For integer mode masks simply shift the right bits into position. */
10721 if (vpart != 0)
10722 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10723 build_int_cst (integer_type_node,
10724 (TYPE_VECTOR_SUBPARTS (vectype)
10725 * vpart)));
10726 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10727 (TYPE_MODE (mask_type), 1), vec);
10728 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10729 if (seq)
10730 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10731 return vec;
10732 }
10733 else
10734 gcc_unreachable ();
10735 }
10736
10737 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10738 lengths for controlling an operation on VECTYPE. The operation splits
10739 each element of VECTYPE into FACTOR separate subelements, measuring the
10740 length as a number of these subelements. */
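/* For example (illustrative), an access on V4SI that has to be emulated
   as a V16QI access would use FACTOR 4 and measure the length in
   byte-sized subelements.  */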
10741
10742 void
10743 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10744 unsigned int nvectors, tree vectype, unsigned int factor)
10745 {
10746 gcc_assert (nvectors != 0);
10747 if (lens->length () < nvectors)
10748 lens->safe_grow_cleared (nvectors, true);
10749 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10750
 10751 /* The number of scalars per iteration, the bytes each scalar occupies
 10752 and the number of vectors are all compile-time constants. */
10753 unsigned int nscalars_per_iter
10754 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10755 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10756
10757 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10758 {
10759 /* For now, we only support cases in which all loads and stores fall back
10760 to VnQI or none do. */
10761 gcc_assert (!rgl->max_nscalars_per_iter
10762 || (rgl->factor == 1 && factor == 1)
10763 || (rgl->max_nscalars_per_iter * rgl->factor
10764 == nscalars_per_iter * factor));
10765 rgl->max_nscalars_per_iter = nscalars_per_iter;
10766 rgl->type = vectype;
10767 rgl->factor = factor;
10768 }
10769 }
10770
10771 /* Given a complete set of lengths LENS, extract length number INDEX
10772 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10773 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
 10774 multiplied by the number of elements that should be processed.
10775 Insert any set-up statements before GSI. */
10776
10777 tree
10778 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10779 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10780 unsigned int index, unsigned int factor)
10781 {
10782 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10783 bool use_bias_adjusted_len =
10784 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10785
10786 /* Populate the rgroup's len array, if this is the first time we've
10787 used it. */
10788 if (rgl->controls.is_empty ())
10789 {
10790 rgl->controls.safe_grow_cleared (nvectors, true);
10791 for (unsigned int i = 0; i < nvectors; ++i)
10792 {
10793 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10794 gcc_assert (len_type != NULL_TREE);
10795
10796 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10797
10798 /* Provide a dummy definition until the real one is available. */
10799 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10800 rgl->controls[i] = len;
10801
10802 if (use_bias_adjusted_len)
10803 {
10804 gcc_assert (i == 0);
10805 tree adjusted_len =
10806 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10807 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10808 rgl->bias_adjusted_ctrl = adjusted_len;
10809 }
10810 }
10811 }
10812
10813 if (use_bias_adjusted_len)
10814 return rgl->bias_adjusted_ctrl;
10815
10816 tree loop_len = rgl->controls[index];
10817 if (rgl->factor == 1 && factor == 1)
10818 {
10819 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10820 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10821 if (maybe_ne (nunits1, nunits2))
10822 {
10823 /* A loop len for data type X can be reused for data type Y
10824 if X has N times more elements than Y and if Y's elements
10825 are N times bigger than X's. */
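/* E.g. a length computed for V16QI can be reused for V8HI by dividing
   it by 2, since each V8HI element covers two V16QI subelements.  */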
10826 gcc_assert (multiple_p (nunits1, nunits2));
10827 factor = exact_div (nunits1, nunits2).to_constant ();
10828 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10829 gimple_seq seq = NULL;
10830 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10831 build_int_cst (iv_type, factor));
10832 if (seq)
10833 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10834 }
10835 }
10836 return loop_len;
10837 }
10838
10839 /* Scale profiling counters by estimation for LOOP which is vectorized
10840 by factor VF.
10841 If FLAT is true, the loop we started with had unrealistically flat
10842 profile. */
10843
10844 static void
10845 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10846 {
10847 /* For flat profiles do not scale down proportionally by VF and only
10848 cap by known iteration count bounds. */
10849 if (flat)
10850 {
10851 if (dump_file && (dump_flags & TDF_DETAILS))
10852 fprintf (dump_file,
10853 "Vectorized loop profile seems flat; not scaling iteration "
10854 "count down by the vectorization factor %i\n", vf);
10855 scale_loop_profile (loop, profile_probability::always (),
10856 get_likely_max_loop_iterations_int (loop));
10857 return;
10858 }
 10859 /* The loop body executes VF times fewer iterations and the exit probability increases VF times. */
10860 edge exit_e = single_exit (loop);
10861 profile_count entry_count = loop_preheader_edge (loop)->count ();
10862
 10863 /* If we have an unreliable loop profile, avoid dropping the entry
 10864 count below the header count. This can happen when the loop
 10865 has an unrealistically low trip count. */
10866 while (vf > 1
10867 && loop->header->count > entry_count
10868 && loop->header->count < entry_count * vf)
10869 {
10870 if (dump_file && (dump_flags & TDF_DETAILS))
10871 fprintf (dump_file,
10872 "Vectorization factor %i seems too large for profile "
10873 "prevoiusly believed to be consistent; reducing.\n", vf);
10874 vf /= 2;
10875 }
10876
10877 if (entry_count.nonzero_p ())
10878 set_edge_probability_and_rescale_others
10879 (exit_e,
10880 entry_count.probability_in (loop->header->count / vf));
 10881 /* Avoid producing a very large exit probability when we do not have a
 10882 sensible profile. */
10883 else if (exit_e->probability < profile_probability::always () / (vf * 2))
10884 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10885 loop->latch->count = single_pred_edge (loop->latch)->count ();
10886
10887 scale_loop_profile (loop, profile_probability::always () / vf,
10888 get_likely_max_loop_iterations_int (loop));
10889 }
10890
10891 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10892 latch edge values originally defined by it. */
10893
10894 static void
10895 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10896 stmt_vec_info def_stmt_info)
10897 {
10898 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10899 if (!def || TREE_CODE (def) != SSA_NAME)
10900 return;
10901 stmt_vec_info phi_info;
10902 imm_use_iterator iter;
10903 use_operand_p use_p;
10904 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10905 {
10906 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10907 if (!phi)
10908 continue;
10909 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10910 && (phi_info = loop_vinfo->lookup_stmt (phi))
10911 && STMT_VINFO_RELEVANT_P (phi_info)))
10912 continue;
10913 loop_p loop = gimple_bb (phi)->loop_father;
10914 edge e = loop_latch_edge (loop);
10915 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10916 continue;
10917
10918 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10919 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10920 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10921 {
10922 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10923 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10924 gcc_assert (phi_defs.length () == latch_defs.length ());
10925 for (unsigned i = 0; i < phi_defs.length (); ++i)
10926 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10927 gimple_get_lhs (latch_defs[i]), e,
10928 gimple_phi_arg_location (phi, e->dest_idx));
10929 }
10930 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10931 {
10932 /* For first order recurrences we have to update both uses of
10933 the latch definition, the one in the PHI node and the one
10934 in the generated VEC_PERM_EXPR. */
10935 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10936 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10937 gcc_assert (phi_defs.length () == latch_defs.length ());
10938 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10939 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10940 for (unsigned i = 0; i < phi_defs.length (); ++i)
10941 {
10942 gassign *perm = as_a <gassign *> (phi_defs[i]);
10943 if (i > 0)
10944 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10945 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10946 update_stmt (perm);
10947 }
10948 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10949 gimple_phi_arg_location (phi, e->dest_idx));
10950 }
10951 }
10952 }
10953
10954 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10955 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10956 stmt_vec_info. */
10957
10958 static bool
10959 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10960 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10961 {
10962 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10963 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10964
10965 if (dump_enabled_p ())
10966 dump_printf_loc (MSG_NOTE, vect_location,
10967 "------>vectorizing statement: %G", stmt_info->stmt);
10968
10969 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10970 vect_loop_kill_debug_uses (loop, stmt_info);
10971
10972 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10973 && !STMT_VINFO_LIVE_P (stmt_info))
10974 return false;
10975
10976 if (STMT_VINFO_VECTYPE (stmt_info))
10977 {
10978 poly_uint64 nunits
10979 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10980 if (!STMT_SLP_TYPE (stmt_info)
10981 && maybe_ne (nunits, vf)
10982 && dump_enabled_p ())
 10983 /* For SLP, VF is set according to the unrolling factor, and not
 10984 to the vector size, hence for SLP this print is not valid. */
10985 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10986 }
10987
10988 /* Pure SLP statements have already been vectorized. We still need
10989 to apply loop vectorization to hybrid SLP statements. */
10990 if (PURE_SLP_STMT (stmt_info))
10991 return false;
10992
10993 if (dump_enabled_p ())
10994 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10995
10996 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10997 *seen_store = stmt_info;
10998
10999 return true;
11000 }
11001
11002 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11003 in the hash_map with its corresponding values. */
11004
11005 static tree
11006 find_in_mapping (tree t, void *context)
11007 {
11008 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11009
11010 tree *value = mapping->get (t);
11011 return value ? *value : t;
11012 }
11013
11014 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11015 original loop that has now been vectorized.
11016
11017 The inits of the data_references need to be advanced with the number of
11018 iterations of the main loop. This has been computed in vect_do_peeling and
 11019 is stored in the parameter ADVANCE. We first restore the data_references'
 11020 initial offsets with the values recorded in ORIG_DRS_INIT.
11021
11022 Since the loop_vec_info of this EPILOGUE was constructed for the original
11023 loop, its stmt_vec_infos all point to the original statements. These need
11024 to be updated to point to their corresponding copies as well as the SSA_NAMES
11025 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11026
11027 The data_reference's connections also need to be updated. Their
11028 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11029 stmt_vec_infos, their statements need to point to their corresponding copy,
11030 if they are gather loads or scatter stores then their reference needs to be
11031 updated to point to its corresponding copy and finally we set
11032 'base_misaligned' to false as we have already peeled for alignment in the
11033 prologue of the main loop. */
11034
11035 static void
11036 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11037 {
11038 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11039 auto_vec<gimple *> stmt_worklist;
11040 hash_map<tree,tree> mapping;
11041 gimple *orig_stmt, *new_stmt;
11042 gimple_stmt_iterator epilogue_gsi;
11043 gphi_iterator epilogue_phi_gsi;
11044 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11045 basic_block *epilogue_bbs = get_loop_body (epilogue);
11046 unsigned i;
11047
11048 free (LOOP_VINFO_BBS (epilogue_vinfo));
11049 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11050
11051 /* Advance data_reference's with the number of iterations of the previous
11052 loop and its prologue. */
11053 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11054
11055
11056 /* The EPILOGUE loop is a copy of the original loop so they share the same
11057 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
 11058 point to the copied statements. We also create a mapping of all LHSs in
 11059 the original loop and all the LHSs in the EPILOGUE and create worklists to
 11060 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11061 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11062 {
11063 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11064 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11065 {
11066 new_stmt = epilogue_phi_gsi.phi ();
11067
11068 gcc_assert (gimple_uid (new_stmt) > 0);
11069 stmt_vinfo
11070 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11071
11072 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11073 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11074
11075 mapping.put (gimple_phi_result (orig_stmt),
11076 gimple_phi_result (new_stmt));
11077 /* PHI nodes can not have patterns or related statements. */
11078 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11079 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11080 }
11081
11082 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11083 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11084 {
11085 new_stmt = gsi_stmt (epilogue_gsi);
11086 if (is_gimple_debug (new_stmt))
11087 continue;
11088
11089 gcc_assert (gimple_uid (new_stmt) > 0);
11090 stmt_vinfo
11091 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11092
11093 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11094 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11095
11096 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11097 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11098
11099 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11100 {
11101 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11102 for (gimple_stmt_iterator gsi = gsi_start (seq);
11103 !gsi_end_p (gsi); gsi_next (&gsi))
11104 stmt_worklist.safe_push (gsi_stmt (gsi));
11105 }
11106
11107 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11108 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11109 {
11110 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11111 stmt_worklist.safe_push (stmt);
11112 /* Set BB such that the assert in
11113 'get_initial_def_for_reduction' is able to determine that
11114 the BB of the related stmt is inside this loop. */
11115 gimple_set_bb (stmt,
11116 gimple_bb (new_stmt));
11117 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11118 gcc_assert (related_vinfo == NULL
11119 || related_vinfo == stmt_vinfo);
11120 }
11121 }
11122 }
11123
11124 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11125 using the original main loop and thus need to be updated to refer to the
11126 cloned variables used in the epilogue. */
11127 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11128 {
11129 gimple *stmt = stmt_worklist[i];
11130 tree *new_op;
11131
11132 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11133 {
11134 tree op = gimple_op (stmt, j);
11135 if ((new_op = mapping.get(op)))
11136 gimple_set_op (stmt, j, *new_op);
11137 else
11138 {
11139 /* PR92429: The last argument of simplify_replace_tree disables
11140 folding when replacing arguments. This is required as
11141 otherwise you might end up with different statements than the
11142 ones analyzed in vect_analyze_loop, leading to different
11143 vectorization. */
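/* For example (hypothetical): with folding enabled, replacing an operand
   could simplify a statement shaped like _5 = _7 + 0 into a plain copy,
   which would no longer match the statement shape that the analysis phase
   recorded.  */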
11144 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11145 &find_in_mapping, &mapping, false);
11146 gimple_set_op (stmt, j, op);
11147 }
11148 }
11149 }
11150
11151 struct data_reference *dr;
11152 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11153 FOR_EACH_VEC_ELT (datarefs, i, dr)
11154 {
11155 orig_stmt = DR_STMT (dr);
11156 gcc_assert (gimple_uid (orig_stmt) > 0);
11157 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11158 /* Data references for gather loads and scatter stores do not use the
11159 updated offset we set using ADVANCE. Instead we have to make sure the
11160 references in the data reference point to the corresponding copy of
11161 the original in the epilogue. */
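/* For instance (illustrative names): a gather whose DR_REF is a[idx_7],
   with idx_7 defined in the main loop, needs idx_7 replaced by its epilogue
   copy, say idx_17, which is what the MAPPING-based simplify_replace_tree
   calls below do.  */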
11162 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11163 == VMAT_GATHER_SCATTER)
11164 {
11165 DR_REF (dr)
11166 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11167 &find_in_mapping, &mapping);
11168 DR_BASE_ADDRESS (dr)
11169 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11170 &find_in_mapping, &mapping);
11171 }
11172 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11173 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11174 /* The vector size of the epilogue is smaller than that of the main loop
11175 so the alignment requirement is either the same or lower. This means
11176 the dr will by definition be aligned. */
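/* For example (hypothetical modes): if the main loop used V16QI vectors and
   the access is 16-byte aligned there, a V8QI epilogue only needs 8-byte
   alignment, which that access already satisfies.  */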
11177 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11178 }
11179
11180 epilogue_vinfo->shared->datarefs_copy.release ();
11181 epilogue_vinfo->shared->save_datarefs ();
11182 }
11183
11184 /* Function vect_transform_loop.
11185
11186 The analysis phase has determined that the loop is vectorizable.
11187 Vectorize the loop - create vectorized stmts to replace the scalar
11188 stmts in the loop, and update the loop exit condition.
11189 Returns the scalar epilogue loop, if any. */
11190
11191 class loop *
11192 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11193 {
11194 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11195 class loop *epilogue = NULL;
11196 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11197 int nbbs = loop->num_nodes;
11198 int i;
11199 tree niters_vector = NULL_TREE;
11200 tree step_vector = NULL_TREE;
11201 tree niters_vector_mult_vf = NULL_TREE;
11202 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11203 unsigned int lowest_vf = constant_lower_bound (vf);
11204 gimple *stmt;
11205 bool check_profitability = false;
11206 unsigned int th;
11207 bool flat = maybe_flat_loop_profile (loop);
11208
11209 DUMP_VECT_SCOPE ("vec_transform_loop");
11210
11211 loop_vinfo->shared->check_datarefs ();
11212
11213 /* Use the more conservative vectorization threshold. If the number
11214 of iterations is constant, assume the cost check has been performed
11215 by our caller. If the threshold makes all loops profitable that
11216 run at least the (estimated) vectorization factor number of times,
11217 checking is pointless, too. */
11218 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11219 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11220 {
11221 if (dump_enabled_p ())
11222 dump_printf_loc (MSG_NOTE, vect_location,
11223 "Profitability threshold is %d loop iterations.\n",
11224 th);
11225 check_profitability = true;
11226 }
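/* Sketch of the effect (assumed behaviour, simplified): when
   CHECK_PROFITABILITY is true, the peeling/versioning code below guards the
   vectorized path with a runtime test roughly of the form
     if (niters >= th)
       <vectorized loop>
     else
       <scalar loop>
   so loops with too few iterations keep running the scalar code.  */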
11227
11228 /* Make sure there exists a single-predecessor exit bb. Do this before
11229 versioning. */
11230 edge e = single_exit (loop);
11231 if (! single_pred_p (e->dest))
11232 {
11233 split_loop_exit_edge (e, true);
11234 if (dump_enabled_p ())
11235 dump_printf (MSG_NOTE, "split exit edge\n");
11236 }
11237
11238 /* Version the loop first, if required, so the profitability check
11239 comes first. */
11240
11241 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11242 {
11243 class loop *sloop
11244 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11245 sloop->force_vectorize = false;
11246 check_profitability = false;
11247 }
11248
11249 /* Make sure there exists a single-predecessor exit bb also on the
11250 scalar loop copy. Do this after versioning but before peeling
11251 so the CFG structure is fine for both the scalar and the if-converted
11252 loop, making slpeel_duplicate_current_defs_from_edges face matched
11253 loop-closed PHI nodes on the exit. */
11254 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11255 {
11256 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11257 if (! single_pred_p (e->dest))
11258 {
11259 split_loop_exit_edge (e, true);
11260 if (dump_enabled_p ())
11261 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11262 }
11263 }
11264
11265 tree niters = vect_build_loop_niters (loop_vinfo);
11266 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11267 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11268 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11269 tree advance;
11270 drs_init_vec orig_drs_init;
11271
11272 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11273 &step_vector, &niters_vector_mult_vf, th,
11274 check_profitability, niters_no_overflow,
11275 &advance);
11276 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11277 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11278 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11279 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11280
11281 if (niters_vector == NULL_TREE)
11282 {
11283 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11284 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11285 && known_eq (lowest_vf, vf))
11286 {
11287 niters_vector
11288 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11289 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11290 step_vector = build_one_cst (TREE_TYPE (niters));
11291 }
11292 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11293 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11294 &step_vector, niters_no_overflow);
11295 else
11296 /* vect_do_peeling subtracted the number of peeled prologue
11297 iterations from LOOP_VINFO_NITERS. */
11298 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11299 &niters_vector, &step_vector,
11300 niters_no_overflow);
11301 }
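/* Worked example (hypothetical values): with LOOP_VINFO_INT_NITERS = 17 and
   a constant VF of 8, NITERS_VECTOR becomes 17 / 8 = 2 and STEP_VECTOR is 1,
   i.e. two vector iterations covering 16 scalar iterations, with the
   remaining scalar iteration left to the epilogue.  */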
11302
11303 /* 1) Make sure the loop header has exactly two entries
11304 2) Make sure we have a preheader basic block. */
11305
11306 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11307
11308 split_edge (loop_preheader_edge (loop));
11309
11310 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11311 /* This will deal with any possible peeling. */
11312 vect_prepare_for_masked_peels (loop_vinfo);
11313
11314 /* Schedule the SLP instances first, then handle loop vectorization
11315 below. */
11316 if (!loop_vinfo->slp_instances.is_empty ())
11317 {
11318 DUMP_VECT_SCOPE ("scheduling SLP instances");
11319 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11320 }
11321
11322 /* FORNOW: the vectorizer supports only loops whose body consists
11323 of one basic block (header + empty latch). When the vectorizer
11324 supports more involved loop forms, the order in which the BBs are
11325 traversed needs to be reconsidered. */
11326
11327 for (i = 0; i < nbbs; i++)
11328 {
11329 basic_block bb = bbs[i];
11330 stmt_vec_info stmt_info;
11331
11332 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11333 gsi_next (&si))
11334 {
11335 gphi *phi = si.phi ();
11336 if (dump_enabled_p ())
11337 dump_printf_loc (MSG_NOTE, vect_location,
11338 "------>vectorizing phi: %G", (gimple *) phi);
11339 stmt_info = loop_vinfo->lookup_stmt (phi);
11340 if (!stmt_info)
11341 continue;
11342
11343 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11344 vect_loop_kill_debug_uses (loop, stmt_info);
11345
11346 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11347 && !STMT_VINFO_LIVE_P (stmt_info))
11348 continue;
11349
11350 if (STMT_VINFO_VECTYPE (stmt_info)
11351 && (maybe_ne
11352 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11353 && dump_enabled_p ())
11354 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11355
11356 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11357 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11358 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11359 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11360 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11361 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11362 && ! PURE_SLP_STMT (stmt_info))
11363 {
11364 if (dump_enabled_p ())
11365 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11366 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11367 }
11368 }
11369
11370 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11371 gsi_next (&si))
11372 {
11373 gphi *phi = si.phi ();
11374 stmt_info = loop_vinfo->lookup_stmt (phi);
11375 if (!stmt_info)
11376 continue;
11377
11378 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11379 && !STMT_VINFO_LIVE_P (stmt_info))
11380 continue;
11381
11382 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11383 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11384 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11385 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11386 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11387 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11388 && ! PURE_SLP_STMT (stmt_info))
11389 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11390 }
11391
11392 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11393 !gsi_end_p (si);)
11394 {
11395 stmt = gsi_stmt (si);
11396 /* During vectorization remove existing clobber stmts. */
11397 if (gimple_clobber_p (stmt))
11398 {
11399 unlink_stmt_vdef (stmt);
11400 gsi_remove (&si, true);
11401 release_defs (stmt);
11402 }
11403 else
11404 {
11405 /* Ignore vector stmts created in the outer loop. */
11406 stmt_info = loop_vinfo->lookup_stmt (stmt);
11407
11408 /* vector stmts created in the outer-loop during vectorization of
11409 stmts in an inner-loop may not have a stmt_info, and do not
11410 need to be vectorized. */
11411 stmt_vec_info seen_store = NULL;
11412 if (stmt_info)
11413 {
11414 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11415 {
11416 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11417 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11418 !gsi_end_p (subsi); gsi_next (&subsi))
11419 {
11420 stmt_vec_info pat_stmt_info
11421 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11422 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11423 &si, &seen_store);
11424 }
11425 stmt_vec_info pat_stmt_info
11426 = STMT_VINFO_RELATED_STMT (stmt_info);
11427 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11428 &si, &seen_store))
11429 maybe_set_vectorized_backedge_value (loop_vinfo,
11430 pat_stmt_info);
11431 }
11432 else
11433 {
11434 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11435 &seen_store))
11436 maybe_set_vectorized_backedge_value (loop_vinfo,
11437 stmt_info);
11438 }
11439 }
11440 gsi_next (&si);
11441 if (seen_store)
11442 {
11443 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11444 /* Interleaving. The vectorization of the
11445 interleaving chain was completed -
11446 free all the stores in the chain. */
11447 vect_remove_stores (loop_vinfo,
11448 DR_GROUP_FIRST_ELEMENT (seen_store));
11449 else
11450 /* Free the attached stmt_vec_info and remove the stmt. */
11451 loop_vinfo->remove_stmt (stmt_info);
11452 }
11453 }
11454 }
11455
11456 /* Stub out scalar statements that must not survive vectorization.
11457 Doing this here helps with grouped statements, or statements that
11458 are involved in patterns. */
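/* Illustrative sketch (hypothetical GIMPLE): a left-over scalar
     _7 = .MASK_LOAD (p_3, 8B, mask_5);
   is replaced below by
     _7 = 0;
   and a scalar conditional call such as
     _9 = .COND_ADD (mask_5, _1, _2, else_4);
   is replaced by
     _9 = else_4;
   so that no scalar internal-function call survives vectorization.  */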
11459 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11460 !gsi_end_p (gsi); gsi_next (&gsi))
11461 {
11462 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11463 if (!call || !gimple_call_internal_p (call))
11464 continue;
11465 internal_fn ifn = gimple_call_internal_fn (call);
11466 if (ifn == IFN_MASK_LOAD)
11467 {
11468 tree lhs = gimple_get_lhs (call);
11469 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11470 {
11471 tree zero = build_zero_cst (TREE_TYPE (lhs));
11472 gimple *new_stmt = gimple_build_assign (lhs, zero);
11473 gsi_replace (&gsi, new_stmt, true);
11474 }
11475 }
11476 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11477 {
11478 tree lhs = gimple_get_lhs (call);
11479 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11480 {
11481 tree else_arg
11482 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11483 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11484 gsi_replace (&gsi, new_stmt, true);
11485 }
11486 }
11487 }
11488 } /* BBs in loop */
11489
11490 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11491 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11492 if (integer_onep (step_vector))
11493 niters_no_overflow = true;
11494 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11495 niters_vector_mult_vf, !niters_no_overflow);
11496
11497 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11498
11499 /* True if the final iteration might not handle a full vector's
11500 worth of scalar iterations. */
11501 bool final_iter_may_be_partial
11502 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11503 /* The minimum number of iterations performed by the epilogue. This
11504 is 1 when peeling for gaps because we always need a final scalar
11505 iteration. */
11506 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11507 /* +1 to convert latch counts to loop iteration counts,
11508 -min_epilogue_iters to remove iterations that cannot be performed
11509 by the vector code. */
11510 int bias_for_lowest = 1 - min_epilogue_iters;
11511 int bias_for_assumed = bias_for_lowest;
11512 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11513 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11514 {
11515 /* When the amount of peeling is known at compile time, the first
11516 iteration will have exactly alignment_npeels active elements.
11517 In the worst case it will have at least one. */
11518 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11519 bias_for_lowest += lowest_vf - min_first_active;
11520 bias_for_assumed += assumed_vf - min_first_active;
11521 }
11522 /* In these calculations the "- 1" converts loop iteration counts
11523 back to latch counts. */
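/* Worked example (hypothetical numbers): with a latch bound of 99 (100
   scalar iterations), LOWEST_VF = 8, no peeling for gaps, no partial vectors
   and no alignment-peeling adjustment, BIAS_FOR_LOWEST is 1 and the new
   bound is
     udiv_floor (99 + 1, 8) - 1 = 11
   i.e. at most 12 vector iterations covering 96 scalar iterations, with the
   rest handled by the epilogue.  */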
11524 if (loop->any_upper_bound)
11525 {
11526 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11527 loop->nb_iterations_upper_bound
11528 = (final_iter_may_be_partial
11529 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11530 lowest_vf) - 1
11531 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11532 lowest_vf) - 1);
11533 if (main_vinfo
11534 /* Both peeling for alignment and peeling for gaps can end up
11535 with the scalar epilogue running for more than VF-1 iterations. */
11536 && !main_vinfo->peeling_for_alignment
11537 && !main_vinfo->peeling_for_gaps)
11538 {
11539 unsigned int bound;
11540 poly_uint64 main_iters
11541 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11542 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11543 main_iters
11544 = upper_bound (main_iters,
11545 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11546 if (can_div_away_from_zero_p (main_iters,
11547 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11548 &bound))
11549 loop->nb_iterations_upper_bound
11550 = wi::umin ((widest_int) (bound - 1),
11551 loop->nb_iterations_upper_bound);
11552 }
11553 }
11554 if (loop->any_likely_upper_bound)
11555 loop->nb_iterations_likely_upper_bound
11556 = (final_iter_may_be_partial
11557 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11558 + bias_for_lowest, lowest_vf) - 1
11559 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11560 + bias_for_lowest, lowest_vf) - 1);
11561 if (loop->any_estimate)
11562 loop->nb_iterations_estimate
11563 = (final_iter_may_be_partial
11564 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11565 assumed_vf) - 1
11566 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11567 assumed_vf) - 1);
11568 scale_profile_for_vect_loop (loop, assumed_vf, flat);
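/* Assumed behaviour of the helper above: the loop body profile is scaled
   down by roughly ASSUMED_VF (e.g. a header count of 1000 with
   ASSUMED_VF = 4 becomes about 250), while the FLAT flag computed from
   maybe_flat_loop_profile lets it special-case loops without a meaningful
   iteration-count profile.  */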
11569
11570 if (dump_enabled_p ())
11571 {
11572 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11573 {
11574 dump_printf_loc (MSG_NOTE, vect_location,
11575 "LOOP VECTORIZED\n");
11576 if (loop->inner)
11577 dump_printf_loc (MSG_NOTE, vect_location,
11578 "OUTER LOOP VECTORIZED\n");
11579 dump_printf (MSG_NOTE, "\n");
11580 }
11581 else
11582 dump_printf_loc (MSG_NOTE, vect_location,
11583 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11584 GET_MODE_NAME (loop_vinfo->vector_mode));
11585 }
11586
11587 /* Loops vectorized with a variable factor won't benefit from
11588 unrolling/peeling. */
11589 if (!vf.is_constant ())
11590 {
11591 loop->unroll = 1;
11592 if (dump_enabled_p ())
11593 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11594 " variable-length vectorization factor\n");
11595 }
11596 /* Free SLP instances here because otherwise stmt reference counting
11597 won't work. */
11598 slp_instance instance;
11599 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11600 vect_free_slp_instance (instance);
11601 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11602 /* Clear the safelen field since its value is invalid after vectorization:
11603 the vectorized loop can have loop-carried dependencies. */
11604 loop->safelen = 0;
11605
11606 if (epilogue)
11607 {
11608 update_epilogue_loop_vinfo (epilogue, advance);
11609
11610 epilogue->simduid = loop->simduid;
11611 epilogue->force_vectorize = loop->force_vectorize;
11612 epilogue->dont_vectorize = false;
11613 }
11614
11615 return epilogue;
11616 }
11617
11618 /* The code below performs a simple optimization - it reverts
11619 if-conversion for masked stores: if the mask of a store is zero,
11620 skip the store and, if possible, the producers of the stored values too.
11621 For example,
11622 for (i=0; i<n; i++)
11623 if (c[i])
11624 {
11625 p1[i] += 1;
11626 p2[i] = p3[i] +2;
11627 }
11628 this transformation will produce the following semi-hammock:
11629
11630 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11631 {
11632 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11633 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11634 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11635 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11636 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11637 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11638 }
11639 */
11640
11641 void
11642 optimize_mask_stores (class loop *loop)
11643 {
11644 basic_block *bbs = get_loop_body (loop);
11645 unsigned nbbs = loop->num_nodes;
11646 unsigned i;
11647 basic_block bb;
11648 class loop *bb_loop;
11649 gimple_stmt_iterator gsi;
11650 gimple *stmt;
11651 auto_vec<gimple *> worklist;
11652 auto_purge_vect_location sentinel;
11653
11654 vect_location = find_loop_location (loop);
11655 /* Pick up all masked stores in loop if any. */
11656 for (i = 0; i < nbbs; i++)
11657 {
11658 bb = bbs[i];
11659 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11660 gsi_next (&gsi))
11661 {
11662 stmt = gsi_stmt (gsi);
11663 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11664 worklist.safe_push (stmt);
11665 }
11666 }
11667
11668 free (bbs);
11669 if (worklist.is_empty ())
11670 return;
11671
11672 /* Loop has masked stores. */
11673 while (!worklist.is_empty ())
11674 {
11675 gimple *last, *last_store;
11676 edge e, efalse;
11677 tree mask;
11678 basic_block store_bb, join_bb;
11679 gimple_stmt_iterator gsi_to;
11680 tree vdef, new_vdef;
11681 gphi *phi;
11682 tree vectype;
11683 tree zero;
11684
11685 last = worklist.pop ();
11686 mask = gimple_call_arg (last, 2);
11687 bb = gimple_bb (last);
11688 /* Create then_bb and if-then structure in CFG, then_bb belongs to
11689 the same loop as if_bb. It could be different from LOOP when a
11690 two-level loop nest is vectorized and the mask_store belongs to the
11691 inner one. */
11692 e = split_block (bb, last);
11693 bb_loop = bb->loop_father;
11694 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11695 join_bb = e->dest;
11696 store_bb = create_empty_bb (bb);
11697 add_bb_to_loop (store_bb, bb_loop);
11698 e->flags = EDGE_TRUE_VALUE;
11699 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11700 /* Put STORE_BB on the unlikely path. */
11701 efalse->probability = profile_probability::unlikely ();
11702 e->probability = efalse->probability.invert ();
11703 store_bb->count = efalse->count ();
11704 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11705 if (dom_info_available_p (CDI_DOMINATORS))
11706 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11707 if (dump_enabled_p ())
11708 dump_printf_loc (MSG_NOTE, vect_location,
11709 "Create new block %d to sink mask stores.",
11710 store_bb->index);
11711 /* Create vector comparison with boolean result. */
11712 vectype = TREE_TYPE (mask);
11713 zero = build_zero_cst (vectype);
11714 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11715 gsi = gsi_last_bb (bb);
11716 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11717 /* Create new PHI node for vdef of the last masked store:
11718 .MEM_2 = VDEF <.MEM_1>
11719 will be converted to
11720 .MEM.3 = VDEF <.MEM_1>
11721 and new PHI node will be created in join bb
11722 .MEM_2 = PHI <.MEM_1, .MEM_3>
11723 */
11724 vdef = gimple_vdef (last);
11725 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11726 gimple_set_vdef (last, new_vdef);
11727 phi = create_phi_node (vdef, join_bb);
11728 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11729
11730 /* Put all masked stores with the same mask to STORE_BB if possible. */
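/* For instance (illustrative GIMPLE, reusing the names from the example
   above): two stores guarded by the same mask,
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
     MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
   both end up in STORE_BB, which is only entered when the mask is not
   all-zero.  */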
11731 while (true)
11732 {
11733 gimple_stmt_iterator gsi_from;
11734 gimple *stmt1 = NULL;
11735
11736 /* Move masked store to STORE_BB. */
11737 last_store = last;
11738 gsi = gsi_for_stmt (last);
11739 gsi_from = gsi;
11740 /* Shift GSI to the previous stmt for further traversal. */
11741 gsi_prev (&gsi);
11742 gsi_to = gsi_start_bb (store_bb);
11743 gsi_move_before (&gsi_from, &gsi_to);
11744 /* Set up GSI_TO at the start of the now non-empty block. */
11745 gsi_to = gsi_start_bb (store_bb);
11746 if (dump_enabled_p ())
11747 dump_printf_loc (MSG_NOTE, vect_location,
11748 "Move stmt to created bb\n%G", last);
11749 /* Move all stored value producers if possible. */
11750 while (!gsi_end_p (gsi))
11751 {
11752 tree lhs;
11753 imm_use_iterator imm_iter;
11754 use_operand_p use_p;
11755 bool res;
11756
11757 /* Skip debug statements. */
11758 if (is_gimple_debug (gsi_stmt (gsi)))
11759 {
11760 gsi_prev (&gsi);
11761 continue;
11762 }
11763 stmt1 = gsi_stmt (gsi);
11764 /* Do not consider statements writing to memory or having
11765 a volatile operand. */
11766 if (gimple_vdef (stmt1)
11767 || gimple_has_volatile_ops (stmt1))
11768 break;
11769 gsi_from = gsi;
11770 gsi_prev (&gsi);
11771 lhs = gimple_get_lhs (stmt1);
11772 if (!lhs)
11773 break;
11774
11775 /* LHS of vectorized stmt must be SSA_NAME. */
11776 if (TREE_CODE (lhs) != SSA_NAME)
11777 break;
11778
11779 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11780 {
11781 /* Remove dead scalar statement. */
11782 if (has_zero_uses (lhs))
11783 {
11784 gsi_remove (&gsi_from, true);
11785 continue;
11786 }
11787 }
11788
11789 /* Check that LHS does not have uses outside of STORE_BB. */
11790 res = true;
11791 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11792 {
11793 gimple *use_stmt;
11794 use_stmt = USE_STMT (use_p);
11795 if (is_gimple_debug (use_stmt))
11796 continue;
11797 if (gimple_bb (use_stmt) != store_bb)
11798 {
11799 res = false;
11800 break;
11801 }
11802 }
11803 if (!res)
11804 break;
11805
11806 if (gimple_vuse (stmt1)
11807 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11808 break;
11809
11810 /* Can move STMT1 to STORE_BB. */
11811 if (dump_enabled_p ())
11812 dump_printf_loc (MSG_NOTE, vect_location,
11813 "Move stmt to created bb\n%G", stmt1);
11814 gsi_move_before (&gsi_from, &gsi_to);
11815 /* Shift GSI_TO for further insertion. */
11816 gsi_prev (&gsi_to);
11817 }
11818 /* Put other masked stores with the same mask to STORE_BB. */
11819 if (worklist.is_empty ()
11820 || gimple_call_arg (worklist.last (), 2) != mask
11821 || worklist.last () != stmt1)
11822 break;
11823 last = worklist.pop ();
11824 }
11825 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11826 }
11827 }
11828
11829 /* Decide whether it is possible to use a zero-based induction variable
11830 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11831 the value that the induction variable must be able to hold in order
11832 to ensure that the rgroups eventually have no active vector elements.
11833 Return -1 otherwise. */
11834
11835 widest_int
11836 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11837 {
11838 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11839 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11840 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11841
11842 /* Calculate the value that the induction variable must be able
11843 to hit in order to ensure that we end the loop with an all-false mask.
11844 This involves adding the maximum number of inactive trailing scalar
11845 iterations. */
11846 widest_int iv_limit = -1;
11847 if (max_loop_iterations (loop, &iv_limit))
11848 {
11849 if (niters_skip)
11850 {
11851 /* Add the maximum number of skipped iterations to the
11852 maximum iteration count. */
11853 if (TREE_CODE (niters_skip) == INTEGER_CST)
11854 iv_limit += wi::to_widest (niters_skip);
11855 else
11856 iv_limit += max_vf - 1;
11857 }
11858 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11859 /* Make a conservatively-correct assumption. */
11860 iv_limit += max_vf - 1;
11861
11862 /* IV_LIMIT is the maximum number of latch iterations, which is also
11863 the maximum in-range IV value. Round this value down to the previous
11864 vector alignment boundary and then add an extra full iteration. */
11865 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11866 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
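/* Worked example (hypothetical values): with a maximum latch count of 1000,
   a constant VF of 16 and MAX_VF = 16 this gives
     (1000 & -16) + 16 = 992 + 16 = 1008
   as the value the IV must be able to hold.  */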
11867 }
11868 return iv_limit;
11869 }
11870
11871 /* For the given rgroup_controls RGC, check whether an induction variable
11872 would ever hit a value that produces a set of all-false masks or zero
11873 lengths before wrapping around. Return true if it's possible to wrap
11874 around before hitting the desirable value, otherwise return false. */
11875
11876 bool
11877 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11878 {
11879 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11880
11881 if (iv_limit == -1)
11882 return true;
11883
11884 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11885 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11886 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11887
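/* Illustrative check (hypothetical values): with IV_LIMIT = 1008 and
   NITEMS = 2 the product 2016 needs 11 bits, so a 32-bit compare type
   cannot wrap and we return false; a compare type whose precision is below
   the required minimum would make us return true.  */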
11888 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11889 return true;
11890
11891 return false;
11892 }