gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 #include "langhooks.h"
59
60 /* Loop Vectorization Pass.
61
62 This pass tries to vectorize loops.
63
64 For example, the vectorizer transforms the following simple loop:
65
66 short a[N]; short b[N]; short c[N]; int i;
67
68 for (i=0; i<N; i++){
69 a[i] = b[i] + c[i];
70 }
71
72 as if it was manually vectorized by rewriting the source code into:
73
74 typedef int __attribute__((mode(V8HI))) v8hi;
75 short a[N]; short b[N]; short c[N]; int i;
76 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 v8hi va, vb, vc;
78
79 for (i=0; i<N/8; i++){
80 vb = pb[i];
81 vc = pc[i];
82 va = vb + vc;
83 pa[i] = va;
84 }
85
86 The main entry to this pass is vectorize_loops(), in which
87 the vectorizer applies a set of analyses on a given set of loops,
88 followed by the actual vectorization transformation for the loops that
89 had successfully passed the analysis phase.
90 Throughout this pass we make a distinction between two types of
91 data: scalars (which are represented by SSA_NAMES), and memory references
92 ("data-refs"). These two types of data require different handling both
93 during analysis and transformation. The types of data-refs that the
    94 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
95 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96 accesses are required to have a simple (consecutive) access pattern.
97
98 Analysis phase:
99 ===============
100 The driver for the analysis phase is vect_analyze_loop().
101 It applies a set of analyses, some of which rely on the scalar evolution
102 analyzer (scev) developed by Sebastian Pop.
103
104 During the analysis phase the vectorizer records some information
105 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106 loop, as well as general information about the loop as a whole, which is
107 recorded in a "loop_vec_info" struct attached to each loop.
108
109 Transformation phase:
110 =====================
111 The loop transformation phase scans all the stmts in the loop, and
112 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113 the loop that needs to be vectorized. It inserts the vector code sequence
114 just before the scalar stmt S, and records a pointer to the vector code
115 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116 attached to S). This pointer will be used for the vectorization of following
117 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118 otherwise, we rely on dead code elimination for removing it.
119
120 For example, say stmt S1 was vectorized into stmt VS1:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 S2: a = b;
125
126 To vectorize stmt S2, the vectorizer first finds the stmt that defines
127 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
129 resulting sequence would be:
130
131 VS1: vb = px[i];
132 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 VS2: va = vb;
134 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135
   136         Operands that are not SSA_NAMEs are data-refs that appear in
137 load/store operations (like 'x[i]' in S1), and are handled differently.
138
139 Target modeling:
140 =================
141 Currently the only target specific information that is used is the
142 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   143    Targets that can support different sizes of vectors will, for now, need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
145 flexibility will be added in the future.
146
   147         Since we only vectorize operations whose vector form can be
148 expressed using existing tree codes, to verify that an operation is
149 supported, the vectorizer checks the relevant optab at the relevant
150 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
151 the value found is CODE_FOR_nothing, then there's no target support, and
152 we can't vectorize the stmt.
153
154 For additional information on this project see:
155 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 */
157
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
159 unsigned *);
160 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
161 bool *, bool *, bool);
162
163 /* Subroutine of vect_determine_vf_for_stmt that handles only one
164 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
165 may already be set for general statements (not just data refs). */
166
167 static opt_result
168 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
169 bool vectype_maybe_set_p,
170 poly_uint64 *vf)
171 {
172 gimple *stmt = stmt_info->stmt;
173
174 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
175 && !STMT_VINFO_LIVE_P (stmt_info))
176 || gimple_clobber_p (stmt))
177 {
178 if (dump_enabled_p ())
179 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
180 return opt_result::success ();
181 }
182
183 tree stmt_vectype, nunits_vectype;
184 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
185 &stmt_vectype,
186 &nunits_vectype);
187 if (!res)
188 return res;
189
190 if (stmt_vectype)
191 {
192 if (STMT_VINFO_VECTYPE (stmt_info))
193 /* The only case when a vectype had been already set is for stmts
194 that contain a data ref, or for "pattern-stmts" (stmts generated
195 by the vectorizer to represent/replace a certain idiom). */
196 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
197 || vectype_maybe_set_p)
198 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
199 else
200 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
201 }
202
203 if (nunits_vectype)
204 vect_update_max_nunits (vf, nunits_vectype);
205
206 return opt_result::success ();
207 }
208
209 /* Subroutine of vect_determine_vectorization_factor. Set the vector
210 types of STMT_INFO and all attached pattern statements and update
211 the vectorization factor VF accordingly. Return true on success
212 or false if something prevented vectorization. */
213
214 static opt_result
215 vect_determine_vf_for_stmt (vec_info *vinfo,
216 stmt_vec_info stmt_info, poly_uint64 *vf)
217 {
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
222 if (!res)
223 return res;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: %G",
239 def_stmt_info->stmt);
240 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
241 if (!res)
242 return res;
243 }
244
245 if (dump_enabled_p ())
246 dump_printf_loc (MSG_NOTE, vect_location,
247 "==> examining pattern statement: %G",
248 stmt_info->stmt);
249 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
250 if (!res)
251 return res;
252 }
253
254 return opt_result::success ();
255 }
256
257 /* Function vect_determine_vectorization_factor
258
259 Determine the vectorization factor (VF). VF is the number of data elements
260 that are operated upon in parallel in a single iteration of the vectorized
261 loop. For example, when vectorizing a loop that operates on 4byte elements,
262 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
263 elements can fit in a single vector register.
264
265 We currently support vectorization of loops in which all types operated upon
266 are of the same size. Therefore this function currently sets VF according to
267 the size of the types operated upon, and fails if there are multiple sizes
268 in the loop.
269
270 VF is also the factor by which the loop iterations are strip-mined, e.g.:
271 original loop:
272 for (i=0; i<N; i++){
273 a[i] = b[i] + c[i];
274 }
275
276 vectorized loop:
277 for (i=0; i<N; i+=VF){
278 a[i:VF] = b[i:VF] + c[i:VF];
279 }
280 */
281
282 static opt_result
283 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 {
285 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
286 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
287 unsigned nbbs = loop->num_nodes;
288 poly_uint64 vectorization_factor = 1;
289 tree scalar_type = NULL_TREE;
290 gphi *phi;
291 tree vectype;
292 stmt_vec_info stmt_info;
293 unsigned i;
294
295 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296
297 for (i = 0; i < nbbs; i++)
298 {
299 basic_block bb = bbs[i];
300
301 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
302 gsi_next (&si))
303 {
304 phi = si.phi ();
305 stmt_info = loop_vinfo->lookup_stmt (phi);
306 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
308 (gimple *) phi);
309
310 gcc_assert (stmt_info);
311
312 if (STMT_VINFO_RELEVANT_P (stmt_info)
313 || STMT_VINFO_LIVE_P (stmt_info))
314 {
315 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
316 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317
318 if (dump_enabled_p ())
319 dump_printf_loc (MSG_NOTE, vect_location,
320 "get vectype for scalar type: %T\n",
321 scalar_type);
322
323 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
324 if (!vectype)
325 return opt_result::failure_at (phi,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330
331 if (dump_enabled_p ())
332 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
333 vectype);
334
335 if (dump_enabled_p ())
336 {
337 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
338 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
339 dump_printf (MSG_NOTE, "\n");
340 }
341
342 vect_update_max_nunits (&vectorization_factor, vectype);
343 }
344 }
345
346 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
347 gsi_next (&si))
348 {
349 if (is_gimple_debug (gsi_stmt (si)))
350 continue;
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 opt_result res
353 = vect_determine_vf_for_stmt (loop_vinfo,
354 stmt_info, &vectorization_factor);
355 if (!res)
356 return res;
357 }
358 }
359
360 /* TODO: Analyze cost. Decide if worth while to vectorize. */
361 if (dump_enabled_p ())
362 {
363 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
364 dump_dec (MSG_NOTE, vectorization_factor);
365 dump_printf (MSG_NOTE, "\n");
366 }
367
368 if (known_le (vectorization_factor, 1U))
369 return opt_result::failure_at (vect_location,
370 "not vectorized: unsupported data-type\n");
371 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
372 return opt_result::success ();
373 }
374
375
376 /* Function vect_is_simple_iv_evolution.
377
   378    FORNOW: A simple evolution of an induction variable in the loop is
379 considered a polynomial evolution. */
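/* For example (illustrative only), for a counter such as

     for (i = 0; i < n; i += 4)

   starting at zero, the scalar evolution of i is the chrec {0, +, 4},
   so *INIT is 0 and *STEP is 4.  */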
380
381 static bool
382 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
383 tree * step)
384 {
385 tree init_expr;
386 tree step_expr;
387 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
388 basic_block bb;
389
390 /* When there is no evolution in this loop, the evolution function
391 is not "simple". */
392 if (evolution_part == NULL_TREE)
393 return false;
394
395 /* When the evolution is a polynomial of degree >= 2
396 the evolution function is not "simple". */
397 if (tree_is_chrec (evolution_part))
398 return false;
399
400 step_expr = evolution_part;
401 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
405 step_expr, init_expr);
406
407 *init = init_expr;
408 *step = step_expr;
409
410 if (TREE_CODE (step_expr) != INTEGER_CST
411 && (TREE_CODE (step_expr) != SSA_NAME
412 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
413 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
414 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
415 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
416 || !flag_associative_math)))
417 && (TREE_CODE (step_expr) != REAL_CST
418 || !flag_associative_math))
419 {
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "step unknown.\n");
423 return false;
424 }
425
426 return true;
427 }
428
429 /* Function vect_is_nonlinear_iv_evolution
430
431 Only support nonlinear induction for integer type
432 1. neg
433 2. mul by constant
434 3. lshift/rshift by constant.
435
436 For neg induction, return a fake step as integer -1. */
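/* As an illustration (not an exhaustive list), in a loop like

     for (i = 0; i < n; i++)
       {
	 a[i] = x;
	 x = x * 3;
       }

   x is a nonlinear induction: *INIT is the value of x on loop entry,
   *STEP is 3, and the evolution type is vect_step_op_mul.  Likewise
   x = -x would give vect_step_op_neg with the fake step -1, and
   x = x << 2 would give vect_step_op_shl with step 2.  */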
437 static bool
438 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
439 gphi* loop_phi_node, tree *init, tree *step)
440 {
441 tree init_expr, ev_expr, result, op1, op2;
442 gimple* def;
443
444 if (gimple_phi_num_args (loop_phi_node) != 2)
445 return false;
446
447 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
448 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
449
450 /* Support nonlinear induction only for integer type. */
451 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
452 return false;
453
454 *init = init_expr;
455 result = PHI_RESULT (loop_phi_node);
456
457 if (TREE_CODE (ev_expr) != SSA_NAME
458 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
459 || !is_gimple_assign (def))
460 return false;
461
462 enum tree_code t_code = gimple_assign_rhs_code (def);
463 switch (t_code)
464 {
465 case NEGATE_EXPR:
466 if (gimple_assign_rhs1 (def) != result)
467 return false;
468 *step = build_int_cst (TREE_TYPE (init_expr), -1);
469 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
470 break;
471
472 case RSHIFT_EXPR:
473 case LSHIFT_EXPR:
474 case MULT_EXPR:
475 op1 = gimple_assign_rhs1 (def);
476 op2 = gimple_assign_rhs2 (def);
477 if (TREE_CODE (op2) != INTEGER_CST
478 || op1 != result)
479 return false;
480 *step = op2;
481 if (t_code == LSHIFT_EXPR)
482 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
483 else if (t_code == RSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
485 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
486 else
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
488 break;
489
490 default:
491 return false;
492 }
493
494 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
495 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
496
497 return true;
498 }
499
500 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
501 what we are assuming is a double reduction. For example, given
502 a structure like this:
503
504 outer1:
505 x_1 = PHI <x_4(outer2), ...>;
506 ...
507
508 inner:
509 x_2 = PHI <x_1(outer1), ...>;
510 ...
511 x_3 = ...;
512 ...
513
514 outer2:
515 x_4 = PHI <x_3(inner)>;
516 ...
517
518 outer loop analysis would treat x_1 as a double reduction phi and
519 this function would then return true for x_2. */
520
521 static bool
522 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
523 {
524 use_operand_p use_p;
525 ssa_op_iter op_iter;
526 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
527 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
528 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
529 return true;
530 return false;
531 }
532
533 /* Returns true if Phi is a first-order recurrence. A first-order
534 recurrence is a non-reduction recurrence relation in which the value of
535 the recurrence in the current loop iteration equals a value defined in
536 the previous iteration. */
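/* For example (illustrative only), in

     for (i = 0; i < n; i++)
       {
	 b[i] = a[i] - t;
	 t = a[i];
       }

   the loop-header PHI for t is a first-order recurrence: each iteration
   consumes the value of a[i] loaded in the previous iteration.  */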
537
538 static bool
539 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
540 gphi *phi)
541 {
542 /* A nested cycle isn't vectorizable as first order recurrence. */
543 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
544 return false;
545
546 /* Ensure the loop latch definition is from within the loop. */
547 edge latch = loop_latch_edge (loop);
548 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
549 if (TREE_CODE (ldef) != SSA_NAME
550 || SSA_NAME_IS_DEFAULT_DEF (ldef)
551 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
552 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
553 return false;
554
555 tree def = gimple_phi_result (phi);
556
557 /* Ensure every use_stmt of the phi node is dominated by the latch
558 definition. */
559 imm_use_iterator imm_iter;
560 use_operand_p use_p;
561 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
562 if (!is_gimple_debug (USE_STMT (use_p))
563 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
564 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
565 USE_STMT (use_p))))
566 return false;
567
568 /* First-order recurrence autovectorization needs shuffle vector. */
569 tree scalar_type = TREE_TYPE (def);
570 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
571 if (!vectype)
572 return false;
573
574 return true;
575 }
576
577 /* Function vect_analyze_scalar_cycles_1.
578
579 Examine the cross iteration def-use cycles of scalar variables
580 in LOOP. LOOP_VINFO represents the loop that is now being
581 considered for vectorization (can be LOOP, or an outer-loop
   582    considered for vectorization (can be LOOP, or an outer-loop
   583    enclosing LOOP).  SLP indicates whether there will be subsequent
          SLP analyses.  */
584
585 static void
586 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
587 bool slp)
588 {
589 basic_block bb = loop->header;
590 tree init, step;
591 auto_vec<stmt_vec_info, 64> worklist;
592 gphi_iterator gsi;
593 bool double_reduc, reduc_chain;
594
595 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
596
597 /* First - identify all inductions. Reduction detection assumes that all the
598 inductions have been identified, therefore, this order must not be
599 changed. */
600 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
601 {
602 gphi *phi = gsi.phi ();
603 tree access_fn = NULL;
604 tree def = PHI_RESULT (phi);
605 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
606
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
609 (gimple *) phi);
610
611 /* Skip virtual phi's. The data dependences that are associated with
612 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
613 if (virtual_operand_p (def))
614 continue;
615
616 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
617
618 /* Analyze the evolution function. */
619 access_fn = analyze_scalar_evolution (loop, def);
620 if (access_fn)
621 {
622 STRIP_NOPS (access_fn);
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_NOTE, vect_location,
625 "Access function of PHI: %T\n", access_fn);
626 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
627 = initial_condition_in_loop_num (access_fn, loop->num);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
629 = evolution_part_in_loop_num (access_fn, loop->num);
630 }
631
632 if ((!access_fn
633 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
634 || !vect_is_simple_iv_evolution (loop->num, access_fn,
635 &init, &step)
636 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
637 && TREE_CODE (step) != INTEGER_CST))
638 /* Only handle nonlinear iv for same loop. */
639 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
640 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
641 phi, &init, &step)))
642 {
643 worklist.safe_push (stmt_vinfo);
644 continue;
645 }
646
647 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
648 != NULL_TREE);
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
650
651 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
653 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
654 }
655
656
657 /* Second - identify all reductions and nested cycles. */
658 while (worklist.length () > 0)
659 {
660 stmt_vec_info stmt_vinfo = worklist.pop ();
661 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
662 tree def = PHI_RESULT (phi);
663
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
666 (gimple *) phi);
667
668 gcc_assert (!virtual_operand_p (def)
669 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
670
671 stmt_vec_info reduc_stmt_info
672 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
673 &reduc_chain, slp);
674 if (reduc_stmt_info)
675 {
676 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
677 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
678 if (double_reduc)
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "Detected double reduction.\n");
683
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
685 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
686 }
687 else
688 {
689 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
690 {
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE, vect_location,
693 "Detected vectorizable nested cycle.\n");
694
695 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
696 }
697 else
698 {
699 if (dump_enabled_p ())
700 dump_printf_loc (MSG_NOTE, vect_location,
701 "Detected reduction.\n");
702
703 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
704 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
705 /* Store the reduction cycles for possible vectorization in
706 loop-aware SLP if it was not detected as reduction
707 chain. */
708 if (! reduc_chain)
709 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
710 (reduc_stmt_info);
711 }
712 }
713 }
714 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
715 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
716 else
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "Unknown def-use cycle pattern.\n");
720 }
721 }
722
723
724 /* Function vect_analyze_scalar_cycles.
725
726 Examine the cross iteration def-use cycles of scalar variables, by
727 analyzing the loop-header PHIs of scalar variables. Classify each
728 cycle as one of the following: invariant, induction, reduction, unknown.
   729    We do that for the loop represented by LOOP_VINFO, and also for its
   730    inner-loop, if it exists.
731 Examples for scalar cycles:
732
733 Example1: reduction:
734
735 loop1:
736 for (i=0; i<N; i++)
737 sum += a[i];
738
739 Example2: induction:
740
741 loop2:
742 for (i=0; i<N; i++)
743 a[i] = i; */
744
745 static void
746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
747 {
748 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
749
750 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
751
752 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
753 Reductions in such inner-loop therefore have different properties than
754 the reductions in the nest that gets vectorized:
755 1. When vectorized, they are executed in the same order as in the original
756 scalar loop, so we can't change the order of computation when
757 vectorizing them.
758 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
759 current checks are too strict. */
760
761 if (loop->inner)
762 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
763 }
764
765 /* Transfer group and reduction information from STMT_INFO to its
766 pattern stmt. */
767
768 static void
769 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
770 {
771 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
772 stmt_vec_info stmtp;
773 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
774 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
775 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
776 do
777 {
778 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
779 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
780 == STMT_VINFO_DEF_TYPE (stmt_info));
781 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
782 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
783 if (stmt_info)
784 REDUC_GROUP_NEXT_ELEMENT (stmtp)
785 = STMT_VINFO_RELATED_STMT (stmt_info);
786 }
787 while (stmt_info);
788 }
789
790 /* Fixup scalar cycles that now have their stmts detected as patterns. */
791
792 static void
793 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
794 {
795 stmt_vec_info first;
796 unsigned i;
797
798 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
799 {
800 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
801 while (next)
802 {
803 if ((STMT_VINFO_IN_PATTERN_P (next)
804 != STMT_VINFO_IN_PATTERN_P (first))
805 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
806 break;
807 next = REDUC_GROUP_NEXT_ELEMENT (next);
808 }
809 /* If all reduction chain members are well-formed patterns adjust
810 the group to group the pattern stmts instead. */
811 if (! next
812 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
813 {
814 if (STMT_VINFO_IN_PATTERN_P (first))
815 {
816 vect_fixup_reduc_chain (first);
817 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
818 = STMT_VINFO_RELATED_STMT (first);
819 }
820 }
   821       /* If not all stmts in the chain are patterns or if we failed
   822          to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
   823          it as a regular reduction instead.  */
824 else
825 {
826 stmt_vec_info vinfo = first;
827 stmt_vec_info last = NULL;
828 while (vinfo)
829 {
830 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
831 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
832 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
833 last = vinfo;
834 vinfo = next;
835 }
836 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
837 = vect_internal_def;
838 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
839 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
840 --i;
841 }
842 }
843 }
844
845 /* Function vect_get_loop_niters.
846
   847    Determine how many iterations the loop executes and place it
848 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
849 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
850 niter information holds in ASSUMPTIONS.
851
852 Return the loop exit condition. */
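/* For example (illustrative only), for

     for (i = 0; i < n; i++)
       ...

   with n > 0, NUMBER_OF_ITERATIONS is n (the number of header
   executions) and NUMBER_OF_ITERATIONSM1 is n - 1 (the number of
   latch executions).  */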
853
854
855 static gcond *
856 vect_get_loop_niters (class loop *loop, tree *assumptions,
857 tree *number_of_iterations, tree *number_of_iterationsm1)
858 {
859 edge exit = single_exit (loop);
860 class tree_niter_desc niter_desc;
861 tree niter_assumptions, niter, may_be_zero;
862 gcond *cond = get_loop_exit_condition (loop);
863
864 *assumptions = boolean_true_node;
865 *number_of_iterationsm1 = chrec_dont_know;
866 *number_of_iterations = chrec_dont_know;
867 DUMP_VECT_SCOPE ("get_loop_niters");
868
869 if (!exit)
870 return cond;
871
872 may_be_zero = NULL_TREE;
873 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
874 || chrec_contains_undetermined (niter_desc.niter))
875 return cond;
876
877 niter_assumptions = niter_desc.assumptions;
878 may_be_zero = niter_desc.may_be_zero;
879 niter = niter_desc.niter;
880
881 if (may_be_zero && integer_zerop (may_be_zero))
882 may_be_zero = NULL_TREE;
883
884 if (may_be_zero)
885 {
886 if (COMPARISON_CLASS_P (may_be_zero))
887 {
   888           /* Try to combine may_be_zero with assumptions; this can simplify
889 computation of niter expression. */
890 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
891 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
892 niter_assumptions,
893 fold_build1 (TRUTH_NOT_EXPR,
894 boolean_type_node,
895 may_be_zero));
896 else
897 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
898 build_int_cst (TREE_TYPE (niter), 0),
899 rewrite_to_non_trapping_overflow (niter));
900
901 may_be_zero = NULL_TREE;
902 }
903 else if (integer_nonzerop (may_be_zero))
904 {
905 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
906 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
907 return cond;
908 }
909 else
910 return cond;
911 }
912
913 *assumptions = niter_assumptions;
914 *number_of_iterationsm1 = niter;
915
916 /* We want the number of loop header executions which is the number
917 of latch executions plus one.
918 ??? For UINT_MAX latch executions this number overflows to zero
919 for loops like do { n++; } while (n != 0); */
920 if (niter && !chrec_contains_undetermined (niter))
921 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
922 build_int_cst (TREE_TYPE (niter), 1));
923 *number_of_iterations = niter;
924
925 return cond;
926 }
927
928 /* Function bb_in_loop_p
929
930 Used as predicate for dfs order traversal of the loop bbs. */
931
932 static bool
933 bb_in_loop_p (const_basic_block bb, const void *data)
934 {
935 const class loop *const loop = (const class loop *)data;
936 if (flow_bb_inside_loop_p (loop, bb))
937 return true;
938 return false;
939 }
940
941
942 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
943 stmt_vec_info structs for all the stmts in LOOP_IN. */
944
945 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
946 : vec_info (vec_info::loop, shared),
947 loop (loop_in),
948 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
949 num_itersm1 (NULL_TREE),
950 num_iters (NULL_TREE),
951 num_iters_unchanged (NULL_TREE),
952 num_iters_assumptions (NULL_TREE),
953 vector_costs (nullptr),
954 scalar_costs (nullptr),
955 th (0),
956 versioning_threshold (0),
957 vectorization_factor (0),
958 main_loop_edge (nullptr),
959 skip_main_loop_edge (nullptr),
960 skip_this_loop_edge (nullptr),
961 reusable_accumulators (),
962 suggested_unroll_factor (1),
963 max_vectorization_factor (0),
964 mask_skip_niters (NULL_TREE),
965 rgroup_compare_type (NULL_TREE),
966 simd_if_cond (NULL_TREE),
967 partial_vector_style (vect_partial_vectors_none),
968 unaligned_dr (NULL),
969 peeling_for_alignment (0),
970 ptr_mask (0),
971 ivexpr_map (NULL),
972 scan_map (NULL),
973 slp_unrolling_factor (1),
974 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
975 vectorizable (false),
976 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
977 using_partial_vectors_p (false),
978 using_decrementing_iv_p (false),
979 using_select_vl_p (false),
980 epil_using_partial_vectors_p (false),
981 partial_load_store_bias (0),
982 peeling_for_gaps (false),
983 peeling_for_niter (false),
984 no_data_dependencies (false),
985 has_mask_store (false),
986 scalar_loop_scaling (profile_probability::uninitialized ()),
987 scalar_loop (NULL),
988 orig_loop_info (NULL)
989 {
990 /* CHECKME: We want to visit all BBs before their successors (except for
991 latch blocks, for which this assertion wouldn't hold). In the simple
   992      case of the loop forms we allow, a dfs order of the BBs would be the same
993 as reversed postorder traversal, so we are safe. */
994
995 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
996 bbs, loop->num_nodes, loop);
997 gcc_assert (nbbs == loop->num_nodes);
998
999 for (unsigned int i = 0; i < nbbs; i++)
1000 {
1001 basic_block bb = bbs[i];
1002 gimple_stmt_iterator si;
1003
1004 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1005 {
1006 gimple *phi = gsi_stmt (si);
1007 gimple_set_uid (phi, 0);
1008 add_stmt (phi);
1009 }
1010
1011 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1012 {
1013 gimple *stmt = gsi_stmt (si);
1014 gimple_set_uid (stmt, 0);
1015 if (is_gimple_debug (stmt))
1016 continue;
1017 add_stmt (stmt);
  1018           /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
  1019              third argument is the #pragma omp simd if (x) condition: when 0, the
  1020              loop shouldn't be vectorized; when a non-zero constant, it should
  1021              be vectorized normally; otherwise the loop is versioned, with the
  1022              vectorized copy taken if the condition is non-zero at runtime.  */
1023 if (loop_in->simduid
1024 && is_gimple_call (stmt)
1025 && gimple_call_internal_p (stmt)
1026 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1027 && gimple_call_num_args (stmt) >= 3
1028 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1029 && (loop_in->simduid
1030 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1031 {
1032 tree arg = gimple_call_arg (stmt, 2);
1033 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1034 simd_if_cond = arg;
1035 else
1036 gcc_assert (integer_nonzerop (arg));
1037 }
1038 }
1039 }
1040
1041 epilogue_vinfos.create (6);
1042 }
1043
1044 /* Free all levels of rgroup CONTROLS. */
1045
1046 void
1047 release_vec_loop_controls (vec<rgroup_controls> *controls)
1048 {
1049 rgroup_controls *rgc;
1050 unsigned int i;
1051 FOR_EACH_VEC_ELT (*controls, i, rgc)
1052 rgc->controls.release ();
1053 controls->release ();
1054 }
1055
1056 /* Free all memory used by the _loop_vec_info, as well as all the
1057 stmt_vec_info structs of all the stmts in the loop. */
1058
1059 _loop_vec_info::~_loop_vec_info ()
1060 {
1061 free (bbs);
1062
1063 release_vec_loop_controls (&masks.rgc_vec);
1064 release_vec_loop_controls (&lens);
1065 delete ivexpr_map;
1066 delete scan_map;
1067 epilogue_vinfos.release ();
1068 delete scalar_costs;
1069 delete vector_costs;
1070
  1071   /* When we release an epilogue vinfo that we do not intend to use,
  1072      avoid clearing AUX of the main loop, which should continue to
1073 point to the main loop vinfo since otherwise we'll leak that. */
1074 if (loop->aux == this)
1075 loop->aux = NULL;
1076 }
1077
1078 /* Return an invariant or register for EXPR and emit necessary
1079 computations in the LOOP_VINFO loop preheader. */
1080
1081 tree
1082 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1083 {
1084 if (is_gimple_reg (expr)
1085 || is_gimple_min_invariant (expr))
1086 return expr;
1087
1088 if (! loop_vinfo->ivexpr_map)
1089 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1090 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1091 if (! cached)
1092 {
1093 gimple_seq stmts = NULL;
1094 cached = force_gimple_operand (unshare_expr (expr),
1095 &stmts, true, NULL_TREE);
1096 if (stmts)
1097 {
1098 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1099 gsi_insert_seq_on_edge_immediate (e, stmts);
1100 }
1101 }
1102 return cached;
1103 }
1104
1105 /* Return true if we can use CMP_TYPE as the comparison type to produce
1106 all masks required to mask LOOP_VINFO. */
1107
1108 static bool
1109 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1110 {
1111 rgroup_controls *rgm;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1114 if (rgm->type != NULL_TREE
1115 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1116 cmp_type, rgm->type,
1117 OPTIMIZE_FOR_SPEED))
1118 return false;
1119 return true;
1120 }
1121
1122 /* Calculate the maximum number of scalars per iteration for every
1123 rgroup in LOOP_VINFO. */
1124
1125 static unsigned int
1126 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1127 {
1128 unsigned int res = 1;
1129 unsigned int i;
1130 rgroup_controls *rgm;
1131 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1132 res = MAX (res, rgm->max_nscalars_per_iter);
1133 return res;
1134 }
1135
1136 /* Calculate the minimum precision necessary to represent:
1137
1138 MAX_NITERS * FACTOR
1139
1140 as an unsigned integer, where MAX_NITERS is the maximum number of
1141 loop header iterations for the original scalar form of LOOP_VINFO. */
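/* For example (illustrative numbers), with MAX_NITERS == 1000 and
   FACTOR == 4 the product is 4000, which needs 12 bits as an unsigned
   value, since 2^11 == 2048 < 4000 <= 4095 == 2^12 - 1.  */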
1142
1143 static unsigned
1144 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1145 {
1146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147
1148 /* Get the maximum number of iterations that is representable
1149 in the counter type. */
1150 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1151 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1152
1153 /* Get a more refined estimate for the number of iterations. */
1154 widest_int max_back_edges;
1155 if (max_loop_iterations (loop, &max_back_edges))
1156 max_ni = wi::smin (max_ni, max_back_edges + 1);
1157
1158 /* Work out how many bits we need to represent the limit. */
1159 return wi::min_precision (max_ni * factor, UNSIGNED);
1160 }
1161
1162 /* True if the loop needs peeling or partial vectors when vectorized. */
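/* For example (illustrative numbers), with a known niter count of 100
   and a vectorization factor of 8, 100 % 8 == 4 scalar iterations are
   left over, so peeling or partial vectors are needed; with 96
   iterations and no peeling for alignment or gaps they would not be.  */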
1163
1164 static bool
1165 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1166 {
1167 unsigned HOST_WIDE_INT const_vf;
1168 HOST_WIDE_INT max_niter
1169 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1170
1171 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1172 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1173 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1174 (loop_vinfo));
1175
1176 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1177 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1178 {
1179 /* Work out the (constant) number of iterations that need to be
1180 peeled for reasons other than niters. */
1181 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1182 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1183 peel_niter += 1;
1184 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1185 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1186 return true;
1187 }
1188 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1189 /* ??? When peeling for gaps but not alignment, we could
1190 try to check whether the (variable) niters is known to be
1191 VF * N + 1. That's something of a niche case though. */
1192 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1193 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1194 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1195 < (unsigned) exact_log2 (const_vf))
1196 /* In case of versioning, check if the maximum number of
1197 iterations is greater than th. If they are identical,
1198 the epilogue is unnecessary. */
1199 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1200 || ((unsigned HOST_WIDE_INT) max_niter
1201 > (th / const_vf) * const_vf))))
1202 return true;
1203
1204 return false;
1205 }
1206
1207 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1208 whether we can actually generate the masks required. Return true if so,
1209 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1210
1211 static bool
1212 vect_verify_full_masking (loop_vec_info loop_vinfo)
1213 {
1214 unsigned int min_ni_width;
1215
1216 /* Use a normal loop if there are no statements that need masking.
1217 This only happens in rare degenerate cases: it means that the loop
1218 has no loads, no stores, and no live-out values. */
1219 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1220 return false;
1221
1222 /* Produce the rgroup controls. */
1223 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1224 {
1225 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1226 tree vectype = mask.first;
1227 unsigned nvectors = mask.second;
1228
1229 if (masks->rgc_vec.length () < nvectors)
1230 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1231 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1232 /* The number of scalars per iteration and the number of vectors are
1233 both compile-time constants. */
1234 unsigned int nscalars_per_iter
1235 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1236 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1237
1238 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1239 {
1240 rgm->max_nscalars_per_iter = nscalars_per_iter;
1241 rgm->type = truth_type_for (vectype);
1242 rgm->factor = 1;
1243 }
1244 }
1245
1246 unsigned int max_nscalars_per_iter
1247 = vect_get_max_nscalars_per_iter (loop_vinfo);
1248
1249 /* Work out how many bits we need to represent the limit. */
1250 min_ni_width
1251 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1252
1253 /* Find a scalar mode for which WHILE_ULT is supported. */
1254 opt_scalar_int_mode cmp_mode_iter;
1255 tree cmp_type = NULL_TREE;
1256 tree iv_type = NULL_TREE;
1257 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1258 unsigned int iv_precision = UINT_MAX;
1259
1260 if (iv_limit != -1)
1261 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1262 UNSIGNED);
1263
1264 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1265 {
1266 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1267 if (cmp_bits >= min_ni_width
1268 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1269 {
1270 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1271 if (this_type
1272 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1273 {
1274 /* Although we could stop as soon as we find a valid mode,
1275 there are at least two reasons why that's not always the
1276 best choice:
1277
1278 - An IV that's Pmode or wider is more likely to be reusable
1279 in address calculations than an IV that's narrower than
1280 Pmode.
1281
1282 - Doing the comparison in IV_PRECISION or wider allows
1283 a natural 0-based IV, whereas using a narrower comparison
1284 type requires mitigations against wrap-around.
1285
1286 Conversely, if the IV limit is variable, doing the comparison
1287 in a wider type than the original type can introduce
1288 unnecessary extensions, so picking the widest valid mode
1289 is not always a good choice either.
1290
1291 Here we prefer the first IV type that's Pmode or wider,
1292 and the first comparison type that's IV_PRECISION or wider.
1293 (The comparison type must be no wider than the IV type,
1294 to avoid extensions in the vector loop.)
1295
1296 ??? We might want to try continuing beyond Pmode for ILP32
1297 targets if CMP_BITS < IV_PRECISION. */
1298 iv_type = this_type;
1299 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1300 cmp_type = this_type;
1301 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1302 break;
1303 }
1304 }
1305 }
1306
1307 if (!cmp_type)
1308 {
1309 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1310 return false;
1311 }
1312
1313 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1314 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1315 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1316 return true;
1317 }
1318
1319 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1320 whether we can actually generate AVX512 style masks. Return true if so,
1321 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1322
1323 static bool
1324 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1325 {
  1326   /* Produce a differently organized rgc_vec and check in a different
  1327      way whether we can produce the masks.  */
1328
1329 /* Use a normal loop if there are no statements that need masking.
1330 This only happens in rare degenerate cases: it means that the loop
1331 has no loads, no stores, and no live-out values. */
1332 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1333 return false;
1334
1335 /* For the decrementing IV we need to represent all values in
1336 [0, niter + niter_skip] where niter_skip is the elements we
1337 skip in the first iteration for prologue peeling. */
1338 tree iv_type = NULL_TREE;
1339 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1340 unsigned int iv_precision = UINT_MAX;
1341 if (iv_limit != -1)
1342 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1343
1344 /* First compute the type for the IV we use to track the remaining
1345 scalar iterations. */
1346 opt_scalar_int_mode cmp_mode_iter;
1347 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1348 {
1349 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1350 if (cmp_bits >= iv_precision
1351 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1352 {
1353 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1354 if (iv_type)
1355 break;
1356 }
1357 }
1358 if (!iv_type)
1359 return false;
1360
1361 /* Produce the rgroup controls. */
1362 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1363 {
1364 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1365 tree vectype = mask.first;
1366 unsigned nvectors = mask.second;
1367
1368 /* The number of scalars per iteration and the number of vectors are
1369 both compile-time constants. */
1370 unsigned int nscalars_per_iter
1371 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1372 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1373
1374 /* We index the rgroup_controls vector with nscalars_per_iter
1375 which we keep constant and instead have a varying nvectors,
1376 remembering the vector mask with the fewest nV. */
1377 if (masks->rgc_vec.length () < nscalars_per_iter)
1378 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1379 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1380
1381 if (!rgm->type || rgm->factor > nvectors)
1382 {
1383 rgm->type = truth_type_for (vectype);
1384 rgm->compare_type = NULL_TREE;
1385 rgm->max_nscalars_per_iter = nscalars_per_iter;
1386 rgm->factor = nvectors;
1387 rgm->bias_adjusted_ctrl = NULL_TREE;
1388 }
1389 }
1390
1391 /* There is no fixed compare type we are going to use but we have to
1392 be able to get at one for each mask group. */
1393 unsigned int min_ni_width
1394 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1395
1396 bool ok = true;
1397 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1398 {
1399 tree mask_type = rgc.type;
1400 if (!mask_type)
1401 continue;
1402
1403 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1404 {
1405 ok = false;
1406 break;
1407 }
1408
1409 /* If iv_type is usable as compare type use that - we can elide the
1410 saturation in that case. */
1411 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1412 {
1413 tree cmp_vectype
1414 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1415 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1416 rgc.compare_type = cmp_vectype;
1417 }
1418 if (!rgc.compare_type)
1419 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1420 {
1421 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1422 if (cmp_bits >= min_ni_width
1423 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1424 {
1425 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1426 if (!cmp_type)
1427 continue;
1428
1429 /* Check whether we can produce the mask with cmp_type. */
1430 tree cmp_vectype
1431 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1432 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1433 {
1434 rgc.compare_type = cmp_vectype;
1435 break;
1436 }
1437 }
1438 }
1439 if (!rgc.compare_type)
1440 {
1441 ok = false;
1442 break;
1443 }
1444 }
1445 if (!ok)
1446 {
1447 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1448 return false;
1449 }
1450
1451 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1452 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1453 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1454 return true;
1455 }
1456
  1457 /* Check whether we can use vector access with length based on precision
1458 comparison. So far, to keep it simple, we only allow the case that the
1459 precision of the target supported length is larger than the precision
1460 required by loop niters. */
1461
1462 static bool
1463 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1464 {
1465 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1466 return false;
1467
1468 machine_mode len_load_mode = get_len_load_store_mode
1469 (loop_vinfo->vector_mode, true).require ();
1470 machine_mode len_store_mode = get_len_load_store_mode
1471 (loop_vinfo->vector_mode, false).require ();
1472
1473 signed char partial_load_bias = internal_len_load_store_bias
1474 (IFN_LEN_LOAD, len_load_mode);
1475
1476 signed char partial_store_bias = internal_len_load_store_bias
1477 (IFN_LEN_STORE, len_store_mode);
1478
1479 gcc_assert (partial_load_bias == partial_store_bias);
1480
1481 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1482 return false;
1483
1484 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1485 len_loads with a length of zero. In order to avoid that we prohibit
1486 more than one loop length here. */
1487 if (partial_load_bias == -1
1488 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1489 return false;
1490
1491 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1492
1493 unsigned int max_nitems_per_iter = 1;
1494 unsigned int i;
1495 rgroup_controls *rgl;
1496 /* Find the maximum number of items per iteration for every rgroup. */
1497 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1498 {
1499 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1500 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1501 }
1502
1503 /* Work out how many bits we need to represent the length limit. */
1504 unsigned int min_ni_prec
1505 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1506
1507 /* Now use the maximum of below precisions for one suitable IV type:
1508 - the IV's natural precision
1509 - the precision needed to hold: the maximum number of scalar
1510 iterations multiplied by the scale factor (min_ni_prec above)
1511 - the Pmode precision
1512
1513 If min_ni_prec is less than the precision of the current niters,
  1514      we prefer to still use the niters type.  Prefer to use Pmode and
1515 wider IV to avoid narrow conversions. */
1516
1517 unsigned int ni_prec
1518 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1519 min_ni_prec = MAX (min_ni_prec, ni_prec);
1520 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1521
1522 tree iv_type = NULL_TREE;
1523 opt_scalar_int_mode tmode_iter;
1524 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1525 {
1526 scalar_mode tmode = tmode_iter.require ();
1527 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1528
1529 /* ??? Do we really want to construct one IV whose precision exceeds
1530 BITS_PER_WORD? */
1531 if (tbits > BITS_PER_WORD)
1532 break;
1533
1534 /* Find the first available standard integral type. */
1535 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1536 {
1537 iv_type = build_nonstandard_integer_type (tbits, true);
1538 break;
1539 }
1540 }
1541
1542 if (!iv_type)
1543 {
1544 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546 "can't vectorize with length-based partial vectors"
1547 " because there is no suitable iv type.\n");
1548 return false;
1549 }
1550
1551 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1552 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1553 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1554
1555 return true;
1556 }
1557
1558 /* Calculate the cost of one scalar iteration of the loop. */
1559 static void
1560 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1561 {
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564 int nbbs = loop->num_nodes, factor;
1565 int innerloop_iters, i;
1566
1567 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1568
1569 /* Gather costs for statements in the scalar loop. */
1570
1571 /* FORNOW. */
1572 innerloop_iters = 1;
1573 if (loop->inner)
1574 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1575
1576 for (i = 0; i < nbbs; i++)
1577 {
1578 gimple_stmt_iterator si;
1579 basic_block bb = bbs[i];
1580
1581 if (bb->loop_father == loop->inner)
1582 factor = innerloop_iters;
1583 else
1584 factor = 1;
1585
1586 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1587 {
1588 gimple *stmt = gsi_stmt (si);
1589 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1590
1591 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1592 continue;
1593
1594 /* Skip stmts that are not vectorized inside the loop. */
1595 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1596 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1597 && (!STMT_VINFO_LIVE_P (vstmt_info)
1598 || !VECTORIZABLE_CYCLE_DEF
1599 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1600 continue;
1601
1602 vect_cost_for_stmt kind;
1603 if (STMT_VINFO_DATA_REF (stmt_info))
1604 {
1605 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1606 kind = scalar_load;
1607 else
1608 kind = scalar_store;
1609 }
1610 else if (vect_nop_conversion_p (stmt_info))
1611 continue;
1612 else
1613 kind = scalar_stmt;
1614
1615 /* We are using vect_prologue here to avoid scaling twice
1616 by the inner loop factor. */
1617 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1618 factor, kind, stmt_info, 0, vect_prologue);
1619 }
1620 }
1621
1622 /* Now accumulate cost. */
1623 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1624 add_stmt_costs (loop_vinfo->scalar_costs,
1625 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1626 loop_vinfo->scalar_costs->finish_cost (nullptr);
1627 }
1628
1629
1630 /* Function vect_analyze_loop_form.
1631
1632 Verify that certain CFG restrictions hold, including:
1633 - the loop has a pre-header
1634 - the loop has a single entry and exit
1635 - the loop exit condition is simple enough
  1636    - the number of iterations can be analyzed, i.e., a countable loop.  The
1637 niter could be analyzed under some assumptions. */
1638
1639 opt_result
1640 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1641 {
1642 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1643
1644 /* Different restrictions apply when we are considering an inner-most loop,
1645 vs. an outer (nested) loop.
1646 (FORNOW. May want to relax some of these restrictions in the future). */
1647
1648 info->inner_loop_cond = NULL;
1649 if (!loop->inner)
1650 {
1651 /* Inner-most loop. We currently require that the number of BBs is
1652 exactly 2 (the header and latch). Vectorizable inner-most loops
1653 look like this:
1654
1655 (pre-header)
1656 |
1657 header <--------+
1658 | | |
1659 | +--> latch --+
1660 |
1661 (exit-bb) */
1662
1663 if (loop->num_nodes != 2)
1664 return opt_result::failure_at (vect_location,
1665 "not vectorized:"
1666 " control flow in loop.\n");
1667
1668 if (empty_block_p (loop->header))
1669 return opt_result::failure_at (vect_location,
1670 "not vectorized: empty loop.\n");
1671 }
1672 else
1673 {
1674 class loop *innerloop = loop->inner;
1675 edge entryedge;
1676
1677 /* Nested loop. We currently require that the loop is doubly-nested,
1678 contains a single inner loop, and the number of BBs is exactly 5.
1679 Vectorizable outer-loops look like this:
1680
1681 (pre-header)
1682 |
1683 header <---+
1684 | |
1685 inner-loop |
1686 | |
1687 tail ------+
1688 |
1689 (exit-bb)
1690
1691 The inner-loop has the properties expected of inner-most loops
1692 as described above. */
1693
1694 if ((loop->inner)->inner || (loop->inner)->next)
1695 return opt_result::failure_at (vect_location,
1696 "not vectorized:"
1697 " multiple nested loops.\n");
1698
1699 if (loop->num_nodes != 5)
1700 return opt_result::failure_at (vect_location,
1701 "not vectorized:"
1702 " control flow in loop.\n");
1703
1704 entryedge = loop_preheader_edge (innerloop);
1705 if (entryedge->src != loop->header
1706 || !single_exit (innerloop)
1707 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1708 return opt_result::failure_at (vect_location,
1709 "not vectorized:"
1710 " unsupported outerloop form.\n");
1711
1712 /* Analyze the inner-loop. */
1713 vect_loop_form_info inner;
1714 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1715 if (!res)
1716 {
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: Bad inner loop.\n");
1720 return res;
1721 }
1722
1723 /* Don't support analyzing niter under assumptions for inner
1724 loop. */
1725 if (!integer_onep (inner.assumptions))
1726 return opt_result::failure_at (vect_location,
1727 "not vectorized: Bad inner loop.\n");
1728
1729 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1730 return opt_result::failure_at (vect_location,
1731 "not vectorized: inner-loop count not"
1732 " invariant.\n");
1733
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Considering outer-loop vectorization.\n");
1737 info->inner_loop_cond = inner.loop_cond;
1738 }
1739
1740 if (!single_exit (loop))
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized: multiple exits.\n");
1743 if (EDGE_COUNT (loop->header->preds) != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " too many incoming edges.\n");
1747
1748 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1749 that the loop is represented as a do-while (with a proper if-guard
1750 before the loop if needed), where the loop header contains all the
1751 executable statements, and the latch is empty. */
1752 if (!empty_block_p (loop->latch)
1753 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1754 return opt_result::failure_at (vect_location,
1755 "not vectorized: latch block not empty.\n");
1756
1757 /* Make sure the exit is not abnormal. */
1758 edge e = single_exit (loop);
1759 if (e->flags & EDGE_ABNORMAL)
1760 return opt_result::failure_at (vect_location,
1761 "not vectorized:"
1762 " abnormal loop exit edge.\n");
1763
1764 info->loop_cond
1765 = vect_get_loop_niters (loop, &info->assumptions,
1766 &info->number_of_iterations,
1767 &info->number_of_iterationsm1);
1768 if (!info->loop_cond)
1769 return opt_result::failure_at
1770 (vect_location,
1771 "not vectorized: complicated exit condition.\n");
1772
1773 if (integer_zerop (info->assumptions)
1774 || !info->number_of_iterations
1775 || chrec_contains_undetermined (info->number_of_iterations))
1776 return opt_result::failure_at
1777 (info->loop_cond,
1778 "not vectorized: number of iterations cannot be computed.\n");
1779
1780 if (integer_zerop (info->number_of_iterations))
1781 return opt_result::failure_at
1782 (info->loop_cond,
1783 "not vectorized: number of iterations = 0.\n");
1784
1785 if (!(tree_fits_shwi_p (info->number_of_iterations)
1786 && tree_to_shwi (info->number_of_iterations) > 0))
1787 {
1788 if (dump_enabled_p ())
1789 {
1790 dump_printf_loc (MSG_NOTE, vect_location,
1791 "Symbolic number of iterations is ");
1792 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1793 dump_printf (MSG_NOTE, "\n");
1794 }
1795 }
1796
1797 return opt_result::success ();
1798 }
1799
1800 /* Create a loop_vec_info for LOOP with SHARED and the
1801 vect_analyze_loop_form result. */
1802
1803 loop_vec_info
1804 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1805 const vect_loop_form_info *info,
1806 loop_vec_info main_loop_info)
1807 {
1808 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1809 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1810 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1811 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1812 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1813 /* Also record the assumptions for versioning. */
1814 if (!integer_onep (info->assumptions) && !main_loop_info)
1815 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1816
1817 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1818 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1819 if (info->inner_loop_cond)
1820 {
1821 stmt_vec_info inner_loop_cond_info
1822 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1823 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824 /* If we have an estimate on the number of iterations of the inner
1825 loop, use that to limit the scale for costing; otherwise use
1826 --param vect-inner-loop-cost-factor literally. */
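/* Illustrative figures only: with --param vect-inner-loop-cost-factor=50
   and an inner loop whose statements are estimated to execute 4 times,
   the factor recorded below is min (4, 50) = 4.  */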
1827 widest_int nit;
1828 if (estimated_stmt_executions (loop->inner, &nit))
1829 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1830 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1831 }
1832
1833 return loop_vinfo;
1834 }
1835
1836
1837
1838 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1839 statements update the vectorization factor. */
1840
1841 static void
1842 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1843 {
1844 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1845 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1846 int nbbs = loop->num_nodes;
1847 poly_uint64 vectorization_factor;
1848 int i;
1849
1850 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1851
1852 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1853 gcc_assert (known_ne (vectorization_factor, 0U));
1854
1855 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1856 vectorization factor of the loop is the unrolling factor required by
1857 the SLP instances. If that unrolling factor is 1, we say that we
1858 perform pure SLP on the loop - cross-iteration parallelism is not
1859 exploited. */
1860 bool only_slp_in_loop = true;
1861 for (i = 0; i < nbbs; i++)
1862 {
1863 basic_block bb = bbs[i];
1864 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1865 gsi_next (&si))
1866 {
1867 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1868 if (!stmt_info)
1869 continue;
1870 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1871 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1872 && !PURE_SLP_STMT (stmt_info))
1873 /* STMT needs both SLP and loop-based vectorization. */
1874 only_slp_in_loop = false;
1875 }
1876 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1877 gsi_next (&si))
1878 {
1879 if (is_gimple_debug (gsi_stmt (si)))
1880 continue;
1881 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1882 stmt_info = vect_stmt_to_vectorize (stmt_info);
1883 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1884 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1885 && !PURE_SLP_STMT (stmt_info))
1886 /* STMT needs both SLP and loop-based vectorization. */
1887 only_slp_in_loop = false;
1888 }
1889 }
1890
1891 if (only_slp_in_loop)
1892 {
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_NOTE, vect_location,
1895 "Loop contains only SLP stmts\n");
1896 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1897 }
1898 else
1899 {
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Loop contains SLP and non-SLP stmts\n");
1903 /* Both the vectorization factor and unroll factor have the form
1904 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1905 so they must have a common multiple. */
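/* For instance (purely illustrative), with a 16-byte vector mode, a loop
   vectorization factor of 8 (X = 1/2) and an SLP unrolling factor of 4
   (X = 1/4) combine to the common multiple 8 below.  */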
1906 vectorization_factor
1907 = force_common_multiple (vectorization_factor,
1908 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1909 }
1910
1911 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1912 if (dump_enabled_p ())
1913 {
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "Updating vectorization factor to ");
1916 dump_dec (MSG_NOTE, vectorization_factor);
1917 dump_printf (MSG_NOTE, ".\n");
1918 }
1919 }
1920
1921 /* Return true if STMT_INFO describes a double reduction phi and if
1922 the other phi in the reduction is also relevant for vectorization.
1923 This rejects cases such as:
1924
1925 outer1:
1926 x_1 = PHI <x_3(outer2), ...>;
1927 ...
1928
1929 inner:
1930 x_2 = ...;
1931 ...
1932
1933 outer2:
1934 x_3 = PHI <x_2(inner)>;
1935
1936 if nothing in x_2 or elsewhere makes x_1 relevant. */
1937
1938 static bool
1939 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1940 {
1941 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1942 return false;
1943
1944 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1945 }
1946
1947 /* Function vect_analyze_loop_operations.
1948
1949 Scan the loop stmts and make sure they are all vectorizable. */
1950
1951 static opt_result
1952 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1953 {
1954 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1955 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1956 int nbbs = loop->num_nodes;
1957 int i;
1958 stmt_vec_info stmt_info;
1959 bool need_to_vectorize = false;
1960 bool ok;
1961
1962 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1963
1964 auto_vec<stmt_info_for_cost> cost_vec;
1965
1966 for (i = 0; i < nbbs; i++)
1967 {
1968 basic_block bb = bbs[i];
1969
1970 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1971 gsi_next (&si))
1972 {
1973 gphi *phi = si.phi ();
1974 ok = true;
1975
1976 stmt_info = loop_vinfo->lookup_stmt (phi);
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1979 (gimple *) phi);
1980 if (virtual_operand_p (gimple_phi_result (phi)))
1981 continue;
1982
1983 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1984 (i.e., a phi in the tail of the outer-loop). */
1985 if (! is_loop_header_bb_p (bb))
1986 {
1987 /* FORNOW: we currently don't support the case that these phis
1988 are not used in the outer loop (unless it is a double reduction,
1989 i.e., this phi is a vect_double_reduction_def), because this case
1990 would require us to actually do something here. */
1991 if (STMT_VINFO_LIVE_P (stmt_info)
1992 && !vect_active_double_reduction_p (stmt_info))
1993 return opt_result::failure_at (phi,
1994 "Unsupported loop-closed phi"
1995 " in outer-loop.\n");
1996
1997 /* If PHI is used in the outer loop, we check that its operand
1998 is defined in the inner loop. */
1999 if (STMT_VINFO_RELEVANT_P (stmt_info))
2000 {
2001 tree phi_op;
2002
2003 if (gimple_phi_num_args (phi) != 1)
2004 return opt_result::failure_at (phi, "unsupported phi");
2005
2006 phi_op = PHI_ARG_DEF (phi, 0);
2007 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2008 if (!op_def_info)
2009 return opt_result::failure_at (phi, "unsupported phi\n");
2010
2011 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2012 && (STMT_VINFO_RELEVANT (op_def_info)
2013 != vect_used_in_outer_by_reduction))
2014 return opt_result::failure_at (phi, "unsupported phi\n");
2015
2016 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2017 || (STMT_VINFO_DEF_TYPE (stmt_info)
2018 == vect_double_reduction_def))
2019 && !vectorizable_lc_phi (loop_vinfo,
2020 stmt_info, NULL, NULL))
2021 return opt_result::failure_at (phi, "unsupported phi\n");
2022 }
2023
2024 continue;
2025 }
2026
2027 gcc_assert (stmt_info);
2028
2029 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2030 || STMT_VINFO_LIVE_P (stmt_info))
2031 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2032 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2033 /* A scalar-dependence cycle that we don't support. */
2034 return opt_result::failure_at (phi,
2035 "not vectorized:"
2036 " scalar dependence cycle.\n");
2037
2038 if (STMT_VINFO_RELEVANT_P (stmt_info))
2039 {
2040 need_to_vectorize = true;
2041 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2042 && ! PURE_SLP_STMT (stmt_info))
2043 ok = vectorizable_induction (loop_vinfo,
2044 stmt_info, NULL, NULL,
2045 &cost_vec);
2046 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2047 || (STMT_VINFO_DEF_TYPE (stmt_info)
2048 == vect_double_reduction_def)
2049 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2050 && ! PURE_SLP_STMT (stmt_info))
2051 ok = vectorizable_reduction (loop_vinfo,
2052 stmt_info, NULL, NULL, &cost_vec);
2053 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2054 == vect_first_order_recurrence)
2055 && ! PURE_SLP_STMT (stmt_info))
2056 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2057 &cost_vec);
2058 }
2059
2060 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2061 if (ok
2062 && STMT_VINFO_LIVE_P (stmt_info)
2063 && !PURE_SLP_STMT (stmt_info))
2064 ok = vectorizable_live_operation (loop_vinfo,
2065 stmt_info, NULL, NULL, NULL,
2066 -1, false, &cost_vec);
2067
2068 if (!ok)
2069 return opt_result::failure_at (phi,
2070 "not vectorized: relevant phi not "
2071 "supported: %G",
2072 static_cast <gimple *> (phi));
2073 }
2074
2075 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2076 gsi_next (&si))
2077 {
2078 gimple *stmt = gsi_stmt (si);
2079 if (!gimple_clobber_p (stmt)
2080 && !is_gimple_debug (stmt))
2081 {
2082 opt_result res
2083 = vect_analyze_stmt (loop_vinfo,
2084 loop_vinfo->lookup_stmt (stmt),
2085 &need_to_vectorize,
2086 NULL, NULL, &cost_vec);
2087 if (!res)
2088 return res;
2089 }
2090 }
2091 } /* bbs */
2092
2093 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2094
2095 /* All operations in the loop are either irrelevant (they deal with loop
2096 control or are dead), or are only used outside the loop and can be moved
2097 out of it (e.g. invariants, inductions). The loop can be
2098 optimized away by scalar optimizations. We're better off not
2099 touching this loop. */
2100 if (!need_to_vectorize)
2101 {
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "All the computation can be taken out of the loop.\n");
2105 return opt_result::failure_at
2106 (vect_location,
2107 "not vectorized: redundant loop. no profit to vectorize.\n");
2108 }
2109
2110 return opt_result::success ();
2111 }
2112
2113 /* Return true if we know that the iteration count is smaller than the
2114 vectorization factor. Return false if it isn't, or if we can't be sure
2115 either way. */
2116
2117 static bool
2118 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2119 {
2120 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2121
2122 HOST_WIDE_INT max_niter;
2123 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2124 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2125 else
2126 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2127
2128 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2129 return true;
2130
2131 return false;
2132 }
2133
2134 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2135 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2136 definitely no, or -1 if it's worth retrying. */
2137
2138 static int
2139 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2140 unsigned *suggested_unroll_factor)
2141 {
2142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2143 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2144
2145 /* Only loops that can handle partially-populated vectors can have iteration
2146 counts less than the vectorization factor. */
2147 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2148 && vect_known_niters_smaller_than_vf (loop_vinfo))
2149 {
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2152 "not vectorized: iteration count smaller than "
2153 "vectorization factor.\n");
2154 return 0;
2155 }
2156
2157 /* If we know the number of iterations we can do better: for the
2158 epilogue we can also decide whether the main loop leaves us
2159 with enough iterations, preferring a smaller vector epilogue that is
2160 then also possibly used for the case we skip the vector loop. */
2161 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2162 {
2163 widest_int scalar_niters
2164 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2165 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2166 {
2167 loop_vec_info orig_loop_vinfo
2168 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2169 unsigned lowest_vf
2170 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2171 int prolog_peeling = 0;
2172 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2173 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2174 if (prolog_peeling >= 0
2175 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2176 lowest_vf))
2177 {
2178 unsigned gap
2179 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2180 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2181 % lowest_vf + gap);
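/* Illustrative example: 105 scalar iterations, no peeling for gaps,
   3 prologue iterations and a main-loop VF of 4 leave
   (105 - 0 - 3) % 4 + 0 = 2 iterations for this epilogue.  */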
2182 }
2183 }
2184 /* Reject vectorizing for a single scalar iteration, even if
2185 we could in principle implement that using partial vectors. */
2186 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2187 if (scalar_niters <= peeling_gap + 1)
2188 {
2189 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191 "not vectorized: loop only has a single "
2192 "scalar iteration.\n");
2193 return 0;
2194 }
2195
2196 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2197 {
2198 /* Check that the loop processes at least one full vector. */
2199 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2200 if (known_lt (scalar_niters, vf))
2201 {
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "loop does not have enough iterations "
2205 "to support vectorization.\n");
2206 return 0;
2207 }
2208
2209 /* If we need to peel an extra epilogue iteration to handle data
2210 accesses with gaps, check that there are enough scalar iterations
2211 available.
2212
2213 The check above is redundant with this one when peeling for gaps,
2214 but the distinction is useful for diagnostics. */
2215 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2216 && known_le (scalar_niters, vf))
2217 {
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "loop does not have enough iterations "
2221 "to support peeling for gaps.\n");
2222 return 0;
2223 }
2224 }
2225 }
2226
2227 /* If using the "very cheap" model, reject cases in which we'd keep
2228 a copy of the scalar code (even if we might be able to vectorize it). */
2229 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2230 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2231 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2232 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2233 {
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "some scalar iterations would need to be peeled\n");
2237 return 0;
2238 }
2239
2240 int min_profitable_iters, min_profitable_estimate;
2241 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2242 &min_profitable_estimate,
2243 suggested_unroll_factor);
2244
2245 if (min_profitable_iters < 0)
2246 {
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: vectorization not profitable.\n");
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "not vectorized: vector version will never be "
2253 "profitable.\n");
2254 return -1;
2255 }
2256
2257 int min_scalar_loop_bound = (param_min_vect_loop_bound
2258 * assumed_vf);
2259
2260 /* Use the cost model only if it is more conservative than the user-specified
2261 threshold. */
2262 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2263 min_profitable_iters);
2264
2265 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2266
2267 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2268 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2269 {
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "not vectorized: vectorization not profitable.\n");
2273 if (dump_enabled_p ())
2274 dump_printf_loc (MSG_NOTE, vect_location,
2275 "not vectorized: iteration count smaller than user "
2276 "specified loop bound parameter or minimum profitable "
2277 "iterations (whichever is more conservative).\n");
2278 return 0;
2279 }
2280
2281 /* The static profitability threshold min_profitable_estimate includes
2282 the cost of having to check at runtime whether the scalar loop
2283 should be used instead. If it turns out that we don't need or want
2284 such a check, the threshold we should use for the static estimate
2285 is simply the point at which the vector loop becomes more profitable
2286 than the scalar loop. */
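/* Purely illustrative figures: if min_profitable_iters is 8 but the cost
   of the runtime check pushes min_profitable_estimate to 12, then when no
   versioning, peeling or runtime profitability check is needed the static
   estimate can be lowered back to 8, as done below.  */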
2287 if (min_profitable_estimate > min_profitable_iters
2288 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2289 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2290 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2291 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2292 {
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2295 " choice between the scalar and vector loops\n");
2296 min_profitable_estimate = min_profitable_iters;
2297 }
2298
2299 /* If the vector loop needs multiple iterations to be beneficial then
2300 things are probably too close to call, and the conservative thing
2301 would be to stick with the scalar code. */
2302 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2303 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2304 {
2305 if (dump_enabled_p ())
2306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2307 "one iteration of the vector loop would be"
2308 " more expensive than the equivalent number of"
2309 " iterations of the scalar loop\n");
2310 return 0;
2311 }
2312
2313 HOST_WIDE_INT estimated_niter;
2314
2315 /* If we are vectorizing an epilogue then we know the maximum number of
2316 scalar iterations it will cover is at least one lower than the
2317 vectorization factor of the main loop. */
2318 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2319 estimated_niter
2320 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2321 else
2322 {
2323 estimated_niter = estimated_stmt_executions_int (loop);
2324 if (estimated_niter == -1)
2325 estimated_niter = likely_max_stmt_executions_int (loop);
2326 }
2327 if (estimated_niter != -1
2328 && ((unsigned HOST_WIDE_INT) estimated_niter
2329 < MAX (th, (unsigned) min_profitable_estimate)))
2330 {
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "not vectorized: estimated iteration count too "
2334 "small.\n");
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_NOTE, vect_location,
2337 "not vectorized: estimated iteration count smaller "
2338 "than specified loop bound parameter or minimum "
2339 "profitable iterations (whichever is more "
2340 "conservative).\n");
2341 return -1;
2342 }
2343
2344 return 1;
2345 }
2346
2347 static opt_result
2348 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2349 vec<data_reference_p> *datarefs,
2350 unsigned int *n_stmts)
2351 {
2352 *n_stmts = 0;
2353 for (unsigned i = 0; i < loop->num_nodes; i++)
2354 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2355 !gsi_end_p (gsi); gsi_next (&gsi))
2356 {
2357 gimple *stmt = gsi_stmt (gsi);
2358 if (is_gimple_debug (stmt))
2359 continue;
2360 ++(*n_stmts);
2361 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2362 NULL, 0);
2363 if (!res)
2364 {
2365 if (is_gimple_call (stmt) && loop->safelen)
2366 {
2367 tree fndecl = gimple_call_fndecl (stmt), op;
2368 if (fndecl == NULL_TREE
2369 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2370 {
2371 fndecl = gimple_call_arg (stmt, 0);
2372 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2373 fndecl = TREE_OPERAND (fndecl, 0);
2374 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2375 }
2376 if (fndecl != NULL_TREE)
2377 {
2378 cgraph_node *node = cgraph_node::get (fndecl);
2379 if (node != NULL && node->simd_clones != NULL)
2380 {
2381 unsigned int j, n = gimple_call_num_args (stmt);
2382 for (j = 0; j < n; j++)
2383 {
2384 op = gimple_call_arg (stmt, j);
2385 if (DECL_P (op)
2386 || (REFERENCE_CLASS_P (op)
2387 && get_base_address (op)))
2388 break;
2389 }
2390 op = gimple_call_lhs (stmt);
2391 /* Ignore #pragma omp declare simd functions
2392 if they don't have data references in the
2393 call stmt itself. */
2394 if (j == n
2395 && !(op
2396 && (DECL_P (op)
2397 || (REFERENCE_CLASS_P (op)
2398 && get_base_address (op)))))
2399 continue;
2400 }
2401 }
2402 }
2403 return res;
2404 }
2405 /* If dependence analysis will give up due to the limit on the
2406 number of datarefs, stop here and fail fatally. */
2407 if (datarefs->length ()
2408 > (unsigned)param_loop_max_datarefs_for_datadeps)
2409 return opt_result::failure_at (stmt, "exceeded param "
2410 "loop-max-datarefs-for-datadeps\n");
2411 }
2412 return opt_result::success ();
2413 }
2414
2415 /* Look for SLP-only access groups and turn each individual access into its own
2416 group. */
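/* For example, a grouped access with DR_GROUP_SIZE 4 is split into four
   groups of size 1, each becoming its own group leader with DR_GROUP_GAP 3
   (or 0 for strided accesses).  */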
2417 static void
2418 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2419 {
2420 unsigned int i;
2421 struct data_reference *dr;
2422
2423 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2424
2425 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2426 FOR_EACH_VEC_ELT (datarefs, i, dr)
2427 {
2428 gcc_assert (DR_REF (dr));
2429 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2430
2431 /* Check if the access is part of an interleaving chain. */
2432 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2433 {
2434 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2435 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2436 unsigned int group_size = DR_GROUP_SIZE (first_element);
2437
2438 /* Check if this is an SLP-only group. */
2439 if (!STMT_SLP_TYPE (stmt_info)
2440 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2441 {
2442 /* Dissolve the group. */
2443 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2444
2445 stmt_vec_info vinfo = first_element;
2446 while (vinfo)
2447 {
2448 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2449 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2450 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2451 DR_GROUP_SIZE (vinfo) = 1;
2452 if (STMT_VINFO_STRIDED_P (first_element))
2453 DR_GROUP_GAP (vinfo) = 0;
2454 else
2455 DR_GROUP_GAP (vinfo) = group_size - 1;
2456 /* Duplicate and adjust the alignment info; it needs to
2457 be present on each group leader, see dr_misalignment. */
2458 if (vinfo != first_element)
2459 {
2460 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2461 dr_info2->target_alignment = dr_info->target_alignment;
2462 int misalignment = dr_info->misalignment;
2463 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2464 {
2465 HOST_WIDE_INT diff
2466 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2467 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2468 unsigned HOST_WIDE_INT align_c
2469 = dr_info->target_alignment.to_constant ();
2470 misalignment = (misalignment + diff) % align_c;
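/* E.g. a leader misaligned by 4 bytes, a member whose DR_INIT is
   8 bytes further on and a target alignment of 16 give
   (4 + 8) % 16 = 12.  */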
2471 }
2472 dr_info2->misalignment = misalignment;
2473 }
2474 vinfo = next;
2475 }
2476 }
2477 }
2478 }
2479 }
2480
2481 /* Determine if operating on full vectors for LOOP_VINFO might leave
2482 some scalar iterations still to do. If so, decide how we should
2483 handle those scalar iterations. The possibilities are:
2484
2485 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2486 In this case:
2487
2488 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2489 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2490 LOOP_VINFO_PEELING_FOR_NITER == false
2491
2492 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2493 to handle the remaining scalar iterations. In this case:
2494
2495 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2496 LOOP_VINFO_PEELING_FOR_NITER == true
2497
2498 There are two choices:
2499
2500 (2a) Consider vectorizing the epilogue loop at the same VF as the
2501 main loop, but using partial vectors instead of full vectors.
2502 In this case:
2503
2504 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2505
2506 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2507 In this case:
2508
2509 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2510 */
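/* As a rough illustration, a loop with 1003 scalar iterations and a VF of 8
   is handled either by 126 partial-vector iterations, the last of which has
   only 3 active lanes (case 1), or by 125 full-vector iterations followed by
   an epilogue that covers the remaining 3 scalar iterations (case 2).  */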
2511
2512 opt_result
2513 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2514 {
2515 /* Determine whether there would be any scalar iterations left over. */
2516 bool need_peeling_or_partial_vectors_p
2517 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2518
2519 /* Decide whether to vectorize the loop with partial vectors. */
2520 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2521 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2522 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2523 && need_peeling_or_partial_vectors_p)
2524 {
2525 /* For partial-vector-usage=1, try to push the handling of partial
2526 vectors to the epilogue, with the main loop continuing to operate
2527 on full vectors.
2528
2529 If we are unrolling we also do not want to use partial vectors. This
2530 is to avoid the overhead of generating multiple masks and also to
2531 avoid having to execute entire iterations of FALSE masked instructions
2532 when dealing with one or fewer full iterations.
2533
2534 ??? We could then end up failing to use partial vectors if we
2535 decide to peel iterations into a prologue, and if the main loop
2536 then ends up processing fewer than VF iterations. */
2537 if ((param_vect_partial_vector_usage == 1
2538 || loop_vinfo->suggested_unroll_factor > 1)
2539 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2540 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2541 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2542 else
2543 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2544 }
2545
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_NOTE, vect_location,
2548 "operating on %s vectors%s.\n",
2549 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2550 ? "partial" : "full",
2551 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2552 ? " for epilogue loop" : "");
2553
2554 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2555 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2556 && need_peeling_or_partial_vectors_p);
2557
2558 return opt_result::success ();
2559 }
2560
2561 /* Function vect_analyze_loop_2.
2562
2563 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2564 analyses record information in some members of LOOP_VINFO. FATAL
2565 indicates whether some analysis hit a fatal error. If the pointer
2566 SUGGESTED_UNROLL_FACTOR is non-NULL, it is meant to be filled with a
2567 worked-out suggested unroll factor, while a NULL pointer indicates that
2568 we are applying the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2569 holds the SLP decision made when the suggested unroll factor was worked
2570 out. */
2571 static opt_result
2572 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2573 unsigned *suggested_unroll_factor,
2574 bool& slp_done_for_suggested_uf)
2575 {
2576 opt_result ok = opt_result::success ();
2577 int res;
2578 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2579 poly_uint64 min_vf = 2;
2580 loop_vec_info orig_loop_vinfo = NULL;
2581
2582 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2583 loop_vec_info of the first vectorized loop. */
2584 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2585 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2586 else
2587 orig_loop_vinfo = loop_vinfo;
2588 gcc_assert (orig_loop_vinfo);
2589
2590 /* The first group of checks is independent of the vector size. */
2591 fatal = true;
2592
2593 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2594 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2595 return opt_result::failure_at (vect_location,
2596 "not vectorized: simd if(0)\n");
2597
2598 /* Find all data references in the loop (which correspond to vdefs/vuses)
2599 and analyze their evolution in the loop. */
2600
2601 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2602
2603 /* Gather the data references and count stmts in the loop. */
2604 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2605 {
2606 opt_result res
2607 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2608 &LOOP_VINFO_DATAREFS (loop_vinfo),
2609 &LOOP_VINFO_N_STMTS (loop_vinfo));
2610 if (!res)
2611 {
2612 if (dump_enabled_p ())
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "not vectorized: loop contains function "
2615 "calls or data references that cannot "
2616 "be analyzed\n");
2617 return res;
2618 }
2619 loop_vinfo->shared->save_datarefs ();
2620 }
2621 else
2622 loop_vinfo->shared->check_datarefs ();
2623
2624 /* Analyze the data references and also adjust the minimal
2625 vectorization factor according to the loads and stores. */
2626
2627 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2628 if (!ok)
2629 {
2630 if (dump_enabled_p ())
2631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2632 "bad data references.\n");
2633 return ok;
2634 }
2635
2636 /* Check if we are applying the suggested unroll factor now. */
2637 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2638 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2639
2640 /* If the SLP decision was false when the suggested unroll factor was
2641 worked out, and we are now applying that unroll factor, we can simply
2642 skip all SLP-related analyses this time. */
2643 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2644
2645 /* Classify all cross-iteration scalar data-flow cycles.
2646 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2647 vect_analyze_scalar_cycles (loop_vinfo, slp);
2648
2649 vect_pattern_recog (loop_vinfo);
2650
2651 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2652
2653 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2654 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2655
2656 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2657 if (!ok)
2658 {
2659 if (dump_enabled_p ())
2660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2661 "bad data access.\n");
2662 return ok;
2663 }
2664
2665 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2666
2667 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2668 if (!ok)
2669 {
2670 if (dump_enabled_p ())
2671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2672 "unexpected pattern.\n");
2673 return ok;
2674 }
2675
2676 /* The rest of the analysis below depends on the vector size in some way. */
2677 fatal = false;
2678
2679 /* Analyze data dependences between the data-refs in the loop
2680 and adjust the maximum vectorization factor according to
2681 the dependences.
2682 FORNOW: fail at the first data dependence that we encounter. */
2683
2684 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2685 if (!ok)
2686 {
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2689 "bad data dependence.\n");
2690 return ok;
2691 }
2692 if (max_vf != MAX_VECTORIZATION_FACTOR
2693 && maybe_lt (max_vf, min_vf))
2694 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2695 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2696
2697 ok = vect_determine_vectorization_factor (loop_vinfo);
2698 if (!ok)
2699 {
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702 "can't determine vectorization factor.\n");
2703 return ok;
2704 }
2705 if (max_vf != MAX_VECTORIZATION_FACTOR
2706 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2707 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2708
2709 /* Compute the scalar iteration cost. */
2710 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2711
2712 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2713
2714 if (slp)
2715 {
2716 /* Check the SLP opportunities in the loop, analyze and build
2717 SLP trees. */
2718 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2719 if (!ok)
2720 return ok;
2721
2722 /* If there are any SLP instances mark them as pure_slp. */
2723 slp = vect_make_slp_decision (loop_vinfo);
2724 if (slp)
2725 {
2726 /* Find stmts that need to be both vectorized and SLPed. */
2727 vect_detect_hybrid_slp (loop_vinfo);
2728
2729 /* Update the vectorization factor based on the SLP decision. */
2730 vect_update_vf_for_slp (loop_vinfo);
2731
2732 /* Optimize the SLP graph with the vectorization factor fixed. */
2733 vect_optimize_slp (loop_vinfo);
2734
2735 /* Gather the loads reachable from the SLP graph entries. */
2736 vect_gather_slp_loads (loop_vinfo);
2737 }
2738 }
2739
2740 bool saved_can_use_partial_vectors_p
2741 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2742
2743 /* We don't expect to have to roll back to anything other than an empty
2744 set of rgroups. */
2745 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2746
2747 /* This is the point where we can re-start analysis with SLP forced off. */
2748 start_over:
2749
2750 /* Apply the suggested unrolling factor; this was determined by the backend
2751 during finish_cost the first time we ran the analysis for this
2752 vector mode. */
2753 if (applying_suggested_uf)
2754 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2755
2756 /* Now the vectorization factor is final. */
2757 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2758 gcc_assert (known_ne (vectorization_factor, 0U));
2759
2760 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2761 {
2762 dump_printf_loc (MSG_NOTE, vect_location,
2763 "vectorization_factor = ");
2764 dump_dec (MSG_NOTE, vectorization_factor);
2765 dump_printf (MSG_NOTE, ", niters = %wd\n",
2766 LOOP_VINFO_INT_NITERS (loop_vinfo));
2767 }
2768
2769 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2770
2771 /* Analyze the alignment of the data-refs in the loop.
2772 Fail if a data reference is found that cannot be vectorized. */
2773
2774 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2775 if (!ok)
2776 {
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2779 "bad data alignment.\n");
2780 return ok;
2781 }
2782
2783 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2784 It is important to call pruning after vect_analyze_data_ref_accesses,
2785 since we use grouping information gathered by interleaving analysis. */
2786 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2787 if (!ok)
2788 return ok;
2789
2790 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2791 vectorization, since we do not want to add extra peeling or
2792 add versioning for alignment. */
2793 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2794 /* This pass will decide on using loop versioning and/or loop peeling in
2795 order to enhance the alignment of data references in the loop. */
2796 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2797 if (!ok)
2798 return ok;
2799
2800 if (slp)
2801 {
2802 /* Analyze operations in the SLP instances. Note this may
2803 remove unsupported SLP instances which makes the above
2804 SLP kind detection invalid. */
2805 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2806 vect_slp_analyze_operations (loop_vinfo);
2807 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2808 {
2809 ok = opt_result::failure_at (vect_location,
2810 "unsupported SLP instances\n");
2811 goto again;
2812 }
2813
2814 /* Check whether any load in ALL SLP instances is possibly permuted. */
2815 slp_tree load_node, slp_root;
2816 unsigned i, x;
2817 slp_instance instance;
2818 bool can_use_lanes = true;
2819 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2820 {
2821 slp_root = SLP_INSTANCE_TREE (instance);
2822 int group_size = SLP_TREE_LANES (slp_root);
2823 tree vectype = SLP_TREE_VECTYPE (slp_root);
2824 bool loads_permuted = false;
2825 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2826 {
2827 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2828 continue;
2829 unsigned j;
2830 stmt_vec_info load_info;
2831 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2832 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2833 {
2834 loads_permuted = true;
2835 break;
2836 }
2837 }
2838
2839 /* If the loads and stores can be handled with load/store-lane
2840 instructions, record it and move on to the next instance. */
2841 if (loads_permuted
2842 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2843 && vect_store_lanes_supported (vectype, group_size, false))
2844 {
2845 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2846 {
2847 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2848 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2849 /* Use SLP for strided accesses (or if we can't
2850 load-lanes). */
2851 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2852 || ! vect_load_lanes_supported
2853 (STMT_VINFO_VECTYPE (stmt_vinfo),
2854 DR_GROUP_SIZE (stmt_vinfo), false))
2855 break;
2856 }
2857
2858 can_use_lanes
2859 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2860
2861 if (can_use_lanes && dump_enabled_p ())
2862 dump_printf_loc (MSG_NOTE, vect_location,
2863 "SLP instance %p can use load/store-lanes\n",
2864 (void *) instance);
2865 }
2866 else
2867 {
2868 can_use_lanes = false;
2869 break;
2870 }
2871 }
2872
2873 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2874 with SLP disabled. */
2875 if (can_use_lanes)
2876 {
2877 ok = opt_result::failure_at (vect_location,
2878 "Built SLP cancelled: can use "
2879 "load/store-lanes\n");
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882 "Built SLP cancelled: all SLP instances support "
2883 "load/store-lanes\n");
2884 goto again;
2885 }
2886 }
2887
2888 /* Dissolve SLP-only groups. */
2889 vect_dissolve_slp_only_groups (loop_vinfo);
2890
2891 /* Scan all the remaining operations in the loop that are not subject
2892 to SLP and make sure they are vectorizable. */
2893 ok = vect_analyze_loop_operations (loop_vinfo);
2894 if (!ok)
2895 {
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898 "bad operation or unsupported loop bound.\n");
2899 return ok;
2900 }
2901
2902 /* For now, we don't expect to mix both masking and length approaches for one
2903 loop; disable partial vectors if both are recorded. */
2904 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2905 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2906 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "can't vectorize a loop with partial vectors"
2911 " because we don't expect to mix different"
2912 " approaches with partial vectors for the"
2913 " same loop.\n");
2914 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2915 }
2916
2917 /* If we still have the option of using partial vectors,
2918 check whether we can generate the necessary loop controls. */
2919 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2920 {
2921 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2922 {
2923 if (!vect_verify_full_masking (loop_vinfo)
2924 && !vect_verify_full_masking_avx512 (loop_vinfo))
2925 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2926 }
2927 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2928 if (!vect_verify_loop_lens (loop_vinfo))
2929 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2930 }
2931
2932 /* If we're vectorizing a loop that uses length "controls" and
2933 can iterate more than once, we apply the decrementing IV approach
2934 to the loop control. */
2935 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2936 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2937 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2938 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2939 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2940 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2941 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2942
2943 /* If a loop uses length controls and has a decrementing loop control IV,
2944 we will normally pass that IV through a MIN_EXPR to calculate the
2945 basis for the length controls. E.g. in a loop that processes one
2946 element per scalar iteration, the number of elements would be
2947 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2948
2949 This MIN_EXPR approach allows us to use pointer IVs with an invariant
2950 step, since only the final iteration of the vector loop can have
2951 inactive lanes.
2952
2953 However, some targets have a dedicated instruction for calculating the
2954 preferred length, given the total number of elements that still need to
2955 be processed. This is encapsulated in the SELECT_VL internal function.
2956
2957 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2958 to determine the basis for the length controls. However, unlike the
2959 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2960 lanes inactive in any iteration of the vector loop, not just the last
2961 iteration. This SELECT_VL approach therefore requires us to use pointer
2962 IVs with variable steps.
2963
2964 Once we've decided how many elements should be processed by one
2965 iteration of the vector loop, we need to populate the rgroup controls.
2966 If a loop has multiple rgroups, we need to make sure that those rgroups
2967 "line up" (that is, they must be consistent about which elements are
2968 active and which aren't). This is done by vect_adjust_loop_lens_control.
2969
2970 In principle, it would be possible to use vect_adjust_loop_lens_control
2971 on either the result of a MIN_EXPR or the result of a SELECT_VL.
2972 However:
2973
2974 (1) In practice, it only makes sense to use SELECT_VL when a vector
2975 operation will be controlled directly by the result. It is not
2976 worth using SELECT_VL if it would only be the input to other
2977 calculations.
2978
2979 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2980 pointer IV will need N updates by a variable amount (N-1 updates
2981 within the iteration and 1 update to move to the next iteration).
2982
2983 Because of this, we prefer to use the MIN_EXPR approach whenever there
2984 is more than one length control.
2985
2986 In addition, SELECT_VL always operates to a granularity of 1 unit.
2987 If we wanted to use it to control an SLP operation on N consecutive
2988 elements, we would need to make the SELECT_VL inputs measure scalar
2989 iterations (rather than elements) and then multiply the SELECT_VL
2990 result by N. But using SELECT_VL this way is inefficient because
2991 of (1) above.
2992
2993 In addition, we don't apply SELECT_VL to a single rgroup when both of
2994 the following hold:
2995
2996 (a) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2997 (b) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2998
2999 This is because SELECT_VL (with its variable step) makes SCEV analysis
3000 fail, so we would lose the benefit of subsequent unroll optimizations.
3001 We prefer the MIN_EXPR approach in this situation. */
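/* A rough sketch of the two schemes for a single length control, with N
   counting the scalar iterations left and VF elements per vector iteration
   (details simplified):

     MIN_EXPR approach:    len = MIN (N, VF);         N = N - len;
     SELECT_VL approach:   len = SELECT_VL (N, VF);   N = N - len;

   In the first scheme len equals VF on every iteration except possibly the
   last, so pointer IVs can step by an invariant amount; in the second, len
   may drop below VF on any iteration, so pointer IVs need variable steps.  */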
3002 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3003 {
3004 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3005 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3006 OPTIMIZE_FOR_SPEED)
3007 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3008 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3009 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3010 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3011 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3012 }
3013
3014 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3015 assuming that the loop will be used as a main loop. We will redo
3016 this analysis later if we instead decide to use the loop as an
3017 epilogue loop. */
3018 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3019 if (!ok)
3020 return ok;
3021
3022 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3023 to be able to handle fewer than VF scalars, or needs to have a lower VF
3024 than the main loop. */
3025 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3026 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3027 {
3028 poly_uint64 unscaled_vf
3029 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3030 orig_loop_vinfo->suggested_unroll_factor);
3031 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3032 return opt_result::failure_at (vect_location,
3033 "Vectorization factor too high for"
3034 " epilogue loop.\n");
3035 }
3036
3037 /* Check the costings of the loop make vectorizing worthwhile. */
3038 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3039 if (res < 0)
3040 {
3041 ok = opt_result::failure_at (vect_location,
3042 "Loop costings may not be worthwhile.\n");
3043 goto again;
3044 }
3045 if (!res)
3046 return opt_result::failure_at (vect_location,
3047 "Loop costings not worthwhile.\n");
3048
3049 /* If an epilogue loop is required make sure we can create one. */
3050 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3051 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3052 {
3053 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3055 if (!vect_can_advance_ivs_p (loop_vinfo)
3056 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3057 single_exit (LOOP_VINFO_LOOP
3058 (loop_vinfo))))
3059 {
3060 ok = opt_result::failure_at (vect_location,
3061 "not vectorized: can't create required "
3062 "epilog loop\n");
3063 goto again;
3064 }
3065 }
3066
3067 /* During peeling, we need to check if number of loop iterations is
3068 enough for both peeled prolog loop and vector loop. This check
3069 can be merged along with threshold check of loop versioning, so
3070 increase threshold for this case if necessary.
3071
3072 If we are analyzing an epilogue we still want to check what its
3073 versioning threshold would be. If we decide to vectorize the epilogues we
3074 will want to use the lowest versioning threshold of all epilogues and main
3075 loop. This will enable us to enter a vectorized epilogue even when
3076 versioning the loop. We can't simply check whether the epilogue requires
3077 versioning though since we may have skipped some versioning checks when
3078 analyzing the epilogue. For instance, checks for alias versioning will be
3079 skipped when dealing with epilogues as we assume we already checked them
3080 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3081 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3082 {
3083 poly_uint64 niters_th = 0;
3084 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3085
3086 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3087 {
3088 /* Niters for peeled prolog loop. */
3089 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3090 {
3091 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3092 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3093 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3094 }
3095 else
3096 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3097 }
3098
3099 /* Niters for at least one iteration of vectorized loop. */
3100 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3101 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3102 /* One additional iteration because of peeling for gap. */
3103 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3104 niters_th += 1;
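/* Illustrative example: 3 prologue iterations for alignment, a VF of 8 and
   peeling for gaps give a threshold of 3 + 8 + 1 = 12 before the cost-model
   threshold below is taken into account.  */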
3105
3106 /* Use the same condition as vect_transform_loop to decide when to use
3107 the cost to determine a versioning threshold. */
3108 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3109 && ordered_p (th, niters_th))
3110 niters_th = ordered_max (poly_uint64 (th), niters_th);
3111
3112 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3113 }
3114
3115 gcc_assert (known_eq (vectorization_factor,
3116 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3117
3118 slp_done_for_suggested_uf = slp;
3119
3120 /* Ok to vectorize! */
3121 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3122 return opt_result::success ();
3123
3124 again:
3125 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3126 gcc_assert (!ok);
3127
3128 /* Try again with SLP forced off, but if we didn't do any SLP there is
3129 no point in re-trying. */
3130 if (!slp)
3131 return ok;
3132
3133 /* If the SLP decision was true when the suggested unroll factor was
3134 worked out, and we are now applying that unroll factor, we don't need
3135 to re-try any more. */
3136 if (applying_suggested_uf && slp_done_for_suggested_uf)
3137 return ok;
3138
3139 /* If there are reduction chains re-trying will fail anyway. */
3140 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3141 return ok;
3142
3143 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3144 via interleaving or lane instructions. */
3145 slp_instance instance;
3146 slp_tree node;
3147 unsigned i, j;
3148 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3149 {
3150 stmt_vec_info vinfo;
3151 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3152 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3153 continue;
3154 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3155 unsigned int size = DR_GROUP_SIZE (vinfo);
3156 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3157 if (! vect_store_lanes_supported (vectype, size, false)
3158 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3159 && ! vect_grouped_store_supported (vectype, size))
3160 return opt_result::failure_at (vinfo->stmt,
3161 "unsupported grouped store\n");
3162 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3163 {
3164 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3165 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3166 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3167 size = DR_GROUP_SIZE (vinfo);
3168 vectype = STMT_VINFO_VECTYPE (vinfo);
3169 if (! vect_load_lanes_supported (vectype, size, false)
3170 && ! vect_grouped_load_supported (vectype, single_element_p,
3171 size))
3172 return opt_result::failure_at (vinfo->stmt,
3173 "unsupported grouped load\n");
3174 }
3175 }
3176
3177 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_NOTE, vect_location,
3179 "re-trying with SLP disabled\n");
3180
3181 /* Roll back state appropriately. No SLP this time. */
3182 slp = false;
3183 /* Restore the vectorization factor as it was without SLP. */
3184 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3185 /* Free the SLP instances. */
3186 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3187 vect_free_slp_instance (instance);
3188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3189 /* Reset SLP type to loop_vect on all stmts. */
3190 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3191 {
3192 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3193 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3194 !gsi_end_p (si); gsi_next (&si))
3195 {
3196 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3197 STMT_SLP_TYPE (stmt_info) = loop_vect;
3198 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3199 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3200 {
3201 /* vectorizable_reduction adjusts reduction stmt def-types;
3202 restore them to that of the PHI. */
3203 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3204 = STMT_VINFO_DEF_TYPE (stmt_info);
3205 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3206 (STMT_VINFO_REDUC_DEF (stmt_info)))
3207 = STMT_VINFO_DEF_TYPE (stmt_info);
3208 }
3209 }
3210 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3211 !gsi_end_p (si); gsi_next (&si))
3212 {
3213 if (is_gimple_debug (gsi_stmt (si)))
3214 continue;
3215 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3216 STMT_SLP_TYPE (stmt_info) = loop_vect;
3217 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3218 {
3219 stmt_vec_info pattern_stmt_info
3220 = STMT_VINFO_RELATED_STMT (stmt_info);
3221 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3222 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3223
3224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3225 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3226 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3227 !gsi_end_p (pi); gsi_next (&pi))
3228 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3229 = loop_vect;
3230 }
3231 }
3232 }
3233 /* Free optimized alias test DDRS. */
3234 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3235 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3236 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3237 /* Reset target cost data. */
3238 delete loop_vinfo->vector_costs;
3239 loop_vinfo->vector_costs = nullptr;
3240 /* Reset accumulated rgroup information. */
3241 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3242 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3243 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3244 /* Reset assorted flags. */
3245 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3246 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3247 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3248 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3249 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3250 = saved_can_use_partial_vectors_p;
3251
3252 goto start_over;
3253 }
3254
3255 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3256 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3257 OLD_LOOP_VINFO is better unless something specifically indicates
3258 otherwise.
3259
3260 Note that this deliberately isn't a partial order. */
3261
3262 static bool
3263 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3264 loop_vec_info old_loop_vinfo)
3265 {
3266 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3267 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3268
3269 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3270 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3271
3272 /* Always prefer a VF of loop->simdlen over any other VF. */
3273 if (loop->simdlen)
3274 {
3275 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3276 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3277 if (new_simdlen_p != old_simdlen_p)
3278 return new_simdlen_p;
3279 }
3280
3281 const auto *old_costs = old_loop_vinfo->vector_costs;
3282 const auto *new_costs = new_loop_vinfo->vector_costs;
3283 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3284 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3285
3286 return new_costs->better_main_loop_than_p (old_costs);
3287 }
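
/* Illustrative sketch (not part of the vectorizer): with

     #pragma omp simd simdlen(8)
     for (int i = 0; i < n; i++)
       a[i] += b[i];

   a candidate loop_vinfo whose VF is known to equal 8 is preferred over
   one with VF 4 regardless of cost, because the simdlen check above runs
   before any cost comparison.  Only when neither or both candidates match
   simdlen do the target's cost hooks decide.  */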
3288
3289 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3290 true if we should. */
3291
3292 static bool
3293 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3294 loop_vec_info old_loop_vinfo)
3295 {
3296 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3297 return false;
3298
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "***** Preferring vector mode %s to vector mode %s\n",
3302 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3303 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3304 return true;
3305 }
3306
3307 /* Analyze LOOP with VECTOR_MODES[MODE_I], as an epilogue loop if MAIN_LOOP_VINFO
3308 is not NULL. If VECTOR_MODES[MODE_I] is VOIDmode, record the autodetected mode
3309 in AUTODETECTED_VECTOR_MODE and advance MODE_I to the next mode worth analyzing.
3310 Return the loop_vinfo on success and a wrapped null on failure. */
3311
3312 static opt_loop_vec_info
3313 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3314 const vect_loop_form_info *loop_form_info,
3315 loop_vec_info main_loop_vinfo,
3316 const vector_modes &vector_modes, unsigned &mode_i,
3317 machine_mode &autodetected_vector_mode,
3318 bool &fatal)
3319 {
3320 loop_vec_info loop_vinfo
3321 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3322
3323 machine_mode vector_mode = vector_modes[mode_i];
3324 loop_vinfo->vector_mode = vector_mode;
3325 unsigned int suggested_unroll_factor = 1;
3326 bool slp_done_for_suggested_uf = false;
3327
3328 /* Run the main analysis. */
3329 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3330 &suggested_unroll_factor,
3331 slp_done_for_suggested_uf);
3332 if (dump_enabled_p ())
3333 dump_printf_loc (MSG_NOTE, vect_location,
3334 "***** Analysis %s with vector mode %s\n",
3335 res ? "succeeded" : " failed",
3336 GET_MODE_NAME (loop_vinfo->vector_mode));
3337
3338 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3339 {
3340 if (dump_enabled_p ())
3341 dump_printf_loc (MSG_NOTE, vect_location,
3342 "***** Re-trying analysis for unrolling"
3343 " with unroll factor %d and slp %s.\n",
3344 suggested_unroll_factor,
3345 slp_done_for_suggested_uf ? "on" : "off");
3346 loop_vec_info unroll_vinfo
3347 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3348 unroll_vinfo->vector_mode = vector_mode;
3349 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3350 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3351 slp_done_for_suggested_uf);
3352 if (new_res)
3353 {
3354 delete loop_vinfo;
3355 loop_vinfo = unroll_vinfo;
3356 }
3357 else
3358 delete unroll_vinfo;
3359 }
3360
3361 /* Remember the autodetected vector mode. */
3362 if (vector_mode == VOIDmode)
3363 autodetected_vector_mode = loop_vinfo->vector_mode;
3364
3365 /* Advance mode_i, first skipping modes that would yield the
3366 same analysis result. */
3367 while (mode_i + 1 < vector_modes.length ()
3368 && vect_chooses_same_modes_p (loop_vinfo,
3369 vector_modes[mode_i + 1]))
3370 {
3371 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_NOTE, vect_location,
3373 "***** The result for vector mode %s would"
3374 " be the same\n",
3375 GET_MODE_NAME (vector_modes[mode_i + 1]));
3376 mode_i += 1;
3377 }
3378 if (mode_i + 1 < vector_modes.length ()
3379 && VECTOR_MODE_P (autodetected_vector_mode)
3380 && (related_vector_mode (vector_modes[mode_i + 1],
3381 GET_MODE_INNER (autodetected_vector_mode))
3382 == autodetected_vector_mode)
3383 && (related_vector_mode (autodetected_vector_mode,
3384 GET_MODE_INNER (vector_modes[mode_i + 1]))
3385 == vector_modes[mode_i + 1]))
3386 {
3387 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location,
3389 "***** Skipping vector mode %s, which would"
3390 " repeat the analysis for %s\n",
3391 GET_MODE_NAME (vector_modes[mode_i + 1]),
3392 GET_MODE_NAME (autodetected_vector_mode));
3393 mode_i += 1;
3394 }
3395 mode_i++;
3396
3397 if (!res)
3398 {
3399 delete loop_vinfo;
3400 if (fatal)
3401 gcc_checking_assert (main_loop_vinfo == NULL);
3402 return opt_loop_vec_info::propagate_failure (res);
3403 }
3404
3405 return opt_loop_vec_info::success (loop_vinfo);
3406 }
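
/* A hedged example of the mode-skipping logic above, assuming a target
   with 128-bit vectors: if the autodetected mode is V4SI and the next
   candidate is V16QI, then

     related_vector_mode (V16QI, SImode) == V4SI
     related_vector_mode (V4SI, QImode)  == V16QI

   so the two modes are mutually related and analyzing V16QI would only
   repeat the V4SI analysis; mode_i is advanced past it.  The exact mode
   names depend on the target and are used here for illustration only.  */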
3407
3408 /* Function vect_analyze_loop.
3409
3410 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3411 for it. The different analyses will record information in the
3412 loop_vec_info struct. */
3413 opt_loop_vec_info
3414 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3415 {
3416 DUMP_VECT_SCOPE ("analyze_loop_nest");
3417
3418 if (loop_outer (loop)
3419 && loop_vec_info_for_loop (loop_outer (loop))
3420 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3421 return opt_loop_vec_info::failure_at (vect_location,
3422 "outer-loop already vectorized.\n");
3423
3424 if (!find_loop_nest (loop, &shared->loop_nest))
3425 return opt_loop_vec_info::failure_at
3426 (vect_location,
3427 "not vectorized: loop nest containing two or more consecutive inner"
3428 " loops cannot be vectorized\n");
3429
3430 /* Analyze the loop form. */
3431 vect_loop_form_info loop_form_info;
3432 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3433 if (!res)
3434 {
3435 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3437 "bad loop form.\n");
3438 return opt_loop_vec_info::propagate_failure (res);
3439 }
3440 if (!integer_onep (loop_form_info.assumptions))
3441 {
3442 /* We consider vectorizing this loop by versioning it under
3443 some assumptions. In order to do this, we need to clear
3444 existing information computed by the scev and niter analyzers. */
3445 scev_reset_htab ();
3446 free_numbers_of_iterations_estimates (loop);
3447 /* Also set a flag for this loop so that subsequent scev and niter
3448 analyses are done under the assumptions. */
3449 loop_constraint_set (loop, LOOP_C_FINITE);
3450 }
3451
3452 auto_vector_modes vector_modes;
3453 /* Autodetect first vector size we try. */
3454 vector_modes.safe_push (VOIDmode);
3455 unsigned int autovec_flags
3456 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3457 loop->simdlen != 0);
3458 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3459 && !unlimited_cost_model (loop));
3460 machine_mode autodetected_vector_mode = VOIDmode;
3461 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3462 unsigned int mode_i = 0;
3463 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3464
3465 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3466 a mode has not been analyzed. */
3467 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3468 for (unsigned i = 0; i < vector_modes.length (); ++i)
3469 cached_vf_per_mode.safe_push (0);
3470
3471 /* First determine the main loop vectorization mode, either the first
3472 one that works, starting with auto-detecting the vector mode and then
3473 following the target's order of preference, or the one with the
3474 lowest cost if pick_lowest_cost_p. */
3475 while (1)
3476 {
3477 bool fatal;
3478 unsigned int last_mode_i = mode_i;
3479 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3480 failed. */
3481 cached_vf_per_mode[last_mode_i] = -1;
3482 opt_loop_vec_info loop_vinfo
3483 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3484 NULL, vector_modes, mode_i,
3485 autodetected_vector_mode, fatal);
3486 if (fatal)
3487 break;
3488
3489 if (loop_vinfo)
3490 {
3491 /* Analysis has been successful, so update the VF value. The
3492 VF should always be a multiple of the unroll factor and we want to
3493 capture the original VF here. */
3494 cached_vf_per_mode[last_mode_i]
3495 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3496 loop_vinfo->suggested_unroll_factor);
3497 /* Once we hit the desired simdlen for the first time,
3498 discard any previous attempts. */
3499 if (simdlen
3500 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3501 {
3502 delete first_loop_vinfo;
3503 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3504 simdlen = 0;
3505 }
3506 else if (pick_lowest_cost_p
3507 && first_loop_vinfo
3508 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3509 {
3510 /* Pick loop_vinfo over first_loop_vinfo. */
3511 delete first_loop_vinfo;
3512 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3513 }
3514 if (first_loop_vinfo == NULL)
3515 first_loop_vinfo = loop_vinfo;
3516 else
3517 {
3518 delete loop_vinfo;
3519 loop_vinfo = opt_loop_vec_info::success (NULL);
3520 }
3521
3522 /* Commit to first_loop_vinfo if we have no reason to try
3523 alternatives. */
3524 if (!simdlen && !pick_lowest_cost_p)
3525 break;
3526 }
3527 if (mode_i == vector_modes.length ()
3528 || autodetected_vector_mode == VOIDmode)
3529 break;
3530
3531 /* Try the next biggest vector size. */
3532 if (dump_enabled_p ())
3533 dump_printf_loc (MSG_NOTE, vect_location,
3534 "***** Re-trying analysis with vector mode %s\n",
3535 GET_MODE_NAME (vector_modes[mode_i]));
3536 }
3537 if (!first_loop_vinfo)
3538 return opt_loop_vec_info::propagate_failure (res);
3539
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "***** Choosing vector mode %s\n",
3543 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3544
3545 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3546 enabled, SIMDUID is not set, it is the innermost loop and we have
3547 either already found the loop's SIMDLEN or there was no SIMDLEN to
3548 begin with.
3549 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3550 bool vect_epilogues = (!simdlen
3551 && loop->inner == NULL
3552 && param_vect_epilogues_nomask
3553 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3554 && !loop->simduid);
3555 if (!vect_epilogues)
3556 return first_loop_vinfo;
3557
3558 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3559 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3560
3561 /* For epilogues start the analysis from the first mode. The motivation
3562 behind starting from the beginning comes from cases where the VECTOR_MODES
3563 array may contain length-agnostic and length-specific modes. Their
3564 ordering is not guaranteed, so we could end up picking a mode for the main
3565 loop that is after the epilogue's optimal mode. */
3566 vector_modes[0] = autodetected_vector_mode;
3567 mode_i = 0;
3568
3569 bool supports_partial_vectors =
3570 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3571 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3572
3573 while (1)
3574 {
3575 /* If the target does not support partial vectors we can shorten the
3576 number of modes to analyze for the epilogue as we know we can't pick a
3577 mode that would lead to a VF at least as big as the
3578 FIRST_VINFO_VF. */
3579 if (!supports_partial_vectors
3580 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3581 {
3582 mode_i++;
3583 if (mode_i == vector_modes.length ())
3584 break;
3585 continue;
3586 }
3587
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_NOTE, vect_location,
3590 "***** Re-trying epilogue analysis with vector "
3591 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3592
3593 bool fatal;
3594 opt_loop_vec_info loop_vinfo
3595 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3596 first_loop_vinfo,
3597 vector_modes, mode_i,
3598 autodetected_vector_mode, fatal);
3599 if (fatal)
3600 break;
3601
3602 if (loop_vinfo)
3603 {
3604 if (pick_lowest_cost_p)
3605 {
3606 /* Keep trying to roll back vectorization attempts while the
3607 loop_vec_infos they produced were worse than this one. */
3608 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3609 while (!vinfos.is_empty ()
3610 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3611 {
3612 gcc_assert (vect_epilogues);
3613 delete vinfos.pop ();
3614 }
3615 }
3616 /* For now only allow one epilogue loop. */
3617 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3618 {
3619 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3620 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3621 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3622 || maybe_ne (lowest_th, 0U));
3623 /* Keep track of the known smallest versioning
3624 threshold. */
3625 if (ordered_p (lowest_th, th))
3626 lowest_th = ordered_min (lowest_th, th);
3627 }
3628 else
3629 {
3630 delete loop_vinfo;
3631 loop_vinfo = opt_loop_vec_info::success (NULL);
3632 }
3633
3634 /* For now only allow one epilogue loop, but allow
3635 pick_lowest_cost_p to replace it, so commit to the
3636 first epilogue if we have no reason to try alternatives. */
3637 if (!pick_lowest_cost_p)
3638 break;
3639 }
3640
3641 if (mode_i == vector_modes.length ())
3642 break;
3643
3644 }
3645
3646 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3647 {
3648 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3649 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_NOTE, vect_location,
3651 "***** Choosing epilogue vector mode %s\n",
3652 GET_MODE_NAME
3653 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3654 }
3655
3656 return first_loop_vinfo;
3657 }
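
/* Rough illustration of the flow above (details are target-dependent):
   the main loop might be vectorized with, say, a 256-bit mode giving
   VF = 8 for int elements, after which the epilogue analysis restarts
   from the autodetected mode and may settle on a 128-bit mode with
   VF = 4 to handle the remaining iterations.  The epilogue loop_vinfo is
   recorded in first_loop_vinfo->epilogue_vinfos and the smallest known
   versioning threshold is kept in LOOP_VINFO_VERSIONING_THRESHOLD.  */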
3658
3659 /* Return true if there is an in-order reduction function for CODE, storing
3660 it in *REDUC_FN if so. */
3661
3662 static bool
3663 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3664 {
3665 if (code == PLUS_EXPR)
3666 {
3667 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3668 return true;
3669 }
3670 return false;
3671 }
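
/* For illustration only: an in-order (fold-left) reduction of

     double s = init;
     for (int i = 0; i < n; i++)
       s += a[i];

   must accumulate the vector lanes strictly left to right, e.g. for a
   4-lane vector v the result is (((s + v[0]) + v[1]) + v[2]) + v[3],
   which is what IFN_FOLD_LEFT_PLUS provides.  This preserves the scalar
   FP semantics when reassociation is not allowed.  */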
3672
3673 /* Function reduction_fn_for_scalar_code
3674
3675 Input:
3676 CODE - tree_code of a reduction operation.
3677
3678 Output:
3679 REDUC_FN - the corresponding internal function to be used to reduce the
3680 vector of partial results into a single scalar result, or IFN_LAST
3681 if the operation is a supported reduction operation, but does not have
3682 such an internal function.
3683
3684 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3685
3686 bool
3687 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3688 {
3689 if (code.is_tree_code ())
3690 switch (tree_code (code))
3691 {
3692 case MAX_EXPR:
3693 *reduc_fn = IFN_REDUC_MAX;
3694 return true;
3695
3696 case MIN_EXPR:
3697 *reduc_fn = IFN_REDUC_MIN;
3698 return true;
3699
3700 case PLUS_EXPR:
3701 *reduc_fn = IFN_REDUC_PLUS;
3702 return true;
3703
3704 case BIT_AND_EXPR:
3705 *reduc_fn = IFN_REDUC_AND;
3706 return true;
3707
3708 case BIT_IOR_EXPR:
3709 *reduc_fn = IFN_REDUC_IOR;
3710 return true;
3711
3712 case BIT_XOR_EXPR:
3713 *reduc_fn = IFN_REDUC_XOR;
3714 return true;
3715
3716 case MULT_EXPR:
3717 case MINUS_EXPR:
3718 *reduc_fn = IFN_LAST;
3719 return true;
3720
3721 default:
3722 return false;
3723 }
3724 else
3725 switch (combined_fn (code))
3726 {
3727 CASE_CFN_FMAX:
3728 *reduc_fn = IFN_REDUC_FMAX;
3729 return true;
3730
3731 CASE_CFN_FMIN:
3732 *reduc_fn = IFN_REDUC_FMIN;
3733 return true;
3734
3735 default:
3736 return false;
3737 }
3738 }
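
/* Sketch of how the mapping above is used (simplified): for

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   the loop body adds into a vector accumulator and the epilogue collapses
   it with IFN_REDUC_PLUS, conceptually s = REDUC_PLUS (vacc).  For codes
   such as MULT_EXPR the function returns true with IFN_LAST, meaning the
   reduction is supported but the epilogue has to fall back to a generic
   reduction sequence (e.g. repeated vector shifts and operations) instead
   of a single internal function.  */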
3739
3740 /* If there is a neutral value X such that a reduction would not be affected
3741 by the introduction of additional X elements, return that X, otherwise
3742 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3743 of the scalar elements. If the reduction has just a single initial value
3744 then INITIAL_VALUE is that value, otherwise it is null. */
3745
3746 tree
3747 neutral_op_for_reduction (tree scalar_type, code_helper code,
3748 tree initial_value)
3749 {
3750 if (code.is_tree_code ())
3751 switch (tree_code (code))
3752 {
3753 case WIDEN_SUM_EXPR:
3754 case DOT_PROD_EXPR:
3755 case SAD_EXPR:
3756 case PLUS_EXPR:
3757 case MINUS_EXPR:
3758 case BIT_IOR_EXPR:
3759 case BIT_XOR_EXPR:
3760 return build_zero_cst (scalar_type);
3761
3762 case MULT_EXPR:
3763 return build_one_cst (scalar_type);
3764
3765 case BIT_AND_EXPR:
3766 return build_all_ones_cst (scalar_type);
3767
3768 case MAX_EXPR:
3769 case MIN_EXPR:
3770 return initial_value;
3771
3772 default:
3773 return NULL_TREE;
3774 }
3775 else
3776 switch (combined_fn (code))
3777 {
3778 CASE_CFN_FMIN:
3779 CASE_CFN_FMAX:
3780 return initial_value;
3781
3782 default:
3783 return NULL_TREE;
3784 }
3785 }
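
/* Illustrative values returned above: PLUS/MINUS/IOR/XOR -> 0,
   MULT -> 1, AND -> all-ones, MIN/MAX -> the initial value (if any).
   The neutral value is what inactive or padding lanes are filled with so
   that, e.g., reducing {x0, x1, 0, 0} with PLUS still yields x0 + x1.  */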
3786
3787 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3788 STMT is printed with a message MSG. */
3789
3790 static void
3791 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3792 {
3793 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3794 }
3795
3796 /* Return true if we need an in-order reduction for operation CODE
3797 on type TYPE, i.e. if the operations cannot be reassociated
3798 without changing the result. */
3799
3800 bool
3801 needs_fold_left_reduction_p (tree type, code_helper code)
3802 {
3803 /* CHECKME: check for !flag_finite_math_only too? */
3804 if (SCALAR_FLOAT_TYPE_P (type))
3805 {
3806 if (code.is_tree_code ())
3807 switch (tree_code (code))
3808 {
3809 case MIN_EXPR:
3810 case MAX_EXPR:
3811 return false;
3812
3813 default:
3814 return !flag_associative_math;
3815 }
3816 else
3817 switch (combined_fn (code))
3818 {
3819 CASE_CFN_FMIN:
3820 CASE_CFN_FMAX:
3821 return false;
3822
3823 default:
3824 return !flag_associative_math;
3825 }
3826 }
3827
3828 if (INTEGRAL_TYPE_P (type))
3829 return (!code.is_tree_code ()
3830 || !operation_no_trapping_overflow (type, tree_code (code)));
3831
3832 if (SAT_FIXED_POINT_TYPE_P (type))
3833 return true;
3834
3835 return false;
3836 }
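
/* Example of why this matters (illustration only): with doubles,
   (1e16 + 1.0) + -1e16 == 0.0 while (1e16 + -1e16) + 1.0 == 1.0, so a
   floating-point PLUS reduction cannot be reassociated across vector
   lanes unless -fassociative-math is in effect; it must stay in order.  */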
3837
3838 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3839 has a handled computation expression. Store the main reduction
3840 operation in *CODE. */
3841
3842 static bool
3843 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3844 tree loop_arg, code_helper *code,
3845 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3846 {
3847 auto_bitmap visited;
3848 tree lookfor = PHI_RESULT (phi);
3849 ssa_op_iter curri;
3850 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3851 while (USE_FROM_PTR (curr) != loop_arg)
3852 curr = op_iter_next_use (&curri);
3853 curri.i = curri.numops;
3854 do
3855 {
3856 path.safe_push (std::make_pair (curri, curr));
3857 tree use = USE_FROM_PTR (curr);
3858 if (use == lookfor)
3859 break;
3860 gimple *def = SSA_NAME_DEF_STMT (use);
3861 if (gimple_nop_p (def)
3862 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3863 {
3864 pop:
3865 do
3866 {
3867 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3868 curri = x.first;
3869 curr = x.second;
3870 do
3871 curr = op_iter_next_use (&curri);
3872 /* Skip already visited or non-SSA operands (from iterating
3873 over PHI args). */
3874 while (curr != NULL_USE_OPERAND_P
3875 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3876 || ! bitmap_set_bit (visited,
3877 SSA_NAME_VERSION
3878 (USE_FROM_PTR (curr)))));
3879 }
3880 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3881 if (curr == NULL_USE_OPERAND_P)
3882 break;
3883 }
3884 else
3885 {
3886 if (gimple_code (def) == GIMPLE_PHI)
3887 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3888 else
3889 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3890 while (curr != NULL_USE_OPERAND_P
3891 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3892 || ! bitmap_set_bit (visited,
3893 SSA_NAME_VERSION
3894 (USE_FROM_PTR (curr)))))
3895 curr = op_iter_next_use (&curri);
3896 if (curr == NULL_USE_OPERAND_P)
3897 goto pop;
3898 }
3899 }
3900 while (1);
3901 if (dump_file && (dump_flags & TDF_DETAILS))
3902 {
3903 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3904 unsigned i;
3905 std::pair<ssa_op_iter, use_operand_p> *x;
3906 FOR_EACH_VEC_ELT (path, i, x)
3907 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3908 dump_printf (MSG_NOTE, "\n");
3909 }
3910
3911 /* Check whether the reduction path detected is valid. */
3912 bool fail = path.length () == 0;
3913 bool neg = false;
3914 int sign = -1;
3915 *code = ERROR_MARK;
3916 for (unsigned i = 1; i < path.length (); ++i)
3917 {
3918 gimple *use_stmt = USE_STMT (path[i].second);
3919 gimple_match_op op;
3920 if (!gimple_extract_op (use_stmt, &op))
3921 {
3922 fail = true;
3923 break;
3924 }
3925 unsigned int opi = op.num_ops;
3926 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3927 {
3928 /* The following makes sure we can compute the operand index
3929 easily, plus it mostly disallows chaining via COND_EXPR condition
3930 operands. */
3931 for (opi = 0; opi < op.num_ops; ++opi)
3932 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3933 break;
3934 }
3935 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3936 {
3937 for (opi = 0; opi < op.num_ops; ++opi)
3938 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3939 break;
3940 }
3941 if (opi == op.num_ops)
3942 {
3943 fail = true;
3944 break;
3945 }
3946 op.code = canonicalize_code (op.code, op.type);
3947 if (op.code == MINUS_EXPR)
3948 {
3949 op.code = PLUS_EXPR;
3950 /* Track whether we negate the reduction value each iteration. */
3951 if (op.ops[1] == op.ops[opi])
3952 neg = ! neg;
3953 }
3954 if (CONVERT_EXPR_CODE_P (op.code)
3955 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3956 ;
3957 else if (*code == ERROR_MARK)
3958 {
3959 *code = op.code;
3960 sign = TYPE_SIGN (op.type);
3961 }
3962 else if (op.code != *code)
3963 {
3964 fail = true;
3965 break;
3966 }
3967 else if ((op.code == MIN_EXPR
3968 || op.code == MAX_EXPR)
3969 && sign != TYPE_SIGN (op.type))
3970 {
3971 fail = true;
3972 break;
3973 }
3974 /* Check that the op is used on only a single stmt. For the
3975 non-value-changing tail and the last stmt, allow out-of-loop uses.
3976 ??? We could relax this and handle arbitrary live stmts by
3977 forcing a scalar epilogue for example. */
3978 imm_use_iterator imm_iter;
3979 gimple *op_use_stmt;
3980 unsigned cnt = 0;
3981 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3982 if (!is_gimple_debug (op_use_stmt)
3983 && (*code != ERROR_MARK
3984 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3985 {
3986 /* We want to allow x + x but not x < 1 ? x : 2. */
3987 if (is_gimple_assign (op_use_stmt)
3988 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3989 {
3990 use_operand_p use_p;
3991 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3992 cnt++;
3993 }
3994 else
3995 cnt++;
3996 }
3997 if (cnt != 1)
3998 {
3999 fail = true;
4000 break;
4001 }
4002 }
4003 return ! fail && ! neg && *code != ERROR_MARK;
4004 }
4005
4006 bool
4007 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4008 tree loop_arg, enum tree_code code)
4009 {
4010 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4011 code_helper code_;
4012 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4013 && code_ == code);
4014 }
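
/* A small worked example of a reduction path (for illustration): given

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     t_2 = ...;
     s_3 = s_1 + t_2;

   starting from the latch value s_3 the walk above reaches the PHI result
   s_1 in one step, the path covers { s_3, s_1 }, every statement on it
   uses PLUS_EXPR, so *CODE is set to PLUS_EXPR and the function returns
   true.  A path mixing, say, PLUS_EXPR and MAX_EXPR would fail instead.  */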
4015
4016
4017
4018 /* Function vect_is_simple_reduction
4019
4020 (1) Detect a cross-iteration def-use cycle that represents a simple
4021 reduction computation. We look for the following pattern:
4022
4023 loop_header:
4024 a1 = phi < a0, a2 >
4025 a3 = ...
4026 a2 = operation (a3, a1)
4027
4028 or
4029
4030 a3 = ...
4031 loop_header:
4032 a1 = phi < a0, a2 >
4033 a2 = operation (a3, a1)
4034
4035 such that:
4036 1. operation is commutative and associative and it is safe to
4037 change the order of the computation
4038 2. no uses for a2 in the loop (a2 is used out of the loop)
4039 3. no uses of a1 in the loop besides the reduction operation
4040 4. no uses of a1 outside the loop.
4041
4042 Conditions 1,4 are tested here.
4043 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4044
4045 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4046 nested cycles.
4047
4048 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4049 reductions:
4050
4051 a1 = phi < a0, a2 >
4052 inner loop (def of a3)
4053 a2 = phi < a3 >
4054
4055 (4) Detect condition expressions, i.e.:
4056 for (int i = 0; i < N; i++)
4057 if (a[i] < val)
4058 ret_val = a[i];
4059
4060 */
4061
4062 static stmt_vec_info
4063 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4064 bool *double_reduc, bool *reduc_chain_p, bool slp)
4065 {
4066 gphi *phi = as_a <gphi *> (phi_info->stmt);
4067 gimple *phi_use_stmt = NULL;
4068 imm_use_iterator imm_iter;
4069 use_operand_p use_p;
4070
4071 *double_reduc = false;
4072 *reduc_chain_p = false;
4073 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4074
4075 tree phi_name = PHI_RESULT (phi);
4076 /* ??? If there are no uses of the PHI result the inner loop reduction
4077 won't be detected as possibly double-reduction by vectorizable_reduction
4078 because that tries to walk the PHI arg from the preheader edge which
4079 can be constant. See PR60382. */
4080 if (has_zero_uses (phi_name))
4081 return NULL;
4082 class loop *loop = (gimple_bb (phi))->loop_father;
4083 unsigned nphi_def_loop_uses = 0;
4084 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4085 {
4086 gimple *use_stmt = USE_STMT (use_p);
4087 if (is_gimple_debug (use_stmt))
4088 continue;
4089
4090 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4091 {
4092 if (dump_enabled_p ())
4093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4094 "intermediate value used outside loop.\n");
4095
4096 return NULL;
4097 }
4098
4099 nphi_def_loop_uses++;
4100 phi_use_stmt = use_stmt;
4101 }
4102
4103 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4104 if (TREE_CODE (latch_def) != SSA_NAME)
4105 {
4106 if (dump_enabled_p ())
4107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4108 "reduction: not ssa_name: %T\n", latch_def);
4109 return NULL;
4110 }
4111
4112 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4113 if (!def_stmt_info
4114 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4115 return NULL;
4116
4117 bool nested_in_vect_loop
4118 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4119 unsigned nlatch_def_loop_uses = 0;
4120 auto_vec<gphi *, 3> lcphis;
4121 bool inner_loop_of_double_reduc = false;
4122 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4123 {
4124 gimple *use_stmt = USE_STMT (use_p);
4125 if (is_gimple_debug (use_stmt))
4126 continue;
4127 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4128 nlatch_def_loop_uses++;
4129 else
4130 {
4131 /* We can have more than one loop-closed PHI. */
4132 lcphis.safe_push (as_a <gphi *> (use_stmt));
4133 if (nested_in_vect_loop
4134 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4135 == vect_double_reduction_def))
4136 inner_loop_of_double_reduc = true;
4137 }
4138 }
4139
4140 /* If we are vectorizing an inner reduction, we execute it in the
4141 original order only when we are not dealing with a
4142 double reduction. */
4143 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4144 {
4145 if (dump_enabled_p ())
4146 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4147 "detected nested cycle: ");
4148 return def_stmt_info;
4149 }
4150
4151 /* When the inner loop of a double reduction ends up with more than
4152 one loop-closed PHI we have failed to classify alternate such
4153 PHIs as double reduction, leading to wrong code. See PR103237. */
4154 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4155 {
4156 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4158 "unhandle double reduction\n");
4159 return NULL;
4160 }
4161
4162 /* If this isn't a nested cycle or if the nested cycle reduction value
4163 is used outside of the inner loop we cannot handle uses of the reduction
4164 value. */
4165 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4166 {
4167 if (dump_enabled_p ())
4168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4169 "reduction used in loop.\n");
4170 return NULL;
4171 }
4172
4173 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4174 defined in the inner loop. */
4175 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4176 {
4177 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4178 if (gimple_phi_num_args (def_stmt) != 1
4179 || TREE_CODE (op1) != SSA_NAME)
4180 {
4181 if (dump_enabled_p ())
4182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4183 "unsupported phi node definition.\n");
4184
4185 return NULL;
4186 }
4187
4188 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4189 and the latch definition op1. */
4190 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4191 if (gimple_bb (def1)
4192 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4193 && loop->inner
4194 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4195 && (is_gimple_assign (def1) || is_gimple_call (def1))
4196 && is_a <gphi *> (phi_use_stmt)
4197 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4198 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4199 loop_latch_edge (loop->inner))))
4200 {
4201 if (dump_enabled_p ())
4202 report_vect_op (MSG_NOTE, def_stmt,
4203 "detected double reduction: ");
4204
4205 *double_reduc = true;
4206 return def_stmt_info;
4207 }
4208
4209 return NULL;
4210 }
4211
4212 /* Look for the expression computing latch_def from the loop PHI result. */
4213 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4214 code_helper code;
4215 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4216 path))
4217 {
4218 STMT_VINFO_REDUC_CODE (phi_info) = code;
4219 if (code == COND_EXPR && !nested_in_vect_loop)
4220 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4221
4222 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4223 reduction chain for which the additional restriction is that
4224 all operations in the chain are the same. */
4225 auto_vec<stmt_vec_info, 8> reduc_chain;
4226 unsigned i;
4227 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4228 for (i = path.length () - 1; i >= 1; --i)
4229 {
4230 gimple *stmt = USE_STMT (path[i].second);
4231 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4232 gimple_match_op op;
4233 if (!gimple_extract_op (stmt, &op))
4234 gcc_unreachable ();
4235 if (gassign *assign = dyn_cast<gassign *> (stmt))
4236 STMT_VINFO_REDUC_IDX (stmt_info)
4237 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4238 else
4239 {
4240 gcall *call = as_a<gcall *> (stmt);
4241 STMT_VINFO_REDUC_IDX (stmt_info)
4242 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4243 }
4244 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4245 && (i == 1 || i == path.length () - 1));
4246 if ((op.code != code && !leading_conversion)
4247 /* We can only handle the final value in epilogue
4248 generation for reduction chains. */
4249 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4250 is_slp_reduc = false;
4251 /* For reduction chains we support trailing/leading
4252 conversions. We do not store those in the actual chain. */
4253 if (leading_conversion)
4254 continue;
4255 reduc_chain.safe_push (stmt_info);
4256 }
4257 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4258 {
4259 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4260 {
4261 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4262 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4263 }
4264 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4265 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4266
4267 /* Save the chain for further analysis in SLP detection. */
4268 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4269 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4270
4271 *reduc_chain_p = true;
4272 if (dump_enabled_p ())
4273 dump_printf_loc (MSG_NOTE, vect_location,
4274 "reduction: detected reduction chain\n");
4275 }
4276 else if (dump_enabled_p ())
4277 dump_printf_loc (MSG_NOTE, vect_location,
4278 "reduction: detected reduction\n");
4279
4280 return def_stmt_info;
4281 }
4282
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_NOTE, vect_location,
4285 "reduction: unknown pattern\n");
4286
4287 return NULL;
4288 }
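
/* Sketch of a reduction chain as detected above (illustration only):

     for (int i = 0; i < n; i++)
       {
         s += a[4*i + 0];
         s += a[4*i + 1];
         s += a[4*i + 2];
         s += a[4*i + 3];
       }

   gives a path of four PLUS_EXPR statements, all with single uses, so
   they are linked through REDUC_GROUP_FIRST/NEXT_ELEMENT with group size
   4 and recorded in LOOP_VINFO_REDUCTION_CHAINS for SLP detection.  */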
4289
4290 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4291 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4292 or -1 if not known. */
4293
4294 static int
4295 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4296 {
4297 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4298 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4299 {
4300 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_NOTE, vect_location,
4302 "cost model: epilogue peel iters set to vf/2 "
4303 "because loop iterations are unknown .\n");
4304 return assumed_vf / 2;
4305 }
4306 else
4307 {
4308 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4309 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4310 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4311 /* If we need to peel for gaps but no epilogue peeling would otherwise
4312 be required, we have to peel VF iterations. */
4313 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4314 peel_iters_epilogue = assumed_vf;
4315 return peel_iters_epilogue;
4316 }
4317 }
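
/* Worked example for the known-NITERS case above (numbers made up):
   with NITERS = 100, an assumed VF of 8 and PEEL_ITERS_PROLOGUE = 3,
   the epilogue peels (100 - 3) % 8 = 1 iteration.  If the remainder had
   been 0 but LOOP_VINFO_PEELING_FOR_GAPS were set, a full VF = 8
   iterations would be peeled instead.  */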
4318
4319 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4320 int
4321 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4322 int *peel_iters_epilogue,
4323 stmt_vector_for_cost *scalar_cost_vec,
4324 stmt_vector_for_cost *prologue_cost_vec,
4325 stmt_vector_for_cost *epilogue_cost_vec)
4326 {
4327 int retval = 0;
4328
4329 *peel_iters_epilogue
4330 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4331
4332 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4333 {
4334 /* If peeled iterations are known but the number of scalar loop
4335 iterations is unknown, count a taken branch per peeled loop. */
4336 if (peel_iters_prologue > 0)
4337 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4338 vect_prologue);
4339 if (*peel_iters_epilogue > 0)
4340 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4341 vect_epilogue);
4342 }
4343
4344 stmt_info_for_cost *si;
4345 int j;
4346 if (peel_iters_prologue)
4347 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4348 retval += record_stmt_cost (prologue_cost_vec,
4349 si->count * peel_iters_prologue,
4350 si->kind, si->stmt_info, si->misalign,
4351 vect_prologue);
4352 if (*peel_iters_epilogue)
4353 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4354 retval += record_stmt_cost (epilogue_cost_vec,
4355 si->count * *peel_iters_epilogue,
4356 si->kind, si->stmt_info, si->misalign,
4357 vect_epilogue);
4358
4359 return retval;
4360 }
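
/* Rough example of the accounting above: if PEEL_ITERS_PROLOGUE is 2 and
   the epilogue works out to 3 iterations, every entry of SCALAR_COST_VEC
   is recorded twice in PROLOGUE_COST_VEC and three times in
   EPILOGUE_COST_VEC; when the scalar iteration count is unknown, one
   cond_branch_taken is additionally charged for each non-empty peel.  */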
4361
4362 /* Function vect_estimate_min_profitable_iters
4363
4364 Return the number of iterations required for the vector version of the
4365 loop to be profitable relative to the cost of the scalar version of the
4366 loop.
4367
4368 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4369 of iterations for vectorization. A value of -1 means loop vectorization
4370 is not profitable. This returned value may be used for a dynamic
4371 profitability check.
4372
4373 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4374 for static check against estimated number of iterations. */
4375
4376 static void
4377 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4378 int *ret_min_profitable_niters,
4379 int *ret_min_profitable_estimate,
4380 unsigned *suggested_unroll_factor)
4381 {
4382 int min_profitable_iters;
4383 int min_profitable_estimate;
4384 int peel_iters_prologue;
4385 int peel_iters_epilogue;
4386 unsigned vec_inside_cost = 0;
4387 int vec_outside_cost = 0;
4388 unsigned vec_prologue_cost = 0;
4389 unsigned vec_epilogue_cost = 0;
4390 int scalar_single_iter_cost = 0;
4391 int scalar_outside_cost = 0;
4392 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4393 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4394 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4395
4396 /* Cost model disabled. */
4397 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4398 {
4399 if (dump_enabled_p ())
4400 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4401 *ret_min_profitable_niters = 0;
4402 *ret_min_profitable_estimate = 0;
4403 return;
4404 }
4405
4406 /* Requires loop versioning tests to handle misalignment. */
4407 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4408 {
4409 /* FIXME: Make cost depend on complexity of individual check. */
4410 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4411 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4412 if (dump_enabled_p ())
4413 dump_printf (MSG_NOTE,
4414 "cost model: Adding cost of checks for loop "
4415 "versioning to treat misalignment.\n");
4416 }
4417
4418 /* Requires loop versioning with alias checks. */
4419 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4420 {
4421 /* FIXME: Make cost depend on complexity of individual check. */
4422 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4423 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4424 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4425 if (len)
4426 /* Count LEN - 1 ANDs and LEN comparisons. */
4427 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4428 scalar_stmt, vect_prologue);
4429 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4430 if (len)
4431 {
4432 /* Count LEN - 1 ANDs and LEN comparisons. */
4433 unsigned int nstmts = len * 2 - 1;
4434 /* +1 for each bias that needs adding. */
4435 for (unsigned int i = 0; i < len; ++i)
4436 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4437 nstmts += 1;
4438 (void) add_stmt_cost (target_cost_data, nstmts,
4439 scalar_stmt, vect_prologue);
4440 }
4441 if (dump_enabled_p ())
4442 dump_printf (MSG_NOTE,
4443 "cost model: Adding cost of checks for loop "
4444 "versioning aliasing.\n");
4445 }
4446
4447 /* Requires loop versioning with niter checks. */
4448 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4449 {
4450 /* FIXME: Make cost depend on complexity of individual check. */
4451 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4452 NULL, NULL, NULL_TREE, 0, vect_prologue);
4453 if (dump_enabled_p ())
4454 dump_printf (MSG_NOTE,
4455 "cost model: Adding cost of checks for loop "
4456 "versioning niters.\n");
4457 }
4458
4459 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4460 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4461 vect_prologue);
4462
4463 /* Count statements in scalar loop. Using this as scalar cost for a single
4464 iteration for now.
4465
4466 TODO: Add outer loop support.
4467
4468 TODO: Consider assigning different costs to different scalar
4469 statements. */
4470
4471 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4472
4473 /* Add additional cost for the peeled instructions in prologue and epilogue
4474 loop. (For fully-masked loops there will be no peeling.)
4475
4476 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4477 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4478
4479 TODO: Build an expression that represents peel_iters for prologue and
4480 epilogue to be used in a run-time test. */
4481
4482 bool prologue_need_br_taken_cost = false;
4483 bool prologue_need_br_not_taken_cost = false;
4484
4485 /* Calculate peel_iters_prologue. */
4486 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4487 peel_iters_prologue = 0;
4488 else if (npeel < 0)
4489 {
4490 peel_iters_prologue = assumed_vf / 2;
4491 if (dump_enabled_p ())
4492 dump_printf (MSG_NOTE, "cost model: "
4493 "prologue peel iters set to vf/2.\n");
4494
4495 /* If peeled iterations are unknown, count a taken branch and a not taken
4496 branch per peeled loop. Even if scalar loop iterations are known,
4497 vector iterations are not known since peeled prologue iterations are
4498 not known. Hence guards remain the same. */
4499 prologue_need_br_taken_cost = true;
4500 prologue_need_br_not_taken_cost = true;
4501 }
4502 else
4503 {
4504 peel_iters_prologue = npeel;
4505 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4506 /* If peeled iterations are known but the number of scalar loop
4507 iterations is unknown, count a taken branch per peeled loop. */
4508 prologue_need_br_taken_cost = true;
4509 }
4510
4511 bool epilogue_need_br_taken_cost = false;
4512 bool epilogue_need_br_not_taken_cost = false;
4513
4514 /* Calculate peel_iters_epilogue. */
4515 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4516 /* We need to peel exactly one iteration for gaps. */
4517 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4518 else if (npeel < 0)
4519 {
4520 /* If peeling for alignment is unknown, the loop bound of the main loop
4521 becomes unknown. */
4522 peel_iters_epilogue = assumed_vf / 2;
4523 if (dump_enabled_p ())
4524 dump_printf (MSG_NOTE, "cost model: "
4525 "epilogue peel iters set to vf/2 because "
4526 "peeling for alignment is unknown.\n");
4527
4528 /* See the same reason above in peel_iters_prologue calculation. */
4529 epilogue_need_br_taken_cost = true;
4530 epilogue_need_br_not_taken_cost = true;
4531 }
4532 else
4533 {
4534 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4535 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4536 /* If peeled iterations are known but the number of scalar loop
4537 iterations is unknown, count a taken branch per peeled loop. */
4538 epilogue_need_br_taken_cost = true;
4539 }
4540
4541 stmt_info_for_cost *si;
4542 int j;
4543 /* Add costs associated with peel_iters_prologue. */
4544 if (peel_iters_prologue)
4545 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4546 {
4547 (void) add_stmt_cost (target_cost_data,
4548 si->count * peel_iters_prologue, si->kind,
4549 si->stmt_info, si->node, si->vectype,
4550 si->misalign, vect_prologue);
4551 }
4552
4553 /* Add costs associated with peel_iters_epilogue. */
4554 if (peel_iters_epilogue)
4555 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4556 {
4557 (void) add_stmt_cost (target_cost_data,
4558 si->count * peel_iters_epilogue, si->kind,
4559 si->stmt_info, si->node, si->vectype,
4560 si->misalign, vect_epilogue);
4561 }
4562
4563 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4564
4565 if (prologue_need_br_taken_cost)
4566 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4567 vect_prologue);
4568
4569 if (prologue_need_br_not_taken_cost)
4570 (void) add_stmt_cost (target_cost_data, 1,
4571 cond_branch_not_taken, vect_prologue);
4572
4573 if (epilogue_need_br_taken_cost)
4574 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4575 vect_epilogue);
4576
4577 if (epilogue_need_br_not_taken_cost)
4578 (void) add_stmt_cost (target_cost_data, 1,
4579 cond_branch_not_taken, vect_epilogue);
4580
4581 /* Take care of special costs for rgroup controls of partial vectors. */
4582 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4583 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4584 == vect_partial_vectors_avx512))
4585 {
4586 /* Calculate how many masks we need to generate. */
4587 unsigned int num_masks = 0;
4588 bool need_saturation = false;
4589 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4590 if (rgm.type)
4591 {
4592 unsigned nvectors = rgm.factor;
4593 num_masks += nvectors;
4594 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4595 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4596 need_saturation = true;
4597 }
4598
4599 /* ??? The target isn't able to identify the costs below as
4600 producing masks so it cannot penalize cases where we'd run
4601 out of mask registers, for example. */
4602
4603 /* ??? We are also failing to account for smaller vector masks
4604 we generate by splitting larger masks in vect_get_loop_mask. */
4605
4606 /* In the worst case, we need to generate each mask in the prologue
4607 and in the loop body. We need one splat per group and one
4608 compare per mask.
4609
4610 Sometimes the prologue mask will fold to a constant,
4611 so the actual prologue cost might be smaller. However, it's
4612 simpler and safer to use the worst-case cost; if this ends up
4613 being the tie-breaker between vectorizing or not, then it's
4614 probably better not to vectorize. */
4615 (void) add_stmt_cost (target_cost_data,
4616 num_masks
4617 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4618 vector_stmt, NULL, NULL, NULL_TREE, 0,
4619 vect_prologue);
4620 (void) add_stmt_cost (target_cost_data,
4621 num_masks
4622 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4623 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4624
4625 /* When we need saturation we need it both in the prologue and
4626 the epilogue. */
4627 if (need_saturation)
4628 {
4629 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4630 NULL, NULL, NULL_TREE, 0, vect_prologue);
4631 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4632 NULL, NULL, NULL_TREE, 0, vect_body);
4633 }
4634 }
4635 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4636 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4637 == vect_partial_vectors_while_ult))
4638 {
4639 /* Calculate how many masks we need to generate. */
4640 unsigned int num_masks = 0;
4641 rgroup_controls *rgm;
4642 unsigned int num_vectors_m1;
4643 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4644 num_vectors_m1, rgm)
4645 if (rgm->type)
4646 num_masks += num_vectors_m1 + 1;
4647 gcc_assert (num_masks > 0);
4648
4649 /* In the worst case, we need to generate each mask in the prologue
4650 and in the loop body. One of the loop body mask instructions
4651 replaces the comparison in the scalar loop, and since we don't
4652 count the scalar comparison against the scalar body, we shouldn't
4653 count that vector instruction against the vector body either.
4654
4655 Sometimes we can use unpacks instead of generating prologue
4656 masks and sometimes the prologue mask will fold to a constant,
4657 so the actual prologue cost might be smaller. However, it's
4658 simpler and safer to use the worst-case cost; if this ends up
4659 being the tie-breaker between vectorizing or not, then it's
4660 probably better not to vectorize. */
4661 (void) add_stmt_cost (target_cost_data, num_masks,
4662 vector_stmt, NULL, NULL, NULL_TREE, 0,
4663 vect_prologue);
4664 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4665 vector_stmt, NULL, NULL, NULL_TREE, 0,
4666 vect_body);
4667 }
4668 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4669 {
4670 /* Referring to the functions vect_set_loop_condition_partial_vectors
4671 and vect_set_loop_controls_directly, we need to generate each
4672 length in the prologue and in the loop body if required. Although
4673 there are some possible optimizations, we consider the worst case
4674 here. */
4675
4676 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4677 signed char partial_load_store_bias
4678 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4679 bool need_iterate_p
4680 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4681 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4682
4683 /* Calculate how many statements to be added. */
4684 unsigned int prologue_stmts = 0;
4685 unsigned int body_stmts = 0;
4686
4687 rgroup_controls *rgc;
4688 unsigned int num_vectors_m1;
4689 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4690 if (rgc->type)
4691 {
4692 /* May need one SHIFT for nitems_total computation. */
4693 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4694 if (nitems != 1 && !niters_known_p)
4695 prologue_stmts += 1;
4696
4697 /* May need one MAX and one MINUS for wrap around. */
4698 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4699 prologue_stmts += 2;
4700
4701 /* Need one MAX and one MINUS for each batch limit except for
4702 the first one. */
4703 prologue_stmts += num_vectors_m1 * 2;
4704
4705 unsigned int num_vectors = num_vectors_m1 + 1;
4706
4707 /* Need to set up lengths in prologue, only one MIN required
4708 for each since start index is zero. */
4709 prologue_stmts += num_vectors;
4710
4711 /* If we have a non-zero partial load bias, we need one PLUS
4712 to adjust the load length. */
4713 if (partial_load_store_bias != 0)
4714 body_stmts += 1;
4715
4716 /* Each may need two MINs and one MINUS to update lengths in body
4717 for next iteration. */
4718 if (need_iterate_p)
4719 body_stmts += 3 * num_vectors;
4720 }
4721
4722 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4723 scalar_stmt, vect_prologue);
4724 (void) add_stmt_cost (target_cost_data, body_stmts,
4725 scalar_stmt, vect_body);
4726 }
4727
4728 /* FORNOW: The scalar outside cost is incremented in one of the
4729 following ways:
4730
4731 1. The vectorizer checks for alignment and aliasing and generates
4732 a condition that allows dynamic vectorization. A cost model
4733 check is ANDED with the versioning condition. Hence scalar code
4734 path now has the added cost of the versioning check.
4735
4736 if (cost > th & versioning_check)
4737 jmp to vector code
4738
4739 Hence run-time scalar is incremented by not-taken branch cost.
4740
4741 2. The vectorizer then checks if a prologue is required. If the
4742 cost model check was not done before during versioning, it has to
4743 be done before the prologue check.
4744
4745 if (cost <= th)
4746 prologue = scalar_iters
4747 if (prologue == 0)
4748 jmp to vector code
4749 else
4750 execute prologue
4751 if (prologue == num_iters)
4752 go to exit
4753
4754 Hence the run-time scalar cost is incremented by a taken branch,
4755 plus a not-taken branch, plus a taken branch cost.
4756
4757 3. The vectorizer then checks if an epilogue is required. If the
4758 cost model check was not done before during prologue check, it
4759 has to be done with the epilogue check.
4760
4761 if (prologue == 0)
4762 jmp to vector code
4763 else
4764 execute prologue
4765 if (prologue == num_iters)
4766 go to exit
4767 vector code:
4768 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4769 jmp to epilogue
4770
4771 Hence the run-time scalar cost should be incremented by 2 taken
4772 branches.
4773
4774 TODO: The back end may reorder the BBs differently and reverse
4775 conditions/branch directions. Change the estimates below to
4776 something more reasonable. */
4777
4778 /* If the number of iterations is known and we do not do versioning, we can
4779 decide whether to vectorize at compile time. Hence the scalar version
4780 does not carry cost model guard costs. */
4781 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4782 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4783 {
4784 /* Cost model check occurs at versioning. */
4785 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4786 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4787 else
4788 {
4789 /* Cost model check occurs at prologue generation. */
4790 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4791 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4792 + vect_get_stmt_cost (cond_branch_not_taken);
4793 /* Cost model check occurs at epilogue generation. */
4794 else
4795 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4796 }
4797 }
4798
4799 /* Complete the target-specific cost calculations. */
4800 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4801 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4802 suggested_unroll_factor);
4803
4804 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4805 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4806 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4807 *suggested_unroll_factor,
4808 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4809 {
4810 if (dump_enabled_p ())
4811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4812 "can't unroll as unrolled vectorization factor larger"
4813 " than maximum vectorization factor: "
4814 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4815 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4816 *suggested_unroll_factor = 1;
4817 }
4818
4819 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4820
4821 if (dump_enabled_p ())
4822 {
4823 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4824 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4825 vec_inside_cost);
4826 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4827 vec_prologue_cost);
4828 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4829 vec_epilogue_cost);
4830 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4831 scalar_single_iter_cost);
4832 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4833 scalar_outside_cost);
4834 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4835 vec_outside_cost);
4836 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4837 peel_iters_prologue);
4838 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4839 peel_iters_epilogue);
4840 }
4841
4842 /* Calculate number of iterations required to make the vector version
4843 profitable, relative to the loop bodies only. The following condition
4844 must hold true:
4845 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4846 where
4847 SIC = scalar iteration cost, VIC = vector iteration cost,
4848 VOC = vector outside cost, VF = vectorization factor,
4849 NPEEL = prologue iterations + epilogue iterations,
4850 SOC = scalar outside cost for run time cost model check. */
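
/* A made-up numeric instance of the condition above: with SIC = 4,
   VIC = 10, VF = 4, NPEEL = 3, VOC = 20 and SOC = 0, each vector
   iteration saves SIC * VF - VIC = 6 units and the outside overhead is
   VOC - SIC * NPEEL - SOC = 8, so roughly 8 / 6 + 1 = 2 vector
   iterations (about 2 * VF + NPEEL = 11 scalar iterations) are needed
   before the vector version wins; the code below performs the
   equivalent computation with the actual costs.  */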
4851
4852 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4853 - vec_inside_cost);
4854 if (saving_per_viter <= 0)
4855 {
4856 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4857 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4858 "vectorization did not happen for a simd loop");
4859
4860 if (dump_enabled_p ())
4861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4862 "cost model: the vector iteration cost = %d "
4863 "divided by the scalar iteration cost = %d "
4864 "is greater or equal to the vectorization factor = %d"
4865 ".\n",
4866 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4867 *ret_min_profitable_niters = -1;
4868 *ret_min_profitable_estimate = -1;
4869 return;
4870 }
4871
4872 /* ??? The "if" arm is written to handle all cases; see below for what
4873 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4874 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4875 {
4876 /* Rewriting the condition above in terms of the number of
4877 vector iterations (vniters) rather than the number of
4878 scalar iterations (niters) gives:
4879
4880 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4881
4882 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4883
4884 For integer N, X and Y when X > 0:
4885
4886 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4887 int outside_overhead = (vec_outside_cost
4888 - scalar_single_iter_cost * peel_iters_prologue
4889 - scalar_single_iter_cost * peel_iters_epilogue
4890 - scalar_outside_cost);
4891 /* We're only interested in cases that require at least one
4892 vector iteration. */
4893 int min_vec_niters = 1;
4894 if (outside_overhead > 0)
4895 min_vec_niters = outside_overhead / saving_per_viter + 1;
4896
4897 if (dump_enabled_p ())
4898 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4899 min_vec_niters);
4900
4901 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4902 {
4903 /* Now that we know the minimum number of vector iterations,
4904 find the minimum niters for which the scalar cost is larger:
4905
4906 SIC * niters > VIC * vniters + VOC - SOC
4907
4908 We know that the minimum niters is no more than
4909 vniters * VF + NPEEL, but it might be (and often is) less
4910 than that if a partial vector iteration is cheaper than the
4911 equivalent scalar code. */
4912 int threshold = (vec_inside_cost * min_vec_niters
4913 + vec_outside_cost
4914 - scalar_outside_cost);
4915 if (threshold <= 0)
4916 min_profitable_iters = 1;
4917 else
4918 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4919 }
4920 else
4921 /* Convert the number of vector iterations into a number of
4922 scalar iterations. */
4923 min_profitable_iters = (min_vec_niters * assumed_vf
4924 + peel_iters_prologue
4925 + peel_iters_epilogue);
4926 }
4927 else
4928 {
4929 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4930 * assumed_vf
4931 - vec_inside_cost * peel_iters_prologue
4932 - vec_inside_cost * peel_iters_epilogue);
4933 if (min_profitable_iters <= 0)
4934 min_profitable_iters = 0;
4935 else
4936 {
4937 min_profitable_iters /= saving_per_viter;
4938
4939 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4940 <= (((int) vec_inside_cost * min_profitable_iters)
4941 + (((int) vec_outside_cost - scalar_outside_cost)
4942 * assumed_vf)))
4943 min_profitable_iters++;
4944 }
4945 }
4946
4947 if (dump_enabled_p ())
4948 dump_printf (MSG_NOTE,
4949 " Calculated minimum iters for profitability: %d\n",
4950 min_profitable_iters);
4951
4952 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4953 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4954 /* We want the vectorized loop to execute at least once. */
4955 min_profitable_iters = assumed_vf + peel_iters_prologue;
4956 else if (min_profitable_iters < peel_iters_prologue)
4957 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4958 vectorized loop executes at least once. */
4959 min_profitable_iters = peel_iters_prologue;
4960
4961 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_NOTE, vect_location,
4963 " Runtime profitability threshold = %d\n",
4964 min_profitable_iters);
4965
4966 *ret_min_profitable_niters = min_profitable_iters;
4967
4968 /* Calculate number of iterations required to make the vector version
4969 profitable, relative to the loop bodies only.
4970
4971 The non-vectorized variant costs SIC * niters and must win over the vector
4972 variant at the expected loop trip count. The following condition must hold true:
4973 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4974
4975 if (vec_outside_cost <= 0)
4976 min_profitable_estimate = 0;
4977 /* ??? This "else if" arm is written to handle all cases; see below for
4978 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4979 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4980 {
4981 /* This is a repeat of the code above, but with + SOC rather
4982 than - SOC. */
4983 int outside_overhead = (vec_outside_cost
4984 - scalar_single_iter_cost * peel_iters_prologue
4985 - scalar_single_iter_cost * peel_iters_epilogue
4986 + scalar_outside_cost);
4987 int min_vec_niters = 1;
4988 if (outside_overhead > 0)
4989 min_vec_niters = outside_overhead / saving_per_viter + 1;
4990
4991 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4992 {
4993 int threshold = (vec_inside_cost * min_vec_niters
4994 + vec_outside_cost
4995 + scalar_outside_cost);
4996 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4997 }
4998 else
4999 min_profitable_estimate = (min_vec_niters * assumed_vf
5000 + peel_iters_prologue
5001 + peel_iters_epilogue);
5002 }
5003 else
5004 {
5005 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5006 * assumed_vf
5007 - vec_inside_cost * peel_iters_prologue
5008 - vec_inside_cost * peel_iters_epilogue)
5009 / ((scalar_single_iter_cost * assumed_vf)
5010 - vec_inside_cost);
5011 }
5012 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_NOTE, vect_location,
5015 " Static estimate profitability threshold = %d\n",
5016 min_profitable_estimate);
5017
5018 *ret_min_profitable_estimate = min_profitable_estimate;
5019 }
5020
5021 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5022 vector elements (not bits) for a vector with NELT elements. */
5023 static void
5024 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5025 vec_perm_builder *sel)
5026 {
5027 /* The encoding is a single stepped pattern. Any wrap-around is handled
5028 by vec_perm_indices. */
5029 sel->new_vector (nelt, 1, 3);
5030 for (unsigned int i = 0; i < 3; i++)
5031 sel->quick_push (i + offset);
5032 }
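/* For example, calling calc_vec_perm_mask_for_shift with OFFSET = 2 and
   NELT = 8 encodes the stepped series 2, 3, 4, which vec_perm_indices
   extends to 2, 3, 4, 5, 6, 7, 8, 9: a two-operand permute selecting the
   last six elements of the first vector followed by the first two
   elements of the second, i.e. a whole-vector shift by two elements.  */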
5033
5034 /* Checks whether the target supports whole-vector shifts for vectors of mode
5035 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5036 it supports vec_perm_const with masks for all necessary shift amounts. */
5037 static bool
5038 have_whole_vector_shift (machine_mode mode)
5039 {
5040 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5041 return true;
5042
5043 /* Variable-length vectors should be handled via the optab. */
5044 unsigned int nelt;
5045 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5046 return false;
5047
5048 vec_perm_builder sel;
5049 vec_perm_indices indices;
5050 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5051 {
5052 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5053 indices.new_vector (sel, 2, nelt);
5054 if (!can_vec_perm_const_p (mode, mode, indices, false))
5055 return false;
5056 }
5057 return true;
5058 }
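/* For an 8-element vector, for instance, the loop above checks shifts by
   4, 2 and 1 elements, which are the halving steps a shift-based
   reduction of such a vector would use.  */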
5059
5060 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5061 multiplication operands have differing signs and (b) we intend
5062 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5063 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5064
5065 static bool
5066 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5067 stmt_vec_info stmt_info)
5068 {
5069 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5070 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5071 return false;
5072
5073 tree rhs1 = gimple_assign_rhs1 (assign);
5074 tree rhs2 = gimple_assign_rhs2 (assign);
5075 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5076 return false;
5077
5078 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5079 gcc_assert (reduc_info->is_reduc_info);
5080 return !directly_supported_p (DOT_PROD_EXPR,
5081 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5082 optab_vector_mixed_sign);
5083 }
5084
5085 /* TODO: There is a close dependency between the vect_model_*_cost and the
5086 vectorizable_* functions. Improve the design to avoid maintenance issues. */
5087
5088 /* Function vect_model_reduction_cost.
5089
5090 Models cost for a reduction operation, including the vector ops
5091 generated within the strip-mine loop in some cases, the initial
5092 definition before the loop, and the epilogue code that must be generated. */
5093
5094 static void
5095 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5096 stmt_vec_info stmt_info, internal_fn reduc_fn,
5097 vect_reduction_type reduction_type,
5098 int ncopies, stmt_vector_for_cost *cost_vec)
5099 {
5100 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5101 tree vectype;
5102 machine_mode mode;
5103 class loop *loop = NULL;
5104
5105 if (loop_vinfo)
5106 loop = LOOP_VINFO_LOOP (loop_vinfo);
5107
5108 /* Condition reductions generate two reductions in the loop. */
5109 if (reduction_type == COND_REDUCTION)
5110 ncopies *= 2;
5111
5112 vectype = STMT_VINFO_VECTYPE (stmt_info);
5113 mode = TYPE_MODE (vectype);
5114 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5115
5116 gimple_match_op op;
5117 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5118 gcc_unreachable ();
5119
5120 bool emulated_mixed_dot_prod
5121 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5122 if (reduction_type == EXTRACT_LAST_REDUCTION)
5123 /* No extra instructions are needed in the prologue. The loop body
5124 operations are costed in vectorizable_condition. */
5125 inside_cost = 0;
5126 else if (reduction_type == FOLD_LEFT_REDUCTION)
5127 {
5128 /* No extra instructions needed in the prologue. */
5129 prologue_cost = 0;
5130
5131 if (reduc_fn != IFN_LAST)
5132 /* Count one reduction-like operation per vector. */
5133 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5134 stmt_info, 0, vect_body);
5135 else
5136 {
5137 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5138 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5139 inside_cost = record_stmt_cost (cost_vec, nelements,
5140 vec_to_scalar, stmt_info, 0,
5141 vect_body);
5142 inside_cost += record_stmt_cost (cost_vec, nelements,
5143 scalar_stmt, stmt_info, 0,
5144 vect_body);
5145 }
5146 }
5147 else
5148 {
5149 /* Add in the cost of the initial definitions. */
5150 int prologue_stmts;
5151 if (reduction_type == COND_REDUCTION)
5152 /* For cond reductions we have four vectors: initial index, step,
5153 initial result of the data reduction, initial value of the index
5154 reduction. */
5155 prologue_stmts = 4;
5156 else if (emulated_mixed_dot_prod)
5157 /* We need the initial reduction value and two invariants:
5158 one that contains the minimum signed value and one that
5159 contains half of its negative. */
5160 prologue_stmts = 3;
5161 else
5162 prologue_stmts = 1;
5163 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5164 scalar_to_vec, stmt_info, 0,
5165 vect_prologue);
5166 }
5167
5168 /* Determine cost of epilogue code.
5169
5170 We have a reduction operator that will reduce the vector in one statement.
5171 Also requires scalar extract. */
5172
5173 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5174 {
5175 if (reduc_fn != IFN_LAST)
5176 {
5177 if (reduction_type == COND_REDUCTION)
5178 {
5179 /* An EQ stmt and a COND_EXPR stmt. */
5180 epilogue_cost += record_stmt_cost (cost_vec, 2,
5181 vector_stmt, stmt_info, 0,
5182 vect_epilogue);
5183 /* Reduction of the max index and a reduction of the found
5184 values. */
5185 epilogue_cost += record_stmt_cost (cost_vec, 2,
5186 vec_to_scalar, stmt_info, 0,
5187 vect_epilogue);
5188 /* A broadcast of the max value. */
5189 epilogue_cost += record_stmt_cost (cost_vec, 1,
5190 scalar_to_vec, stmt_info, 0,
5191 vect_epilogue);
5192 }
5193 else
5194 {
5195 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5196 stmt_info, 0, vect_epilogue);
5197 epilogue_cost += record_stmt_cost (cost_vec, 1,
5198 vec_to_scalar, stmt_info, 0,
5199 vect_epilogue);
5200 }
5201 }
5202 else if (reduction_type == COND_REDUCTION)
5203 {
5204 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5205 /* Extraction of scalar elements. */
5206 epilogue_cost += record_stmt_cost (cost_vec,
5207 2 * estimated_nunits,
5208 vec_to_scalar, stmt_info, 0,
5209 vect_epilogue);
5210 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5211 epilogue_cost += record_stmt_cost (cost_vec,
5212 2 * estimated_nunits - 3,
5213 scalar_stmt, stmt_info, 0,
5214 vect_epilogue);
5215 }
5216 else if (reduction_type == EXTRACT_LAST_REDUCTION
5217 || reduction_type == FOLD_LEFT_REDUCTION)
5218 /* No extra instructions are needed in the epilogue. */
5219 ;
5220 else
5221 {
5222 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5223 tree bitsize = TYPE_SIZE (op.type);
5224 int element_bitsize = tree_to_uhwi (bitsize);
5225 int nelements = vec_size_in_bits / element_bitsize;
5226
5227 if (op.code == COND_EXPR)
5228 op.code = MAX_EXPR;
5229
5230 /* We have a whole vector shift available. */
5231 if (VECTOR_MODE_P (mode)
5232 && directly_supported_p (op.code, vectype)
5233 && have_whole_vector_shift (mode))
5234 {
5235 /* Final reduction via vector shifts and the reduction operator.
5236 Also requires scalar extract. */
5237 epilogue_cost += record_stmt_cost (cost_vec,
5238 exact_log2 (nelements) * 2,
5239 vector_stmt, stmt_info, 0,
5240 vect_epilogue);
5241 epilogue_cost += record_stmt_cost (cost_vec, 1,
5242 vec_to_scalar, stmt_info, 0,
5243 vect_epilogue);
5244 }
5245 else
5246 /* Use extracts and reduction op for final reduction. For N
5247 elements, we have N extracts and N-1 reduction ops. */
5248 epilogue_cost += record_stmt_cost (cost_vec,
5249 nelements + nelements - 1,
5250 vector_stmt, stmt_info, 0,
5251 vect_epilogue);
5252 }
5253 }
5254
5255 if (dump_enabled_p ())
5256 dump_printf (MSG_NOTE,
5257 "vect_model_reduction_cost: inside_cost = %d, "
5258 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5259 prologue_cost, epilogue_cost);
5260 }
5261
5262 /* SEQ is a sequence of instructions that initialize the reduction
5263 described by REDUC_INFO. Emit them in the appropriate place. */
5264
5265 static void
5266 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5267 stmt_vec_info reduc_info, gimple *seq)
5268 {
5269 if (reduc_info->reused_accumulator)
5270 {
5271 /* When reusing an accumulator from the main loop, we only need
5272 initialization instructions if the main loop can be skipped.
5273 In that case, emit the initialization instructions at the end
5274 of the guard block that does the skip. */
5275 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5276 gcc_assert (skip_edge);
5277 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5278 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5279 }
5280 else
5281 {
5282 /* The normal case: emit the initialization instructions on the
5283 preheader edge. */
5284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5285 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5286 }
5287 }
5288
5289 /* Function get_initial_def_for_reduction
5290
5291 Input:
5292 REDUC_INFO - the info_for_reduction
5293 INIT_VAL - the initial value of the reduction variable
5294 NEUTRAL_OP - a value that has no effect on the reduction, as per
5295 neutral_op_for_reduction
5296
5297 Output:
5298 Return a vector variable, initialized according to the operation that
5299 STMT_VINFO performs. This vector will be used as the initial value
5300 of the vector of partial results.
5301
5302 The value we need is a vector in which element 0 has value INIT_VAL
5303 and every other element has value NEUTRAL_OP. */
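/* For example (with illustrative values), a signed add reduction with
   INIT_VAL 5, NEUTRAL_OP 0 and a four-element vector type yields
   { 5, 0, 0, 0 }.  For MIN and MAX reductions the neutral value is the
   initial value itself, so the vector degenerates to a splat of INIT_VAL
   and the cheaper path below is used.  */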
5304
5305 static tree
5306 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5307 stmt_vec_info reduc_info,
5308 tree init_val, tree neutral_op)
5309 {
5310 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5311 tree scalar_type = TREE_TYPE (init_val);
5312 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5313 tree init_def;
5314 gimple_seq stmts = NULL;
5315
5316 gcc_assert (vectype);
5317
5318 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5319 || SCALAR_FLOAT_TYPE_P (scalar_type));
5320
5321 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5322 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5323
5324 if (operand_equal_p (init_val, neutral_op))
5325 {
5326 /* If both elements are equal then the vector described above is
5327 just a splat. */
5328 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5329 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5330 }
5331 else
5332 {
5333 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5334 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5335 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5336 {
5337 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5338 element 0. */
5339 init_def = gimple_build_vector_from_val (&stmts, vectype,
5340 neutral_op);
5341 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5342 vectype, init_def, init_val);
5343 }
5344 else
5345 {
5346 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5347 tree_vector_builder elts (vectype, 1, 2);
5348 elts.quick_push (init_val);
5349 elts.quick_push (neutral_op);
5350 init_def = gimple_build_vector (&stmts, &elts);
5351 }
5352 }
5353
5354 if (stmts)
5355 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5356 return init_def;
5357 }
5358
5359 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5360 which performs a reduction involving GROUP_SIZE scalar statements.
5361 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5362 is nonnull, introducing extra elements of that value will not change the
5363 result. */
5364
5365 static void
5366 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5367 stmt_vec_info reduc_info,
5368 vec<tree> *vec_oprnds,
5369 unsigned int number_of_vectors,
5370 unsigned int group_size, tree neutral_op)
5371 {
5372 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5373 unsigned HOST_WIDE_INT nunits;
5374 unsigned j, number_of_places_left_in_vector;
5375 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5376 unsigned int i;
5377
5378 gcc_assert (group_size == initial_values.length () || neutral_op);
5379
5380 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5381 created vectors. It is greater than 1 if unrolling is performed.
5382
5383 For example, we have two scalar operands, s1 and s2 (e.g., group of
5384 strided accesses of size two), while NUNITS is four (i.e., four scalars
5385 of this type can be packed in a vector). The output vector will contain
5386 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5387 will be 2).
5388
5389 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5390 vectors containing the operands.
5391
5392 For example, NUNITS is four as before, and the group size is 8
5393 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5394 {s5, s6, s7, s8}. */
5395
5396 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5397 nunits = group_size;
5398
5399 number_of_places_left_in_vector = nunits;
5400 bool constant_p = true;
5401 tree_vector_builder elts (vector_type, nunits, 1);
5402 elts.quick_grow (nunits);
5403 gimple_seq ctor_seq = NULL;
5404 for (j = 0; j < nunits * number_of_vectors; ++j)
5405 {
5406 tree op;
5407 i = j % group_size;
5408
5409 /* Get the def before the loop. In a reduction chain we have only
5410 one initial value; otherwise we have as many initial values as there are PHIs in the group. */
5411 if (i >= initial_values.length () || (j > i && neutral_op))
5412 op = neutral_op;
5413 else
5414 op = initial_values[i];
5415
5416 /* Create 'vect_ = {op0,op1,...,opn}'. */
5417 number_of_places_left_in_vector--;
5418 elts[nunits - number_of_places_left_in_vector - 1] = op;
5419 if (!CONSTANT_CLASS_P (op))
5420 constant_p = false;
5421
5422 if (number_of_places_left_in_vector == 0)
5423 {
5424 tree init;
5425 if (constant_p && !neutral_op
5426 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5427 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5428 /* Build the vector directly from ELTS. */
5429 init = gimple_build_vector (&ctor_seq, &elts);
5430 else if (neutral_op)
5431 {
5432 /* Build a vector of the neutral value and shift the
5433 other elements into place. */
5434 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5435 neutral_op);
5436 int k = nunits;
5437 while (k > 0 && elts[k - 1] == neutral_op)
5438 k -= 1;
5439 while (k > 0)
5440 {
5441 k -= 1;
5442 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5443 vector_type, init, elts[k]);
5444 }
5445 }
5446 else
5447 {
5448 /* First time round, duplicate ELTS to fill the
5449 required number of vectors. */
5450 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5451 elts, number_of_vectors, *vec_oprnds);
5452 break;
5453 }
5454 vec_oprnds->quick_push (init);
5455
5456 number_of_places_left_in_vector = nunits;
5457 elts.new_vector (vector_type, nunits, 1);
5458 elts.quick_grow (nunits);
5459 constant_p = true;
5460 }
5461 }
5462 if (ctor_seq != NULL)
5463 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5464 }
5465
5466 /* For a statement STMT_INFO taking part in a reduction operation return
5467 the stmt_vec_info the meta information is stored on. */
5468
5469 stmt_vec_info
5470 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5471 {
5472 stmt_info = vect_orig_stmt (stmt_info);
5473 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5474 if (!is_a <gphi *> (stmt_info->stmt)
5475 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5476 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5477 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5478 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5479 {
5480 if (gimple_phi_num_args (phi) == 1)
5481 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5482 }
5483 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5484 {
5485 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5486 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5487 stmt_info = info;
5488 }
5489 return stmt_info;
5490 }
5491
5492 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5493 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5494 return false. */
5495
5496 static bool
5497 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5498 stmt_vec_info reduc_info)
5499 {
5500 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5501 if (!main_loop_vinfo)
5502 return false;
5503
5504 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5505 return false;
5506
5507 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5508 auto_vec<tree, 16> main_loop_results (num_phis);
5509 auto_vec<tree, 16> initial_values (num_phis);
5510 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5511 {
5512 /* The epilogue loop can be entered either from the main loop or
5513 from an earlier guard block. */
5514 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5515 for (tree incoming_value : reduc_info->reduc_initial_values)
5516 {
5517 /* Look for:
5518
5519 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5520 INITIAL_VALUE(guard block)>. */
5521 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5522
5523 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5524 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5525
5526 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5527 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5528
5529 main_loop_results.quick_push (from_main_loop);
5530 initial_values.quick_push (from_skip);
5531 }
5532 }
5533 else
5534 /* The main loop dominates the epilogue loop. */
5535 main_loop_results.splice (reduc_info->reduc_initial_values);
5536
5537 /* See if the main loop has the kind of accumulator we need. */
5538 vect_reusable_accumulator *accumulator
5539 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5540 if (!accumulator
5541 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5542 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5543 accumulator->reduc_info->reduc_scalar_results.begin ()))
5544 return false;
5545
5546 /* Handle the case where we can reduce wider vectors to narrower ones. */
5547 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5548 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5549 unsigned HOST_WIDE_INT m;
5550 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5551 TYPE_VECTOR_SUBPARTS (vectype), &m))
5552 return false;
5553 /* Check the intermediate vector types and operations are available. */
5554 tree prev_vectype = old_vectype;
5555 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5556 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5557 {
5558 intermediate_nunits = exact_div (intermediate_nunits, 2);
5559 tree intermediate_vectype = get_related_vectype_for_scalar_type
5560 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5561 if (!intermediate_vectype
5562 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5563 intermediate_vectype)
5564 || !can_vec_extract (TYPE_MODE (prev_vectype),
5565 TYPE_MODE (intermediate_vectype)))
5566 return false;
5567 prev_vectype = intermediate_vectype;
5568 }
5569
5570 /* Non-SLP reductions might apply an adjustment after the reduction
5571 operation, in order to simplify the initialization of the accumulator.
5572 If the epilogue loop carries on from where the main loop left off,
5573 it should apply the same adjustment to the final reduction result.
5574
5575 If the epilogue loop can also be entered directly (rather than via
5576 the main loop), we need to be able to handle that case in the same way,
5577 with the same adjustment. (In principle we could add a PHI node
5578 to select the correct adjustment, but in practice that shouldn't be
5579 necessary.) */
5580 tree main_adjustment
5581 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5582 if (loop_vinfo->main_loop_edge && main_adjustment)
5583 {
5584 gcc_assert (num_phis == 1);
5585 tree initial_value = initial_values[0];
5586 /* Check that we can use INITIAL_VALUE as the adjustment and
5587 initialize the accumulator with a neutral value instead. */
5588 if (!operand_equal_p (initial_value, main_adjustment))
5589 return false;
5590 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5591 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5592 code, initial_value);
5593 }
5594 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5595 reduc_info->reduc_initial_values.truncate (0);
5596 reduc_info->reduc_initial_values.splice (initial_values);
5597 reduc_info->reused_accumulator = accumulator;
5598 return true;
5599 }
5600
5601 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5602 CODE, appending the emitted stmts to SEQ. Returns a vector def of VECTYPE. */
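/* Rough sketch of the effect, assuming the target supports the modes
   involved: reducing a V8SI VEC_DEF to a V4SI VECTYPE with PLUS_EXPR
   extracts the low and high V4SI halves (directly via vec_extract, or by
   punning through a two-element integer-mode vector) and adds them; a
   larger ratio simply repeats the halving step.  */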
5603
5604 static tree
5605 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5606 gimple_seq *seq)
5607 {
5608 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5609 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5610 tree stype = TREE_TYPE (vectype);
5611 tree new_temp = vec_def;
5612 while (nunits > nunits1)
5613 {
5614 nunits /= 2;
5615 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5616 stype, nunits);
5617 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5618
5619 /* The target has to make sure we support lowpart/highpart
5620 extraction, either via direct vector extract or through
5621 an integer mode punning. */
5622 tree dst1, dst2;
5623 gimple *epilog_stmt;
5624 if (convert_optab_handler (vec_extract_optab,
5625 TYPE_MODE (TREE_TYPE (new_temp)),
5626 TYPE_MODE (vectype1))
5627 != CODE_FOR_nothing)
5628 {
5629 /* Extract sub-vectors directly once vec_extract becomes
5630 a conversion optab. */
5631 dst1 = make_ssa_name (vectype1);
5632 epilog_stmt
5633 = gimple_build_assign (dst1, BIT_FIELD_REF,
5634 build3 (BIT_FIELD_REF, vectype1,
5635 new_temp, TYPE_SIZE (vectype1),
5636 bitsize_int (0)));
5637 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5638 dst2 = make_ssa_name (vectype1);
5639 epilog_stmt
5640 = gimple_build_assign (dst2, BIT_FIELD_REF,
5641 build3 (BIT_FIELD_REF, vectype1,
5642 new_temp, TYPE_SIZE (vectype1),
5643 bitsize_int (bitsize)));
5644 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5645 }
5646 else
5647 {
5648 /* Extract via punning to appropriately sized integer mode
5649 vector. */
5650 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5651 tree etype = build_vector_type (eltype, 2);
5652 gcc_assert (convert_optab_handler (vec_extract_optab,
5653 TYPE_MODE (etype),
5654 TYPE_MODE (eltype))
5655 != CODE_FOR_nothing);
5656 tree tem = make_ssa_name (etype);
5657 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5658 build1 (VIEW_CONVERT_EXPR,
5659 etype, new_temp));
5660 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5661 new_temp = tem;
5662 tem = make_ssa_name (eltype);
5663 epilog_stmt
5664 = gimple_build_assign (tem, BIT_FIELD_REF,
5665 build3 (BIT_FIELD_REF, eltype,
5666 new_temp, TYPE_SIZE (eltype),
5667 bitsize_int (0)));
5668 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5669 dst1 = make_ssa_name (vectype1);
5670 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5671 build1 (VIEW_CONVERT_EXPR,
5672 vectype1, tem));
5673 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5674 tem = make_ssa_name (eltype);
5675 epilog_stmt
5676 = gimple_build_assign (tem, BIT_FIELD_REF,
5677 build3 (BIT_FIELD_REF, eltype,
5678 new_temp, TYPE_SIZE (eltype),
5679 bitsize_int (bitsize)));
5680 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5681 dst2 = make_ssa_name (vectype1);
5682 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5683 build1 (VIEW_CONVERT_EXPR,
5684 vectype1, tem));
5685 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5686 }
5687
5688 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5689 }
5690
5691 return new_temp;
5692 }
5693
5694 /* Function vect_create_epilog_for_reduction
5695
5696 Create code at the loop-epilog to finalize the result of a reduction
5697 computation.
5698
5699 STMT_INFO is the scalar reduction stmt that is being vectorized.
5700 SLP_NODE is an SLP node containing a group of reduction statements. The
5701 first one in this group is STMT_INFO.
5702 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5703 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5704 (counting from 0)
5705
5706 This function:
5707 1. Completes the reduction def-use cycles.
5708 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5709 by calling the function specified by REDUC_FN if available, or by
5710 other means (whole-vector shifts or a scalar loop).
5711 The function also creates a new phi node at the loop exit to preserve
5712 loop-closed form, as illustrated below.
5713
5714 The flow at the entry to this function:
5715
5716 loop:
5717 vec_def = phi <vec_init, null> # REDUCTION_PHI
5718 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5719 s_loop = scalar_stmt # (scalar) STMT_INFO
5720 loop_exit:
5721 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5722 use <s_out0>
5723 use <s_out0>
5724
5725 The above is transformed by this function into:
5726
5727 loop:
5728 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5729 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5730 s_loop = scalar_stmt # (scalar) STMT_INFO
5731 loop_exit:
5732 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5733 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5734 v_out2 = reduce <v_out1>
5735 s_out3 = extract_field <v_out2, 0>
5736 s_out4 = adjust_result <s_out3>
5737 use <s_out4>
5738 use <s_out4>
5739 */
5740
5741 static void
5742 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5743 stmt_vec_info stmt_info,
5744 slp_tree slp_node,
5745 slp_instance slp_node_instance)
5746 {
5747 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5748 gcc_assert (reduc_info->is_reduc_info);
5749 /* For double reductions we need to get at the inner loop reduction
5750 stmt which has the meta info attached. Our stmt_info is that of the
5751 loop-closed PHI of the inner loop which we remember as
5752 def for the reduction PHI generation. */
5753 bool double_reduc = false;
5754 stmt_vec_info rdef_info = stmt_info;
5755 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5756 {
5757 gcc_assert (!slp_node);
5758 double_reduc = true;
5759 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5760 (stmt_info->stmt, 0));
5761 stmt_info = vect_stmt_to_vectorize (stmt_info);
5762 }
5763 gphi *reduc_def_stmt
5764 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5765 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5766 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5767 tree vectype;
5768 machine_mode mode;
5769 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5770 basic_block exit_bb;
5771 tree scalar_dest;
5772 tree scalar_type;
5773 gimple *new_phi = NULL, *phi;
5774 gimple_stmt_iterator exit_gsi;
5775 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5776 gimple *epilog_stmt = NULL;
5777 gimple *exit_phi;
5778 tree bitsize;
5779 tree def;
5780 tree orig_name, scalar_result;
5781 imm_use_iterator imm_iter, phi_imm_iter;
5782 use_operand_p use_p, phi_use_p;
5783 gimple *use_stmt;
5784 auto_vec<tree> reduc_inputs;
5785 int j, i;
5786 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5787 unsigned int group_size = 1, k;
5788 auto_vec<gimple *> phis;
5789 /* SLP reduction without reduction chain, e.g.,
5790 # a1 = phi <a2, a0>
5791 # b1 = phi <b2, b0>
5792 a2 = operation (a1)
5793 b2 = operation (b1) */
5794 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5795 bool direct_slp_reduc;
5796 tree induction_index = NULL_TREE;
5797
5798 if (slp_node)
5799 group_size = SLP_TREE_LANES (slp_node);
5800
5801 if (nested_in_vect_loop_p (loop, stmt_info))
5802 {
5803 outer_loop = loop;
5804 loop = loop->inner;
5805 gcc_assert (!slp_node && double_reduc);
5806 }
5807
5808 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5809 gcc_assert (vectype);
5810 mode = TYPE_MODE (vectype);
5811
5812 tree induc_val = NULL_TREE;
5813 tree adjustment_def = NULL;
5814 if (slp_node)
5815 ;
5816 else
5817 {
5818 /* Optimize: for induction condition reduction, if we can't use zero
5819 for induc_val, use initial_def. */
5820 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5821 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5822 else if (double_reduc)
5823 ;
5824 else
5825 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5826 }
5827
5828 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5829 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5830 if (slp_reduc)
5831 /* All statements produce live-out values. */
5832 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5833 else if (slp_node)
5834 {
5835 /* The last statement in the reduction chain produces the live-out
5836 value. Note SLP optimization can shuffle scalar stmts to
5837 optimize permutations so we have to search for the last stmt. */
5838 for (k = 0; k < group_size; ++k)
5839 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5840 {
5841 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5842 break;
5843 }
5844 }
5845
5846 unsigned vec_num;
5847 int ncopies;
5848 if (slp_node)
5849 {
5850 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5851 ncopies = 1;
5852 }
5853 else
5854 {
5855 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5856 vec_num = 1;
5857 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5858 }
5859
5860 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5861 which is updated with the current index of the loop for every match of
5862 the original loop's cond_expr (VEC_STMT). This results in a vector
5863 containing the last time the condition passed for that vector lane.
5864 The first match will be a 1 to allow 0 to be used for non-matching
5865 indexes. If there are no matches at all then the vector will be all
5866 zeroes.
5867
5868 PR92772: This algorithm is broken for architectures that support
5869 masked vectors, but do not provide fold_extract_last. */
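/* Illustrative example with made-up values and VF = 4: the induction
   vector starts at { 1, 2, 3, 4 } and steps by 4 each iteration.  A lane
   whose condition matched in the first and third vector iterations ends
   up holding its index from the third iteration (one of 9..12), while a
   lane that never matched keeps the initial 0.  */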
5870 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5871 {
5872 auto_vec<std::pair<tree, bool>, 2> ccompares;
5873 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5874 cond_info = vect_stmt_to_vectorize (cond_info);
5875 while (cond_info != reduc_info)
5876 {
5877 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5878 {
5879 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5880 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5881 ccompares.safe_push
5882 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5883 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5884 }
5885 cond_info
5886 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5887 1 + STMT_VINFO_REDUC_IDX
5888 (cond_info)));
5889 cond_info = vect_stmt_to_vectorize (cond_info);
5890 }
5891 gcc_assert (ccompares.length () != 0);
5892
5893 tree indx_before_incr, indx_after_incr;
5894 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5895 int scalar_precision
5896 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5897 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5898 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5899 (TYPE_MODE (vectype), cr_index_scalar_type,
5900 TYPE_VECTOR_SUBPARTS (vectype));
5901
5902 /* First we create a simple vector induction variable which starts
5903 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5904 vector size (STEP). */
5905
5906 /* Create a {1,2,3,...} vector. */
5907 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5908
5909 /* Create a vector of the step value. */
5910 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5911 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5912
5913 /* Create an induction variable. */
5914 gimple_stmt_iterator incr_gsi;
5915 bool insert_after;
5916 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5917 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5918 insert_after, &indx_before_incr, &indx_after_incr);
5919
5920 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5921 filled with zeros (VEC_ZERO). */
5922
5923 /* Create a vector of 0s. */
5924 tree zero = build_zero_cst (cr_index_scalar_type);
5925 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5926
5927 /* Create a vector phi node. */
5928 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5929 new_phi = create_phi_node (new_phi_tree, loop->header);
5930 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5931 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5932
5933 /* Now take the condition from the loops original cond_exprs
5934 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5935 every match uses values from the induction variable
5936 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5937 (NEW_PHI_TREE).
5938 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5939 the new cond_expr (INDEX_COND_EXPR). */
5940 gimple_seq stmts = NULL;
5941 for (int i = ccompares.length () - 1; i != -1; --i)
5942 {
5943 tree ccompare = ccompares[i].first;
5944 if (ccompares[i].second)
5945 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5946 cr_index_vector_type,
5947 ccompare,
5948 indx_before_incr, new_phi_tree);
5949 else
5950 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5951 cr_index_vector_type,
5952 ccompare,
5953 new_phi_tree, indx_before_incr);
5954 }
5955 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5956
5957 /* Update the phi with the vec cond. */
5958 induction_index = new_phi_tree;
5959 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5960 loop_latch_edge (loop), UNKNOWN_LOCATION);
5961 }
5962
5963 /* 2. Create epilog code.
5964 The reduction epilog code operates across the elements of the vector
5965 of partial results computed by the vectorized loop.
5966 The reduction epilog code consists of:
5967
5968 step 1: compute the scalar result in a vector (v_out2)
5969 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5970 step 3: adjust the scalar result (s_out3) if needed.
5971
5972 Step 1 can be accomplished using one of the following three schemes:
5973 (scheme 1) using reduc_fn, if available.
5974 (scheme 2) using whole-vector shifts, if available.
5975 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5976 combined.
5977
5978 The overall epilog code looks like this:
5979
5980 s_out0 = phi <s_loop> # original EXIT_PHI
5981 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5982 v_out2 = reduce <v_out1> # step 1
5983 s_out3 = extract_field <v_out2, 0> # step 2
5984 s_out4 = adjust_result <s_out3> # step 3
5985
5986 (step 3 is optional, and steps 1 and 2 may be combined).
5987 Lastly, the uses of s_out0 are replaced by s_out4. */
5988
5989
5990 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5991 v_out1 = phi <VECT_DEF>
5992 Store them in NEW_PHIS. */
5993 if (double_reduc)
5994 loop = outer_loop;
5995 exit_bb = single_exit (loop)->dest;
5996 exit_gsi = gsi_after_labels (exit_bb);
5997 reduc_inputs.create (slp_node ? vec_num : ncopies);
5998 for (unsigned i = 0; i < vec_num; i++)
5999 {
6000 gimple_seq stmts = NULL;
6001 if (slp_node)
6002 def = vect_get_slp_vect_def (slp_node, i);
6003 else
6004 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6005 for (j = 0; j < ncopies; j++)
6006 {
6007 tree new_def = copy_ssa_name (def);
6008 phi = create_phi_node (new_def, exit_bb);
6009 if (j)
6010 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6011 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
6012 new_def = gimple_convert (&stmts, vectype, new_def);
6013 reduc_inputs.quick_push (new_def);
6014 }
6015 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6016 }
6017
6018 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6019 (i.e. when reduc_fn is not available) and in the final adjustment
6020 code (if needed). Also get the original scalar reduction variable as
6021 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6022 represents a reduction pattern), the tree-code and scalar-def are
6023 taken from the original stmt that the pattern-stmt (STMT) replaces.
6024 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6025 are taken from STMT. */
6026
6027 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6028 if (orig_stmt_info != stmt_info)
6029 {
6030 /* Reduction pattern */
6031 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6032 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6033 }
6034
6035 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6036 scalar_type = TREE_TYPE (scalar_dest);
6037 scalar_results.truncate (0);
6038 scalar_results.reserve_exact (group_size);
6039 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6040 bitsize = TYPE_SIZE (scalar_type);
6041
6042 /* True if we should implement SLP_REDUC using native reduction operations
6043 instead of scalar operations. */
6044 direct_slp_reduc = (reduc_fn != IFN_LAST
6045 && slp_reduc
6046 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6047
6048 /* In case of reduction chain, e.g.,
6049 # a1 = phi <a3, a0>
6050 a2 = operation (a1)
6051 a3 = operation (a2),
6052
6053 we may end up with more than one vector result. Here we reduce them
6054 to one vector.
6055
6056 The same is true for a SLP reduction, e.g.,
6057 # a1 = phi <a2, a0>
6058 # b1 = phi <b2, b0>
6059 a2 = operation (a1)
6060 b2 = operation (b1),
6061
6062 where we can end up with more than one vector as well. We can
6063 easily accumulate vectors when the number of vector elements is
6064 a multiple of the SLP group size.
6065
6066 The same is true if we couldn't use a single defuse cycle. */
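/* For instance, if two partial-result vectors v0 and v1 were produced,
   the block below first combines them (v0 + v1 for an add reduction) so
   that only a single vector reaches the epilogue schemes further down.  */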
6067 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6068 || direct_slp_reduc
6069 || (slp_reduc
6070 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6071 || ncopies > 1)
6072 {
6073 gimple_seq stmts = NULL;
6074 tree single_input = reduc_inputs[0];
6075 for (k = 1; k < reduc_inputs.length (); k++)
6076 single_input = gimple_build (&stmts, code, vectype,
6077 single_input, reduc_inputs[k]);
6078 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6079
6080 reduc_inputs.truncate (0);
6081 reduc_inputs.safe_push (single_input);
6082 }
6083
6084 tree orig_reduc_input = reduc_inputs[0];
6085
6086 /* If this loop is an epilogue loop that can be skipped after the
6087 main loop, we can only share a reduction operation between the
6088 main loop and the epilogue if we put it at the target of the
6089 skip edge.
6090
6091 We can still reuse accumulators if this check fails. Doing so has
6092 the minor(?) benefit of making the epilogue loop's scalar result
6093 independent of the main loop's scalar result. */
6094 bool unify_with_main_loop_p = false;
6095 if (reduc_info->reused_accumulator
6096 && loop_vinfo->skip_this_loop_edge
6097 && single_succ_p (exit_bb)
6098 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6099 {
6100 unify_with_main_loop_p = true;
6101
6102 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6103 reduc_inputs[0] = make_ssa_name (vectype);
6104 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6105 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6106 UNKNOWN_LOCATION);
6107 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6108 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6109 exit_gsi = gsi_after_labels (reduc_block);
6110 }
6111
6112 /* Shouldn't be used beyond this point. */
6113 exit_bb = nullptr;
6114
6115 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6116 && reduc_fn != IFN_LAST)
6117 {
6118 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6119 various data values where the condition matched and another vector
6120 (INDUCTION_INDEX) containing all the indexes of those matches. We
6121 need to extract the last matching index (which will be the index with
6122 highest value) and use this to index into the data vector.
6123 For the case where there were no matches, the data vector will contain
6124 all default values and the index vector will be all zeros. */
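/* As a made-up illustration: if REDUC_INPUTS[0] is { d0, d1, d2, d3 }
   and INDUCTION_INDEX is { 0, 9, 0, 7 }, the REDUC_MAX below yields 9,
   the comparison selects lane 1, the VEC_COND keeps { 0, d1, 0, 0 } and
   the final REDUC_MAX over the unsigned view extracts d1's bit pattern,
   which is then converted back to the scalar type.  */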
6125
6126 /* Get various versions of the type of the vector of indexes. */
6127 tree index_vec_type = TREE_TYPE (induction_index);
6128 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6129 tree index_scalar_type = TREE_TYPE (index_vec_type);
6130 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6131
6132 /* Get an unsigned integer version of the type of the data vector. */
6133 int scalar_precision
6134 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6135 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6136 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6137 vectype);
6138
6139 /* First we need to create a vector (ZERO_VEC) of zeros and another
6140 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6141 can create using a MAX reduction and then expanding.
6142 In the case where the loop never made any matches, the max index will
6143 be zero. */
6144
6145 /* Vector of {0, 0, 0,...}. */
6146 tree zero_vec = build_zero_cst (vectype);
6147
6148 /* Find maximum value from the vector of found indexes. */
6149 tree max_index = make_ssa_name (index_scalar_type);
6150 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6151 1, induction_index);
6152 gimple_call_set_lhs (max_index_stmt, max_index);
6153 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6154
6155 /* Vector of {max_index, max_index, max_index,...}. */
6156 tree max_index_vec = make_ssa_name (index_vec_type);
6157 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6158 max_index);
6159 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6160 max_index_vec_rhs);
6161 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6162
6163 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6164 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6165 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6166 otherwise. Only one value should match, resulting in a vector
6167 (VEC_COND) with one data value and the rest zeros.
6168 In the case where the loop never made any matches, every index will
6169 match, resulting in a vector with all data values (which will all be
6170 the default value). */
6171
6172 /* Compare the max index vector to the vector of found indexes to find
6173 the position of the max value. */
6174 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6175 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6176 induction_index,
6177 max_index_vec);
6178 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6179
6180 /* Use the compare to choose either values from the data vector or
6181 zero. */
6182 tree vec_cond = make_ssa_name (vectype);
6183 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6184 vec_compare,
6185 reduc_inputs[0],
6186 zero_vec);
6187 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6188
6189 /* Finally we need to extract the data value from the vector (VEC_COND)
6190 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6191 reduction, but because this doesn't exist, we can use a MAX reduction
6192 instead. The data value might be signed or a float so we need to cast
6193 it first.
6194 In the case where the loop never made any matches, the data values are
6195 all identical, and so will reduce down correctly. */
6196
6197 /* Make the matched data values unsigned. */
6198 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6199 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6200 vec_cond);
6201 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6202 VIEW_CONVERT_EXPR,
6203 vec_cond_cast_rhs);
6204 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6205
6206 /* Reduce down to a scalar value. */
6207 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6208 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6209 1, vec_cond_cast);
6210 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6211 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6212
6213 /* Convert the reduced value back to the result type and set as the
6214 result. */
6215 gimple_seq stmts = NULL;
6216 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6217 data_reduc);
6218 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6219 scalar_results.safe_push (new_temp);
6220 }
6221 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6222 && reduc_fn == IFN_LAST)
6223 {
6224 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6225 the equivalent of:
6226 idx_val = induction_index[0];
6227 val = data_reduc[0];
6228 for (i = 1; i < nelts; ++i)
6229 if (induction_index[i] > idx_val)
6230 val = data_reduc[i], idx_val = induction_index[i];
6231 return val; */
6232
6233 tree data_eltype = TREE_TYPE (vectype);
6234 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6235 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6236 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6237 /* Enforced by vectorizable_reduction, which ensures we have target
6238 support before allowing a conditional reduction on variable-length
6239 vectors. */
6240 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6241 tree idx_val = NULL_TREE, val = NULL_TREE;
6242 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6243 {
6244 tree old_idx_val = idx_val;
6245 tree old_val = val;
6246 idx_val = make_ssa_name (idx_eltype);
6247 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6248 build3 (BIT_FIELD_REF, idx_eltype,
6249 induction_index,
6250 bitsize_int (el_size),
6251 bitsize_int (off)));
6252 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6253 val = make_ssa_name (data_eltype);
6254 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6255 build3 (BIT_FIELD_REF,
6256 data_eltype,
6257 reduc_inputs[0],
6258 bitsize_int (el_size),
6259 bitsize_int (off)));
6260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6261 if (off != 0)
6262 {
6263 tree new_idx_val = idx_val;
6264 if (off != v_size - el_size)
6265 {
6266 new_idx_val = make_ssa_name (idx_eltype);
6267 epilog_stmt = gimple_build_assign (new_idx_val,
6268 MAX_EXPR, idx_val,
6269 old_idx_val);
6270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6271 }
6272 tree cond = make_ssa_name (boolean_type_node);
6273 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6274 idx_val, old_idx_val);
6275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6276 tree new_val = make_ssa_name (data_eltype);
6277 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6278 cond, val, old_val);
6279 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6280 idx_val = new_idx_val;
6281 val = new_val;
6282 }
6283 }
6284 /* Convert the reduced value back to the result type and set as the
6285 result. */
6286 gimple_seq stmts = NULL;
6287 val = gimple_convert (&stmts, scalar_type, val);
6288 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6289 scalar_results.safe_push (val);
6290 }
6291
6292 /* 2.3 Create the reduction code, using one of the three schemes described
6293 above. In SLP we simply need to extract all the elements from the
6294 vector (without reducing them), so we use scalar shifts. */
6295 else if (reduc_fn != IFN_LAST && !slp_reduc)
6296 {
6297 tree tmp;
6298 tree vec_elem_type;
6299
6300 /* Case 1: Create:
6301 v_out2 = reduc_expr <v_out1> */
6302
6303 if (dump_enabled_p ())
6304 dump_printf_loc (MSG_NOTE, vect_location,
6305 "Reduce using direct vector reduction.\n");
6306
6307 gimple_seq stmts = NULL;
6308 vec_elem_type = TREE_TYPE (vectype);
6309 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6310 vec_elem_type, reduc_inputs[0]);
6311 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6312 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6313
6314 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6315 && induc_val)
6316 {
6317 /* Earlier we set the initial value to be a vector of induc_val
6318 values. Check the result and if it is induc_val then replace
6319 with the original initial value, unless induc_val is
6320 the same as initial_def already. */
6321 tree zcompare = make_ssa_name (boolean_type_node);
6322 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6323 new_temp, induc_val);
6324 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6325 tree initial_def = reduc_info->reduc_initial_values[0];
6326 tmp = make_ssa_name (new_scalar_dest);
6327 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6328 initial_def, new_temp);
6329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6330 new_temp = tmp;
6331 }
6332
6333 scalar_results.safe_push (new_temp);
6334 }
6335 else if (direct_slp_reduc)
6336 {
6337 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6338 with the elements for other SLP statements replaced with the
6339 neutral value. We can then do a normal reduction on each vector. */
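/* Made-up illustration with GROUP_SIZE = 2: for a partial-result vector
   { a0, b0, a1, b1, ... } the masked index vector built below is
   { 0, 1, 0, 1, ... }.  For i = 0 the VEC_COND keeps the "a" lanes and
   fills the "b" lanes with the neutral value, and a full-vector
   reduction of the result gives the scalar result for the first SLP
   statement; i = 1 does the same for the "b" lanes.  */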
6340
6341 /* Enforced by vectorizable_reduction. */
6342 gcc_assert (reduc_inputs.length () == 1);
6343 gcc_assert (pow2p_hwi (group_size));
6344
6345 gimple_seq seq = NULL;
6346
6347 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6348 and the same element size as VECTYPE. */
6349 tree index = build_index_vector (vectype, 0, 1);
6350 tree index_type = TREE_TYPE (index);
6351 tree index_elt_type = TREE_TYPE (index_type);
6352 tree mask_type = truth_type_for (index_type);
6353
6354 /* Create a vector that, for each element, identifies which of
6355 the REDUC_GROUP_SIZE results should use it. */
6356 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6357 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6358 build_vector_from_val (index_type, index_mask));
6359
6360 /* Get a neutral vector value. This is simply a splat of the neutral
6361 scalar value if we have one, otherwise the initial scalar value
6362 is itself a neutral value. */
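/* For reference: the neutral element is 0 for PLUS, BIT_IOR and BIT_XOR,
   1 for MULT and all-ones for BIT_AND, whereas MIN and MAX have no
   universal neutral value, which is why the initial scalar value is used
   for them below.  */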
6363 tree vector_identity = NULL_TREE;
6364 tree neutral_op = NULL_TREE;
6365 if (slp_node)
6366 {
6367 tree initial_value = NULL_TREE;
6368 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6369 initial_value = reduc_info->reduc_initial_values[0];
6370 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6371 initial_value);
6372 }
6373 if (neutral_op)
6374 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6375 neutral_op);
6376 for (unsigned int i = 0; i < group_size; ++i)
6377 {
6378 /* If there's no universal neutral value, we can use the
6379 initial scalar value from the original PHI. This is used
6380 for MIN and MAX reduction, for example. */
6381 if (!neutral_op)
6382 {
6383 tree scalar_value = reduc_info->reduc_initial_values[i];
6384 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6385 scalar_value);
6386 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6387 scalar_value);
6388 }
6389
6390 /* Calculate the equivalent of:
6391
6392 sel[j] = (index[j] == i);
6393
6394 which selects the elements of REDUC_INPUTS[0] that should
6395 be included in the result. */
6396 tree compare_val = build_int_cst (index_elt_type, i);
6397 compare_val = build_vector_from_val (index_type, compare_val);
6398 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6399 index, compare_val);
6400
6401 /* Calculate the equivalent of:
6402
6403 vec = sel ? reduc_inputs[0] : vector_identity;
6404
6405 VEC is now suitable for a full vector reduction. */
6406 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6407 sel, reduc_inputs[0], vector_identity);
6408
6409 /* Do the reduction and convert it to the appropriate type. */
6410 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6411 TREE_TYPE (vectype), vec);
6412 scalar = gimple_convert (&seq, scalar_type, scalar);
6413 scalar_results.safe_push (scalar);
6414 }
6415 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6416 }
6417 else
6418 {
6419 bool reduce_with_shift;
6420 tree vec_temp;
6421
6422 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6423
6424 /* See if the target wants to do the final (shift) reduction
6425 in a vector mode of smaller size and first reduce upper/lower
6426 halves against each other. */
6427 enum machine_mode mode1 = mode;
6428 tree stype = TREE_TYPE (vectype);
6429 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6430 unsigned nunits1 = nunits;
6431 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6432 && reduc_inputs.length () == 1)
6433 {
6434 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6435 /* For SLP reductions we have to make sure lanes match up, but
6436 since we're doing an individual-element final reduction,
6437 reducing the vector width here is even more important.
6438 ??? We could also separate lanes with permutes; for the common
6439 case of a power-of-two group size, odd/even extracts would work. */
6440 if (slp_reduc && nunits != nunits1)
6441 {
6442 nunits1 = least_common_multiple (nunits1, group_size);
6443 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6444 }
6445 }
6446 if (!slp_reduc
6447 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6448 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6449
6450 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6451 stype, nunits1);
6452 reduce_with_shift = have_whole_vector_shift (mode1);
6453 if (!VECTOR_MODE_P (mode1)
6454 || !directly_supported_p (code, vectype1))
6455 reduce_with_shift = false;
6456
6457 /* First reduce the vector to the desired vector size on which we
6458 should do the shift reduction, by combining upper and lower halves. */
6459 gimple_seq stmts = NULL;
6460 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6461 code, &stmts);
6462 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6463 reduc_inputs[0] = new_temp;
6464
6465 if (reduce_with_shift && !slp_reduc)
6466 {
6467 int element_bitsize = tree_to_uhwi (bitsize);
6468 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6469 for variable-length vectors and also requires direct target support
6470 for loop reductions. */
6471 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6472 int nelements = vec_size_in_bits / element_bitsize;
6473 vec_perm_builder sel;
6474 vec_perm_indices indices;
6475
6476 int elt_offset;
6477
6478 tree zero_vec = build_zero_cst (vectype1);
6479 /* Case 2: Create:
6480 for (offset = nelements/2; offset >= 1; offset/=2)
6481 {
6482 Create: va' = vec_shift <va, offset>
6483 Create: va = vop <va, va'>
6484 } */
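/* A concrete sketch of case 2, assuming nelements == 4, a PLUS reduction
   and va = {a, b, c, d}; the whole-vector shifts fill with zeros:

     step 1: va' = shift va by 2 = {c, d, 0, 0}
             va  = va + va'      = {a+c, b+d, ., .}
     step 2: va' = shift va by 1 = {b+d, ., ., .}
             va  = va + va'      = {a+b+c+d, ., ., .}

   after which lane 0 holds the scalar result (lanes shown as "." hold
   values that are ignored).  */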
6485
6486 tree rhs;
6487
6488 if (dump_enabled_p ())
6489 dump_printf_loc (MSG_NOTE, vect_location,
6490 "Reduce using vector shifts\n");
6491
6492 gimple_seq stmts = NULL;
6493 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6494 for (elt_offset = nelements / 2;
6495 elt_offset >= 1;
6496 elt_offset /= 2)
6497 {
6498 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6499 indices.new_vector (sel, 2, nelements);
6500 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6501 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6502 new_temp, zero_vec, mask);
6503 new_temp = gimple_build (&stmts, code,
6504 vectype1, new_name, new_temp);
6505 }
6506 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6507
6508 /* 2.4 Extract the final scalar result. Create:
6509 s_out3 = extract_field <v_out2, bitpos> */
6510
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_NOTE, vect_location,
6513 "extract scalar result\n");
6514
6515 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6516 bitsize, bitsize_zero_node);
6517 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6518 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6519 gimple_assign_set_lhs (epilog_stmt, new_temp);
6520 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6521 scalar_results.safe_push (new_temp);
6522 }
6523 else
6524 {
6525 /* Case 3: Create:
6526 s = extract_field <v_out2, 0>
6527 for (offset = element_size;
6528 offset < vector_size;
6529 offset += element_size;)
6530 {
6531 Create: s' = extract_field <v_out2, offset>
6532 Create: s = op <s, s'> // For non SLP cases
6533 } */
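/* A plain-C sketch of case 3, assuming a four-element integer vector and
   a PLUS reduction; extract_field becomes a plain array access here:

     int
     reduc_by_scalar_sketch (int v_out2[4])
     {
       int s = v_out2[0];
       for (int i = 1; i < 4; i++)
         s = s + v_out2[i];
       return s;
     }
 */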
6534
6535 if (dump_enabled_p ())
6536 dump_printf_loc (MSG_NOTE, vect_location,
6537 "Reduce using scalar code.\n");
6538
6539 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6540 int element_bitsize = tree_to_uhwi (bitsize);
6541 tree compute_type = TREE_TYPE (vectype);
6542 gimple_seq stmts = NULL;
6543 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6544 {
6545 int bit_offset;
6546 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6547 vec_temp, bitsize, bitsize_zero_node);
6548
6549 /* In SLP we don't need to apply the reduction operation, so we just
6550 collect s' values in SCALAR_RESULTS. */
6551 if (slp_reduc)
6552 scalar_results.safe_push (new_temp);
6553
6554 for (bit_offset = element_bitsize;
6555 bit_offset < vec_size_in_bits;
6556 bit_offset += element_bitsize)
6557 {
6558 tree bitpos = bitsize_int (bit_offset);
6559 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6560 compute_type, vec_temp,
6561 bitsize, bitpos);
6562 if (slp_reduc)
6563 {
6564 /* In SLP we don't need to apply the reduction operation, so
6565 we just collect s' values in SCALAR_RESULTS. */
6566 new_temp = new_name;
6567 scalar_results.safe_push (new_name);
6568 }
6569 else
6570 new_temp = gimple_build (&stmts, code, compute_type,
6571 new_name, new_temp);
6572 }
6573 }
6574
6575 /* The only case in which we need to reduce scalar results in SLP is
6576 unrolling. If the size of SCALAR_RESULTS is greater than
6577 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6578 REDUC_GROUP_SIZE. */
6579 if (slp_reduc)
6580 {
6581 tree res, first_res, new_res;
6582
6583 /* Reduce multiple scalar results in case of SLP unrolling. */
6584 for (j = group_size; scalar_results.iterate (j, &res);
6585 j++)
6586 {
6587 first_res = scalar_results[j % group_size];
6588 new_res = gimple_build (&stmts, code, compute_type,
6589 first_res, res);
6590 scalar_results[j % group_size] = new_res;
6591 }
6592 scalar_results.truncate (group_size);
6593 for (k = 0; k < group_size; k++)
6594 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6595 scalar_results[k]);
6596 }
6597 else
6598 {
6599 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6600 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6601 scalar_results.safe_push (new_temp);
6602 }
6603
6604 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6605 }
6606
6607 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6608 && induc_val)
6609 {
6610 /* Earlier we set the initial value to be a vector of induc_val
6611 values. Check the result and if it is induc_val then replace
6612 it with the original initial value, unless induc_val is
6613 the same as initial_def already. */
6614 tree zcompare = make_ssa_name (boolean_type_node);
6615 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6616 induc_val);
6617 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6618 tree initial_def = reduc_info->reduc_initial_values[0];
6619 tree tmp = make_ssa_name (new_scalar_dest);
6620 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6621 initial_def, new_temp);
6622 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6623 scalar_results[0] = tmp;
6624 }
6625 }
6626
6627 /* 2.5 Adjust the final result by the initial value of the reduction
6628 variable. (When such adjustment is not needed, then
6629 'adjustment_def' is zero). For example, if code is PLUS we create:
6630 new_temp = loop_exit_def + adjustment_def */
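/* As a hedged example: for

     int sum = 10;
     for (int i = 0; i < n; i++)
       sum += a[i];

   the vector accumulator can start from the neutral vector {0, 0, 0, 0}
   and the original initial value 10 (the "adjustment_def") is added back
   only here, after the partial sums have been reduced to a scalar.  */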
6631
6632 if (adjustment_def)
6633 {
6634 gcc_assert (!slp_reduc);
6635 gimple_seq stmts = NULL;
6636 if (double_reduc)
6637 {
6638 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6639 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6640 new_temp = gimple_build (&stmts, code, vectype,
6641 reduc_inputs[0], adjustment_def);
6642 }
6643 else
6644 {
6645 new_temp = scalar_results[0];
6646 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6647 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6648 adjustment_def);
6649 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6650 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6651 new_temp, adjustment_def);
6652 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6653 }
6654
6655 epilog_stmt = gimple_seq_last_stmt (stmts);
6656 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6657 scalar_results[0] = new_temp;
6658 }
6659
6660 /* Record this operation if it could be reused by the epilogue loop. */
6661 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6662 && reduc_inputs.length () == 1)
6663 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6664 { orig_reduc_input, reduc_info });
6665
6666 if (double_reduc)
6667 loop = outer_loop;
6668
6669 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6670 phis with new adjusted scalar results, i.e., replace use <s_out0>
6671 with use <s_out4>.
6672
6673 Transform:
6674 loop_exit:
6675 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6676 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6677 v_out2 = reduce <v_out1>
6678 s_out3 = extract_field <v_out2, 0>
6679 s_out4 = adjust_result <s_out3>
6680 use <s_out0>
6681 use <s_out0>
6682
6683 into:
6684
6685 loop_exit:
6686 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6687 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6688 v_out2 = reduce <v_out1>
6689 s_out3 = extract_field <v_out2, 0>
6690 s_out4 = adjust_result <s_out3>
6691 use <s_out4>
6692 use <s_out4> */
6693
6694 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6695 for (k = 0; k < live_out_stmts.size (); k++)
6696 {
6697 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6698 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6699
6700 phis.create (3);
6701 /* Find the loop-closed-use at the loop exit of the original scalar
6702 result. (The reduction result is expected to have two immediate uses,
6703 one at the latch block, and one at the loop exit). For double
6704 reductions we are looking for exit phis of the outer loop. */
6705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6706 {
6707 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6708 {
6709 if (!is_gimple_debug (USE_STMT (use_p)))
6710 phis.safe_push (USE_STMT (use_p));
6711 }
6712 else
6713 {
6714 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6715 {
6716 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6717
6718 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6719 {
6720 if (!flow_bb_inside_loop_p (loop,
6721 gimple_bb (USE_STMT (phi_use_p)))
6722 && !is_gimple_debug (USE_STMT (phi_use_p)))
6723 phis.safe_push (USE_STMT (phi_use_p));
6724 }
6725 }
6726 }
6727 }
6728
6729 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6730 {
6731 /* Replace the uses: */
6732 orig_name = PHI_RESULT (exit_phi);
6733
6734 /* Look for a single use at the target of the skip edge. */
6735 if (unify_with_main_loop_p)
6736 {
6737 use_operand_p use_p;
6738 gimple *user;
6739 if (!single_imm_use (orig_name, &use_p, &user))
6740 gcc_unreachable ();
6741 orig_name = gimple_get_lhs (user);
6742 }
6743
6744 scalar_result = scalar_results[k];
6745 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6746 {
6747 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6748 SET_USE (use_p, scalar_result);
6749 update_stmt (use_stmt);
6750 }
6751 }
6752
6753 phis.release ();
6754 }
6755 }
6756
6757 /* Return a vector of type VECTYPE that is equal to the vector select
6758 operation "MASK ? VEC : IDENTITY". Insert the select statements
6759 before GSI. */
6760
6761 static tree
6762 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6763 tree vec, tree identity)
6764 {
6765 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6766 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6767 mask, vec, identity);
6768 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6769 return cond;
6770 }
6771
6772 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6773 order, starting with LHS. Insert the extraction statements before GSI and
6774 associate the new scalar SSA names with variable SCALAR_DEST.
6775 Return the SSA name for the result. */
6776
6777 static tree
6778 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6779 tree_code code, tree lhs, tree vector_rhs)
6780 {
6781 tree vectype = TREE_TYPE (vector_rhs);
6782 tree scalar_type = TREE_TYPE (vectype);
6783 tree bitsize = TYPE_SIZE (scalar_type);
6784 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6785 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6786
6787 for (unsigned HOST_WIDE_INT bit_offset = 0;
6788 bit_offset < vec_size_in_bits;
6789 bit_offset += element_bitsize)
6790 {
6791 tree bitpos = bitsize_int (bit_offset);
6792 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6793 bitsize, bitpos);
6794
6795 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6796 rhs = make_ssa_name (scalar_dest, stmt);
6797 gimple_assign_set_lhs (stmt, rhs);
6798 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6799
6800 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6801 tree new_name = make_ssa_name (scalar_dest, stmt);
6802 gimple_assign_set_lhs (stmt, new_name);
6803 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6804 lhs = new_name;
6805 }
6806 return lhs;
6807 }
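/* For illustration: with a four-element VECTOR_RHS {v0, v1, v2, v3} and
   CODE == PLUS_EXPR the expansion above produces the strictly
   left-to-right sequence

     lhs = (((lhs + v0) + v1) + v2) + v3;

   which preserves the scalar evaluation order required by in-order
   (fold-left) reductions.  */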
6808
6809 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6810 type of the vector input. */
6811
6812 static internal_fn
6813 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6814 {
6815 internal_fn mask_reduc_fn;
6816 internal_fn mask_len_reduc_fn;
6817
6818 switch (reduc_fn)
6819 {
6820 case IFN_FOLD_LEFT_PLUS:
6821 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6822 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6823 break;
6824
6825 default:
6826 return IFN_LAST;
6827 }
6828
6829 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6830 OPTIMIZE_FOR_SPEED))
6831 return mask_reduc_fn;
6832 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6833 OPTIMIZE_FOR_SPEED))
6834 return mask_len_reduc_fn;
6835 return IFN_LAST;
6836 }
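/* A rough scalar model (not the internal-fn definition itself) of what
   the masked variant computes, assuming a lane mask M:

     res = acc;
     for (i = 0; i < nlanes; i++)
       if (M[i])
         res += vec[i];

   i.e. inactive lanes are skipped while the in-order evaluation of the
   active lanes is preserved; the MASK_LEN variant additionally ignores
   lanes from the given length (adjusted by the bias) onwards.  */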
6837
6838 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6839 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6840 statement. CODE is the operation performed by STMT_INFO and OPS are
6841 its scalar operands. REDUC_INDEX is the index of the operand in
6842 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6843 implements in-order reduction, or IFN_LAST if we should open-code it.
6844 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6845 that should be used to control the operation in a fully-masked loop. */
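/* Why the order matters, as a hedged aside: IEEE FP addition is not
   associative, e.g. in float arithmetic
     (1e30f + -1e30f) + 1.0f == 1.0f
   but
     1e30f + (-1e30f + 1.0f) == 0.0f,
   so without -ffast-math the reduction must keep the original
   left-to-right order instead of reassociating into partial sums.  */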
6846
6847 static bool
6848 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6849 stmt_vec_info stmt_info,
6850 gimple_stmt_iterator *gsi,
6851 gimple **vec_stmt, slp_tree slp_node,
6852 gimple *reduc_def_stmt,
6853 tree_code code, internal_fn reduc_fn,
6854 tree ops[3], tree vectype_in,
6855 int reduc_index, vec_loop_masks *masks,
6856 vec_loop_lens *lens)
6857 {
6858 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6859 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6860 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6861
6862 int ncopies;
6863 if (slp_node)
6864 ncopies = 1;
6865 else
6866 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6867
6868 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6869 gcc_assert (ncopies == 1);
6870 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6871
6872 if (slp_node)
6873 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6874 TYPE_VECTOR_SUBPARTS (vectype_in)));
6875
6876 tree op0 = ops[1 - reduc_index];
6877
6878 int group_size = 1;
6879 stmt_vec_info scalar_dest_def_info;
6880 auto_vec<tree> vec_oprnds0;
6881 if (slp_node)
6882 {
6883 auto_vec<vec<tree> > vec_defs (2);
6884 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6885 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6886 vec_defs[0].release ();
6887 vec_defs[1].release ();
6888 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6889 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6890 }
6891 else
6892 {
6893 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6894 op0, &vec_oprnds0);
6895 scalar_dest_def_info = stmt_info;
6896 }
6897
6898 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6899 tree scalar_type = TREE_TYPE (scalar_dest);
6900 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6901
6902 int vec_num = vec_oprnds0.length ();
6903 gcc_assert (vec_num == 1 || slp_node);
6904 tree vec_elem_type = TREE_TYPE (vectype_out);
6905 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6906
6907 tree vector_identity = NULL_TREE;
6908 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6909 vector_identity = build_zero_cst (vectype_out);
6910
6911 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6912 int i;
6913 tree def0;
6914 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6915 {
6916 gimple *new_stmt;
6917 tree mask = NULL_TREE;
6918 tree len = NULL_TREE;
6919 tree bias = NULL_TREE;
6920 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6921 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6922 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6923 {
6924 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6925 i, 1);
6926 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6927 bias = build_int_cst (intQI_type_node, biasval);
6928 mask = build_minus_one_cst (truth_type_for (vectype_in));
6929 }
6930
6931 /* Handle MINUS by adding the negative. */
6932 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6933 {
6934 tree negated = make_ssa_name (vectype_out);
6935 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6936 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6937 def0 = negated;
6938 }
6939
6940 if (mask && mask_reduc_fn == IFN_LAST)
6941 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6942 vector_identity);
6943
6944 /* On the first iteration the input is simply the scalar phi
6945 result, and for subsequent iterations it is the output of
6946 the preceding operation. */
6947 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6948 {
6949 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6950 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6951 def0, mask, len, bias);
6952 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6953 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6954 def0, mask);
6955 else
6956 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6957 def0);
6958 /* For chained SLP reductions the output of the previous reduction
6959 operation serves as the input of the next. For the final statement
6960 the output cannot be a temporary - we reuse the original
6961 scalar destination of the last statement. */
6962 if (i != vec_num - 1)
6963 {
6964 gimple_set_lhs (new_stmt, scalar_dest_var);
6965 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6966 gimple_set_lhs (new_stmt, reduc_var);
6967 }
6968 }
6969 else
6970 {
6971 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6972 reduc_var, def0);
6973 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6974 /* Remove the statement, so that we can use the same code paths
6975 as for statements that we've just created. */
6976 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6977 gsi_remove (&tmp_gsi, true);
6978 }
6979
6980 if (i == vec_num - 1)
6981 {
6982 gimple_set_lhs (new_stmt, scalar_dest);
6983 vect_finish_replace_stmt (loop_vinfo,
6984 scalar_dest_def_info,
6985 new_stmt);
6986 }
6987 else
6988 vect_finish_stmt_generation (loop_vinfo,
6989 scalar_dest_def_info,
6990 new_stmt, gsi);
6991
6992 if (slp_node)
6993 slp_node->push_vec_def (new_stmt);
6994 else
6995 {
6996 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6997 *vec_stmt = new_stmt;
6998 }
6999 }
7000
7001 return true;
7002 }
7003
7004 /* Function is_nonwrapping_integer_induction.
7005
7006 Check that STMT_VINFO (which is part of loop LOOP) is an integer
7007 induction and that incrementing it does not cause overflow. */
7008
7009 static bool
7010 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7011 {
7012 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7013 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7014 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7015 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7016 widest_int ni, max_loop_value, lhs_max;
7017 wi::overflow_type overflow = wi::OVF_NONE;
7018
7019 /* Make sure the loop is integer based. */
7020 if (TREE_CODE (base) != INTEGER_CST
7021 || TREE_CODE (step) != INTEGER_CST)
7022 return false;
7023
7024 /* Check that the max size of the loop will not wrap. */
7025
7026 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7027 return true;
7028
7029 if (! max_stmt_executions (loop, &ni))
7030 return false;
7031
7032 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7033 &overflow);
7034 if (overflow)
7035 return false;
7036
7037 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7038 TYPE_SIGN (lhs_type), &overflow);
7039 if (overflow)
7040 return false;
7041
7042 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7043 <= TYPE_PRECISION (lhs_type));
7044 }
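/* A worked example under assumed values: for an unsigned short IV with
   base == 0 and step == 4 in a loop whose statements execute at most
   16383 times, the maximum value computed above is 0 + 4 * 16383 == 65532,
   which needs 16 bits and therefore fits the 16-bit type; with 16384
   executions it would be 65536, needing 17 bits, so the induction could
   wrap and we return false.  */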
7045
7046 /* Check if masking can be supported by inserting a conditional expression.
7047 CODE is the code for the operation. COND_FN is the conditional internal
7048 function, if it exists. VECTYPE_IN is the type of the vector input. */
7049 static bool
7050 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7051 tree vectype_in)
7052 {
7053 if (cond_fn != IFN_LAST
7054 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7055 OPTIMIZE_FOR_SPEED))
7056 return false;
7057
7058 if (code.is_tree_code ())
7059 switch (tree_code (code))
7060 {
7061 case DOT_PROD_EXPR:
7062 case SAD_EXPR:
7063 return true;
7064
7065 default:
7066 break;
7067 }
7068 return false;
7069 }
7070
7071 /* Insert a conditional expression to enable masked vectorization. CODE is the
7072 code for the operation. VOP is the array of operands. MASK is the loop
7073 mask. GSI is a statement iterator used to place the new conditional
7074 expression. */
7075 static void
7076 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7077 gimple_stmt_iterator *gsi)
7078 {
7079 switch (tree_code (code))
7080 {
7081 case DOT_PROD_EXPR:
7082 {
7083 tree vectype = TREE_TYPE (vop[1]);
7084 tree zero = build_zero_cst (vectype);
7085 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7086 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7087 mask, vop[1], zero);
7088 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7089 vop[1] = masked_op1;
7090 break;
7091 }
7092
7093 case SAD_EXPR:
7094 {
7095 tree vectype = TREE_TYPE (vop[1]);
7096 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7097 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7098 mask, vop[1], vop[0]);
7099 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7100 vop[1] = masked_op1;
7101 break;
7102 }
7103
7104 default:
7105 gcc_unreachable ();
7106 }
7107 }
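/* The choice of select identity above can be illustrated with scalars
   (a sketch, assuming integer elements):

     DOT_PROD: forcing op1 to 0 works because 0 * x adds nothing,
               i.e. acc + 0 * b[i] == acc;
     SAD:      forcing op1 to op0 works because the absolute difference
               of a value with itself is 0, i.e. acc + |a[i] - a[i]| == acc;

   so inactive lanes leave the reduction value unchanged.  */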
7108
7109 /* Function vectorizable_reduction.
7110
7111 Check if STMT_INFO performs a reduction operation that can be vectorized.
7112 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7113 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7114 Return true if STMT_INFO is vectorizable in this way.
7115
7116 This function also handles reduction idioms (patterns) that have been
7117 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7118 may be of this form:
7119 X = pattern_expr (arg0, arg1, ..., X)
7120 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7121 sequence that had been detected and replaced by the pattern-stmt
7122 (STMT_INFO).
7123
7124 This function also handles reduction of condition expressions, for example:
7125 for (int i = 0; i < N; i++)
7126 if (a[i] < value)
7127 last = a[i];
7128 This is handled by vectorising the loop and creating an additional vector
7129 containing the loop indexes for which "a[i] < value" was true. In the
7130 function epilogue this is reduced to a single max value and then used to
7131 index into the vector of results.
7132
7133 In some cases of reduction patterns, the type of the reduction variable X is
7134 different than the type of the other arguments of STMT_INFO.
7135 In such cases, the vectype that is used when transforming STMT_INFO into
7136 a vector stmt is different than the vectype that is used to determine the
7137 vectorization factor, because it consists of a different number of elements
7138 than the actual number of elements that are being operated upon in parallel.
7139
7140 For example, consider an accumulation of shorts into an int accumulator.
7141 On some targets it's possible to vectorize this pattern operating on 8
7142 shorts at a time (hence, the vectype for purposes of determining the
7143 vectorization factor should be V8HI); on the other hand, the vectype that
7144 is used to create the vector form is actually V4SI (the type of the result).
7145
7146 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7147 indicates what is the actual level of parallelism (V8HI in the example), so
7148 that the right vectorization factor would be derived. This vectype
7149 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7150 be used to create the vectorized stmt. The right vectype for the vectorized
7151 stmt is obtained from the type of the result X:
7152 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7153
7154 This means that, contrary to "regular" reductions (or "regular" stmts in
7155 general), the following equation:
7156 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7157 does *NOT* necessarily hold for reduction patterns. */
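/* A concrete instance of the vectype distinction described above,
   assuming 128-bit vectors: for

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   the widened-sum pattern consumes eight shorts per vector iteration, so
   the vectype recorded for the vectorization factor is V8HI, while the
   statement that accumulates produces an int vector and thus uses V4SI.  */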
7158
7159 bool
7160 vectorizable_reduction (loop_vec_info loop_vinfo,
7161 stmt_vec_info stmt_info, slp_tree slp_node,
7162 slp_instance slp_node_instance,
7163 stmt_vector_for_cost *cost_vec)
7164 {
7165 tree vectype_in = NULL_TREE;
7166 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7167 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7168 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7169 stmt_vec_info cond_stmt_vinfo = NULL;
7170 int i;
7171 int ncopies;
7172 bool single_defuse_cycle = false;
7173 bool nested_cycle = false;
7174 bool double_reduc = false;
7175 int vec_num;
7176 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7177 tree cond_reduc_val = NULL_TREE;
7178
7179 /* Make sure it was already recognized as a reduction computation. */
7180 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7181 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7182 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7183 return false;
7184
7185 /* The stmt we store reduction analysis meta on. */
7186 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7187 reduc_info->is_reduc_info = true;
7188
7189 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7190 {
7191 if (is_a <gphi *> (stmt_info->stmt))
7192 {
7193 if (slp_node)
7194 {
7195 /* We eventually need to set a vector type on invariant
7196 arguments. */
7197 unsigned j;
7198 slp_tree child;
7199 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7200 if (!vect_maybe_update_slp_op_vectype
7201 (child, SLP_TREE_VECTYPE (slp_node)))
7202 {
7203 if (dump_enabled_p ())
7204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7205 "incompatible vector types for "
7206 "invariants\n");
7207 return false;
7208 }
7209 }
7210 /* Analysis for double-reduction is done on the outer
7211 loop PHI, nested cycles have no further restrictions. */
7212 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7213 }
7214 else
7215 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7216 return true;
7217 }
7218
7219 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7220 stmt_vec_info phi_info = stmt_info;
7221 if (!is_a <gphi *> (stmt_info->stmt))
7222 {
7223 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7224 return true;
7225 }
7226 if (slp_node)
7227 {
7228 slp_node_instance->reduc_phis = slp_node;
7229 /* ??? We're leaving slp_node to point to the PHIs, we only
7230 need it to get at the number of vector stmts which wasn't
7231 yet initialized for the instance root. */
7232 }
7233 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7234 {
7235 use_operand_p use_p;
7236 gimple *use_stmt;
7237 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7238 &use_p, &use_stmt);
7239 gcc_assert (res);
7240 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7241 }
7242
7243 /* PHIs should not participate in patterns. */
7244 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7245 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7246
7247 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7248 and compute the reduction chain length. Discover the real
7249 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7250 tree reduc_def
7251 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7252 loop_latch_edge
7253 (gimple_bb (reduc_def_phi)->loop_father));
7254 unsigned reduc_chain_length = 0;
7255 bool only_slp_reduc_chain = true;
7256 stmt_info = NULL;
7257 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7258 while (reduc_def != PHI_RESULT (reduc_def_phi))
7259 {
7260 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7261 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7262 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7263 {
7264 if (dump_enabled_p ())
7265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7266 "reduction chain broken by patterns.\n");
7267 return false;
7268 }
7269 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7270 only_slp_reduc_chain = false;
7271 /* For epilogue generation live members of the chain need
7272 to point back to the PHI via their original stmt for
7273 info_for_reduction to work. For SLP we need to look at
7274 all lanes here - even though we will only vectorize from
7275 the SLP node with live lane zero, the other live lanes also
7276 need to be identified as part of a reduction to be able
7277 to skip code generation for them. */
7278 if (slp_for_stmt_info)
7279 {
7280 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7281 if (STMT_VINFO_LIVE_P (s))
7282 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7283 }
7284 else if (STMT_VINFO_LIVE_P (vdef))
7285 STMT_VINFO_REDUC_DEF (def) = phi_info;
7286 gimple_match_op op;
7287 if (!gimple_extract_op (vdef->stmt, &op))
7288 {
7289 if (dump_enabled_p ())
7290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7291 "reduction chain includes unsupported"
7292 " statement type.\n");
7293 return false;
7294 }
7295 if (CONVERT_EXPR_CODE_P (op.code))
7296 {
7297 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7298 {
7299 if (dump_enabled_p ())
7300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7301 "conversion in the reduction chain.\n");
7302 return false;
7303 }
7304 }
7305 else if (!stmt_info)
7306 /* First non-conversion stmt. */
7307 stmt_info = vdef;
7308 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7309 reduc_chain_length++;
7310 if (!stmt_info && slp_node)
7311 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7312 }
7313 /* PHIs should not participate in patterns. */
7314 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7315
7316 if (nested_in_vect_loop_p (loop, stmt_info))
7317 {
7318 loop = loop->inner;
7319 nested_cycle = true;
7320 }
7321
7322 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7323 element. */
7324 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7325 {
7326 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7327 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7328 }
7329 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7330 gcc_assert (slp_node
7331 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7332
7333 /* 1. Is vectorizable reduction? */
7334 /* Not supportable if the reduction variable is used in the loop, unless
7335 it's a reduction chain. */
7336 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7337 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7338 return false;
7339
7340 /* Reductions that are not used even in an enclosing outer-loop,
7341 are expected to be "live" (used out of the loop). */
7342 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7343 && !STMT_VINFO_LIVE_P (stmt_info))
7344 return false;
7345
7346 /* 2. Has this been recognized as a reduction pattern?
7347
7348 Check if STMT represents a pattern that has been recognized
7349 in earlier analysis stages. For stmts that represent a pattern,
7350 the STMT_VINFO_RELATED_STMT field records the last stmt in
7351 the original sequence that constitutes the pattern. */
7352
7353 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7354 if (orig_stmt_info)
7355 {
7356 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7357 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7358 }
7359
7360 /* 3. Check the operands of the operation. The first operands are defined
7361 inside the loop body. The last operand is the reduction variable,
7362 which is defined by the loop-header-phi. */
7363
7364 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7365 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7366 gimple_match_op op;
7367 if (!gimple_extract_op (stmt_info->stmt, &op))
7368 gcc_unreachable ();
7369 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7370 || op.code == WIDEN_SUM_EXPR
7371 || op.code == SAD_EXPR);
7372
7373 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7374 && !SCALAR_FLOAT_TYPE_P (op.type))
7375 return false;
7376
7377 /* Do not try to vectorize bit-precision reductions. */
7378 if (!type_has_mode_precision_p (op.type))
7379 return false;
7380
7381 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7382 which means the only use of the PHI may be in the lane-reducing operation. */
7383 if (lane_reduc_code_p
7384 && reduc_chain_length != 1
7385 && !only_slp_reduc_chain)
7386 {
7387 if (dump_enabled_p ())
7388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7389 "lane-reducing reduction with extra stmts.\n");
7390 return false;
7391 }
7392
7393 /* All uses but the last are expected to be defined in the loop.
7394 The last use is the reduction variable. In case of nested cycle this
7395 assumption is not true: we use reduc_index to record the index of the
7396 reduction variable. */
7397 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7398 /* We need to skip an extra operand for COND_EXPRs with embedded
7399 comparison. */
7400 unsigned opno_adjust = 0;
7401 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7402 opno_adjust = 1;
7403 for (i = 0; i < (int) op.num_ops; i++)
7404 {
7405 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7406 if (i == 0 && op.code == COND_EXPR)
7407 continue;
7408
7409 stmt_vec_info def_stmt_info;
7410 enum vect_def_type dt;
7411 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7412 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7413 &vectype_op[i], &def_stmt_info))
7414 {
7415 if (dump_enabled_p ())
7416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7417 "use not simple.\n");
7418 return false;
7419 }
7420 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7421 continue;
7422
7423 /* There should be only one cycle def in the stmt, the one
7424 leading to reduc_def. */
7425 if (VECTORIZABLE_CYCLE_DEF (dt))
7426 return false;
7427
7428 if (!vectype_op[i])
7429 vectype_op[i]
7430 = get_vectype_for_scalar_type (loop_vinfo,
7431 TREE_TYPE (op.ops[i]), slp_op[i]);
7432
7433 /* To properly compute ncopies we are interested in the widest
7434 non-reduction input type in case we're looking at a widening
7435 accumulation that we later handle in vect_transform_reduction. */
7436 if (lane_reduc_code_p
7437 && vectype_op[i]
7438 && (!vectype_in
7439 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7440 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7441 vectype_in = vectype_op[i];
7442
7443 if (op.code == COND_EXPR)
7444 {
7445 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7446 if (dt == vect_constant_def)
7447 {
7448 cond_reduc_dt = dt;
7449 cond_reduc_val = op.ops[i];
7450 }
7451 if (dt == vect_induction_def
7452 && def_stmt_info
7453 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7454 {
7455 cond_reduc_dt = dt;
7456 cond_stmt_vinfo = def_stmt_info;
7457 }
7458 }
7459 }
7460 if (!vectype_in)
7461 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7462 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7463
7464 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7465 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7466 /* If we have a condition reduction, see if we can simplify it further. */
7467 if (v_reduc_type == COND_REDUCTION)
7468 {
7469 if (slp_node)
7470 return false;
7471
7472 /* Fail if the reduction value is used in the condition itself. */
7473 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7474 {
7475 if (dump_enabled_p ())
7476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7477 "condition depends on previous iteration\n");
7478 return false;
7479 }
7480
7481 if (reduc_chain_length == 1
7482 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7483 vectype_in, OPTIMIZE_FOR_SPEED))
7484 {
7485 if (dump_enabled_p ())
7486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7487 "optimizing condition reduction with"
7488 " FOLD_EXTRACT_LAST.\n");
7489 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7490 }
7491 else if (cond_reduc_dt == vect_induction_def)
7492 {
7493 tree base
7494 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7495 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7496
7497 gcc_assert (TREE_CODE (base) == INTEGER_CST
7498 && TREE_CODE (step) == INTEGER_CST);
7499 cond_reduc_val = NULL_TREE;
7500 enum tree_code cond_reduc_op_code = ERROR_MARK;
7501 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7502 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7503 ;
7504 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7505 above base; punt if base is the minimum value of the type for
7506 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7507 else if (tree_int_cst_sgn (step) == -1)
7508 {
7509 cond_reduc_op_code = MIN_EXPR;
7510 if (tree_int_cst_sgn (base) == -1)
7511 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7512 else if (tree_int_cst_lt (base,
7513 TYPE_MAX_VALUE (TREE_TYPE (base))))
7514 cond_reduc_val
7515 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7516 }
7517 else
7518 {
7519 cond_reduc_op_code = MAX_EXPR;
7520 if (tree_int_cst_sgn (base) == 1)
7521 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7522 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7523 base))
7524 cond_reduc_val
7525 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7526 }
7527 if (cond_reduc_val)
7528 {
7529 if (dump_enabled_p ())
7530 dump_printf_loc (MSG_NOTE, vect_location,
7531 "condition expression based on "
7532 "integer induction.\n");
7533 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7534 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7535 = cond_reduc_val;
7536 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7537 }
7538 }
7539 else if (cond_reduc_dt == vect_constant_def)
7540 {
7541 enum vect_def_type cond_initial_dt;
7542 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7543 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7544 if (cond_initial_dt == vect_constant_def
7545 && types_compatible_p (TREE_TYPE (cond_initial_val),
7546 TREE_TYPE (cond_reduc_val)))
7547 {
7548 tree e = fold_binary (LE_EXPR, boolean_type_node,
7549 cond_initial_val, cond_reduc_val);
7550 if (e && (integer_onep (e) || integer_zerop (e)))
7551 {
7552 if (dump_enabled_p ())
7553 dump_printf_loc (MSG_NOTE, vect_location,
7554 "condition expression based on "
7555 "compile time constant.\n");
7556 /* Record reduction code at analysis stage. */
7557 STMT_VINFO_REDUC_CODE (reduc_info)
7558 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7559 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7560 }
7561 }
7562 }
7563 }
7564
7565 if (STMT_VINFO_LIVE_P (phi_info))
7566 return false;
7567
7568 if (slp_node)
7569 ncopies = 1;
7570 else
7571 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7572
7573 gcc_assert (ncopies >= 1);
7574
7575 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7576
7577 if (nested_cycle)
7578 {
7579 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7580 == vect_double_reduction_def);
7581 double_reduc = true;
7582 }
7583
7584 /* 4.2. Check support for the epilog operation.
7585
7586 If STMT represents a reduction pattern, then the type of the
7587 reduction variable may be different than the type of the rest
7588 of the arguments. For example, consider the case of accumulation
7589 of shorts into an int accumulator; The original code:
7590 S1: int_a = (int) short_a;
7591 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7592
7593 was replaced with:
7594 STMT: int_acc = widen_sum <short_a, int_acc>
7595
7596 This means that:
7597 1. The tree-code that is used to create the vector operation in the
7598 epilog code (that reduces the partial results) is not the
7599 tree-code of STMT, but is rather the tree-code of the original
7600 stmt from the pattern that STMT is replacing. I.e, in the example
7601 above we want to use 'widen_sum' in the loop, but 'plus' in the
7602 epilog.
7603 2. The type (mode) we use to check available target support
7604 for the vector operation to be created in the *epilog*, is
7605 determined by the type of the reduction variable (in the example
7606 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7607 However the type (mode) we use to check available target support
7608 for the vector operation to be created *inside the loop*, is
7609 determined by the type of the other arguments to STMT (in the
7610 example we'd check this: optab_handler (widen_sum_optab,
7611 vect_short_mode)).
7612
7613 This is contrary to "regular" reductions, in which the types of all
7614 the arguments are the same as the type of the reduction variable.
7615 For "regular" reductions we can therefore use the same vector type
7616 (and also the same tree-code) when generating the epilog code and
7617 when generating the code inside the loop. */
7618
7619 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7620 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7621
7622 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7623 if (reduction_type == TREE_CODE_REDUCTION)
7624 {
7625 /* Check whether it's ok to change the order of the computation.
7626 Generally, when vectorizing a reduction we change the order of the
7627 computation. This may change the behavior of the program in some
7628 cases, so we need to check that this is ok. One exception is when
7629 vectorizing an outer-loop: the inner-loop is executed sequentially,
7630 and therefore vectorizing reductions in the inner-loop during
7631 outer-loop vectorization is safe. Likewise, when we are vectorizing
7632 a series of reductions using SLP and the VF is one, the reductions
7633 are performed in scalar order. */
7634 if (slp_node
7635 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7636 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7637 ;
7638 else if (needs_fold_left_reduction_p (op.type, orig_code))
7639 {
7640 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7641 is not directly used in stmt. */
7642 if (!only_slp_reduc_chain
7643 && reduc_chain_length != 1)
7644 {
7645 if (dump_enabled_p ())
7646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7647 "in-order reduction chain without SLP.\n");
7648 return false;
7649 }
7650 STMT_VINFO_REDUC_TYPE (reduc_info)
7651 = reduction_type = FOLD_LEFT_REDUCTION;
7652 }
7653 else if (!commutative_binary_op_p (orig_code, op.type)
7654 || !associative_binary_op_p (orig_code, op.type))
7655 {
7656 if (dump_enabled_p ())
7657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7658 "reduction: not commutative/associative");
7659 return false;
7660 }
7661 }
7662
7663 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7664 && ncopies > 1)
7665 {
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668 "multiple types in double reduction or condition "
7669 "reduction or fold-left reduction.\n");
7670 return false;
7671 }
7672
7673 internal_fn reduc_fn = IFN_LAST;
7674 if (reduction_type == TREE_CODE_REDUCTION
7675 || reduction_type == FOLD_LEFT_REDUCTION
7676 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7677 || reduction_type == CONST_COND_REDUCTION)
7678 {
7679 if (reduction_type == FOLD_LEFT_REDUCTION
7680 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7681 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7682 {
7683 if (reduc_fn != IFN_LAST
7684 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7685 OPTIMIZE_FOR_SPEED))
7686 {
7687 if (dump_enabled_p ())
7688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7689 "reduc op not supported by target.\n");
7690
7691 reduc_fn = IFN_LAST;
7692 }
7693 }
7694 else
7695 {
7696 if (!nested_cycle || double_reduc)
7697 {
7698 if (dump_enabled_p ())
7699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7700 "no reduc code for scalar code.\n");
7701
7702 return false;
7703 }
7704 }
7705 }
7706 else if (reduction_type == COND_REDUCTION)
7707 {
7708 int scalar_precision
7709 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7710 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7711 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7712 vectype_out);
7713
7714 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7715 OPTIMIZE_FOR_SPEED))
7716 reduc_fn = IFN_REDUC_MAX;
7717 }
7718 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7719
7720 if (reduction_type != EXTRACT_LAST_REDUCTION
7721 && (!nested_cycle || double_reduc)
7722 && reduc_fn == IFN_LAST
7723 && !nunits_out.is_constant ())
7724 {
7725 if (dump_enabled_p ())
7726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7727 "missing target support for reduction on"
7728 " variable-length vectors.\n");
7729 return false;
7730 }
7731
7732 /* For SLP reductions, see if there is a neutral value we can use. */
7733 tree neutral_op = NULL_TREE;
7734 if (slp_node)
7735 {
7736 tree initial_value = NULL_TREE;
7737 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7738 initial_value = vect_phi_initial_value (reduc_def_phi);
7739 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7740 orig_code, initial_value);
7741 }
7742
7743 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7744 {
7745 /* We can't support in-order reductions of code such as this:
7746
7747 for (int i = 0; i < n1; ++i)
7748 for (int j = 0; j < n2; ++j)
7749 l += a[j];
7750
7751 since GCC effectively transforms the loop when vectorizing:
7752
7753 for (int i = 0; i < n1 / VF; ++i)
7754 for (int j = 0; j < n2; ++j)
7755 for (int k = 0; k < VF; ++k)
7756 l += a[j];
7757
7758 which is a reassociation of the original operation. */
7759 if (dump_enabled_p ())
7760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7761 "in-order double reduction not supported.\n");
7762
7763 return false;
7764 }
7765
7766 if (reduction_type == FOLD_LEFT_REDUCTION
7767 && slp_node
7768 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7769 {
7770 /* We cannot use in-order reductions in this case because there is
7771 an implicit reassociation of the operations involved. */
7772 if (dump_enabled_p ())
7773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7774 "in-order unchained SLP reductions not supported.\n");
7775 return false;
7776 }
7777
7778 /* For double reductions, and for SLP reductions with a neutral value,
7779 we construct a variable-length initial vector by loading a vector
7780 full of the neutral value and then shift-and-inserting the start
7781 values into the low-numbered elements. */
7782 if ((double_reduc || neutral_op)
7783 && !nunits_out.is_constant ()
7784 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7785 vectype_out, OPTIMIZE_FOR_SPEED))
7786 {
7787 if (dump_enabled_p ())
7788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7789 "reduction on variable-length vectors requires"
7790 " target support for a vector-shift-and-insert"
7791 " operation.\n");
7792 return false;
7793 }
7794
7795 /* Check extra constraints for variable-length unchained SLP reductions. */
7796 if (slp_node
7797 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7798 && !nunits_out.is_constant ())
7799 {
7800 /* We checked above that we could build the initial vector when
7801 there's a neutral element value. Check here for the case in
7802 which each SLP statement has its own initial value and in which
7803 that value needs to be repeated for every instance of the
7804 statement within the initial vector. */
7805 unsigned int group_size = SLP_TREE_LANES (slp_node);
7806 if (!neutral_op
7807 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7808 TREE_TYPE (vectype_out)))
7809 {
7810 if (dump_enabled_p ())
7811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7812 "unsupported form of SLP reduction for"
7813 " variable-length vectors: cannot build"
7814 " initial vector.\n");
7815 return false;
7816 }
7817 /* The epilogue code relies on the number of elements being a multiple
7818 of the group size. The duplicate-and-interleave approach to setting
7819 up the initial vector does too. */
7820 if (!multiple_p (nunits_out, group_size))
7821 {
7822 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7824 "unsupported form of SLP reduction for"
7825 " variable-length vectors: the vector size"
7826 " is not a multiple of the number of results.\n");
7827 return false;
7828 }
7829 }
7830
7831 if (reduction_type == COND_REDUCTION)
7832 {
7833 widest_int ni;
7834
7835 if (! max_loop_iterations (loop, &ni))
7836 {
7837 if (dump_enabled_p ())
7838 dump_printf_loc (MSG_NOTE, vect_location,
7839 "loop count not known, cannot create cond "
7840 "reduction.\n");
7841 return false;
7842 }
7843 /* Convert backedges to iterations. */
7844 ni += 1;
7845
7846 /* The additional index will be the same type as the condition. Check
7847 that the loop count fits into this type less one (because we use up
7848 the zero slot for when there are no matches). */
7849 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7850 if (wi::geu_p (ni, wi::to_widest (max_index)))
7851 {
7852 if (dump_enabled_p ())
7853 dump_printf_loc (MSG_NOTE, vect_location,
7854 "loop size is greater than data size.\n");
7855 return false;
7856 }
7857 }
7858
7859 /* In case the vectorization factor (VF) is bigger than the number
7860 of elements that we can fit in a vectype (nunits), we have to generate
7861 more than one vector stmt - i.e - we need to "unroll" the
7862 vector stmt by a factor VF/nunits. For more details see documentation
7863 in vectorizable_operation. */
7864
7865 /* If the reduction is used in an outer loop we need to generate
7866 VF intermediate results, like so (e.g. for ncopies=2):
7867 r0 = phi (init, r0)
7868 r1 = phi (init, r1)
7869 r0 = x0 + r0;
7870 r1 = x1 + r1;
7871 (i.e. we generate VF results in 2 registers).
7872 In this case we have a separate def-use cycle for each copy, and therefore
7873 for each copy we get the vector def for the reduction variable from the
7874 respective phi node created for this copy.
7875
7876 Otherwise (the reduction is unused in the loop nest), we can combine
7877 together intermediate results, like so (e.g. for ncopies=2):
7878 r = phi (init, r)
7879 r = x0 + r;
7880 r = x1 + r;
7881 (i.e. we generate VF/2 results in a single register).
7882 In this case for each copy we get the vector def for the reduction variable
7883 from the vectorized reduction operation generated in the previous iteration.
7884
7885 This only works when we see both the reduction PHI and its only consumer
7886 in vectorizable_reduction and there are no intermediate stmts
7887 participating. When unrolling we want each unrolled iteration to have its
7888 own reduction accumulator since one of the main goals of unrolling a
7889 reduction is to reduce the aggregate loop-carried latency. */
7890 if (ncopies > 1
7891 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7892 && reduc_chain_length == 1
7893 && loop_vinfo->suggested_unroll_factor == 1)
7894 single_defuse_cycle = true;
7895
7896 if (single_defuse_cycle || lane_reduc_code_p)
7897 {
7898 gcc_assert (op.code != COND_EXPR);
7899
7900 /* 4. Supportable by target? */
7901 bool ok = true;
7902
7903 /* 4.1. check support for the operation in the loop
7904
7905 This isn't necessary for the lane reduction codes, since they
7906 can only be produced by pattern matching, and it's up to the
7907 pattern matcher to test for support. The main reason for
7908 specifically skipping this step is to avoid rechecking whether
7909 mixed-sign dot-products can be implemented using signed
7910 dot-products. */
7911 machine_mode vec_mode = TYPE_MODE (vectype_in);
7912 if (!lane_reduc_code_p
7913 && !directly_supported_p (op.code, vectype_in, optab_vector))
7914 {
7915 if (dump_enabled_p ())
7916 dump_printf (MSG_NOTE, "op not supported by target.\n");
7917 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7918 || !vect_can_vectorize_without_simd_p (op.code))
7919 ok = false;
7920 else
7921 if (dump_enabled_p ())
7922 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7923 }
7924
7925 if (vect_emulated_vector_p (vectype_in)
7926 && !vect_can_vectorize_without_simd_p (op.code))
7927 {
7928 if (dump_enabled_p ())
7929 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7930 return false;
7931 }
7932
7933 /* lane-reducing operations have to go through vect_transform_reduction.
7934 For the other cases try without the single cycle optimization. */
7935 if (!ok)
7936 {
7937 if (lane_reduc_code_p)
7938 return false;
7939 else
7940 single_defuse_cycle = false;
7941 }
7942 }
7943 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7944
7945 /* If the reduction stmt is one of the patterns that have lane
7946 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7947 if ((ncopies > 1 && ! single_defuse_cycle)
7948 && lane_reduc_code_p)
7949 {
7950 if (dump_enabled_p ())
7951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7952 "multi def-use cycle not possible for lane-reducing "
7953 "reduction operation\n");
7954 return false;
7955 }
7956
7957 if (slp_node
7958 && !(!single_defuse_cycle
7959 && !lane_reduc_code_p
7960 && reduction_type != FOLD_LEFT_REDUCTION))
7961 for (i = 0; i < (int) op.num_ops; i++)
7962 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7963 {
7964 if (dump_enabled_p ())
7965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966 "incompatible vector types for invariants\n");
7967 return false;
7968 }
7969
7970 if (slp_node)
7971 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7972 else
7973 vec_num = 1;
7974
7975 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7976 reduction_type, ncopies, cost_vec);
7977 /* Cost the reduction op inside the loop if transformed via
7978 vect_transform_reduction. Otherwise this is costed by the
7979 separate vectorizable_* routines. */
7980 if (single_defuse_cycle || lane_reduc_code_p)
7981 {
7982 int factor = 1;
7983 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7984 /* Three dot-products and a subtraction. */
7985 factor = 4;
7986 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7987 stmt_info, 0, vect_body);
7988 }
7989
7990 if (dump_enabled_p ()
7991 && reduction_type == FOLD_LEFT_REDUCTION)
7992 dump_printf_loc (MSG_NOTE, vect_location,
7993 "using an in-order (fold-left) reduction.\n");
7994 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7995 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7996 reductions go through their own vectorizable_* routines. */
7997 if (!single_defuse_cycle
7998 && !lane_reduc_code_p
7999 && reduction_type != FOLD_LEFT_REDUCTION)
8000 {
8001 stmt_vec_info tem
8002 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8003 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8004 {
8005 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8006 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8007 }
8008 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8009 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8010 }
8011 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8012 {
8013 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8014 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8015 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8016
8017 if (reduction_type != FOLD_LEFT_REDUCTION
8018 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8019 && (cond_fn == IFN_LAST
8020 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8021 OPTIMIZE_FOR_SPEED)))
8022 {
8023 if (dump_enabled_p ())
8024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8025 "can't operate on partial vectors because"
8026 " no conditional operation is available.\n");
8027 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8028 }
8029 else if (reduction_type == FOLD_LEFT_REDUCTION
8030 && reduc_fn == IFN_LAST
8031 && !expand_vec_cond_expr_p (vectype_in,
8032 truth_type_for (vectype_in),
8033 SSA_NAME))
8034 {
8035 if (dump_enabled_p ())
8036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8037 "can't operate on partial vectors because"
8038 " no conditional operation is available.\n");
8039 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8040 }
8041 else
8042 {
8043 internal_fn mask_reduc_fn
8044 = get_masked_reduction_fn (reduc_fn, vectype_in);
8045
8046 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8047 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8048 vectype_in, 1);
8049 else
8050 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8051 vectype_in, NULL);
8052 }
8053 }
8054 return true;
8055 }
8056
8057 /* STMT_INFO is a dot-product reduction whose multiplication operands
8058 have different signs. Emit a sequence to emulate the operation
8059 using a series of signed DOT_PROD_EXPRs and return the last
8060 statement generated. VEC_DEST is the result of the vector operation
8061 and VOP lists its inputs. */
8062
8063 static gassign *
8064 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8065 gimple_stmt_iterator *gsi, tree vec_dest,
8066 tree vop[3])
8067 {
8068 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8069 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8070 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8071 gimple *new_stmt;
8072
8073 	  /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
8074 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8075 std::swap (vop[0], vop[1]);
8076
8077 /* Convert all inputs to signed types. */
8078 for (int i = 0; i < 3; ++i)
8079 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8080 {
8081 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8082 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8083 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8084 vop[i] = tmp;
8085 }
8086
8087 /* In the comments below we assume 8-bit inputs for simplicity,
8088 but the approach works for any full integer type. */
8089
8090 /* Create a vector of -128. */
8091 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8092 tree min_narrow = build_vector_from_val (narrow_vectype,
8093 min_narrow_elttype);
8094
8095 /* Create a vector of 64. */
8096 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8097 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8098 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8099
8100 /* Emit: SUB_RES = VOP[0] - 128. */
8101 tree sub_res = make_ssa_name (narrow_vectype);
8102 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8103 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8104
8105 /* Emit:
8106
8107 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8108 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8109 	  STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8110
8111 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8112 Doing the two 64 * y steps first allows more time to compute x. */
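     /* Worked example (numbers chosen only for illustration): for x = 200
	(unsigned) and y = -3 (signed), x * y = -600, and indeed
	(200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600.  */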
8113 tree stage1 = make_ssa_name (wide_vectype);
8114 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8115 vop[1], half_narrow, vop[2]);
8116 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8117
8118 tree stage2 = make_ssa_name (wide_vectype);
8119 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8120 vop[1], half_narrow, stage1);
8121 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8122
8123 tree stage3 = make_ssa_name (wide_vectype);
8124 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8125 sub_res, vop[1], stage2);
8126 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8127
8128 /* Convert STAGE3 to the reduction type. */
8129 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8130 }
8131
8132 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8133 value. */
8134
8135 bool
8136 vect_transform_reduction (loop_vec_info loop_vinfo,
8137 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8138 gimple **vec_stmt, slp_tree slp_node)
8139 {
8140 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8141 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8142 int i;
8143 int ncopies;
8144 int vec_num;
8145
8146 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8147 gcc_assert (reduc_info->is_reduc_info);
8148
8149 if (nested_in_vect_loop_p (loop, stmt_info))
8150 {
8151 loop = loop->inner;
8152 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8153 }
8154
8155 gimple_match_op op;
8156 if (!gimple_extract_op (stmt_info->stmt, &op))
8157 gcc_unreachable ();
8158
8159 /* All uses but the last are expected to be defined in the loop.
8160 The last use is the reduction variable. In case of nested cycle this
8161 assumption is not true: we use reduc_index to record the index of the
8162 reduction variable. */
8163 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8164 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8165 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8166 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8167
8168 if (slp_node)
8169 {
8170 ncopies = 1;
8171 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8172 }
8173 else
8174 {
8175 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8176 vec_num = 1;
8177 }
8178
8179 code_helper code = canonicalize_code (op.code, op.type);
8180 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8181 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8182 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8183 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8184
8185 /* Transform. */
8186 tree new_temp = NULL_TREE;
8187 auto_vec<tree> vec_oprnds0;
8188 auto_vec<tree> vec_oprnds1;
8189 auto_vec<tree> vec_oprnds2;
8190 tree def0;
8191
8192 if (dump_enabled_p ())
8193 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8194
8195 /* FORNOW: Multiple types are not supported for condition. */
8196 if (code == COND_EXPR)
8197 gcc_assert (ncopies == 1);
8198
8199 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8200
8201 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8202 if (reduction_type == FOLD_LEFT_REDUCTION)
8203 {
8204 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8205 gcc_assert (code.is_tree_code ());
8206 return vectorize_fold_left_reduction
8207 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8208 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8209 lens);
8210 }
8211
8212 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8213 gcc_assert (single_defuse_cycle
8214 || code == DOT_PROD_EXPR
8215 || code == WIDEN_SUM_EXPR
8216 || code == SAD_EXPR);
8217
8218 /* Create the destination vector */
8219 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8220 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8221
8222 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8223 single_defuse_cycle && reduc_index == 0
8224 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8225 single_defuse_cycle && reduc_index == 1
8226 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8227 op.num_ops == 3
8228 && !(single_defuse_cycle && reduc_index == 2)
8229 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8230 if (single_defuse_cycle)
8231 {
8232 gcc_assert (!slp_node);
8233 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8234 op.ops[reduc_index],
8235 reduc_index == 0 ? &vec_oprnds0
8236 : (reduc_index == 1 ? &vec_oprnds1
8237 : &vec_oprnds2));
8238 }
8239
8240 bool emulated_mixed_dot_prod
8241 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8242 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8243 {
8244 gimple *new_stmt;
8245 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8246 if (masked_loop_p && !mask_by_cond_expr)
8247 {
8248 /* No conditional ifns have been defined for dot-product yet. */
8249 gcc_assert (code != DOT_PROD_EXPR);
8250
8251 /* Make sure that the reduction accumulator is vop[0]. */
8252 if (reduc_index == 1)
8253 {
8254 gcc_assert (commutative_binary_op_p (code, op.type));
8255 std::swap (vop[0], vop[1]);
8256 }
8257 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8258 vec_num * ncopies, vectype_in, i);
8259 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8260 vop[0], vop[1], vop[0]);
8261 new_temp = make_ssa_name (vec_dest, call);
8262 gimple_call_set_lhs (call, new_temp);
8263 gimple_call_set_nothrow (call, true);
8264 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8265 new_stmt = call;
8266 }
8267 else
8268 {
8269 if (op.num_ops == 3)
8270 vop[2] = vec_oprnds2[i];
8271
8272 if (masked_loop_p && mask_by_cond_expr)
8273 {
8274 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8275 vec_num * ncopies, vectype_in, i);
8276 build_vect_cond_expr (code, vop, mask, gsi);
8277 }
8278
8279 if (emulated_mixed_dot_prod)
8280 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8281 vec_dest, vop);
8282 else if (code.is_internal_fn ())
8283 new_stmt = gimple_build_call_internal (internal_fn (code),
8284 op.num_ops,
8285 vop[0], vop[1], vop[2]);
8286 else
8287 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8288 vop[0], vop[1], vop[2]);
8289 new_temp = make_ssa_name (vec_dest, new_stmt);
8290 gimple_set_lhs (new_stmt, new_temp);
8291 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8292 }
8293
8294 if (slp_node)
8295 slp_node->push_vec_def (new_stmt);
8296 else if (single_defuse_cycle
8297 && i < ncopies - 1)
8298 {
8299 if (reduc_index == 0)
8300 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8301 else if (reduc_index == 1)
8302 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8303 else if (reduc_index == 2)
8304 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8305 }
8306 else
8307 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8308 }
8309
8310 if (!slp_node)
8311 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8312
8313 return true;
8314 }
8315
8316 /* Transform phase of a cycle PHI. */
8317
8318 bool
8319 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8320 stmt_vec_info stmt_info, gimple **vec_stmt,
8321 slp_tree slp_node, slp_instance slp_node_instance)
8322 {
8323 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8324 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8325 int i;
8326 int ncopies;
8327 int j;
8328 bool nested_cycle = false;
8329 int vec_num;
8330
8331 if (nested_in_vect_loop_p (loop, stmt_info))
8332 {
8333 loop = loop->inner;
8334 nested_cycle = true;
8335 }
8336
8337 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8338 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8339 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8340 gcc_assert (reduc_info->is_reduc_info);
8341
8342 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8343 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8344 /* Leave the scalar phi in place. */
8345 return true;
8346
8347 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8348 /* For a nested cycle we do not fill the above. */
8349 if (!vectype_in)
8350 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8351 gcc_assert (vectype_in);
8352
8353 if (slp_node)
8354 {
8355 /* The size vect_schedule_slp_instance computes is off for us. */
8356 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8357 * SLP_TREE_LANES (slp_node), vectype_in);
8358 ncopies = 1;
8359 }
8360 else
8361 {
8362 vec_num = 1;
8363 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8364 }
8365
8366 /* Check whether we should use a single PHI node and accumulate
8367 vectors to one before the backedge. */
8368 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8369 ncopies = 1;
8370
8371 /* Create the destination vector */
8372 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8373 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8374 vectype_out);
8375
8376 /* Get the loop-entry arguments. */
8377 tree vec_initial_def = NULL_TREE;
8378 auto_vec<tree> vec_initial_defs;
8379 if (slp_node)
8380 {
8381 vec_initial_defs.reserve (vec_num);
8382 if (nested_cycle)
8383 {
8384 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8385 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8386 &vec_initial_defs);
8387 }
8388 else
8389 {
8390 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8391 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8392 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8393
8394 unsigned int num_phis = stmts.length ();
8395 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8396 num_phis = 1;
8397 initial_values.reserve (num_phis);
8398 for (unsigned int i = 0; i < num_phis; ++i)
8399 {
8400 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8401 initial_values.quick_push (vect_phi_initial_value (this_phi));
8402 }
8403 if (vec_num == 1)
8404 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8405 if (!initial_values.is_empty ())
8406 {
8407 tree initial_value
8408 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8409 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8410 tree neutral_op
8411 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8412 code, initial_value);
8413 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8414 &vec_initial_defs, vec_num,
8415 stmts.length (), neutral_op);
8416 }
8417 }
8418 }
8419 else
8420 {
8421 	      /* Get at the scalar def before the loop that defines the initial
8422 value of the reduction variable. */
8423 tree initial_def = vect_phi_initial_value (phi);
8424 reduc_info->reduc_initial_values.safe_push (initial_def);
8425 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8426 and we can't use zero for induc_val, use initial_def. Similarly
8427 for REDUC_MIN and initial_def larger than the base. */
8428 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8429 {
8430 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8431 if (TREE_CODE (initial_def) == INTEGER_CST
8432 && !integer_zerop (induc_val)
8433 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8434 && tree_int_cst_lt (initial_def, induc_val))
8435 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8436 && tree_int_cst_lt (induc_val, initial_def))))
8437 {
8438 induc_val = initial_def;
8439 	      /* Communicate to epilogue generation that we used
8440 		 the initial_def.  */
8441 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8442 }
8443 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8444 }
8445 else if (nested_cycle)
8446 {
8447 /* Do not use an adjustment def as that case is not supported
8448 correctly if ncopies is not one. */
8449 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8450 ncopies, initial_def,
8451 &vec_initial_defs);
8452 }
8453 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8454 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8455 /* Fill the initial vector with the initial scalar value. */
8456 vec_initial_def
8457 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8458 initial_def, initial_def);
8459 else
8460 {
8461 if (ncopies == 1)
8462 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8463 if (!reduc_info->reduc_initial_values.is_empty ())
8464 {
8465 initial_def = reduc_info->reduc_initial_values[0];
8466 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8467 tree neutral_op
8468 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8469 code, initial_def);
8470 gcc_assert (neutral_op);
8471 /* Try to simplify the vector initialization by applying an
8472 adjustment after the reduction has been performed. */
8473 if (!reduc_info->reused_accumulator
8474 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8475 && !operand_equal_p (neutral_op, initial_def))
8476 {
8477 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8478 = initial_def;
8479 initial_def = neutral_op;
8480 }
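	      /* E.g. (values invented): a sum reduction whose scalar initial
		 value is 10 is seeded with the neutral value 0 here, and the
		 10 is applied again when the epilogue reduces the vector
		 accumulator to a scalar.  */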
8481 vec_initial_def
8482 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8483 initial_def, neutral_op);
8484 }
8485 }
8486 }
8487
8488 if (vec_initial_def)
8489 {
8490 vec_initial_defs.create (ncopies);
8491 for (i = 0; i < ncopies; ++i)
8492 vec_initial_defs.quick_push (vec_initial_def);
8493 }
8494
8495 if (auto *accumulator = reduc_info->reused_accumulator)
8496 {
8497 tree def = accumulator->reduc_input;
8498 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8499 {
8500 unsigned int nreduc;
8501 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8502 (TREE_TYPE (def)),
8503 TYPE_VECTOR_SUBPARTS (vectype_out),
8504 &nreduc);
8505 gcc_assert (res);
8506 gimple_seq stmts = NULL;
8507 /* Reduce the single vector to a smaller one. */
8508 if (nreduc != 1)
8509 {
8510 /* Perform the reduction in the appropriate type. */
8511 tree rvectype = vectype_out;
8512 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8513 TREE_TYPE (TREE_TYPE (def))))
8514 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8515 TYPE_VECTOR_SUBPARTS
8516 (vectype_out));
8517 def = vect_create_partial_epilog (def, rvectype,
8518 STMT_VINFO_REDUC_CODE
8519 (reduc_info),
8520 &stmts);
8521 }
8522 /* The epilogue loop might use a different vector mode, like
8523 VNx2DI vs. V2DI. */
8524 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8525 {
8526 tree reduc_type = build_vector_type_for_mode
8527 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8528 def = gimple_convert (&stmts, reduc_type, def);
8529 }
8530 /* Adjust the input so we pick up the partially reduced value
8531 for the skip edge in vect_create_epilog_for_reduction. */
8532 accumulator->reduc_input = def;
8533 /* And the reduction could be carried out using a different sign. */
8534 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8535 def = gimple_convert (&stmts, vectype_out, def);
8536 if (loop_vinfo->main_loop_edge)
8537 {
8538 /* While we'd like to insert on the edge this will split
8539 blocks and disturb bookkeeping, we also will eventually
8540 need this on the skip edge. Rely on sinking to
8541 fixup optimal placement and insert in the pred. */
8542 gimple_stmt_iterator gsi
8543 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8544 /* Insert before a cond that eventually skips the
8545 epilogue. */
8546 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8547 gsi_prev (&gsi);
8548 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8549 }
8550 else
8551 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8552 stmts);
8553 }
8554 if (loop_vinfo->main_loop_edge)
8555 vec_initial_defs[0]
8556 = vect_get_main_loop_result (loop_vinfo, def,
8557 vec_initial_defs[0]);
8558 else
8559 vec_initial_defs.safe_push (def);
8560 }
8561
8562 /* Generate the reduction PHIs upfront. */
8563 for (i = 0; i < vec_num; i++)
8564 {
8565 tree vec_init_def = vec_initial_defs[i];
8566 for (j = 0; j < ncopies; j++)
8567 {
8568 /* Create the reduction-phi that defines the reduction
8569 operand. */
8570 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8571
8572 /* Set the loop-entry arg of the reduction-phi. */
8573 if (j != 0 && nested_cycle)
8574 vec_init_def = vec_initial_defs[j];
8575 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8576 UNKNOWN_LOCATION);
8577
8578 /* The loop-latch arg is set in epilogue processing. */
8579
8580 if (slp_node)
8581 slp_node->push_vec_def (new_phi);
8582 else
8583 {
8584 if (j == 0)
8585 *vec_stmt = new_phi;
8586 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8587 }
8588 }
8589 }
8590
8591 return true;
8592 }
8593
8594 /* Vectorizes LC PHIs. */
8595
8596 bool
8597 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8598 stmt_vec_info stmt_info, gimple **vec_stmt,
8599 slp_tree slp_node)
8600 {
8601 if (!loop_vinfo
8602 || !is_a <gphi *> (stmt_info->stmt)
8603 || gimple_phi_num_args (stmt_info->stmt) != 1)
8604 return false;
8605
8606 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8607 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8608 return false;
8609
8610 if (!vec_stmt) /* transformation not required. */
8611 {
8612 /* Deal with copies from externs or constants that disguise as
8613 loop-closed PHI nodes (PR97886). */
8614 if (slp_node
8615 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8616 SLP_TREE_VECTYPE (slp_node)))
8617 {
8618 if (dump_enabled_p ())
8619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8620 "incompatible vector types for invariants\n");
8621 return false;
8622 }
8623 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8624 return true;
8625 }
8626
8627 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8628 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8629 basic_block bb = gimple_bb (stmt_info->stmt);
8630 edge e = single_pred_edge (bb);
8631 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8632 auto_vec<tree> vec_oprnds;
8633 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8634 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8635 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8636 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8637 {
8638 /* Create the vectorized LC PHI node. */
8639 gphi *new_phi = create_phi_node (vec_dest, bb);
8640 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8641 if (slp_node)
8642 slp_node->push_vec_def (new_phi);
8643 else
8644 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8645 }
8646 if (!slp_node)
8647 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8648
8649 return true;
8650 }
8651
8652 /* Vectorizes PHIs. */
8653
8654 bool
8655 vectorizable_phi (vec_info *,
8656 stmt_vec_info stmt_info, gimple **vec_stmt,
8657 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8658 {
8659 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8660 return false;
8661
8662 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8663 return false;
8664
8665 tree vectype = SLP_TREE_VECTYPE (slp_node);
8666
8667 if (!vec_stmt) /* transformation not required. */
8668 {
8669 slp_tree child;
8670 unsigned i;
8671 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8672 if (!child)
8673 {
8674 if (dump_enabled_p ())
8675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8676 "PHI node with unvectorized backedge def\n");
8677 return false;
8678 }
8679 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8680 {
8681 if (dump_enabled_p ())
8682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8683 "incompatible vector types for invariants\n");
8684 return false;
8685 }
8686 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8687 && !useless_type_conversion_p (vectype,
8688 SLP_TREE_VECTYPE (child)))
8689 {
8690 /* With bools we can have mask and non-mask precision vectors
8691 	       or different non-mask precisions.  While pattern recog is
8692 	       supposed to guarantee consistency here, bugs in it can cause
8693 mismatches (PR103489 and PR103800 for example).
8694 Deal with them here instead of ICEing later. */
8695 if (dump_enabled_p ())
8696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8697 "incompatible vector type setup from "
8698 "bool pattern detection\n");
8699 return false;
8700 }
8701
8702 /* For single-argument PHIs assume coalescing which means zero cost
8703 for the scalar and the vector PHIs. This avoids artificially
8704 favoring the vector path (but may pessimize it in some cases). */
8705 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8706 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8707 vector_stmt, stmt_info, vectype, 0, vect_body);
8708 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8709 return true;
8710 }
8711
8712 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8713 basic_block bb = gimple_bb (stmt_info->stmt);
8714 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8715 auto_vec<gphi *> new_phis;
8716 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8717 {
8718 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8719
8720 /* Skip not yet vectorized defs. */
8721 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8722 && SLP_TREE_VEC_DEFS (child).is_empty ())
8723 continue;
8724
8725 auto_vec<tree> vec_oprnds;
8726 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8727 if (!new_phis.exists ())
8728 {
8729 new_phis.create (vec_oprnds.length ());
8730 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8731 {
8732 /* Create the vectorized LC PHI node. */
8733 new_phis.quick_push (create_phi_node (vec_dest, bb));
8734 slp_node->push_vec_def (new_phis[j]);
8735 }
8736 }
8737 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8738 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8739 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8740 }
8741 /* We should have at least one already vectorized child. */
8742 gcc_assert (new_phis.exists ());
8743
8744 return true;
8745 }
8746
8747 /* Vectorizes first order recurrences. An overview of the transformation
8748 is described below. Suppose we have the following loop.
8749
8750 int t = 0;
8751 for (int i = 0; i < n; ++i)
8752 {
8753 b[i] = a[i] - t;
8754 t = a[i];
8755 }
8756
8757 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8758 looks (simplified) like:
8759
8760 scalar.preheader:
8761 init = 0;
8762
8763 scalar.body:
8764 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8765 	 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8766 _1 = a[i]
8767 b[i] = _1 - _2
8768 if (i < n) goto scalar.body
8769
8770 	 In this example, _2 is a recurrence because its value depends on the
8771 previous iteration. We vectorize this as (VF = 4)
8772
8773 vector.preheader:
8774 vect_init = vect_cst(..., ..., ..., 0)
8775
8776 vector.body
8777 i = PHI <0(vector.preheader), i+4(vector.body)>
8778 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8779 vect_2 = a[i, i+1, i+2, i+3];
8780 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8781 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8782 if (..) goto vector.body
8783
8784 In this function, vectorizable_recurr, we code generate both the
8785 vector PHI node and the permute since those together compute the
8786 vectorized value of the scalar PHI. We do not yet have the
8787 backedge value to fill in there nor into the vec_perm. Those
8788 are filled in maybe_set_vectorized_backedge_value and
8789 vect_schedule_scc.
8790
8791 TODO: Since the scalar loop does not have a use of the recurrence
8792 	 outside of the loop, the natural way to implement peeling via
8793 vectorizing the live value doesn't work. For now peeling of loops
8794 with a recurrence is not implemented. For SLP the supported cases
8795 are restricted to those requiring a single vector recurrence PHI. */
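/* A purely illustrative trace of the vector body above (VF = 4,
   a = {7, 9, 4, 6, ...}, values invented): on the first iteration
   vect_1 = {_, _, _, 0}, vect_2 = {7, 9, 4, 6}, the permute selects
   lanes {3, 4, 5, 6} of the concatenation {vect_1, vect_2} giving
   vect_3 = {0, 7, 9, 4}, and the subtraction stores
   b = {7-0, 9-7, 4-9, 6-4}, matching the scalar loop.  */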
8796
8797 bool
8798 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8799 gimple **vec_stmt, slp_tree slp_node,
8800 stmt_vector_for_cost *cost_vec)
8801 {
8802 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8803 return false;
8804
8805 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8806
8807 /* So far we only support first-order recurrence auto-vectorization. */
8808 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8809 return false;
8810
8811 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8812 unsigned ncopies;
8813 if (slp_node)
8814 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8815 else
8816 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8817 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8818 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8819 /* We need to be able to make progress with a single vector. */
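  /* E.g. (numbers invented): with 4-element vectors a single recurrence
     (dist == 1) or an SLP group of two lanes passes this check, while a
     group of three lanes is rejected (3 * 2 == 6 > 4).  */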
8820 if (maybe_gt (dist * 2, nunits))
8821 {
8822 if (dump_enabled_p ())
8823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8824 "first order recurrence exceeds half of "
8825 "a vector\n");
8826 return false;
8827 }
8828
8829 /* First-order recurrence autovectorization needs to handle permutation
8830 with indices = [nunits-1, nunits, nunits+1, ...]. */
8831 vec_perm_builder sel (nunits, 1, 3);
8832 for (int i = 0; i < 3; ++i)
8833 sel.quick_push (nunits - dist + i);
8834 vec_perm_indices indices (sel, 2, nunits);
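  /* E.g. (purely illustrative): for nunits == 4 and a single lane
     (dist == 1) this builds the permutation {3, 4, 5, 6}; for an SLP
     group of two lanes (dist == 2) it builds {2, 3, 4, 5}.  */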
8835
8836 if (!vec_stmt) /* transformation not required. */
8837 {
8838 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8839 indices))
8840 return false;
8841
8842 if (slp_node)
8843 {
8844 /* We eventually need to set a vector type on invariant
8845 arguments. */
8846 unsigned j;
8847 slp_tree child;
8848 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8849 if (!vect_maybe_update_slp_op_vectype
8850 (child, SLP_TREE_VECTYPE (slp_node)))
8851 {
8852 if (dump_enabled_p ())
8853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8854 "incompatible vector types for "
8855 "invariants\n");
8856 return false;
8857 }
8858 }
8859 /* The recurrence costs the initialization vector and one permute
8860 for each copy. */
8861 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8862 stmt_info, 0, vect_prologue);
8863 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8864 stmt_info, 0, vect_body);
8865 if (dump_enabled_p ())
8866 dump_printf_loc (MSG_NOTE, vect_location,
8867 "vectorizable_recurr: inside_cost = %d, "
8868 "prologue_cost = %d .\n", inside_cost,
8869 prologue_cost);
8870
8871 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8872 return true;
8873 }
8874
8875 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8876 basic_block bb = gimple_bb (phi);
8877 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8878 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8879 {
8880 gimple_seq stmts = NULL;
8881 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8882 gsi_insert_seq_on_edge_immediate (pe, stmts);
8883 }
8884 tree vec_init = build_vector_from_val (vectype, preheader);
8885 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8886
8887 /* Create the vectorized first-order PHI node. */
8888 tree vec_dest = vect_get_new_vect_var (vectype,
8889 vect_simple_var, "vec_recur_");
8890 gphi *new_phi = create_phi_node (vec_dest, bb);
8891 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8892
8893   /* Insert the shuffles needed for first-order recurrence autovectorization:
8894 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8895 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8896
8897 /* Insert the required permute after the latch definition. The
8898 second and later operands are tentative and will be updated when we have
8899 vectorized the latch definition. */
8900 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8901 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8902 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8903 gsi_next (&gsi2);
8904
8905 for (unsigned i = 0; i < ncopies; ++i)
8906 {
8907 vec_dest = make_ssa_name (vectype);
8908 gassign *vperm
8909 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8910 i == 0 ? gimple_phi_result (new_phi) : NULL,
8911 NULL, perm);
8912 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8913
8914 if (slp_node)
8915 slp_node->push_vec_def (vperm);
8916 else
8917 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8918 }
8919
8920 if (!slp_node)
8921 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8922 return true;
8923 }
8924
8925 /* Return true if VECTYPE represents a vector that requires lowering
8926 by the vector lowering pass. */
8927
8928 bool
8929 vect_emulated_vector_p (tree vectype)
8930 {
8931 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8932 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8933 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8934 }
8935
8936 /* Return true if we can emulate CODE on an integer mode representation
8937 of a vector. */
8938
8939 bool
8940 vect_can_vectorize_without_simd_p (tree_code code)
8941 {
8942 switch (code)
8943 {
8944 case PLUS_EXPR:
8945 case MINUS_EXPR:
8946 case NEGATE_EXPR:
8947 case BIT_AND_EXPR:
8948 case BIT_IOR_EXPR:
8949 case BIT_XOR_EXPR:
8950 case BIT_NOT_EXPR:
8951 return true;
8952
8953 default:
8954 return false;
8955 }
8956 }
8957
8958 /* Likewise, but taking a code_helper. */
8959
8960 bool
8961 vect_can_vectorize_without_simd_p (code_helper code)
8962 {
8963 return (code.is_tree_code ()
8964 && vect_can_vectorize_without_simd_p (tree_code (code)));
8965 }
8966
8967 /* Create vector init for vectorized iv. */
8968 static tree
8969 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8970 tree step_expr, poly_uint64 nunits,
8971 tree vectype,
8972 enum vect_induction_op_type induction_type)
8973 {
8974 unsigned HOST_WIDE_INT const_nunits;
8975 tree vec_shift, vec_init, new_name;
8976 unsigned i;
8977 tree itype = TREE_TYPE (vectype);
8978
8979   /* iv_loop is the loop to be vectorized.  Create the vector holding the
8980      first VF values of the nonlinear IV (X = init_expr, S = step_expr).  */
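  /* Purely illustrative shapes of the result for 4 lanes (values invented):
     shr/shl with step S:  [X, X >> S, X >> 2*S, X >> 3*S]  (or << for shl),
     neg:                  [X, -X, X, -X],
     mul with step S:      [X, X*S, X*S^2, X*S^3].  */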
8981 new_name = gimple_convert (stmts, itype, init_expr);
8982 switch (induction_type)
8983 {
8984 case vect_step_op_shr:
8985 case vect_step_op_shl:
8986 /* Build the Initial value from shift_expr. */
8987 vec_init = gimple_build_vector_from_val (stmts,
8988 vectype,
8989 new_name);
8990 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8991 build_zero_cst (itype), step_expr);
8992 vec_init = gimple_build (stmts,
8993 (induction_type == vect_step_op_shr
8994 ? RSHIFT_EXPR : LSHIFT_EXPR),
8995 vectype, vec_init, vec_shift);
8996 break;
8997
8998 case vect_step_op_neg:
8999 {
9000 vec_init = gimple_build_vector_from_val (stmts,
9001 vectype,
9002 new_name);
9003 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9004 vectype, vec_init);
9005 /* The encoding has 2 interleaved stepped patterns. */
9006 vec_perm_builder sel (nunits, 2, 3);
9007 sel.quick_grow (6);
9008 for (i = 0; i < 3; i++)
9009 {
9010 sel[2 * i] = i;
9011 sel[2 * i + 1] = i + nunits;
9012 }
9013 vec_perm_indices indices (sel, 2, nunits);
9014 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9015 	   fail when vec_init is a const vector.  In that situation the vec_perm is
9016 really needed. */
9017 tree perm_mask_even
9018 = vect_gen_perm_mask_any (vectype, indices);
9019 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9020 vectype,
9021 vec_init, vec_neg,
9022 perm_mask_even);
9023 }
9024 break;
9025
9026 case vect_step_op_mul:
9027 {
9028 	/* Use an unsigned mult to avoid undefined behavior on signed integer
	   overflow.  */
9029 gcc_assert (nunits.is_constant (&const_nunits));
9030 tree utype = unsigned_type_for (itype);
9031 tree uvectype = build_vector_type (utype,
9032 TYPE_VECTOR_SUBPARTS (vectype));
9033 new_name = gimple_convert (stmts, utype, new_name);
9034 vec_init = gimple_build_vector_from_val (stmts,
9035 uvectype,
9036 new_name);
9037 tree_vector_builder elts (uvectype, const_nunits, 1);
9038 tree elt_step = build_one_cst (utype);
9039
9040 elts.quick_push (elt_step);
9041 for (i = 1; i < const_nunits; i++)
9042 {
9043 	    /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i).  */
9044 elt_step = gimple_build (stmts, MULT_EXPR,
9045 utype, elt_step, step_expr);
9046 elts.quick_push (elt_step);
9047 }
9048 /* Create a vector from [new_name_0, new_name_1, ...,
9049 new_name_nunits-1]. */
9050 tree vec_mul = gimple_build_vector (stmts, &elts);
9051 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9052 vec_init, vec_mul);
9053 vec_init = gimple_convert (stmts, vectype, vec_init);
9054 }
9055 break;
9056
9057 default:
9058 gcc_unreachable ();
9059 }
9060
9061 return vec_init;
9062 }
9063
9064 /* Peel init_expr by skip_niters for induction_type.  */
9065 tree
9066 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9067 tree skip_niters, tree step_expr,
9068 enum vect_induction_op_type induction_type)
9069 {
9070 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9071 tree type = TREE_TYPE (init_expr);
9072 unsigned prec = TYPE_PRECISION (type);
9073 switch (induction_type)
9074 {
9075 case vect_step_op_neg:
9076 if (TREE_INT_CST_LOW (skip_niters) % 2)
9077 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9078 /* else no change. */
9079 break;
9080
9081 case vect_step_op_shr:
9082 case vect_step_op_shl:
9083 skip_niters = gimple_convert (stmts, type, skip_niters);
9084 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9085       /* When the shift amount >= precision, we need to avoid undefined
9086 	 behavior.  The original loop has none, and according to the semantics
9087 	 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr.  */
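      /* E.g. (values invented): for an unsigned char IV with step_expr 3 and
	 skip_niters 3 the combined shift is 9 >= 8 == prec, so init_expr
	 collapses to 0; with skip_niters 2 it is simply shifted by 6.  */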
9088 if (!tree_fits_uhwi_p (step_expr)
9089 || tree_to_uhwi (step_expr) >= prec)
9090 {
9091 if (induction_type == vect_step_op_shl
9092 || TYPE_UNSIGNED (type))
9093 init_expr = build_zero_cst (type);
9094 else
9095 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9096 init_expr,
9097 wide_int_to_tree (type, prec - 1));
9098 }
9099 else
9100 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9101 ? RSHIFT_EXPR : LSHIFT_EXPR),
9102 type, init_expr, step_expr);
9103 break;
9104
9105 case vect_step_op_mul:
9106 {
9107 tree utype = unsigned_type_for (type);
9108 init_expr = gimple_convert (stmts, utype, init_expr);
9109 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9110 wide_int begin = wi::to_wide (step_expr);
9111 for (unsigned i = 0; i != skipn - 1; i++)
9112 begin = wi::mul (begin, wi::to_wide (step_expr));
9113 tree mult_expr = wide_int_to_tree (utype, begin);
9114 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9115 init_expr = gimple_convert (stmts, type, init_expr);
9116 }
9117 break;
9118
9119 default:
9120 gcc_unreachable ();
9121 }
9122
9123 return init_expr;
9124 }
9125
9126 /* Create vector step for vectorized iv. */
9127 static tree
9128 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9129 poly_uint64 vf,
9130 enum vect_induction_op_type induction_type)
9131 {
9132 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9133 tree new_name = NULL;
9134 /* Step should be pow (step, vf) for mult induction. */
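  /* E.g. (values invented): with VF = 4 a mul induction with step 3 gets a
     step of pow (3, 4) = 81, a shift induction with step 2 gets 2 * 4 = 8,
     and neg needs no step at all.  */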
9135 if (induction_type == vect_step_op_mul)
9136 {
9137 gcc_assert (vf.is_constant ());
9138 wide_int begin = wi::to_wide (step_expr);
9139
9140 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9141 begin = wi::mul (begin, wi::to_wide (step_expr));
9142
9143 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9144 }
9145 else if (induction_type == vect_step_op_neg)
9146 /* Do nothing. */
9147 ;
9148 else
9149 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9150 expr, step_expr);
9151 return new_name;
9152 }
9153
9154 static tree
9155 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9156 stmt_vec_info stmt_info,
9157 tree new_name, tree vectype,
9158 enum vect_induction_op_type induction_type)
9159 {
9160 /* No step is needed for neg induction. */
9161 if (induction_type == vect_step_op_neg)
9162 return NULL;
9163
9164 tree t = unshare_expr (new_name);
9165 gcc_assert (CONSTANT_CLASS_P (new_name)
9166 || TREE_CODE (new_name) == SSA_NAME);
9167 tree new_vec = build_vector_from_val (vectype, t);
9168 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9169 new_vec, vectype, NULL);
9170 return vec_step;
9171 }
9172
9173 /* Update the vectorized iv with vec_step; induc_def is the initial value.  */
9174 static tree
9175 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9176 tree induc_def, tree vec_step,
9177 enum vect_induction_op_type induction_type)
9178 {
9179 tree vec_def = induc_def;
9180 switch (induction_type)
9181 {
9182 case vect_step_op_mul:
9183 {
9184 	/* Use an unsigned mult to avoid undefined behavior on signed integer
	   overflow.  */
9185 tree uvectype
9186 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9187 TYPE_VECTOR_SUBPARTS (vectype));
9188 vec_def = gimple_convert (stmts, uvectype, vec_def);
9189 vec_step = gimple_convert (stmts, uvectype, vec_step);
9190 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9191 vec_def, vec_step);
9192 vec_def = gimple_convert (stmts, vectype, vec_def);
9193 }
9194 break;
9195
9196 case vect_step_op_shr:
9197 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9198 vec_def, vec_step);
9199 break;
9200
9201 case vect_step_op_shl:
9202 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9203 vec_def, vec_step);
9204 break;
9205 case vect_step_op_neg:
9206 vec_def = induc_def;
9207 /* Do nothing. */
9208 break;
9209 default:
9210 gcc_unreachable ();
9211 }
9212
9213 return vec_def;
9214
9215 }
9216
9217 /* Function vectorizable_induction
9218
9219    Check if STMT_INFO performs a nonlinear induction computation that can be
9220 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9221 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9222 basic block.
9223 Return true if STMT_INFO is vectorizable in this way. */
9224
9225 static bool
9226 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9227 stmt_vec_info stmt_info,
9228 gimple **vec_stmt, slp_tree slp_node,
9229 stmt_vector_for_cost *cost_vec)
9230 {
9231 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9232 unsigned ncopies;
9233 bool nested_in_vect_loop = false;
9234 class loop *iv_loop;
9235 tree vec_def;
9236 edge pe = loop_preheader_edge (loop);
9237 basic_block new_bb;
9238 tree vec_init, vec_step;
9239 tree new_name;
9240 gimple *new_stmt;
9241 gphi *induction_phi;
9242 tree induc_def, vec_dest;
9243 tree init_expr, step_expr;
9244 tree niters_skip;
9245 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9246 unsigned i;
9247 gimple_stmt_iterator si;
9248
9249 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9250
9251 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9252 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9253 enum vect_induction_op_type induction_type
9254 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9255
9256 gcc_assert (induction_type > vect_step_op_add);
9257
9258 if (slp_node)
9259 ncopies = 1;
9260 else
9261 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9262 gcc_assert (ncopies >= 1);
9263
9264 /* FORNOW. Only handle nonlinear induction in the same loop. */
9265 if (nested_in_vect_loop_p (loop, stmt_info))
9266 {
9267 if (dump_enabled_p ())
9268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9269 "nonlinear induction in nested loop.\n");
9270 return false;
9271 }
9272
9273 iv_loop = loop;
9274 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9275
9276   /* TODO: Support slp for nonlinear iv.  There should be a separate vector iv
9277      update for each iv and a permutation to generate the wanted vector iv.  */
9278 if (slp_node)
9279 {
9280 if (dump_enabled_p ())
9281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9282 "SLP induction not supported for nonlinear"
9283 " induction.\n");
9284 return false;
9285 }
9286
9287 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9288 {
9289 if (dump_enabled_p ())
9290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9291 "floating point nonlinear induction vectorization"
9292 " not supported.\n");
9293 return false;
9294 }
9295
9296 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9297 init_expr = vect_phi_initial_value (phi);
9298 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9299 && TREE_CODE (step_expr) == INTEGER_CST);
9300 /* step_expr should be aligned with init_expr,
9301      i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used.  */
9302 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9303
9304 if (TREE_CODE (init_expr) == INTEGER_CST)
9305 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9306 else
9307 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9308 TREE_TYPE (init_expr)));
9309
9310 switch (induction_type)
9311 {
9312 case vect_step_op_neg:
9313 if (TREE_CODE (init_expr) != INTEGER_CST
9314 && TREE_CODE (init_expr) != REAL_CST)
9315 {
9316 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9317 if (!directly_supported_p (NEGATE_EXPR, vectype))
9318 return false;
9319
9320 /* The encoding has 2 interleaved stepped patterns. */
9321 vec_perm_builder sel (nunits, 2, 3);
9322 machine_mode mode = TYPE_MODE (vectype);
9323 sel.quick_grow (6);
9324 for (i = 0; i < 3; i++)
9325 {
9326 sel[i * 2] = i;
9327 sel[i * 2 + 1] = i + nunits;
9328 }
9329 vec_perm_indices indices (sel, 2, nunits);
9330 if (!can_vec_perm_const_p (mode, mode, indices))
9331 return false;
9332 }
9333 break;
9334
9335 case vect_step_op_mul:
9336 {
9337 /* Check for backend support of MULT_EXPR. */
9338 if (!directly_supported_p (MULT_EXPR, vectype))
9339 return false;
9340
9341 	/* ?? How to construct the vector step for variable-length vectors:
9342 	   [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9343 if (!vf.is_constant ())
9344 return false;
9345 }
9346 break;
9347
9348 case vect_step_op_shr:
9349 /* Check for backend support of RSHIFT_EXPR. */
9350 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9351 return false;
9352
9353       /* Don't shift more than the type precision to avoid undefined behavior.  */
9354 if (!tree_fits_uhwi_p (step_expr)
9355 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9356 TYPE_PRECISION (TREE_TYPE (init_expr))))
9357 return false;
9358 break;
9359
9360 case vect_step_op_shl:
9361       /* Check for backend support of LSHIFT_EXPR.  */
9362 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9363 return false;
9364
9365       /* Don't shift more than the type precision to avoid undefined behavior.  */
9366 if (!tree_fits_uhwi_p (step_expr)
9367 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9368 TYPE_PRECISION (TREE_TYPE (init_expr))))
9369 return false;
9370
9371 break;
9372
9373 default:
9374 gcc_unreachable ();
9375 }
9376
9377 if (!vec_stmt) /* transformation not required. */
9378 {
9379 unsigned inside_cost = 0, prologue_cost = 0;
9380 /* loop cost for vec_loop. Neg induction doesn't have any
9381 inside_cost. */
9382 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9383 stmt_info, 0, vect_body);
9384
9385       /* Neg induction doesn't have any inside_cost,
9386 	 so zero it out.  */
9387 if (induction_type == vect_step_op_neg)
9388 inside_cost = 0;
9389
9390 /* prologue cost for vec_init and vec_step. */
9391 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9392 stmt_info, 0, vect_prologue);
9393
9394 if (dump_enabled_p ())
9395 dump_printf_loc (MSG_NOTE, vect_location,
9396 "vect_model_induction_cost: inside_cost = %d, "
9397 "prologue_cost = %d. \n", inside_cost,
9398 prologue_cost);
9399
9400 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9401 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9402 return true;
9403 }
9404
9405 /* Transform. */
9406
9407 /* Compute a vector variable, initialized with the first VF values of
9408      the induction variable.  E.g., for an iv with IV_PHI='X' and
9409      step S, for a vector of 4 units and a right-shift evolution, we want
9410      to compute: [X, X >> S, X >> 2*S, X >> 3*S].  */
9411
9412 if (dump_enabled_p ())
9413 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9414
9415 pe = loop_preheader_edge (iv_loop);
9416 /* Find the first insertion point in the BB. */
9417 basic_block bb = gimple_bb (phi);
9418 si = gsi_after_labels (bb);
9419
9420 gimple_seq stmts = NULL;
9421
9422 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9423 /* If we are using the loop mask to "peel" for alignment then we need
9424 to adjust the start value here. */
9425 if (niters_skip != NULL_TREE)
9426 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9427 step_expr, induction_type);
9428
9429 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9430 step_expr, nunits, vectype,
9431 induction_type);
9432 if (stmts)
9433 {
9434 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9435 gcc_assert (!new_bb);
9436 }
9437
9438 stmts = NULL;
9439 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9440 vf, induction_type);
9441 if (stmts)
9442 {
9443 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9444 gcc_assert (!new_bb);
9445 }
9446
9447 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9448 new_name, vectype,
9449 induction_type);
9450 /* Create the following def-use cycle:
9451 loop prolog:
9452 vec_init = ...
9453 vec_step = ...
9454 loop:
9455 vec_iv = PHI <vec_init, vec_loop>
9456 ...
9457 STMT
9458 ...
9459 vec_loop = vec_iv + vec_step; */
9460
9461 /* Create the induction-phi that defines the induction-operand. */
9462 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9463 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9464 induc_def = PHI_RESULT (induction_phi);
9465
9466 /* Create the iv update inside the loop. */
9467 stmts = NULL;
9468 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9469 induc_def, vec_step,
9470 induction_type);
9471
9472 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9473 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9474
9475 /* Set the arguments of the phi node: */
9476 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9477 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9478 UNKNOWN_LOCATION);
9479
9480 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9481 *vec_stmt = induction_phi;
9482
9483   /* In case the vectorization factor (VF) is bigger than the number
9484      of elements that we can fit in a vectype (nunits), we have to generate
9485      more than one vector stmt, i.e. we need to "unroll" the
9486 vector stmt by a factor VF/nunits. For more details see documentation
9487 in vectorizable_operation. */
9488
9489 if (ncopies > 1)
9490 {
9491 stmts = NULL;
9492 /* FORNOW. This restriction should be relaxed. */
9493 gcc_assert (!nested_in_vect_loop);
9494
9495 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9496 nunits, induction_type);
9497
9498 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9499 new_name, vectype,
9500 induction_type);
9501 vec_def = induc_def;
9502 for (i = 1; i < ncopies; i++)
9503 {
9504 /* vec_i = vec_prev + vec_step. */
9505 stmts = NULL;
9506 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9507 vec_def, vec_step,
9508 induction_type);
9509 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9510 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9511 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9512 }
9513 }
9514
9515 if (dump_enabled_p ())
9516 dump_printf_loc (MSG_NOTE, vect_location,
9517 "transform induction: created def-use cycle: %G%G",
9518 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9519
9520 return true;
9521 }
9522
9523 /* Function vectorizable_induction
9524
9525 Check if STMT_INFO performs an induction computation that can be vectorized.
9526 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9527 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9528 Return true if STMT_INFO is vectorizable in this way. */
9529
9530 bool
9531 vectorizable_induction (loop_vec_info loop_vinfo,
9532 stmt_vec_info stmt_info,
9533 gimple **vec_stmt, slp_tree slp_node,
9534 stmt_vector_for_cost *cost_vec)
9535 {
9536 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9537 unsigned ncopies;
9538 bool nested_in_vect_loop = false;
9539 class loop *iv_loop;
9540 tree vec_def;
9541 edge pe = loop_preheader_edge (loop);
9542 basic_block new_bb;
9543 tree new_vec, vec_init, vec_step, t;
9544 tree new_name;
9545 gimple *new_stmt;
9546 gphi *induction_phi;
9547 tree induc_def, vec_dest;
9548 tree init_expr, step_expr;
9549 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9550 unsigned i;
9551 tree expr;
9552 gimple_stmt_iterator si;
9553 enum vect_induction_op_type induction_type
9554 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9555
9556 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9557 if (!phi)
9558 return false;
9559
9560 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9561 return false;
9562
9563 /* Make sure it was recognized as induction computation. */
9564 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9565 return false;
9566
9567 /* Handle nonlinear induction in a separate place. */
9568 if (induction_type != vect_step_op_add)
9569 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9570 vec_stmt, slp_node, cost_vec);
9571
9572 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9573 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9574
9575 if (slp_node)
9576 ncopies = 1;
9577 else
9578 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9579 gcc_assert (ncopies >= 1);
9580
9581 /* FORNOW. These restrictions should be relaxed. */
9582 if (nested_in_vect_loop_p (loop, stmt_info))
9583 {
9584 imm_use_iterator imm_iter;
9585 use_operand_p use_p;
9586 gimple *exit_phi;
9587 edge latch_e;
9588 tree loop_arg;
9589
9590 if (ncopies > 1)
9591 {
9592 if (dump_enabled_p ())
9593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9594 "multiple types in nested loop.\n");
9595 return false;
9596 }
9597
9598 exit_phi = NULL;
9599 latch_e = loop_latch_edge (loop->inner);
9600 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9601 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9602 {
9603 gimple *use_stmt = USE_STMT (use_p);
9604 if (is_gimple_debug (use_stmt))
9605 continue;
9606
9607 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9608 {
9609 exit_phi = use_stmt;
9610 break;
9611 }
9612 }
9613 if (exit_phi)
9614 {
9615 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9616 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9617 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9618 {
9619 if (dump_enabled_p ())
9620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9621 "inner-loop induction only used outside "
9622 "of the outer vectorized loop.\n");
9623 return false;
9624 }
9625 }
9626
9627 nested_in_vect_loop = true;
9628 iv_loop = loop->inner;
9629 }
9630 else
9631 iv_loop = loop;
9632 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9633
9634 if (slp_node && !nunits.is_constant ())
9635 {
9636 /* The current SLP code creates the step value element-by-element. */
9637 if (dump_enabled_p ())
9638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9639 "SLP induction not supported for variable-length"
9640 " vectors.\n");
9641 return false;
9642 }
9643
9644 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9645 {
9646 if (dump_enabled_p ())
9647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9648 "floating point induction vectorization disabled\n");
9649 return false;
9650 }
9651
9652 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9653 gcc_assert (step_expr != NULL_TREE);
9654 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9655
9656 /* Check for backend support of PLUS/MINUS_EXPR. */
9657 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9658 || !directly_supported_p (MINUS_EXPR, step_vectype))
9659 return false;
9660
9661 if (!vec_stmt) /* transformation not required. */
9662 {
9663 unsigned inside_cost = 0, prologue_cost = 0;
9664 if (slp_node)
9665 {
9666 /* We eventually need to set a vector type on invariant
9667 arguments. */
9668 unsigned j;
9669 slp_tree child;
9670 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9671 if (!vect_maybe_update_slp_op_vectype
9672 (child, SLP_TREE_VECTYPE (slp_node)))
9673 {
9674 if (dump_enabled_p ())
9675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9676 "incompatible vector types for "
9677 "invariants\n");
9678 return false;
9679 }
9680 /* loop cost for vec_loop. */
9681 inside_cost
9682 = record_stmt_cost (cost_vec,
9683 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9684 vector_stmt, stmt_info, 0, vect_body);
9685 /* prologue cost for vec_init (if not nested) and step. */
9686 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9687 scalar_to_vec,
9688 stmt_info, 0, vect_prologue);
9689 }
9690 else /* if (!slp_node) */
9691 {
9692 /* loop cost for vec_loop. */
9693 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9694 stmt_info, 0, vect_body);
9695 /* prologue cost for vec_init and vec_step. */
9696 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9697 stmt_info, 0, vect_prologue);
9698 }
9699 if (dump_enabled_p ())
9700 dump_printf_loc (MSG_NOTE, vect_location,
9701 "vect_model_induction_cost: inside_cost = %d, "
9702 "prologue_cost = %d .\n", inside_cost,
9703 prologue_cost);
9704
9705 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9706 DUMP_VECT_SCOPE ("vectorizable_induction");
9707 return true;
9708 }
9709
9710 /* Transform. */
9711
9712 /* Compute a vector variable, initialized with the first VF values of
9713 the induction variable. E.g., for an iv with IV_PHI='X' and
9714 evolution S, for a vector of 4 units, we want to compute:
9715 [X, X + S, X + 2*S, X + 3*S]. */
9716
9717 if (dump_enabled_p ())
9718 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9719
9720 pe = loop_preheader_edge (iv_loop);
9721 /* Find the first insertion point in the BB. */
9722 basic_block bb = gimple_bb (phi);
9723 si = gsi_after_labels (bb);
9724
9725 /* For SLP induction we have to generate several IVs as for example
9726 with group size 3 we need
9727 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9728 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9729 if (slp_node)
9730 {
9731 /* Enforced above. */
9732 unsigned int const_nunits = nunits.to_constant ();
9733
9734 /* The initial values are vectorized, but any lanes > group_size
9735 need adjustment. */
9736 slp_tree init_node
9737 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9738
9739 /* Gather steps. Since we do not vectorize inductions as
9740 cycles we have to reconstruct the step from SCEV data. */
9741 unsigned group_size = SLP_TREE_LANES (slp_node);
9742 tree *steps = XALLOCAVEC (tree, group_size);
9743 tree *inits = XALLOCAVEC (tree, group_size);
9744 stmt_vec_info phi_info;
9745 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9746 {
9747 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9748 if (!init_node)
9749 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9750 pe->dest_idx);
9751 }
9752
9753 /* Now generate the IVs. */
9754 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9755 gcc_assert ((const_nunits * nvects) % group_size == 0);
9756 unsigned nivs;
9757 if (nested_in_vect_loop)
9758 nivs = nvects;
9759 else
9760 {
9761 /* Compute the number of distinct IVs we need. First reduce
9762 group_size if it is a multiple of const_nunits so we get
9763 one IV for a group_size of 4 but const_nunits 2. */
9764 unsigned group_sizep = group_size;
9765 if (group_sizep % const_nunits == 0)
9766 group_sizep = group_sizep / const_nunits;
9767 nivs = least_common_multiple (group_sizep,
9768 const_nunits) / const_nunits;
9769 }
9770 tree stept = TREE_TYPE (step_vectype);
9771 tree lupdate_mul = NULL_TREE;
9772 if (!nested_in_vect_loop)
9773 {
9774 /* The number of iterations covered in one vector iteration. */
9775 unsigned lup_mul = (nvects * const_nunits) / group_size;
9776 lupdate_mul
9777 = build_vector_from_val (step_vectype,
9778 SCALAR_FLOAT_TYPE_P (stept)
9779 ? build_real_from_wide (stept, lup_mul,
9780 UNSIGNED)
9781 : build_int_cstu (stept, lup_mul));
9782 }
9783 tree peel_mul = NULL_TREE;
9784 gimple_seq init_stmts = NULL;
9785 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9786 {
9787 if (SCALAR_FLOAT_TYPE_P (stept))
9788 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9789 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9790 else
9791 peel_mul = gimple_convert (&init_stmts, stept,
9792 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9793 peel_mul = gimple_build_vector_from_val (&init_stmts,
9794 step_vectype, peel_mul);
9795 }
9796 unsigned ivn;
9797 auto_vec<tree> vec_steps;
9798 for (ivn = 0; ivn < nivs; ++ivn)
9799 {
9800 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9801 tree_vector_builder init_elts (vectype, const_nunits, 1);
9802 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9803 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9804 {
9805 /* The scalar steps of the IVs. */
9806 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9807 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9808 step_elts.quick_push (elt);
9809 if (!init_node)
9810 {
9811 /* The scalar inits of the IVs if not vectorized. */
9812 elt = inits[(ivn*const_nunits + eltn) % group_size];
9813 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9814 TREE_TYPE (elt)))
9815 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9816 TREE_TYPE (vectype), elt);
9817 init_elts.quick_push (elt);
9818 }
9819 /* The number of steps to add to the initial values. */
9820 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9821 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9822 ? build_real_from_wide (stept,
9823 mul_elt, UNSIGNED)
9824 : build_int_cstu (stept, mul_elt));
9825 }
9826 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9827 vec_steps.safe_push (vec_step);
9828 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9829 if (peel_mul)
9830 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9831 step_mul, peel_mul);
9832 if (!init_node)
9833 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9834
9835 /* Create the induction-phi that defines the induction-operand. */
9836 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9837 "vec_iv_");
9838 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9839 induc_def = PHI_RESULT (induction_phi);
9840
9841 /* Create the iv update inside the loop */
9842 tree up = vec_step;
9843 if (lupdate_mul)
9844 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9845 vec_step, lupdate_mul);
9846 gimple_seq stmts = NULL;
9847 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9848 vec_def = gimple_build (&stmts,
9849 PLUS_EXPR, step_vectype, vec_def, up);
9850 vec_def = gimple_convert (&stmts, vectype, vec_def);
9851 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9852 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9853 UNKNOWN_LOCATION);
9854
9855 if (init_node)
9856 vec_init = vect_get_slp_vect_def (init_node, ivn);
9857 if (!nested_in_vect_loop
9858 && !integer_zerop (step_mul))
9859 {
9860 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9861 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9862 vec_step, step_mul);
9863 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9864 vec_def, up);
9865 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9866 }
9867
9868 /* Set the arguments of the phi node: */
9869 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9870
9871 slp_node->push_vec_def (induction_phi);
9872 }
9873 if (!nested_in_vect_loop)
9874 {
9875 /* Fill up to the number of vectors we need for the whole group. */
9876 nivs = least_common_multiple (group_size,
9877 const_nunits) / const_nunits;
9878 vec_steps.reserve (nivs-ivn);
9879 for (; ivn < nivs; ++ivn)
9880 {
9881 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9882 vec_steps.quick_push (vec_steps[0]);
9883 }
9884 }
9885
9886 /* Re-use IVs when we can. We are generating further vector
9887 stmts by adding VF' * stride to the IVs generated above. */
9888 if (ivn < nvects)
9889 {
9890 unsigned vfp
9891 = least_common_multiple (group_size, const_nunits) / group_size;
9892 tree lupdate_mul
9893 = build_vector_from_val (step_vectype,
9894 SCALAR_FLOAT_TYPE_P (stept)
9895 ? build_real_from_wide (stept,
9896 vfp, UNSIGNED)
9897 : build_int_cstu (stept, vfp));
9898 for (; ivn < nvects; ++ivn)
9899 {
9900 gimple *iv
9901 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9902 tree def = gimple_get_lhs (iv);
9903 if (ivn < 2*nivs)
9904 vec_steps[ivn - nivs]
9905 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9906 vec_steps[ivn - nivs], lupdate_mul);
9907 gimple_seq stmts = NULL;
9908 def = gimple_convert (&stmts, step_vectype, def);
9909 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9910 def, vec_steps[ivn % nivs]);
9911 def = gimple_convert (&stmts, vectype, def);
9912 if (gimple_code (iv) == GIMPLE_PHI)
9913 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9914 else
9915 {
9916 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9917 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9918 }
9919 slp_node->push_vec_def (def);
9920 }
9921 }
9922
9923 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9924 gcc_assert (!new_bb);
9925
9926 return true;
9927 }
9928
9929 init_expr = vect_phi_initial_value (phi);
9930
9931 gimple_seq stmts = NULL;
9932 if (!nested_in_vect_loop)
9933 {
9934 /* Convert the initial value to the IV update type. */
9935 tree new_type = TREE_TYPE (step_expr);
9936 init_expr = gimple_convert (&stmts, new_type, init_expr);
9937
9938 /* If we are using the loop mask to "peel" for alignment then we need
9939 to adjust the start value here. */
9940 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9941 if (skip_niters != NULL_TREE)
9942 {
9943 if (FLOAT_TYPE_P (vectype))
9944 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9945 skip_niters);
9946 else
9947 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9948 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9949 skip_niters, step_expr);
9950 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9951 init_expr, skip_step);
9952 }
9953 }
9954
9955 if (stmts)
9956 {
9957 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9958 gcc_assert (!new_bb);
9959 }
9960
9961 /* Create the vector that holds the initial_value of the induction. */
9962 if (nested_in_vect_loop)
9963 {
9964 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9965 been created during vectorization of previous stmts. We obtain it
9966 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9967 auto_vec<tree> vec_inits;
9968 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9969 init_expr, &vec_inits);
9970 vec_init = vec_inits[0];
9971 /* If the initial value is not of proper type, convert it. */
9972 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9973 {
9974 new_stmt
9975 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9976 vect_simple_var,
9977 "vec_iv_"),
9978 VIEW_CONVERT_EXPR,
9979 build1 (VIEW_CONVERT_EXPR, vectype,
9980 vec_init));
9981 vec_init = gimple_assign_lhs (new_stmt);
9982 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9983 new_stmt);
9984 gcc_assert (!new_bb);
9985 }
9986 }
9987 else
9988 {
9989 /* iv_loop is the loop to be vectorized. Create:
9990 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9991 stmts = NULL;
9992 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9993
9994 unsigned HOST_WIDE_INT const_nunits;
9995 if (nunits.is_constant (&const_nunits))
9996 {
9997 tree_vector_builder elts (step_vectype, const_nunits, 1);
9998 elts.quick_push (new_name);
9999 for (i = 1; i < const_nunits; i++)
10000 {
10001 /* Create: new_name_i = new_name + step_expr */
10002 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10003 new_name, step_expr);
10004 elts.quick_push (new_name);
10005 }
10006 /* Create a vector from [new_name_0, new_name_1, ...,
10007 new_name_nunits-1] */
10008 vec_init = gimple_build_vector (&stmts, &elts);
10009 }
10010 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10011 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10012 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10013 new_name, step_expr);
10014 else
10015 {
10016 /* Build:
10017 [base, base, base, ...]
10018 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10019 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10020 gcc_assert (flag_associative_math);
10021 tree index = build_index_vector (step_vectype, 0, 1);
10022 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10023 new_name);
10024 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10025 step_expr);
10026 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10027 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10028 vec_init, step_vec);
10029 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10030 vec_init, base_vec);
10031 }
10032 vec_init = gimple_convert (&stmts, vectype, vec_init);
10033
10034 if (stmts)
10035 {
10036 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10037 gcc_assert (!new_bb);
10038 }
10039 }
10040
10041
10042 /* Create the vector that holds the step of the induction. */
10043 if (nested_in_vect_loop)
10044 /* iv_loop is nested in the loop to be vectorized. Generate:
10045 vec_step = [S, S, S, S] */
10046 new_name = step_expr;
10047 else
10048 {
10049 /* iv_loop is the loop to be vectorized. Generate:
10050 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10051 gimple_seq seq = NULL;
10052 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10053 {
10054 expr = build_int_cst (integer_type_node, vf);
10055 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10056 }
10057 else
10058 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10059 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10060 expr, step_expr);
10061 if (seq)
10062 {
10063 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10064 gcc_assert (!new_bb);
10065 }
10066 }
10067
10068 t = unshare_expr (new_name);
10069 gcc_assert (CONSTANT_CLASS_P (new_name)
10070 || TREE_CODE (new_name) == SSA_NAME);
10071 new_vec = build_vector_from_val (step_vectype, t);
10072 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10073 new_vec, step_vectype, NULL);
10074
10075
10076 /* Create the following def-use cycle:
10077 loop prolog:
10078 vec_init = ...
10079 vec_step = ...
10080 loop:
10081 vec_iv = PHI <vec_init, vec_loop>
10082 ...
10083 STMT
10084 ...
10085 vec_loop = vec_iv + vec_step; */
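/* As a purely illustrative instance (not the exact IL the vectorizer
   emits), for an integer IV with initial value 0 and step 3 and a
   vectorization factor of 4 the cycle above becomes roughly:
     loop prolog:
       vec_init = { 0, 3, 6, 9 }
       vec_step = { 12, 12, 12, 12 }
     loop:
       vec_iv = PHI <vec_init, vec_loop>
       ...
       vec_loop = vec_iv + vec_step;  */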
10086
10087 /* Create the induction-phi that defines the induction-operand. */
10088 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10089 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10090 induc_def = PHI_RESULT (induction_phi);
10091
10092 /* Create the iv update inside the loop */
10093 stmts = NULL;
10094 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10095 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10096 vec_def = gimple_convert (&stmts, vectype, vec_def);
10097 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10098 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10099
10100 /* Set the arguments of the phi node: */
10101 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10102 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10103 UNKNOWN_LOCATION);
10104
10105 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10106 *vec_stmt = induction_phi;
10107
10108 /* In case the vectorization factor (VF) is bigger than the number
10109 of elements that we can fit in a vectype (nunits), we have to generate
10110 more than one vector stmt, i.e. we need to "unroll" the
10111 vector stmt by a factor VF/nunits. For more details see the documentation
10112 in vectorizable_operation. */
10113
10114 if (ncopies > 1)
10115 {
10116 gimple_seq seq = NULL;
10117 /* FORNOW. This restriction should be relaxed. */
10118 gcc_assert (!nested_in_vect_loop);
10119
10120 /* Create the vector that holds the step of the induction. */
10121 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10122 {
10123 expr = build_int_cst (integer_type_node, nunits);
10124 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10125 }
10126 else
10127 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10128 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10129 expr, step_expr);
10130 if (seq)
10131 {
10132 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10133 gcc_assert (!new_bb);
10134 }
10135
10136 t = unshare_expr (new_name);
10137 gcc_assert (CONSTANT_CLASS_P (new_name)
10138 || TREE_CODE (new_name) == SSA_NAME);
10139 new_vec = build_vector_from_val (step_vectype, t);
10140 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10141 new_vec, step_vectype, NULL);
10142
10143 vec_def = induc_def;
10144 for (i = 1; i < ncopies + 1; i++)
10145 {
10146 /* vec_i = vec_prev + vec_step */
10147 gimple_seq stmts = NULL;
10148 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10149 vec_def = gimple_build (&stmts,
10150 PLUS_EXPR, step_vectype, vec_def, vec_step);
10151 vec_def = gimple_convert (&stmts, vectype, vec_def);
10152
10153 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10154 if (i < ncopies)
10155 {
10156 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10157 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10158 }
10159 else
10160 {
10161 /* vec_1 = vec_iv + (VF/n * S)
10162 vec_2 = vec_1 + (VF/n * S)
10163 ...
10164 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10165
10166 vec_n is used as vec_loop to save the large step register and
10167 related operations. */
10168 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10169 UNKNOWN_LOCATION);
10170 }
10171 }
10172 }
10173
10174 if (dump_enabled_p ())
10175 dump_printf_loc (MSG_NOTE, vect_location,
10176 "transform induction: created def-use cycle: %G%G",
10177 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10178
10179 return true;
10180 }
10181
10182 /* Function vectorizable_live_operation.
10183
10184 STMT_INFO computes a value that is used outside the loop. Check if
10185 it can be supported. */
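/* As an illustrative example (a hypothetical loop, not taken from this
   file), the final value of a variable assigned inside the loop and used
   after it is such a live operation:

     for (i = 0; i < n; i++)
       last = a[i];
     use (last);

   After vectorization the scalar result is recovered by extracting the
   relevant lane from the vector definition, either with a BIT_FIELD_REF
   or, for fully-masked loops, with the EXTRACT_LAST internal function, as
   done in the transform code below.  */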
10186
10187 bool
10188 vectorizable_live_operation (vec_info *vinfo,
10189 stmt_vec_info stmt_info,
10190 gimple_stmt_iterator *gsi,
10191 slp_tree slp_node, slp_instance slp_node_instance,
10192 int slp_index, bool vec_stmt_p,
10193 stmt_vector_for_cost *cost_vec)
10194 {
10195 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10196 imm_use_iterator imm_iter;
10197 tree lhs, lhs_type, bitsize;
10198 tree vectype = (slp_node
10199 ? SLP_TREE_VECTYPE (slp_node)
10200 : STMT_VINFO_VECTYPE (stmt_info));
10201 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10202 int ncopies;
10203 gimple *use_stmt;
10204 auto_vec<tree> vec_oprnds;
10205 int vec_entry = 0;
10206 poly_uint64 vec_index = 0;
10207
10208 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10209
10210 /* If a stmt of a reduction is live, vectorize it via
10211 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10212 validity so just trigger the transform here. */
10213 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10214 {
10215 if (!vec_stmt_p)
10216 return true;
10217 if (slp_node)
10218 {
10219 /* For reduction chains the meta-info is attached to
10220 the group leader. */
10221 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10222 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10223 /* For SLP reductions we vectorize the epilogue for
10224 all involved stmts together. */
10225 else if (slp_index != 0)
10226 return true;
10227 }
10228 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10229 gcc_assert (reduc_info->is_reduc_info);
10230 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10231 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10232 return true;
10233 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10234 slp_node_instance);
10235 return true;
10236 }
10237
10238 /* If STMT is not relevant and it is a simple assignment and its inputs are
10239 invariant then it can remain in place, unvectorized. The original last
10240 scalar value that it computes will be used. */
10241 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10242 {
10243 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10244 if (dump_enabled_p ())
10245 dump_printf_loc (MSG_NOTE, vect_location,
10246 "statement is simple and uses invariant. Leaving in "
10247 "place.\n");
10248 return true;
10249 }
10250
10251 if (slp_node)
10252 ncopies = 1;
10253 else
10254 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10255
10256 if (slp_node)
10257 {
10258 gcc_assert (slp_index >= 0);
10259
10260 /* Get the last occurrence of the scalar index from the concatenation of
10261 all the slp vectors. Calculate which slp vector it is and the index
10262 within. */
10263 int num_scalar = SLP_TREE_LANES (slp_node);
10264 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10265 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10266
10267 /* Calculate which vector contains the result, and which lane of
10268 that vector we need. */
10269 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10270 {
10271 if (dump_enabled_p ())
10272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10273 "Cannot determine which vector holds the"
10274 " final result.\n");
10275 return false;
10276 }
10277 }
10278
10279 if (!vec_stmt_p)
10280 {
10281 /* No transformation required. */
10282 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10283 {
10284 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10285 OPTIMIZE_FOR_SPEED))
10286 {
10287 if (dump_enabled_p ())
10288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10289 "can't operate on partial vectors "
10290 "because the target doesn't support extract "
10291 "last reduction.\n");
10292 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10293 }
10294 else if (slp_node)
10295 {
10296 if (dump_enabled_p ())
10297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10298 "can't operate on partial vectors "
10299 "because an SLP statement is live after "
10300 "the loop.\n");
10301 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10302 }
10303 else if (ncopies > 1)
10304 {
10305 if (dump_enabled_p ())
10306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10307 "can't operate on partial vectors "
10308 "because ncopies is greater than 1.\n");
10309 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10310 }
10311 else
10312 {
10313 gcc_assert (ncopies == 1 && !slp_node);
10314 vect_record_loop_mask (loop_vinfo,
10315 &LOOP_VINFO_MASKS (loop_vinfo),
10316 1, vectype, NULL);
10317 }
10318 }
10319 /* ??? Enable for loop costing as well. */
10320 if (!loop_vinfo)
10321 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10322 0, vect_epilogue);
10323 return true;
10324 }
10325
10326 /* Use the lhs of the original scalar statement. */
10327 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10328 if (dump_enabled_p ())
10329 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10330 "stmt %G", stmt);
10331
10332 lhs = gimple_get_lhs (stmt);
10333 lhs_type = TREE_TYPE (lhs);
10334
10335 bitsize = vector_element_bits_tree (vectype);
10336
10337 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10338 tree vec_lhs, bitstart;
10339 gimple *vec_stmt;
10340 if (slp_node)
10341 {
10342 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
10343
10344 /* Get the correct slp vectorized stmt. */
10345 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10346 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10347
10348 /* Get entry to use. */
10349 bitstart = bitsize_int (vec_index);
10350 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10351 }
10352 else
10353 {
10354 /* For multiple copies, get the last copy. */
10355 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10356 vec_lhs = gimple_get_lhs (vec_stmt);
10357
10358 /* Get the last lane in the vector. */
10359 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10360 }
10361
10362 if (loop_vinfo)
10363 {
10364 /* To ensure the VEC_LHS used by the lane-extraction stmts satisfies the
10365 loop-closed PHI requirement, insert one phi node for it. It looks like:
10366 loop;
10367 BB:
10368 # lhs' = PHI <lhs>
10369 ==>
10370 loop;
10371 BB:
10372 # vec_lhs' = PHI <vec_lhs>
10373 new_tree = lane_extract <vec_lhs', ...>;
10374 lhs' = new_tree; */
10375
10376 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10377 basic_block exit_bb = single_exit (loop)->dest;
10378 gcc_assert (single_pred_p (exit_bb));
10379
10380 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10381 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10382 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10383
10384 gimple_seq stmts = NULL;
10385 tree new_tree;
10386 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10387 {
10388 /* Emit:
10389
10390 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10391
10392 where VEC_LHS is the vectorized live-out result and MASK is
10393 the loop mask for the final iteration. */
10394 gcc_assert (ncopies == 1 && !slp_node);
10395 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10396 tree mask = vect_get_loop_mask (loop_vinfo, gsi,
10397 &LOOP_VINFO_MASKS (loop_vinfo),
10398 1, vectype, 0);
10399 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10400 mask, vec_lhs_phi);
10401
10402 /* Convert the extracted vector element to the scalar type. */
10403 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10404 }
10405 else
10406 {
10407 tree bftype = TREE_TYPE (vectype);
10408 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10409 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10410 new_tree = build3 (BIT_FIELD_REF, bftype,
10411 vec_lhs_phi, bitsize, bitstart);
10412 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10413 &stmts, true, NULL_TREE);
10414 }
10415
10416 if (stmts)
10417 {
10418 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10419 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10420
10421 /* Remove existing phi from lhs and create one copy from new_tree. */
10422 tree lhs_phi = NULL_TREE;
10423 gimple_stmt_iterator gsi;
10424 for (gsi = gsi_start_phis (exit_bb);
10425 !gsi_end_p (gsi); gsi_next (&gsi))
10426 {
10427 gimple *phi = gsi_stmt (gsi);
10428 if ((gimple_phi_arg_def (phi, 0) == lhs))
10429 {
10430 remove_phi_node (&gsi, false);
10431 lhs_phi = gimple_phi_result (phi);
10432 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10433 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10434 break;
10435 }
10436 }
10437 }
10438
10439 /* Replace uses of lhs with the newly computed result. If the use stmt
10440 is a single-argument PHI, just replace all uses of the PHI result; this
10441 is necessary because the LC SSA PHI defining lhs may precede the new stmt. */
10442 use_operand_p use_p;
10443 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10444 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10445 && !is_gimple_debug (use_stmt))
10446 {
10447 if (gimple_code (use_stmt) == GIMPLE_PHI
10448 && gimple_phi_num_args (use_stmt) == 1)
10449 {
10450 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10451 }
10452 else
10453 {
10454 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10455 SET_USE (use_p, new_tree);
10456 }
10457 update_stmt (use_stmt);
10458 }
10459 }
10460 else
10461 {
10462 /* For basic-block vectorization simply insert the lane-extraction. */
10463 tree bftype = TREE_TYPE (vectype);
10464 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10465 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10466 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10467 vec_lhs, bitsize, bitstart);
10468 gimple_seq stmts = NULL;
10469 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10470 &stmts, true, NULL_TREE);
10471 if (TREE_CODE (new_tree) == SSA_NAME
10472 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10473 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10474 if (is_a <gphi *> (vec_stmt))
10475 {
10476 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10477 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10478 }
10479 else
10480 {
10481 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10482 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10483 }
10484
10485 /* Replace uses of lhs with the newly computed result. If the use stmt
10486 is a single-argument PHI, just replace all uses of the PHI result; this
10487 is necessary because the LC SSA PHI defining lhs may precede the new stmt. */
10488 use_operand_p use_p;
10489 stmt_vec_info use_stmt_info;
10490 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10491 if (!is_gimple_debug (use_stmt)
10492 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10493 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10494 {
10495 /* ??? This can happen when the live lane ends up being
10496 used in a vector construction code-generated by an
10497 external SLP node (and code-generation for that already
10498 happened). See gcc.dg/vect/bb-slp-47.c.
10499 Doing this is what would happen if that vector CTOR
10500 were not code-generated yet so it is not too bad.
10501 ??? In fact we'd likely want to avoid this situation
10502 in the first place. */
10503 if (TREE_CODE (new_tree) == SSA_NAME
10504 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10505 && gimple_code (use_stmt) != GIMPLE_PHI
10506 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10507 use_stmt))
10508 {
10509 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10510 gcc_checking_assert (code == SSA_NAME
10511 || code == CONSTRUCTOR
10512 || code == VIEW_CONVERT_EXPR
10513 || CONVERT_EXPR_CODE_P (code));
10514 if (dump_enabled_p ())
10515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10516 "Using original scalar computation for "
10517 "live lane because use preceeds vector "
10518 "def\n");
10519 continue;
10520 }
10521 /* ??? It can also happen that we end up pulling a def into
10522 a loop where replacing out-of-loop uses would require
10523 a new LC SSA PHI node. Retain the original scalar in
10524 those cases as well. PR98064. */
10525 if (TREE_CODE (new_tree) == SSA_NAME
10526 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10527 && (gimple_bb (use_stmt)->loop_father
10528 != gimple_bb (vec_stmt)->loop_father)
10529 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10530 gimple_bb (use_stmt)->loop_father))
10531 {
10532 if (dump_enabled_p ())
10533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10534 "Using original scalar computation for "
10535 "live lane because there is an out-of-loop "
10536 "definition for it\n");
10537 continue;
10538 }
10539 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10540 SET_USE (use_p, new_tree);
10541 update_stmt (use_stmt);
10542 }
10543 }
10544
10545 return true;
10546 }
10547
10548 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
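/* This is needed because, after vectorization, the scalar definitions made
   inside the loop may be removed, so debug binds outside the loop that
   still refer to them have their values reset rather than being left to
   reference dead SSA names (summary added for illustration; see the use in
   vect_transform_loop_stmt below).  */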
10549
10550 static void
10551 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10552 {
10553 ssa_op_iter op_iter;
10554 imm_use_iterator imm_iter;
10555 def_operand_p def_p;
10556 gimple *ustmt;
10557
10558 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10559 {
10560 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10561 {
10562 basic_block bb;
10563
10564 if (!is_gimple_debug (ustmt))
10565 continue;
10566
10567 bb = gimple_bb (ustmt);
10568
10569 if (!flow_bb_inside_loop_p (loop, bb))
10570 {
10571 if (gimple_debug_bind_p (ustmt))
10572 {
10573 if (dump_enabled_p ())
10574 dump_printf_loc (MSG_NOTE, vect_location,
10575 "killing debug use\n");
10576
10577 gimple_debug_bind_reset_value (ustmt);
10578 update_stmt (ustmt);
10579 }
10580 else
10581 gcc_unreachable ();
10582 }
10583 }
10584 }
10585 }
10586
10587 /* Given loop represented by LOOP_VINFO, return true if computation of
10588 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10589 otherwise. */
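/* An illustrative example: if niters is computed in a 32-bit unsigned type
   and the latch can execute UINT_MAX times, then NITERSM1 == UINT_MAX and
   NITERSM1 + 1 wraps around to 0, so this function returns false; for a
   known smaller upper bound on the latch count it returns true.  */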
10590
10591 static bool
10592 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10593 {
10594 /* Constant case. */
10595 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10596 {
10597 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10598 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10599
10600 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10601 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10602 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10603 return true;
10604 }
10605
10606 widest_int max;
10607 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10608 /* Check the upper bound of loop niters. */
10609 if (get_max_loop_iterations (loop, &max))
10610 {
10611 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10612 signop sgn = TYPE_SIGN (type);
10613 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10614 if (max < type_max)
10615 return true;
10616 }
10617 return false;
10618 }
10619
10620 /* Return a mask type with half the number of elements as OLD_TYPE,
10621 given that it should have mode NEW_MODE. */
10622
10623 tree
10624 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10625 {
10626 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10627 return build_truth_vector_type_for_mode (nunits, new_mode);
10628 }
10629
10630 /* Return a mask type with twice as many elements as OLD_TYPE,
10631 given that it should have mode NEW_MODE. */
10632
10633 tree
10634 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10635 {
10636 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10637 return build_truth_vector_type_for_mode (nunits, new_mode);
10638 }
10639
10640 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10641 contain a sequence of NVECTORS masks that each control a vector of type
10642 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10643 these vector masks with the vector version of SCALAR_MASK. */
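/* An illustrative, hypothetical use: when if-converting and vectorizing
     if (c[i]) a[i] = b[i];
   with a fully-masked loop, the masked store records a mask for its vector
   type here and passes the scalar condition as SCALAR_MASK, allowing other
   statements guarded by the same condition to be recognized later as able
   to share the combined mask.  */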
10644
10645 void
10646 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10647 unsigned int nvectors, tree vectype, tree scalar_mask)
10648 {
10649 gcc_assert (nvectors != 0);
10650
10651 if (scalar_mask)
10652 {
10653 scalar_cond_masked_key cond (scalar_mask, nvectors);
10654 loop_vinfo->scalar_cond_masked_set.add (cond);
10655 }
10656
10657 masks->mask_set.add (std::make_pair (vectype, nvectors));
10658 }
10659
10660 /* Given a complete set of masks MASKS, extract mask number INDEX
10661 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10662 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10663
10664 See the comment above vec_loop_masks for more details about the mask
10665 arrangement. */
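/* An illustrative example of the mask re-use handled below: a mask computed
   for a vector of eight 16-bit elements can also control a vector of four
   32-bit elements, because each adjacent pair of mask elements is known to
   be all-zero or all-one and can therefore be view-converted into a single
   wider mask element.  */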
10666
10667 tree
10668 vect_get_loop_mask (loop_vec_info loop_vinfo,
10669 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10670 unsigned int nvectors, tree vectype, unsigned int index)
10671 {
10672 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10673 == vect_partial_vectors_while_ult)
10674 {
10675 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10676 tree mask_type = rgm->type;
10677
10678 /* Populate the rgroup's mask array, if this is the first time we've
10679 used it. */
10680 if (rgm->controls.is_empty ())
10681 {
10682 rgm->controls.safe_grow_cleared (nvectors, true);
10683 for (unsigned int i = 0; i < nvectors; ++i)
10684 {
10685 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10686 /* Provide a dummy definition until the real one is available. */
10687 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10688 rgm->controls[i] = mask;
10689 }
10690 }
10691
10692 tree mask = rgm->controls[index];
10693 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10694 TYPE_VECTOR_SUBPARTS (vectype)))
10695 {
10696 /* A loop mask for data type X can be reused for data type Y
10697 if X has N times more elements than Y and if Y's elements
10698 are N times bigger than X's. In this case each sequence
10699 of N elements in the loop mask will be all-zero or all-one.
10700 We can then view-convert the mask so that each sequence of
10701 N elements is replaced by a single element. */
10702 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10703 TYPE_VECTOR_SUBPARTS (vectype)));
10704 gimple_seq seq = NULL;
10705 mask_type = truth_type_for (vectype);
10706 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10707 if (seq)
10708 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10709 }
10710 return mask;
10711 }
10712 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10713 == vect_partial_vectors_avx512)
10714 {
10715 /* The number of scalars per iteration and the number of vectors are
10716 both compile-time constants. */
10717 unsigned int nscalars_per_iter
10718 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10719 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10720
10721 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10722
10723 /* The stored nV is dependent on the mask type produced. */
10724 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10725 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10726 == rgm->factor);
10727 nvectors = rgm->factor;
10728
10729 /* Populate the rgroup's mask array, if this is the first time we've
10730 used it. */
10731 if (rgm->controls.is_empty ())
10732 {
10733 rgm->controls.safe_grow_cleared (nvectors, true);
10734 for (unsigned int i = 0; i < nvectors; ++i)
10735 {
10736 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10737 /* Provide a dummy definition until the real one is available. */
10738 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10739 rgm->controls[i] = mask;
10740 }
10741 }
10742 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10743 TYPE_VECTOR_SUBPARTS (vectype)))
10744 return rgm->controls[index];
10745
10746 /* Split the vector if needed. Since we are dealing with integer-mode
10747 masks with AVX512 we can operate on the integer representation and
10748 shift the whole vector. */
10749 unsigned HOST_WIDE_INT factor;
10750 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10751 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10752 gcc_assert (ok);
10753 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10754 tree mask_type = truth_type_for (vectype);
10755 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10756 unsigned vi = index / factor;
10757 unsigned vpart = index % factor;
10758 tree vec = rgm->controls[vi];
10759 gimple_seq seq = NULL;
10760 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10761 lang_hooks.types.type_for_mode
10762 (TYPE_MODE (rgm->type), 1), vec);
10763 /* For integer mode masks simply shift the right bits into position. */
10764 if (vpart != 0)
10765 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10766 build_int_cst (integer_type_node,
10767 (TYPE_VECTOR_SUBPARTS (vectype)
10768 * vpart)));
10769 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10770 (TYPE_MODE (mask_type), 1), vec);
10771 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10772 if (seq)
10773 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10774 return vec;
10775 }
10776 else
10777 gcc_unreachable ();
10778 }
10779
10780 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10781 lengths for controlling an operation on VECTYPE. The operation splits
10782 each element of VECTYPE into FACTOR separate subelements, measuring the
10783 length as a number of these subelements. */
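/* For illustration (a hypothetical case, not specific to any target): if an
   access on a VECTYPE with 32-bit elements is emulated with byte-sized
   loads or stores, FACTOR would be 4 and the recorded length is measured in
   those byte subelements rather than in full elements.  */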
10784
10785 void
10786 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10787 unsigned int nvectors, tree vectype, unsigned int factor)
10788 {
10789 gcc_assert (nvectors != 0);
10790 if (lens->length () < nvectors)
10791 lens->safe_grow_cleared (nvectors, true);
10792 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10793
10794 /* The number of scalars per iteration, the bytes occupied by a scalar
10795 and the number of vectors are all compile-time constants. */
10796 unsigned int nscalars_per_iter
10797 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10798 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10799
10800 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10801 {
10802 /* For now, we only support cases in which all loads and stores fall back
10803 to VnQI or none do. */
10804 gcc_assert (!rgl->max_nscalars_per_iter
10805 || (rgl->factor == 1 && factor == 1)
10806 || (rgl->max_nscalars_per_iter * rgl->factor
10807 == nscalars_per_iter * factor));
10808 rgl->max_nscalars_per_iter = nscalars_per_iter;
10809 rgl->type = vectype;
10810 rgl->factor = factor;
10811 }
10812 }
10813
10814 /* Given a complete set of lengths LENS, extract length number INDEX
10815 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10816 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10817 multiplied by the number of elements that should be processed.
10818 Insert any set-up statements before GSI. */
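/* An illustrative example of the re-use case handled below: a length
   computed for a vector of sixteen byte elements can serve a vector of four
   32-bit elements by dividing the stored length by four.  */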
10819
10820 tree
10821 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10822 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10823 unsigned int index, unsigned int factor)
10824 {
10825 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10826 bool use_bias_adjusted_len =
10827 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10828
10829 /* Populate the rgroup's len array, if this is the first time we've
10830 used it. */
10831 if (rgl->controls.is_empty ())
10832 {
10833 rgl->controls.safe_grow_cleared (nvectors, true);
10834 for (unsigned int i = 0; i < nvectors; ++i)
10835 {
10836 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10837 gcc_assert (len_type != NULL_TREE);
10838
10839 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10840
10841 /* Provide a dummy definition until the real one is available. */
10842 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10843 rgl->controls[i] = len;
10844
10845 if (use_bias_adjusted_len)
10846 {
10847 gcc_assert (i == 0);
10848 tree adjusted_len =
10849 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10850 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10851 rgl->bias_adjusted_ctrl = adjusted_len;
10852 }
10853 }
10854 }
10855
10856 if (use_bias_adjusted_len)
10857 return rgl->bias_adjusted_ctrl;
10858
10859 tree loop_len = rgl->controls[index];
10860 if (rgl->factor == 1 && factor == 1)
10861 {
10862 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10863 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10864 if (maybe_ne (nunits1, nunits2))
10865 {
10866 /* A loop len for data type X can be reused for data type Y
10867 if X has N times more elements than Y and if Y's elements
10868 are N times bigger than X's. */
10869 gcc_assert (multiple_p (nunits1, nunits2));
10870 factor = exact_div (nunits1, nunits2).to_constant ();
10871 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10872 gimple_seq seq = NULL;
10873 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10874 build_int_cst (iv_type, factor));
10875 if (seq)
10876 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10877 }
10878 }
10879 return loop_len;
10880 }
10881
10882 /* Scale profiling counters by estimation for LOOP which is vectorized
10883 by factor VF.
10884 If FLAT is true, the loop we started with had an unrealistically flat
10885 profile. */
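/* For a rough illustration: with VF == 4, a loop whose profile said the
   body executed about 1000 times per entry should afterwards show about 250
   body executions, with the exit edge correspondingly about four times as
   likely.  */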
10886
10887 static void
10888 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10889 {
10890 /* For flat profiles do not scale down proportionally by VF; only
10891 cap by the known iteration count bounds. */
10892 if (flat)
10893 {
10894 if (dump_file && (dump_flags & TDF_DETAILS))
10895 fprintf (dump_file,
10896 "Vectorized loop profile seems flat; not scaling iteration "
10897 "count down by the vectorization factor %i\n", vf);
10898 scale_loop_profile (loop, profile_probability::always (),
10899 get_likely_max_loop_iterations_int (loop));
10900 return;
10901 }
10902 /* The loop body executes VF times fewer iterations and the exit edge becomes VF times more likely. */
10903 edge exit_e = single_exit (loop);
10904 profile_count entry_count = loop_preheader_edge (loop)->count ();
10905
10906 /* If we have an unreliable loop profile, avoid dropping the entry
10907 count below the header count. This can happen when the loop
10908 has an unrealistically low trip count. */
10909 while (vf > 1
10910 && loop->header->count > entry_count
10911 && loop->header->count < entry_count * vf)
10912 {
10913 if (dump_file && (dump_flags & TDF_DETAILS))
10914 fprintf (dump_file,
10915 "Vectorization factor %i seems too large for profile "
10916 "prevoiusly believed to be consistent; reducing.\n", vf);
10917 vf /= 2;
10918 }
10919
10920 if (entry_count.nonzero_p ())
10921 set_edge_probability_and_rescale_others
10922 (exit_e,
10923 entry_count.probability_in (loop->header->count / vf));
10924 /* Avoid producing a very large exit probability when we do not have
10925 a sensible profile. */
10926 else if (exit_e->probability < profile_probability::always () / (vf * 2))
10927 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10928 loop->latch->count = single_pred_edge (loop->latch)->count ();
10929
10930 scale_loop_profile (loop, profile_probability::always () / vf,
10931 get_likely_max_loop_iterations_int (loop));
10932 }
10933
10934 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10935 latch edge values originally defined by it. */
10936
10937 static void
10938 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10939 stmt_vec_info def_stmt_info)
10940 {
10941 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10942 if (!def || TREE_CODE (def) != SSA_NAME)
10943 return;
10944 stmt_vec_info phi_info;
10945 imm_use_iterator iter;
10946 use_operand_p use_p;
10947 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10948 {
10949 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10950 if (!phi)
10951 continue;
10952 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10953 && (phi_info = loop_vinfo->lookup_stmt (phi))
10954 && STMT_VINFO_RELEVANT_P (phi_info)))
10955 continue;
10956 loop_p loop = gimple_bb (phi)->loop_father;
10957 edge e = loop_latch_edge (loop);
10958 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10959 continue;
10960
10961 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10962 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10963 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10964 {
10965 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10966 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10967 gcc_assert (phi_defs.length () == latch_defs.length ());
10968 for (unsigned i = 0; i < phi_defs.length (); ++i)
10969 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10970 gimple_get_lhs (latch_defs[i]), e,
10971 gimple_phi_arg_location (phi, e->dest_idx));
10972 }
10973 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10974 {
10975 /* For first order recurrences we have to update both uses of
10976 the latch definition, the one in the PHI node and the one
10977 in the generated VEC_PERM_EXPR. */
10978 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10979 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10980 gcc_assert (phi_defs.length () == latch_defs.length ());
10981 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10982 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10983 for (unsigned i = 0; i < phi_defs.length (); ++i)
10984 {
10985 gassign *perm = as_a <gassign *> (phi_defs[i]);
10986 if (i > 0)
10987 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10988 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10989 update_stmt (perm);
10990 }
10991 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10992 gimple_phi_arg_location (phi, e->dest_idx));
10993 }
10994 }
10995 }
10996
10997 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10998 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10999 stmt_vec_info. */
11000
11001 static bool
11002 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11003 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11004 {
11005 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11006 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11007
11008 if (dump_enabled_p ())
11009 dump_printf_loc (MSG_NOTE, vect_location,
11010 "------>vectorizing statement: %G", stmt_info->stmt);
11011
11012 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11013 vect_loop_kill_debug_uses (loop, stmt_info);
11014
11015 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11016 && !STMT_VINFO_LIVE_P (stmt_info))
11017 return false;
11018
11019 if (STMT_VINFO_VECTYPE (stmt_info))
11020 {
11021 poly_uint64 nunits
11022 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11023 if (!STMT_SLP_TYPE (stmt_info)
11024 && maybe_ne (nunits, vf)
11025 && dump_enabled_p ())
11026 /* For SLP, VF is set according to the unrolling factor, and not
11027 to the vector size, hence for SLP this message is not valid. */
11028 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11029 }
11030
11031 /* Pure SLP statements have already been vectorized. We still need
11032 to apply loop vectorization to hybrid SLP statements. */
11033 if (PURE_SLP_STMT (stmt_info))
11034 return false;
11035
11036 if (dump_enabled_p ())
11037 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11038
11039 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11040 *seen_store = stmt_info;
11041
11042 return true;
11043 }
11044
11045 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11046 in the hash_map with their corresponding values. */
11047
11048 static tree
11049 find_in_mapping (tree t, void *context)
11050 {
11051 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11052
11053 tree *value = mapping->get (t);
11054 return value ? *value : t;
11055 }
11056
11057 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11058 original loop that has now been vectorized.
11059
11060 The inits of the data_references need to be advanced with the number of
11061 iterations of the main loop. This has been computed in vect_do_peeling and
11062 is stored in parameter ADVANCE. We first restore the data_references'
11063 initial offsets with the values recorded in ORIG_DRS_INIT.
11064
11065 Since the loop_vec_info of this EPILOGUE was constructed for the original
11066 loop, its stmt_vec_infos all point to the original statements. These need
11067 to be updated to point to their corresponding copies as well as the SSA_NAMES
11068 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11069
11070 The data_references' connections also need to be updated. Their
11071 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11072 stmt_vec_infos, their statements need to point to their corresponding copies,
11073 if they are gather loads or scatter stores then their references need to be
11074 updated to point to their corresponding copies, and finally we set
11075 'base_misaligned' to false as we have already peeled for alignment in the
11076 prologue of the main loop. */
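/* Roughly illustrated: with a vectorization factor of 4 and no prologue
   peeling, a scalar loop of N iterations becomes a main vectorized loop
   covering the first N - N % 4 iterations followed by this epilogue for the
   remaining N % 4; ADVANCE then corresponds to the number of scalar
   iterations already consumed before the epilogue starts.  */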
11077
11078 static void
11079 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11080 {
11081 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11082 auto_vec<gimple *> stmt_worklist;
11083 hash_map<tree,tree> mapping;
11084 gimple *orig_stmt, *new_stmt;
11085 gimple_stmt_iterator epilogue_gsi;
11086 gphi_iterator epilogue_phi_gsi;
11087 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11088 basic_block *epilogue_bbs = get_loop_body (epilogue);
11089 unsigned i;
11090
11091 free (LOOP_VINFO_BBS (epilogue_vinfo));
11092 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11093
11094 /* Advance data_reference's with the number of iterations of the previous
11095 loop and its prologue. */
11096 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11097
11098
11099 /* The EPILOGUE loop is a copy of the original loop so they share the same
11100 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11101 point to the copied statements. We also create a mapping of all LHSs in
11102 the original loop and all the LHSs in the EPILOGUE and create worklists to
11103 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11104 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11105 {
11106 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11107 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11108 {
11109 new_stmt = epilogue_phi_gsi.phi ();
11110
11111 gcc_assert (gimple_uid (new_stmt) > 0);
11112 stmt_vinfo
11113 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11114
11115 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11116 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11117
11118 mapping.put (gimple_phi_result (orig_stmt),
11119 gimple_phi_result (new_stmt));
11120 /* PHI nodes cannot have patterns or related statements. */
11121 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11122 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11123 }
11124
11125 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11126 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11127 {
11128 new_stmt = gsi_stmt (epilogue_gsi);
11129 if (is_gimple_debug (new_stmt))
11130 continue;
11131
11132 gcc_assert (gimple_uid (new_stmt) > 0);
11133 stmt_vinfo
11134 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11135
11136 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11137 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11138
11139 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11140 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11141
11142 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11143 {
11144 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11145 for (gimple_stmt_iterator gsi = gsi_start (seq);
11146 !gsi_end_p (gsi); gsi_next (&gsi))
11147 stmt_worklist.safe_push (gsi_stmt (gsi));
11148 }
11149
11150 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11151 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11152 {
11153 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11154 stmt_worklist.safe_push (stmt);
11155 /* Set BB such that the assert in
11156 'get_initial_def_for_reduction' is able to determine that
11157 the BB of the related stmt is inside this loop. */
11158 gimple_set_bb (stmt,
11159 gimple_bb (new_stmt));
11160 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11161 gcc_assert (related_vinfo == NULL
11162 || related_vinfo == stmt_vinfo);
11163 }
11164 }
11165 }
11166
11167 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11168 using the original main loop and thus need to be updated to refer to the
11169 cloned variables used in the epilogue. */
11170 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11171 {
11172 gimple *stmt = stmt_worklist[i];
11173 tree *new_op;
11174
11175 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11176 {
11177 tree op = gimple_op (stmt, j);
11178 if ((new_op = mapping.get(op)))
11179 gimple_set_op (stmt, j, *new_op);
11180 else
11181 {
11182 /* PR92429: The last argument of simplify_replace_tree disables
11183 folding when replacing arguments. This is required as
11184 otherwise we might end up with different statements from the
11185 ones analyzed in vect_analyze_loop, leading to different
11186 vectorization. */
11187 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11188 &find_in_mapping, &mapping, false);
11189 gimple_set_op (stmt, j, op);
11190 }
11191 }
11192 }
11193
11194 struct data_reference *dr;
11195 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11196 FOR_EACH_VEC_ELT (datarefs, i, dr)
11197 {
11198 orig_stmt = DR_STMT (dr);
11199 gcc_assert (gimple_uid (orig_stmt) > 0);
11200 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11201 /* Data references for gather loads and scatter stores do not use the
11202 updated offset we set using ADVANCE. Instead we have to make sure the
11203 reference in each data reference points to the corresponding copy of
11204 the original in the epilogue. */
11205 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11206 == VMAT_GATHER_SCATTER)
11207 {
11208 DR_REF (dr)
11209 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11210 &find_in_mapping, &mapping);
11211 DR_BASE_ADDRESS (dr)
11212 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11213 &find_in_mapping, &mapping);
11214 }
11215 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11216 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11217 /* The vector size of the epilogue is smaller than that of the main loop,
11218 so the required alignment is either the same or lower. This means the dr
11219 will by definition be aligned. */
11220 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11221 }
11222
11223 epilogue_vinfo->shared->datarefs_copy.release ();
11224 epilogue_vinfo->shared->save_datarefs ();
11225 }
11226
11227 /* Function vect_transform_loop.
11228
11229 The analysis phase has determined that the loop is vectorizable.
11230 Vectorize the loop - create vectorized stmts to replace the scalar
11231 stmts in the loop, and update the loop exit condition.
11232 Returns the scalar epilogue loop, if any. */
11233
11234 class loop *
11235 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11236 {
11237 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11238 class loop *epilogue = NULL;
11239 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11240 int nbbs = loop->num_nodes;
11241 int i;
11242 tree niters_vector = NULL_TREE;
11243 tree step_vector = NULL_TREE;
11244 tree niters_vector_mult_vf = NULL_TREE;
11245 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11246 unsigned int lowest_vf = constant_lower_bound (vf);
11247 gimple *stmt;
11248 bool check_profitability = false;
11249 unsigned int th;
11250 bool flat = maybe_flat_loop_profile (loop);
11251
11252 DUMP_VECT_SCOPE ("vec_transform_loop");
11253
11254 loop_vinfo->shared->check_datarefs ();
11255
11256 /* Use the more conservative vectorization threshold. If the number
11257 of iterations is constant, assume the cost check has been performed
11258 by our caller. If the threshold makes all loops profitable that
11259 run at least the (estimated) vectorization factor number of times,
11260 checking is pointless, too. */
11261 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11262 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11263 {
11264 if (dump_enabled_p ())
11265 dump_printf_loc (MSG_NOTE, vect_location,
11266 "Profitability threshold is %d loop iterations.\n",
11267 th);
11268 check_profitability = true;
11269 }
11270
11271 /* Make sure there exists a single-predecessor exit bb. Do this before
11272 versioning. */
11273 edge e = single_exit (loop);
11274 if (! single_pred_p (e->dest))
11275 {
11276 split_loop_exit_edge (e, true);
11277 if (dump_enabled_p ())
11278 dump_printf (MSG_NOTE, "split exit edge\n");
11279 }
11280
11281 /* Version the loop first, if required, so the profitability check
11282 comes first. */
11283
11284 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11285 {
11286 class loop *sloop
11287 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11288 sloop->force_vectorize = false;
11289 check_profitability = false;
11290 }
11291
11292 /* Make sure there exists a single-predecessor exit bb also on the
11293 scalar loop copy. Do this after versioning but before peeling
11294 so the CFG structure is fine for both the scalar and the if-converted
11295 loop, and slpeel_duplicate_current_defs_from_edges faces matched
11296 loop-closed PHI nodes on the exit. */
11297 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11298 {
11299 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11300 if (! single_pred_p (e->dest))
11301 {
11302 split_loop_exit_edge (e, true);
11303 if (dump_enabled_p ())
11304 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11305 }
11306 }
11307
11308 tree niters = vect_build_loop_niters (loop_vinfo);
11309 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11310 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11311 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11312 tree advance;
11313 drs_init_vec orig_drs_init;
11314
11315 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11316 &step_vector, &niters_vector_mult_vf, th,
11317 check_profitability, niters_no_overflow,
11318 &advance);
11319 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11320 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11321 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11322 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11323
11324 if (niters_vector == NULL_TREE)
11325 {
11326 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11327 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11328 && known_eq (lowest_vf, vf))
11329 {
11330 niters_vector
11331 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11332 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11333 step_vector = build_one_cst (TREE_TYPE (niters));
11334 }
11335 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11336 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11337 &step_vector, niters_no_overflow);
11338 else
11339 /* vect_do_peeling subtracted the number of peeled prologue
11340 iterations from LOOP_VINFO_NITERS. */
11341 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11342 &niters_vector, &step_vector,
11343 niters_no_overflow);
11344 }
11345
11346 /* 1) Make sure the loop header has exactly two entries
11347 2) Make sure we have a preheader basic block. */
11348
11349 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11350
11351 split_edge (loop_preheader_edge (loop));
11352
11353 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11354 /* This will deal with any possible peeling. */
11355 vect_prepare_for_masked_peels (loop_vinfo);
11356
11357 /* Schedule the SLP instances first, then handle loop vectorization
11358 below. */
11359 if (!loop_vinfo->slp_instances.is_empty ())
11360 {
11361 DUMP_VECT_SCOPE ("scheduling SLP instances");
11362 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11363 }
11364
11365 /* FORNOW: the vectorizer supports only loops whose body consists
11366 of one basic block (header + empty latch). When the vectorizer
11367 supports more involved loop forms, the order in which the BBs are
11368 traversed will need to be reconsidered. */
11369
11370 for (i = 0; i < nbbs; i++)
11371 {
11372 basic_block bb = bbs[i];
11373 stmt_vec_info stmt_info;
11374
11375 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11376 gsi_next (&si))
11377 {
11378 gphi *phi = si.phi ();
11379 if (dump_enabled_p ())
11380 dump_printf_loc (MSG_NOTE, vect_location,
11381 "------>vectorizing phi: %G", (gimple *) phi);
11382 stmt_info = loop_vinfo->lookup_stmt (phi);
11383 if (!stmt_info)
11384 continue;
11385
11386 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11387 vect_loop_kill_debug_uses (loop, stmt_info);
11388
11389 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11390 && !STMT_VINFO_LIVE_P (stmt_info))
11391 continue;
11392
11393 if (STMT_VINFO_VECTYPE (stmt_info)
11394 && (maybe_ne
11395 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11396 && dump_enabled_p ())
11397 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11398
11399 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11400 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11401 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11402 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11403 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11404 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11405 && ! PURE_SLP_STMT (stmt_info))
11406 {
11407 if (dump_enabled_p ())
11408 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11409 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11410 }
11411 }
11412
11413 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11414 gsi_next (&si))
11415 {
11416 gphi *phi = si.phi ();
11417 stmt_info = loop_vinfo->lookup_stmt (phi);
11418 if (!stmt_info)
11419 continue;
11420
11421 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11422 && !STMT_VINFO_LIVE_P (stmt_info))
11423 continue;
11424
11425 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11426 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11427 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11428 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11429 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11430 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11431 && ! PURE_SLP_STMT (stmt_info))
11432 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11433 }
11434
11435 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11436 !gsi_end_p (si);)
11437 {
11438 stmt = gsi_stmt (si);
11439 /* During vectorization remove existing clobber stmts. */
11440 if (gimple_clobber_p (stmt))
11441 {
11442 unlink_stmt_vdef (stmt);
11443 gsi_remove (&si, true);
11444 release_defs (stmt);
11445 }
11446 else
11447 {
11448 /* Ignore vector stmts created in the outer loop. */
11449 stmt_info = loop_vinfo->lookup_stmt (stmt);
11450
11451 /* vector stmts created in the outer-loop during vectorization of
11452 stmts in an inner-loop may not have a stmt_info, and do not
11453 need to be vectorized. */
11454 stmt_vec_info seen_store = NULL;
11455 if (stmt_info)
11456 {
11457 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11458 {
11459 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11460 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11461 !gsi_end_p (subsi); gsi_next (&subsi))
11462 {
11463 stmt_vec_info pat_stmt_info
11464 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11465 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11466 &si, &seen_store);
11467 }
11468 stmt_vec_info pat_stmt_info
11469 = STMT_VINFO_RELATED_STMT (stmt_info);
11470 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11471 &si, &seen_store))
11472 maybe_set_vectorized_backedge_value (loop_vinfo,
11473 pat_stmt_info);
11474 }
11475 else
11476 {
11477 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11478 &seen_store))
11479 maybe_set_vectorized_backedge_value (loop_vinfo,
11480 stmt_info);
11481 }
11482 }
11483 gsi_next (&si);
11484 if (seen_store)
11485 {
11486 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11487 /* Interleaving. The vectorization of the
11488 interleaving chain was completed - free all
11489 the stores in the chain. */
11490 vect_remove_stores (loop_vinfo,
11491 DR_GROUP_FIRST_ELEMENT (seen_store));
11492 else
11493 /* Free the attached stmt_vec_info and remove the stmt. */
11494 loop_vinfo->remove_stmt (stmt_info);
11495 }
11496 }
11497 }
11498
11499 /* Stub out scalar statements that must not survive vectorization.
11500 Doing this here helps with grouped statements, or statements that
11501 are involved in patterns. */
11502 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11503 !gsi_end_p (gsi); gsi_next (&gsi))
11504 {
11505 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11506 if (!call || !gimple_call_internal_p (call))
11507 continue;
11508 internal_fn ifn = gimple_call_internal_fn (call);
11509 if (ifn == IFN_MASK_LOAD)
11510 {
11511 tree lhs = gimple_get_lhs (call);
11512 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11513 {
11514 tree zero = build_zero_cst (TREE_TYPE (lhs));
11515 gimple *new_stmt = gimple_build_assign (lhs, zero);
11516 gsi_replace (&gsi, new_stmt, true);
11517 }
11518 }
11519 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11520 {
11521 tree lhs = gimple_get_lhs (call);
11522 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11523 {
11524 tree else_arg
11525 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11526 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11527 gsi_replace (&gsi, new_stmt, true);
11528 }
11529 }
11530 }
11531 } /* BBs in loop */
11532
11533 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11534 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11535 if (integer_onep (step_vector))
11536 niters_no_overflow = true;
11537 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11538 niters_vector_mult_vf, !niters_no_overflow);
11539
11540 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11541
11542 /* True if the final iteration might not handle a full vector's
11543 worth of scalar iterations. */
11544 bool final_iter_may_be_partial
11545 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11546 /* The minimum number of iterations performed by the epilogue. This
11547 is 1 when peeling for gaps because we always need a final scalar
11548 iteration. */
11549 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11550 /* +1 to convert latch counts to loop iteration counts,
11551 -min_epilogue_iters to remove iterations that cannot be performed
11552 by the vector code. */
11553 int bias_for_lowest = 1 - min_epilogue_iters;
11554 int bias_for_assumed = bias_for_lowest;
11555 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11556 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11557 {
11558 /* When the amount of peeling is known at compile time, the first
11559 iteration will have exactly alignment_npeels active elements.
11560 In the worst case it will have at least one. */
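/* E.g. (hypothetical values): with lowest_vf == 4 and a compile-time
alignment_npeels of 3, min_first_active is 3 and bias_for_lowest below
grows by 4 - 3 == 1. */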
11561 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11562 bias_for_lowest += lowest_vf - min_first_active;
11563 bias_for_assumed += assumed_vf - min_first_active;
11564 }
11565 /* In these calculations the "- 1" converts loop iteration counts
11566 back to latch counts. */
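/* As a worked example with assumed values: with VF == 4, no peeling for gaps
and no partial vectors, bias_for_lowest is 1; an upper bound of 11 latch
iterations (12 scalar iterations) then becomes (11 + 1) / 4 - 1 == 2 latch
iterations of the vector loop, i.e. at most 3 vector iterations. */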
11567 if (loop->any_upper_bound)
11568 {
11569 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11570 loop->nb_iterations_upper_bound
11571 = (final_iter_may_be_partial
11572 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11573 lowest_vf) - 1
11574 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11575 lowest_vf) - 1);
11576 if (main_vinfo
11577 /* Both peeling for alignment and peeling for gaps can end up
11578 with the scalar epilogue running for more than VF-1 iterations. */
11579 && !main_vinfo->peeling_for_alignment
11580 && !main_vinfo->peeling_for_gaps)
11581 {
11582 unsigned int bound;
11583 poly_uint64 main_iters
11584 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11585 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11586 main_iters
11587 = upper_bound (main_iters,
11588 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11589 if (can_div_away_from_zero_p (main_iters,
11590 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11591 &bound))
11592 loop->nb_iterations_upper_bound
11593 = wi::umin ((widest_int) (bound - 1),
11594 loop->nb_iterations_upper_bound);
11595 }
11596 }
11597 if (loop->any_likely_upper_bound)
11598 loop->nb_iterations_likely_upper_bound
11599 = (final_iter_may_be_partial
11600 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11601 + bias_for_lowest, lowest_vf) - 1
11602 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11603 + bias_for_lowest, lowest_vf) - 1);
11604 if (loop->any_estimate)
11605 loop->nb_iterations_estimate
11606 = (final_iter_may_be_partial
11607 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11608 assumed_vf) - 1
11609 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11610 assumed_vf) - 1);
11611 scale_profile_for_vect_loop (loop, assumed_vf, flat);
11612
11613 if (dump_enabled_p ())
11614 {
11615 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11616 {
11617 dump_printf_loc (MSG_NOTE, vect_location,
11618 "LOOP VECTORIZED\n");
11619 if (loop->inner)
11620 dump_printf_loc (MSG_NOTE, vect_location,
11621 "OUTER LOOP VECTORIZED\n");
11622 dump_printf (MSG_NOTE, "\n");
11623 }
11624 else
11625 dump_printf_loc (MSG_NOTE, vect_location,
11626 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11627 GET_MODE_NAME (loop_vinfo->vector_mode));
11628 }
11629
11630 /* Loops vectorized with a variable factor won't benefit from
11631 unrolling/peeling. */
11632 if (!vf.is_constant ())
11633 {
11634 loop->unroll = 1;
11635 if (dump_enabled_p ())
11636 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11637 " variable-length vectorization factor\n");
11638 }
11639 /* Free SLP instances here because otherwise stmt reference counting
11640 won't work. */
11641 slp_instance instance;
11642 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11643 vect_free_slp_instance (instance);
11644 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11645 /* Clear the safelen field since its value is invalid after vectorization,
11646 as the vectorized loop can have loop-carried dependencies. */
11647 loop->safelen = 0;
11648
11649 if (epilogue)
11650 {
11651 update_epilogue_loop_vinfo (epilogue, advance);
11652
11653 epilogue->simduid = loop->simduid;
11654 epilogue->force_vectorize = loop->force_vectorize;
11655 epilogue->dont_vectorize = false;
11656 }
11657
11658 return epilogue;
11659 }
11660
11661 /* The code below tries to perform a simple optimization - revert
11662 if-conversion for masked stores, i.e. if the mask of a store is zero, do
11663 not perform the store and, if possible, skip the stored-value producers too.
11664 For example,
11665 for (i=0; i<n; i++)
11666 if (c[i])
11667 {
11668 p1[i] += 1;
11669 p2[i] = p3[i] +2;
11670 }
11671 this transformation will produce the following semi-hammock:
11672
11673 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11674 {
11675 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11676 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11677 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11678 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11679 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11680 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11681 }
11682 */
11683
11684 void
11685 optimize_mask_stores (class loop *loop)
11686 {
11687 basic_block *bbs = get_loop_body (loop);
11688 unsigned nbbs = loop->num_nodes;
11689 unsigned i;
11690 basic_block bb;
11691 class loop *bb_loop;
11692 gimple_stmt_iterator gsi;
11693 gimple *stmt;
11694 auto_vec<gimple *> worklist;
11695 auto_purge_vect_location sentinel;
11696
11697 vect_location = find_loop_location (loop);
11698 /* Pick up all masked stores in the loop, if any. */
11699 for (i = 0; i < nbbs; i++)
11700 {
11701 bb = bbs[i];
11702 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11703 gsi_next (&gsi))
11704 {
11705 stmt = gsi_stmt (gsi);
11706 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11707 worklist.safe_push (stmt);
11708 }
11709 }
11710
11711 free (bbs);
11712 if (worklist.is_empty ())
11713 return;
11714
11715 /* Loop has masked stores. */
11716 while (!worklist.is_empty ())
11717 {
11718 gimple *last, *last_store;
11719 edge e, efalse;
11720 tree mask;
11721 basic_block store_bb, join_bb;
11722 gimple_stmt_iterator gsi_to;
11723 tree vdef, new_vdef;
11724 gphi *phi;
11725 tree vectype;
11726 tree zero;
11727
11728 last = worklist.pop ();
11729 mask = gimple_call_arg (last, 2);
11730 bb = gimple_bb (last);
11731 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
11732 the same loop as if_bb. It could be different from LOOP when a
11733 two-level loop nest is vectorized and the mask_store belongs to the
11734 inner one. */
11735 e = split_block (bb, last);
11736 bb_loop = bb->loop_father;
11737 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11738 join_bb = e->dest;
11739 store_bb = create_empty_bb (bb);
11740 add_bb_to_loop (store_bb, bb_loop);
11741 e->flags = EDGE_TRUE_VALUE;
11742 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11743 /* Put STORE_BB on the likely path. */
11744 efalse->probability = profile_probability::likely ();
11745 e->probability = efalse->probability.invert ();
11746 store_bb->count = efalse->count ();
11747 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11748 if (dom_info_available_p (CDI_DOMINATORS))
11749 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11750 if (dump_enabled_p ())
11751 dump_printf_loc (MSG_NOTE, vect_location,
11752 "Create new block %d to sink mask stores.",
11753 store_bb->index);
11754 /* Create vector comparison with boolean result. */
11755 vectype = TREE_TYPE (mask);
11756 zero = build_zero_cst (vectype);
11757 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11758 gsi = gsi_last_bb (bb);
11759 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11760 /* Create new PHI node for vdef of the last masked store:
11761 .MEM_2 = VDEF <.MEM_1>
11762 will be converted to
11763 .MEM_3 = VDEF <.MEM_1>
11764 and a new PHI node will be created in the join bb
11765 .MEM_2 = PHI <.MEM_1, .MEM_3>
11766 */
11767 vdef = gimple_vdef (last);
11768 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11769 gimple_set_vdef (last, new_vdef);
11770 phi = create_phi_node (vdef, join_bb);
11771 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11772
11773 /* Put all masked stores with the same mask into STORE_BB if possible. */
11774 while (true)
11775 {
11776 gimple_stmt_iterator gsi_from;
11777 gimple *stmt1 = NULL;
11778
11779 /* Move masked store to STORE_BB. */
11780 last_store = last;
11781 gsi = gsi_for_stmt (last);
11782 gsi_from = gsi;
11783 /* Shift GSI to the previous stmt for further traversal. */
11784 gsi_prev (&gsi);
11785 gsi_to = gsi_start_bb (store_bb);
11786 gsi_move_before (&gsi_from, &gsi_to);
11787 /* Set GSI_TO to the start of the now non-empty block. */
11788 gsi_to = gsi_start_bb (store_bb);
11789 if (dump_enabled_p ())
11790 dump_printf_loc (MSG_NOTE, vect_location,
11791 "Move stmt to created bb\n%G", last);
11792 /* Move all stored value producers if possible. */
11793 while (!gsi_end_p (gsi))
11794 {
11795 tree lhs;
11796 imm_use_iterator imm_iter;
11797 use_operand_p use_p;
11798 bool res;
11799
11800 /* Skip debug statements. */
11801 if (is_gimple_debug (gsi_stmt (gsi)))
11802 {
11803 gsi_prev (&gsi);
11804 continue;
11805 }
11806 stmt1 = gsi_stmt (gsi);
11807 /* Do not consider statements writing to memory or having a
11808 volatile operand. */
11809 if (gimple_vdef (stmt1)
11810 || gimple_has_volatile_ops (stmt1))
11811 break;
11812 gsi_from = gsi;
11813 gsi_prev (&gsi);
11814 lhs = gimple_get_lhs (stmt1);
11815 if (!lhs)
11816 break;
11817
11818 /* LHS of vectorized stmt must be SSA_NAME. */
11819 if (TREE_CODE (lhs) != SSA_NAME)
11820 break;
11821
11822 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11823 {
11824 /* Remove dead scalar statement. */
11825 if (has_zero_uses (lhs))
11826 {
11827 gsi_remove (&gsi_from, true);
11828 continue;
11829 }
11830 }
11831
11832 /* Check that LHS does not have uses outside of STORE_BB. */
11833 res = true;
11834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11835 {
11836 gimple *use_stmt;
11837 use_stmt = USE_STMT (use_p);
11838 if (is_gimple_debug (use_stmt))
11839 continue;
11840 if (gimple_bb (use_stmt) != store_bb)
11841 {
11842 res = false;
11843 break;
11844 }
11845 }
11846 if (!res)
11847 break;
11848
11849 if (gimple_vuse (stmt1)
11850 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11851 break;
11852
11853 /* Can move STMT1 to STORE_BB. */
11854 if (dump_enabled_p ())
11855 dump_printf_loc (MSG_NOTE, vect_location,
11856 "Move stmt to created bb\n%G", stmt1);
11857 gsi_move_before (&gsi_from, &gsi_to);
11858 /* Shift GSI_TO for further insertion. */
11859 gsi_prev (&gsi_to);
11860 }
11861 /* Put other masked stores with the same mask into STORE_BB. */
11862 if (worklist.is_empty ()
11863 || gimple_call_arg (worklist.last (), 2) != mask
11864 || worklist.last () != stmt1)
11865 break;
11866 last = worklist.pop ();
11867 }
11868 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11869 }
11870 }
11871
11872 /* Decide whether it is possible to use a zero-based induction variable
11873 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11874 the value that the induction variable must be able to hold in order
11875 to ensure that the rgroups eventually have no active vector elements.
11876 Return -1 otherwise. */
11877
11878 widest_int
11879 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11880 {
11881 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11882 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11883 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11884
11885 /* Calculate the value that the induction variable must be able
11886 to hit in order to ensure that we end the loop with an all-false mask.
11887 This involves adding the maximum number of inactive trailing scalar
11888 iterations. */
11889 widest_int iv_limit = -1;
11890 if (max_loop_iterations (loop, &iv_limit))
11891 {
11892 if (niters_skip)
11893 {
11894 /* Add the maximum number of skipped iterations to the
11895 maximum iteration count. */
11896 if (TREE_CODE (niters_skip) == INTEGER_CST)
11897 iv_limit += wi::to_widest (niters_skip);
11898 else
11899 iv_limit += max_vf - 1;
11900 }
11901 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11902 /* Make a conservatively-correct assumption. */
11903 iv_limit += max_vf - 1;
11904
11905 /* IV_LIMIT is the maximum number of latch iterations, which is also
11906 the maximum in-range IV value. Round this value down to the previous
11907 vector alignment boundary and then add an extra full iteration. */
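/* A worked example with assumed values: for a constant VF of 4 (so max_vf
and known_alignment (vf) are both 4) and iv_limit == 10, this computes
(10 & -4) + 4 == 8 + 4 == 12. */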
11908 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11909 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11910 }
11911 return iv_limit;
11912 }
11913
11914 /* For the given rgroup_controls RGC, check whether an induction variable
11915 would ever hit a value that produces a set of all-false masks or zero
11916 lengths before wrapping around. Return true if it's possible to wrap
11917 around before hitting the desired value, otherwise return false. */
11918
11919 bool
11920 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11921 {
11922 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11923
11924 if (iv_limit == -1)
11925 return true;
11926
11927 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11928 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11929 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11930
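/* Illustrative check with assumed values: if iv_limit * nitems is 70000 and
COMPARE_TYPE has 16-bit precision, 70000 needs 17 bits, so the IV could wrap
before producing an all-false mask and we return true. */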
11931 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11932 return true;
11933
11934 return false;
11935 }